1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
23 */
24
25 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
26 /* All Rights Reserved */
27
28 /*
29 * University Copyright- Copyright (c) 1982, 1986, 1988
30 * The Regents of the University of California
31 * All Rights Reserved
32 *
33 * University Acknowledgment- Portions of this document are derived from
34 * software developed by the University of California, Berkeley, and its
35 * contributors.
36 */
37
38 /*
39 * VM - physical page management.
40 */
41
42 #include <sys/types.h>
43 #include <sys/t_lock.h>
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/errno.h>
47 #include <sys/time.h>
48 #include <sys/vnode.h>
49 #include <sys/vm.h>
50 #include <sys/vtrace.h>
51 #include <sys/swap.h>
52 #include <sys/cmn_err.h>
53 #include <sys/tuneable.h>
54 #include <sys/sysmacros.h>
55 #include <sys/cpuvar.h>
56 #include <sys/callb.h>
57 #include <sys/debug.h>
58 #include <sys/tnf_probe.h>
59 #include <sys/condvar_impl.h>
60 #include <sys/mem_config.h>
61 #include <sys/mem_cage.h>
62 #include <sys/kmem.h>
63 #include <sys/atomic.h>
64 #include <sys/strlog.h>
65 #include <sys/mman.h>
66 #include <sys/ontrap.h>
67 #include <sys/lgrp.h>
68 #include <sys/vfs.h>
69
70 #include <vm/hat.h>
71 #include <vm/anon.h>
72 #include <vm/page.h>
73 #include <vm/seg.h>
74 #include <vm/pvn.h>
75 #include <vm/seg_kmem.h>
76 #include <vm/vm_dep.h>
77 #include <sys/vm_usage.h>
78 #include <fs/fs_subr.h>
79 #include <sys/ddi.h>
80 #include <sys/modctl.h>
81
82 static pgcnt_t max_page_get; /* max page_get request size in pages */
83 pgcnt_t total_pages = 0; /* total number of pages (used by /proc) */
84
85 /*
86  * freemem_lock protects all freemem variables, including availrmem.
87  * This lock also protects the globals which track the
88  * availrmem changes for accurate kernel footprint calculation.
89  * See below for an explanation of these
90  * globals.
91 */
92 kmutex_t freemem_lock;
93 pgcnt_t availrmem;
94 pgcnt_t availrmem_initial;
95
96 /*
97  * These globals track availrmem changes to get a more accurate
98  * estimate of the kernel size. Historically pp_kernel has been used for
99  * the kernel size and is based on availrmem. But availrmem is adjusted for
100  * all locked pages in the system, not just for kernel-locked pages.
101  * These new counters will track the pages locked through segvn and
102  * by explicit user locking.
103 *
104 * pages_locked : How many pages are locked because of user specified
105 * locking through mlock or plock.
106 *
107  * pages_useclaim, pages_claimed : These two variables track the claim
108  * adjustments caused by protection changes on a segvn segment.
109 *
110 * All these globals are protected by the same lock which protects availrmem.
111 */
112 pgcnt_t pages_locked = 0;
113 pgcnt_t pages_useclaim = 0;
114 pgcnt_t pages_claimed = 0;
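
/*
 * A minimal sketch (illustrative only; not the actual segvn/mlock code) of
 * how a locking path is expected to update these counters together with
 * availrmem, all under freemem_lock.  Here npages is a hypothetical local
 * naming the number of pages being locked on behalf of mlock()/plock():
 *
 *    mutex_enter(&freemem_lock);
 *    if (availrmem < tune.t_minarmem + npages) {
 *        mutex_exit(&freemem_lock);
 *        return (ENOMEM);
 *    }
 *    availrmem -= npages;
 *    pages_locked += npages;
 *    mutex_exit(&freemem_lock);
 */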
115
116
117 /*
118 * new_freemem_lock protects freemem, freemem_wait & freemem_cv.
119 */
120 static kmutex_t new_freemem_lock;
121 static uint_t freemem_wait; /* someone waiting for freemem */
122 static kcondvar_t freemem_cv;
123
124 /*
125 * The logical page free list is maintained as two lists, the 'free'
126 * and the 'cache' lists.
127 * The free list contains those pages that should be reused first.
128 *
129 * The implementation of the lists is machine dependent.
130 * page_get_freelist(), page_get_cachelist(),
131 * page_list_sub(), and page_list_add()
132 * form the interface to the machine dependent implementation.
133 *
134 * Pages with p_free set are on the cache list.
135  * Pages with p_free and p_age set are on the free list.
136 *
137 * A page may be locked while on either list.
138 */
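
/*
 * In other words (sketch only; the real tests are the PP_ISFREE() and
 * PP_ISAGED() macros used throughout this file):
 *
 *    if (PP_ISFREE(pp) && PP_ISAGED(pp))
 *        the page is on the free list;
 *    else if (PP_ISFREE(pp))
 *        the page is on the cache list and still has vnode identity;
 */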
139
140 /*
141 * free list accounting stuff.
142 *
143 *
144 * Spread out the value for the number of pages on the
145 * page free and page cache lists. If there is just one
146 * value, then it must be under just one lock.
147 * The lock contention and cache traffic are a real bother.
148 *
149 * When we acquire and then drop a single pcf lock
150 * we can start in the middle of the array of pcf structures.
151 * If we acquire more than one pcf lock at a time, we need to
152 * start at the front to avoid deadlocking.
153 *
154 * pcf_count holds the number of pages in each pool.
155 *
156 * pcf_block is set when page_create_get_something() has asked the
157 * PSM page freelist and page cachelist routines without specifying
158 * a color and nothing came back. This is used to block anything
159 * else from moving pages from one list to the other while the
160  * lists are searched again. If a page is freed while pcf_block is
161  * set, then pcf_reserve is incremented. pcgs_unblock() takes care
162  * of clearing pcf_block, doing the wakeups, etc.
163 */
164
165 #define MAX_PCF_FANOUT NCPU
166 static uint_t pcf_fanout = 1; /* Will get changed at boot time */
167 static uint_t pcf_fanout_mask = 0;
168
169 struct pcf {
170 kmutex_t pcf_lock; /* protects the structure */
171 uint_t pcf_count; /* page count */
172 uint_t pcf_wait; /* number of waiters */
173 uint_t pcf_block; /* pcgs flag to page_free() */
174 uint_t pcf_reserve; /* pages freed after pcf_block set */
175 uint_t pcf_fill[10]; /* to line up on the caches */
176 };
177
178 /*
179  * PCF_INDEX hash needs to be dynamic (every so often the hash changes which
180  * pcf bucket a cpu hashes to). This is done to prevent a drain condition
181  * from happening. This drain condition occurs when the pcf_count decrement
182  * always happens on cpu A and the pcf_count increment always happens on cpu
183  * B. An example of this shows up with device interrupts. The dma buffer is
184  * allocated by the cpu requesting the IO, so the pcf_count is decremented
185  * based on that cpu. When the memory is returned by the interrupt thread,
186  * the pcf_count will be incremented based on the cpu servicing the interrupt.
187 */
188 static struct pcf pcf[MAX_PCF_FANOUT];
189 #define PCF_INDEX() ((int)(((long)CPU->cpu_seqid) + \
190 (randtick() >> 24)) & (pcf_fanout_mask))
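
/*
 * A hypothetical sketch of how a consumer picks a starting bucket with
 * PCF_INDEX() and honors the one-lock-at-a-time rule described above
 * (see pcf_decrement_bucket() for the real implementation; p and npages
 * are illustrative locals):
 *
 *    struct pcf *p = &pcf[PCF_INDEX()];
 *    mutex_enter(&p->pcf_lock);
 *    if (p->pcf_count >= npages) {
 *        p->pcf_count -= npages;
 *        mutex_exit(&p->pcf_lock);
 *        return (1);
 *    }
 *    mutex_exit(&p->pcf_lock);
 *    return (0);
 */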
191
192 static int pcf_decrement_bucket(pgcnt_t);
193 static int pcf_decrement_multiple(pgcnt_t *, pgcnt_t, int);
194
195 kmutex_t pcgs_lock; /* serializes page_create_get_ */
196 kmutex_t pcgs_cagelock; /* serializes NOSLEEP cage allocs */
197 kmutex_t pcgs_wait_lock; /* used for delay in pcgs */
198 static kcondvar_t pcgs_cv; /* cv for delay in pcgs */
199
200 #ifdef VM_STATS
201
202 /*
203 * No locks, but so what, they are only statistics.
204 */
205
206 static struct page_tcnt {
207 int pc_free_cache; /* free's into cache list */
208 int pc_free_dontneed; /* free's with dontneed */
209 int pc_free_pageout; /* free's from pageout */
210 int pc_free_free; /* free's into free list */
211 int pc_free_pages; /* free's into large page free list */
212 int pc_destroy_pages; /* large page destroy's */
213 int pc_get_cache; /* get's from cache list */
214 int pc_get_free; /* get's from free list */
215 int pc_reclaim; /* reclaim's */
216 int pc_abortfree; /* abort's of free pages */
217 int pc_find_hit; /* find's that find page */
218 int pc_find_miss; /* find's that don't find page */
219 int pc_destroy_free; /* # of free pages destroyed */
220 #define PC_HASH_CNT (4*PAGE_HASHAVELEN)
221 int pc_find_hashlen[PC_HASH_CNT+1];
222 int pc_addclaim_pages;
223 int pc_subclaim_pages;
224 int pc_free_replacement_page[2];
225 int pc_try_demote_pages[6];
226 int pc_demote_pages[2];
227 } pagecnt;
228
229 uint_t hashin_count;
230 uint_t hashin_not_held;
231 uint_t hashin_already;
232
233 uint_t hashout_count;
234 uint_t hashout_not_held;
235
236 uint_t page_create_count;
237 uint_t page_create_not_enough;
238 uint_t page_create_not_enough_again;
239 uint_t page_create_zero;
240 uint_t page_create_hashout;
241 uint_t page_create_page_lock_failed;
242 uint_t page_create_trylock_failed;
243 uint_t page_create_found_one;
244 uint_t page_create_hashin_failed;
245 uint_t page_create_dropped_phm;
246
247 uint_t page_create_new;
248 uint_t page_create_exists;
249 uint_t page_create_putbacks;
250 uint_t page_create_overshoot;
251
252 uint_t page_reclaim_zero;
253 uint_t page_reclaim_zero_locked;
254
255 uint_t page_rename_exists;
256 uint_t page_rename_count;
257
258 uint_t page_lookup_cnt[20];
259 uint_t page_lookup_nowait_cnt[10];
260 uint_t page_find_cnt;
261 uint_t page_exists_cnt;
262 uint_t page_exists_forreal_cnt;
263 uint_t page_lookup_dev_cnt;
264 uint_t get_cachelist_cnt;
265 uint_t page_create_cnt[10];
266 uint_t alloc_pages[9];
267 uint_t page_exphcontg[19];
268 uint_t page_create_large_cnt[10];
269
270 /*
271 * Collects statistics.
272 */
273 #define PAGE_HASH_SEARCH(index, pp, vp, off) { \
274 uint_t mylen = 0; \
275 \
276 for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash, mylen++) { \
277 if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \
278 break; \
279 } \
280 if ((pp) != NULL) \
281 pagecnt.pc_find_hit++; \
282 else \
283 pagecnt.pc_find_miss++; \
284 if (mylen > PC_HASH_CNT) \
285 mylen = PC_HASH_CNT; \
286 pagecnt.pc_find_hashlen[mylen]++; \
287 }
288
289 #else /* VM_STATS */
290
291 /*
292 * Don't collect statistics
293 */
294 #define PAGE_HASH_SEARCH(index, pp, vp, off) { \
295 for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash) { \
296 if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \
297 break; \
298 } \
299 }
300
301 #endif /* VM_STATS */
302
303
304
305 #ifdef DEBUG
306 #define MEMSEG_SEARCH_STATS
307 #endif
308
309 #ifdef MEMSEG_SEARCH_STATS
310 struct memseg_stats {
311 uint_t nsearch;
312 uint_t nlastwon;
313 uint_t nhashwon;
314 uint_t nnotfound;
315 } memseg_stats;
316
317 #define MEMSEG_STAT_INCR(v) \
318 atomic_inc_32(&memseg_stats.v)
319 #else
320 #define MEMSEG_STAT_INCR(x)
321 #endif
322
323 struct memseg *memsegs; /* list of memory segments */
324
325 /*
326  * /etc/system tunable to control the large page allocation heuristic.
327  *
328  * Setting it to LPAP_LOCAL will heavily prefer the local lgroup over remote
329  * lgroups for large page allocation requests. If a large page is not readily
330  * available on the local freelists we will go through additional effort
331  * to create a large page, potentially moving smaller pages around to coalesce
332  * larger pages in the local lgroup.
333  * The default value, LPAP_DEFAULT, will go to remote freelists if large pages
334  * are not readily available in the local lgroup.
335 */
336 enum lpap {
337 LPAP_DEFAULT, /* default large page allocation policy */
338 LPAP_LOCAL /* local large page allocation policy */
339 };
340
341 enum lpap lpg_alloc_prefer = LPAP_DEFAULT;
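
/*
 * For example, a hypothetical /etc/system entry selecting the local policy
 * would be (1 == LPAP_LOCAL; the default 0 == LPAP_DEFAULT):
 *
 *    set lpg_alloc_prefer = 1
 */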
342
343 static void page_init_mem_config(void);
344 static int page_do_hashin(page_t *, vnode_t *, u_offset_t);
345 static void page_do_hashout(page_t *);
346 static void page_capture_init();
347 int page_capture_take_action(page_t *, uint_t, void *);
348
349 static void page_demote_vp_pages(page_t *);
350
351
352 void
353 pcf_init(void)
354
355 {
356 if (boot_ncpus != -1) {
357 pcf_fanout = boot_ncpus;
358 } else {
359 pcf_fanout = max_ncpus;
360 }
361 #ifdef sun4v
362 /*
363 * Force at least 4 buckets if possible for sun4v.
364 */
365 pcf_fanout = MAX(pcf_fanout, 4);
366 #endif /* sun4v */
367
368 /*
369 * Round up to the nearest power of 2.
370 */
371 pcf_fanout = MIN(pcf_fanout, MAX_PCF_FANOUT);
372 if (!ISP2(pcf_fanout)) {
373 pcf_fanout = 1 << highbit(pcf_fanout);
374
375 if (pcf_fanout > MAX_PCF_FANOUT) {
376 pcf_fanout = 1 << (highbit(MAX_PCF_FANOUT) - 1);
377 }
378 }
379 pcf_fanout_mask = pcf_fanout - 1;
380 }
381
382 /*
383 * vm subsystem related initialization
384 */
385 void
386 vm_init(void)
387 {
388 boolean_t callb_vm_cpr(void *, int);
389
390 (void) callb_add(callb_vm_cpr, 0, CB_CL_CPR_VM, "vm");
391 page_init_mem_config();
392 page_retire_init();
393 vm_usage_init();
394 page_capture_init();
395 }
396
397 /*
398 * This function is called at startup and when memory is added or deleted.
399 */
400 void
401 init_pages_pp_maximum()
402 {
403 static pgcnt_t p_min;
404 static pgcnt_t pages_pp_maximum_startup;
405 static pgcnt_t avrmem_delta;
406 static int init_done;
407 static int user_set; /* true if set in /etc/system */
408
409 if (init_done == 0) {
410
411 /* If the user specified a value, save it */
412 if (pages_pp_maximum != 0) {
413 user_set = 1;
414 pages_pp_maximum_startup = pages_pp_maximum;
415 }
416
417 /*
418  * The first time through, pages_pp_maximum is based
419 * on the value of availrmem just after the start-up
420 * allocations. To preserve this relationship at run
421 * time, use a delta from availrmem_initial.
422 */
423 ASSERT(availrmem_initial >= availrmem);
424 avrmem_delta = availrmem_initial - availrmem;
425
426 /* The allowable floor of pages_pp_maximum */
427 p_min = tune.t_minarmem + 100;
428
429 /* Make sure we don't come through here again. */
430 init_done = 1;
431 }
432 /*
433 * Determine pages_pp_maximum, the number of currently available
434 * pages (availrmem) that can't be `locked'. If not set by
435 * the user, we set it to 4% of the currently available memory
436 * plus 4MB.
437 * But we also insist that it be greater than tune.t_minarmem;
438 * otherwise a process could lock down a lot of memory, get swapped
439 * out, and never have enough to get swapped back in.
440 */
441 if (user_set)
442 pages_pp_maximum = pages_pp_maximum_startup;
443 else
444 pages_pp_maximum = ((availrmem_initial - avrmem_delta) / 25)
445 + btop(4 * 1024 * 1024);
446
447 if (pages_pp_maximum <= p_min) {
448 pages_pp_maximum = p_min;
449 }
450 }
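
/*
 * Rough worked example of the formula above (assuming 4 KB pages and no
 * /etc/system override): if availrmem at the time of the first call is
 * 2097152 pages (8 GB), then pages_pp_maximum = 2097152 / 25 + btop(4 MB)
 * = 83886 + 1024 = 84910 pages, or roughly 332 MB.
 */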
451
452 void
453 set_max_page_get(pgcnt_t target_total_pages)
454 {
455 max_page_get = target_total_pages / 2;
456 }
457
458 static pgcnt_t pending_delete;
459
460 /*ARGSUSED*/
461 static void
462 page_mem_config_post_add(
463 void *arg,
464 pgcnt_t delta_pages)
465 {
466 set_max_page_get(total_pages - pending_delete);
467 init_pages_pp_maximum();
468 }
469
470 /*ARGSUSED*/
471 static int
472 page_mem_config_pre_del(
473 void *arg,
474 pgcnt_t delta_pages)
475 {
476 pgcnt_t nv;
477
478 nv = atomic_add_long_nv(&pending_delete, (spgcnt_t)delta_pages);
479 set_max_page_get(total_pages - nv);
480 return (0);
481 }
482
483 /*ARGSUSED*/
484 static void
485 page_mem_config_post_del(
486 void *arg,
487 pgcnt_t delta_pages,
488 int cancelled)
489 {
490 pgcnt_t nv;
491
492 nv = atomic_add_long_nv(&pending_delete, -(spgcnt_t)delta_pages);
493 set_max_page_get(total_pages - nv);
494 if (!cancelled)
495 init_pages_pp_maximum();
496 }
497
498 static kphysm_setup_vector_t page_mem_config_vec = {
499 KPHYSM_SETUP_VECTOR_VERSION,
500 page_mem_config_post_add,
501 page_mem_config_pre_del,
502 page_mem_config_post_del,
503 };
504
505 static void
506 page_init_mem_config(void)
507 {
508 int ret;
509
510 ret = kphysm_setup_func_register(&page_mem_config_vec, (void *)NULL);
511 ASSERT(ret == 0);
512 }
513
514 /*
515 * Evenly spread out the PCF counters for large free pages
516 */
517 static void
518 page_free_large_ctr(pgcnt_t npages)
519 {
520 static struct pcf *p = pcf;
521 pgcnt_t lump;
522
523 freemem += npages;
524
525 lump = roundup(npages, pcf_fanout) / pcf_fanout;
526
527 while (npages > 0) {
528
529 ASSERT(!p->pcf_block);
530
531 if (lump < npages) {
532 p->pcf_count += (uint_t)lump;
533 npages -= lump;
534 } else {
535 p->pcf_count += (uint_t)npages;
536 npages = 0;
537 }
538
539 ASSERT(!p->pcf_wait);
540
541 if (++p > &pcf[pcf_fanout - 1])
542 p = pcf;
543 }
544
545 ASSERT(npages == 0);
546 }
547
548 /*
549 * Add a physical chunk of memory to the system free lists during startup.
550 * Platform specific startup() allocates the memory for the page structs.
551 *
552 * num - number of page structures
553 * base - page number (pfn) to be associated with the first page.
554 *
555  * Since we are doing this during startup (i.e., single-threaded), we will
556 * use shortcut routines to avoid any locking overhead while putting all
557 * these pages on the freelists.
558 *
559  * NOTE: Any changes performed to page_free() must also be performed to
560 * add_physmem() since this is how we initialize all page_t's at
561 * boot time.
562 */
563 void
564 add_physmem(
565 page_t *pp,
566 pgcnt_t num,
567 pfn_t pnum)
568 {
569 page_t *root = NULL;
570 uint_t szc = page_num_pagesizes() - 1;
571 pgcnt_t large = page_get_pagecnt(szc);
572 pgcnt_t cnt = 0;
573
574 TRACE_2(TR_FAC_VM, TR_PAGE_INIT,
575 "add_physmem:pp %p num %lu", pp, num);
576
577 /*
578 * Arbitrarily limit the max page_get request
579 * to 1/2 of the page structs we have.
580 */
581 total_pages += num;
582 set_max_page_get(total_pages);
583
584 PLCNT_MODIFY_MAX(pnum, (long)num);
585
586 /*
587 * The physical space for the pages array
588 * representing ram pages has already been
589 * allocated. Here we initialize each lock
590 * in the page structure, and put each on
591 * the free list
592 */
593 for (; num; pp++, pnum++, num--) {
594
595 /*
596 * this needs to fill in the page number
597 * and do any other arch specific initialization
598 */
599 add_physmem_cb(pp, pnum);
600
601 pp->p_lckcnt = 0;
602 pp->p_cowcnt = 0;
603 pp->p_slckcnt = 0;
604
605 /*
606 * Initialize the page lock as unlocked, since nobody
607 * can see or access this page yet.
608 */
609 pp->p_selock = 0;
610
611 /*
612 * Initialize IO lock
613 */
614 page_iolock_init(pp);
615
616 /*
617 * initialize other fields in the page_t
618 */
619 PP_SETFREE(pp);
620 page_clr_all_props(pp);
621 PP_SETAGED(pp);
622 pp->p_offset = (u_offset_t)-1;
623 pp->p_next = pp;
624 pp->p_prev = pp;
625
626 /*
627 * Simple case: System doesn't support large pages.
628 */
629 if (szc == 0) {
630 pp->p_szc = 0;
631 page_free_at_startup(pp);
632 continue;
633 }
634
635 /*
636  * Handle unaligned pages: we collect them up onto
637 * the root page until we have a full large page.
638 */
639 if (!IS_P2ALIGNED(pnum, large)) {
640
641 /*
642 * If not in a large page,
643 * just free as small page.
644 */
645 if (root == NULL) {
646 pp->p_szc = 0;
647 page_free_at_startup(pp);
648 continue;
649 }
650
651 /*
652 * Link a constituent page into the large page.
653 */
654 pp->p_szc = szc;
655 page_list_concat(&root, &pp);
656
657 /*
658 * When large page is fully formed, free it.
659 */
660 if (++cnt == large) {
661 page_free_large_ctr(cnt);
662 page_list_add_pages(root, PG_LIST_ISINIT);
663 root = NULL;
664 cnt = 0;
665 }
666 continue;
667 }
668
669 /*
670 * At this point we have a page number which
671 * is aligned. We assert that we aren't already
672 * in a different large page.
673 */
674 ASSERT(IS_P2ALIGNED(pnum, large));
675 ASSERT(root == NULL && cnt == 0);
676
677 /*
678 * If insufficient number of pages left to form
679 * a large page, just free the small page.
680 */
681 if (num < large) {
682 pp->p_szc = 0;
683 page_free_at_startup(pp);
684 continue;
685 }
686
687 /*
688 * Otherwise start a new large page.
689 */
690 pp->p_szc = szc;
691 cnt++;
692 root = pp;
693 }
694 ASSERT(root == NULL && cnt == 0);
695 }
696
697 /*
698 * Find a page representing the specified [vp, offset].
699  * If we find the page but it is in transit coming in,
700 * it will have an "exclusive" lock and we wait for
701 * the i/o to complete. A page found on the free list
702 * is always reclaimed and then locked. On success, the page
703 * is locked, its data is valid and it isn't on the free
704 * list, while a NULL is returned if the page doesn't exist.
705 */
706 page_t *
707 page_lookup(vnode_t *vp, u_offset_t off, se_t se)
708 {
709 return (page_lookup_create(vp, off, se, NULL, NULL, 0));
710 }
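
/*
 * Illustrative (hypothetical) lookup of an existing page, e.g. from a
 * getpage or klustering path:
 *
 *    pp = page_lookup(vp, off, SE_SHARED);
 *    if (pp != NULL) {
 *        ... read or map the page ...
 *        page_unlock(pp);
 *    }
 */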
711
712 /*
713 * Find a page representing the specified [vp, offset].
714 * We either return the one we found or, if passed in,
715 * create one with identity of [vp, offset] of the
716 * pre-allocated page. If we find existing page but it is
717  * in transit coming in, it will have an "exclusive" lock
718 * and we wait for the i/o to complete. A page found on
719 * the free list is always reclaimed and then locked.
720 * On success, the page is locked, its data is valid and
721 * it isn't on the free list, while a NULL is returned
722  * if the page doesn't exist and newpp is NULL.
723 */
724 page_t *
725 page_lookup_create(
726 vnode_t *vp,
727 u_offset_t off,
728 se_t se,
729 page_t *newpp,
730 spgcnt_t *nrelocp,
731 int flags)
732 {
733 page_t *pp;
734 kmutex_t *phm;
735 ulong_t index;
736 uint_t hash_locked;
737 uint_t es;
738
739 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
740 VM_STAT_ADD(page_lookup_cnt[0]);
741 ASSERT(newpp ? PAGE_EXCL(newpp) : 1);
742
743 /*
744 * Acquire the appropriate page hash lock since
745 * we have to search the hash list. Pages that
746 * hash to this list can't change identity while
747 * this lock is held.
748 */
749 hash_locked = 0;
750 index = PAGE_HASH_FUNC(vp, off);
751 phm = NULL;
752 top:
753 PAGE_HASH_SEARCH(index, pp, vp, off);
754 if (pp != NULL) {
755 VM_STAT_ADD(page_lookup_cnt[1]);
756 es = (newpp != NULL) ? 1 : 0;
757 es |= flags;
758 if (!hash_locked) {
759 VM_STAT_ADD(page_lookup_cnt[2]);
760 if (!page_try_reclaim_lock(pp, se, es)) {
761 /*
762 * On a miss, acquire the phm. Then
763 * next time, page_lock() will be called,
764 * causing a wait if the page is busy.
765  * Just looping with page_trylock() would
766 * get pretty boring.
767 */
768 VM_STAT_ADD(page_lookup_cnt[3]);
769 phm = PAGE_HASH_MUTEX(index);
770 mutex_enter(phm);
771 hash_locked = 1;
772 goto top;
773 }
774 } else {
775 VM_STAT_ADD(page_lookup_cnt[4]);
776 if (!page_lock_es(pp, se, phm, P_RECLAIM, es)) {
777 VM_STAT_ADD(page_lookup_cnt[5]);
778 goto top;
779 }
780 }
781
782 /*
783 * Since `pp' is locked it can not change identity now.
784 * Reconfirm we locked the correct page.
785 *
786 * Both the p_vnode and p_offset *must* be cast volatile
787 * to force a reload of their values: The PAGE_HASH_SEARCH
788 * macro will have stuffed p_vnode and p_offset into
789 * registers before calling page_trylock(); another thread,
790 * actually holding the hash lock, could have changed the
791 * page's identity in memory, but our registers would not
792 * be changed, fooling the reconfirmation. If the hash
793 * lock was held during the search, the casting would
794 * not be needed.
795 */
796 VM_STAT_ADD(page_lookup_cnt[6]);
797 if (((volatile struct vnode *)(pp->p_vnode) != vp) ||
798 ((volatile u_offset_t)(pp->p_offset) != off)) {
799 VM_STAT_ADD(page_lookup_cnt[7]);
800 if (hash_locked) {
801 panic("page_lookup_create: lost page %p",
802 (void *)pp);
803 /*NOTREACHED*/
804 }
805 page_unlock(pp);
806 phm = PAGE_HASH_MUTEX(index);
807 mutex_enter(phm);
808 hash_locked = 1;
809 goto top;
810 }
811
812 /*
813 * If page_trylock() was called, then pp may still be on
814 * the cachelist (can't be on the free list, it would not
815 * have been found in the search). If it is on the
816 * cachelist it must be pulled now. To pull the page from
817 * the cachelist, it must be exclusively locked.
818 *
819 * The other big difference between page_trylock() and
820 * page_lock(), is that page_lock() will pull the
821 * page from whatever free list (the cache list in this
822 * case) the page is on. If page_trylock() was used
823 * above, then we have to do the reclaim ourselves.
824 */
825 if ((!hash_locked) && (PP_ISFREE(pp))) {
826 ASSERT(PP_ISAGED(pp) == 0);
827 VM_STAT_ADD(page_lookup_cnt[8]);
828
829 /*
830  * page_reclaim will ensure that we
831  * have this page exclusively
832 */
833
834 if (!page_reclaim(pp, NULL)) {
835 /*
836 * Page_reclaim dropped whatever lock
837 * we held.
838 */
839 VM_STAT_ADD(page_lookup_cnt[9]);
840 phm = PAGE_HASH_MUTEX(index);
841 mutex_enter(phm);
842 hash_locked = 1;
843 goto top;
844 } else if (se == SE_SHARED && newpp == NULL) {
845 VM_STAT_ADD(page_lookup_cnt[10]);
846 page_downgrade(pp);
847 }
848 }
849
850 if (hash_locked) {
851 mutex_exit(phm);
852 }
853
854 if (newpp != NULL && pp->p_szc < newpp->p_szc &&
855 PAGE_EXCL(pp) && nrelocp != NULL) {
856 ASSERT(nrelocp != NULL);
857 (void) page_relocate(&pp, &newpp, 1, 1, nrelocp,
858 NULL);
859 if (*nrelocp > 0) {
860 VM_STAT_COND_ADD(*nrelocp == 1,
861 page_lookup_cnt[11]);
862 VM_STAT_COND_ADD(*nrelocp > 1,
863 page_lookup_cnt[12]);
864 pp = newpp;
865 se = SE_EXCL;
866 } else {
867 if (se == SE_SHARED) {
868 page_downgrade(pp);
869 }
870 VM_STAT_ADD(page_lookup_cnt[13]);
871 }
872 } else if (newpp != NULL && nrelocp != NULL) {
873 if (PAGE_EXCL(pp) && se == SE_SHARED) {
874 page_downgrade(pp);
875 }
876 VM_STAT_COND_ADD(pp->p_szc < newpp->p_szc,
877 page_lookup_cnt[14]);
878 VM_STAT_COND_ADD(pp->p_szc == newpp->p_szc,
879 page_lookup_cnt[15]);
880 VM_STAT_COND_ADD(pp->p_szc > newpp->p_szc,
881 page_lookup_cnt[16]);
882 } else if (newpp != NULL && PAGE_EXCL(pp)) {
883 se = SE_EXCL;
884 }
885 } else if (!hash_locked) {
886 VM_STAT_ADD(page_lookup_cnt[17]);
887 phm = PAGE_HASH_MUTEX(index);
888 mutex_enter(phm);
889 hash_locked = 1;
890 goto top;
891 } else if (newpp != NULL) {
892 /*
893 * If we have a preallocated page then
894 * insert it now and basically behave like
895 * page_create.
896 */
897 VM_STAT_ADD(page_lookup_cnt[18]);
898 /*
899 * Since we hold the page hash mutex and
900 * just searched for this page, page_hashin
901 * had better not fail. If it does, that
902 * means some thread did not follow the
903 * page hash mutex rules. Panic now and
904 * get it over with. As usual, go down
905 * holding all the locks.
906 */
907 ASSERT(MUTEX_HELD(phm));
908 if (!page_hashin(newpp, vp, off, phm)) {
909 ASSERT(MUTEX_HELD(phm));
910 panic("page_lookup_create: hashin failed %p %p %llx %p",
911 (void *)newpp, (void *)vp, off, (void *)phm);
912 /*NOTREACHED*/
913 }
914 ASSERT(MUTEX_HELD(phm));
915 mutex_exit(phm);
916 phm = NULL;
917 page_set_props(newpp, P_REF);
918 page_io_lock(newpp);
919 pp = newpp;
920 se = SE_EXCL;
921 } else {
922 VM_STAT_ADD(page_lookup_cnt[19]);
923 mutex_exit(phm);
924 }
925
926 ASSERT(pp ? PAGE_LOCKED_SE(pp, se) : 1);
927
928 ASSERT(pp ? ((PP_ISFREE(pp) == 0) && (PP_ISAGED(pp) == 0)) : 1);
929
930 return (pp);
931 }
932
933 /*
934 * Search the hash list for the page representing the
935 * specified [vp, offset] and return it locked. Skip
936 * free pages and pages that cannot be locked as requested.
937 * Used while attempting to kluster pages.
938 */
939 page_t *
940 page_lookup_nowait(vnode_t *vp, u_offset_t off, se_t se)
941 {
942 page_t *pp;
943 kmutex_t *phm;
944 ulong_t index;
945 uint_t locked;
946
947 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
948 VM_STAT_ADD(page_lookup_nowait_cnt[0]);
949
950 index = PAGE_HASH_FUNC(vp, off);
951 PAGE_HASH_SEARCH(index, pp, vp, off);
952 locked = 0;
953 if (pp == NULL) {
954 top:
955 VM_STAT_ADD(page_lookup_nowait_cnt[1]);
956 locked = 1;
957 phm = PAGE_HASH_MUTEX(index);
958 mutex_enter(phm);
959 PAGE_HASH_SEARCH(index, pp, vp, off);
960 }
961
962 if (pp == NULL || PP_ISFREE(pp)) {
963 VM_STAT_ADD(page_lookup_nowait_cnt[2]);
964 pp = NULL;
965 } else {
966 if (!page_trylock(pp, se)) {
967 VM_STAT_ADD(page_lookup_nowait_cnt[3]);
968 pp = NULL;
969 } else {
970 VM_STAT_ADD(page_lookup_nowait_cnt[4]);
971 /*
972 * See the comment in page_lookup()
973 */
974 if (((volatile struct vnode *)(pp->p_vnode) != vp) ||
975 ((u_offset_t)(pp->p_offset) != off)) {
976 VM_STAT_ADD(page_lookup_nowait_cnt[5]);
977 if (locked) {
978 panic("page_lookup_nowait %p",
979 (void *)pp);
980 /*NOTREACHED*/
981 }
982 page_unlock(pp);
983 goto top;
984 }
985 if (PP_ISFREE(pp)) {
986 VM_STAT_ADD(page_lookup_nowait_cnt[6]);
987 page_unlock(pp);
988 pp = NULL;
989 }
990 }
991 }
992 if (locked) {
993 VM_STAT_ADD(page_lookup_nowait_cnt[7]);
994 mutex_exit(phm);
995 }
996
997 ASSERT(pp ? PAGE_LOCKED_SE(pp, se) : 1);
998
999 return (pp);
1000 }
1001
1002 /*
1003 * Search the hash list for a page with the specified [vp, off]
1004 * that is known to exist and is already locked. This routine
1005 * is typically used by segment SOFTUNLOCK routines.
1006 */
1007 page_t *
1008 page_find(vnode_t *vp, u_offset_t off)
1009 {
1010 page_t *pp;
1011 kmutex_t *phm;
1012 ulong_t index;
1013
1014 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
1015 VM_STAT_ADD(page_find_cnt);
1016
1017 index = PAGE_HASH_FUNC(vp, off);
1018 phm = PAGE_HASH_MUTEX(index);
1019
1020 mutex_enter(phm);
1021 PAGE_HASH_SEARCH(index, pp, vp, off);
1022 mutex_exit(phm);
1023
1024 ASSERT(pp == NULL || PAGE_LOCKED(pp) || panicstr);
1025 return (pp);
1026 }
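
/*
 * Illustrative (hypothetical) SOFTUNLOCK-style caller of page_find(); the
 * page is known to exist and was locked earlier in the fault path:
 *
 *    pp = page_find(vp, off);
 *    ASSERT(pp != NULL && PAGE_LOCKED(pp));
 *    if (rw == S_WRITE)
 *        hat_setrefmod(pp);
 *    else
 *        hat_setref(pp);
 *    page_unlock(pp);
 */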
1027
1028 /*
1029 * Determine whether a page with the specified [vp, off]
1030 * currently exists in the system. Obviously this should
1031 * only be considered as a hint since nothing prevents the
1032 * page from disappearing or appearing immediately after
1033 * the return from this routine. Subsequently, we don't
1034 * even bother to lock the list.
1035 */
1036 page_t *
1037 page_exists(vnode_t *vp, u_offset_t off)
1038 {
1039 page_t *pp;
1040 ulong_t index;
1041
1042 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
1043 VM_STAT_ADD(page_exists_cnt);
1044
1045 index = PAGE_HASH_FUNC(vp, off);
1046 PAGE_HASH_SEARCH(index, pp, vp, off);
1047
1048 return (pp);
1049 }
1050
1051 /*
1052 * Determine if physically contiguous pages exist for [vp, off] - [vp, off +
1053  * page_size(szc)) range. If they exist and ppa is not NULL, fill the ppa
1054  * array with these pages locked SHARED. If necessary reclaim pages from the
1055 * freelist. Return 1 if contiguous pages exist and 0 otherwise.
1056 *
1057  * If we fail to lock the pages we still return 1 if they exist and are
1058  * contiguous, but in that case the return value is just a hint and the ppa
1059  * array won't be filled. Callers should set ppa[0] to NULL to detect this.
1060 *
1061 * Returns 0 if pages don't exist or not physically contiguous.
1062 *
1063 * This routine doesn't work for anonymous(swapfs) pages.
1064 */
1065 int
1066 page_exists_physcontig(vnode_t *vp, u_offset_t off, uint_t szc, page_t *ppa[])
1067 {
1068 pgcnt_t pages;
1069 pfn_t pfn;
1070 page_t *rootpp;
1071 pgcnt_t i;
1072 pgcnt_t j;
1073 u_offset_t save_off = off;
1074 ulong_t index;
1075 kmutex_t *phm;
1076 page_t *pp;
1077 uint_t pszc;
1078 int loopcnt = 0;
1079
1080 ASSERT(szc != 0);
1081 ASSERT(vp != NULL);
1082 ASSERT(!IS_SWAPFSVP(vp));
1083 ASSERT(!VN_ISKAS(vp));
1084
1085 again:
1086 if (++loopcnt > 3) {
1087 VM_STAT_ADD(page_exphcontg[0]);
1088 return (0);
1089 }
1090
1091 index = PAGE_HASH_FUNC(vp, off);
1092 phm = PAGE_HASH_MUTEX(index);
1093
1094 mutex_enter(phm);
1095 PAGE_HASH_SEARCH(index, pp, vp, off);
1096 mutex_exit(phm);
1097
1098 VM_STAT_ADD(page_exphcontg[1]);
1099
1100 if (pp == NULL) {
1101 VM_STAT_ADD(page_exphcontg[2]);
1102 return (0);
1103 }
1104
1105 pages = page_get_pagecnt(szc);
1106 rootpp = pp;
1107 pfn = rootpp->p_pagenum;
1108
1109 if ((pszc = pp->p_szc) >= szc && ppa != NULL) {
1110 VM_STAT_ADD(page_exphcontg[3]);
1111 if (!page_trylock(pp, SE_SHARED)) {
1112 VM_STAT_ADD(page_exphcontg[4]);
1113 return (1);
1114 }
1115 /*
1116 * Also check whether p_pagenum was modified by DR.
1117 */
1118 if (pp->p_szc != pszc || pp->p_vnode != vp ||
1119 pp->p_offset != off || pp->p_pagenum != pfn) {
1120 VM_STAT_ADD(page_exphcontg[5]);
1121 page_unlock(pp);
1122 off = save_off;
1123 goto again;
1124 }
1125 /*
1126  * Since szc was non-zero and the vnode and offset matched after we
1127  * locked the page, it can't become free on us.
1128 */
1129 ASSERT(!PP_ISFREE(pp));
1130 if (!IS_P2ALIGNED(pfn, pages)) {
1131 page_unlock(pp);
1132 return (0);
1133 }
1134 ppa[0] = pp;
1135 pp++;
1136 off += PAGESIZE;
1137 pfn++;
1138 for (i = 1; i < pages; i++, pp++, off += PAGESIZE, pfn++) {
1139 if (!page_trylock(pp, SE_SHARED)) {
1140 VM_STAT_ADD(page_exphcontg[6]);
1141 pp--;
1142 while (i-- > 0) {
1143 page_unlock(pp);
1144 pp--;
1145 }
1146 ppa[0] = NULL;
1147 return (1);
1148 }
1149 if (pp->p_szc != pszc) {
1150 VM_STAT_ADD(page_exphcontg[7]);
1151 page_unlock(pp);
1152 pp--;
1153 while (i-- > 0) {
1154 page_unlock(pp);
1155 pp--;
1156 }
1157 ppa[0] = NULL;
1158 off = save_off;
1159 goto again;
1160 }
1161 /*
1162  * szc is the same as for the previously locked pages
1163  * with the right identity. Since this page had the correct
1164  * szc after we locked it, it can't get freed or destroyed
1165  * and therefore must have the expected identity.
1166 */
1167 ASSERT(!PP_ISFREE(pp));
1168 if (pp->p_vnode != vp ||
1169 pp->p_offset != off) {
1170 panic("page_exists_physcontig: "
1171 "large page identity doesn't match");
1172 }
1173 ppa[i] = pp;
1174 ASSERT(pp->p_pagenum == pfn);
1175 }
1176 VM_STAT_ADD(page_exphcontg[8]);
1177 ppa[pages] = NULL;
1178 return (1);
1179 } else if (pszc >= szc) {
1180 VM_STAT_ADD(page_exphcontg[9]);
1181 if (!IS_P2ALIGNED(pfn, pages)) {
1182 return (0);
1183 }
1184 return (1);
1185 }
1186
1187 if (!IS_P2ALIGNED(pfn, pages)) {
1188 VM_STAT_ADD(page_exphcontg[10]);
1189 return (0);
1190 }
1191
1192 if (page_numtomemseg_nolock(pfn) !=
1193 page_numtomemseg_nolock(pfn + pages - 1)) {
1194 VM_STAT_ADD(page_exphcontg[11]);
1195 return (0);
1196 }
1197
1198 /*
1199 * We loop up 4 times across pages to promote page size.
1200 * We're extra cautious to promote page size atomically with respect
1201 * to everybody else. But we can probably optimize into 1 loop if
1202 * this becomes an issue.
1203 */
1204
1205 for (i = 0; i < pages; i++, pp++, off += PAGESIZE, pfn++) {
1206 if (!page_trylock(pp, SE_EXCL)) {
1207 VM_STAT_ADD(page_exphcontg[12]);
1208 break;
1209 }
1210 /*
1211 * Check whether p_pagenum was modified by DR.
1212 */
1213 if (pp->p_pagenum != pfn) {
1214 page_unlock(pp);
1215 break;
1216 }
1217 if (pp->p_vnode != vp ||
1218 pp->p_offset != off) {
1219 VM_STAT_ADD(page_exphcontg[13]);
1220 page_unlock(pp);
1221 break;
1222 }
1223 if (pp->p_szc >= szc) {
1224 ASSERT(i == 0);
1225 page_unlock(pp);
1226 off = save_off;
1227 goto again;
1228 }
1229 }
1230
1231 if (i != pages) {
1232 VM_STAT_ADD(page_exphcontg[14]);
1233 --pp;
1234 while (i-- > 0) {
1235 page_unlock(pp);
1236 --pp;
1237 }
1238 return (0);
1239 }
1240
1241 pp = rootpp;
1242 for (i = 0; i < pages; i++, pp++) {
1243 if (PP_ISFREE(pp)) {
1244 VM_STAT_ADD(page_exphcontg[15]);
1245 ASSERT(!PP_ISAGED(pp));
1246 ASSERT(pp->p_szc == 0);
1247 if (!page_reclaim(pp, NULL)) {
1248 break;
1249 }
1250 } else {
1251 ASSERT(pp->p_szc < szc);
1252 VM_STAT_ADD(page_exphcontg[16]);
1253 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
1254 }
1255 }
1256 if (i < pages) {
1257 VM_STAT_ADD(page_exphcontg[17]);
1258 /*
1259 * page_reclaim failed because we were out of memory.
1260  * Drop the rest of the locks and return because this page
1261  * must already be reallocated anyway.
1262 */
1263 pp = rootpp;
1264 for (j = 0; j < pages; j++, pp++) {
1265 if (j != i) {
1266 page_unlock(pp);
1267 }
1268 }
1269 return (0);
1270 }
1271
1272 off = save_off;
1273 pp = rootpp;
1274 for (i = 0; i < pages; i++, pp++, off += PAGESIZE) {
1275 ASSERT(PAGE_EXCL(pp));
1276 ASSERT(!PP_ISFREE(pp));
1277 ASSERT(!hat_page_is_mapped(pp));
1278 ASSERT(pp->p_vnode == vp);
1279 ASSERT(pp->p_offset == off);
1280 pp->p_szc = szc;
1281 }
1282 pp = rootpp;
1283 for (i = 0; i < pages; i++, pp++) {
1284 if (ppa == NULL) {
1285 page_unlock(pp);
1286 } else {
1287 ppa[i] = pp;
1288 page_downgrade(ppa[i]);
1289 }
1290 }
1291 if (ppa != NULL) {
1292 ppa[pages] = NULL;
1293 }
1294 VM_STAT_ADD(page_exphcontg[18]);
1295 ASSERT(vp->v_pages != NULL);
1296 return (1);
1297 }
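
/*
 * Illustrative (hypothetical) caller pattern for page_exists_physcontig(),
 * using the ppa[0] == NULL convention described above to tell a real fill
 * from a hint-only return:
 *
 *    ppa[0] = NULL;
 *    if (page_exists_physcontig(vp, off, szc, ppa)) {
 *        if (ppa[0] != NULL) {
 *            ... the pages are in ppa[], locked SE_SHARED ...
 *        } else {
 *            ... pages exist and are contiguous, but the
 *                locks could not be taken (hint only) ...
 *        }
 *    }
 */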
1298
1299 /*
1300 * Determine whether a page with the specified [vp, off]
1301 * currently exists in the system and if so return its
1302 * size code. Obviously this should only be considered as
1303 * a hint since nothing prevents the page from disappearing
1304 * or appearing immediately after the return from this routine.
1305 */
1306 int
1307 page_exists_forreal(vnode_t *vp, u_offset_t off, uint_t *szc)
1308 {
1309 page_t *pp;
1310 kmutex_t *phm;
1311 ulong_t index;
1312 int rc = 0;
1313
1314 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
1315 ASSERT(szc != NULL);
1316 VM_STAT_ADD(page_exists_forreal_cnt);
1317
1318 index = PAGE_HASH_FUNC(vp, off);
1319 phm = PAGE_HASH_MUTEX(index);
1320
1321 mutex_enter(phm);
1322 PAGE_HASH_SEARCH(index, pp, vp, off);
1323 if (pp != NULL) {
1324 *szc = pp->p_szc;
1325 rc = 1;
1326 }
1327 mutex_exit(phm);
1328 return (rc);
1329 }
1330
1331 /* wakeup threads waiting for pages in page_create_get_something() */
1332 void
1333 wakeup_pcgs(void)
1334 {
1335 if (!CV_HAS_WAITERS(&pcgs_cv))
1336 return;
1337 cv_broadcast(&pcgs_cv);
1338 }
1339
1340 /*
1341 * 'freemem' is used all over the kernel as an indication of how many
1342 * pages are free (either on the cache list or on the free page list)
1343 * in the system. In very few places is a really accurate 'freemem'
1344  * needed. To avoid contention on the lock protecting the
1345  * single freemem, it was spread out into NCPU buckets. set_freemem()
1346 * sets freemem to the total of all NCPU buckets. It is called from
1347 * clock() on each TICK.
1348 */
1349 void
1350 set_freemem()
1351 {
1352 struct pcf *p;
1353 ulong_t t;
1354 uint_t i;
1355
1356 t = 0;
1357 p = pcf;
1358 for (i = 0; i < pcf_fanout; i++) {
1359 t += p->pcf_count;
1360 p++;
1361 }
1362 freemem = t;
1363
1364 /*
1365 * Don't worry about grabbing mutex. It's not that
1366 * critical if we miss a tick or two. This is
1367 * where we wakeup possible delayers in
1368 * page_create_get_something().
1369 */
1370 wakeup_pcgs();
1371 }
1372
1373 ulong_t
1374 get_freemem()
1375 {
1376 struct pcf *p;
1377 ulong_t t;
1378 uint_t i;
1379
1380 t = 0;
1381 p = pcf;
1382 for (i = 0; i < pcf_fanout; i++) {
1383 t += p->pcf_count;
1384 p++;
1385 }
1386 /*
1387 * We just calculated it, might as well set it.
1388 */
1389 freemem = t;
1390 return (t);
1391 }
1392
1393 /*
1394 * Acquire all of the page cache & free (pcf) locks.
1395 */
1396 void
1397 pcf_acquire_all()
1398 {
1399 struct pcf *p;
1400 uint_t i;
1401
1402 p = pcf;
1403 for (i = 0; i < pcf_fanout; i++) {
1404 mutex_enter(&p->pcf_lock);
1405 p++;
1406 }
1407 }
1408
1409 /*
1410 * Release all the pcf_locks.
1411 */
1412 void
1413 pcf_release_all()
1414 {
1415 struct pcf *p;
1416 uint_t i;
1417
1418 p = pcf;
1419 for (i = 0; i < pcf_fanout; i++) {
1420 mutex_exit(&p->pcf_lock);
1421 p++;
1422 }
1423 }
1424
1425 /*
1426 * Inform the VM system that we need some pages freed up.
1427 * Calls must be symmetric, e.g.:
1428 *
1429 * page_needfree(100);
1430 * wait a bit;
1431 * page_needfree(-100);
1432 */
1433 void
1434 page_needfree(spgcnt_t npages)
1435 {
1436 mutex_enter(&new_freemem_lock);
1437 needfree += npages;
1438 mutex_exit(&new_freemem_lock);
1439 }
1440
1441 /*
1442 * Throttle for page_create(): try to prevent freemem from dropping
1443 * below throttlefree. We can't provide a 100% guarantee because
1444 * KM_NOSLEEP allocations, page_reclaim(), and various other things
1445 * nibble away at the freelist. However, we can block all PG_WAIT
1446 * allocations until memory becomes available. The motivation is
1447 * that several things can fall apart when there's no free memory:
1448 *
1449 * (1) If pageout() needs memory to push a page, the system deadlocks.
1450 *
1451 * (2) By (broken) specification, timeout(9F) can neither fail nor
1452 * block, so it has no choice but to panic the system if it
1453 * cannot allocate a callout structure.
1454 *
1455 * (3) Like timeout(), ddi_set_callback() cannot fail and cannot block;
1456 * it panics if it cannot allocate a callback structure.
1457 *
1458 * (4) Untold numbers of third-party drivers have not yet been hardened
1459 * against KM_NOSLEEP and/or allocb() failures; they simply assume
1460 * success and panic the system with a data fault on failure.
1461 * (The long-term solution to this particular problem is to ship
1462 * hostile fault-injecting DEBUG kernels with the DDK.)
1463 *
1464 * It is theoretically impossible to guarantee success of non-blocking
1465 * allocations, but in practice, this throttle is very hard to break.
1466 */
1467 static int
1468 page_create_throttle(pgcnt_t npages, int flags)
1469 {
1470 ulong_t fm;
1471 uint_t i;
1472 pgcnt_t tf; /* effective value of throttlefree */
1473
1474 /*
1475 * Normal priority allocations.
1476 */
1477 if ((flags & (PG_WAIT | PG_NORMALPRI)) == PG_NORMALPRI) {
1478 ASSERT(!(flags & (PG_PANIC | PG_PUSHPAGE)));
1479 return (freemem >= npages + throttlefree);
1480 }
1481
1482 /*
1483 * Never deny pages when:
1484 * - it's a thread that cannot block [NOMEMWAIT()]
1485 * - the allocation cannot block and must not fail
1486 * - the allocation cannot block and is pageout dispensated
1487 */
1488 if (NOMEMWAIT() ||
1489 ((flags & (PG_WAIT | PG_PANIC)) == PG_PANIC) ||
1490 ((flags & (PG_WAIT | PG_PUSHPAGE)) == PG_PUSHPAGE))
1491 return (1);
1492
1493 /*
1494 * If the allocation can't block, we look favorably upon it
1495 * unless we're below pageout_reserve. In that case we fail
1496 * the allocation because we want to make sure there are a few
1497 * pages available for pageout.
1498 */
1499 if ((flags & PG_WAIT) == 0)
1500 return (freemem >= npages + pageout_reserve);
1501
1502 /* Calculate the effective throttlefree value */
1503 tf = throttlefree -
1504 ((flags & PG_PUSHPAGE) ? pageout_reserve : 0);
1505
1506 cv_signal(&proc_pageout->p_cv);
1507
1508 for (;;) {
1509 fm = 0;
1510 pcf_acquire_all();
1511 mutex_enter(&new_freemem_lock);
1512 for (i = 0; i < pcf_fanout; i++) {
1513 fm += pcf[i].pcf_count;
1514 pcf[i].pcf_wait++;
1515 mutex_exit(&pcf[i].pcf_lock);
1516 }
1517 freemem = fm;
1518 if (freemem >= npages + tf) {
1519 mutex_exit(&new_freemem_lock);
1520 break;
1521 }
1522 needfree += npages;
1523 freemem_wait++;
1524 cv_wait(&freemem_cv, &new_freemem_lock);
1525 freemem_wait--;
1526 needfree -= npages;
1527 mutex_exit(&new_freemem_lock);
1528 }
1529 return (1);
1530 }
1531
1532 /*
1533 * page_create_wait() is called to either coalesce pages from the
1534 * different pcf buckets or to wait because there simply are not
1535 * enough pages to satisfy the caller's request.
1536 *
1537 * Sadly, this is called from platform/vm/vm_machdep.c
1538 */
1539 int
1540 page_create_wait(pgcnt_t npages, uint_t flags)
1541 {
1542 pgcnt_t total;
1543 uint_t i;
1544 struct pcf *p;
1545
1546 /*
1547 * Wait until there are enough free pages to satisfy our
1548 * entire request.
1549 * We set needfree += npages before prodding pageout, to make sure
1550 * it does real work when npages > lotsfree > freemem.
1551 */
1552 VM_STAT_ADD(page_create_not_enough);
1553
1554 ASSERT(!kcage_on ? !(flags & PG_NORELOC) : 1);
1555 checkagain:
1556 if ((flags & PG_NORELOC) &&
1557 kcage_freemem < kcage_throttlefree + npages)
1558 (void) kcage_create_throttle(npages, flags);
1559
1560 if (freemem < npages + throttlefree)
1561 if (!page_create_throttle(npages, flags))
1562 return (0);
1563
1564 if (pcf_decrement_bucket(npages) ||
1565 pcf_decrement_multiple(&total, npages, 0))
1566 return (1);
1567
1568 /*
1569 * All of the pcf locks are held, there are not enough pages
1570 * to satisfy the request (npages < total).
1571 * Be sure to acquire the new_freemem_lock before dropping
1572 * the pcf locks. This prevents dropping wakeups in page_free().
1573 * The order is always pcf_lock then new_freemem_lock.
1574 *
1575 * Since we hold all the pcf locks, it is a good time to set freemem.
1576 *
1577 * If the caller does not want to wait, return now.
1578 * Else turn the pageout daemon loose to find something
1579 * and wait till it does.
1580 *
1581 */
1582 freemem = total;
1583
1584 if ((flags & PG_WAIT) == 0) {
1585 pcf_release_all();
1586
1587 TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_NOMEM,
1588 "page_create_nomem:npages %ld freemem %ld", npages, freemem);
1589 return (0);
1590 }
1591
1592 ASSERT(proc_pageout != NULL);
1593 cv_signal(&proc_pageout->p_cv);
1594
1595 TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SLEEP_START,
1596 "page_create_sleep_start: freemem %ld needfree %ld",
1597 freemem, needfree);
1598
1599 /*
1600 * We are going to wait.
1601 * We currently hold all of the pcf_locks,
1602 * get the new_freemem_lock (it protects freemem_wait),
1603 * before dropping the pcf_locks.
1604 */
1605 mutex_enter(&new_freemem_lock);
1606
1607 p = pcf;
1608 for (i = 0; i < pcf_fanout; i++) {
1609 p->pcf_wait++;
1610 mutex_exit(&p->pcf_lock);
1611 p++;
1612 }
1613
1614 needfree += npages;
1615 freemem_wait++;
1616
1617 cv_wait(&freemem_cv, &new_freemem_lock);
1618
1619 freemem_wait--;
1620 needfree -= npages;
1621
1622 mutex_exit(&new_freemem_lock);
1623
1624 TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SLEEP_END,
1625 "page_create_sleep_end: freemem %ld needfree %ld",
1626 freemem, needfree);
1627
1628 VM_STAT_ADD(page_create_not_enough_again);
1629 goto checkagain;
1630 }
1631 /*
1632 * A routine to do the opposite of page_create_wait().
1633 */
1634 void
1635 page_create_putback(spgcnt_t npages)
1636 {
1637 struct pcf *p;
1638 pgcnt_t lump;
1639 uint_t *which;
1640
1641 /*
1642 * When a contiguous lump is broken up, we have to
1643 * deal with lots of pages (min 64) so lets spread
1644 * the wealth around.
1645 */
1646 lump = roundup(npages, pcf_fanout) / pcf_fanout;
1647 freemem += npages;
1648
1649 for (p = pcf; (npages > 0) && (p < &pcf[pcf_fanout]); p++) {
1650 which = &p->pcf_count;
1651
1652 mutex_enter(&p->pcf_lock);
1653
1654 if (p->pcf_block) {
1655 which = &p->pcf_reserve;
1656 }
1657
1658 if (lump < npages) {
1659 *which += (uint_t)lump;
1660 npages -= lump;
1661 } else {
1662 *which += (uint_t)npages;
1663 npages = 0;
1664 }
1665
1666 if (p->pcf_wait) {
1667 mutex_enter(&new_freemem_lock);
1668 /*
1669 * Check to see if some other thread
1670 * is actually waiting. Another bucket
1671 * may have woken it up by now. If there
1672 * are no waiters, then set our pcf_wait
1673 * count to zero to avoid coming in here
1674 * next time.
1675 */
1676 if (freemem_wait) {
1677 if (npages > 1) {
1678 cv_broadcast(&freemem_cv);
1679 } else {
1680 cv_signal(&freemem_cv);
1681 }
1682 p->pcf_wait--;
1683 } else {
1684 p->pcf_wait = 0;
1685 }
1686 mutex_exit(&new_freemem_lock);
1687 }
1688 mutex_exit(&p->pcf_lock);
1689 }
1690 ASSERT(npages == 0);
1691 }
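
/*
 * A minimal sketch of the page_create_wait()/page_create_putback() pairing
 * (hypothetical caller; page_alloc_pages() below is a real one):
 *
 *    if (!page_create_wait(npgs, PG_WAIT))
 *        return (ENOMEM);
 *    ... pull pages from the freelists ...
 *    if (ngotten < npgs)
 *        page_create_putback(npgs - ngotten);
 */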
1692
1693 /*
1694 * A helper routine for page_create_get_something.
1695  * The indenting got too deep down there.
1696 * Unblock the pcf counters. Any pages freed after
1697 * pcf_block got set are moved to pcf_count and
1698 * wakeups (cv_broadcast() or cv_signal()) are done as needed.
1699 */
1700 static void
1701 pcgs_unblock(void)
1702 {
1703 int i;
1704 struct pcf *p;
1705
1706 /* Update freemem while we're here. */
1707 freemem = 0;
1708 p = pcf;
1709 for (i = 0; i < pcf_fanout; i++) {
1710 mutex_enter(&p->pcf_lock);
1711 ASSERT(p->pcf_count == 0);
1712 p->pcf_count = p->pcf_reserve;
1713 p->pcf_block = 0;
1714 freemem += p->pcf_count;
1715 if (p->pcf_wait) {
1716 mutex_enter(&new_freemem_lock);
1717 if (freemem_wait) {
1718 if (p->pcf_reserve > 1) {
1719 cv_broadcast(&freemem_cv);
1720 p->pcf_wait = 0;
1721 } else {
1722 cv_signal(&freemem_cv);
1723 p->pcf_wait--;
1724 }
1725 } else {
1726 p->pcf_wait = 0;
1727 }
1728 mutex_exit(&new_freemem_lock);
1729 }
1730 p->pcf_reserve = 0;
1731 mutex_exit(&p->pcf_lock);
1732 p++;
1733 }
1734 }
1735
1736 /*
1737 * Called from page_create_va() when both the cache and free lists
1738 * have been checked once.
1739 *
1740 * Either returns a page or panics since the accounting was done
1741 * way before we got here.
1742 *
1743 * We don't come here often, so leave the accounting on permanently.
1744 */
1745
1746 #define MAX_PCGS 100
1747
1748 #ifdef DEBUG
1749 #define PCGS_TRIES 100
1750 #else /* DEBUG */
1751 #define PCGS_TRIES 10
1752 #endif /* DEBUG */
1753
1754 #ifdef VM_STATS
1755 uint_t pcgs_counts[PCGS_TRIES];
1756 uint_t pcgs_too_many;
1757 uint_t pcgs_entered;
1758 uint_t pcgs_entered_noreloc;
1759 uint_t pcgs_locked;
1760 uint_t pcgs_cagelocked;
1761 #endif /* VM_STATS */
1762
1763 static page_t *
1764 page_create_get_something(vnode_t *vp, u_offset_t off, struct seg *seg,
1765 caddr_t vaddr, uint_t flags)
1766 {
1767 uint_t count;
1768 page_t *pp;
1769 uint_t locked, i;
1770 struct pcf *p;
1771 lgrp_t *lgrp;
1772 int cagelocked = 0;
1773
1774 VM_STAT_ADD(pcgs_entered);
1775
1776 /*
1777 * Tap any reserve freelists: if we fail now, we'll die
1778 * since the page(s) we're looking for have already been
1779 * accounted for.
1780 */
1781 flags |= PG_PANIC;
1782
1783 if ((flags & PG_NORELOC) != 0) {
1784 VM_STAT_ADD(pcgs_entered_noreloc);
1785 /*
1786 * Requests for free pages from critical threads
1787 * such as pageout still won't throttle here, but
1788 * we must try again, to give the cageout thread
1789 * another chance to catch up. Since we already
1790 * accounted for the pages, we had better get them
1791 * this time.
1792 *
1793 * N.B. All non-critical threads acquire the pcgs_cagelock
1794 * to serialize access to the freelists. This implements a
1795  * turnstile-type synchronization to avoid starvation of
1796 * critical requests for PG_NORELOC memory by non-critical
1797 * threads: all non-critical threads must acquire a 'ticket'
1798 * before passing through, which entails making sure
1799 * kcage_freemem won't fall below minfree prior to grabbing
1800 * pages from the freelists.
1801 */
1802 if (kcage_create_throttle(1, flags) == KCT_NONCRIT) {
1803 mutex_enter(&pcgs_cagelock);
1804 cagelocked = 1;
1805 VM_STAT_ADD(pcgs_cagelocked);
1806 }
1807 }
1808
1809 /*
1810 * Time to get serious.
1811 * We failed to get a `correctly colored' page from both the
1812 * free and cache lists.
1813 * We escalate in stage.
1814 *
1815  * First try both lists without worrying about color.
1816 *
1817 * Then, grab all page accounting locks (ie. pcf[]) and
1818 * steal any pages that they have and set the pcf_block flag to
1819 * stop deletions from the lists. This will help because
1820 * a page can get added to the free list while we are looking
1821 * at the cache list, then another page could be added to the cache
1822 * list allowing the page on the free list to be removed as we
1823 * move from looking at the cache list to the free list. This
1824 * could happen over and over. We would never find the page
1825 * we have accounted for.
1826 *
1827 * Noreloc pages are a subset of the global (relocatable) page pool.
1828 * They are not tracked separately in the pcf bins, so it is
1829 * impossible to know when doing pcf accounting if the available
1830 * page(s) are noreloc pages or not. When looking for a noreloc page
1831 * it is quite easy to end up here even if the global (relocatable)
1832 * page pool has plenty of free pages but the noreloc pool is empty.
1833 *
1834 * When the noreloc pool is empty (or low), additional noreloc pages
1835 * are created by converting pages from the global page pool. This
1836 * process will stall during pcf accounting if the pcf bins are
1837 * already locked. Such is the case when a noreloc allocation is
1838 * looping here in page_create_get_something waiting for more noreloc
1839 * pages to appear.
1840 *
1841 * Short of adding a new field to the pcf bins to accurately track
1842 * the number of free noreloc pages, we instead do not grab the
1843 * pcgs_lock, do not set the pcf blocks and do not timeout when
1844 * allocating a noreloc page. This allows noreloc allocations to
1845 * loop without blocking global page pool allocations.
1846 *
1847 * NOTE: the behaviour of page_create_get_something has not changed
1848 * for the case of global page pool allocations.
1849 */
1850
1851 flags &= ~PG_MATCH_COLOR;
1852 locked = 0;
1853 #if defined(__i386) || defined(__amd64)
1854 flags = page_create_update_flags_x86(flags);
1855 #endif
1856
1857 lgrp = lgrp_mem_choose(seg, vaddr, PAGESIZE);
1858
1859 for (count = 0; kcage_on || count < MAX_PCGS; count++) {
1860 pp = page_get_freelist(vp, off, seg, vaddr, PAGESIZE,
1861 flags, lgrp);
1862 if (pp == NULL) {
1863 pp = page_get_cachelist(vp, off, seg, vaddr,
1864 flags, lgrp);
1865 }
1866 if (pp == NULL) {
1867 /*
1868 * Serialize. Don't fight with other pcgs().
1869 */
1870 if (!locked && (!kcage_on || !(flags & PG_NORELOC))) {
1871 mutex_enter(&pcgs_lock);
1872 VM_STAT_ADD(pcgs_locked);
1873 locked = 1;
1874 p = pcf;
1875 for (i = 0; i < pcf_fanout; i++) {
1876 mutex_enter(&p->pcf_lock);
1877 ASSERT(p->pcf_block == 0);
1878 p->pcf_block = 1;
1879 p->pcf_reserve = p->pcf_count;
1880 p->pcf_count = 0;
1881 mutex_exit(&p->pcf_lock);
1882 p++;
1883 }
1884 freemem = 0;
1885 }
1886
1887 if (count) {
1888 /*
1889 * Since page_free() puts pages on
1890 * a list then accounts for it, we
1891 * just have to wait for page_free()
1892 * to unlock any page it was working
1893 * with. The page_lock()-page_reclaim()
1894 * path falls in the same boat.
1895 *
1896 * We don't need to check on the
1897 * PG_WAIT flag, we have already
1898 * accounted for the page we are
1899 * looking for in page_create_va().
1900 *
1901 * We just wait a moment to let any
1902 * locked pages on the lists free up,
1903 * then continue around and try again.
1904 *
1905 * Will be awakened by set_freemem().
1906 */
1907 mutex_enter(&pcgs_wait_lock);
1908 cv_wait(&pcgs_cv, &pcgs_wait_lock);
1909 mutex_exit(&pcgs_wait_lock);
1910 }
1911 } else {
1912 #ifdef VM_STATS
1913 if (count >= PCGS_TRIES) {
1914 VM_STAT_ADD(pcgs_too_many);
1915 } else {
1916 VM_STAT_ADD(pcgs_counts[count]);
1917 }
1918 #endif
1919 if (locked) {
1920 pcgs_unblock();
1921 mutex_exit(&pcgs_lock);
1922 }
1923 if (cagelocked)
1924 mutex_exit(&pcgs_cagelock);
1925 return (pp);
1926 }
1927 }
1928 /*
1929 * we go down holding the pcf locks.
1930 */
1931 panic("no %spage found %d",
1932 ((flags & PG_NORELOC) ? "non-reloc " : ""), count);
1933 /*NOTREACHED*/
1934 }
1935
1936 /*
1937 * Create enough pages for "bytes" worth of data starting at
1938 * "off" in "vp".
1939 *
1940 * Where flag must be one of:
1941 *
1942 * PG_EXCL: Exclusive create (fail if any page already
1943 * exists in the page cache) which does not
1944 * wait for memory to become available.
1945 *
1946 * PG_WAIT: Non-exclusive create which can wait for
1947 * memory to become available.
1948 *
1949 * PG_PHYSCONTIG: Allocate physically contiguous pages.
1950 * (Not Supported)
1951 *
1952 * A doubly linked list of pages is returned to the caller. Each page
1953 * on the list has the "exclusive" (p_selock) lock and "iolock" (p_iolock)
1954 * lock.
1955 *
1956 * Unable to change the parameters to page_create() in a minor release,
1957 * we renamed page_create() to page_create_va(), changed all known calls
1958 * from page_create() to page_create_va(), and created this wrapper.
1959 *
1960 * Upon a major release, we should break compatibility by deleting this
1961 * wrapper, and replacing all the strings "page_create_va", with "page_create".
1962 *
1963 * NOTE: There is a copy of this interface as page_create_io() in
1964 * i86/vm/vm_machdep.c. Any bugs fixed here should be applied
1965 * there.
1966 */
1967 page_t *
1968 page_create(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags)
1969 {
1970 caddr_t random_vaddr;
1971 struct seg kseg;
1972
1973 #ifdef DEBUG
1974 cmn_err(CE_WARN, "Using deprecated interface page_create: caller %p",
1975 (void *)caller());
1976 #endif
1977
1978 random_vaddr = (caddr_t)(((uintptr_t)vp >> 7) ^
1979 (uintptr_t)(off >> PAGESHIFT));
1980 kseg.s_as = &kas;
1981
1982 return (page_create_va(vp, off, bytes, flags, &kseg, random_vaddr));
1983 }
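
/*
 * Illustrative sketch of a typical page_create_va() caller (hypothetical
 * code, not part of this file).  The pages come back as a doubly linked
 * list through p_next/p_prev, each holding the "exclusive" lock and the
 * i/o lock, so the caller must release both when it is finished.  Here
 * vp, off, seg and addr are assumed to be set up by the caller, and
 * pagezero() just stands in for whatever initializes the contents:
 *
 *	page_t *plist, *pp;
 *
 *	plist = page_create_va(vp, off, PAGESIZE, PG_WAIT, seg, addr);
 *	while (plist != NULL) {
 *		pp = plist;
 *		page_sub(&plist, pp);
 *		pagezero(pp, 0, PAGESIZE);
 *		page_io_unlock(pp);
 *		page_unlock(pp);
 *	}
 */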
1984
1985 #ifdef DEBUG
1986 uint32_t pg_alloc_pgs_mtbf = 0;
1987 #endif
1988
1989 /*
1990 * Used for large page support. It will attempt to allocate
1991 * a large page(s) off the freelist.
1992 *
1993 * Returns non zero on failure.
1994 */
1995 int
1996 page_alloc_pages(struct vnode *vp, struct seg *seg, caddr_t addr,
1997 page_t **basepp, page_t *ppa[], uint_t szc, int anypgsz, int pgflags)
1998 {
1999 pgcnt_t npgs, curnpgs, totpgs;
2000 size_t pgsz;
2001 page_t *pplist = NULL, *pp;
2002 int err = 0;
2003 lgrp_t *lgrp;
2004
2005 ASSERT(szc != 0 && szc <= (page_num_pagesizes() - 1));
2006 ASSERT(pgflags == 0 || pgflags == PG_LOCAL);
2007
2008 /*
2009 * Check if system heavily prefers local large pages over remote
2010 * on systems with multiple lgroups.
2011 */
2012 if (lpg_alloc_prefer == LPAP_LOCAL && nlgrps > 1) {
2013 pgflags = PG_LOCAL;
2014 }
2015
2016 VM_STAT_ADD(alloc_pages[0]);
2017
2018 #ifdef DEBUG
2019 if (pg_alloc_pgs_mtbf && !(gethrtime() % pg_alloc_pgs_mtbf)) {
2020 return (ENOMEM);
2021 }
2022 #endif
2023
2024 	/*
2025 	 * Exactly one of basepp and ppa must be non-NULL;
2026 	 * the other must be NULL.
2027 	 */
2028 ASSERT(basepp != NULL || ppa != NULL);
2029 ASSERT(basepp == NULL || ppa == NULL);
2030
2031 #if defined(__i386) || defined(__amd64)
2032 while (page_chk_freelist(szc) == 0) {
2033 VM_STAT_ADD(alloc_pages[8]);
2034 if (anypgsz == 0 || --szc == 0)
2035 return (ENOMEM);
2036 }
2037 #endif
2038
2039 pgsz = page_get_pagesize(szc);
2040 totpgs = curnpgs = npgs = pgsz >> PAGESHIFT;
2041
2042 ASSERT(((uintptr_t)addr & (pgsz - 1)) == 0);
2043
2044 (void) page_create_wait(npgs, PG_WAIT);
2045
2046 while (npgs && szc) {
2047 lgrp = lgrp_mem_choose(seg, addr, pgsz);
2048 if (pgflags == PG_LOCAL) {
2049 pp = page_get_freelist(vp, 0, seg, addr, pgsz,
2050 pgflags, lgrp);
2051 if (pp == NULL) {
2052 pp = page_get_freelist(vp, 0, seg, addr, pgsz,
2053 0, lgrp);
2054 }
2055 } else {
2056 pp = page_get_freelist(vp, 0, seg, addr, pgsz,
2057 0, lgrp);
2058 }
2059 if (pp != NULL) {
2060 VM_STAT_ADD(alloc_pages[1]);
2061 page_list_concat(&pplist, &pp);
2062 ASSERT(npgs >= curnpgs);
2063 npgs -= curnpgs;
2064 } else if (anypgsz) {
2065 VM_STAT_ADD(alloc_pages[2]);
2066 szc--;
2067 pgsz = page_get_pagesize(szc);
2068 curnpgs = pgsz >> PAGESHIFT;
2069 } else {
2070 VM_STAT_ADD(alloc_pages[3]);
2071 ASSERT(npgs == totpgs);
2072 page_create_putback(npgs);
2073 return (ENOMEM);
2074 }
2075 }
2076 if (szc == 0) {
2077 VM_STAT_ADD(alloc_pages[4]);
2078 ASSERT(npgs != 0);
2079 page_create_putback(npgs);
2080 err = ENOMEM;
2081 } else if (basepp != NULL) {
2082 ASSERT(npgs == 0);
2083 ASSERT(ppa == NULL);
2084 *basepp = pplist;
2085 }
2086
2087 npgs = totpgs - npgs;
2088 pp = pplist;
2089
2090 /*
2091 * Clear the free and age bits. Also if we were passed in a ppa then
2092 * fill it in with all the constituent pages from the large page. But
2093 * if we failed to allocate all the pages just free what we got.
2094 */
2095 while (npgs != 0) {
2096 ASSERT(PP_ISFREE(pp));
2097 ASSERT(PP_ISAGED(pp));
2098 if (ppa != NULL || err != 0) {
2099 if (err == 0) {
2100 VM_STAT_ADD(alloc_pages[5]);
2101 PP_CLRFREE(pp);
2102 PP_CLRAGED(pp);
2103 page_sub(&pplist, pp);
2104 *ppa++ = pp;
2105 npgs--;
2106 } else {
2107 VM_STAT_ADD(alloc_pages[6]);
2108 ASSERT(pp->p_szc != 0);
2109 curnpgs = page_get_pagecnt(pp->p_szc);
2110 page_list_break(&pp, &pplist, curnpgs);
2111 page_list_add_pages(pp, 0);
2112 page_create_putback(curnpgs);
2113 ASSERT(npgs >= curnpgs);
2114 npgs -= curnpgs;
2115 }
2116 pp = pplist;
2117 } else {
2118 VM_STAT_ADD(alloc_pages[7]);
2119 PP_CLRFREE(pp);
2120 PP_CLRAGED(pp);
2121 pp = pp->p_next;
2122 npgs--;
2123 }
2124 }
2125 return (err);
2126 }
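
/*
 * Illustrative sketch of a page_alloc_pages() caller (hypothetical code,
 * not part of this file).  The "ppa" array is assumed to be supplied by
 * the caller and sized for one szc-sized large page worth of constituent
 * pages; a nonzero return is typically handled by falling back to
 * PAGESIZE pages:
 *
 *	pgcnt_t pages = page_get_pagecnt(szc);
 *	page_t **ppa = kmem_zalloc(pages * sizeof (page_t *), KM_SLEEP);
 *
 *	if (page_alloc_pages(vp, seg, addr, NULL, ppa, szc, 1, 0) != 0) {
 *		kmem_free(ppa, pages * sizeof (page_t *));
 *		return (ENOMEM);
 *	}
 */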
2127
2128 /*
2129 * Get a single large page off of the freelists, and set it up for use.
2130 * Number of bytes requested must be a supported page size.
2131 *
2132 * Note that this call may fail even if there is sufficient
2133 * memory available or PG_WAIT is set, so the caller must
2134  * be willing to fall back on page_create_va(), block and retry,
2135 * or fail the requester.
2136 */
2137 page_t *
2138 page_create_va_large(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags,
2139 struct seg *seg, caddr_t vaddr, void *arg)
2140 {
2141 pgcnt_t npages;
2142 page_t *pp;
2143 page_t *rootpp;
2144 lgrp_t *lgrp;
2145 lgrp_id_t *lgrpid = (lgrp_id_t *)arg;
2146
2147 ASSERT(vp != NULL);
2148
2149 ASSERT((flags & ~(PG_EXCL | PG_WAIT |
2150 PG_NORELOC | PG_PANIC | PG_PUSHPAGE | PG_NORMALPRI)) == 0);
2151 /* but no others */
2152
2153 ASSERT((flags & PG_EXCL) == PG_EXCL);
2154
2155 npages = btop(bytes);
2156
2157 if (!kcage_on || panicstr) {
2158 /*
2159 * Cage is OFF, or we are single threaded in
2160 * panic, so make everything a RELOC request.
2161 */
2162 flags &= ~PG_NORELOC;
2163 }
2164
2165 /*
2166 * Make sure there's adequate physical memory available.
2167 * Note: PG_WAIT is ignored here.
2168 */
2169 if (freemem <= throttlefree + npages) {
2170 VM_STAT_ADD(page_create_large_cnt[1]);
2171 return (NULL);
2172 }
2173
2174 /*
2175 * If cage is on, dampen draw from cage when available
2176 * cage space is low.
2177 */
2178 if ((flags & (PG_NORELOC | PG_WAIT)) == (PG_NORELOC | PG_WAIT) &&
2179 kcage_freemem < kcage_throttlefree + npages) {
2180
2181 /*
2182 * The cage is on, the caller wants PG_NORELOC
2183 * pages and available cage memory is very low.
2184 * Call kcage_create_throttle() to attempt to
2185 * control demand on the cage.
2186 */
2187 if (kcage_create_throttle(npages, flags) == KCT_FAILURE) {
2188 VM_STAT_ADD(page_create_large_cnt[2]);
2189 return (NULL);
2190 }
2191 }
2192
2193 if (!pcf_decrement_bucket(npages) &&
2194 !pcf_decrement_multiple(NULL, npages, 1)) {
2195 VM_STAT_ADD(page_create_large_cnt[4]);
2196 return (NULL);
2197 }
2198
2199 /*
2200 * This is where this function behaves fundamentally differently
2201 * than page_create_va(); since we're intending to map the page
2202 * with a single TTE, we have to get it as a physically contiguous
2203 * hardware pagesize chunk. If we can't, we fail.
2204 */
2205 if (lgrpid != NULL && *lgrpid >= 0 && *lgrpid <= lgrp_alloc_max &&
2206 LGRP_EXISTS(lgrp_table[*lgrpid]))
2207 lgrp = lgrp_table[*lgrpid];
2208 else
2209 lgrp = lgrp_mem_choose(seg, vaddr, bytes);
2210
2211 if ((rootpp = page_get_freelist(&kvp, off, seg, vaddr,
2212 bytes, flags & ~PG_MATCH_COLOR, lgrp)) == NULL) {
2213 page_create_putback(npages);
2214 VM_STAT_ADD(page_create_large_cnt[5]);
2215 return (NULL);
2216 }
2217
2218 /*
2219 	 * If we got the page with the wrong mtype, give it back. This is a
2220 	 * workaround for CR 6249718. When CR 6249718 is fixed we never get
2221 	 * inside the "if" and the workaround becomes just a nop.
2222 */
2223 if (kcage_on && (flags & PG_NORELOC) && !PP_ISNORELOC(rootpp)) {
2224 page_list_add_pages(rootpp, 0);
2225 page_create_putback(npages);
2226 VM_STAT_ADD(page_create_large_cnt[6]);
2227 return (NULL);
2228 }
2229
2230 /*
2231 * If satisfying this request has left us with too little
2232 * memory, start the wheels turning to get some back. The
2233 * first clause of the test prevents waking up the pageout
2234 * daemon in situations where it would decide that there's
2235 * nothing to do.
2236 */
2237 if (nscan < desscan && freemem < minfree) {
2238 TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
2239 "pageout_cv_signal:freemem %ld", freemem);
2240 cv_signal(&proc_pageout->p_cv);
2241 }
2242
2243 pp = rootpp;
2244 while (npages--) {
2245 ASSERT(PAGE_EXCL(pp));
2246 ASSERT(pp->p_vnode == NULL);
2247 ASSERT(!hat_page_is_mapped(pp));
2248 PP_CLRFREE(pp);
2249 PP_CLRAGED(pp);
2250 if (!page_hashin(pp, vp, off, NULL))
2251 panic("page_create_large: hashin failed: page %p",
2252 (void *)pp);
2253 page_io_lock(pp);
2254 off += PAGESIZE;
2255 pp = pp->p_next;
2256 }
2257
2258 VM_STAT_ADD(page_create_large_cnt[0]);
2259 return (rootpp);
2260 }
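
/*
 * Illustrative sketch of the fallback pattern mentioned in the comment
 * above (hypothetical code, not part of this file): try for a single
 * physically contiguous large page first, then drop back to ordinary
 * base pages if the large page cannot be had:
 *
 *	pp = page_create_va_large(vp, off, pgsz, PG_EXCL | PG_WAIT,
 *	    seg, addr, NULL);
 *	if (pp == NULL)
 *		pp = page_create_va(vp, off, pgsz, PG_EXCL | PG_WAIT,
 *		    seg, addr);
 */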
2261
2262 page_t *
2263 page_create_va(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags,
2264 struct seg *seg, caddr_t vaddr)
2265 {
2266 page_t *plist = NULL;
2267 pgcnt_t npages;
2268 pgcnt_t found_on_free = 0;
2269 pgcnt_t pages_req;
2270 page_t *npp = NULL;
2271 struct pcf *p;
2272 lgrp_t *lgrp;
2273
2274 TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_START,
2275 "page_create_start:vp %p off %llx bytes %lu flags %x",
2276 vp, off, bytes, flags);
2277
2278 ASSERT(bytes != 0 && vp != NULL);
2279
2280 if ((flags & PG_EXCL) == 0 && (flags & PG_WAIT) == 0) {
2281 panic("page_create: invalid flags");
2282 /*NOTREACHED*/
2283 }
2284 ASSERT((flags & ~(PG_EXCL | PG_WAIT |
2285 PG_NORELOC | PG_PANIC | PG_PUSHPAGE | PG_NORMALPRI)) == 0);
2286 /* but no others */
2287
2288 pages_req = npages = btopr(bytes);
2289 /*
2290 * Try to see whether request is too large to *ever* be
2291 * satisfied, in order to prevent deadlock. We arbitrarily
2292 * decide to limit maximum size requests to max_page_get.
2293 */
2294 if (npages >= max_page_get) {
2295 if ((flags & PG_WAIT) == 0) {
2296 TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_TOOBIG,
2297 "page_create_toobig:vp %p off %llx npages "
2298 "%lu max_page_get %lu",
2299 vp, off, npages, max_page_get);
2300 return (NULL);
2301 } else {
2302 cmn_err(CE_WARN,
2303 "Request for too much kernel memory "
2304 "(%lu bytes), will hang forever", bytes);
2305 for (;;)
2306 delay(1000000000);
2307 }
2308 }
2309
2310 if (!kcage_on || panicstr) {
2311 /*
2312 * Cage is OFF, or we are single threaded in
2313 * panic, so make everything a RELOC request.
2314 */
2315 flags &= ~PG_NORELOC;
2316 }
2317
2318 if (freemem <= throttlefree + npages)
2319 if (!page_create_throttle(npages, flags))
2320 return (NULL);
2321
2322 /*
2323 * If cage is on, dampen draw from cage when available
2324 * cage space is low.
2325 */
2326 if ((flags & PG_NORELOC) &&
2327 kcage_freemem < kcage_throttlefree + npages) {
2328
2329 /*
2330 * The cage is on, the caller wants PG_NORELOC
2331 * pages and available cage memory is very low.
2332 * Call kcage_create_throttle() to attempt to
2333 * control demand on the cage.
2334 */
2335 if (kcage_create_throttle(npages, flags) == KCT_FAILURE)
2336 return (NULL);
2337 }
2338
2339 VM_STAT_ADD(page_create_cnt[0]);
2340
2341 if (!pcf_decrement_bucket(npages)) {
2342 /*
2343 * Have to look harder. If npages is greater than
2344 * one, then we might have to coalesce the counters.
2345 *
2346 * Go wait. We come back having accounted
2347 * for the memory.
2348 */
2349 VM_STAT_ADD(page_create_cnt[1]);
2350 if (!page_create_wait(npages, flags)) {
2351 VM_STAT_ADD(page_create_cnt[2]);
2352 return (NULL);
2353 }
2354 }
2355
2356 TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SUCCESS,
2357 "page_create_success:vp %p off %llx", vp, off);
2358
2359 /*
2360 * If satisfying this request has left us with too little
2361 * memory, start the wheels turning to get some back. The
2362 * first clause of the test prevents waking up the pageout
2363 * daemon in situations where it would decide that there's
2364 * nothing to do.
2365 */
2366 if (nscan < desscan && freemem < minfree) {
2367 TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
2368 "pageout_cv_signal:freemem %ld", freemem);
2369 cv_signal(&proc_pageout->p_cv);
2370 }
2371
2372 /*
2373 * Loop around collecting the requested number of pages.
2374 * Most of the time, we have to `create' a new page. With
2375 * this in mind, pull the page off the free list before
2376 * getting the hash lock. This will minimize the hash
2377 * lock hold time, nesting, and the like. If it turns
2378 * out we don't need the page, we put it back at the end.
2379 */
2380 while (npages--) {
2381 page_t *pp;
2382 kmutex_t *phm = NULL;
2383 ulong_t index;
2384
2385 index = PAGE_HASH_FUNC(vp, off);
2386 top:
2387 ASSERT(phm == NULL);
2388 ASSERT(index == PAGE_HASH_FUNC(vp, off));
2389 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
2390
2391 if (npp == NULL) {
2392 /*
2393 * Try to get a page from the freelist (ie,
2394 * a page with no [vp, off] tag). If that
2395 * fails, use the cachelist.
2396 *
2397 * During the first attempt at both the free
2398 * and cache lists we try for the correct color.
2399 */
2400 /*
2401 			 * XXXX - how do we deal with virtually indexed
2402 			 * caches and colors?
2403 */
2404 VM_STAT_ADD(page_create_cnt[4]);
2405 /*
2406 * Get lgroup to allocate next page of shared memory
2407 * from and use it to specify where to allocate
2408 * the physical memory
2409 */
2410 lgrp = lgrp_mem_choose(seg, vaddr, PAGESIZE);
2411 npp = page_get_freelist(vp, off, seg, vaddr, PAGESIZE,
2412 flags | PG_MATCH_COLOR, lgrp);
2413 if (npp == NULL) {
2414 npp = page_get_cachelist(vp, off, seg,
2415 vaddr, flags | PG_MATCH_COLOR, lgrp);
2416 if (npp == NULL) {
2417 npp = page_create_get_something(vp,
2418 off, seg, vaddr,
2419 flags & ~PG_MATCH_COLOR);
2420 }
2421
2422 if (PP_ISAGED(npp) == 0) {
2423 /*
2424 * Since this page came from the
2425 * cachelist, we must destroy the
2426 * old vnode association.
2427 */
2428 page_hashout(npp, NULL);
2429 }
2430 }
2431 }
2432
2433 /*
2434 * We own this page!
2435 */
2436 ASSERT(PAGE_EXCL(npp));
2437 ASSERT(npp->p_vnode == NULL);
2438 ASSERT(!hat_page_is_mapped(npp));
2439 PP_CLRFREE(npp);
2440 PP_CLRAGED(npp);
2441
2442 /*
2443 		 * Here we have a page in our hot little mitts and are
2444 * just waiting to stuff it on the appropriate lists.
2445 * Get the mutex and check to see if it really does
2446 * not exist.
2447 */
2448 phm = PAGE_HASH_MUTEX(index);
2449 mutex_enter(phm);
2450 PAGE_HASH_SEARCH(index, pp, vp, off);
2451 if (pp == NULL) {
2452 VM_STAT_ADD(page_create_new);
2453 pp = npp;
2454 npp = NULL;
2455 if (!page_hashin(pp, vp, off, phm)) {
2456 /*
2457 * Since we hold the page hash mutex and
2458 * just searched for this page, page_hashin
2459 * had better not fail. If it does, that
2460 				 * means some thread did not follow the
2461 * page hash mutex rules. Panic now and
2462 * get it over with. As usual, go down
2463 * holding all the locks.
2464 */
2465 ASSERT(MUTEX_HELD(phm));
2466 panic("page_create: "
2467 "hashin failed %p %p %llx %p",
2468 (void *)pp, (void *)vp, off, (void *)phm);
2469 /*NOTREACHED*/
2470 }
2471 ASSERT(MUTEX_HELD(phm));
2472 mutex_exit(phm);
2473 phm = NULL;
2474
2475 /*
2476 * Hat layer locking need not be done to set
2477 * the following bits since the page is not hashed
2478 * and was on the free list (i.e., had no mappings).
2479 *
2480 * Set the reference bit to protect
2481 * against immediate pageout
2482 *
2483 * XXXmh modify freelist code to set reference
2484 * bit so we don't have to do it here.
2485 */
2486 page_set_props(pp, P_REF);
2487 found_on_free++;
2488 } else {
2489 VM_STAT_ADD(page_create_exists);
2490 if (flags & PG_EXCL) {
2491 /*
2492 * Found an existing page, and the caller
2493 * wanted all new pages. Undo all of the work
2494 * we have done.
2495 */
2496 mutex_exit(phm);
2497 phm = NULL;
2498 while (plist != NULL) {
2499 pp = plist;
2500 page_sub(&plist, pp);
2501 page_io_unlock(pp);
2502 /* large pages should not end up here */
2503 ASSERT(pp->p_szc == 0);
2504 /*LINTED: constant in conditional ctx*/
2505 VN_DISPOSE(pp, B_INVAL, 0, kcred);
2506 }
2507 VM_STAT_ADD(page_create_found_one);
2508 goto fail;
2509 }
2510 ASSERT(flags & PG_WAIT);
2511 if (!page_lock(pp, SE_EXCL, phm, P_NO_RECLAIM)) {
2512 /*
2513 * Start all over again if we blocked trying
2514 * to lock the page.
2515 */
2516 mutex_exit(phm);
2517 VM_STAT_ADD(page_create_page_lock_failed);
2518 phm = NULL;
2519 goto top;
2520 }
2521 mutex_exit(phm);
2522 phm = NULL;
2523
2524 if (PP_ISFREE(pp)) {
2525 ASSERT(PP_ISAGED(pp) == 0);
2526 VM_STAT_ADD(pagecnt.pc_get_cache);
2527 page_list_sub(pp, PG_CACHE_LIST);
2528 PP_CLRFREE(pp);
2529 found_on_free++;
2530 }
2531 }
2532
2533 /*
2534 * Got a page! It is locked. Acquire the i/o
2535 * lock since we are going to use the p_next and
2536 * p_prev fields to link the requested pages together.
2537 */
2538 page_io_lock(pp);
2539 page_add(&plist, pp);
2540 plist = plist->p_next;
2541 off += PAGESIZE;
2542 vaddr += PAGESIZE;
2543 }
2544
2545 ASSERT((flags & PG_EXCL) ? (found_on_free == pages_req) : 1);
2546 fail:
2547 if (npp != NULL) {
2548 /*
2549 * Did not need this page after all.
2550 * Put it back on the free list.
2551 */
2552 VM_STAT_ADD(page_create_putbacks);
2553 PP_SETFREE(npp);
2554 PP_SETAGED(npp);
2555 npp->p_offset = (u_offset_t)-1;
2556 page_list_add(npp, PG_FREE_LIST | PG_LIST_TAIL);
2557 page_unlock(npp);
2558
2559 }
2560
2561 ASSERT(pages_req >= found_on_free);
2562
2563 {
2564 uint_t overshoot = (uint_t)(pages_req - found_on_free);
2565
2566 if (overshoot) {
2567 VM_STAT_ADD(page_create_overshoot);
2568 p = &pcf[PCF_INDEX()];
2569 mutex_enter(&p->pcf_lock);
2570 if (p->pcf_block) {
2571 p->pcf_reserve += overshoot;
2572 } else {
2573 p->pcf_count += overshoot;
2574 if (p->pcf_wait) {
2575 mutex_enter(&new_freemem_lock);
2576 if (freemem_wait) {
2577 cv_signal(&freemem_cv);
2578 p->pcf_wait--;
2579 } else {
2580 p->pcf_wait = 0;
2581 }
2582 mutex_exit(&new_freemem_lock);
2583 }
2584 }
2585 mutex_exit(&p->pcf_lock);
2586 /* freemem is approximate, so this test OK */
2587 if (!p->pcf_block)
2588 freemem += overshoot;
2589 }
2590 }
2591
2592 return (plist);
2593 }
2594
2595 /*
2596  * One or more constituent pages of this large page have been marked
2597  * toxic. Simply demote the large page to PAGESIZE pages and let
2598  * page_free() handle it. This routine should only be called by the
2599  * large page free routines (page_free_pages() and page_destroy_pages()).
2600 * All pages are locked SE_EXCL and have already been marked free.
2601 */
2602 static void
2603 page_free_toxic_pages(page_t *rootpp)
2604 {
2605 page_t *tpp;
2606 pgcnt_t i, pgcnt = page_get_pagecnt(rootpp->p_szc);
2607 uint_t szc = rootpp->p_szc;
2608
2609 for (i = 0, tpp = rootpp; i < pgcnt; i++, tpp = tpp->p_next) {
2610 ASSERT(tpp->p_szc == szc);
2611 ASSERT((PAGE_EXCL(tpp) &&
2612 !page_iolock_assert(tpp)) || panicstr);
2613 tpp->p_szc = 0;
2614 }
2615
2616 while (rootpp != NULL) {
2617 tpp = rootpp;
2618 page_sub(&rootpp, tpp);
2619 ASSERT(PP_ISFREE(tpp));
2620 PP_CLRFREE(tpp);
2621 page_free(tpp, 1);
2622 }
2623 }
2624
2625 /*
2626 * Put page on the "free" list.
2627 * The free list is really two lists maintained by
2628 * the PSM of whatever machine we happen to be on.
2629 */
2630 void
2631 page_free(page_t *pp, int dontneed)
2632 {
2633 struct pcf *p;
2634 uint_t pcf_index;
2635
2636 ASSERT((PAGE_EXCL(pp) &&
2637 !page_iolock_assert(pp)) || panicstr);
2638
2639 if (PP_ISFREE(pp)) {
2640 panic("page_free: page %p is free", (void *)pp);
2641 }
2642
2643 if (pp->p_szc != 0) {
2644 if (pp->p_vnode == NULL || IS_SWAPFSVP(pp->p_vnode) ||
2645 PP_ISKAS(pp)) {
2646 panic("page_free: anon or kernel "
2647 "or no vnode large page %p", (void *)pp);
2648 }
2649 page_demote_vp_pages(pp);
2650 ASSERT(pp->p_szc == 0);
2651 }
2652
2653 /*
2654 * The page_struct_lock need not be acquired to examine these
2655 * fields since the page has an "exclusive" lock.
2656 */
2657 if (hat_page_is_mapped(pp) || pp->p_lckcnt != 0 || pp->p_cowcnt != 0 ||
2658 pp->p_slckcnt != 0) {
2659 panic("page_free pp=%p, pfn=%lx, lckcnt=%d, cowcnt=%d "
2660 "slckcnt = %d", (void *)pp, page_pptonum(pp), pp->p_lckcnt,
2661 pp->p_cowcnt, pp->p_slckcnt);
2662 /*NOTREACHED*/
2663 }
2664
2665 ASSERT(!hat_page_getshare(pp));
2666
2667 PP_SETFREE(pp);
2668 ASSERT(pp->p_vnode == NULL || !IS_VMODSORT(pp->p_vnode) ||
2669 !hat_ismod(pp));
2670 page_clr_all_props(pp);
2671 ASSERT(!hat_page_getshare(pp));
2672
2673 /*
2674 * Now we add the page to the head of the free list.
2675 * But if this page is associated with a paged vnode
2676 * then we adjust the head forward so that the page is
2677 * effectively at the end of the list.
2678 */
2679 if (pp->p_vnode == NULL) {
2680 /*
2681 * Page has no identity, put it on the free list.
2682 */
2683 PP_SETAGED(pp);
2684 pp->p_offset = (u_offset_t)-1;
2685 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
2686 VM_STAT_ADD(pagecnt.pc_free_free);
2687 TRACE_1(TR_FAC_VM, TR_PAGE_FREE_FREE,
2688 "page_free_free:pp %p", pp);
2689 } else {
2690 PP_CLRAGED(pp);
2691
2692 if (!dontneed) {
2693 /* move it to the tail of the list */
2694 page_list_add(pp, PG_CACHE_LIST | PG_LIST_TAIL);
2695
2696 VM_STAT_ADD(pagecnt.pc_free_cache);
2697 TRACE_1(TR_FAC_VM, TR_PAGE_FREE_CACHE_TAIL,
2698 "page_free_cache_tail:pp %p", pp);
2699 } else {
2700 page_list_add(pp, PG_CACHE_LIST | PG_LIST_HEAD);
2701
2702 VM_STAT_ADD(pagecnt.pc_free_dontneed);
2703 TRACE_1(TR_FAC_VM, TR_PAGE_FREE_CACHE_HEAD,
2704 "page_free_cache_head:pp %p", pp);
2705 }
2706 }
2707 page_unlock(pp);
2708
2709 /*
2710 * Now do the `freemem' accounting.
2711 */
2712 pcf_index = PCF_INDEX();
2713 p = &pcf[pcf_index];
2714
2715 mutex_enter(&p->pcf_lock);
2716 if (p->pcf_block) {
2717 p->pcf_reserve += 1;
2718 } else {
2719 p->pcf_count += 1;
2720 if (p->pcf_wait) {
2721 mutex_enter(&new_freemem_lock);
2722 /*
2723 * Check to see if some other thread
2724 * is actually waiting. Another bucket
2725 * may have woken it up by now. If there
2726 * are no waiters, then set our pcf_wait
2727 * count to zero to avoid coming in here
2728 * next time. Also, since only one page
2729 * was put on the free list, just wake
2730 * up one waiter.
2731 */
2732 if (freemem_wait) {
2733 cv_signal(&freemem_cv);
2734 p->pcf_wait--;
2735 } else {
2736 p->pcf_wait = 0;
2737 }
2738 mutex_exit(&new_freemem_lock);
2739 }
2740 }
2741 mutex_exit(&p->pcf_lock);
2742
2743 /* freemem is approximate, so this test OK */
2744 if (!p->pcf_block)
2745 freemem += 1;
2746 }
2747
2748 /*
2749  * Put page on the "free" list during initial startup.
2750 * This happens during initial single threaded execution.
2751 */
2752 void
2753 page_free_at_startup(page_t *pp)
2754 {
2755 struct pcf *p;
2756 uint_t pcf_index;
2757
2758 page_list_add(pp, PG_FREE_LIST | PG_LIST_HEAD | PG_LIST_ISINIT);
2759 VM_STAT_ADD(pagecnt.pc_free_free);
2760
2761 /*
2762 * Now do the `freemem' accounting.
2763 */
2764 pcf_index = PCF_INDEX();
2765 p = &pcf[pcf_index];
2766
2767 ASSERT(p->pcf_block == 0);
2768 ASSERT(p->pcf_wait == 0);
2769 p->pcf_count += 1;
2770
2771 /* freemem is approximate, so this is OK */
2772 freemem += 1;
2773 }
2774
2775 void
2776 page_free_pages(page_t *pp)
2777 {
2778 page_t *tpp, *rootpp = NULL;
2779 pgcnt_t pgcnt = page_get_pagecnt(pp->p_szc);
2780 pgcnt_t i;
2781 uint_t szc = pp->p_szc;
2782
2783 VM_STAT_ADD(pagecnt.pc_free_pages);
2784 TRACE_1(TR_FAC_VM, TR_PAGE_FREE_FREE,
2785 "page_free_free:pp %p", pp);
2786
2787 ASSERT(pp->p_szc != 0 && pp->p_szc < page_num_pagesizes());
2788 if ((page_pptonum(pp) & (pgcnt - 1)) != 0) {
2789 panic("page_free_pages: not root page %p", (void *)pp);
2790 /*NOTREACHED*/
2791 }
2792
2793 for (i = 0, tpp = pp; i < pgcnt; i++, tpp++) {
2794 ASSERT((PAGE_EXCL(tpp) &&
2795 !page_iolock_assert(tpp)) || panicstr);
2796 if (PP_ISFREE(tpp)) {
2797 panic("page_free_pages: page %p is free", (void *)tpp);
2798 /*NOTREACHED*/
2799 }
2800 if (hat_page_is_mapped(tpp) || tpp->p_lckcnt != 0 ||
2801 tpp->p_cowcnt != 0 || tpp->p_slckcnt != 0) {
2802 panic("page_free_pages %p", (void *)tpp);
2803 /*NOTREACHED*/
2804 }
2805
2806 ASSERT(!hat_page_getshare(tpp));
2807 ASSERT(tpp->p_vnode == NULL);
2808 ASSERT(tpp->p_szc == szc);
2809
2810 PP_SETFREE(tpp);
2811 page_clr_all_props(tpp);
2812 PP_SETAGED(tpp);
2813 tpp->p_offset = (u_offset_t)-1;
2814 ASSERT(tpp->p_next == tpp);
2815 ASSERT(tpp->p_prev == tpp);
2816 page_list_concat(&rootpp, &tpp);
2817 }
2818 ASSERT(rootpp == pp);
2819
2820 page_list_add_pages(rootpp, 0);
2821 page_create_putback(pgcnt);
2822 }
2823
2824 int free_pages = 1;
2825
2826 /*
2827 * This routine attempts to return pages to the cachelist via page_release().
2828 * It does not *have* to be successful in all cases, since the pageout scanner
2829 * will catch any pages it misses. It does need to be fast and not introduce
2830 * too much overhead.
2831 *
2832 * If a page isn't found on the unlocked sweep of the page_hash bucket, we
2833 * don't lock and retry. This is ok, since the page scanner will eventually
2834 * find any page we miss in free_vp_pages().
2835 */
2836 void
2837 free_vp_pages(vnode_t *vp, u_offset_t off, size_t len)
2838 {
2839 page_t *pp;
2840 u_offset_t eoff;
2841 extern int swap_in_range(vnode_t *, u_offset_t, size_t);
2842
2843 eoff = off + len;
2844
2845 if (free_pages == 0)
2846 return;
2847 if (swap_in_range(vp, off, len))
2848 return;
2849
2850 for (; off < eoff; off += PAGESIZE) {
2851
2852 /*
2853 * find the page using a fast, but inexact search. It'll be OK
2854 * if a few pages slip through the cracks here.
2855 */
2856 pp = page_exists(vp, off);
2857
2858 /*
2859 * If we didn't find the page (it may not exist), the page
2860 * is free, looks still in use (shared), or we can't lock it,
2861 * just give up.
2862 */
2863 if (pp == NULL ||
2864 PP_ISFREE(pp) ||
2865 page_share_cnt(pp) > 0 ||
2866 !page_trylock(pp, SE_EXCL))
2867 continue;
2868
2869 /*
2870 * Once we have locked pp, verify that it's still the
2871 * correct page and not already free
2872 */
2873 ASSERT(PAGE_LOCKED_SE(pp, SE_EXCL));
2874 if (pp->p_vnode != vp || pp->p_offset != off || PP_ISFREE(pp)) {
2875 page_unlock(pp);
2876 continue;
2877 }
2878
2879 /*
2880 * try to release the page...
2881 */
2882 (void) page_release(pp, 1);
2883 }
2884 }
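
/*
 * Illustrative sketch of a free_vp_pages() caller (hypothetical code,
 * not part of this file).  A file system that has just completed a
 * sequential read and does not expect the data to be reused can hint
 * the pages back to the cachelist:
 *
 *	if (sequential && !expect_reuse)
 *		free_vp_pages(vp, start_off, bytes_read);
 *
 * "sequential", "expect_reuse", "start_off" and "bytes_read" are
 * placeholders; only a vnode, an offset and a length are required.
 */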
2885
2886 /*
2887 * Reclaim the given page from the free list.
2888  * If pp is part of a large page, only the given constituent page is reclaimed
2889 * and the large page it belonged to will be demoted. This can only happen
2890 * if the page is not on the cachelist.
2891 *
2892 * Returns 1 on success or 0 on failure.
2893 *
2894 * The page is unlocked if it can't be reclaimed (when freemem == 0).
2895 * If `lock' is non-null, it will be dropped and re-acquired if
2896 * the routine must wait while freemem is 0.
2897 *
2898 * As it turns out, boot_getpages() does this. It picks a page,
2899 * based on where OBP mapped in some address, gets its pfn, searches
2900 * the memsegs, locks the page, then pulls it off the free list!
2901 */
2902 int
2903 page_reclaim(page_t *pp, kmutex_t *lock)
2904 {
2905 struct pcf *p;
2906 struct cpu *cpup;
2907 int enough;
2908 uint_t i;
2909
2910 ASSERT(lock != NULL ? MUTEX_HELD(lock) : 1);
2911 ASSERT(PAGE_EXCL(pp) && PP_ISFREE(pp));
2912
2913 /*
2914 * If `freemem' is 0, we cannot reclaim this page from the
2915 * freelist, so release every lock we might hold: the page,
2916 * and the `lock' before blocking.
2917 *
2918 * The only way `freemem' can become 0 while there are pages
2919 * marked free (have their p->p_free bit set) is when the
2920 * system is low on memory and doing a page_create(). In
2921 * order to guarantee that once page_create() starts acquiring
2922 * pages it will be able to get all that it needs since `freemem'
2923 * was decreased by the requested amount. So, we need to release
2924 * this page, and let page_create() have it.
2925 *
2926 * Since `freemem' being zero is not supposed to happen, just
2927 * use the usual hash stuff as a starting point. If that bucket
2928 * is empty, then assume the worst, and start at the beginning
2929 * of the pcf array. If we always start at the beginning
2930 * when acquiring more than one pcf lock, there won't be any
2931 * deadlock problems.
2932 */
2933
2934 /* TODO: Do we need to test kcage_freemem if PG_NORELOC(pp)? */
2935
2936 if (freemem <= throttlefree && !page_create_throttle(1l, 0)) {
2937 pcf_acquire_all();
2938 goto page_reclaim_nomem;
2939 }
2940
2941 enough = pcf_decrement_bucket(1);
2942
2943 if (!enough) {
2944 VM_STAT_ADD(page_reclaim_zero);
2945 /*
2946 		 * Check again. It's possible that some other thread
2947 * could have been right behind us, and added one
2948 * to a list somewhere. Acquire each of the pcf locks
2949 * until we find a page.
2950 */
2951 p = pcf;
2952 for (i = 0; i < pcf_fanout; i++) {
2953 mutex_enter(&p->pcf_lock);
2954 if (p->pcf_count >= 1) {
2955 p->pcf_count -= 1;
2956 /*
2957 * freemem is not protected by any lock. Thus,
2958 * we cannot have any assertion containing
2959 * freemem here.
2960 */
2961 freemem -= 1;
2962 enough = 1;
2963 break;
2964 }
2965 p++;
2966 }
2967
2968 if (!enough) {
2969 page_reclaim_nomem:
2970 /*
2971 * We really can't have page `pp'.
2972 * Time for the no-memory dance with
2973 * page_free(). This is just like
2974 * page_create_wait(). Plus the added
2975 * attraction of releasing whatever mutex
2976 			 * we were called with in `lock'.
2977 			 * Page_unlock() will wake up any thread
2978 * waiting around for this page.
2979 */
2980 if (lock) {
2981 VM_STAT_ADD(page_reclaim_zero_locked);
2982 mutex_exit(lock);
2983 }
2984 page_unlock(pp);
2985
2986 /*
2987 * get this before we drop all the pcf locks.
2988 */
2989 mutex_enter(&new_freemem_lock);
2990
2991 p = pcf;
2992 for (i = 0; i < pcf_fanout; i++) {
2993 p->pcf_wait++;
2994 mutex_exit(&p->pcf_lock);
2995 p++;
2996 }
2997
2998 freemem_wait++;
2999 cv_wait(&freemem_cv, &new_freemem_lock);
3000 freemem_wait--;
3001
3002 mutex_exit(&new_freemem_lock);
3003
3004 if (lock) {
3005 mutex_enter(lock);
3006 }
3007 return (0);
3008 }
3009
3010 /*
3011 * The pcf accounting has been done,
3012 * though none of the pcf_wait flags have been set,
3013 * drop the locks and continue on.
3014 */
3015 while (p >= pcf) {
3016 mutex_exit(&p->pcf_lock);
3017 p--;
3018 }
3019 }
3020
3021
3022 VM_STAT_ADD(pagecnt.pc_reclaim);
3023
3024 /*
3025 * page_list_sub will handle the case where pp is a large page.
3026 * It's possible that the page was promoted while on the freelist
3027 */
3028 if (PP_ISAGED(pp)) {
3029 page_list_sub(pp, PG_FREE_LIST);
3030 TRACE_1(TR_FAC_VM, TR_PAGE_UNFREE_FREE,
3031 "page_reclaim_free:pp %p", pp);
3032 } else {
3033 page_list_sub(pp, PG_CACHE_LIST);
3034 TRACE_1(TR_FAC_VM, TR_PAGE_UNFREE_CACHE,
3035 "page_reclaim_cache:pp %p", pp);
3036 }
3037
3038 /*
3039 * clear the p_free & p_age bits since this page is no longer
3040 	 * on the free list. Notice that there is a brief window where
3041 	 * a page is marked as free but is not on the list.
3042 *
3043 * Set the reference bit to protect against immediate pageout.
3044 */
3045 PP_CLRFREE(pp);
3046 PP_CLRAGED(pp);
3047 page_set_props(pp, P_REF);
3048
3049 CPU_STATS_ENTER_K();
3050 cpup = CPU; /* get cpup now that CPU cannot change */
3051 CPU_STATS_ADDQ(cpup, vm, pgrec, 1);
3052 CPU_STATS_ADDQ(cpup, vm, pgfrec, 1);
3053 CPU_STATS_EXIT_K();
3054 ASSERT(pp->p_szc == 0);
3055
3056 return (1);
3057 }
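
/*
 * Illustrative sketch of the reclaim contract above (hypothetical code,
 * not part of this file).  The caller already holds pp SE_EXCL and has
 * observed PP_ISFREE(pp); a zero return means the page was unlocked
 * while we slept, so the lookup must be restarted ("retry_lookup" is a
 * placeholder label):
 *
 *	if (PP_ISFREE(pp) && !page_reclaim(pp, NULL))
 *		goto retry_lookup;
 */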
3058
3059 /*
3060 * Destroy identity of the page and put it back on
3061 * the page free list. Assumes that the caller has
3062 * acquired the "exclusive" lock on the page.
3063 */
3064 void
3065 page_destroy(page_t *pp, int dontfree)
3066 {
3067 ASSERT((PAGE_EXCL(pp) &&
3068 !page_iolock_assert(pp)) || panicstr);
3069 ASSERT(pp->p_slckcnt == 0 || panicstr);
3070
3071 if (pp->p_szc != 0) {
3072 if (pp->p_vnode == NULL || IS_SWAPFSVP(pp->p_vnode) ||
3073 PP_ISKAS(pp)) {
3074 panic("page_destroy: anon or kernel or no vnode "
3075 "large page %p", (void *)pp);
3076 }
3077 page_demote_vp_pages(pp);
3078 ASSERT(pp->p_szc == 0);
3079 }
3080
3081 TRACE_1(TR_FAC_VM, TR_PAGE_DESTROY, "page_destroy:pp %p", pp);
3082
3083 /*
3084 * Unload translations, if any, then hash out the
3085 * page to erase its identity.
3086 */
3087 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
3088 page_hashout(pp, NULL);
3089
3090 if (!dontfree) {
3091 /*
3092 * Acquire the "freemem_lock" for availrmem.
3093 * The page_struct_lock need not be acquired for lckcnt
3094 * and cowcnt since the page has an "exclusive" lock.
3095 * We are doing a modified version of page_pp_unlock here.
3096 */
3097 if ((pp->p_lckcnt != 0) || (pp->p_cowcnt != 0)) {
3098 mutex_enter(&freemem_lock);
3099 if (pp->p_lckcnt != 0) {
3100 availrmem++;
3101 pages_locked--;
3102 pp->p_lckcnt = 0;
3103 }
3104 if (pp->p_cowcnt != 0) {
3105 availrmem += pp->p_cowcnt;
3106 pages_locked -= pp->p_cowcnt;
3107 pp->p_cowcnt = 0;
3108 }
3109 mutex_exit(&freemem_lock);
3110 }
3111 /*
3112 * Put the page on the "free" list.
3113 */
3114 page_free(pp, 0);
3115 }
3116 }
3117
3118 void
3119 page_destroy_pages(page_t *pp)
3120 {
3121
3122 page_t *tpp, *rootpp = NULL;
3123 pgcnt_t pgcnt = page_get_pagecnt(pp->p_szc);
3124 pgcnt_t i, pglcks = 0;
3125 uint_t szc = pp->p_szc;
3126
3127 ASSERT(pp->p_szc != 0 && pp->p_szc < page_num_pagesizes());
3128
3129 VM_STAT_ADD(pagecnt.pc_destroy_pages);
3130
3131 TRACE_1(TR_FAC_VM, TR_PAGE_DESTROY, "page_destroy_pages:pp %p", pp);
3132
3133 if ((page_pptonum(pp) & (pgcnt - 1)) != 0) {
3134 panic("page_destroy_pages: not root page %p", (void *)pp);
3135 /*NOTREACHED*/
3136 }
3137
3138 for (i = 0, tpp = pp; i < pgcnt; i++, tpp++) {
3139 ASSERT((PAGE_EXCL(tpp) &&
3140 !page_iolock_assert(tpp)) || panicstr);
3141 ASSERT(tpp->p_slckcnt == 0 || panicstr);
3142 (void) hat_pageunload(tpp, HAT_FORCE_PGUNLOAD);
3143 page_hashout(tpp, NULL);
3144 ASSERT(tpp->p_offset == (u_offset_t)-1);
3145 if (tpp->p_lckcnt != 0) {
3146 pglcks++;
3147 tpp->p_lckcnt = 0;
3148 } else if (tpp->p_cowcnt != 0) {
3149 pglcks += tpp->p_cowcnt;
3150 tpp->p_cowcnt = 0;
3151 }
3152 ASSERT(!hat_page_getshare(tpp));
3153 ASSERT(tpp->p_vnode == NULL);
3154 ASSERT(tpp->p_szc == szc);
3155
3156 PP_SETFREE(tpp);
3157 page_clr_all_props(tpp);
3158 PP_SETAGED(tpp);
3159 ASSERT(tpp->p_next == tpp);
3160 ASSERT(tpp->p_prev == tpp);
3161 page_list_concat(&rootpp, &tpp);
3162 }
3163
3164 ASSERT(rootpp == pp);
3165 if (pglcks != 0) {
3166 mutex_enter(&freemem_lock);
3167 availrmem += pglcks;
3168 mutex_exit(&freemem_lock);
3169 }
3170
3171 page_list_add_pages(rootpp, 0);
3172 page_create_putback(pgcnt);
3173 }
3174
3175 /*
3176 * Similar to page_destroy(), but destroys pages which are
3177 * locked and known to be on the page free list. Since
3178 * the page is known to be free and locked, no one can access
3179 * it.
3180 *
3181 * Also, the number of free pages does not change.
3182 */
3183 void
3184 page_destroy_free(page_t *pp)
3185 {
3186 ASSERT(PAGE_EXCL(pp));
3187 ASSERT(PP_ISFREE(pp));
3188 ASSERT(pp->p_vnode);
3189 ASSERT(hat_page_getattr(pp, P_MOD | P_REF | P_RO) == 0);
3190 ASSERT(!hat_page_is_mapped(pp));
3191 ASSERT(PP_ISAGED(pp) == 0);
3192 ASSERT(pp->p_szc == 0);
3193
3194 VM_STAT_ADD(pagecnt.pc_destroy_free);
3195 page_list_sub(pp, PG_CACHE_LIST);
3196
3197 page_hashout(pp, NULL);
3198 ASSERT(pp->p_vnode == NULL);
3199 ASSERT(pp->p_offset == (u_offset_t)-1);
3200 ASSERT(pp->p_hash == NULL);
3201
3202 PP_SETAGED(pp);
3203 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
3204 page_unlock(pp);
3205
3206 mutex_enter(&new_freemem_lock);
3207 if (freemem_wait) {
3208 cv_signal(&freemem_cv);
3209 }
3210 mutex_exit(&new_freemem_lock);
3211 }
3212
3213 /*
3214 * Rename the page "opp" to have an identity specified
3215 * by [vp, off]. If a page already exists with this name
3216 * it is locked and destroyed. Note that the page's
3217 * translations are not unloaded during the rename.
3218 *
3219 * This routine is used by the anon layer to "steal" the
3220 * original page and is not unlike destroying a page and
3221 * creating a new page using the same page frame.
3222 *
3223 * XXX -- Could deadlock if caller 1 tries to rename A to B while
3224 * caller 2 tries to rename B to A.
3225 */
3226 void
3227 page_rename(page_t *opp, vnode_t *vp, u_offset_t off)
3228 {
3229 page_t *pp;
3230 int olckcnt = 0;
3231 int ocowcnt = 0;
3232 kmutex_t *phm;
3233 ulong_t index;
3234
3235 ASSERT(PAGE_EXCL(opp) && !page_iolock_assert(opp));
3236 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
3237 ASSERT(PP_ISFREE(opp) == 0);
3238
3239 VM_STAT_ADD(page_rename_count);
3240
3241 TRACE_3(TR_FAC_VM, TR_PAGE_RENAME,
3242 "page rename:pp %p vp %p off %llx", opp, vp, off);
3243
3244 /*
3245 * CacheFS may call page_rename for a large NFS page
3246 * when both CacheFS and NFS mount points are used
3247 * by applications. Demote this large page before
3248 * renaming it, to ensure that there are no "partial"
3249 * large pages left lying around.
3250 */
3251 if (opp->p_szc != 0) {
3252 vnode_t *ovp = opp->p_vnode;
3253 ASSERT(ovp != NULL);
3254 ASSERT(!IS_SWAPFSVP(ovp));
3255 ASSERT(!VN_ISKAS(ovp));
3256 page_demote_vp_pages(opp);
3257 ASSERT(opp->p_szc == 0);
3258 }
3259
3260 page_hashout(opp, NULL);
3261 PP_CLRAGED(opp);
3262
3263 /*
3264 * Acquire the appropriate page hash lock, since
3265 * we're going to rename the page.
3266 */
3267 index = PAGE_HASH_FUNC(vp, off);
3268 phm = PAGE_HASH_MUTEX(index);
3269 mutex_enter(phm);
3270 top:
3271 /*
3272 * Look for an existing page with this name and destroy it if found.
3273 * By holding the page hash lock all the way to the page_hashin()
3274 * call, we are assured that no page can be created with this
3275 * identity. In the case when the phm lock is dropped to undo any
3276 * hat layer mappings, the existing page is held with an "exclusive"
3277 * lock, again preventing another page from being created with
3278 * this identity.
3279 */
3280 PAGE_HASH_SEARCH(index, pp, vp, off);
3281 if (pp != NULL) {
3282 VM_STAT_ADD(page_rename_exists);
3283
3284 /*
3285 * As it turns out, this is one of only two places where
3286 * page_lock() needs to hold the passed in lock in the
3287 * successful case. In all of the others, the lock could
3288 * be dropped as soon as the attempt is made to lock
3289 		 * the page. It is tempting to add yet another argument,
3290 * PL_KEEP or PL_DROP, to let page_lock know what to do.
3291 */
3292 if (!page_lock(pp, SE_EXCL, phm, P_RECLAIM)) {
3293 /*
3294 * Went to sleep because the page could not
3295 * be locked. We were woken up when the page
3296 * was unlocked, or when the page was destroyed.
3297 * In either case, `phm' was dropped while we
3298 * slept. Hence we should not just roar through
3299 * this loop.
3300 */
3301 goto top;
3302 }
3303
3304 /*
3305 * If an existing page is a large page, then demote
3306 * it to ensure that no "partial" large pages are
3307 * "created" after page_rename. An existing page
3308 * can be a CacheFS page, and can't belong to swapfs.
3309 */
3310 if (hat_page_is_mapped(pp)) {
3311 /*
3312 * Unload translations. Since we hold the
3313 * exclusive lock on this page, the page
3314 * can not be changed while we drop phm.
3315 * This is also not a lock protocol violation,
3316 * but rather the proper way to do things.
3317 */
3318 mutex_exit(phm);
3319 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
3320 if (pp->p_szc != 0) {
3321 ASSERT(!IS_SWAPFSVP(vp));
3322 ASSERT(!VN_ISKAS(vp));
3323 page_demote_vp_pages(pp);
3324 ASSERT(pp->p_szc == 0);
3325 }
3326 mutex_enter(phm);
3327 } else if (pp->p_szc != 0) {
3328 ASSERT(!IS_SWAPFSVP(vp));
3329 ASSERT(!VN_ISKAS(vp));
3330 mutex_exit(phm);
3331 page_demote_vp_pages(pp);
3332 ASSERT(pp->p_szc == 0);
3333 mutex_enter(phm);
3334 }
3335 page_hashout(pp, phm);
3336 }
3337 /*
3338 * Hash in the page with the new identity.
3339 */
3340 if (!page_hashin(opp, vp, off, phm)) {
3341 /*
3342 * We were holding phm while we searched for [vp, off]
3343 * and only dropped phm if we found and locked a page.
3344 		 * If we can't create this page now, then something
3345 * is really broken.
3346 */
3347 panic("page_rename: Can't hash in page: %p", (void *)pp);
3348 /*NOTREACHED*/
3349 }
3350
3351 ASSERT(MUTEX_HELD(phm));
3352 mutex_exit(phm);
3353
3354 /*
3355 * Now that we have dropped phm, lets get around to finishing up
3356 * with pp.
3357 */
3358 if (pp != NULL) {
3359 ASSERT(!hat_page_is_mapped(pp));
3360 /* for now large pages should not end up here */
3361 ASSERT(pp->p_szc == 0);
3362 /*
3363 * Save the locks for transfer to the new page and then
3364 * clear them so page_free doesn't think they're important.
3365 * The page_struct_lock need not be acquired for lckcnt and
3366 * cowcnt since the page has an "exclusive" lock.
3367 */
3368 olckcnt = pp->p_lckcnt;
3369 ocowcnt = pp->p_cowcnt;
3370 pp->p_lckcnt = pp->p_cowcnt = 0;
3371
3372 /*
3373 * Put the page on the "free" list after we drop
3374 * the lock. The less work under the lock the better.
3375 */
3376 /*LINTED: constant in conditional context*/
3377 VN_DISPOSE(pp, B_FREE, 0, kcred);
3378 }
3379
3380 /*
3381 * Transfer the lock count from the old page (if any).
3382 * The page_struct_lock need not be acquired for lckcnt and
3383 * cowcnt since the page has an "exclusive" lock.
3384 */
3385 opp->p_lckcnt += olckcnt;
3386 opp->p_cowcnt += ocowcnt;
3387 }
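
/*
 * Illustrative sketch of a page_rename() caller (hypothetical code, not
 * part of this file): give an existing, exclusively locked page a new
 * [vp, off] identity without copying its contents:
 *
 *	ASSERT(PAGE_EXCL(opp));
 *	page_rename(opp, newvp, newoff);
 *
 * Afterwards opp hashes under [newvp, newoff]; any page that previously
 * held that identity has been disposed of.  "newvp" and "newoff" are
 * placeholders.
 */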
3388
3389 /*
3390 * low level routine to add page `pp' to the hash and vp chains for [vp, offset]
3391 *
3392 * Pages are normally inserted at the start of a vnode's v_pages list.
3393 * If the vnode is VMODSORT and the page is modified, it goes at the end.
3394 * This can happen when a modified page is relocated for DR.
3395 *
3396 * Returns 1 on success and 0 on failure.
3397 */
3398 static int
3399 page_do_hashin(page_t *pp, vnode_t *vp, u_offset_t offset)
3400 {
3401 page_t **listp;
3402 page_t *tp;
3403 ulong_t index;
3404
3405 ASSERT(PAGE_EXCL(pp));
3406 ASSERT(vp != NULL);
3407 ASSERT(MUTEX_HELD(page_vnode_mutex(vp)));
3408
3409 /*
3410 * Be sure to set these up before the page is inserted on the hash
3411 * list. As soon as the page is placed on the list some other
3412 * thread might get confused and wonder how this page could
3413 * possibly hash to this list.
3414 */
3415 pp->p_vnode = vp;
3416 pp->p_offset = offset;
3417
3418 /*
3419 * record if this page is on a swap vnode
3420 */
3421 if ((vp->v_flag & VISSWAP) != 0)
3422 PP_SETSWAP(pp);
3423
3424 index = PAGE_HASH_FUNC(vp, offset);
3425 ASSERT(MUTEX_HELD(PAGE_HASH_MUTEX(index)));
3426 listp = &page_hash[index];
3427
3428 /*
3429 * If this page is already hashed in, fail this attempt to add it.
3430 */
3431 for (tp = *listp; tp != NULL; tp = tp->p_hash) {
3432 if (tp->p_vnode == vp && tp->p_offset == offset) {
3433 pp->p_vnode = NULL;
3434 pp->p_offset = (u_offset_t)(-1);
3435 return (0);
3436 }
3437 }
3438 pp->p_hash = *listp;
3439 *listp = pp;
3440
3441 /*
3442 * Add the page to the vnode's list of pages
3443 */
3444 if (vp->v_pages != NULL && IS_VMODSORT(vp) && hat_ismod(pp))
3445 listp = &vp->v_pages->p_vpprev->p_vpnext;
3446 else
3447 listp = &vp->v_pages;
3448
3449 page_vpadd(listp, pp);
3450
3451 return (1);
3452 }
3453
3454 /*
3455 * Add page `pp' to both the hash and vp chains for [vp, offset].
3456 *
3457 * Returns 1 on success and 0 on failure.
3458 * If hold is passed in, it is not dropped.
3459 */
3460 int
3461 page_hashin(page_t *pp, vnode_t *vp, u_offset_t offset, kmutex_t *hold)
3462 {
3463 kmutex_t *phm = NULL;
3464 kmutex_t *vphm;
3465 int rc;
3466
3467 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
3468 ASSERT(pp->p_fsdata == 0 || panicstr);
3469
3470 TRACE_3(TR_FAC_VM, TR_PAGE_HASHIN,
3471 "page_hashin:pp %p vp %p offset %llx",
3472 pp, vp, offset);
3473
3474 VM_STAT_ADD(hashin_count);
3475
3476 if (hold != NULL)
3477 phm = hold;
3478 else {
3479 VM_STAT_ADD(hashin_not_held);
3480 phm = PAGE_HASH_MUTEX(PAGE_HASH_FUNC(vp, offset));
3481 mutex_enter(phm);
3482 }
3483
3484 vphm = page_vnode_mutex(vp);
3485 mutex_enter(vphm);
3486 rc = page_do_hashin(pp, vp, offset);
3487 mutex_exit(vphm);
3488 if (hold == NULL)
3489 mutex_exit(phm);
3490 if (rc == 0)
3491 VM_STAT_ADD(hashin_already);
3492 return (rc);
3493 }
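
/*
 * Illustrative sketch (hypothetical code, not part of this file): most
 * callers pass a NULL hold and let page_hashin() take the hash mutex
 * itself, and must cope with failure when another page already owns the
 * [vp, offset] identity:
 *
 *	if (!page_hashin(pp, vp, off, NULL)) {
 *		page_free(pp, 1);
 *	}
 *
 * where pp is assumed to be a freshly allocated, exclusively locked page
 * with no identity and no i/o lock held.
 */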
3494
3495 /*
3496 * Remove page ``pp'' from the hash and vp chains and remove vp association.
3497 * All mutexes must be held
3498 */
3499 static void
3500 page_do_hashout(page_t *pp)
3501 {
3502 page_t **hpp;
3503 page_t *hp;
3504 vnode_t *vp = pp->p_vnode;
3505
3506 ASSERT(vp != NULL);
3507 ASSERT(MUTEX_HELD(page_vnode_mutex(vp)));
3508
3509 /*
3510 * First, take pp off of its hash chain.
3511 */
3512 hpp = &page_hash[PAGE_HASH_FUNC(vp, pp->p_offset)];
3513
3514 for (;;) {
3515 hp = *hpp;
3516 if (hp == pp)
3517 break;
3518 if (hp == NULL) {
3519 panic("page_do_hashout");
3520 /*NOTREACHED*/
3521 }
3522 hpp = &hp->p_hash;
3523 }
3524 *hpp = pp->p_hash;
3525
3526 /*
3527 * Now remove it from its associated vnode.
3528 */
3529 if (vp->v_pages)
3530 page_vpsub(&vp->v_pages, pp);
3531
3532 pp->p_hash = NULL;
3533 page_clr_all_props(pp);
3534 PP_CLRSWAP(pp);
3535 pp->p_vnode = NULL;
3536 pp->p_offset = (u_offset_t)-1;
3537 pp->p_fsdata = 0;
3538 }
3539
3540 /*
3541 * Remove page ``pp'' from the hash and vp chains and remove vp association.
3542 *
3543 * When `phm' is non-NULL it contains the address of the mutex protecting the
3544 * hash list pp is on. It is not dropped.
3545 */
3546 void
3547 page_hashout(page_t *pp, kmutex_t *phm)
3548 {
3549 vnode_t *vp;
3550 ulong_t index;
3551 kmutex_t *nphm;
3552 kmutex_t *vphm;
3553 kmutex_t *sep;
3554
3555 ASSERT(phm != NULL ? MUTEX_HELD(phm) : 1);
3556 ASSERT(pp->p_vnode != NULL);
3557 ASSERT((PAGE_EXCL(pp) && !page_iolock_assert(pp)) || panicstr);
3558 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(pp->p_vnode)));
3559
3560 vp = pp->p_vnode;
3561
3562 TRACE_2(TR_FAC_VM, TR_PAGE_HASHOUT,
3563 "page_hashout:pp %p vp %p", pp, vp);
3564
3565 /* Kernel probe */
3566 TNF_PROBE_2(page_unmap, "vm pagefault", /* CSTYLED */,
3567 tnf_opaque, vnode, vp,
3568 tnf_offset, offset, pp->p_offset);
3569
3573 VM_STAT_ADD(hashout_count);
3574 index = PAGE_HASH_FUNC(vp, pp->p_offset);
3575 if (phm == NULL) {
3576 VM_STAT_ADD(hashout_not_held);
3577 nphm = PAGE_HASH_MUTEX(index);
3578 mutex_enter(nphm);
3579 }
3580 ASSERT(phm ? phm == PAGE_HASH_MUTEX(index) : 1);
3581
3582
3583 /*
3584 * grab page vnode mutex and remove it...
3585 */
3586 vphm = page_vnode_mutex(vp);
3587 mutex_enter(vphm);
3588
3589 page_do_hashout(pp);
3590
3591 mutex_exit(vphm);
3592 if (phm == NULL)
3593 mutex_exit(nphm);
3594
3595 /*
3596 * Wake up processes waiting for this page. The page's
3597 * identity has been changed, and is probably not the
3598 * desired page any longer.
3599 */
3600 sep = page_se_mutex(pp);
3601 mutex_enter(sep);
3602 pp->p_selock &= ~SE_EWANTED;
3603 if (CV_HAS_WAITERS(&pp->p_cv))
3604 cv_broadcast(&pp->p_cv);
3605 mutex_exit(sep);
3606 }
3607
3608 /*
3609 * Add the page to the front of a linked list of pages
3610 * using the p_next & p_prev pointers for the list.
3611 * The caller is responsible for protecting the list pointers.
3612 */
3613 void
3614 page_add(page_t **ppp, page_t *pp)
3615 {
3616 ASSERT(PAGE_EXCL(pp) || (PAGE_SHARED(pp) && page_iolock_assert(pp)));
3617
3618 page_add_common(ppp, pp);
3619 }
3620
3621
3622
3623 /*
3624 * Common code for page_add() and mach_page_add()
3625 */
3626 void
3627 page_add_common(page_t **ppp, page_t *pp)
3628 {
3629 if (*ppp == NULL) {
3630 pp->p_next = pp->p_prev = pp;
3631 } else {
3632 pp->p_next = *ppp;
3633 pp->p_prev = (*ppp)->p_prev;
3634 (*ppp)->p_prev = pp;
3635 pp->p_prev->p_next = pp;
3636 }
3637 *ppp = pp;
3638 }
3639
3640
3641 /*
3642 * Remove this page from a linked list of pages
3643 * using the p_next & p_prev pointers for the list.
3644 *
3645 * The caller is responsible for protecting the list pointers.
3646 */
3647 void
3648 page_sub(page_t **ppp, page_t *pp)
3649 {
3650 ASSERT((PP_ISFREE(pp)) ? 1 :
3651 (PAGE_EXCL(pp)) || (PAGE_SHARED(pp) && page_iolock_assert(pp)));
3652
3653 if (*ppp == NULL || pp == NULL) {
3654 panic("page_sub: bad arg(s): pp %p, *ppp %p",
3655 (void *)pp, (void *)(*ppp));
3656 /*NOTREACHED*/
3657 }
3658
3659 page_sub_common(ppp, pp);
3660 }
3661
3662
3663 /*
3664 * Common code for page_sub() and mach_page_sub()
3665 */
3666 void
3667 page_sub_common(page_t **ppp, page_t *pp)
3668 {
3669 if (*ppp == pp)
3670 *ppp = pp->p_next; /* go to next page */
3671
3672 if (*ppp == pp)
3673 *ppp = NULL; /* page list is gone */
3674 else {
3675 pp->p_prev->p_next = pp->p_next;
3676 pp->p_next->p_prev = pp->p_prev;
3677 }
3678 pp->p_prev = pp->p_next = pp; /* make pp a list of one */
3679 }
3680
3681
3682 /*
3683 * Break page list cppp into two lists with npages in the first list.
3684 * The tail is returned in nppp.
3685 */
3686 void
3687 page_list_break(page_t **oppp, page_t **nppp, pgcnt_t npages)
3688 {
3689 page_t *s1pp = *oppp;
3690 page_t *s2pp;
3691 page_t *e1pp, *e2pp;
3692 long n = 0;
3693
3694 if (s1pp == NULL) {
3695 *nppp = NULL;
3696 return;
3697 }
3698 if (npages == 0) {
3699 *nppp = s1pp;
3700 *oppp = NULL;
3701 return;
3702 }
3703 for (n = 0, s2pp = *oppp; n < npages; n++) {
3704 s2pp = s2pp->p_next;
3705 }
3706 /* Fix head and tail of new lists */
3707 e1pp = s2pp->p_prev;
3708 e2pp = s1pp->p_prev;
3709 s1pp->p_prev = e1pp;
3710 e1pp->p_next = s1pp;
3711 s2pp->p_prev = e2pp;
3712 e2pp->p_next = s2pp;
3713
3714 /* second list empty */
3715 if (s2pp == s1pp) {
3716 *oppp = s1pp;
3717 *nppp = NULL;
3718 } else {
3719 *oppp = s1pp;
3720 *nppp = s2pp;
3721 }
3722 }
3723
3724 /*
3725 * Concatenate page list nppp onto the end of list ppp.
3726 */
3727 void
3728 page_list_concat(page_t **ppp, page_t **nppp)
3729 {
3730 page_t *s1pp, *s2pp, *e1pp, *e2pp;
3731
3732 if (*nppp == NULL) {
3733 return;
3734 }
3735 if (*ppp == NULL) {
3736 *ppp = *nppp;
3737 return;
3738 }
3739 s1pp = *ppp;
3740 e1pp = s1pp->p_prev;
3741 s2pp = *nppp;
3742 e2pp = s2pp->p_prev;
3743 s1pp->p_prev = e2pp;
3744 e2pp->p_next = s1pp;
3745 e1pp->p_next = s2pp;
3746 s2pp->p_prev = e1pp;
3747 }
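
/*
 * Illustrative sketch of the list primitives above (hypothetical code,
 * not part of this file); the pages involved are assumed to be held with
 * the appropriate (exclusive) locks:
 *
 *	page_t *list = NULL, *tail = NULL;
 *
 *	page_add(&list, pp);
 *	page_list_break(&list, &tail, n);
 *	page_list_concat(&other, &tail);
 *
 * page_add() makes pp the new head of "list", page_list_break() leaves
 * the first n pages on "list" and returns the remainder in "tail", and
 * page_list_concat() then splices that remainder onto "other".  "pp",
 * "n" and "other" are placeholders.
 */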
3748
3749 /*
3750 * return the next page in the page list
3751 */
3752 page_t *
3753 page_list_next(page_t *pp)
3754 {
3755 return (pp->p_next);
3756 }
3757
3758
3759 /*
3760 * Add the page to the front of the linked list of pages
3761 * using p_vpnext/p_vpprev pointers for the list.
3762 *
3763 * The caller is responsible for protecting the lists.
3764 */
3765 void
3766 page_vpadd(page_t **ppp, page_t *pp)
3767 {
3768 if (*ppp == NULL) {
3769 pp->p_vpnext = pp->p_vpprev = pp;
3770 } else {
3771 pp->p_vpnext = *ppp;
3772 pp->p_vpprev = (*ppp)->p_vpprev;
3773 (*ppp)->p_vpprev = pp;
3774 pp->p_vpprev->p_vpnext = pp;
3775 }
3776 *ppp = pp;
3777 }
3778
3779 /*
3780 * Remove this page from the linked list of pages
3781 * using p_vpnext/p_vpprev pointers for the list.
3782 *
3783 * The caller is responsible for protecting the lists.
3784 */
3785 void
3786 page_vpsub(page_t **ppp, page_t *pp)
3787 {
3788 if (*ppp == NULL || pp == NULL) {
3789 panic("page_vpsub: bad arg(s): pp %p, *ppp %p",
3790 (void *)pp, (void *)(*ppp));
3791 /*NOTREACHED*/
3792 }
3793
3794 if (*ppp == pp)
3795 *ppp = pp->p_vpnext; /* go to next page */
3796
3797 if (*ppp == pp)
3798 *ppp = NULL; /* page list is gone */
3799 else {
3800 pp->p_vpprev->p_vpnext = pp->p_vpnext;
3801 pp->p_vpnext->p_vpprev = pp->p_vpprev;
3802 }
3803 pp->p_vpprev = pp->p_vpnext = pp; /* make pp a list of one */
3804 }
3805
3806 /*
3807 * Lock a physical page into memory "long term". Used to support "lock
3808 * in memory" functions. Accepts the page to be locked, and a cow variable
3809  * to indicate whether the lock will travel to the new page during
3810 * a potential copy-on-write.
3811 */
3812 int
3813 page_pp_lock(
3814 page_t *pp, /* page to be locked */
3815 int cow, /* cow lock */
3816 int kernel) /* must succeed -- ignore checking */
3817 {
3818 int r = 0; /* result -- assume failure */
3819
3820 ASSERT(PAGE_LOCKED(pp));
3821
3822 page_struct_lock(pp);
3823 /*
3824 * Acquire the "freemem_lock" for availrmem.
3825 */
3826 if (cow) {
3827 mutex_enter(&freemem_lock);
3828 if ((availrmem > pages_pp_maximum) &&
3829 (pp->p_cowcnt < (ushort_t)PAGE_LOCK_MAXIMUM)) {
3830 availrmem--;
3831 pages_locked++;
3832 mutex_exit(&freemem_lock);
3833 r = 1;
3834 if (++pp->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
3835 cmn_err(CE_WARN,
3836 "COW lock limit reached on pfn 0x%lx",
3837 page_pptonum(pp));
3838 }
3839 } else
3840 mutex_exit(&freemem_lock);
3841 } else {
3842 if (pp->p_lckcnt) {
3843 if (pp->p_lckcnt < (ushort_t)PAGE_LOCK_MAXIMUM) {
3844 r = 1;
3845 if (++pp->p_lckcnt ==
3846 (ushort_t)PAGE_LOCK_MAXIMUM) {
3847 cmn_err(CE_WARN, "Page lock limit "
3848 "reached on pfn 0x%lx",
3849 page_pptonum(pp));
3850 }
3851 }
3852 } else {
3853 if (kernel) {
3854 /* availrmem accounting done by caller */
3855 ++pp->p_lckcnt;
3856 r = 1;
3857 } else {
3858 mutex_enter(&freemem_lock);
3859 if (availrmem > pages_pp_maximum) {
3860 availrmem--;
3861 pages_locked++;
3862 ++pp->p_lckcnt;
3863 r = 1;
3864 }
3865 mutex_exit(&freemem_lock);
3866 }
3867 }
3868 }
3869 page_struct_unlock(pp);
3870 return (r);
3871 }
3872
3873 /*
3874 * Decommit a lock on a physical page frame. Account for cow locks if
3875 * appropriate.
3876 */
3877 void
3878 page_pp_unlock(
3879 page_t *pp, /* page to be unlocked */
3880 int cow, /* expect cow lock */
3881 int kernel) /* this was a kernel lock */
3882 {
3883 ASSERT(PAGE_LOCKED(pp));
3884
3885 page_struct_lock(pp);
3886 /*
3887 * Acquire the "freemem_lock" for availrmem.
3888 	 * If cowcnt or lckcnt is already 0, do nothing; i.e., we
3889 * could be called to unlock even if nothing is locked. This could
3890 * happen if locked file pages were truncated (removing the lock)
3891 * and the file was grown again and new pages faulted in; the new
3892 * pages are unlocked but the segment still thinks they're locked.
3893 */
3894 if (cow) {
3895 if (pp->p_cowcnt) {
3896 mutex_enter(&freemem_lock);
3897 pp->p_cowcnt--;
3898 availrmem++;
3899 pages_locked--;
3900 mutex_exit(&freemem_lock);
3901 }
3902 } else {
3903 if (pp->p_lckcnt && --pp->p_lckcnt == 0) {
3904 if (!kernel) {
3905 mutex_enter(&freemem_lock);
3906 availrmem++;
3907 pages_locked--;
3908 mutex_exit(&freemem_lock);
3909 }
3910 }
3911 }
3912 page_struct_unlock(pp);
3913 }
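
/*
 * Illustrative sketch of the pairing expected by page_pp_lock() and
 * page_pp_unlock() (hypothetical code, not part of this file), e.g. when
 * honoring an mlock()-style request on a user page:
 *
 *	if (page_pp_lock(pp, 0, 0) == 0)
 *		return (EAGAIN);
 *	...
 *	page_pp_unlock(pp, 0, 0);
 *
 * The same cow/kernel arguments must be used on both sides so that the
 * availrmem accounting balances.
 */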
3914
3915 /*
3916 * This routine reserves availrmem for npages;
3917 * flags: KM_NOSLEEP or KM_SLEEP
3918 * returns 1 on success or 0 on failure
3919 */
3920 int
3921 page_resv(pgcnt_t npages, uint_t flags)
3922 {
3923 mutex_enter(&freemem_lock);
3924 while (availrmem < tune.t_minarmem + npages) {
3925 if (flags & KM_NOSLEEP) {
3926 mutex_exit(&freemem_lock);
3927 return (0);
3928 }
3929 mutex_exit(&freemem_lock);
3930 page_needfree(npages);
3931 kmem_reap();
3932 delay(hz >> 2);
3933 page_needfree(-(spgcnt_t)npages);
3934 mutex_enter(&freemem_lock);
3935 }
3936 availrmem -= npages;
3937 mutex_exit(&freemem_lock);
3938 return (1);
3939 }
3940
3941 /*
3942 * This routine unreserves availrmem for npages;
3943 */
3944 void
3945 page_unresv(pgcnt_t npages)
3946 {
3947 mutex_enter(&freemem_lock);
3948 availrmem += npages;
3949 mutex_exit(&freemem_lock);
3950 }
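
/*
 * Illustrative sketch of the reservation pattern above (hypothetical
 * code, not part of this file): reserve availrmem up front, fail the
 * request if the reservation cannot be made, and return it when the
 * memory is released:
 *
 *	if (page_resv(npages, KM_NOSLEEP) == 0)
 *		return (ENOMEM);
 *	...
 *	page_unresv(npages);
 */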
3951
3952 /*
3953 * See Statement at the beginning of segvn_lockop() regarding
3954 * the way we handle cowcnts and lckcnts.
3955 *
3956 * Transfer cowcnt on 'opp' to cowcnt on 'npp' if the vpage
3957 * that breaks COW has PROT_WRITE.
3958 *
3959  * Note that we may also break COW when we are softlocking
3960 * on read access during physio;
3961 * in this softlock case, the vpage may not have PROT_WRITE.
3962 * So, we need to transfer lckcnt on 'opp' to lckcnt on 'npp'
3963 * if the vpage doesn't have PROT_WRITE.
3964 *
3965 * This routine is never called if we are stealing a page
3966 * in anon_private.
3967 *
3968 * The caller subtracted from availrmem for read only mapping.
3969 * if lckcnt is 1 increment availrmem.
3970 */
3971 void
3972 page_pp_useclaim(
3973 page_t *opp, /* original page frame losing lock */
3974 page_t *npp, /* new page frame gaining lock */
3975 uint_t write_perm) /* set if vpage has PROT_WRITE */
3976 {
3977 int payback = 0;
3978 int nidx, oidx;
3979
3980 ASSERT(PAGE_LOCKED(opp));
3981 ASSERT(PAGE_LOCKED(npp));
3982
3983 /*
3984 * Since we have two pages we probably have two locks. We need to take
3985 * them in a defined order to avoid deadlocks. It's also possible they
3986 * both hash to the same lock in which case this is a non-issue.
3987 */
3988 nidx = PAGE_LLOCK_HASH(PP_PAGEROOT(npp));
3989 oidx = PAGE_LLOCK_HASH(PP_PAGEROOT(opp));
3990 if (nidx < oidx) {
3991 page_struct_lock(npp);
3992 page_struct_lock(opp);
3993 } else if (oidx < nidx) {
3994 page_struct_lock(opp);
3995 page_struct_lock(npp);
3996 } else { /* The pages hash to the same lock */
3997 page_struct_lock(npp);
3998 }
3999
4000 ASSERT(npp->p_cowcnt == 0);
4001 ASSERT(npp->p_lckcnt == 0);
4002
4003 /* Don't use claim if nothing is locked (see page_pp_unlock above) */
4004 if ((write_perm && opp->p_cowcnt != 0) ||
4005 (!write_perm && opp->p_lckcnt != 0)) {
4006
4007 if (write_perm) {
4008 npp->p_cowcnt++;
4009 ASSERT(opp->p_cowcnt != 0);
4010 opp->p_cowcnt--;
4011 } else {
4012
4013 ASSERT(opp->p_lckcnt != 0);
4014
4015 /*
4016 			 * We didn't need availrmem decremented if p_lckcnt on the
4017 			 * original page is 1. Here, we are unlocking the
4018 			 * read-only copy belonging to the original page and
4019 			 * locking a copy belonging to the new page.
4020 */
4021 if (opp->p_lckcnt == 1)
4022 payback = 1;
4023
4024 npp->p_lckcnt++;
4025 opp->p_lckcnt--;
4026 }
4027 }
4028 if (payback) {
4029 mutex_enter(&freemem_lock);
4030 availrmem++;
4031 pages_useclaim--;
4032 mutex_exit(&freemem_lock);
4033 }
4034
4035 if (nidx < oidx) {
4036 page_struct_unlock(opp);
4037 page_struct_unlock(npp);
4038 } else if (oidx < nidx) {
4039 page_struct_unlock(npp);
4040 page_struct_unlock(opp);
4041 } else { /* The pages hash to the same lock */
4042 page_struct_unlock(npp);
4043 }
4044 }
4045
4046 /*
4047 * Simple claim adjust functions -- used to support changes in
4048 * claims due to changes in access permissions. Used by segvn_setprot().
4049 */
4050 int
4051 page_addclaim(page_t *pp)
4052 {
4053 int r = 0; /* result */
4054
4055 ASSERT(PAGE_LOCKED(pp));
4056
4057 page_struct_lock(pp);
4058 ASSERT(pp->p_lckcnt != 0);
4059
4060 if (pp->p_lckcnt == 1) {
4061 if (pp->p_cowcnt < (ushort_t)PAGE_LOCK_MAXIMUM) {
4062 --pp->p_lckcnt;
4063 r = 1;
4064 if (++pp->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
4065 cmn_err(CE_WARN,
4066 "COW lock limit reached on pfn 0x%lx",
4067 page_pptonum(pp));
4068 }
4069 }
4070 } else {
4071 mutex_enter(&freemem_lock);
4072 if ((availrmem > pages_pp_maximum) &&
4073 (pp->p_cowcnt < (ushort_t)PAGE_LOCK_MAXIMUM)) {
4074 --availrmem;
4075 ++pages_claimed;
4076 mutex_exit(&freemem_lock);
4077 --pp->p_lckcnt;
4078 r = 1;
4079 if (++pp->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
4080 cmn_err(CE_WARN,
4081 "COW lock limit reached on pfn 0x%lx",
4082 page_pptonum(pp));
4083 }
4084 } else
4085 mutex_exit(&freemem_lock);
4086 }
4087 page_struct_unlock(pp);
4088 return (r);
4089 }
4090
4091 int
4092 page_subclaim(page_t *pp)
4093 {
4094 int r = 0;
4095
4096 ASSERT(PAGE_LOCKED(pp));
4097
4098 page_struct_lock(pp);
4099 ASSERT(pp->p_cowcnt != 0);
4100
4101 if (pp->p_lckcnt) {
4102 if (pp->p_lckcnt < (ushort_t)PAGE_LOCK_MAXIMUM) {
4103 r = 1;
4104 /*
4105 * for availrmem
4106 */
4107 mutex_enter(&freemem_lock);
4108 availrmem++;
4109 pages_claimed--;
4110 mutex_exit(&freemem_lock);
4111
4112 pp->p_cowcnt--;
4113
4114 if (++pp->p_lckcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
4115 cmn_err(CE_WARN,
4116 "Page lock limit reached on pfn 0x%lx",
4117 page_pptonum(pp));
4118 }
4119 }
4120 } else {
4121 r = 1;
4122 pp->p_cowcnt--;
4123 pp->p_lckcnt++;
4124 }
4125 page_struct_unlock(pp);
4126 return (r);
4127 }
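
/*
 * Illustrative sketch (hypothetical, modeled on the segvn_setprot() usage
 * mentioned above): converting an existing page lock into a COW claim when a
 * mapping gains PROT_WRITE, and converting it back when write permission is
 * removed. "becomes_writable" is a stand-in for the caller's own test.
 *
 *	if (becomes_writable)
 *		ret = page_addclaim(pp);
 *	else
 *		ret = page_subclaim(pp);
 *	if (ret == 0)
 *		return (EAGAIN);	(claim adjustment failed)
 */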
4128
4129 /*
4130 * Variant of page_addclaim(), where ppa[] contains the pages of a single large
4131 * page.
4132 */
4133 int
4134 page_addclaim_pages(page_t **ppa)
4135 {
4136 pgcnt_t lckpgs = 0, pg_idx;
4137
4138 VM_STAT_ADD(pagecnt.pc_addclaim_pages);
4139
4140 /*
4141 * Only need to take the page struct lock on the large page root.
4142 */
4143 page_struct_lock(ppa[0]);
4144 for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) {
4145
4146 ASSERT(PAGE_LOCKED(ppa[pg_idx]));
4147 ASSERT(ppa[pg_idx]->p_lckcnt != 0);
4148 if (ppa[pg_idx]->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
4149 page_struct_unlock(ppa[0]);
4150 return (0);
4151 }
4152 if (ppa[pg_idx]->p_lckcnt > 1)
4153 lckpgs++;
4154 }
4155
4156 if (lckpgs != 0) {
4157 mutex_enter(&freemem_lock);
4158 if (availrmem >= pages_pp_maximum + lckpgs) {
4159 availrmem -= lckpgs;
4160 pages_claimed += lckpgs;
4161 } else {
4162 mutex_exit(&freemem_lock);
4163 page_struct_unlock(ppa[0]);
4164 return (0);
4165 }
4166 mutex_exit(&freemem_lock);
4167 }
4168
4169 for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) {
4170 ppa[pg_idx]->p_lckcnt--;
4171 ppa[pg_idx]->p_cowcnt++;
4172 }
4173 page_struct_unlock(ppa[0]);
4174 return (1);
4175 }
4176
4177 /*
4178 * Variant of page_subclaim(), where ppa[] contains the pages of a single large
4179 * page.
4180 */
4181 int
4182 page_subclaim_pages(page_t **ppa)
4183 {
4184 pgcnt_t ulckpgs = 0, pg_idx;
4185
4186 VM_STAT_ADD(pagecnt.pc_subclaim_pages);
4187
4188 /*
4189 * Only need to take the page struct lock on the large page root.
4190 */
4191 page_struct_lock(ppa[0]);
4192 for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) {
4193
4194 ASSERT(PAGE_LOCKED(ppa[pg_idx]));
4195 ASSERT(ppa[pg_idx]->p_cowcnt != 0);
4196 if (ppa[pg_idx]->p_lckcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
4197 page_struct_unlock(ppa[0]);
4198 return (0);
4199 }
4200 if (ppa[pg_idx]->p_lckcnt != 0)
4201 ulckpgs++;
4202 }
4203
4204 if (ulckpgs != 0) {
4205 mutex_enter(&freemem_lock);
4206 availrmem += ulckpgs;
4207 pages_claimed -= ulckpgs;
4208 mutex_exit(&freemem_lock);
4209 }
4210
4211 for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) {
4212 ppa[pg_idx]->p_cowcnt--;
4213 ppa[pg_idx]->p_lckcnt++;
4214
4215 }
4216 page_struct_unlock(ppa[0]);
4217 return (1);
4218 }
4219
4220 page_t *
4221 page_numtopp(pfn_t pfnum, se_t se)
4222 {
4223 page_t *pp;
4224
4225 retry:
4226 pp = page_numtopp_nolock(pfnum);
4227 if (pp == NULL) {
4228 return ((page_t *)NULL);
4229 }
4230
4231 /*
4232 * Acquire the appropriate lock on the page.
4233 */
4234 while (!page_lock(pp, se, (kmutex_t *)NULL, P_RECLAIM)) {
4235 if (page_pptonum(pp) != pfnum)
4236 goto retry;
4237 continue;
4238 }
4239
4240 if (page_pptonum(pp) != pfnum) {
4241 page_unlock(pp);
4242 goto retry;
4243 }
4244
4245 return (pp);
4246 }
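
/*
 * Illustrative sketch (hypothetical caller): translating a pfn into a locked
 * page_t and releasing it again. page_numtopp() returns NULL when no page
 * structure exists for the frame, so the result must be checked.
 *
 *	page_t *pp = page_numtopp(pfn, SE_SHARED);
 *	if (pp == NULL)
 *		return (ENXIO);
 *	... inspect the page ...
 *	page_unlock(pp);
 */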
4247
4248 page_t *
4249 page_numtopp_noreclaim(pfn_t pfnum, se_t se)
4250 {
4251 page_t *pp;
4252
4253 retry:
4254 pp = page_numtopp_nolock(pfnum);
4255 if (pp == NULL) {
4256 return ((page_t *)NULL);
4257 }
4258
4259 /*
4260 * Acquire the appropriate lock on the page.
4261 */
4262 while (!page_lock(pp, se, (kmutex_t *)NULL, P_NO_RECLAIM)) {
4263 if (page_pptonum(pp) != pfnum)
4264 goto retry;
4265 continue;
4266 }
4267
4268 if (page_pptonum(pp) != pfnum) {
4269 page_unlock(pp);
4270 goto retry;
4271 }
4272
4273 return (pp);
4274 }
4275
4276 /*
4277 * This routine is like page_numtopp, but will only return page structs
4278 * for pages which are ok for loading into hardware using the page struct.
4279 */
4280 page_t *
4281 page_numtopp_nowait(pfn_t pfnum, se_t se)
4282 {
4283 page_t *pp;
4284
4285 retry:
4286 pp = page_numtopp_nolock(pfnum);
4287 if (pp == NULL) {
4288 return ((page_t *)NULL);
4289 }
4290
4291 /*
4292 * Try to acquire the appropriate lock on the page.
4293 */
4294 if (PP_ISFREE(pp))
4295 pp = NULL;
4296 else {
4297 if (!page_trylock(pp, se))
4298 pp = NULL;
4299 else {
4300 if (page_pptonum(pp) != pfnum) {
4301 page_unlock(pp);
4302 goto retry;
4303 }
4304 if (PP_ISFREE(pp)) {
4305 page_unlock(pp);
4306 pp = NULL;
4307 }
4308 }
4309 }
4310 return (pp);
4311 }
4312
4313 #define SYNC_PROGRESS_NPAGES 1000
4314
4315 /*
4316 * Returns a count of dirty pages that are in the process
4317 * of being written out. If 'cleanit' is set, try to push the page.
4318 */
4319 pgcnt_t
4320 page_busy(int cleanit)
4321 {
4322 page_t *page0 = page_first();
4323 page_t *pp = page0;
4324 pgcnt_t nppbusy = 0;
4325 int counter = 0;
4326 u_offset_t off;
4327
4328 do {
4329 vnode_t *vp = pp->p_vnode;
4330
4331 /*
4332 * Reset the sync timeout. The page list is very long
4333 * on large memory systems.
4334 */
4335 if (++counter > SYNC_PROGRESS_NPAGES) {
4336 counter = 0;
4337 vfs_syncprogress();
4338 }
4339
4340 /*
4341 * A page is a candidate for syncing if it is:
4342 *
4343 * (a) On neither the freelist nor the cachelist
4344 * (b) Hashed onto a vnode
4345 * (c) Not a kernel page
4346 * (d) Dirty
4347 * (e) Not part of a swapfile
4348 	 * (f) Owned by a real vnode, i.e. one with a non-null
4349 * v_vfsp pointer.
4350 * (g) Backed by a filesystem which doesn't have a
4351 * stubbed-out sync operation
4352 */
4353 if (!PP_ISFREE(pp) && vp != NULL && !VN_ISKAS(vp) &&
4354 hat_ismod(pp) && !IS_SWAPVP(vp) && vp->v_vfsp != NULL &&
4355 vfs_can_sync(vp->v_vfsp)) {
4356 nppbusy++;
4357
4358 if (!cleanit)
4359 continue;
4360 if (!page_trylock(pp, SE_EXCL))
4361 continue;
4362
4363 if (PP_ISFREE(pp) || vp == NULL || IS_SWAPVP(vp) ||
4364 pp->p_lckcnt != 0 || pp->p_cowcnt != 0 ||
4365 !(hat_pagesync(pp,
4366 HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD) & P_MOD)) {
4367 page_unlock(pp);
4368 continue;
4369 }
4370 off = pp->p_offset;
4371 VN_HOLD(vp);
4372 page_unlock(pp);
4373 (void) VOP_PUTPAGE(vp, off, PAGESIZE,
4374 B_ASYNC | B_FREE, kcred, NULL);
4375 VN_RELE(vp);
4376 }
4377 } while ((pp = page_next(pp)) != page0);
4378
4379 vfs_syncprogress();
4380 return (nppbusy);
4381 }
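
/*
 * Illustrative sketch (hypothetical sync path): kick off asynchronous pushes
 * of dirty pages, then poll until the busy count drains or a retry budget is
 * exhausted. "tries" is a stand-in for the caller's own limit.
 *
 *	(void) page_busy(1);
 *	while (page_busy(0) != 0 && --tries != 0)
 *		delay(hz);
 */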
4382
4383 void page_invalidate_pages(void);
4384
4385 /*
4386  * Callback handler to the VM subsystem.
4387  *
4388  * Callers must ensure there are no recursive entries to this function.
4389 */
4390 /*ARGSUSED*/
4391 boolean_t
4392 callb_vm_cpr(void *arg, int code)
4393 {
4394 if (code == CB_CODE_CPR_CHKPT)
4395 page_invalidate_pages();
4396 return (B_TRUE);
4397 }
4398
4399 /*
4400 * Invalidate all pages of the system.
4401  * It shouldn't be called until all user page activity has stopped.
4402 */
4403 void
4404 page_invalidate_pages()
4405 {
4406 page_t *pp;
4407 page_t *page0;
4408 pgcnt_t nbusypages;
4409 int retry = 0;
4410 const int MAXRETRIES = 4;
4411 top:
4412 /*
4413 * Flush dirty pages and destroy the clean ones.
4414 */
4415 nbusypages = 0;
4416
4417 pp = page0 = page_first();
4418 do {
4419 struct vnode *vp;
4420 u_offset_t offset;
4421 int mod;
4422
4423 /*
4424 		 * Skip the page if it has no vnode, or if it is associated
4425 		 * with the kernel vnode or with prom-allocated kernel memory.
4426 */
4427 if ((vp = pp->p_vnode) == NULL || VN_ISKAS(vp))
4428 continue;
4429
4430 /*
4431 		 * Skip the page if it has already been freed and invalidated.
4432 */
4433 if (PP_ISFREE(pp) && PP_ISAGED(pp))
4434 continue;
4435
4436 /*
4437 * skip pages that are already locked or can't be "exclusively"
4438 * locked or are already free. After we lock the page, check
4439 * the free and age bits again to be sure it's not destroyed
4440 * yet.
4441 * To achieve max. parallelization, we use page_trylock instead
4442 		 * of page_lock so that we don't get blocked on individual pages
4443 * while we have thousands of other pages to process.
4444 */
4445 if (!page_trylock(pp, SE_EXCL)) {
4446 nbusypages++;
4447 continue;
4448 } else if (PP_ISFREE(pp)) {
4449 if (!PP_ISAGED(pp)) {
4450 page_destroy_free(pp);
4451 } else {
4452 page_unlock(pp);
4453 }
4454 continue;
4455 }
4456 /*
4457 * Is this page involved in some I/O? shared?
4458 *
4459 * The page_struct_lock need not be acquired to
4460 * examine these fields since the page has an
4461 * "exclusive" lock.
4462 */
4463 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
4464 page_unlock(pp);
4465 continue;
4466 }
4467
4468 if (vp->v_type == VCHR) {
4469 panic("vp->v_type == VCHR");
4470 /*NOTREACHED*/
4471 }
4472
4473 if (!page_try_demote_pages(pp)) {
4474 page_unlock(pp);
4475 continue;
4476 }
4477
4478 /*
4479 * Check the modified bit. Leave the bits alone in hardware
4480 * (they will be modified if we do the putpage).
4481 */
4482 mod = (hat_pagesync(pp, HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD)
4483 & P_MOD);
4484 if (mod) {
4485 offset = pp->p_offset;
4486 /*
4487 * Hold the vnode before releasing the page lock
4488 * to prevent it from being freed and re-used by
4489 * some other thread.
4490 */
4491 VN_HOLD(vp);
4492 page_unlock(pp);
4493 /*
4494 * No error return is checked here. Callers such as
4495 			 * cpr deal with the dirty pages at dump time
4496 * if this putpage fails.
4497 */
4498 (void) VOP_PUTPAGE(vp, offset, PAGESIZE, B_INVAL,
4499 kcred, NULL);
4500 VN_RELE(vp);
4501 } else {
4502 /*LINTED: constant in conditional context*/
4503 VN_DISPOSE(pp, B_INVAL, 0, kcred);
4504 }
4505 } while ((pp = page_next(pp)) != page0);
4506 if (nbusypages && retry++ < MAXRETRIES) {
4507 delay(1);
4508 goto top;
4509 }
4510 }
4511
4512 /*
4513 * Replace the page "old" with the page "new" on the page hash and vnode lists
4514 *
4515  * The replacement must be done in place, i.e. the equivalent sequence:
4516 *
4517 * vp = old->p_vnode;
4518 * off = old->p_offset;
4519 * page_do_hashout(old)
4520 * page_do_hashin(new, vp, off)
4521 *
4522 * doesn't work, since
4523 * 1) if old is the only page on the vnode, the v_pages list has a window
4524 * where it looks empty. This will break file system assumptions.
4525 * and
4526 * 2) pvn_vplist_dirty() can't deal with pages moving on the v_pages list.
4527 */
4528 static void
4529 page_do_relocate_hash(page_t *new, page_t *old)
4530 {
4531 page_t **hash_list;
4532 vnode_t *vp = old->p_vnode;
4533 kmutex_t *sep;
4534
4535 ASSERT(PAGE_EXCL(old));
4536 ASSERT(PAGE_EXCL(new));
4537 ASSERT(vp != NULL);
4538 ASSERT(MUTEX_HELD(page_vnode_mutex(vp)));
4539 ASSERT(MUTEX_HELD(PAGE_HASH_MUTEX(PAGE_HASH_FUNC(vp, old->p_offset))));
4540
4541 /*
4542 * First find old page on the page hash list
4543 */
4544 hash_list = &page_hash[PAGE_HASH_FUNC(vp, old->p_offset)];
4545
4546 for (;;) {
4547 if (*hash_list == old)
4548 break;
4549 if (*hash_list == NULL) {
4550 panic("page_do_hashout");
4551 /*NOTREACHED*/
4552 }
4553 hash_list = &(*hash_list)->p_hash;
4554 }
4555
4556 /*
4557 * update new and replace old with new on the page hash list
4558 */
4559 new->p_vnode = old->p_vnode;
4560 new->p_offset = old->p_offset;
4561 new->p_hash = old->p_hash;
4562 *hash_list = new;
4563
4564 if ((new->p_vnode->v_flag & VISSWAP) != 0)
4565 PP_SETSWAP(new);
4566
4567 /*
4568 * replace old with new on the vnode's page list
4569 */
4570 if (old->p_vpnext == old) {
4571 new->p_vpnext = new;
4572 new->p_vpprev = new;
4573 } else {
4574 new->p_vpnext = old->p_vpnext;
4575 new->p_vpprev = old->p_vpprev;
4576 new->p_vpnext->p_vpprev = new;
4577 new->p_vpprev->p_vpnext = new;
4578 }
4579 if (vp->v_pages == old)
4580 vp->v_pages = new;
4581
4582 /*
4583 * clear out the old page
4584 */
4585 old->p_hash = NULL;
4586 old->p_vpnext = NULL;
4587 old->p_vpprev = NULL;
4588 old->p_vnode = NULL;
4589 PP_CLRSWAP(old);
4590 old->p_offset = (u_offset_t)-1;
4591 page_clr_all_props(old);
4592
4593 /*
4594 * Wake up processes waiting for this page. The page's
4595 * identity has been changed, and is probably not the
4596 * desired page any longer.
4597 */
4598 sep = page_se_mutex(old);
4599 mutex_enter(sep);
4600 old->p_selock &= ~SE_EWANTED;
4601 if (CV_HAS_WAITERS(&old->p_cv))
4602 cv_broadcast(&old->p_cv);
4603 mutex_exit(sep);
4604 }
4605
4606 /*
4607 * This function moves the identity of page "pp_old" to page "pp_new".
4608 * Both pages must be locked on entry. "pp_new" is free, has no identity,
4609 * and need not be hashed out from anywhere.
4610 */
4611 void
4612 page_relocate_hash(page_t *pp_new, page_t *pp_old)
4613 {
4614 vnode_t *vp = pp_old->p_vnode;
4615 u_offset_t off = pp_old->p_offset;
4616 kmutex_t *phm, *vphm;
4617
4618 /*
4619 * Rehash two pages
4620 */
4621 ASSERT(PAGE_EXCL(pp_old));
4622 ASSERT(PAGE_EXCL(pp_new));
4623 ASSERT(vp != NULL);
4624 ASSERT(pp_new->p_vnode == NULL);
4625
4626 /*
4627 * hashout then hashin while holding the mutexes
4628 */
4629 phm = PAGE_HASH_MUTEX(PAGE_HASH_FUNC(vp, off));
4630 mutex_enter(phm);
4631 vphm = page_vnode_mutex(vp);
4632 mutex_enter(vphm);
4633
4634 page_do_relocate_hash(pp_new, pp_old);
4635
4636 /* The following comment preserved from page_flip(). */
4637 pp_new->p_fsdata = pp_old->p_fsdata;
4638 pp_old->p_fsdata = 0;
4639 mutex_exit(vphm);
4640 mutex_exit(phm);
4641
4642 /*
4643 * The page_struct_lock need not be acquired for lckcnt and
4644 * cowcnt since the page has an "exclusive" lock.
4645 */
4646 ASSERT(pp_new->p_lckcnt == 0);
4647 ASSERT(pp_new->p_cowcnt == 0);
4648 pp_new->p_lckcnt = pp_old->p_lckcnt;
4649 pp_new->p_cowcnt = pp_old->p_cowcnt;
4650 pp_old->p_lckcnt = pp_old->p_cowcnt = 0;
4651
4652 }
4653
4654 /*
4655 * Helper routine used to lock all remaining members of a
4656 * large page. The caller is responsible for passing in a locked
4657 * pp. If pp is a large page, then it succeeds in locking all the
4658 * remaining constituent pages or it returns with only the
4659 * original page locked.
4660 *
4661 * Returns 1 on success, 0 on failure.
4662 *
4663 * If success is returned this routine guarantees p_szc for all constituent
4664 * pages of a large page pp belongs to can't change. To achieve this we
4665 * recheck szc of pp after locking all constituent pages and retry if szc
4666 * changed (it could only decrease). Since hat_page_demote() needs an EXCL
4667 * lock on one of constituent pages it can't be running after all constituent
4668 * pages are locked. hat_page_demote() with a lock on a constituent page
4669 * outside of this large page (i.e. pp belonged to a larger large page) is
4670 * already done with all constituent pages of pp since the root's p_szc is
4671 * changed last. Therefore no need to synchronize with hat_page_demote() that
4672 * locked a constituent page outside of pp's current large page.
4673 */
4674 #ifdef DEBUG
4675 uint32_t gpg_trylock_mtbf = 0;
4676 #endif
4677
4678 int
4679 group_page_trylock(page_t *pp, se_t se)
4680 {
4681 page_t *tpp;
4682 pgcnt_t npgs, i, j;
4683 uint_t pszc = pp->p_szc;
4684
4685 #ifdef DEBUG
4686 if (gpg_trylock_mtbf && !(gethrtime() % gpg_trylock_mtbf)) {
4687 return (0);
4688 }
4689 #endif
4690
4691 if (pp != PP_GROUPLEADER(pp, pszc)) {
4692 return (0);
4693 }
4694
4695 retry:
4696 ASSERT(PAGE_LOCKED_SE(pp, se));
4697 ASSERT(!PP_ISFREE(pp));
4698 if (pszc == 0) {
4699 return (1);
4700 }
4701 npgs = page_get_pagecnt(pszc);
4702 tpp = pp + 1;
4703 for (i = 1; i < npgs; i++, tpp++) {
4704 if (!page_trylock(tpp, se)) {
4705 tpp = pp + 1;
4706 for (j = 1; j < i; j++, tpp++) {
4707 page_unlock(tpp);
4708 }
4709 return (0);
4710 }
4711 }
4712 if (pp->p_szc != pszc) {
4713 ASSERT(pp->p_szc < pszc);
4714 ASSERT(pp->p_vnode != NULL && !PP_ISKAS(pp) &&
4715 !IS_SWAPFSVP(pp->p_vnode));
4716 tpp = pp + 1;
4717 for (i = 1; i < npgs; i++, tpp++) {
4718 page_unlock(tpp);
4719 }
4720 pszc = pp->p_szc;
4721 goto retry;
4722 }
4723 return (1);
4724 }
4725
4726 void
4727 group_page_unlock(page_t *pp)
4728 {
4729 page_t *tpp;
4730 pgcnt_t npgs, i;
4731
4732 ASSERT(PAGE_LOCKED(pp));
4733 ASSERT(!PP_ISFREE(pp));
4734 ASSERT(pp == PP_PAGEROOT(pp));
4735 npgs = page_get_pagecnt(pp->p_szc);
4736 for (i = 1, tpp = pp + 1; i < npgs; i++, tpp++) {
4737 page_unlock(tpp);
4738 }
4739 }
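
/*
 * Illustrative sketch (hypothetical caller): locking every constituent of a
 * large page before operating on it, the way do_page_relocate() below does.
 * The caller must already hold pp SE_EXCL and pp must be the group leader.
 * Note that group_page_unlock() drops only the constituents other than pp;
 * the original lock on pp is released separately.
 *
 *	if (!group_page_trylock(pp, SE_EXCL))
 *		return (EBUSY);
 *	... operate on the whole large page ...
 *	group_page_unlock(pp);
 *	page_unlock(pp);
 */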
4740
4741 /*
4742 * returns
4743 * 0 : on success and *nrelocp is number of relocated PAGESIZE pages
4744 * ERANGE : this is not a base page
4745 * EBUSY : failure to get locks on the page/pages
4746 * ENOMEM : failure to obtain replacement pages
4747 * EAGAIN : OBP has not yet completed its boot-time handoff to the kernel
4748 * EIO : An error occurred while trying to copy the page data
4749 *
4750 * Return with all constituent members of target and replacement
4751  * SE_EXCL locked. It is the caller's responsibility to drop the
4752 * locks.
4753 */
4754 int
4755 do_page_relocate(
4756 page_t **target,
4757 page_t **replacement,
4758 int grouplock,
4759 spgcnt_t *nrelocp,
4760 lgrp_t *lgrp)
4761 {
4762 page_t *first_repl;
4763 page_t *repl;
4764 page_t *targ;
4765 page_t *pl = NULL;
4766 uint_t ppattr;
4767 pfn_t pfn, repl_pfn;
4768 uint_t szc;
4769 spgcnt_t npgs, i;
4770 int repl_contig = 0;
4771 uint_t flags = 0;
4772 spgcnt_t dofree = 0;
4773
4774 *nrelocp = 0;
4775
4776 #if defined(__sparc)
4777 /*
4778 * We need to wait till OBP has completed
4779 * its boot-time handoff of its resources to the kernel
4780 * before we allow page relocation
4781 */
4782 if (page_relocate_ready == 0) {
4783 return (EAGAIN);
4784 }
4785 #endif
4786
4787 /*
4788 * If this is not a base page,
4789 * just return with 0x0 pages relocated.
4790 */
4791 targ = *target;
4792 ASSERT(PAGE_EXCL(targ));
4793 ASSERT(!PP_ISFREE(targ));
4794 szc = targ->p_szc;
4795 ASSERT(szc < mmu_page_sizes);
4796 VM_STAT_ADD(vmm_vmstats.ppr_reloc[szc]);
4797 pfn = targ->p_pagenum;
4798 if (pfn != PFN_BASE(pfn, szc)) {
4799 VM_STAT_ADD(vmm_vmstats.ppr_relocnoroot[szc]);
4800 return (ERANGE);
4801 }
4802
4803 if ((repl = *replacement) != NULL && repl->p_szc >= szc) {
4804 repl_pfn = repl->p_pagenum;
4805 if (repl_pfn != PFN_BASE(repl_pfn, szc)) {
4806 VM_STAT_ADD(vmm_vmstats.ppr_reloc_replnoroot[szc]);
4807 return (ERANGE);
4808 }
4809 repl_contig = 1;
4810 }
4811
4812 /*
4813 * We must lock all members of this large page or we cannot
4814 * relocate any part of it.
4815 */
4816 if (grouplock != 0 && !group_page_trylock(targ, SE_EXCL)) {
4817 VM_STAT_ADD(vmm_vmstats.ppr_relocnolock[targ->p_szc]);
4818 return (EBUSY);
4819 }
4820
4821 /*
4822 	 * Reread szc; it could have been decreased before
4823 * group_page_trylock() was done.
4824 */
4825 szc = targ->p_szc;
4826 ASSERT(szc < mmu_page_sizes);
4827 VM_STAT_ADD(vmm_vmstats.ppr_reloc[szc]);
4828 ASSERT(pfn == PFN_BASE(pfn, szc));
4829
4830 npgs = page_get_pagecnt(targ->p_szc);
4831
4832 if (repl == NULL) {
4833 dofree = npgs; /* Size of target page in MMU pages */
4834 if (!page_create_wait(dofree, 0)) {
4835 if (grouplock != 0) {
4836 group_page_unlock(targ);
4837 }
4838 VM_STAT_ADD(vmm_vmstats.ppr_relocnomem[szc]);
4839 return (ENOMEM);
4840 }
4841
4842 /*
4843 * seg kmem pages require that the target and replacement
4844 * page be the same pagesize.
4845 */
4846 flags = (VN_ISKAS(targ->p_vnode)) ? PGR_SAMESZC : 0;
4847 repl = page_get_replacement_page(targ, lgrp, flags);
4848 if (repl == NULL) {
4849 if (grouplock != 0) {
4850 group_page_unlock(targ);
4851 }
4852 page_create_putback(dofree);
4853 VM_STAT_ADD(vmm_vmstats.ppr_relocnomem[szc]);
4854 return (ENOMEM);
4855 }
4856 }
4857 #ifdef DEBUG
4858 else {
4859 ASSERT(PAGE_LOCKED(repl));
4860 }
4861 #endif /* DEBUG */
4862
4863 #if defined(__sparc)
4864 /*
4865 	 * Let hat_page_relocate() complete the relocation if it's a kernel page
4866 */
4867 if (VN_ISKAS(targ->p_vnode)) {
4868 *replacement = repl;
4869 if (hat_page_relocate(target, replacement, nrelocp) != 0) {
4870 if (grouplock != 0) {
4871 group_page_unlock(targ);
4872 }
4873 if (dofree) {
4874 *replacement = NULL;
4875 page_free_replacement_page(repl);
4876 page_create_putback(dofree);
4877 }
4878 VM_STAT_ADD(vmm_vmstats.ppr_krelocfail[szc]);
4879 return (EAGAIN);
4880 }
4881 VM_STAT_ADD(vmm_vmstats.ppr_relocok[szc]);
4882 return (0);
4883 }
4884 #else
4885 #if defined(lint)
4886 dofree = dofree;
4887 #endif
4888 #endif
4889
4890 first_repl = repl;
4891
4892 for (i = 0; i < npgs; i++) {
4893 ASSERT(PAGE_EXCL(targ));
4894 ASSERT(targ->p_slckcnt == 0);
4895 ASSERT(repl->p_slckcnt == 0);
4896
4897 (void) hat_pageunload(targ, HAT_FORCE_PGUNLOAD);
4898
4899 ASSERT(hat_page_getshare(targ) == 0);
4900 ASSERT(!PP_ISFREE(targ));
4901 ASSERT(targ->p_pagenum == (pfn + i));
4902 ASSERT(repl_contig == 0 ||
4903 repl->p_pagenum == (repl_pfn + i));
4904
4905 /*
4906 * Copy the page contents and attributes then
4907 * relocate the page in the page hash.
4908 */
4909 if (ppcopy(targ, repl) == 0) {
4910 targ = *target;
4911 repl = first_repl;
4912 VM_STAT_ADD(vmm_vmstats.ppr_copyfail);
4913 if (grouplock != 0) {
4914 group_page_unlock(targ);
4915 }
4916 if (dofree) {
4917 *replacement = NULL;
4918 page_free_replacement_page(repl);
4919 page_create_putback(dofree);
4920 }
4921 return (EIO);
4922 }
4923
4924 targ++;
4925 if (repl_contig != 0) {
4926 repl++;
4927 } else {
4928 repl = repl->p_next;
4929 }
4930 }
4931
4932 repl = first_repl;
4933 targ = *target;
4934
4935 for (i = 0; i < npgs; i++) {
4936 ppattr = hat_page_getattr(targ, (P_MOD | P_REF | P_RO));
4937 page_clr_all_props(repl);
4938 page_set_props(repl, ppattr);
4939 page_relocate_hash(repl, targ);
4940
4941 ASSERT(hat_page_getshare(targ) == 0);
4942 ASSERT(hat_page_getshare(repl) == 0);
4943 /*
4944 * Now clear the props on targ, after the
4945 * page_relocate_hash(), they no longer
4946 * have any meaning.
4947 */
4948 page_clr_all_props(targ);
4949 ASSERT(targ->p_next == targ);
4950 ASSERT(targ->p_prev == targ);
4951 page_list_concat(&pl, &targ);
4952
4953 targ++;
4954 if (repl_contig != 0) {
4955 repl++;
4956 } else {
4957 repl = repl->p_next;
4958 }
4959 }
4960 /* assert that we have come full circle with repl */
4961 ASSERT(repl_contig == 1 || first_repl == repl);
4962
4963 *target = pl;
4964 if (*replacement == NULL) {
4965 ASSERT(first_repl == repl);
4966 *replacement = repl;
4967 }
4968 VM_STAT_ADD(vmm_vmstats.ppr_relocok[szc]);
4969 *nrelocp = npgs;
4970 return (0);
4971 }
4972 /*
4973 * On success returns 0 and *nrelocp the number of PAGESIZE pages relocated.
4974 */
4975 int
4976 page_relocate(
4977 page_t **target,
4978 page_t **replacement,
4979 int grouplock,
4980 int freetarget,
4981 spgcnt_t *nrelocp,
4982 lgrp_t *lgrp)
4983 {
4984 spgcnt_t ret;
4985
4986 /* do_page_relocate returns 0 on success or errno value */
4987 ret = do_page_relocate(target, replacement, grouplock, nrelocp, lgrp);
4988
4989 if (ret != 0 || freetarget == 0) {
4990 return (ret);
4991 }
4992 if (*nrelocp == 1) {
4993 ASSERT(*target != NULL);
4994 page_free(*target, 1);
4995 } else {
4996 page_t *tpp = *target;
4997 uint_t szc = tpp->p_szc;
4998 pgcnt_t npgs = page_get_pagecnt(szc);
4999 ASSERT(npgs > 1);
5000 ASSERT(szc != 0);
5001 do {
5002 ASSERT(PAGE_EXCL(tpp));
5003 ASSERT(!hat_page_is_mapped(tpp));
5004 ASSERT(tpp->p_szc == szc);
5005 PP_SETFREE(tpp);
5006 PP_SETAGED(tpp);
5007 npgs--;
5008 } while ((tpp = tpp->p_next) != *target);
5009 ASSERT(npgs == 0);
5010 page_list_add_pages(*target, 0);
5011 npgs = page_get_pagecnt(szc);
5012 page_create_putback(npgs);
5013 }
5014 return (ret);
5015 }
5016
5017 /*
5018 * it is up to the caller to deal with pcf accounting.
5019 */
5020 void
5021 page_free_replacement_page(page_t *pplist)
5022 {
5023 page_t *pp;
5024
5025 while (pplist != NULL) {
5026 /*
5027 		 * pplist is a linked list of replacement pages.
5028 */
5029 pp = pplist;
5030 if (pp->p_szc == 0) {
5031 page_sub(&pplist, pp);
5032 page_clr_all_props(pp);
5033 PP_SETFREE(pp);
5034 PP_SETAGED(pp);
5035 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
5036 page_unlock(pp);
5037 VM_STAT_ADD(pagecnt.pc_free_replacement_page[0]);
5038 } else {
5039 spgcnt_t curnpgs = page_get_pagecnt(pp->p_szc);
5040 page_t *tpp;
5041 page_list_break(&pp, &pplist, curnpgs);
5042 tpp = pp;
5043 do {
5044 ASSERT(PAGE_EXCL(tpp));
5045 ASSERT(!hat_page_is_mapped(tpp));
5046 page_clr_all_props(tpp);
5047 PP_SETFREE(tpp);
5048 PP_SETAGED(tpp);
5049 } while ((tpp = tpp->p_next) != pp);
5050 page_list_add_pages(pp, 0);
5051 VM_STAT_ADD(pagecnt.pc_free_replacement_page[1]);
5052 }
5053 }
5054 }
5055
5056 /*
5057 * Relocate target to non-relocatable replacement page.
5058 */
5059 int
5060 page_relocate_cage(page_t **target, page_t **replacement)
5061 {
5062 page_t *tpp, *rpp;
5063 spgcnt_t pgcnt, npgs;
5064 int result;
5065
5066 tpp = *target;
5067
5068 ASSERT(PAGE_EXCL(tpp));
5069 ASSERT(tpp->p_szc == 0);
5070
5071 pgcnt = btop(page_get_pagesize(tpp->p_szc));
5072
5073 do {
5074 (void) page_create_wait(pgcnt, PG_WAIT | PG_NORELOC);
5075 rpp = page_get_replacement_page(tpp, NULL, PGR_NORELOC);
5076 if (rpp == NULL) {
5077 page_create_putback(pgcnt);
5078 kcage_cageout_wakeup();
5079 }
5080 } while (rpp == NULL);
5081
5082 ASSERT(PP_ISNORELOC(rpp));
5083
5084 result = page_relocate(&tpp, &rpp, 0, 1, &npgs, NULL);
5085
5086 if (result == 0) {
5087 *replacement = rpp;
5088 if (pgcnt != npgs)
5089 panic("page_relocate_cage: partial relocation");
5090 }
5091
5092 return (result);
5093 }
5094
5095 /*
5096 * Release the page lock on a page, place on cachelist
5097 * tail if no longer mapped. Caller can let us know if
5098 * the page is known to be clean.
5099 */
5100 int
5101 page_release(page_t *pp, int checkmod)
5102 {
5103 int status;
5104
5105 ASSERT(PAGE_LOCKED(pp) && !PP_ISFREE(pp) &&
5106 (pp->p_vnode != NULL));
5107
5108 if (!hat_page_is_mapped(pp) && !IS_SWAPVP(pp->p_vnode) &&
5109 ((PAGE_SHARED(pp) && page_tryupgrade(pp)) || PAGE_EXCL(pp)) &&
5110 pp->p_lckcnt == 0 && pp->p_cowcnt == 0 &&
5111 !hat_page_is_mapped(pp)) {
5112
5113 /*
5114 * If page is modified, unlock it
5115 *
5116 * (p_nrm & P_MOD) bit has the latest stuff because:
5117 * (1) We found that this page doesn't have any mappings
5118 * _after_ holding SE_EXCL and
5119 * (2) We didn't drop SE_EXCL lock after the check in (1)
5120 */
5121 if (checkmod && hat_ismod(pp)) {
5122 page_unlock(pp);
5123 status = PGREL_MOD;
5124 } else {
5125 /*LINTED: constant in conditional context*/
5126 VN_DISPOSE(pp, B_FREE, 0, kcred);
5127 status = PGREL_CLEAN;
5128 }
5129 } else {
5130 page_unlock(pp);
5131 status = PGREL_NOTREL;
5132 }
5133 return (status);
5134 }
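
/*
 * Illustrative sketch (hypothetical caller): disposing of a page after i/o
 * completes, using the status codes returned above. In every case the page
 * lock has been dropped by the time page_release() returns.
 *
 *	switch (page_release(pp, 1)) {
 *	case PGREL_CLEAN:
 *		break;		(page was freed onto the cachelist)
 *	case PGREL_MOD:
 *	case PGREL_NOTREL:
 *		break;		(page is still live; lock already dropped)
 *	}
 */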
5135
5136 /*
5137 * Given a constituent page, try to demote the large page on the freelist.
5138 *
5139 * Returns nonzero if the page could be demoted successfully. Returns with
5140 * the constituent page still locked.
5141 */
5142 int
5143 page_try_demote_free_pages(page_t *pp)
5144 {
5145 page_t *rootpp = pp;
5146 pfn_t pfn = page_pptonum(pp);
5147 spgcnt_t npgs;
5148 uint_t szc = pp->p_szc;
5149
5150 ASSERT(PP_ISFREE(pp));
5151 ASSERT(PAGE_EXCL(pp));
5152
5153 /*
5154 * Adjust rootpp and lock it, if `pp' is not the base
5155 * constituent page.
5156 */
5157 npgs = page_get_pagecnt(pp->p_szc);
5158 if (npgs == 1) {
5159 return (0);
5160 }
5161
5162 if (!IS_P2ALIGNED(pfn, npgs)) {
5163 pfn = P2ALIGN(pfn, npgs);
5164 rootpp = page_numtopp_nolock(pfn);
5165 }
5166
5167 if (pp != rootpp && !page_trylock(rootpp, SE_EXCL)) {
5168 return (0);
5169 }
5170
5171 if (rootpp->p_szc != szc) {
5172 if (pp != rootpp)
5173 page_unlock(rootpp);
5174 return (0);
5175 }
5176
5177 page_demote_free_pages(rootpp);
5178
5179 if (pp != rootpp)
5180 page_unlock(rootpp);
5181
5182 ASSERT(PP_ISFREE(pp));
5183 ASSERT(PAGE_EXCL(pp));
5184 return (1);
5185 }
5186
5187 /*
5188 * Given a constituent page, try to demote the large page.
5189 *
5190 * Returns nonzero if the page could be demoted successfully. Returns with
5191 * the constituent page still locked.
5192 */
5193 int
5194 page_try_demote_pages(page_t *pp)
5195 {
5196 page_t *tpp, *rootpp = pp;
5197 pfn_t pfn = page_pptonum(pp);
5198 spgcnt_t i, npgs;
5199 uint_t szc = pp->p_szc;
5200 vnode_t *vp = pp->p_vnode;
5201
5202 ASSERT(PAGE_EXCL(pp));
5203
5204 VM_STAT_ADD(pagecnt.pc_try_demote_pages[0]);
5205
5206 if (pp->p_szc == 0) {
5207 VM_STAT_ADD(pagecnt.pc_try_demote_pages[1]);
5208 return (1);
5209 }
5210
5211 if (vp != NULL && !IS_SWAPFSVP(vp) && !VN_ISKAS(vp)) {
5212 VM_STAT_ADD(pagecnt.pc_try_demote_pages[2]);
5213 page_demote_vp_pages(pp);
5214 ASSERT(pp->p_szc == 0);
5215 return (1);
5216 }
5217
5218 /*
5219 * Adjust rootpp if passed in is not the base
5220 * constituent page.
5221 */
5222 npgs = page_get_pagecnt(pp->p_szc);
5223 ASSERT(npgs > 1);
5224 if (!IS_P2ALIGNED(pfn, npgs)) {
5225 pfn = P2ALIGN(pfn, npgs);
5226 rootpp = page_numtopp_nolock(pfn);
5227 VM_STAT_ADD(pagecnt.pc_try_demote_pages[3]);
5228 ASSERT(rootpp->p_vnode != NULL);
5229 ASSERT(rootpp->p_szc == szc);
5230 }
5231
5232 /*
5233 * We can't demote kernel pages since we can't hat_unload()
5234 * the mappings.
5235 */
5236 if (VN_ISKAS(rootpp->p_vnode))
5237 return (0);
5238
5239 /*
5240 * Attempt to lock all constituent pages except the page passed
5241 * in since it's already locked.
5242 */
5243 for (tpp = rootpp, i = 0; i < npgs; i++, tpp++) {
5244 ASSERT(!PP_ISFREE(tpp));
5245 ASSERT(tpp->p_vnode != NULL);
5246
5247 if (tpp != pp && !page_trylock(tpp, SE_EXCL))
5248 break;
5249 ASSERT(tpp->p_szc == rootpp->p_szc);
5250 ASSERT(page_pptonum(tpp) == page_pptonum(rootpp) + i);
5251 }
5252
5253 /*
5254 * If we failed to lock them all then unlock what we have
5255 * locked so far and bail.
5256 */
5257 if (i < npgs) {
5258 tpp = rootpp;
5259 while (i-- > 0) {
5260 if (tpp != pp)
5261 page_unlock(tpp);
5262 tpp++;
5263 }
5264 VM_STAT_ADD(pagecnt.pc_try_demote_pages[4]);
5265 return (0);
5266 }
5267
5268 for (tpp = rootpp, i = 0; i < npgs; i++, tpp++) {
5269 ASSERT(PAGE_EXCL(tpp));
5270 ASSERT(tpp->p_slckcnt == 0);
5271 (void) hat_pageunload(tpp, HAT_FORCE_PGUNLOAD);
5272 tpp->p_szc = 0;
5273 }
5274
5275 /*
5276 * Unlock all pages except the page passed in.
5277 */
5278 for (tpp = rootpp, i = 0; i < npgs; i++, tpp++) {
5279 ASSERT(!hat_page_is_mapped(tpp));
5280 if (tpp != pp)
5281 page_unlock(tpp);
5282 }
5283
5284 VM_STAT_ADD(pagecnt.pc_try_demote_pages[5]);
5285 return (1);
5286 }
5287
5288 /*
5289 * Called by page_free() and page_destroy() to demote the page size code
5290 * (p_szc) to 0 (since we can't just put a single PAGESIZE page with non zero
5291 * p_szc on free list, neither can we just clear p_szc of a single page_t
5292 * within a large page since it will break other code that relies on p_szc
5293 * being the same for all page_t's of a large page). Anonymous pages should
5294 * never end up here because anon_map_getpages() cannot deal with p_szc
5295 * changes after a single constituent page is locked. While anonymous or
5296 * kernel large pages are demoted or freed the entire large page at a time
5297  * kernel large pages are demoted or freed an entire large page at a time,
5298  * with all constituent pages locked EXCL, for file system pages we
5299  * have to be able to demote a large page (i.e. decrease all constituent pages'
5300  * p_szc) with only an EXCL lock on one of the constituent pages. The reason
5301  * we can easily deal with anonymous page demotion an entire large page at a
5302  * time is that those operations originate at the address space level and concern
5303 * not shared with any other processes (therefore we can always get EXCL lock
5304 * on all anonymous constituent pages after clearing segment page
5305 * cache). However file system pages can be truncated or invalidated at a
5306 * PAGESIZE level from the file system side and end up in page_free() or
5307 * page_destroy() (we also allow only part of the large page to be SOFTLOCKed
5308 * and therefore pageout should be able to demote a large page by EXCL locking
5309 * any constituent page that is not under SOFTLOCK). In those cases we cannot
5310 * rely on being able to lock EXCL all constituent pages.
5311 *
5312 * To prevent szc changes on file system pages one has to lock all constituent
5313 * pages at least SHARED (or call page_szc_lock()). The only subsystem that
5314 * doesn't rely on locking all constituent pages (or using page_szc_lock()) to
5315 * prevent szc changes is hat layer that uses its own page level mlist
5316 * locks. hat assumes that szc doesn't change after mlist lock for a page is
5317 * taken. Therefore we need to change szc under hat level locks if we only
5318 * have an EXCL lock on a single constituent page and hat still references any
5319 * of constituent pages. (Note we can't "ignore" hat layer by simply
5320 * hat_pageunload() all constituent pages without having EXCL locks on all of
5321 * constituent pages). We use hat_page_demote() call to safely demote szc of
5322 * all constituent pages under hat locks when we only have an EXCL lock on one
5323 * of constituent pages.
5324 *
5325 * This routine calls page_szc_lock() before calling hat_page_demote() to
5326 * allow segvn in one special case not to lock all constituent pages SHARED
5327 * before calling hat_memload_array() that relies on p_szc not changing even
5328 * before hat level mlist lock is taken. In that case segvn uses
5329 * page_szc_lock() to prevent hat_page_demote() changing p_szc values.
5330 *
5331 * Anonymous or kernel page demotion still has to lock all pages exclusively
5332 * and do hat_pageunload() on all constituent pages before demoting the page
5333 * therefore there's no need for anonymous or kernel page demotion to use
5334 * hat_page_demote() mechanism.
5335 *
5336 * hat_page_demote() removes all large mappings that map pp and then decreases
5337 * p_szc starting from the last constituent page of the large page. By working
5338  * p_szc starting from the last constituent page of the large page. Working
5339  * from the tail of a large page in decreasing pfn order allows one looking at
5340 * e.g. if a root page has szc 1 one knows it only has to lock all constituent
5341 * pages within szc 1 area to prevent szc changes because hat_page_demote()
5342 * that started on this page when it had szc > 1 is done for this szc 1 area.
5343 *
5344 * We are guaranteed that all constituent pages of pp's large page belong to
5345 * the same vnode with the consecutive offsets increasing in the direction of
5346 * the pfn i.e. the identity of constituent pages can't change until their
5347 * p_szc is decreased. Therefore it's safe for hat_page_demote() to remove
5348 * large mappings to pp even though we don't lock any constituent page except
5349 * pp (i.e. we won't unload e.g. kernel locked page).
5350 */
5351 static void
5352 page_demote_vp_pages(page_t *pp)
5353 {
5354 kmutex_t *mtx;
5355
5356 ASSERT(PAGE_EXCL(pp));
5357 ASSERT(!PP_ISFREE(pp));
5358 ASSERT(pp->p_vnode != NULL);
5359 ASSERT(!IS_SWAPFSVP(pp->p_vnode));
5360 ASSERT(!PP_ISKAS(pp));
5361
5362 VM_STAT_ADD(pagecnt.pc_demote_pages[0]);
5363
5364 mtx = page_szc_lock(pp);
5365 if (mtx != NULL) {
5366 hat_page_demote(pp);
5367 mutex_exit(mtx);
5368 }
5369 ASSERT(pp->p_szc == 0);
5370 }
5371
5372 /*
5373 * Mark any existing pages for migration in the given range
5374 */
5375 void
5376 page_mark_migrate(struct seg *seg, caddr_t addr, size_t len,
5377 struct anon_map *amp, ulong_t anon_index, vnode_t *vp,
5378 u_offset_t vnoff, int rflag)
5379 {
5380 struct anon *ap;
5381 vnode_t *curvp;
5382 lgrp_t *from;
5383 pgcnt_t nlocked;
5384 u_offset_t off;
5385 pfn_t pfn;
5386 size_t pgsz;
5387 size_t segpgsz;
5388 pgcnt_t pages;
5389 uint_t pszc;
5390 page_t *pp0, *pp;
5391 caddr_t va;
5392 ulong_t an_idx;
5393 anon_sync_obj_t cookie;
5394
5395 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
5396
5397 /*
5398 	 * Don't do anything if we don't need to do lgroup optimizations
5399 * on this system
5400 */
5401 if (!lgrp_optimizations())
5402 return;
5403
5404 /*
5405 * Align address and length to (potentially large) page boundary
5406 */
5407 segpgsz = page_get_pagesize(seg->s_szc);
5408 addr = (caddr_t)P2ALIGN((uintptr_t)addr, segpgsz);
5409 if (rflag)
5410 len = P2ROUNDUP(len, segpgsz);
5411
5412 /*
5413 * Do one (large) page at a time
5414 */
5415 va = addr;
5416 while (va < addr + len) {
5417 /*
5418 * Lookup (root) page for vnode and offset corresponding to
5419 * this virtual address
5420 * Try anonmap first since there may be copy-on-write
5421 * pages, but initialize vnode pointer and offset using
5422 * vnode arguments just in case there isn't an amp.
5423 */
5424 curvp = vp;
5425 off = vnoff + va - seg->s_base;
5426 if (amp) {
5427 ANON_LOCK_ENTER(&->a_rwlock, RW_READER);
5428 an_idx = anon_index + seg_page(seg, va);
5429 anon_array_enter(amp, an_idx, &cookie);
5430 ap = anon_get_ptr(amp->ahp, an_idx);
5431 if (ap)
5432 swap_xlate(ap, &curvp, &off);
5433 anon_array_exit(&cookie);
5434 ANON_LOCK_EXIT(&->a_rwlock);
5435 }
5436
5437 pp = NULL;
5438 if (curvp)
5439 pp = page_lookup(curvp, off, SE_SHARED);
5440
5441 /*
5442 * If there isn't a page at this virtual address,
5443 * skip to next page
5444 */
5445 if (pp == NULL) {
5446 va += PAGESIZE;
5447 continue;
5448 }
5449
5450 /*
5451 * Figure out which lgroup this page is in for kstats
5452 */
5453 pfn = page_pptonum(pp);
5454 from = lgrp_pfn_to_lgrp(pfn);
5455
5456 /*
5457 * Get page size, and round up and skip to next page boundary
5458 * if unaligned address
5459 */
5460 pszc = pp->p_szc;
5461 pgsz = page_get_pagesize(pszc);
5462 pages = btop(pgsz);
5463 if (!IS_P2ALIGNED(va, pgsz) ||
5464 !IS_P2ALIGNED(pfn, pages) ||
5465 pgsz > segpgsz) {
5466 pgsz = MIN(pgsz, segpgsz);
5467 page_unlock(pp);
5468 pages = btop(P2END((uintptr_t)va, pgsz) -
5469 (uintptr_t)va);
5470 va = (caddr_t)P2END((uintptr_t)va, pgsz);
5471 lgrp_stat_add(from->lgrp_id, LGRP_PMM_FAIL_PGS, pages);
5472 continue;
5473 }
5474
5475 /*
5476 * Upgrade to exclusive lock on page
5477 */
5478 if (!page_tryupgrade(pp)) {
5479 page_unlock(pp);
5480 va += pgsz;
5481 lgrp_stat_add(from->lgrp_id, LGRP_PMM_FAIL_PGS,
5482 btop(pgsz));
5483 continue;
5484 }
5485
5486 pp0 = pp++;
5487 nlocked = 1;
5488
5489 /*
5490 * Lock constituent pages if this is large page
5491 */
5492 if (pages > 1) {
5493 /*
5494 * Lock all constituents except root page, since it
5495 * should be locked already.
5496 */
5497 for (; nlocked < pages; nlocked++) {
5498 if (!page_trylock(pp, SE_EXCL)) {
5499 break;
5500 }
5501 if (PP_ISFREE(pp) ||
5502 pp->p_szc != pszc) {
5503 /*
5504 * hat_page_demote() raced in with us.
5505 */
5506 ASSERT(!IS_SWAPFSVP(curvp));
5507 page_unlock(pp);
5508 break;
5509 }
5510 pp++;
5511 }
5512 }
5513
5514 /*
5515 * If all constituent pages couldn't be locked,
5516 * unlock pages locked so far and skip to next page.
5517 */
5518 if (nlocked < pages) {
5519 while (pp0 < pp) {
5520 page_unlock(pp0++);
5521 }
5522 va += pgsz;
5523 lgrp_stat_add(from->lgrp_id, LGRP_PMM_FAIL_PGS,
5524 btop(pgsz));
5525 continue;
5526 }
5527
5528 /*
5529 * hat_page_demote() can no longer happen
5530 * since last cons page had the right p_szc after
5531 * all cons pages were locked. all cons pages
5532 * should now have the same p_szc.
5533 */
5534
5535 /*
5536 * All constituent pages locked successfully, so mark
5537 * large page for migration and unload the mappings of
5538 * constituent pages, so a fault will occur on any part of the
5539 * large page
5540 */
5541 PP_SETMIGRATE(pp0);
5542 while (pp0 < pp) {
5543 (void) hat_pageunload(pp0, HAT_FORCE_PGUNLOAD);
5544 ASSERT(hat_page_getshare(pp0) == 0);
5545 page_unlock(pp0++);
5546 }
5547 lgrp_stat_add(from->lgrp_id, LGRP_PMM_PGS, nlocked);
5548
5549 va += pgsz;
5550 }
5551 }
5552
5553 /*
5554 * Migrate any pages that have been marked for migration in the given range
5555 */
5556 void
5557 page_migrate(
5558 struct seg *seg,
5559 caddr_t addr,
5560 page_t **ppa,
5561 pgcnt_t npages)
5562 {
5563 lgrp_t *from;
5564 lgrp_t *to;
5565 page_t *newpp;
5566 page_t *pp;
5567 pfn_t pfn;
5568 size_t pgsz;
5569 spgcnt_t page_cnt;
5570 spgcnt_t i;
5571 uint_t pszc;
5572
5573 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as));
5574
5575 while (npages > 0) {
5576 pp = *ppa;
5577 pszc = pp->p_szc;
5578 pgsz = page_get_pagesize(pszc);
5579 page_cnt = btop(pgsz);
5580
5581 /*
5582 * Check to see whether this page is marked for migration
5583 *
5584 * Assume that root page of large page is marked for
5585 * migration and none of the other constituent pages
5586 * are marked. This really simplifies clearing the
5587 * migrate bit by not having to clear it from each
5588 * constituent page.
5589 *
5590 * note we don't want to relocate an entire large page if
5591 * someone is only using one subpage.
5592 */
5593 if (npages < page_cnt)
5594 break;
5595
5596 /*
5597 * Is it marked for migration?
5598 */
5599 if (!PP_ISMIGRATE(pp))
5600 goto next;
5601
5602 /*
5603 * Determine lgroups that page is being migrated between
5604 */
5605 pfn = page_pptonum(pp);
5606 if (!IS_P2ALIGNED(pfn, page_cnt)) {
5607 break;
5608 }
5609 from = lgrp_pfn_to_lgrp(pfn);
5610 to = lgrp_mem_choose(seg, addr, pgsz);
5611
5612 /*
5613 		 * Need to get exclusive locks to migrate
5614 */
5615 for (i = 0; i < page_cnt; i++) {
5616 ASSERT(PAGE_LOCKED(ppa[i]));
5617 if (page_pptonum(ppa[i]) != pfn + i ||
5618 ppa[i]->p_szc != pszc) {
5619 break;
5620 }
5621 if (!page_tryupgrade(ppa[i])) {
5622 lgrp_stat_add(from->lgrp_id,
5623 LGRP_PM_FAIL_LOCK_PGS,
5624 page_cnt);
5625 break;
5626 }
5627
5628 /*
5629 * Check to see whether we are trying to migrate
5630 * page to lgroup where it is allocated already.
5631 * If so, clear the migrate bit and skip to next
5632 * page.
5633 */
5634 if (i == 0 && to == from) {
5635 PP_CLRMIGRATE(ppa[0]);
5636 page_downgrade(ppa[0]);
5637 goto next;
5638 }
5639 }
5640
5641 /*
5642 * If all constituent pages couldn't be locked,
5643 * unlock pages locked so far and skip to next page.
5644 */
5645 if (i != page_cnt) {
5646 while (--i != -1) {
5647 page_downgrade(ppa[i]);
5648 }
5649 goto next;
5650 }
5651
5652 (void) page_create_wait(page_cnt, PG_WAIT);
5653 newpp = page_get_replacement_page(pp, to, PGR_SAMESZC);
5654 if (newpp == NULL) {
5655 page_create_putback(page_cnt);
5656 for (i = 0; i < page_cnt; i++) {
5657 page_downgrade(ppa[i]);
5658 }
5659 lgrp_stat_add(to->lgrp_id, LGRP_PM_FAIL_ALLOC_PGS,
5660 page_cnt);
5661 goto next;
5662 }
5663 ASSERT(newpp->p_szc == pszc);
5664 /*
5665 * Clear migrate bit and relocate page
5666 */
5667 PP_CLRMIGRATE(pp);
5668 if (page_relocate(&pp, &newpp, 0, 1, &page_cnt, to)) {
5669 panic("page_migrate: page_relocate failed");
5670 }
5671 ASSERT(page_cnt * PAGESIZE == pgsz);
5672
5673 /*
5674 * Keep stats for number of pages migrated from and to
5675 * each lgroup
5676 */
5677 lgrp_stat_add(from->lgrp_id, LGRP_PM_SRC_PGS, page_cnt);
5678 lgrp_stat_add(to->lgrp_id, LGRP_PM_DEST_PGS, page_cnt);
5679 /*
5680 * update the page_t array we were passed in and
5681 * unlink constituent pages of a large page.
5682 */
5683 for (i = 0; i < page_cnt; ++i, ++pp) {
5684 ASSERT(PAGE_EXCL(newpp));
5685 ASSERT(newpp->p_szc == pszc);
5686 ppa[i] = newpp;
5687 pp = newpp;
5688 page_sub(&newpp, pp);
5689 page_downgrade(pp);
5690 }
5691 ASSERT(newpp == NULL);
5692 next:
5693 addr += pgsz;
5694 ppa += page_cnt;
5695 npages -= page_cnt;
5696 }
5697 }
5698
5699 #define MAX_CNT 60 /* max num of iterations */
5700 /*
5701 * Reclaim/reserve availrmem for npages.
5702 * If there is not enough memory start reaping seg, kmem caches.
5703 * Start pageout scanner (via page_needfree()).
5704  * Exit after ~MAX_CNT seconds regardless of how much memory has been released.
5705 * Note: There is no guarantee that any availrmem will be freed as
5706 * this memory typically is locked (kernel heap) or reserved for swap.
5707  * Also, due to memory fragmentation, the kmem allocator may not be able
5708  * to free any memory (a single user-allocated buffer will prevent
5709  * freeing a slab or a page).
5710 */
5711 int
5712 page_reclaim_mem(pgcnt_t npages, pgcnt_t epages, int adjust)
5713 {
5714 int i = 0;
5715 int ret = 0;
5716 pgcnt_t deficit;
5717 pgcnt_t old_availrmem;
5718
5719 mutex_enter(&freemem_lock);
5720 old_availrmem = availrmem - 1;
5721 while ((availrmem < tune.t_minarmem + npages + epages) &&
5722 (old_availrmem < availrmem) && (i++ < MAX_CNT)) {
5723 old_availrmem = availrmem;
5724 deficit = tune.t_minarmem + npages + epages - availrmem;
5725 mutex_exit(&freemem_lock);
5726 page_needfree(deficit);
5727 kmem_reap();
5728 delay(hz);
5729 page_needfree(-(spgcnt_t)deficit);
5730 mutex_enter(&freemem_lock);
5731 }
5732
5733 if (adjust && (availrmem >= tune.t_minarmem + npages + epages)) {
5734 availrmem -= npages;
5735 ret = 1;
5736 }
5737
5738 mutex_exit(&freemem_lock);
5739
5740 return (ret);
5741 }
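
/*
 * Illustrative sketch (hypothetical caller): reserving npages while keeping
 * an extra cushion of epages available, letting the routine above adjust
 * availrmem on success. The adjustment is arithmetically undone later by
 * page_unresv(), which simply adds the pages back to availrmem.
 *
 *	if (page_reclaim_mem(npages, epages, 1) == 0)
 *		return (ENOMEM);
 *	... use the reserved memory ...
 *	page_unresv(npages);
 */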
5742
5743 /*
5744 * Search the memory segments to locate the desired page. Within a
5745 * segment, pages increase linearly with one page structure per
5746 * physical page frame (size PAGESIZE). The search begins
5747 * with the segment that was accessed last, to take advantage of locality.
5748  * If the hint misses, we start from the beginning of the sorted memseg list.
5749 */
5750
5751
5752 /*
5753 * Some data structures for pfn to pp lookup.
5754 */
5755 ulong_t mhash_per_slot;
5756 struct memseg *memseg_hash[N_MEM_SLOTS];
5757
5758 page_t *
5759 page_numtopp_nolock(pfn_t pfnum)
5760 {
5761 struct memseg *seg;
5762 page_t *pp;
5763 vm_cpu_data_t *vc;
5764
5765 /*
5766 * We need to disable kernel preemption while referencing the
5767 * cpu_vm_data field in order to prevent us from being switched to
5768 * another cpu and trying to reference it after it has been freed.
5769 * This will keep us on cpu and prevent it from being removed while
5770 * we are still on it.
5771 *
5772 * We may be caching a memseg in vc_pnum_memseg/vc_pnext_memseg
5773 	 * which is being reused by DR, which will flush those references
5774 * before modifying the reused memseg. See memseg_cpu_vm_flush().
5775 */
5776 kpreempt_disable();
5777 vc = CPU->cpu_vm_data;
5778 ASSERT(vc != NULL);
5779
5780 MEMSEG_STAT_INCR(nsearch);
5781
5782 /* Try last winner first */
5783 if (((seg = vc->vc_pnum_memseg) != NULL) &&
5784 (pfnum >= seg->pages_base) && (pfnum < seg->pages_end)) {
5785 MEMSEG_STAT_INCR(nlastwon);
5786 pp = seg->pages + (pfnum - seg->pages_base);
5787 if (pp->p_pagenum == pfnum) {
5788 kpreempt_enable();
5789 return ((page_t *)pp);
5790 }
5791 }
5792
5793 /* Else Try hash */
5794 if (((seg = memseg_hash[MEMSEG_PFN_HASH(pfnum)]) != NULL) &&
5795 (pfnum >= seg->pages_base) && (pfnum < seg->pages_end)) {
5796 MEMSEG_STAT_INCR(nhashwon);
5797 vc->vc_pnum_memseg = seg;
5798 pp = seg->pages + (pfnum - seg->pages_base);
5799 if (pp->p_pagenum == pfnum) {
5800 kpreempt_enable();
5801 return ((page_t *)pp);
5802 }
5803 }
5804
5805 /* Else Brute force */
5806 for (seg = memsegs; seg != NULL; seg = seg->next) {
5807 if (pfnum >= seg->pages_base && pfnum < seg->pages_end) {
5808 vc->vc_pnum_memseg = seg;
5809 pp = seg->pages + (pfnum - seg->pages_base);
5810 if (pp->p_pagenum == pfnum) {
5811 kpreempt_enable();
5812 return ((page_t *)pp);
5813 }
5814 }
5815 }
5816 vc->vc_pnum_memseg = NULL;
5817 kpreempt_enable();
5818 MEMSEG_STAT_INCR(nnotfound);
5819 return ((page_t *)NULL);
5820
5821 }
5822
5823 struct memseg *
5824 page_numtomemseg_nolock(pfn_t pfnum)
5825 {
5826 struct memseg *seg;
5827 page_t *pp;
5828
5829 /*
5830 * We may be caching a memseg in vc_pnum_memseg/vc_pnext_memseg
5831 	 * which is being reused by DR, which will flush those references
5832 * before modifying the reused memseg. See memseg_cpu_vm_flush().
5833 */
5834 kpreempt_disable();
5835 /* Try hash */
5836 if (((seg = memseg_hash[MEMSEG_PFN_HASH(pfnum)]) != NULL) &&
5837 (pfnum >= seg->pages_base) && (pfnum < seg->pages_end)) {
5838 pp = seg->pages + (pfnum - seg->pages_base);
5839 if (pp->p_pagenum == pfnum) {
5840 kpreempt_enable();
5841 return (seg);
5842 }
5843 }
5844
5845 /* Else Brute force */
5846 for (seg = memsegs; seg != NULL; seg = seg->next) {
5847 if (pfnum >= seg->pages_base && pfnum < seg->pages_end) {
5848 pp = seg->pages + (pfnum - seg->pages_base);
5849 if (pp->p_pagenum == pfnum) {
5850 kpreempt_enable();
5851 return (seg);
5852 }
5853 }
5854 }
5855 kpreempt_enable();
5856 return ((struct memseg *)NULL);
5857 }
5858
5859 /*
5860 * Given a page and a count return the page struct that is
5861 * n structs away from the current one in the global page
5862 * list.
5863 *
5864 * This function wraps to the first page upon
5865 * reaching the end of the memseg list.
5866 */
5867 page_t *
5868 page_nextn(page_t *pp, ulong_t n)
5869 {
5870 struct memseg *seg;
5871 page_t *ppn;
5872 vm_cpu_data_t *vc;
5873
5874 /*
5875 * We need to disable kernel preemption while referencing the
5876 * cpu_vm_data field in order to prevent us from being switched to
5877 * another cpu and trying to reference it after it has been freed.
5878 * This will keep us on cpu and prevent it from being removed while
5879 * we are still on it.
5880 *
5881 * We may be caching a memseg in vc_pnum_memseg/vc_pnext_memseg
5882 	 * which is being reused by DR, which will flush those references
5883 * before modifying the reused memseg. See memseg_cpu_vm_flush().
5884 */
5885 kpreempt_disable();
5886 vc = (vm_cpu_data_t *)CPU->cpu_vm_data;
5887
5888 ASSERT(vc != NULL);
5889
5890 if (((seg = vc->vc_pnext_memseg) == NULL) ||
5891 (seg->pages_base == seg->pages_end) ||
5892 !(pp >= seg->pages && pp < seg->epages)) {
5893
5894 for (seg = memsegs; seg; seg = seg->next) {
5895 if (pp >= seg->pages && pp < seg->epages)
5896 break;
5897 }
5898
5899 if (seg == NULL) {
5900 /* Memory delete got in, return something valid. */
5901 /* TODO: fix me. */
5902 seg = memsegs;
5903 pp = seg->pages;
5904 }
5905 }
5906
5907 /* check for wraparound - possible if n is large */
5908 while ((ppn = (pp + n)) >= seg->epages || ppn < pp) {
5909 n -= seg->epages - pp;
5910 seg = seg->next;
5911 if (seg == NULL)
5912 seg = memsegs;
5913 pp = seg->pages;
5914 }
5915 vc->vc_pnext_memseg = seg;
5916 kpreempt_enable();
5917 return (ppn);
5918 }
5919
5920 /*
5921 * Initialize for a loop using page_next_scan_large().
5922 */
5923 page_t *
5924 page_next_scan_init(void **cookie)
5925 {
5926 ASSERT(cookie != NULL);
5927 *cookie = (void *)memsegs;
5928 return ((page_t *)memsegs->pages);
5929 }
5930
5931 /*
5932 * Return the next page in a scan of page_t's, assuming we want
5933 * to skip over sub-pages within larger page sizes.
5934 *
5935 * The cookie is used to keep track of the current memseg.
5936 */
5937 page_t *
5938 page_next_scan_large(
5939 page_t *pp,
5940 ulong_t *n,
5941 void **cookie)
5942 {
5943 struct memseg *seg = (struct memseg *)*cookie;
5944 page_t *new_pp;
5945 ulong_t cnt;
5946 pfn_t pfn;
5947
5948
5949 /*
5950 * get the count of page_t's to skip based on the page size
5951 */
5952 ASSERT(pp != NULL);
5953 if (pp->p_szc == 0) {
5954 cnt = 1;
5955 } else {
5956 pfn = page_pptonum(pp);
5957 cnt = page_get_pagecnt(pp->p_szc);
5958 cnt -= pfn & (cnt - 1);
5959 }
5960 *n += cnt;
5961 new_pp = pp + cnt;
5962
5963 /*
5964 * Catch if we went past the end of the current memory segment. If so,
5965 * just move to the next segment with pages.
5966 */
5967 if (new_pp >= seg->epages || seg->pages_base == seg->pages_end) {
5968 do {
5969 seg = seg->next;
5970 if (seg == NULL)
5971 seg = memsegs;
5972 } while (seg->pages_base == seg->pages_end);
5973 new_pp = seg->pages;
5974 *cookie = (void *)seg;
5975 }
5976
5977 return (new_pp);
5978 }
5979
5980
5981 /*
5982 * Returns next page in list. Note: this function wraps
5983 * to the first page in the list upon reaching the end
5984 * of the list. Callers should be aware of this fact.
5985 */
5986
5987 /* We should change this to be a #define */
5988
5989 page_t *
5990 page_next(page_t *pp)
5991 {
5992 return (page_nextn(pp, 1));
5993 }
5994
5995 page_t *
5996 page_first()
5997 {
5998 return ((page_t *)memsegs->pages);
5999 }
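
/*
 * Illustrative sketch: walking every page in the system the way page_busy()
 * above does. page_next() wraps back to the first page, so the loop must
 * terminate on returning to its starting point.
 *
 *	page_t *page0 = page_first();
 *	page_t *pp = page0;
 *	do {
 *		... examine pp ...
 *	} while ((pp = page_next(pp)) != page0);
 */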
6000
6001
6002 /*
6003 * This routine is called at boot with the initial memory configuration
6004 * and when memory is added or removed.
6005 */
6006 void
6007 build_pfn_hash()
6008 {
6009 pfn_t cur;
6010 pgcnt_t index;
6011 struct memseg *pseg;
6012 int i;
6013
6014 /*
6015 * Clear memseg_hash array.
6016 * Since memory add/delete is designed to operate concurrently
6017 * with normal operation, the hash rebuild must be able to run
6018 * concurrently with page_numtopp_nolock(). To support this
6019 * functionality, assignments to memseg_hash array members must
6020 * be done atomically.
6021 *
6022 * NOTE: bzero() does not currently guarantee this for kernel
6023 * threads, and cannot be used here.
6024 */
6025 for (i = 0; i < N_MEM_SLOTS; i++)
6026 memseg_hash[i] = NULL;
6027
6028 hat_kpm_mseghash_clear(N_MEM_SLOTS);
6029
6030 /*
6031 * Physmax is the last valid pfn.
6032 */
6033 mhash_per_slot = (physmax + 1) >> MEM_HASH_SHIFT;
6034 for (pseg = memsegs; pseg != NULL; pseg = pseg->next) {
6035 index = MEMSEG_PFN_HASH(pseg->pages_base);
6036 cur = pseg->pages_base;
6037 do {
6038 if (index >= N_MEM_SLOTS)
6039 index = MEMSEG_PFN_HASH(cur);
6040
6041 if (memseg_hash[index] == NULL ||
6042 memseg_hash[index]->pages_base > pseg->pages_base) {
6043 memseg_hash[index] = pseg;
6044 hat_kpm_mseghash_update(index, pseg);
6045 }
6046 cur += mhash_per_slot;
6047 index++;
6048 } while (cur < pseg->pages_end);
6049 }
6050 }
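/*
 * Editorial sketch (not part of the original source) of the lookup this
 * hash accelerates; see page_numtopp_nolock() for the real code.  Start at
 * the hashed slot and fall back to a linear walk of the memsegs when the
 * slot's memseg does not cover the pfn:
 *
 *	struct memseg *seg = memseg_hash[MEMSEG_PFN_HASH(pfnum)];
 *
 *	if (seg != NULL && pfnum >= seg->pages_base &&
 *	    pfnum < seg->pages_end)
 *		return (seg->pages + (pfnum - seg->pages_base));
 *	for (seg = memsegs; seg != NULL; seg = seg->next)
 *		if (pfnum >= seg->pages_base && pfnum < seg->pages_end)
 *			return (seg->pages + (pfnum - seg->pages_base));
 *	return (NULL);
 */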
6051
6052 /*
6053 * Return the pagenum for the pp
6054 */
6055 pfn_t
6056 page_pptonum(page_t *pp)
6057 {
6058 return (pp->p_pagenum);
6059 }
6060
6061 /*
6062 * interface to the referenced and modified etc bits
6063 * in the PSM part of the page struct
6064 * when no locking is desired.
6065 */
6066 void
6067 page_set_props(page_t *pp, uint_t flags)
6068 {
6069 ASSERT((flags & ~(P_MOD | P_REF | P_RO)) == 0);
6070 pp->p_nrm |= (uchar_t)flags;
6071 }
6072
6073 void
6074 page_clr_all_props(page_t *pp)
6075 {
6076 pp->p_nrm = 0;
6077 }
6078
6079 /*
6080 * Clear p_lckcnt and p_cowcnt, adjusting freemem if required.
6081 */
6082 int
6083 page_clear_lck_cow(page_t *pp, int adjust)
6084 {
6085 int f_amount;
6086
6087 ASSERT(PAGE_EXCL(pp));
6088
6089 /*
6090 * The page_struct_lock need not be acquired here since
6091 * we require the caller hold the page exclusively locked.
6092 */
6093 f_amount = 0;
6094 if (pp->p_lckcnt) {
6095 f_amount = 1;
6096 pp->p_lckcnt = 0;
6097 }
6098 if (pp->p_cowcnt) {
6099 f_amount += pp->p_cowcnt;
6100 pp->p_cowcnt = 0;
6101 }
6102
6103 if (adjust && f_amount) {
6104 mutex_enter(&freemem_lock);
6105 availrmem += f_amount;
6106 mutex_exit(&freemem_lock);
6107 }
6108
6109 return (f_amount);
6110 }
6111
6112 /*
6113  * The following function is called from free_vp_pages()
6114 * for an inexact estimate of a newly free'd page...
6115 */
6116 ulong_t
6117 page_share_cnt(page_t *pp)
6118 {
6119 return (hat_page_getshare(pp));
6120 }
6121
6122 int
6123 page_isshared(page_t *pp)
6124 {
6125 return (hat_page_checkshare(pp, 1));
6126 }
6127
6128 int
6129 page_isfree(page_t *pp)
6130 {
6131 return (PP_ISFREE(pp));
6132 }
6133
6134 int
6135 page_isref(page_t *pp)
6136 {
6137 return (hat_page_getattr(pp, P_REF));
6138 }
6139
6140 int
6141 page_ismod(page_t *pp)
6142 {
6143 return (hat_page_getattr(pp, P_MOD));
6144 }
6145
6146 /*
6147 * The following code all currently relates to the page capture logic:
6148 *
6149 * This logic is used for cases where there is a desire to claim a certain
6150 * physical page in the system for the caller. As it may not be possible
6151 * to capture the page immediately, the p_toxic bits are used in the page
6152 * structure to indicate that someone wants to capture this page. When the
6153 * page gets unlocked, the toxic flag will be noted and an attempt to capture
6154  * the page will be made. If it is successful, the original caller's callback
6155 * will be called with the page to do with it what they please.
6156 *
6157  * There is also an async thread which wakes up occasionally to attempt
6158  * to capture pages which have the capture bit set. All of the pages which
6159 * need to be captured asynchronously have been inserted into the
6160 * page_capture_hash and thus this thread walks that hash list. Items in the
6161 * hash have an expiration time so this thread handles that as well by removing
6162 * the item from the hash if it has expired.
6163 *
6164 * Some important things to note are:
6165 * - if the PR_CAPTURE bit is set on a page, then the page is in the
6166  *   page_capture_hash. The page_capture_hash_head.pchh_mutex is needed
6167  *   to set and clear this bit, and entries may only be added to or
6168  *   removed from the hash while that lock is held.
6169 * - the PR_CAPTURE bit can only be set and cleared while holding the
6170 * page_capture_hash_head.pchh_mutex
6171 * - the t_flag field of the thread struct is used with the T_CAPTURING
6172 * flag to prevent recursion while dealing with large pages.
6173 * - pages which need to be retired never expire on the page_capture_hash.
6174 */
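/*
 * Editorial sketch (not part of the original source) of a typical
 * synchronous consumer.  PC_EXAMPLE and my_datap are hypothetical and stand
 * in for a registered callback index and its private data; szc must
 * currently be 0:
 *
 *	ret = page_trycapture(pp, 0, (uint_t)(1 << PC_EXAMPLE), my_datap);
 *
 * On success (ret == 0) the registered callback has already been handed the
 * page with its SE_EXCL lock held.  On EAGAIN the request has been queued
 * on the page_capture_hash and will be retried when the page is unlocked or
 * by the async thread.
 */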
6175
6176 static void page_capture_thread(void);
6177 static kthread_t *pc_thread_id;
6178 kcondvar_t pc_cv;
6179 static kmutex_t pc_thread_mutex;
6180 static clock_t pc_thread_shortwait;
6181 static clock_t pc_thread_longwait;
6182 static int pc_thread_retry;
6183
6184 struct page_capture_callback pc_cb[PC_NUM_CALLBACKS];
6185
6186 /* Note that this is a circular linked list */
6187 typedef struct page_capture_hash_bucket {
6188 page_t *pp;
6189 uchar_t szc;
6190 uchar_t pri;
6191 uint_t flags;
6192 clock_t expires; /* lbolt at which this request expires. */
6193 void *datap; /* Cached data passed in for callback */
6194 struct page_capture_hash_bucket *next;
6195 struct page_capture_hash_bucket *prev;
6196 } page_capture_hash_bucket_t;
6197
6198 #define PC_PRI_HI 0 /* capture now */
6199 #define PC_PRI_LO 1 /* capture later */
6200 #define PC_NUM_PRI 2
6201
6202 #define PAGE_CAPTURE_PRIO(pp) (PP_ISRAF(pp) ? PC_PRI_LO : PC_PRI_HI)
6203
6204
6205 /*
6206  * Each hash bucket will have its own mutex and two lists which are:
6207 * active (0): represents requests which have not been processed by
6208 * the page_capture async thread yet.
6209 * walked (1): represents requests which have been processed by the
6210  * page_capture async thread within its given walk of this bucket.
6211 *
6212 * These are all needed so that we can synchronize all async page_capture
6213 * events. When the async thread moves to a new bucket, it will append the
6214 * walked list to the active list and walk each item one at a time, moving it
6215 * from the active list to the walked list. Thus if there is an async request
6216 * outstanding for a given page, it will always be in one of the two lists.
6217 * New requests will always be added to the active list.
6218 * If we were not able to capture a page before the request expired, we'd free
6219 * up the request structure which would indicate to page_capture that there is
6220 * no longer a need for the given page, and clear the PR_CAPTURE flag if
6221 * possible.
6222 */
6223 typedef struct page_capture_hash_head {
6224 kmutex_t pchh_mutex;
6225 uint_t num_pages[PC_NUM_PRI];
6226 page_capture_hash_bucket_t lists[2]; /* sentinel nodes */
6227 } page_capture_hash_head_t;
6228
6229 #ifdef DEBUG
6230 #define NUM_PAGE_CAPTURE_BUCKETS 4
6231 #else
6232 #define NUM_PAGE_CAPTURE_BUCKETS 64
6233 #endif
6234
6235 page_capture_hash_head_t page_capture_hash[NUM_PAGE_CAPTURE_BUCKETS];
6236
6237 /* for now use a very simple hash based upon the size of a page struct */
6238 #define PAGE_CAPTURE_HASH(pp) \
6239 ((int)(((uintptr_t)pp >> 7) & (NUM_PAGE_CAPTURE_BUCKETS - 1)))
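/*
 * Editorial sketch (not part of the original source) of the bucket lookup
 * pattern the functions below follow: take the bucket's pchh_mutex, then
 * walk one of the two sentinel-headed circular lists looking for the page:
 *
 *	index = PAGE_CAPTURE_HASH(pp);
 *	mutex_enter(&page_capture_hash[index].pchh_mutex);
 *	for (bp = page_capture_hash[index].lists[i].next;
 *	    bp != &page_capture_hash[index].lists[i]; bp = bp->next) {
 *		if (bp->pp == pp)
 *			break;
 *	}
 *	mutex_exit(&page_capture_hash[index].pchh_mutex);
 */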
6240
6241 extern pgcnt_t swapfs_minfree;
6242
6243 int page_trycapture(page_t *pp, uint_t szc, uint_t flags, void *datap);
6244
6245 /*
6246 * a callback function is required for page capture requests.
6247 */
6248 void
6249 page_capture_register_callback(uint_t index, clock_t duration,
6250 int (*cb_func)(page_t *, void *, uint_t))
6251 {
6252 ASSERT(pc_cb[index].cb_active == 0);
6253 ASSERT(cb_func != NULL);
6254 rw_enter(&pc_cb[index].cb_rwlock, RW_WRITER);
6255 pc_cb[index].duration = duration;
6256 pc_cb[index].cb_func = cb_func;
6257 pc_cb[index].cb_active = 1;
6258 rw_exit(&pc_cb[index].cb_rwlock);
6259 }
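/*
 * Editorial sketch (not part of the original source) of registration.
 * PC_EXAMPLE, example_capture_cb and consume_page are hypothetical.  A
 * duration of -1 means requests never expire; otherwise it is a number of
 * clock ticks:
 *
 *	static int
 *	example_capture_cb(page_t *pp, void *datap, uint_t flags)
 *	{
 *		consume_page(pp, datap);	(takes over the
 *						 SE_EXCL-locked page)
 *		return (0);
 *	}
 *
 *	page_capture_register_callback(PC_EXAMPLE, 60 * hz,
 *	    example_capture_cb);
 *	...
 *	page_capture_unregister_callback(PC_EXAMPLE);
 */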
6260
6261 void
6262 page_capture_unregister_callback(uint_t index)
6263 {
6264 int i, j;
6265 struct page_capture_hash_bucket *bp1;
6266 struct page_capture_hash_bucket *bp2;
6267 struct page_capture_hash_bucket *head = NULL;
6268 uint_t flags = (1 << index);
6269
6270 rw_enter(&pc_cb[index].cb_rwlock, RW_WRITER);
6271 ASSERT(pc_cb[index].cb_active == 1);
6272 pc_cb[index].duration = 0; /* Paranoia */
6273 pc_cb[index].cb_func = NULL; /* Paranoia */
6274 pc_cb[index].cb_active = 0;
6275 rw_exit(&pc_cb[index].cb_rwlock);
6276
6277 /*
6278 * Just move all the entries to a private list which we can walk
6279 * through without the need to hold any locks.
6280 * No more requests can get added to the hash lists for this consumer
6281 * as the cb_active field for the callback has been cleared.
6282 */
6283 for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) {
6284 mutex_enter(&page_capture_hash[i].pchh_mutex);
6285 for (j = 0; j < 2; j++) {
6286 bp1 = page_capture_hash[i].lists[j].next;
6287 /* walk through all but first (sentinel) element */
6288 while (bp1 != &page_capture_hash[i].lists[j]) {
6289 bp2 = bp1;
6290 if (bp2->flags & flags) {
6291 bp1 = bp2->next;
6292 bp1->prev = bp2->prev;
6293 bp2->prev->next = bp1;
6294 bp2->next = head;
6295 head = bp2;
6296 /*
6297 * Clear the PR_CAPTURE bit as we
6298 * hold appropriate locks here.
6299 */
6300 page_clrtoxic(head->pp, PR_CAPTURE);
6301 page_capture_hash[i].
6302 num_pages[bp2->pri]--;
6303 continue;
6304 }
6305 bp1 = bp1->next;
6306 }
6307 }
6308 mutex_exit(&page_capture_hash[i].pchh_mutex);
6309 }
6310
6311 while (head != NULL) {
6312 bp1 = head;
6313 head = head->next;
6314 kmem_free(bp1, sizeof (*bp1));
6315 }
6316 }
6317
6318
6319 /*
6320 * Find pp in the active list and move it to the walked list if it
6321 * exists.
6322  * Note that pp will most often be at the front of the active list,
6323  * as it is the page currently in use, so no further optimization is
6324  * done here on this linked list.
6325 * Returns 1 on successful move or 0 if page could not be found.
6326 */
6327 static int
6328 page_capture_move_to_walked(page_t *pp)
6329 {
6330 page_capture_hash_bucket_t *bp;
6331 int index;
6332
6333 index = PAGE_CAPTURE_HASH(pp);
6334
6335 mutex_enter(&page_capture_hash[index].pchh_mutex);
6336 bp = page_capture_hash[index].lists[0].next;
6337 while (bp != &page_capture_hash[index].lists[0]) {
6338 if (bp->pp == pp) {
6339 /* Remove from old list */
6340 bp->next->prev = bp->prev;
6341 bp->prev->next = bp->next;
6342
6343 /* Add to new list */
6344 bp->next = page_capture_hash[index].lists[1].next;
6345 bp->prev = &page_capture_hash[index].lists[1];
6346 page_capture_hash[index].lists[1].next = bp;
6347 bp->next->prev = bp;
6348
6349 /*
6350 			 * There is a small probability of a page on a free
6351 			 * list being retired while it is being allocated,
6352 			 * before P_RAF is set on it. Such a page may end
6353 			 * up marked as a high priority request instead of
6354 			 * a low priority request.
6355 			 * If a P_RAF page is not marked as a low priority
6356 			 * request, change it to a low priority request.
6357 */
6358 page_capture_hash[index].num_pages[bp->pri]--;
6359 bp->pri = PAGE_CAPTURE_PRIO(pp);
6360 page_capture_hash[index].num_pages[bp->pri]++;
6361 mutex_exit(&page_capture_hash[index].pchh_mutex);
6362 return (1);
6363 }
6364 bp = bp->next;
6365 }
6366 mutex_exit(&page_capture_hash[index].pchh_mutex);
6367 return (0);
6368 }
6369
6370 /*
6371 * Add a new entry to the page capture hash. The only case where a new
6372 * entry is not added is when the page capture consumer is no longer registered.
6373 * In this case, we'll silently not add the page to the hash. We know that
6374 * page retire will always be registered for the case where we are currently
6375 * unretiring a page and thus there are no conflicts.
6376 */
6377 static void
6378 page_capture_add_hash(page_t *pp, uint_t szc, uint_t flags, void *datap)
6379 {
6380 page_capture_hash_bucket_t *bp1;
6381 page_capture_hash_bucket_t *bp2;
6382 int index;
6383 int cb_index;
6384 int i;
6385 uchar_t pri;
6386 #ifdef DEBUG
6387 page_capture_hash_bucket_t *tp1;
6388 int l;
6389 #endif
6390
6391 ASSERT(!(flags & CAPTURE_ASYNC));
6392
6393 bp1 = kmem_alloc(sizeof (struct page_capture_hash_bucket), KM_SLEEP);
6394
6395 bp1->pp = pp;
6396 bp1->szc = szc;
6397 bp1->flags = flags;
6398 bp1->datap = datap;
6399
6400 for (cb_index = 0; cb_index < PC_NUM_CALLBACKS; cb_index++) {
6401 if ((flags >> cb_index) & 1) {
6402 break;
6403 }
6404 }
6405
6406 ASSERT(cb_index != PC_NUM_CALLBACKS);
6407
6408 rw_enter(&pc_cb[cb_index].cb_rwlock, RW_READER);
6409 if (pc_cb[cb_index].cb_active) {
6410 if (pc_cb[cb_index].duration == -1) {
6411 bp1->expires = (clock_t)-1;
6412 } else {
6413 bp1->expires = ddi_get_lbolt() +
6414 pc_cb[cb_index].duration;
6415 }
6416 } else {
6417 /* There's no callback registered so don't add to the hash */
6418 rw_exit(&pc_cb[cb_index].cb_rwlock);
6419 kmem_free(bp1, sizeof (*bp1));
6420 return;
6421 }
6422
6423 index = PAGE_CAPTURE_HASH(pp);
6424
6425 /*
6426 * Only allow capture flag to be modified under this mutex.
6427 	 * This prevents multiple entries for the same page from being added.
6428 */
6429 mutex_enter(&page_capture_hash[index].pchh_mutex);
6430
6431 /*
6432 * if not already on the hash, set capture bit and add to the hash
6433 */
6434 if (!(pp->p_toxic & PR_CAPTURE)) {
6435 #ifdef DEBUG
6436 /* Check for duplicate entries */
6437 for (l = 0; l < 2; l++) {
6438 tp1 = page_capture_hash[index].lists[l].next;
6439 while (tp1 != &page_capture_hash[index].lists[l]) {
6440 if (tp1->pp == pp) {
6441 panic("page pp 0x%p already on hash "
6442 "at 0x%p\n",
6443 (void *)pp, (void *)tp1);
6444 }
6445 tp1 = tp1->next;
6446 }
6447 }
6448
6449 #endif
6450 page_settoxic(pp, PR_CAPTURE);
6451 pri = PAGE_CAPTURE_PRIO(pp);
6452 bp1->pri = pri;
6453 bp1->next = page_capture_hash[index].lists[0].next;
6454 bp1->prev = &page_capture_hash[index].lists[0];
6455 bp1->next->prev = bp1;
6456 page_capture_hash[index].lists[0].next = bp1;
6457 page_capture_hash[index].num_pages[pri]++;
6458 if (flags & CAPTURE_RETIRE) {
6459 page_retire_incr_pend_count(datap);
6460 }
6461 mutex_exit(&page_capture_hash[index].pchh_mutex);
6462 rw_exit(&pc_cb[cb_index].cb_rwlock);
6463 cv_signal(&pc_cv);
6464 return;
6465 }
6466
6467 /*
6468 * A page retire request will replace any other request.
6469 * A second physmem request which is for a different process than
6470 * the currently registered one will be dropped as there is
6471 * no way to hold the private data for both calls.
6472 * In the future, once there are more callers, this will have to
6473 * be worked out better as there needs to be private storage for
6474 * at least each type of caller (maybe have datap be an array of
6475 * *void's so that we can index based upon callers index).
6476 */
6477
6478 /* walk hash list to update expire time */
6479 for (i = 0; i < 2; i++) {
6480 bp2 = page_capture_hash[index].lists[i].next;
6481 while (bp2 != &page_capture_hash[index].lists[i]) {
6482 if (bp2->pp == pp) {
6483 if (flags & CAPTURE_RETIRE) {
6484 if (!(bp2->flags & CAPTURE_RETIRE)) {
6485 page_retire_incr_pend_count(
6486 datap);
6487 bp2->flags = flags;
6488 bp2->expires = bp1->expires;
6489 bp2->datap = datap;
6490 }
6491 } else {
6492 ASSERT(flags & CAPTURE_PHYSMEM);
6493 if (!(bp2->flags & CAPTURE_RETIRE) &&
6494 (datap == bp2->datap)) {
6495 bp2->expires = bp1->expires;
6496 }
6497 }
6498 mutex_exit(&page_capture_hash[index].
6499 pchh_mutex);
6500 rw_exit(&pc_cb[cb_index].cb_rwlock);
6501 kmem_free(bp1, sizeof (*bp1));
6502 return;
6503 }
6504 bp2 = bp2->next;
6505 }
6506 }
6507
6508 /*
6509 * the PR_CAPTURE flag is protected by the page_capture_hash mutexes
6510 * and thus it either has to be set or not set and can't change
6511 * while holding the mutex above.
6512 */
6513 panic("page_capture_add_hash, PR_CAPTURE flag set on pp %p\n",
6514 (void *)pp);
6515 }
6516
6517 /*
6518  * We have a page in our hands, let's try to make it ours by turning
6519 * it into a clean page like it had just come off the freelists.
6520 *
6521 * Returns 0 on success, with the page still EXCL locked.
6522 * On failure, the page will be unlocked, and returns EAGAIN
6523 */
6524 static int
6525 page_capture_clean_page(page_t *pp)
6526 {
6527 page_t *newpp;
6528 int skip_unlock = 0;
6529 spgcnt_t count;
6530 page_t *tpp;
6531 int ret = 0;
6532 int extra;
6533
6534 ASSERT(PAGE_EXCL(pp));
6535 ASSERT(!PP_RETIRED(pp));
6536 ASSERT(curthread->t_flag & T_CAPTURING);
6537
6538 if (PP_ISFREE(pp)) {
6539 if (!page_reclaim(pp, NULL)) {
6540 skip_unlock = 1;
6541 ret = EAGAIN;
6542 goto cleanup;
6543 }
6544 ASSERT(pp->p_szc == 0);
6545 if (pp->p_vnode != NULL) {
6546 /*
6547 * Since this page came from the
6548 * cachelist, we must destroy the
6549 * old vnode association.
6550 */
6551 page_hashout(pp, NULL);
6552 }
6553 goto cleanup;
6554 }
6555
6556 /*
6557 	 * If we know page_relocate will fail, skip it.
6558 * It could still fail due to a UE on another page but we
6559 * can't do anything about that.
6560 */
6561 if (pp->p_toxic & PR_UE) {
6562 goto skip_relocate;
6563 }
6564
6565 /*
6566 	 * It's possible for pages not to have a vnode, as fsflush comes
6567 * through and cleans up these pages. It's ugly but that's how it is.
6568 */
6569 if (pp->p_vnode == NULL) {
6570 goto skip_relocate;
6571 }
6572
6573 /*
6574 	 * Page was not free, so let's try to relocate it.
6575 * page_relocate only works with root pages, so if this is not a root
6576 * page, we need to demote it to try and relocate it.
6577 * Unfortunately this is the best we can do right now.
6578 */
6579 newpp = NULL;
6580 if ((pp->p_szc > 0) && (pp != PP_PAGEROOT(pp))) {
6581 if (page_try_demote_pages(pp) == 0) {
6582 ret = EAGAIN;
6583 goto cleanup;
6584 }
6585 }
6586 ret = page_relocate(&pp, &newpp, 1, 0, &count, NULL);
6587 if (ret == 0) {
6588 page_t *npp;
6589 /* unlock the new page(s) */
6590 while (count-- > 0) {
6591 ASSERT(newpp != NULL);
6592 npp = newpp;
6593 page_sub(&newpp, npp);
6594 page_unlock(npp);
6595 }
6596 ASSERT(newpp == NULL);
6597 /*
6598 * Check to see if the page we have is too large.
6599 		 * If so, demote it, freeing up the extra pages.
6600 */
6601 if (pp->p_szc > 0) {
6602 /* For now demote extra pages to szc == 0 */
6603 extra = page_get_pagecnt(pp->p_szc) - 1;
6604 while (extra > 0) {
6605 tpp = pp->p_next;
6606 page_sub(&pp, tpp);
6607 tpp->p_szc = 0;
6608 page_free(tpp, 1);
6609 extra--;
6610 }
6611 /* Make sure to set our page to szc 0 as well */
6612 ASSERT(pp->p_next == pp && pp->p_prev == pp);
6613 pp->p_szc = 0;
6614 }
6615 goto cleanup;
6616 } else if (ret == EIO) {
6617 ret = EAGAIN;
6618 goto cleanup;
6619 } else {
6620 /*
6621 * Need to reset return type as we failed to relocate the page
6622 * but that does not mean that some of the next steps will not
6623 * work.
6624 */
6625 ret = 0;
6626 }
6627
6628 skip_relocate:
6629
6630 if (pp->p_szc > 0) {
6631 if (page_try_demote_pages(pp) == 0) {
6632 ret = EAGAIN;
6633 goto cleanup;
6634 }
6635 }
6636
6637 ASSERT(pp->p_szc == 0);
6638
6639 if (hat_ismod(pp)) {
6640 ret = EAGAIN;
6641 goto cleanup;
6642 }
6643 if (PP_ISKAS(pp)) {
6644 ret = EAGAIN;
6645 goto cleanup;
6646 }
6647 if (pp->p_lckcnt || pp->p_cowcnt) {
6648 ret = EAGAIN;
6649 goto cleanup;
6650 }
6651
6652 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
6653 ASSERT(!hat_page_is_mapped(pp));
6654
6655 if (hat_ismod(pp)) {
6656 /*
6657 		 * This is a semi-odd case: the page is now modified but not
6658 		 * mapped, because we just unloaded the mappings above.
6659 */
6660 ret = EAGAIN;
6661 goto cleanup;
6662 }
6663 if (pp->p_vnode != NULL) {
6664 page_hashout(pp, NULL);
6665 }
6666
6667 /*
6668 * At this point, the page should be in a clean state and
6669 * we can do whatever we want with it.
6670 */
6671
6672 cleanup:
6673 if (ret != 0) {
6674 if (!skip_unlock) {
6675 page_unlock(pp);
6676 }
6677 } else {
6678 ASSERT(pp->p_szc == 0);
6679 ASSERT(PAGE_EXCL(pp));
6680
6681 pp->p_next = pp;
6682 pp->p_prev = pp;
6683 }
6684 return (ret);
6685 }
6686
6687 /*
6688 * Various callers of page_trycapture() can have different restrictions upon
6689 * what memory they have access to.
6690 * Returns 0 on success, with the following error codes on failure:
6691 * EPERM - The requested page is long term locked, and thus repeated
6692 * requests to capture this page will likely fail.
6693 * ENOMEM - There was not enough free memory in the system to safely
6694 * map the requested page.
6695 * ENOENT - The requested page was inside the kernel cage, and the
6696 * PHYSMEM_CAGE flag was not set.
6697 */
6698 int
6699 page_capture_pre_checks(page_t *pp, uint_t flags)
6700 {
6701 ASSERT(pp != NULL);
6702
6703 #if defined(__sparc)
6704 if (pp->p_vnode == &promvp) {
6705 return (EPERM);
6706 }
6707
6708 if (PP_ISNORELOC(pp) && !(flags & CAPTURE_GET_CAGE) &&
6709 (flags & CAPTURE_PHYSMEM)) {
6710 return (ENOENT);
6711 }
6712
6713 if (PP_ISNORELOCKERNEL(pp)) {
6714 return (EPERM);
6715 }
6716 #else
6717 if (PP_ISKAS(pp)) {
6718 return (EPERM);
6719 }
6720 #endif /* __sparc */
6721
6722 /* only physmem currently has the restrictions checked below */
6723 if (!(flags & CAPTURE_PHYSMEM)) {
6724 return (0);
6725 }
6726
6727 if (availrmem < swapfs_minfree) {
6728 /*
6729 * We won't try to capture this page as we are
6730 * running low on memory.
6731 */
6732 return (ENOMEM);
6733 }
6734 return (0);
6735 }
6736
6737 /*
6738  * Once we have a page in our mitts, go ahead and complete the capture
6739 * operation.
6740 * Returns 1 on failure where page is no longer needed
6741 * Returns 0 on success
6742 * Returns -1 if there was a transient failure.
6743 * Failure cases must release the SE_EXCL lock on pp (usually via page_free).
6744 */
6745 int
6746 page_capture_take_action(page_t *pp, uint_t flags, void *datap)
6747 {
6748 int cb_index;
6749 int ret = 0;
6750 page_capture_hash_bucket_t *bp1;
6751 page_capture_hash_bucket_t *bp2;
6752 int index;
6753 int found = 0;
6754 int i;
6755
6756 ASSERT(PAGE_EXCL(pp));
6757 ASSERT(curthread->t_flag & T_CAPTURING);
6758
6759 for (cb_index = 0; cb_index < PC_NUM_CALLBACKS; cb_index++) {
6760 if ((flags >> cb_index) & 1) {
6761 break;
6762 }
6763 }
6764 ASSERT(cb_index < PC_NUM_CALLBACKS);
6765
6766 /*
6767 * Remove the entry from the page_capture hash, but don't free it yet
6768 * as we may need to put it back.
6769 * Since we own the page at this point in time, we should find it
6770 * in the hash if this is an ASYNC call. If we don't it's likely
6771 * that the page_capture_async() thread decided that this request
6772 * had expired, in which case we just continue on.
6773 */
6774 if (flags & CAPTURE_ASYNC) {
6775
6776 index = PAGE_CAPTURE_HASH(pp);
6777
6778 mutex_enter(&page_capture_hash[index].pchh_mutex);
6779 for (i = 0; i < 2 && !found; i++) {
6780 bp1 = page_capture_hash[index].lists[i].next;
6781 while (bp1 != &page_capture_hash[index].lists[i]) {
6782 if (bp1->pp == pp) {
6783 bp1->next->prev = bp1->prev;
6784 bp1->prev->next = bp1->next;
6785 page_capture_hash[index].
6786 num_pages[bp1->pri]--;
6787 page_clrtoxic(pp, PR_CAPTURE);
6788 found = 1;
6789 break;
6790 }
6791 bp1 = bp1->next;
6792 }
6793 }
6794 mutex_exit(&page_capture_hash[index].pchh_mutex);
6795 }
6796
6797 /* Synchronize with the unregister func. */
6798 rw_enter(&pc_cb[cb_index].cb_rwlock, RW_READER);
6799 if (!pc_cb[cb_index].cb_active) {
6800 page_free(pp, 1);
6801 rw_exit(&pc_cb[cb_index].cb_rwlock);
6802 if (found) {
6803 kmem_free(bp1, sizeof (*bp1));
6804 }
6805 return (1);
6806 }
6807
6808 /*
6809 * We need to remove the entry from the page capture hash and turn off
6810 * the PR_CAPTURE bit before calling the callback. We'll need to cache
6811 * the entry here, and then based upon the return value, cleanup
6812 * appropriately or re-add it to the hash, making sure that someone else
6813 * hasn't already done so.
6814 * It should be rare for the callback to fail and thus it's ok for
6815 * the failure path to be a bit complicated as the success path is
6816 * cleaner and the locking rules are easier to follow.
6817 */
6818
6819 ret = pc_cb[cb_index].cb_func(pp, datap, flags);
6820
6821 rw_exit(&pc_cb[cb_index].cb_rwlock);
6822
6823 /*
6824 * If this was an ASYNC request, we need to cleanup the hash if the
6825 * callback was successful or if the request was no longer valid.
6826 * For non-ASYNC requests, we return failure to map and the caller
6827 * will take care of adding the request to the hash.
6828 * Note also that the callback itself is responsible for the page
6829 * at this point in time in terms of locking ... The most common
6830 * case for the failure path should just be a page_free.
6831 */
6832 if (ret >= 0) {
6833 if (found) {
6834 if (bp1->flags & CAPTURE_RETIRE) {
6835 page_retire_decr_pend_count(datap);
6836 }
6837 kmem_free(bp1, sizeof (*bp1));
6838 }
6839 return (ret);
6840 }
6841 if (!found) {
6842 return (ret);
6843 }
6844
6845 ASSERT(flags & CAPTURE_ASYNC);
6846
6847 /*
6848 * Check for expiration time first as we can just free it up if it's
6849 * expired.
6850 */
6851 if (ddi_get_lbolt() > bp1->expires && bp1->expires != -1) {
6852 kmem_free(bp1, sizeof (*bp1));
6853 return (ret);
6854 }
6855
6856 /*
6857 * The callback failed and there used to be an entry in the hash for
6858 * this page, so we need to add it back to the hash.
6859 */
6860 mutex_enter(&page_capture_hash[index].pchh_mutex);
6861 if (!(pp->p_toxic & PR_CAPTURE)) {
6862 /* just add bp1 back to head of walked list */
6863 page_settoxic(pp, PR_CAPTURE);
6864 bp1->next = page_capture_hash[index].lists[1].next;
6865 bp1->prev = &page_capture_hash[index].lists[1];
6866 bp1->next->prev = bp1;
6867 bp1->pri = PAGE_CAPTURE_PRIO(pp);
6868 page_capture_hash[index].lists[1].next = bp1;
6869 page_capture_hash[index].num_pages[bp1->pri]++;
6870 mutex_exit(&page_capture_hash[index].pchh_mutex);
6871 return (ret);
6872 }
6873
6874 /*
6875 	 * Otherwise there was a new capture request added to the list.
6876 * Need to make sure that our original data is represented if
6877 * appropriate.
6878 */
6879 for (i = 0; i < 2; i++) {
6880 bp2 = page_capture_hash[index].lists[i].next;
6881 while (bp2 != &page_capture_hash[index].lists[i]) {
6882 if (bp2->pp == pp) {
6883 if (bp1->flags & CAPTURE_RETIRE) {
6884 if (!(bp2->flags & CAPTURE_RETIRE)) {
6885 bp2->szc = bp1->szc;
6886 bp2->flags = bp1->flags;
6887 bp2->expires = bp1->expires;
6888 bp2->datap = bp1->datap;
6889 }
6890 } else {
6891 ASSERT(bp1->flags & CAPTURE_PHYSMEM);
6892 if (!(bp2->flags & CAPTURE_RETIRE)) {
6893 bp2->szc = bp1->szc;
6894 bp2->flags = bp1->flags;
6895 bp2->expires = bp1->expires;
6896 bp2->datap = bp1->datap;
6897 }
6898 }
6899 page_capture_hash[index].num_pages[bp2->pri]--;
6900 bp2->pri = PAGE_CAPTURE_PRIO(pp);
6901 page_capture_hash[index].num_pages[bp2->pri]++;
6902 mutex_exit(&page_capture_hash[index].
6903 pchh_mutex);
6904 kmem_free(bp1, sizeof (*bp1));
6905 return (ret);
6906 }
6907 bp2 = bp2->next;
6908 }
6909 }
6910 panic("PR_CAPTURE set but not on hash for pp 0x%p\n", (void *)pp);
6911 /*NOTREACHED*/
6912 }
6913
6914 /*
6915 * Try to capture the given page for the caller specified in the flags
6916 * parameter. The page will either be captured and handed over to the
6917 * appropriate callback, or will be queued up in the page capture hash
6918 * to be captured asynchronously.
6919 * If the current request is due to an async capture, the page must be
6920 * exclusively locked before calling this function.
6921 * Currently szc must be 0 but in the future this should be expandable to
6922 * other page sizes.
6923 * Returns 0 on success, with the following error codes on failure:
6924 * EPERM - The requested page is long term locked, and thus repeated
6925 * requests to capture this page will likely fail.
6926 * ENOMEM - There was not enough free memory in the system to safely
6927 * map the requested page.
6928 * ENOENT - The requested page was inside the kernel cage, and the
6929 * CAPTURE_GET_CAGE flag was not set.
6930  * EAGAIN - The requested page could not be captured at this point in
6931 * time but future requests will likely work.
6932 * EBUSY - The requested page is retired and the CAPTURE_GET_RETIRED flag
6933 * was not set.
6934 */
6935 int
6936 page_itrycapture(page_t *pp, uint_t szc, uint_t flags, void *datap)
6937 {
6938 int ret;
6939 int cb_index;
6940
6941 if (flags & CAPTURE_ASYNC) {
6942 ASSERT(PAGE_EXCL(pp));
6943 goto async;
6944 }
6945
6946 /* Make sure there's enough availrmem ... */
6947 ret = page_capture_pre_checks(pp, flags);
6948 if (ret != 0) {
6949 return (ret);
6950 }
6951
6952 if (!page_trylock(pp, SE_EXCL)) {
6953 for (cb_index = 0; cb_index < PC_NUM_CALLBACKS; cb_index++) {
6954 if ((flags >> cb_index) & 1) {
6955 break;
6956 }
6957 }
6958 ASSERT(cb_index < PC_NUM_CALLBACKS);
6959 ret = EAGAIN;
6960 /* Special case for retired pages */
6961 if (PP_RETIRED(pp)) {
6962 if (flags & CAPTURE_GET_RETIRED) {
6963 if (!page_unretire_pp(pp, PR_UNR_TEMP)) {
6964 /*
6965 * Need to set capture bit and add to
6966 * hash so that the page will be
6967 * retired when freed.
6968 */
6969 page_capture_add_hash(pp, szc,
6970 CAPTURE_RETIRE, NULL);
6971 ret = 0;
6972 goto own_page;
6973 }
6974 } else {
6975 return (EBUSY);
6976 }
6977 }
6978 page_capture_add_hash(pp, szc, flags, datap);
6979 return (ret);
6980 }
6981
6982 async:
6983 ASSERT(PAGE_EXCL(pp));
6984
6985 	/* For physmem async requests, check that availrmem is sane. */
6986 if ((flags & (CAPTURE_ASYNC | CAPTURE_PHYSMEM)) ==
6987 (CAPTURE_ASYNC | CAPTURE_PHYSMEM) &&
6988 (availrmem < swapfs_minfree)) {
6989 page_unlock(pp);
6990 return (ENOMEM);
6991 }
6992
6993 ret = page_capture_clean_page(pp);
6994
6995 if (ret != 0) {
6996 		/* We failed to get the page, so let's add it to the hash */
6997 if (!(flags & CAPTURE_ASYNC)) {
6998 page_capture_add_hash(pp, szc, flags, datap);
6999 }
7000 return (ret);
7001 }
7002
7003 own_page:
7004 ASSERT(PAGE_EXCL(pp));
7005 ASSERT(pp->p_szc == 0);
7006
7007 /* Call the callback */
7008 ret = page_capture_take_action(pp, flags, datap);
7009
7010 if (ret == 0) {
7011 return (0);
7012 }
7013
7014 /*
7015 * Note that in the failure cases from page_capture_take_action, the
7016 * EXCL lock will have already been dropped.
7017 */
7018 if ((ret == -1) && (!(flags & CAPTURE_ASYNC))) {
7019 page_capture_add_hash(pp, szc, flags, datap);
7020 }
7021 return (EAGAIN);
7022 }
7023
7024 int
7025 page_trycapture(page_t *pp, uint_t szc, uint_t flags, void *datap)
7026 {
7027 int ret;
7028
7029 curthread->t_flag |= T_CAPTURING;
7030 ret = page_itrycapture(pp, szc, flags, datap);
7031 	curthread->t_flag &= ~T_CAPTURING; /* xor works as we know it's set */
7032 return (ret);
7033 }
7034
7035 /*
7036 * When unlocking a page which has the PR_CAPTURE bit set, this routine
7037 * gets called to try and capture the page.
7038 */
7039 void
7040 page_unlock_capture(page_t *pp)
7041 {
7042 page_capture_hash_bucket_t *bp;
7043 int index;
7044 int i;
7045 uint_t szc;
7046 uint_t flags = 0;
7047 void *datap;
7048 kmutex_t *mp;
7049 extern vnode_t retired_pages;
7050
7051 /*
7052 * We need to protect against a possible deadlock here where we own
7053 * the vnode page hash mutex and want to acquire it again as there
7054 	 * are locations in the code where we unlock a page while holding
7055 	 * the mutex, which can lead to the page being captured and eventually
7056 * end up here. As we may be hashing out the old page and hashing into
7057 * the retire vnode, we need to make sure we don't own them.
7058 * Other callbacks who do hash operations also need to make sure that
7059 * before they hashin to a vnode that they do not currently own the
7060 * vphm mutex otherwise there will be a panic.
7061 */
7062 if (mutex_owned(page_vnode_mutex(&retired_pages))) {
7063 page_unlock_nocapture(pp);
7064 return;
7065 }
7066 if (pp->p_vnode != NULL && mutex_owned(page_vnode_mutex(pp->p_vnode))) {
7067 page_unlock_nocapture(pp);
7068 return;
7069 }
7070
7071 index = PAGE_CAPTURE_HASH(pp);
7072
7073 mp = &page_capture_hash[index].pchh_mutex;
7074 mutex_enter(mp);
7075 for (i = 0; i < 2; i++) {
7076 bp = page_capture_hash[index].lists[i].next;
7077 while (bp != &page_capture_hash[index].lists[i]) {
7078 if (bp->pp == pp) {
7079 szc = bp->szc;
7080 flags = bp->flags | CAPTURE_ASYNC;
7081 datap = bp->datap;
7082 mutex_exit(mp);
7083 (void) page_trycapture(pp, szc, flags, datap);
7084 return;
7085 }
7086 bp = bp->next;
7087 }
7088 }
7089
7090 /* Failed to find page in hash so clear flags and unlock it. */
7091 page_clrtoxic(pp, PR_CAPTURE);
7092 page_unlock(pp);
7093
7094 mutex_exit(mp);
7095 }
7096
7097 void
7098 page_capture_init()
7099 {
7100 int i;
7101 for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) {
7102 page_capture_hash[i].lists[0].next =
7103 &page_capture_hash[i].lists[0];
7104 page_capture_hash[i].lists[0].prev =
7105 &page_capture_hash[i].lists[0];
7106 page_capture_hash[i].lists[1].next =
7107 &page_capture_hash[i].lists[1];
7108 page_capture_hash[i].lists[1].prev =
7109 &page_capture_hash[i].lists[1];
7110 }
7111
7112 pc_thread_shortwait = 23 * hz;
7113 pc_thread_longwait = 1201 * hz;
7114 pc_thread_retry = 3;
7115 mutex_init(&pc_thread_mutex, NULL, MUTEX_DEFAULT, NULL);
7116 cv_init(&pc_cv, NULL, CV_DEFAULT, NULL);
7117 pc_thread_id = thread_create(NULL, 0, page_capture_thread, NULL, 0, &p0,
7118 TS_RUN, minclsyspri);
7119 }
7120
7121 /*
7122 * It is necessary to scrub any failing pages prior to reboot in order to
7123 * prevent a latent error trap from occurring on the next boot.
7124 */
7125 void
7126 page_retire_mdboot()
7127 {
7128 page_t *pp;
7129 int i, j;
7130 page_capture_hash_bucket_t *bp;
7131 uchar_t pri;
7132
7133 /* walk lists looking for pages to scrub */
7134 for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) {
7135 for (pri = 0; pri < PC_NUM_PRI; pri++) {
7136 if (page_capture_hash[i].num_pages[pri] != 0) {
7137 break;
7138 }
7139 }
7140 if (pri == PC_NUM_PRI)
7141 continue;
7142
7143 mutex_enter(&page_capture_hash[i].pchh_mutex);
7144
7145 for (j = 0; j < 2; j++) {
7146 bp = page_capture_hash[i].lists[j].next;
7147 while (bp != &page_capture_hash[i].lists[j]) {
7148 pp = bp->pp;
7149 if (PP_TOXIC(pp)) {
7150 if (page_trylock(pp, SE_EXCL)) {
7151 PP_CLRFREE(pp);
7152 pagescrub(pp, 0, PAGESIZE);
7153 page_unlock(pp);
7154 }
7155 }
7156 bp = bp->next;
7157 }
7158 }
7159 mutex_exit(&page_capture_hash[i].pchh_mutex);
7160 }
7161 }
7162
7163 /*
7164 * Walk the page_capture_hash trying to capture pages and also cleanup old
7165 * entries which have expired.
7166 */
7167 void
7168 page_capture_async()
7169 {
7170 page_t *pp;
7171 int i;
7172 int ret;
7173 page_capture_hash_bucket_t *bp1, *bp2;
7174 uint_t szc;
7175 uint_t flags;
7176 void *datap;
7177 uchar_t pri;
7178
7179 /* If there are outstanding pages to be captured, get to work */
7180 for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) {
7181 for (pri = 0; pri < PC_NUM_PRI; pri++) {
7182 if (page_capture_hash[i].num_pages[pri] != 0)
7183 break;
7184 }
7185 if (pri == PC_NUM_PRI)
7186 continue;
7187
7188 /* Append list 1 to list 0 and then walk through list 0 */
7189 mutex_enter(&page_capture_hash[i].pchh_mutex);
7190 bp1 = &page_capture_hash[i].lists[1];
7191 bp2 = bp1->next;
7192 if (bp1 != bp2) {
7193 bp1->prev->next = page_capture_hash[i].lists[0].next;
7194 bp2->prev = &page_capture_hash[i].lists[0];
7195 page_capture_hash[i].lists[0].next->prev = bp1->prev;
7196 page_capture_hash[i].lists[0].next = bp2;
7197 bp1->next = bp1;
7198 bp1->prev = bp1;
7199 }
7200
7201 /* list[1] will be empty now */
7202
7203 bp1 = page_capture_hash[i].lists[0].next;
7204 while (bp1 != &page_capture_hash[i].lists[0]) {
7205 /* Check expiration time */
7206 if ((ddi_get_lbolt() > bp1->expires &&
7207 bp1->expires != -1) ||
7208 page_deleted(bp1->pp)) {
7209 page_capture_hash[i].lists[0].next = bp1->next;
7210 bp1->next->prev =
7211 &page_capture_hash[i].lists[0];
7212 page_capture_hash[i].num_pages[bp1->pri]--;
7213
7214 /*
7215 * We can safely remove the PR_CAPTURE bit
7216 * without holding the EXCL lock on the page
7217 				 * as the PR_CAPTURE bit requires that the
7218 * page_capture_hash[].pchh_mutex be held
7219 * to modify it.
7220 */
7221 page_clrtoxic(bp1->pp, PR_CAPTURE);
7222 mutex_exit(&page_capture_hash[i].pchh_mutex);
7223 kmem_free(bp1, sizeof (*bp1));
7224 mutex_enter(&page_capture_hash[i].pchh_mutex);
7225 bp1 = page_capture_hash[i].lists[0].next;
7226 continue;
7227 }
7228 pp = bp1->pp;
7229 szc = bp1->szc;
7230 flags = bp1->flags;
7231 datap = bp1->datap;
7232 mutex_exit(&page_capture_hash[i].pchh_mutex);
7233 if (page_trylock(pp, SE_EXCL)) {
7234 ret = page_trycapture(pp, szc,
7235 flags | CAPTURE_ASYNC, datap);
7236 } else {
7237 ret = 1; /* move to walked hash */
7238 }
7239
7240 if (ret != 0) {
7241 /* Move to walked hash */
7242 (void) page_capture_move_to_walked(pp);
7243 }
7244 mutex_enter(&page_capture_hash[i].pchh_mutex);
7245 bp1 = page_capture_hash[i].lists[0].next;
7246 }
7247
7248 mutex_exit(&page_capture_hash[i].pchh_mutex);
7249 }
7250 }
7251
7252 /*
7253  * This function is called by the page_capture_thread, and is needed
7254  * in order to initiate aio cleanup, so that pages used in aio
7255 * will be unlocked and subsequently retired by page_capture_thread.
7256 */
7257 static int
7258 do_aio_cleanup(void)
7259 {
7260 proc_t *procp;
7261 int (*aio_cleanup_dr_delete_memory)(proc_t *);
7262 int cleaned = 0;
7263
7264 if (modload("sys", "kaio") == -1) {
7265 cmn_err(CE_WARN, "do_aio_cleanup: cannot load kaio");
7266 return (0);
7267 }
7268 /*
7269 * We use the aio_cleanup_dr_delete_memory function to
7270 * initiate the actual clean up; this function will wake
7271 * up the per-process aio_cleanup_thread.
7272 */
7273 aio_cleanup_dr_delete_memory = (int (*)(proc_t *))
7274 modgetsymvalue("aio_cleanup_dr_delete_memory", 0);
7275 if (aio_cleanup_dr_delete_memory == NULL) {
7276 cmn_err(CE_WARN,
7277 "aio_cleanup_dr_delete_memory not found in kaio");
7278 return (0);
7279 }
7280 mutex_enter(&pidlock);
7281 for (procp = practive; (procp != NULL); procp = procp->p_next) {
7282 mutex_enter(&procp->p_lock);
7283 if (procp->p_aio != NULL) {
7284 /* cleanup proc's outstanding kaio */
7285 cleaned += (*aio_cleanup_dr_delete_memory)(procp);
7286 }
7287 mutex_exit(&procp->p_lock);
7288 }
7289 mutex_exit(&pidlock);
7290 return (cleaned);
7291 }
7292
7293 /*
7294 * helper function for page_capture_thread
7295 */
7296 static void
7297 page_capture_handle_outstanding(void)
7298 {
7299 int ntry;
7300
7301 	/* Reap pages before attempting to capture pages */
7302 kmem_reap();
7303
7304 if ((page_retire_pend_count() > page_retire_pend_kas_count()) &&
7305 hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) {
7306 /*
7307 * Note: Purging only for platforms that support
7308 * ISM hat_pageunload() - mainly SPARC. On x86/x64
7309 		 * platforms ISM pages are SE_SHARED locked until destroyed.
7310 */
7311
7312 /* disable and purge seg_pcache */
7313 (void) seg_p_disable();
7314 for (ntry = 0; ntry < pc_thread_retry; ntry++) {
7315 if (!page_retire_pend_count())
7316 break;
7317 if (do_aio_cleanup()) {
7318 /*
7319 * allow the apps cleanup threads
7320 * to run
7321 */
7322 delay(pc_thread_shortwait);
7323 }
7324 page_capture_async();
7325 }
7326 /* reenable seg_pcache */
7327 seg_p_enable();
7328
7329 /* completed what can be done. break out */
7330 return;
7331 }
7332
7333 /*
7334 * For kernel pages and/or unsupported HAT_DYNAMIC_ISM_UNMAP, reap
7335 * and then attempt to capture.
7336 */
7337 seg_preap();
7338 page_capture_async();
7339 }
7340
7341 /*
7342 * The page_capture_thread loops forever, looking to see if there are
7343 * pages still waiting to be captured.
7344 */
7345 static void
7346 page_capture_thread(void)
7347 {
7348 callb_cpr_t c;
7349 int i;
7350 int high_pri_pages;
7351 int low_pri_pages;
7352 clock_t timeout;
7353
7354 CALLB_CPR_INIT(&c, &pc_thread_mutex, callb_generic_cpr, "page_capture");
7355
7356 mutex_enter(&pc_thread_mutex);
7357 for (;;) {
7358 high_pri_pages = 0;
7359 low_pri_pages = 0;
7360 for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) {
7361 high_pri_pages +=
7362 page_capture_hash[i].num_pages[PC_PRI_HI];
7363 low_pri_pages +=
7364 page_capture_hash[i].num_pages[PC_PRI_LO];
7365 }
7366
7367 timeout = pc_thread_longwait;
7368 if (high_pri_pages != 0) {
7369 timeout = pc_thread_shortwait;
7370 page_capture_handle_outstanding();
7371 } else if (low_pri_pages != 0) {
7372 page_capture_async();
7373 }
7374 CALLB_CPR_SAFE_BEGIN(&c);
7375 (void) cv_reltimedwait(&pc_cv, &pc_thread_mutex,
7376 timeout, TR_CLOCK_TICK);
7377 CALLB_CPR_SAFE_END(&c, &pc_thread_mutex);
7378 }
7379 /*NOTREACHED*/
7380 }
7381 /*
7382 * Attempt to locate a bucket that has enough pages to satisfy the request.
7383 * The initial check is done without the lock to avoid unneeded contention.
7384 * The function returns 1 if enough pages were found, else 0 if it could not
7385 * find enough pages in a bucket.
7386 */
7387 static int
7388 pcf_decrement_bucket(pgcnt_t npages)
7389 {
7390 struct pcf *p;
7391 struct pcf *q;
7392 int i;
7393
7394 p = &pcf[PCF_INDEX()];
7395 q = &pcf[pcf_fanout];
7396 for (i = 0; i < pcf_fanout; i++) {
7397 if (p->pcf_count > npages) {
7398 /*
7399 * a good one to try.
7400 */
7401 mutex_enter(&p->pcf_lock);
7402 if (p->pcf_count > npages) {
7403 p->pcf_count -= (uint_t)npages;
7404 /*
7405 * freemem is not protected by any lock.
7406 * Thus, we cannot have any assertion
7407 * containing freemem here.
7408 */
7409 freemem -= npages;
7410 mutex_exit(&p->pcf_lock);
7411 return (1);
7412 }
7413 mutex_exit(&p->pcf_lock);
7414 }
7415 p++;
7416 if (p >= q) {
7417 p = pcf;
7418 }
7419 }
7420 return (0);
7421 }
7422
7423 /*
7424 * Arguments:
7425 * pcftotal_ret: If the value is not NULL and we have walked all the
7426 * buckets but did not find enough pages then it will
7427 * be set to the total number of pages in all the pcf
7428 * buckets.
7429 * npages: Is the number of pages we have been requested to
7430 * find.
7431 * unlock: If set to 0 we will leave the buckets locked if the
7432 * requested number of pages are not found.
7433 *
7434 * Go and try to satisfy the page request from any number of buckets.
7435 * This can be a very expensive operation as we have to lock the buckets
7436 * we are checking (and keep them locked), starting at bucket 0.
7437 *
7438 * The function returns 1 if enough pages were found, else 0 if it could not
7439 * find enough pages in the buckets.
7440 *
7441 */
7442 static int
7443 pcf_decrement_multiple(pgcnt_t *pcftotal_ret, pgcnt_t npages, int unlock)
7444 {
7445 struct pcf *p;
7446 pgcnt_t pcftotal;
7447 int i;
7448
7449 p = pcf;
7450 /* try to collect pages from several pcf bins */
7451 for (pcftotal = 0, i = 0; i < pcf_fanout; i++) {
7452 mutex_enter(&p->pcf_lock);
7453 pcftotal += p->pcf_count;
7454 if (pcftotal >= npages) {
7455 /*
7456 			 * Wow! There are enough pages lying around
7457 * to satisfy the request. Do the accounting,
7458 * drop the locks we acquired, and go back.
7459 *
7460 * freemem is not protected by any lock. So,
7461 * we cannot have any assertion containing
7462 * freemem.
7463 */
7464 freemem -= npages;
7465 while (p >= pcf) {
7466 if (p->pcf_count <= npages) {
7467 npages -= p->pcf_count;
7468 p->pcf_count = 0;
7469 } else {
7470 p->pcf_count -= (uint_t)npages;
7471 npages = 0;
7472 }
7473 mutex_exit(&p->pcf_lock);
7474 p--;
7475 }
7476 ASSERT(npages == 0);
7477 return (1);
7478 }
7479 p++;
7480 }
7481 if (unlock) {
7482 /* failed to collect pages - release the locks */
7483 while (--p >= pcf) {
7484 mutex_exit(&p->pcf_lock);
7485 }
7486 }
7487 if (pcftotal_ret != NULL)
7488 *pcftotal_ret = pcftotal;
7489 return (0);
7490 }
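/*
 * Editorial sketch (not part of the original source) of how an allocation
 * path might combine the two helpers above: try the cheap single-bucket
 * path first, then fall back to the multi-bucket walk, releasing the bucket
 * locks on failure (unlock == 1):
 *
 *	pgcnt_t pcftotal;
 *
 *	if (!pcf_decrement_bucket(npages) &&
 *	    !pcf_decrement_multiple(&pcftotal, npages, 1)) {
 *		(not enough free pages; a real caller would typically wake
 *		 the pageout scanner and wait, or fail the request)
 *	}
 */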
7491