1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
23 */
24
25 /*
26 * Copyright 2012 Joyent, Inc. All rights reserved.
27 */
28
29 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
30 /* All Rights Reserved */
31
32 /*
33 * Portions of this source code were derived from Berkeley 4.3 BSD
34 * under license from the Regents of the University of California.
35 */
36
37
38 /*
39 * This file contains common functions to access and manage the page lists.
40 * Many of these routines originated from platform dependent modules
41 * (sun4/vm/vm_dep.c, i86pc/vm/vm_machdep.c) and were modified to function in
42 * a platform independent manner.
43 *
44 * vm/vm_dep.h provides for platform specific support.
45 */
46
47 #include <sys/types.h>
48 #include <sys/debug.h>
49 #include <sys/cmn_err.h>
50 #include <sys/systm.h>
51 #include <sys/atomic.h>
52 #include <sys/sysmacros.h>
53 #include <vm/as.h>
54 #include <vm/page.h>
55 #include <vm/seg_kmem.h>
56 #include <vm/seg_vn.h>
57 #include <sys/vmsystm.h>
58 #include <sys/memnode.h>
59 #include <vm/vm_dep.h>
60 #include <sys/lgrp.h>
61 #include <sys/mem_config.h>
62 #include <sys/callb.h>
63 #include <sys/mem_cage.h>
64 #include <sys/sdt.h>
65 #include <sys/dumphdr.h>
66 #include <sys/swap.h>
67
68 extern uint_t vac_colors;
69
70 #define MAX_PRAGMA_ALIGN 128
71
72 /* vm_cpu_data0 for the boot cpu before kmem is initialized */
73
74 #if L2CACHE_ALIGN_MAX <= MAX_PRAGMA_ALIGN
75 #pragma align L2CACHE_ALIGN_MAX(vm_cpu_data0)
76 #else
77 #pragma align MAX_PRAGMA_ALIGN(vm_cpu_data0)
78 #endif
79 char vm_cpu_data0[VM_CPU_DATA_PADSIZE];
80
81 /*
82 * number of page colors equivalent to requested color in page_get routines.
83 * If set, keeps large pages intact longer and keeps MPO allocation
84 * from the local mnode in favor of acquiring the 'correct' page color from
85 * a demoted large page or from a remote mnode.
86 */
87 uint_t colorequiv;
88
89 /*
90 * color equivalency mask for each page size.
91 * Mask is computed based on cpu L2$ way sizes and colorequiv global.
92 * High 4 bits determine the number of high order bits of the color to ignore.
93 * Low 4 bits determine the number of low order bits of the color to ignore
94 * (only relevant for hashed index based page coloring); see the example below.
95 */
96 uchar_t colorequivszc[MMU_PAGE_SIZES];
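
/*
 * Illustrative sketch (hypothetical value, not from the original source):
 * an entry of colorequivszc[szc] == 0x21 decodes as
 *
 *	ceq_hi = colorequivszc[szc] >> 4;	(2 high order color bits ignored)
 *	ceq_lo = colorequivszc[szc] & 0xf;	(1 low order color bit ignored)
 *
 * so page colors that differ only in those bits are treated as equivalent.
 */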
97
98 /*
99 * if set, specifies the percentage of large pages that are free from within
100 * a large page region before attempting to lock those pages for
101 * page_get_contig_pages processing.
102 *
103 * Should be turned on when kpr is available, since page_trylock_contig_pages
104 * can then be more selective.
105 */
106
107 int ptcpthreshold;
108
109 /*
110 * Limit page get contig page search based on failure cnts in pgcpfailcnt[].
111 * Enabled by default via pgcplimitsearch.
112 *
113 * pgcpfailcnt[] is bounded by PGCPFAILMAX (>= 1/2 of installed
114 * memory). When reached, pgcpfailcnt[] is reset to 1/2 of this upper
115 * bound. This upper bound range guarantees:
116 * - all large page 'slots' will be searched over time
117 * - at least one (1) large page candidate is considered on each pgcp call
118 * - count doesn't wrap around to 0
119 */
120 pgcnt_t pgcpfailcnt[MMU_PAGE_SIZES];
121 int pgcplimitsearch = 1;
122
123 #define PGCPFAILMAX (1 << (highbit(physinstalled) - 1))
124 #define SETPGCPFAILCNT(szc) \
125 if (++pgcpfailcnt[szc] >= PGCPFAILMAX) \
126 pgcpfailcnt[szc] = PGCPFAILMAX / 2;
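
/*
 * Worked example (hypothetical numbers): with physinstalled == 0x180000
 * pages, highbit() returns 21, so PGCPFAILMAX == (1 << 20) == 0x100000,
 * which is at least half of installed memory. Once pgcpfailcnt[szc]
 * reaches that bound, SETPGCPFAILCNT() resets it to 0x80000, keeping the
 * count within the bounded range described above.
 */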
127
128 #ifdef VM_STATS
129 struct vmm_vmstats_str vmm_vmstats;
130
131 #endif /* VM_STATS */
132
133 #if defined(__sparc)
134 #define LPGCREATE 0
135 #else
136 /* enable page_get_contig_pages */
137 #define LPGCREATE 1
138 #endif
139
140 int pg_contig_disable;
141 int pg_lpgcreate_nocage = LPGCREATE;
142
143 /*
144 * page_freelist_split pfn flag to signify no lo or hi pfn requirement.
145 */
146 #define PFNNULL 0
147
148 /* Flags involved in promotion and demotion routines */
149 #define PC_FREE 0x1 /* put page on freelist */
150 #define PC_ALLOC 0x2 /* return page for allocation */
151
152 /*
153 * Flag for page_demote to be used with PC_FREE to denote that we don't care
154 * what the color is as the color parameter to the function is ignored.
155 */
156 #define PC_NO_COLOR (-1)
157
158 /* mtype value for page_promote to use when mtype does not matter */
159 #define PC_MTYPE_ANY (-1)
160
161 /*
162 * page counters candidates info
163 * See page_ctrs_cands comment below for more details.
164 * fields are as follows:
165 * pcc_pages_free: # pages which freelist coalesce can create
166 * pcc_color_free: pointer to page free counts per color
167 */
168 typedef struct pcc_info {
169 pgcnt_t pcc_pages_free;
170 pgcnt_t *pcc_color_free;
171 uint_t pad[12];
172 } pcc_info_t;
173
174 /*
175 * On big machines it can take a long time to check page_counters
176 * arrays. page_ctrs_cands is a summary array whose elements are a dynamically
177 * updated sum of all elements of the corresponding page_counters arrays.
178 * page_freelist_coalesce() searches page_counters only if an appropriate
179 * element of page_ctrs_cands array is greater than 0.
180 *
181 * page_ctrs_cands is indexed by mutex (i), region (r), mnode (m), mrange (g)
182 */
183 pcc_info_t **page_ctrs_cands[NPC_MUTEX][MMU_PAGE_SIZES];
184
185 /*
186 * Return in val the total number of free pages which can be created
187 * for the given mnode (m), mrange (g), and region size (r)
188 */
189 #define PGCTRS_CANDS_GETVALUE(m, g, r, val) { \
190 int i; \
191 val = 0; \
192 for (i = 0; i < NPC_MUTEX; i++) { \
193 val += page_ctrs_cands[i][(r)][(m)][(g)].pcc_pages_free; \
194 } \
195 }
196
197 /*
198 * Return in val the total number of free pages which can be created
199 * for the given mnode (m), mrange (g), region size (r), and color (c)
200 */
201 #define PGCTRS_CANDS_GETVALUECOLOR(m, g, r, c, val) { \
202 int i; \
203 val = 0; \
204 ASSERT((c) < PAGE_GET_PAGECOLORS(r)); \
205 for (i = 0; i < NPC_MUTEX; i++) { \
206 val += \
207 page_ctrs_cands[i][(r)][(m)][(g)].pcc_color_free[(c)]; \
208 } \
209 }
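
/*
 * Usage sketch (illustrative only; assumes locals mnode, mrange, r are in
 * scope): callers consult the candidate summaries before paying for a
 * page_counters walk, e.g.
 *
 *	pgcnt_t cands;
 *
 *	PGCTRS_CANDS_GETVALUE(mnode, mrange, r, cands);
 *	if (cands == 0)
 *		return (NULL);	(nothing can be coalesced at this size)
 *
 * which is the style of check page_freelist_coalesce() relies on.
 */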
210
211 /*
212 * We can only allow a single thread to update a counter within the physical
213 * range of the largest supported page size. That is the finest granularity
214 * possible since the counter values are dependent on each other
215 * as you move across region sizes. PP_CTR_LOCK_INDX is used to determine the
216 * ctr_mutex lock index for a particular physical range (see the example below).
217 */
218 static kmutex_t *ctr_mutex[NPC_MUTEX];
219
220 #define PP_CTR_LOCK_INDX(pp) \
221 (((pp)->p_pagenum >> \
222 (PAGE_BSZS_SHIFT(mmu_page_sizes - 1))) & (NPC_MUTEX - 1))
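
/*
 * Illustrative example (hypothetical sizes): if the largest page size
 * spans 512 base pages, PAGE_BSZS_SHIFT(mmu_page_sizes - 1) is 9, so
 * pages with p_pagenum 0x200..0x3ff all yield the same lock index
 * (1 & (NPC_MUTEX - 1)). Any two pages inside one largest-size region
 * therefore always share the same ctr_mutex entry.
 */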
223
224 #define INVALID_COLOR 0xffffffff
225 #define INVALID_MASK 0xffffffff
226
227 /*
228 * Local functions prototypes.
229 */
230
231 void page_ctr_add(int, int, page_t *, int);
232 void page_ctr_add_internal(int, int, page_t *, int);
233 void page_ctr_sub(int, int, page_t *, int);
234 void page_ctr_sub_internal(int, int, page_t *, int);
235 void page_freelist_lock(int);
236 void page_freelist_unlock(int);
237 page_t *page_promote(int, pfn_t, uchar_t, int, int);
238 page_t *page_demote(int, pfn_t, pfn_t, uchar_t, uchar_t, int, int);
239 page_t *page_freelist_split(uchar_t,
240 uint_t, int, int, pfn_t, pfn_t, page_list_walker_t *);
241 page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int);
242 static int page_trylock_cons(page_t *pp, se_t se);
243
244 /*
245 * The page_counters array below is used to keep track of free contiguous
246 * physical memory. A hw_page_map_t will be allocated per mnode per szc.
247 * This contains an array of counters, the size of the array, a shift value
248 * used to convert a pagenum into a counter array index or vice versa, as
249 * well as a cache of the last successful index to be promoted to a larger
250 * page size. As an optimization, we keep track of the last successful index
251 * to be promoted per page color for the given size region, and this is
252 * allocated dynamically based upon the number of colors for a given
253 * region size.
254 *
255 * Conceptually, the page counters are represented as:
256 *
257 * page_counters[region_size][mnode]
258 *
259 * region_size: size code of a candidate larger page made up
260 * of contiguous free smaller pages.
261 *
262 * page_counters[region_size][mnode].hpm_counters[index]:
263 * represents how many (region_size - 1) pages either
264 * exist or can be created within the given index range.
265 *
266 * Let's look at a sparc example:
267 * If we want to create a free 512k page, we look at region_size 2
268 * for the mnode we want. We calculate the index and look at a specific
269 * hpm_counters location. If we see 8 (FULL_REGION_CNT on sparc) at
270 * this location, it means that 8 64k pages either exist or can be created
271 * from 8K pages in order to make a single free 512k page at the given
272 * index. Note that when a region is full, it will contribute to the
273 * counts in the region above it. Thus we will not know what page
274 * size the free pages will be which can be promoted to this new free
275 * page unless we look at all regions below the current region.
276 */
277
278 /*
279 * Note: hpmctr_t is defined in platform vm_dep.h
280 * hw_page_map_t contains all the information needed for the page_counters
281 * logic. The fields are as follows:
282 *
283 * hpm_counters: dynamically allocated array to hold counter data
284 * hpm_entries: entries in hpm_counters
285 * hpm_shift: shift for pnum/array index conv
286 * hpm_base: PFN mapped to counter index 0
287 * hpm_color_current: last index in counter array for this color at
288 * which we successfully created a large page
289 */
290 typedef struct hw_page_map {
291 hpmctr_t *hpm_counters;
292 size_t hpm_entries;
293 int hpm_shift;
294 pfn_t hpm_base;
295 size_t *hpm_color_current[MAX_MNODE_MRANGES];
296 #if defined(__sparc)
297 uint_t pad[4];
298 #endif
299 } hw_page_map_t;
300
301 /*
302 * Element zero is not used, but is allocated for convenience.
303 */
304 static hw_page_map_t *page_counters[MMU_PAGE_SIZES];
305
306 /*
307 * Cached value of MNODE_RANGE_CNT(mnode).
308 * This is a function call in x86.
309 */
310 static int mnode_nranges[MAX_MEM_NODES];
311 static int mnode_maxmrange[MAX_MEM_NODES];
312
313 /*
314 * The following macros are convenient ways to get access to the individual
315 * elements of the page_counters arrays. They can be used on both
316 * the left side and right side of equations.
317 */
318 #define PAGE_COUNTERS(mnode, rg_szc, idx) \
319 (page_counters[(rg_szc)][(mnode)].hpm_counters[(idx)])
320
321 #define PAGE_COUNTERS_COUNTERS(mnode, rg_szc) \
322 (page_counters[(rg_szc)][(mnode)].hpm_counters)
323
324 #define PAGE_COUNTERS_SHIFT(mnode, rg_szc) \
325 (page_counters[(rg_szc)][(mnode)].hpm_shift)
326
327 #define PAGE_COUNTERS_ENTRIES(mnode, rg_szc) \
328 (page_counters[(rg_szc)][(mnode)].hpm_entries)
329
330 #define PAGE_COUNTERS_BASE(mnode, rg_szc) \
331 (page_counters[(rg_szc)][(mnode)].hpm_base)
332
333 #define PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, rg_szc, g) \
334 (page_counters[(rg_szc)][(mnode)].hpm_color_current[(g)])
335
336 #define PAGE_COUNTERS_CURRENT_COLOR(mnode, rg_szc, color, mrange) \
337 (page_counters[(rg_szc)][(mnode)]. \
338 hpm_color_current[(mrange)][(color)])
339
340 #define PNUM_TO_IDX(mnode, rg_szc, pnum) \
341 (((pnum) - PAGE_COUNTERS_BASE((mnode), (rg_szc))) >> \
342 PAGE_COUNTERS_SHIFT((mnode), (rg_szc)))
343
344 #define IDX_TO_PNUM(mnode, rg_szc, index) \
345 (PAGE_COUNTERS_BASE((mnode), (rg_szc)) + \
346 ((index) << PAGE_COUNTERS_SHIFT((mnode), (rg_szc))))
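
/*
 * Worked example (hypothetical values): with hpm_base == 0x1000 and
 * hpm_shift == 3 (8 base pages per counter entry), a pfn of 0x1028 gives
 *
 *	PNUM_TO_IDX(mnode, r, 0x1028) == (0x1028 - 0x1000) >> 3 == 5
 *	IDX_TO_PNUM(mnode, r, 5)      == 0x1000 + (5 << 3)     == 0x1028
 *
 * which is the identity property asserted in page_ctrs_alloc().
 */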
347
348 /*
349 * Protects the hpm_counters and hpm_color_current memory from changing while
350 * looking at page counters information.
351 * Grab the write lock to modify what these fields point at.
352 * Grab the read lock to prevent any pointers from changing.
353 * The write lock can not be held during memory allocation due to a possible
354 * recursion deadlock with trying to grab the read lock while the
355 * write lock is already held.
356 */
357 krwlock_t page_ctrs_rwlock[MAX_MEM_NODES];
358
359
360 /*
361 * initialize cpu_vm_data to point at cache aligned vm_cpu_data_t.
362 */
363 void
364 cpu_vm_data_init(struct cpu *cp)
365 {
366 if (cp == CPU0) {
367 cp->cpu_vm_data = (void *)&vm_cpu_data0;
368 } else {
369 void *kmptr;
370 int align;
371 size_t sz;
372
373 align = (L2CACHE_ALIGN) ? L2CACHE_ALIGN : L2CACHE_ALIGN_MAX;
374 sz = P2ROUNDUP(sizeof (vm_cpu_data_t), align) + align;
375 kmptr = kmem_zalloc(sz, KM_SLEEP);
376 cp->cpu_vm_data = (void *) P2ROUNDUP((uintptr_t)kmptr, align);
377 ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr = kmptr;
378 ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize = sz;
379 }
380 }
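
/*
 * Alignment sketch (hypothetical numbers): with align == 64 and
 * sizeof (vm_cpu_data_t) == 200, sz == P2ROUNDUP(200, 64) + 64 == 320.
 * Rounding the kmem_zalloc() result up to the next 64-byte boundary skips
 * at most 63 bytes, so the aligned vm_cpu_data_t always fits within the
 * allocation, and the raw pointer/size are kept for cpu_vm_data_destroy().
 */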
381
382 /*
383 * free cpu_vm_data
384 */
385 void
386 cpu_vm_data_destroy(struct cpu *cp)
387 {
388 if (cp->cpu_seqid && cp->cpu_vm_data) {
389 ASSERT(cp != CPU0);
390 kmem_free(((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr,
391 ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize);
392 }
393 cp->cpu_vm_data = NULL;
394 }
395
396
397 /*
398 * page size to page size code
399 */
400 int
401 page_szc(size_t pagesize)
402 {
403 int i = 0;
404
405 while (hw_page_array[i].hp_size) {
406 if (pagesize == hw_page_array[i].hp_size)
407 return (i);
408 i++;
409 }
410 return (-1);
411 }
412
413 /*
414 * page size to page size code with the restriction that it be a supported
415 * user page size. If it's not a supported user page size, -1 will be returned.
416 */
417 int
418 page_szc_user_filtered(size_t pagesize)
419 {
420 int szc = page_szc(pagesize);
421 if ((szc != -1) && (SZC_2_USERSZC(szc) != -1)) {
422 return (szc);
423 }
424 return (-1);
425 }
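
/*
 * Usage sketch (illustrative; assumes hw_page_array[0] describes the base
 * page size, as it does on the platforms this file supports):
 * page_szc(MMU_PAGESIZE) returns 0, an unsupported size returns -1, and
 * page_szc_user_filtered() additionally rejects size codes that are not
 * exported to user programs.
 */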
426
427 /*
428 * Return how many page sizes are available for the user to use. This is
429 * what the hardware supports and not based upon how the OS implements the
430 * support of different page sizes.
431 *
432 * If legacy is non-zero, return the number of pagesizes available to legacy
433 * applications. The number of legacy page sizes might be less than the
434 * exported user page sizes. This is to prevent legacy applications that
435 * use the largest page size returned from getpagesizes(3c) from inadvertently
436 * using the 'new' large pagesizes.
437 */
438 uint_t
439 page_num_user_pagesizes(int legacy)
440 {
441 if (legacy)
442 return (mmu_legacy_page_sizes);
443 return (mmu_exported_page_sizes);
444 }
445
446 uint_t
447 page_num_pagesizes(void)
448 {
449 return (mmu_page_sizes);
450 }
451
452 /*
453 * returns the count of the number of base pagesize pages associated with szc
454 */
455 pgcnt_t
456 page_get_pagecnt(uint_t szc)
457 {
458 if (szc >= mmu_page_sizes)
459 panic("page_get_pagecnt: out of range %d", szc);
460 return (hw_page_array[szc].hp_pgcnt);
461 }
462
463 size_t
464 page_get_pagesize(uint_t szc)
465 {
466 if (szc >= mmu_page_sizes)
467 panic("page_get_pagesize: out of range %d", szc);
468 return (hw_page_array[szc].hp_size);
469 }
470
471 /*
472 * Return the size of a page based upon the index passed in. An index of
473 * zero refers to the smallest page size in the system, and as index increases
474 * it refers to the next larger supported page size in the system.
475 * Note that szc and userszc may not be the same due to unsupported szc's on
476 * some systems.
477 */
478 size_t
479 page_get_user_pagesize(uint_t userszc)
480 {
481 uint_t szc = USERSZC_2_SZC(userszc);
482
483 if (szc >= mmu_page_sizes)
484 panic("page_get_user_pagesize: out of range %d", szc);
485 return (hw_page_array[szc].hp_size);
486 }
487
488 uint_t
489 page_get_shift(uint_t szc)
490 {
491 if (szc >= mmu_page_sizes)
492 panic("page_get_shift: out of range %d", szc);
493 return (PAGE_GET_SHIFT(szc));
494 }
495
496 uint_t
497 page_get_pagecolors(uint_t szc)
498 {
499 if (szc >= mmu_page_sizes)
500 panic("page_get_pagecolors: out of range %d", szc);
501 return (PAGE_GET_PAGECOLORS(szc));
502 }
503
504 /*
505 * this assigns the desired equivalent color after a split
506 */
507 uint_t
508 page_correct_color(uchar_t szc, uchar_t nszc, uint_t color,
509 uint_t ncolor, uint_t ceq_mask)
510 {
511 ASSERT(nszc > szc);
512 ASSERT(szc < mmu_page_sizes);
513 ASSERT(color < PAGE_GET_PAGECOLORS(szc));
514 ASSERT(ncolor < PAGE_GET_PAGECOLORS(nszc));
515
516 color &= ceq_mask;
517 ncolor = PAGE_CONVERT_COLOR(ncolor, szc, nszc);
518 return (color | (ncolor & ~ceq_mask));
519 }
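
/*
 * Illustrative example (hypothetical values): with ceq_mask == 0x3, a
 * requested color of 0x7, and a parent color that converts to 0x9 in the
 * smaller size code's color space, the result keeps the equivalent bits
 * of the request (0x7 & 0x3 == 0x3) and takes the rest from the parent
 * (0x9 & ~0x3 == 0x8), giving 0xb.
 */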
520
521 /*
522 * The interleaved_mnodes flag is set when mnodes overlap in
523 * the physbase..physmax range, but have disjoint slices.
524 * In this case hpm_counters is shared by all mnodes.
525 * This flag is set dynamically by the platform.
526 */
527 int interleaved_mnodes = 0;
528
529 /*
530 * Called by startup().
531 * Size up the per page size free list counters based on physmax
532 * of each node and max_mem_nodes.
533 *
534 * If interleaved_mnodes is set we need to find the first mnode that
535 * exists. hpm_counters for the first mnode will then be shared by
536 * all other mnodes. If interleaved_mnodes is not set, just set
537 * first=mnode each time. That means there will be no sharing.
538 */
539 size_t
540 page_ctrs_sz(void)
541 {
542 int r; /* region size */
543 int mnode;
544 int firstmn; /* first mnode that exists */
545 int nranges;
546 pfn_t physbase;
547 pfn_t physmax;
548 uint_t ctrs_sz = 0;
549 int i;
550 pgcnt_t colors_per_szc[MMU_PAGE_SIZES];
551
552 /*
553 * We need to determine how many page colors there are for each
554 * page size in order to allocate memory for any color specific
555 * arrays.
556 */
557 for (i = 0; i < mmu_page_sizes; i++) {
558 colors_per_szc[i] = PAGE_GET_PAGECOLORS(i);
559 }
560
561 for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) {
562
563 pgcnt_t r_pgcnt;
564 pfn_t r_base;
565 pgcnt_t r_align;
566
567 if (mem_node_config[mnode].exists == 0)
568 continue;
569
570 HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn);
571 nranges = MNODE_RANGE_CNT(mnode);
572 mnode_nranges[mnode] = nranges;
573 mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode);
574
575 /*
576 * determine size needed for page counter arrays with
577 * base aligned to large page size.
578 */
579 for (r = 1; r < mmu_page_sizes; r++) {
580 /* add in space for hpm_color_current */
581 ctrs_sz += sizeof (size_t) *
582 colors_per_szc[r] * nranges;
583
584 if (firstmn != mnode)
585 continue;
586
587 /* add in space for hpm_counters */
588 r_align = page_get_pagecnt(r);
589 r_base = physbase;
590 r_base &= ~(r_align - 1);
591 r_pgcnt = howmany(physmax - r_base + 1, r_align);
592
593 /*
594 * Round up to always allocate on pointer sized
595 * boundaries.
596 */
597 ctrs_sz += P2ROUNDUP((r_pgcnt * sizeof (hpmctr_t)),
598 sizeof (hpmctr_t *));
599 }
600 }
601
602 for (r = 1; r < mmu_page_sizes; r++) {
603 ctrs_sz += (max_mem_nodes * sizeof (hw_page_map_t));
604 }
605
606 /* add in space for page_ctrs_cands and pcc_color_free */
607 ctrs_sz += sizeof (pcc_info_t *) * max_mem_nodes *
608 mmu_page_sizes * NPC_MUTEX;
609
610 for (mnode = 0; mnode < max_mem_nodes; mnode++) {
611
612 if (mem_node_config[mnode].exists == 0)
613 continue;
614
615 nranges = mnode_nranges[mnode];
616 ctrs_sz += sizeof (pcc_info_t) * nranges *
617 mmu_page_sizes * NPC_MUTEX;
618 for (r = 1; r < mmu_page_sizes; r++) {
619 ctrs_sz += sizeof (pgcnt_t) * nranges *
620 colors_per_szc[r] * NPC_MUTEX;
621 }
622 }
623
624 /* ctr_mutex */
625 ctrs_sz += (max_mem_nodes * NPC_MUTEX * sizeof (kmutex_t));
626
627 /* size for page list counts */
628 PLCNT_SZ(ctrs_sz);
629
630 /*
631 * add some slop for roundups. page_ctrs_alloc will roundup the start
632 * address of the counters to ecache_alignsize boundary for every
633 * memory node.
634 */
635 return (ctrs_sz + max_mem_nodes * L2CACHE_ALIGN);
636 }
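
/*
 * Boot-time usage sketch (illustrative; the real callers live in the
 * platform startup code): page_ctrs_sz() sizes one contiguous buffer,
 * which page_ctrs_alloc() then carves up, roughly:
 *
 *	size_t sz = page_ctrs_sz();
 *	caddr_t base = <startup-time allocation of sz bytes>;
 *	caddr_t end = page_ctrs_alloc(base);
 *	ASSERT(end <= base + sz);
 */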
637
638 caddr_t
639 page_ctrs_alloc(caddr_t alloc_base)
640 {
641 int mnode;
642 int mrange, nranges;
643 int r; /* region size */
644 int i;
645 int firstmn; /* first mnode that exists */
646 pfn_t physbase;
647 pfn_t physmax;
648 pgcnt_t colors_per_szc[MMU_PAGE_SIZES];
649
650 /*
651 * We need to determine how many page colors there are for each
652 * page size in order to allocate memory for any color specific
653 * arrays.
654 */
655 for (i = 0; i < mmu_page_sizes; i++) {
656 colors_per_szc[i] = PAGE_GET_PAGECOLORS(i);
657 }
658
659 for (r = 1; r < mmu_page_sizes; r++) {
660 page_counters[r] = (hw_page_map_t *)alloc_base;
661 alloc_base += (max_mem_nodes * sizeof (hw_page_map_t));
662 }
663
664 /* page_ctrs_cands and pcc_color_free array */
665 for (i = 0; i < NPC_MUTEX; i++) {
666 for (r = 1; r < mmu_page_sizes; r++) {
667
668 page_ctrs_cands[i][r] = (pcc_info_t **)alloc_base;
669 alloc_base += sizeof (pcc_info_t *) * max_mem_nodes;
670
671 for (mnode = 0; mnode < max_mem_nodes; mnode++) {
672 pcc_info_t *pi;
673
674 if (mem_node_config[mnode].exists == 0)
675 continue;
676
677 nranges = mnode_nranges[mnode];
678
679 pi = (pcc_info_t *)alloc_base;
680 alloc_base += sizeof (pcc_info_t) * nranges;
681 page_ctrs_cands[i][r][mnode] = pi;
682
683 for (mrange = 0; mrange < nranges; mrange++) {
684 pi->pcc_color_free =
685 (pgcnt_t *)alloc_base;
686 alloc_base += sizeof (pgcnt_t) *
687 colors_per_szc[r];
688 pi++;
689 }
690 }
691 }
692 }
693
694 /* ctr_mutex */
695 for (i = 0; i < NPC_MUTEX; i++) {
696 ctr_mutex[i] = (kmutex_t *)alloc_base;
697 alloc_base += (max_mem_nodes * sizeof (kmutex_t));
698 }
699
700 /* initialize page list counts */
701 PLCNT_INIT(alloc_base);
702
703 for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) {
704
705 pgcnt_t r_pgcnt;
706 pfn_t r_base;
707 pgcnt_t r_align;
708 int r_shift;
709 int nranges = mnode_nranges[mnode];
710
711 if (mem_node_config[mnode].exists == 0)
712 continue;
713
714 HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn);
715
716 for (r = 1; r < mmu_page_sizes; r++) {
717 /*
718 * the page_counters base has to be aligned to the
719 * page count of page size code r otherwise the counts
720 * will cross large page boundaries.
721 */
722 r_align = page_get_pagecnt(r);
723 r_base = physbase;
724 /* base needs to be aligned - lower to aligned value */
725 r_base &= ~(r_align - 1);
726 r_pgcnt = howmany(physmax - r_base + 1, r_align);
727 r_shift = PAGE_BSZS_SHIFT(r);
728
729 PAGE_COUNTERS_SHIFT(mnode, r) = r_shift;
730 PAGE_COUNTERS_ENTRIES(mnode, r) = r_pgcnt;
731 PAGE_COUNTERS_BASE(mnode, r) = r_base;
732 for (mrange = 0; mrange < nranges; mrange++) {
733 PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode,
734 r, mrange) = (size_t *)alloc_base;
735 alloc_base += sizeof (size_t) *
736 colors_per_szc[r];
737 }
738 for (i = 0; i < colors_per_szc[r]; i++) {
739 uint_t color_mask = colors_per_szc[r] - 1;
740 pfn_t pfnum = r_base;
741 size_t idx;
742 int mrange;
743 MEM_NODE_ITERATOR_DECL(it);
744
745 MEM_NODE_ITERATOR_INIT(pfnum, mnode, r, &it);
746 if (pfnum == (pfn_t)-1) {
747 idx = 0;
748 } else {
749 PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i,
750 color_mask, color_mask, &it);
751 idx = PNUM_TO_IDX(mnode, r, pfnum);
752 idx = (idx >= r_pgcnt) ? 0 : idx;
753 }
754 for (mrange = 0; mrange < nranges; mrange++) {
755 PAGE_COUNTERS_CURRENT_COLOR(mnode,
756 r, i, mrange) = idx;
757 }
758 }
759
760 /* hpm_counters may be shared by all mnodes */
761 if (firstmn == mnode) {
762 PAGE_COUNTERS_COUNTERS(mnode, r) =
763 (hpmctr_t *)alloc_base;
764 alloc_base +=
765 P2ROUNDUP((sizeof (hpmctr_t) * r_pgcnt),
766 sizeof (hpmctr_t *));
767 } else {
768 PAGE_COUNTERS_COUNTERS(mnode, r) =
769 PAGE_COUNTERS_COUNTERS(firstmn, r);
770 }
771
772 /*
773 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
774 * satisfy the identity requirement.
775 * We should be able to go from one to the other
776 * and get consistent values.
777 */
778 ASSERT(PNUM_TO_IDX(mnode, r,
779 (IDX_TO_PNUM(mnode, r, 0))) == 0);
780 ASSERT(IDX_TO_PNUM(mnode, r,
781 (PNUM_TO_IDX(mnode, r, r_base))) == r_base);
782 }
783 /*
784 * Roundup the start address of the page_counters to
785 * cache aligned boundary for every memory node.
786 * page_ctrs_sz() has added some slop for these roundups.
787 */
788 alloc_base = (caddr_t)P2ROUNDUP((uintptr_t)alloc_base,
789 L2CACHE_ALIGN);
790 }
791
792 /* Initialize other page counter specific data structures. */
793 for (mnode = 0; mnode < MAX_MEM_NODES; mnode++) {
794 rw_init(&page_ctrs_rwlock[mnode], NULL, RW_DEFAULT, NULL);
795 }
796
797 return (alloc_base);
798 }
799
800 /*
801 * Functions to adjust region counters for each size free list.
802 * Caller is responsible to acquire the ctr_mutex lock if necessary and
803 * thus can be called during startup without locks.
804 */
805 /* ARGSUSED */
806 void
807 page_ctr_add_internal(int mnode, int mtype, page_t *pp, int flags)
808 {
809 ssize_t r; /* region size */
810 ssize_t idx;
811 pfn_t pfnum;
812 int lckidx;
813
814 ASSERT(mnode == PP_2_MEM_NODE(pp));
815 ASSERT(mtype == PP_2_MTYPE(pp));
816
817 ASSERT(pp->p_szc < mmu_page_sizes);
818
819 PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);
820
821 /* no counter update needed for largest page size */
822 if (pp->p_szc >= mmu_page_sizes - 1) {
823 return;
824 }
825
826 r = pp->p_szc + 1;
827 pfnum = pp->p_pagenum;
828 lckidx = PP_CTR_LOCK_INDX(pp);
829
830 /*
831 * Increment the count of free pages for the current
832 * region. Continue looping up in region size incrementing
833 * count if the preceding region is full.
834 */
835 while (r < mmu_page_sizes) {
836 idx = PNUM_TO_IDX(mnode, r, pfnum);
837
838 ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
839 ASSERT(PAGE_COUNTERS(mnode, r, idx) < FULL_REGION_CNT(r));
840
841 if (++PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r)) {
842 break;
843 } else {
844 int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r));
845 pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode]
846 [MTYPE_2_MRANGE(mnode, root_mtype)];
847
848 cand->pcc_pages_free++;
849 cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]++;
850 }
851 r++;
852 }
853 }
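
/*
 * Roll-up example (illustrative, using the sparc sizes from the comment
 * above page_counters): freeing the 8K page that brings a 64K region's
 * counter up to FULL_REGION_CNT(1) also increments the enclosing 512K
 * region's counter, and so on upward, while pcc_pages_free and
 * pcc_color_free record that a fully populated region is now available
 * to page_freelist_coalesce().
 */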
854
855 void
856 page_ctr_add(int mnode, int mtype, page_t *pp, int flags)
857 {
858 int lckidx = PP_CTR_LOCK_INDX(pp);
859 kmutex_t *lock = &ctr_mutex[lckidx][mnode];
860
861 mutex_enter(lock);
862 page_ctr_add_internal(mnode, mtype, pp, flags);
863 mutex_exit(lock);
864 }
865
866 void
867 page_ctr_sub_internal(int mnode, int mtype, page_t *pp, int flags)
868 {
869 int lckidx;
870 ssize_t r; /* region size */
871 ssize_t idx;
872 pfn_t pfnum;
873
874 ASSERT(mnode == PP_2_MEM_NODE(pp));
875 ASSERT(mtype == PP_2_MTYPE(pp));
876
877 ASSERT(pp->p_szc < mmu_page_sizes);
878
879 PLCNT_DECR(pp, mnode, mtype, pp->p_szc, flags);
880
881 /* no counter update needed for largest page size */
882 if (pp->p_szc >= mmu_page_sizes - 1) {
883 return;
884 }
885
886 r = pp->p_szc + 1;
887 pfnum = pp->p_pagenum;
888 lckidx = PP_CTR_LOCK_INDX(pp);
889
890 /*
891 * Decrement the count of free pages for the current
892 * region. Continue looping up in region size decrementing
893 * count if the preceding region was full.
894 */
895 while (r < mmu_page_sizes) {
896 idx = PNUM_TO_IDX(mnode, r, pfnum);
897
898 ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
899 ASSERT(PAGE_COUNTERS(mnode, r, idx) > 0);
900
901 if (--PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r) - 1) {
902 break;
903 } else {
904 int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r));
905 pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode]
906 [MTYPE_2_MRANGE(mnode, root_mtype)];
907
908 ASSERT(cand->pcc_pages_free != 0);
909 ASSERT(cand->pcc_color_free[PP_2_BIN_SZC(pp, r)] != 0);
910
911 cand->pcc_pages_free--;
912 cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]--;
913 }
914 r++;
915 }
916 }
917
918 void
919 page_ctr_sub(int mnode, int mtype, page_t *pp, int flags)
920 {
921 int lckidx = PP_CTR_LOCK_INDX(pp);
922 kmutex_t *lock = &ctr_mutex[lckidx][mnode];
923
924 mutex_enter(lock);
925 page_ctr_sub_internal(mnode, mtype, pp, flags);
926 mutex_exit(lock);
927 }
928
929 /*
930 * Adjust page counters following a memory attach, since typically the
931 * size of the array needs to change, and the PFN to counter index
932 * mapping needs to change.
933 *
934 * It is possible this mnode did not exist at startup. In that case
935 * allocate pcc_info_t and pcc_color_free arrays. Also, allow for nranges
936 * to change (a theoretical possibility on x86), which means pcc_color_free
937 * arrays must be extended.
938 */
939 uint_t
940 page_ctrs_adjust(int mnode)
941 {
942 pgcnt_t npgs;
943 int r; /* region size */
944 int i;
945 size_t pcsz, old_csz;
946 hpmctr_t *new_ctr, *old_ctr;
947 pfn_t oldbase, newbase;
948 pfn_t physbase, physmax;
949 size_t old_npgs;
950 hpmctr_t *ctr_cache[MMU_PAGE_SIZES];
951 size_t size_cache[MMU_PAGE_SIZES];
952 size_t *color_cache[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
953 size_t *old_color_array[MAX_MNODE_MRANGES];
954 pgcnt_t colors_per_szc[MMU_PAGE_SIZES];
955 pcc_info_t **cands_cache;
956 pcc_info_t *old_pi, *pi;
957 pgcnt_t *pgcntp;
958 int nr, old_nranges, mrange, nranges = MNODE_RANGE_CNT(mnode);
959 int cands_cache_nranges;
960 int old_maxmrange, new_maxmrange;
961 int rc = 0;
962 int oldmnode;
963
964 cands_cache = kmem_zalloc(sizeof (pcc_info_t *) * NPC_MUTEX *
965 MMU_PAGE_SIZES, KM_NOSLEEP);
966 if (cands_cache == NULL)
967 return (ENOMEM);
968
969 i = -1;
970 HPM_COUNTERS_LIMITS(mnode, physbase, physmax, i);
971
972 newbase = physbase & ~PC_BASE_ALIGN_MASK;
973 npgs = roundup(physmax, PC_BASE_ALIGN) - newbase;
974
975 /* prepare to free non-null pointers on the way out */
976 cands_cache_nranges = nranges;
977 bzero(ctr_cache, sizeof (ctr_cache));
978 bzero(color_cache, sizeof (color_cache));
979
980 /*
981 * We need to determine how many page colors there are for each
982 * page size in order to allocate memory for any color specific
983 * arrays.
984 */
985 for (r = 0; r < mmu_page_sizes; r++) {
986 colors_per_szc[r] = PAGE_GET_PAGECOLORS(r);
987 }
988
989 /*
990 * Preallocate all of the new hpm_counters arrays as we can't
991 * hold the page_ctrs_rwlock as a writer and allocate memory.
992 * If we can't allocate all of the arrays, undo our work so far
993 * and return failure.
994 */
995 for (r = 1; r < mmu_page_sizes; r++) {
996 pcsz = npgs >> PAGE_BSZS_SHIFT(r);
997 size_cache[r] = pcsz;
998 ctr_cache[r] = kmem_zalloc(pcsz *
999 sizeof (hpmctr_t), KM_NOSLEEP);
1000 if (ctr_cache[r] == NULL) {
1001 rc = ENOMEM;
1002 goto cleanup;
1003 }
1004 }
1005
1006 /*
1007 * Preallocate all of the new color current arrays as we can't
1008 * hold the page_ctrs_rwlock as a writer and allocate memory.
1009 * If we can't allocate all of the arrays, undo our work so far
1010 * and return failure.
1011 */
1012 for (r = 1; r < mmu_page_sizes; r++) {
1013 for (mrange = 0; mrange < nranges; mrange++) {
1014 color_cache[r][mrange] = kmem_zalloc(sizeof (size_t) *
1015 colors_per_szc[r], KM_NOSLEEP);
1016 if (color_cache[r][mrange] == NULL) {
1017 rc = ENOMEM;
1018 goto cleanup;
1019 }
1020 }
1021 }
1022
1023 /*
1024 * Preallocate all of the new pcc_info_t arrays as we can't
1025 * hold the page_ctrs_rwlock as a writer and allocate memory.
1026 * If we can't allocate all of the arrays, undo our work so far
1027 * and return failure.
1028 */
1029 for (r = 1; r < mmu_page_sizes; r++) {
1030 for (i = 0; i < NPC_MUTEX; i++) {
1031 pi = kmem_zalloc(nranges * sizeof (pcc_info_t),
1032 KM_NOSLEEP);
1033 if (pi == NULL) {
1034 rc = ENOMEM;
1035 goto cleanup;
1036 }
1037 cands_cache[i * MMU_PAGE_SIZES + r] = pi;
1038
1039 for (mrange = 0; mrange < nranges; mrange++, pi++) {
1040 pgcntp = kmem_zalloc(colors_per_szc[r] *
1041 sizeof (pgcnt_t), KM_NOSLEEP);
1042 if (pgcntp == NULL) {
1043 rc = ENOMEM;
1044 goto cleanup;
1045 }
1046 pi->pcc_color_free = pgcntp;
1047 }
1048 }
1049 }
1050
1051 /*
1052 * Grab the write lock to prevent others from walking these arrays
1053 * while we are modifying them.
1054 */
1055 PAGE_CTRS_WRITE_LOCK(mnode);
1056
1057 /*
1058 * For interleaved mnodes, find the first mnode
1059 * with valid page counters since the current
1060 * mnode may have just been added and not have
1061 * valid page counters.
1062 */
1063 if (interleaved_mnodes) {
1064 for (i = 0; i < max_mem_nodes; i++)
1065 if (PAGE_COUNTERS_COUNTERS(i, 1) != NULL)
1066 break;
1067 ASSERT(i < max_mem_nodes);
1068 oldmnode = i;
1069 } else
1070 oldmnode = mnode;
1071
1072 old_nranges = mnode_nranges[mnode];
1073 cands_cache_nranges = old_nranges;
1074 mnode_nranges[mnode] = nranges;
1075 old_maxmrange = mnode_maxmrange[mnode];
1076 mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode);
1077 new_maxmrange = mnode_maxmrange[mnode];
1078
1079 for (r = 1; r < mmu_page_sizes; r++) {
1080 PAGE_COUNTERS_SHIFT(mnode, r) = PAGE_BSZS_SHIFT(r);
1081 old_ctr = PAGE_COUNTERS_COUNTERS(oldmnode, r);
1082 old_csz = PAGE_COUNTERS_ENTRIES(oldmnode, r);
1083 oldbase = PAGE_COUNTERS_BASE(oldmnode, r);
1084 old_npgs = old_csz << PAGE_COUNTERS_SHIFT(oldmnode, r);
1085 for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
1086 old_color_array[mrange] =
1087 PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode,
1088 r, mrange);
1089 }
1090
1091 pcsz = npgs >> PAGE_COUNTERS_SHIFT(mnode, r);
1092 new_ctr = ctr_cache[r];
1093 ctr_cache[r] = NULL;
1094 if (old_ctr != NULL &&
1095 (oldbase + old_npgs > newbase) &&
1096 (newbase + npgs > oldbase)) {
1097 /*
1098 * Map the intersection of the old and new
1099 * counters into the new array.
1100 */
1101 size_t offset;
1102 if (newbase > oldbase) {
1103 offset = (newbase - oldbase) >>
1104 PAGE_COUNTERS_SHIFT(mnode, r);
1105 bcopy(old_ctr + offset, new_ctr,
1106 MIN(pcsz, (old_csz - offset)) *
1107 sizeof (hpmctr_t));
1108 } else {
1109 offset = (oldbase - newbase) >>
1110 PAGE_COUNTERS_SHIFT(mnode, r);
1111 bcopy(old_ctr, new_ctr + offset,
1112 MIN(pcsz - offset, old_csz) *
1113 sizeof (hpmctr_t));
1114 }
1115 }
1116
1117 PAGE_COUNTERS_COUNTERS(mnode, r) = new_ctr;
1118 PAGE_COUNTERS_ENTRIES(mnode, r) = pcsz;
1119 PAGE_COUNTERS_BASE(mnode, r) = newbase;
1120
1121 /* update shared hpm_counters in other mnodes */
1122 if (interleaved_mnodes) {
1123 for (i = 0; i < max_mem_nodes; i++) {
1124 if ((i == mnode) ||
1125 (mem_node_config[i].exists == 0))
1126 continue;
1127 ASSERT(
1128 PAGE_COUNTERS_COUNTERS(i, r) == old_ctr ||
1129 PAGE_COUNTERS_COUNTERS(i, r) == NULL);
1130 PAGE_COUNTERS_COUNTERS(i, r) = new_ctr;
1131 PAGE_COUNTERS_ENTRIES(i, r) = pcsz;
1132 PAGE_COUNTERS_BASE(i, r) = newbase;
1133 }
1134 }
1135
1136 for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
1137 PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r, mrange) =
1138 color_cache[r][mrange];
1139 color_cache[r][mrange] = NULL;
1140 }
1141 /*
1142 * for now, just reset on these events as it's probably
1143 * not worthwhile to try and optimize this.
1144 */
1145 for (i = 0; i < colors_per_szc[r]; i++) {
1146 uint_t color_mask = colors_per_szc[r] - 1;
1147 int mlo = interleaved_mnodes ? 0 : mnode;
1148 int mhi = interleaved_mnodes ? max_mem_nodes :
1149 (mnode + 1);
1150 int m;
1151 pfn_t pfnum;
1152 size_t idx;
1153 MEM_NODE_ITERATOR_DECL(it);
1154
1155 for (m = mlo; m < mhi; m++) {
1156 if (mem_node_config[m].exists == 0)
1157 continue;
1158 pfnum = newbase;
1159 MEM_NODE_ITERATOR_INIT(pfnum, m, r, &it);
1160 if (pfnum == (pfn_t)-1) {
1161 idx = 0;
1162 } else {
1163 PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i,
1164 color_mask, color_mask, &it);
1165 idx = PNUM_TO_IDX(m, r, pfnum);
1166 idx = (idx < pcsz) ? idx : 0;
1167 }
1168 for (mrange = 0; mrange < nranges; mrange++) {
1169 if (PAGE_COUNTERS_CURRENT_COLOR_ARRAY(m,
1170 r, mrange) != NULL)
1171 PAGE_COUNTERS_CURRENT_COLOR(m,
1172 r, i, mrange) = idx;
1173 }
1174 }
1175 }
1176
1177 /* cache info for freeing out of the critical path */
1178 if ((caddr_t)old_ctr >= kernelheap &&
1179 (caddr_t)old_ctr < ekernelheap) {
1180 ctr_cache[r] = old_ctr;
1181 size_cache[r] = old_csz;
1182 }
1183 for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
1184 size_t *tmp = old_color_array[mrange];
1185 if ((caddr_t)tmp >= kernelheap &&
1186 (caddr_t)tmp < ekernelheap) {
1187 color_cache[r][mrange] = tmp;
1188 }
1189 }
1190 /*
1191 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
1192 * satisfy the identity requirement.
1193 * We should be able to go from one to the other
1194 * and get consistent values.
1195 */
1196 ASSERT(PNUM_TO_IDX(mnode, r,
1197 (IDX_TO_PNUM(mnode, r, 0))) == 0);
1198 ASSERT(IDX_TO_PNUM(mnode, r,
1199 (PNUM_TO_IDX(mnode, r, newbase))) == newbase);
1200
1201 /* pcc_info_t and pcc_color_free */
1202 for (i = 0; i < NPC_MUTEX; i++) {
1203 pcc_info_t *epi;
1204 pcc_info_t *eold_pi;
1205
1206 pi = cands_cache[i * MMU_PAGE_SIZES + r];
1207 old_pi = page_ctrs_cands[i][r][mnode];
1208 page_ctrs_cands[i][r][mnode] = pi;
1209 cands_cache[i * MMU_PAGE_SIZES + r] = old_pi;
1210
1211 /* preserve old pcc_color_free values, if any */
1212 if (old_pi == NULL)
1213 continue;
1214
1215 /*
1216 * when/if x86 does DR, must account for
1217 * possible change in range index when
1218 * preserving pcc_info
1219 */
1220 epi = &pi[nranges];
1221 eold_pi = &old_pi[old_nranges];
1222 if (new_maxmrange > old_maxmrange) {
1223 pi += new_maxmrange - old_maxmrange;
1224 } else if (new_maxmrange < old_maxmrange) {
1225 old_pi += old_maxmrange - new_maxmrange;
1226 }
1227 for (; pi < epi && old_pi < eold_pi; pi++, old_pi++) {
1228 pcc_info_t tmp = *pi;
1229 *pi = *old_pi;
1230 *old_pi = tmp;
1231 }
1232 }
1233 }
1234 PAGE_CTRS_WRITE_UNLOCK(mnode);
1235
1236 /*
1237 * Now that we have dropped the write lock, it is safe to free all
1238 * of the memory we have cached above.
1239 * We come thru here to free memory when pre-alloc fails, and also to
1240 * free old pointers which were recorded while locked.
1241 */
1242 cleanup:
1243 for (r = 1; r < mmu_page_sizes; r++) {
1244 if (ctr_cache[r] != NULL) {
1245 kmem_free(ctr_cache[r],
1246 size_cache[r] * sizeof (hpmctr_t));
1247 }
1248 for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
1249 if (color_cache[r][mrange] != NULL) {
1250 kmem_free(color_cache[r][mrange],
1251 colors_per_szc[r] * sizeof (size_t));
1252 }
1253 }
1254 for (i = 0; i < NPC_MUTEX; i++) {
1255 pi = cands_cache[i * MMU_PAGE_SIZES + r];
1256 if (pi == NULL)
1257 continue;
1258 nr = cands_cache_nranges;
1259 for (mrange = 0; mrange < nr; mrange++, pi++) {
1260 pgcntp = pi->pcc_color_free;
1261 if (pgcntp == NULL)
1262 continue;
1263 if ((caddr_t)pgcntp >= kernelheap &&
1264 (caddr_t)pgcntp < ekernelheap) {
1265 kmem_free(pgcntp,
1266 colors_per_szc[r] *
1267 sizeof (pgcnt_t));
1268 }
1269 }
1270 pi = cands_cache[i * MMU_PAGE_SIZES + r];
1271 if ((caddr_t)pi >= kernelheap &&
1272 (caddr_t)pi < ekernelheap) {
1273 kmem_free(pi, nr * sizeof (pcc_info_t));
1274 }
1275 }
1276 }
1277
1278 kmem_free(cands_cache,
1279 sizeof (pcc_info_t *) * NPC_MUTEX * MMU_PAGE_SIZES);
1280 return (rc);
1281 }
1282
1283 /*
1284 * Cleanup the hpm_counters field in the page counters
1285 * array.
1286 */
1287 void
1288 page_ctrs_cleanup(void)
1289 {
1290 int r; /* region size */
1291 int i; /* mnode index */
1292
1293 /*
1294 * Get the page counters write lock while we are
1295 * setting the page hpm_counters field to NULL
1296 * for non-existent mnodes.
1297 */
1298 for (i = 0; i < max_mem_nodes; i++) {
1299 PAGE_CTRS_WRITE_LOCK(i);
1300 if (mem_node_config[i].exists) {
1301 PAGE_CTRS_WRITE_UNLOCK(i);
1302 continue;
1303 }
1304 for (r = 1; r < mmu_page_sizes; r++) {
1305 PAGE_COUNTERS_COUNTERS(i, r) = NULL;
1306 }
1307 PAGE_CTRS_WRITE_UNLOCK(i);
1308 }
1309 }
1310
1311 #ifdef DEBUG
1312
1313 /*
1314 * confirm pp is a large page corresponding to szc
1315 */
1316 void
1317 chk_lpg(page_t *pp, uchar_t szc)
1318 {
1319 spgcnt_t npgs = page_get_pagecnt(pp->p_szc);
1320 uint_t noreloc;
1321
1322 if (npgs == 1) {
1323 ASSERT(pp->p_szc == 0);
1324 ASSERT(pp->p_next == pp);
1325 ASSERT(pp->p_prev == pp);
1326 return;
1327 }
1328
1329 ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL);
1330 ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL);
1331
1332 ASSERT(IS_P2ALIGNED(pp->p_pagenum, npgs));
1333 ASSERT(pp->p_pagenum == (pp->p_next->p_pagenum - 1));
1334 ASSERT(pp->p_prev->p_pagenum == (pp->p_pagenum + (npgs - 1)));
1335 ASSERT(pp->p_prev == (pp + (npgs - 1)));
1336
1337 /*
1338 * Check list of pages.
1339 */
1340 noreloc = PP_ISNORELOC(pp);
1341 while (npgs--) {
1342 if (npgs != 0) {
1343 ASSERT(pp->p_pagenum == pp->p_next->p_pagenum - 1);
1344 ASSERT(pp->p_next == (pp + 1));
1345 }
1346 ASSERT(pp->p_szc == szc);
1347 ASSERT(PP_ISFREE(pp));
1348 ASSERT(PP_ISAGED(pp));
1349 ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL);
1350 ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL);
1351 ASSERT(pp->p_vnode == NULL);
1352 ASSERT(PP_ISNORELOC(pp) == noreloc);
1353
1354 pp = pp->p_next;
1355 }
1356 }
1357 #endif /* DEBUG */
1358
1359 void
1360 page_freelist_lock(int mnode)
1361 {
1362 int i;
1363 for (i = 0; i < NPC_MUTEX; i++) {
1364 mutex_enter(FPC_MUTEX(mnode, i));
1365 mutex_enter(CPC_MUTEX(mnode, i));
1366 }
1367 }
1368
1369 void
1370 page_freelist_unlock(int mnode)
1371 {
1372 int i;
1373 for (i = 0; i < NPC_MUTEX; i++) {
1374 mutex_exit(FPC_MUTEX(mnode, i));
1375 mutex_exit(CPC_MUTEX(mnode, i));
1376 }
1377 }
1378
1379 /*
1380 * add pp to the specified page list. Defaults to head of the page list
1381 * unless PG_LIST_TAIL is specified.
1382 */
1383 void
1384 page_list_add(page_t *pp, int flags)
1385 {
1386 page_t **ppp;
1387 kmutex_t *pcm;
1388 uint_t bin, mtype;
1389 int mnode;
1390
1391 ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
1392 ASSERT(PP_ISFREE(pp));
1393 ASSERT(!hat_page_is_mapped(pp));
1394 ASSERT(hat_page_getshare(pp) == 0);
1395
1396 /*
1397 * Large pages should be freed via page_list_add_pages().
1398 */
1399 ASSERT(pp->p_szc == 0);
1400
1401 /*
1402 * Don't need to lock the freelist first here
1403 * because the page isn't on the freelist yet.
1404 * This means p_szc can't change on us.
1405 */
1406
1407 bin = PP_2_BIN(pp);
1408 mnode = PP_2_MEM_NODE(pp);
1409 mtype = PP_2_MTYPE(pp);
1410
1411 if (flags & PG_LIST_ISINIT) {
1412 /*
1413 * PG_LIST_ISINIT is set during system startup (i.e. single
1414 * threaded), so add the page to the free list and to the
1415 * free region counters w/o any locking
1416 */
1417 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1418
1419 /* inline version of page_add() */
1420 if (*ppp != NULL) {
1421 pp->p_next = *ppp;
1422 pp->p_prev = (*ppp)->p_prev;
1423 (*ppp)->p_prev = pp;
1424 pp->p_prev->p_next = pp;
1425 } else
1426 *ppp = pp;
1427
1428 page_ctr_add_internal(mnode, mtype, pp, flags);
1429 VM_STAT_ADD(vmm_vmstats.pladd_free[0]);
1430 } else {
1431 pcm = PC_BIN_MUTEX(mnode, bin, flags);
1432
1433 if (flags & PG_FREE_LIST) {
1434 VM_STAT_ADD(vmm_vmstats.pladd_free[0]);
1435 ASSERT(PP_ISAGED(pp));
1436 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1437
1438 } else {
1439 VM_STAT_ADD(vmm_vmstats.pladd_cache);
1440 ASSERT(pp->p_vnode);
1441 ASSERT((pp->p_offset & PAGEOFFSET) == 0);
1442 ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1443 }
1444 mutex_enter(pcm);
1445 page_add(ppp, pp);
1446
1447 if (flags & PG_LIST_TAIL)
1448 *ppp = (*ppp)->p_next;
1449 /*
1450 * Add counters before releasing pcm mutex to avoid a race with
1451 * page_freelist_coalesce and page_freelist_split.
1452 */
1453 page_ctr_add(mnode, mtype, pp, flags);
1454 mutex_exit(pcm);
1455 }
1456
1457
1458 #if defined(__sparc)
1459 if (PP_ISNORELOC(pp)) {
1460 kcage_freemem_add(1);
1461 }
1462 #endif
1463 /*
1464 * It is up to the caller to unlock the page!
1465 */
1466 ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
1467 }
1468
1469
1470 #ifdef __sparc
1471 /*
1472 * This routine is only used by kcage_init during system startup.
1473 * It performs the function of page_list_sub/PP_SETNORELOC/page_list_add
1474 * without the overhead of taking locks and updating counters.
1475 */
1476 void
1477 page_list_noreloc_startup(page_t *pp)
1478 {
1479 page_t **ppp;
1480 uint_t bin;
1481 int mnode;
1482 int mtype;
1483 int flags = 0;
1484
1485 /*
1486 * If this is a large page on the freelist then
1487 * break it up into smaller pages.
1488 */
1489 if (pp->p_szc != 0)
1490 page_boot_demote(pp);
1491
1492 /*
1493 * Get list page is currently on.
1494 */
1495 bin = PP_2_BIN(pp);
1496 mnode = PP_2_MEM_NODE(pp);
1497 mtype = PP_2_MTYPE(pp);
1498 ASSERT(mtype == MTYPE_RELOC);
1499 ASSERT(pp->p_szc == 0);
1500
1501 if (PP_ISAGED(pp)) {
1502 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1503 flags |= PG_FREE_LIST;
1504 } else {
1505 ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1506 flags |= PG_CACHE_LIST;
1507 }
1508
1509 ASSERT(*ppp != NULL);
1510
1511 /*
1512 * Delete page from current list.
1513 */
1514 if (*ppp == pp)
1515 *ppp = pp->p_next; /* go to next page */
1516 if (*ppp == pp) {
1517 *ppp = NULL; /* page list is gone */
1518 } else {
1519 pp->p_prev->p_next = pp->p_next;
1520 pp->p_next->p_prev = pp->p_prev;
1521 }
1522
1523 /*
1524 * Decrement page counters
1525 */
1526 page_ctr_sub_internal(mnode, mtype, pp, flags);
1527
1528 /*
1529 * Set no reloc for cage initted pages.
1530 */
1531 PP_SETNORELOC(pp);
1532
1533 mtype = PP_2_MTYPE(pp);
1534 ASSERT(mtype == MTYPE_NORELOC);
1535
1536 /*
1537 * Get new list for page.
1538 */
1539 if (PP_ISAGED(pp)) {
1540 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1541 } else {
1542 ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1543 }
1544
1545 /*
1546 * Insert page on new list.
1547 */
1548 if (*ppp == NULL) {
1549 *ppp = pp;
1550 pp->p_next = pp->p_prev = pp;
1551 } else {
1552 pp->p_next = *ppp;
1553 pp->p_prev = (*ppp)->p_prev;
1554 (*ppp)->p_prev = pp;
1555 pp->p_prev->p_next = pp;
1556 }
1557
1558 /*
1559 * Increment page counters
1560 */
1561 page_ctr_add_internal(mnode, mtype, pp, flags);
1562
1563 /*
1564 * Update cage freemem counter
1565 */
1566 atomic_inc_ulong(&kcage_freemem);
1567 }
1568 #else /* __sparc */
1569
1570 /* ARGSUSED */
1571 void
1572 page_list_noreloc_startup(page_t *pp)
1573 {
1574 panic("page_list_noreloc_startup: should be here only for sparc");
1575 }
1576 #endif
1577
1578 void
1579 page_list_add_pages(page_t *pp, int flags)
1580 {
1581 kmutex_t *pcm;
1582 pgcnt_t pgcnt;
1583 uint_t bin, mtype, i;
1584 int mnode;
1585
1586 /* default to freelist/head */
1587 ASSERT((flags & (PG_CACHE_LIST | PG_LIST_TAIL)) == 0);
1588
1589 CHK_LPG(pp, pp->p_szc);
1590 VM_STAT_ADD(vmm_vmstats.pladd_free[pp->p_szc]);
1591
1592 bin = PP_2_BIN(pp);
1593 mnode = PP_2_MEM_NODE(pp);
1594 mtype = PP_2_MTYPE(pp);
1595
1596 if (flags & PG_LIST_ISINIT) {
1597 ASSERT(pp->p_szc == mmu_page_sizes - 1);
1598 page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1599 ASSERT(!PP_ISNORELOC(pp));
1600 PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);
1601 } else {
1602
1603 ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);
1604
1605 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
1606
1607 mutex_enter(pcm);
1608 page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1609 page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
1610 mutex_exit(pcm);
1611
1612 pgcnt = page_get_pagecnt(pp->p_szc);
1613 #if defined(__sparc)
1614 if (PP_ISNORELOC(pp))
1615 kcage_freemem_add(pgcnt);
1616 #endif
1617 for (i = 0; i < pgcnt; i++, pp++)
1618 page_unlock_nocapture(pp);
1619 }
1620 }
1621
1622 /*
1623 * During boot, need to demote a large page to base
1624 * pagesize pages for seg_kmem for use in boot_alloc()
1625 */
1626 void
1627 page_boot_demote(page_t *pp)
1628 {
1629 ASSERT(pp->p_szc != 0);
1630 ASSERT(PP_ISFREE(pp));
1631 ASSERT(PP_ISAGED(pp));
1632
1633 (void) page_demote(PP_2_MEM_NODE(pp),
1634 PFN_BASE(pp->p_pagenum, pp->p_szc), 0, pp->p_szc, 0, PC_NO_COLOR,
1635 PC_FREE);
1636
1637 ASSERT(PP_ISFREE(pp));
1638 ASSERT(PP_ISAGED(pp));
1639 ASSERT(pp->p_szc == 0);
1640 }
1641
1642 /*
1643 * Take a particular page off of whatever freelist the page
1644 * is claimed to be on.
1645 *
1646 * NOTE: Only used for PAGESIZE pages.
1647 */
1648 void
1649 page_list_sub(page_t *pp, int flags)
1650 {
1651 int bin;
1652 uint_t mtype;
1653 int mnode;
1654 kmutex_t *pcm;
1655 page_t **ppp;
1656
1657 ASSERT(PAGE_EXCL(pp));
1658 ASSERT(PP_ISFREE(pp));
1659
1660 /*
1661 * The p_szc field can only be changed by page_promote()
1662 * and page_demote(). Only free pages can be promoted and
1663 * demoted and the free list MUST be locked during these
1664 * operations. So to prevent a race in page_list_sub()
1665 * between computing which bin of the freelist lock to
1666 * grab and actually grabbing the lock, we check again that
1667 * the bin we locked is still the correct one. Notice that
1668 * the p_szc field could have actually changed on us but
1669 * if the bin happens to still be the same we are safe.
1670 */
1671 try_again:
1672 bin = PP_2_BIN(pp);
1673 mnode = PP_2_MEM_NODE(pp);
1674 pcm = PC_BIN_MUTEX(mnode, bin, flags);
1675 mutex_enter(pcm);
1676 if (PP_2_BIN(pp) != bin) {
1677 mutex_exit(pcm);
1678 goto try_again;
1679 }
1680 mtype = PP_2_MTYPE(pp);
1681
1682 if (flags & PG_FREE_LIST) {
1683 VM_STAT_ADD(vmm_vmstats.plsub_free[0]);
1684 ASSERT(PP_ISAGED(pp));
1685 ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);
1686 } else {
1687 VM_STAT_ADD(vmm_vmstats.plsub_cache);
1688 ASSERT(!PP_ISAGED(pp));
1689 ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1690 }
1691
1692 /*
1693 * Common PAGESIZE case.
1694 *
1695 * Note that we locked the freelist. This prevents
1696 * any page promotion/demotion operations. Therefore
1697 * the p_szc will not change until we drop pcm mutex.
1698 */
1699 if (pp->p_szc == 0) {
1700 page_sub(ppp, pp);
1701 /*
1702 * Subtract counters before releasing pcm mutex
1703 * to avoid race with page_freelist_coalesce.
1704 */
1705 page_ctr_sub(mnode, mtype, pp, flags);
1706 mutex_exit(pcm);
1707
1708 #if defined(__sparc)
1709 if (PP_ISNORELOC(pp)) {
1710 kcage_freemem_sub(1);
1711 }
1712 #endif
1713 return;
1714 }
1715
1716 /*
1717 * Large pages on the cache list are not supported.
1718 */
1719 if (flags & PG_CACHE_LIST)
1720 panic("page_list_sub: large page on cachelist");
1721
1722 /*
1723 * Slow but rare.
1724 *
1725 * Somebody wants this particular page which is part
1726 * of a large page. In this case we just demote the page
1727 * if it's on the freelist.
1728 *
1729 * We have to drop pcm before locking the entire freelist.
1730 * Once we have re-locked the freelist check to make sure
1731 * the page hasn't already been demoted or completely
1732 * freed.
1733 */
1734 mutex_exit(pcm);
1735 page_freelist_lock(mnode);
1736 if (pp->p_szc != 0) {
1737 /*
1738 * Large page is on freelist.
1739 */
1740 (void) page_demote(mnode, PFN_BASE(pp->p_pagenum, pp->p_szc),
1741 0, pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
1742 }
1743 ASSERT(PP_ISFREE(pp));
1744 ASSERT(PP_ISAGED(pp));
1745 ASSERT(pp->p_szc == 0);
1746
1747 /*
1748 * Subtract counters before releasing pcm mutex
1749 * to avoid race with page_freelist_coalesce.
1750 */
1751 bin = PP_2_BIN(pp);
1752 mtype = PP_2_MTYPE(pp);
1753 ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);
1754
1755 page_sub(ppp, pp);
1756 page_ctr_sub(mnode, mtype, pp, flags);
1757 page_freelist_unlock(mnode);
1758
1759 #if defined(__sparc)
1760 if (PP_ISNORELOC(pp)) {
1761 kcage_freemem_sub(1);
1762 }
1763 #endif
1764 }
1765
1766 void
1767 page_list_sub_pages(page_t *pp, uint_t szc)
1768 {
1769 kmutex_t *pcm;
1770 uint_t bin, mtype;
1771 int mnode;
1772
1773 ASSERT(PAGE_EXCL(pp));
1774 ASSERT(PP_ISFREE(pp));
1775 ASSERT(PP_ISAGED(pp));
1776
1777 /*
1778 * See comment in page_list_sub().
1779 */
1780 try_again:
1781 bin = PP_2_BIN(pp);
1782 mnode = PP_2_MEM_NODE(pp);
1783 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
1784 mutex_enter(pcm);
1785 if (PP_2_BIN(pp) != bin) {
1786 mutex_exit(pcm);
1787 goto try_again;
1788 }
1789
1790 /*
1791 * If we're called with a page larger than szc or it got
1792 * promoted above szc before we locked the freelist then
1793 * drop pcm and re-lock entire freelist. If page still larger
1794 * than szc then demote it.
1795 */
1796 if (pp->p_szc > szc) {
1797 mutex_exit(pcm);
1798 pcm = NULL;
1799 page_freelist_lock(mnode);
1800 if (pp->p_szc > szc) {
1801 VM_STAT_ADD(vmm_vmstats.plsubpages_szcbig);
1802 (void) page_demote(mnode,
1803 PFN_BASE(pp->p_pagenum, pp->p_szc), 0,
1804 pp->p_szc, szc, PC_NO_COLOR, PC_FREE);
1805 }
1806 bin = PP_2_BIN(pp);
1807 }
1808 ASSERT(PP_ISFREE(pp));
1809 ASSERT(PP_ISAGED(pp));
1810 ASSERT(pp->p_szc <= szc);
1811 ASSERT(pp == PP_PAGEROOT(pp));
1812
1813 VM_STAT_ADD(vmm_vmstats.plsub_free[pp->p_szc]);
1814
1815 mtype = PP_2_MTYPE(pp);
1816 if (pp->p_szc != 0) {
1817 page_vpsub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1818 CHK_LPG(pp, pp->p_szc);
1819 } else {
1820 VM_STAT_ADD(vmm_vmstats.plsubpages_szc0);
1821 page_sub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1822 }
1823 page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);
1824
1825 if (pcm != NULL) {
1826 mutex_exit(pcm);
1827 } else {
1828 page_freelist_unlock(mnode);
1829 }
1830
1831 #if defined(__sparc)
1832 if (PP_ISNORELOC(pp)) {
1833 pgcnt_t pgcnt;
1834
1835 pgcnt = page_get_pagecnt(pp->p_szc);
1836 kcage_freemem_sub(pgcnt);
1837 }
1838 #endif
1839 }
1840
1841 /*
1842 * Add the page to the front of a linked list of pages
1843 * using the p_next & p_prev pointers for the list.
1844 * The caller is responsible for protecting the list pointers.
1845 */
1846 void
1847 mach_page_add(page_t **ppp, page_t *pp)
1848 {
1849 if (*ppp == NULL) {
1850 pp->p_next = pp->p_prev = pp;
1851 } else {
1852 pp->p_next = *ppp;
1853 pp->p_prev = (*ppp)->p_prev;
1854 (*ppp)->p_prev = pp;
1855 pp->p_prev->p_next = pp;
1856 }
1857 *ppp = pp;
1858 }
1859
1860 /*
1861 * Remove this page from a linked list of pages
1862 * using the p_next & p_prev pointers for the list.
1863 *
1864 * The caller is responsible for protecting the list pointers.
1865 */
1866 void
1867 mach_page_sub(page_t **ppp, page_t *pp)
1868 {
1869 ASSERT(pp != NULL && PP_ISFREE(pp));
1870
1871 if (*ppp == NULL || pp == NULL)
1872 panic("mach_page_sub");
1873
1874 if (*ppp == pp)
1875 *ppp = pp->p_next; /* go to next page */
1876
1877 if (*ppp == pp)
1878 *ppp = NULL; /* page list is gone */
1879 else {
1880 pp->p_prev->p_next = pp->p_next;
1881 pp->p_next->p_prev = pp->p_prev;
1882 }
1883 pp->p_prev = pp->p_next = pp; /* make pp a list of one */
1884 }
1885
1886 /*
1887 * Routine fsflush uses to gradually coalesce the free list into larger pages.
1888 */
1889 void
1890 page_promote_size(page_t *pp, uint_t cur_szc)
1891 {
1892 pfn_t pfn;
1893 int mnode;
1894 int idx;
1895 int new_szc = cur_szc + 1;
1896 int full = FULL_REGION_CNT(new_szc);
1897
1898 pfn = page_pptonum(pp);
1899 mnode = PFN_2_MEM_NODE(pfn);
1900
1901 page_freelist_lock(mnode);
1902
1903 idx = PNUM_TO_IDX(mnode, new_szc, pfn);
1904 if (PAGE_COUNTERS(mnode, new_szc, idx) == full)
1905 (void) page_promote(mnode, pfn, new_szc, PC_FREE, PC_MTYPE_ANY);
1906
1907 page_freelist_unlock(mnode);
1908 }
1909
1910 static uint_t page_promote_err;
1911 static uint_t page_promote_noreloc_err;
1912
1913 /*
1914 * Create a single larger page (of szc new_szc) from smaller contiguous pages
1915 * for the given mnode starting at pfnum. Pages involved are on the freelist
1916 * before the call and may be returned to the caller if requested, otherwise
1917 * they will be placed back on the freelist.
1918 * If flags is PC_ALLOC, then the large page will be returned to the user in
1919 * a state which is consistent with a page being taken off the freelist. If
1920 * we failed to lock the new large page, then we will return NULL to the
1921 * caller and put the large page on the freelist instead.
1922 * If flags is PC_FREE, then the large page will be placed on the freelist,
1923 * and NULL will be returned.
1924 * The caller is responsible for locking the freelist as well as any other
1925 * accounting which needs to be done for a returned page.
1926 *
1927 * RFE: For performance pass in pp instead of pfnum so
1928 * we can avoid excessive calls to page_numtopp_nolock().
1929 * This would depend on an assumption that all contiguous
1930 * pages are in the same memseg so we can just add/dec
1931 * our pp.
1932 *
1933 * Lock ordering:
1934 *
1935 * There is a potential but rare deadlock situation
1936 * for page promotion and demotion operations. The problem
1937 * is there are two paths into the freelist manager and
1938 * they have different lock orders:
1939 *
1940 * page_create()
1941 * lock freelist
1942 * page_lock(EXCL)
1943 * unlock freelist
1944 * return
1945 * caller drops page_lock
1946 *
1947 * page_free() and page_reclaim()
1948 * caller grabs page_lock(EXCL)
1949 *
1950 * lock freelist
1951 * unlock freelist
1952 * drop page_lock
1953 *
1954 * What prevents a thread in page_create() from deadlocking
1955 * with a thread freeing or reclaiming the same page is the
1956 * page_trylock() in page_get_freelist(). If the trylock fails
1957 * it skips the page.
1958 *
1959 * The lock ordering for promotion and demotion is the same as
1960 * for page_create(). Since the same deadlock could occur during
1961 * page promotion and freeing or reclaiming of a page on the
1962 * cache list, we might have to fail the operation and undo what we
1963 * have done so far. Again, this is rare.
1964 */
1965 page_t *
1966 page_promote(int mnode, pfn_t pfnum, uchar_t new_szc, int flags, int mtype)
1967 {
1968 page_t *pp, *pplist, *tpp, *start_pp;
1969 pgcnt_t new_npgs, npgs;
1970 uint_t bin;
1971 pgcnt_t tmpnpgs, pages_left;
1972 uint_t noreloc;
1973 int which_list;
1974 ulong_t index;
1975 kmutex_t *phm;
1976
1977 /*
1978 * General algorithm:
1979 * Find the starting page
1980 * Walk each page struct removing it from the freelist,
1981 * and linking it to all the other pages removed.
1982 * Once all pages are off the freelist,
1983 * walk the list, modifying p_szc to new_szc and what
1984 * ever other info needs to be done to create a large free page.
1985 * According to the flags, either return the page or put it
1986 * on the freelist.
1987 */
1988
1989 start_pp = page_numtopp_nolock(pfnum);
1990 ASSERT(start_pp && (start_pp->p_pagenum == pfnum));
1991 new_npgs = page_get_pagecnt(new_szc);
1992 ASSERT(IS_P2ALIGNED(pfnum, new_npgs));
1993
1994 /* don't return page of the wrong mtype */
1995 if (mtype != PC_MTYPE_ANY && mtype != PP_2_MTYPE(start_pp))
1996 return (NULL);
1997
1998 /*
1999 * Loop through smaller pages to confirm that all pages
2000 * give the same result for PP_ISNORELOC().
2001 * We can check this reliably here as the protocol for setting
2002 * P_NORELOC requires pages to be taken off the free list first.
2003 */
2004 noreloc = PP_ISNORELOC(start_pp);
2005 for (pp = start_pp + new_npgs; --pp > start_pp; ) {
2006 if (noreloc != PP_ISNORELOC(pp)) {
2007 page_promote_noreloc_err++;
2008 page_promote_err++;
2009 return (NULL);
2010 }
2011 }
2012
2013 pages_left = new_npgs;
2014 pplist = NULL;
2015 pp = start_pp;
2016
2017 /* Loop around coalescing the smaller pages into a big page. */
2018 while (pages_left) {
2019 /*
2020 * Remove from the freelist.
2021 */
2022 ASSERT(PP_ISFREE(pp));
2023 bin = PP_2_BIN(pp);
2024 ASSERT(mnode == PP_2_MEM_NODE(pp));
2025 mtype = PP_2_MTYPE(pp);
2026 if (PP_ISAGED(pp)) {
2027
2028 /*
2029 * PG_FREE_LIST
2030 */
2031 if (pp->p_szc) {
2032 page_vpsub(&PAGE_FREELISTS(mnode,
2033 pp->p_szc, bin, mtype), pp);
2034 } else {
2035 mach_page_sub(&PAGE_FREELISTS(mnode, 0,
2036 bin, mtype), pp);
2037 }
2038 which_list = PG_FREE_LIST;
2039 } else {
2040 ASSERT(pp->p_szc == 0);
2041
2042 /*
2043 * PG_CACHE_LIST
2044 *
2045 * Since this page comes from the
2046 * cachelist, we must destroy the
2047 * vnode association.
2048 */
2049 if (!page_trylock(pp, SE_EXCL)) {
2050 goto fail_promote;
2051 }
2052
2053 /*
2054 * We need to be careful not to deadlock
2055 * with another thread in page_lookup().
2056 * The page_lookup() thread could be holding
2057 * the same phm that we need if the two
2058 * pages happen to hash to the same phm lock.
2059 * At this point we have locked the entire
2060 * freelist and page_lookup() could be trying
2061 * to grab a freelist lock.
2062 */
2063 index = PAGE_HASH_FUNC(pp->p_vnode, pp->p_offset);
2064 phm = PAGE_HASH_MUTEX(index);
2065 if (!mutex_tryenter(phm)) {
2066 page_unlock_nocapture(pp);
2067 goto fail_promote;
2068 }
2069
2070 mach_page_sub(&PAGE_CACHELISTS(mnode, bin, mtype), pp);
2071 page_hashout(pp, phm);
2072 mutex_exit(phm);
2073 PP_SETAGED(pp);
2074 page_unlock_nocapture(pp);
2075 which_list = PG_CACHE_LIST;
2076 }
2077 page_ctr_sub(mnode, mtype, pp, which_list);
2078
2079 /*
2080 * Concatenate the smaller page(s) onto
2081 * the large page list.
2082 */
2083 tmpnpgs = npgs = page_get_pagecnt(pp->p_szc);
2084 pages_left -= npgs;
2085 tpp = pp;
2086 while (npgs--) {
2087 tpp->p_szc = new_szc;
2088 tpp = tpp->p_next;
2089 }
2090 page_list_concat(&pplist, &pp);
2091 pp += tmpnpgs;
2092 }
2093 CHK_LPG(pplist, new_szc);
2094
2095 /*
2096 * return the page to the user if requested
2097 * in the properly locked state.
2098 */
2099 if (flags == PC_ALLOC && (page_trylock_cons(pplist, SE_EXCL))) {
2100 return (pplist);
2101 }
2102
2103 /*
2104 * Otherwise place the new large page on the freelist
2105 */
2106 bin = PP_2_BIN(pplist);
2107 mnode = PP_2_MEM_NODE(pplist);
2108 mtype = PP_2_MTYPE(pplist);
2109 page_vpadd(&PAGE_FREELISTS(mnode, new_szc, bin, mtype), pplist);
2110
2111 page_ctr_add(mnode, mtype, pplist, PG_FREE_LIST);
2112 return (NULL);
2113
2114 fail_promote:
2115 /*
2116 * A thread must have still been freeing or
2117 * reclaiming the page on the cachelist.
2118 * To prevent a deadlock undo what we have
2119 * done so far and return failure. This
2120 * situation can only happen while promoting
2121 * PAGESIZE pages.
2122 */
2123 page_promote_err++;
2124 while (pplist) {
2125 pp = pplist;
2126 mach_page_sub(&pplist, pp);
2127 pp->p_szc = 0;
2128 bin = PP_2_BIN(pp);
2129 mtype = PP_2_MTYPE(pp);
2130 mach_page_add(&PAGE_FREELISTS(mnode, 0, bin, mtype), pp);
2131 page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
2132 }
2133 return (NULL);
2134
2135 }
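
/*
 * A minimal caller sketch (illustrative only; see page_promote_size()
 * above and page_freelist_coalesce() below for real callers in this
 * file):
 *
 *	page_freelist_lock(mnode);
 *	pp = page_promote(mnode, pfnum, new_szc, PC_ALLOC, PC_MTYPE_ANY);
 *	page_freelist_unlock(mnode);
 *
 * where pfnum must be aligned to page_get_pagecnt(new_szc).  With
 * PC_ALLOC a non-NULL pp comes back exclusively locked and off the
 * freelist; with PC_FREE the promoted page is requeued on the freelist
 * and NULL is always returned.
 */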
2136
2137 /*
2138 * Break up a large page into smaller size pages.
2139 * Pages involved are on the freelist before the call and may
2140 * be returned to the caller if requested, otherwise they will
2141 * be placed back on the freelist.
2142 * The caller is responsible for locking the freelist as well as any other
2143 * accounting which needs to be done for a returned page.
2144 * If flags is not PC_ALLOC, the color argument is ignored, and thus
2145 * technically, any value may be passed in, but PC_NO_COLOR is the standard
2146 * which should be followed for clarity's sake.
2147 * Returns a page whose pfn is < pfnmax.
2148 */
2149 page_t *
2150 page_demote(int mnode, pfn_t pfnum, pfn_t pfnmax, uchar_t cur_szc,
2151 uchar_t new_szc, int color, int flags)
2152 {
2153 page_t *pp, *pplist, *npplist;
2154 pgcnt_t npgs, n;
2155 uint_t bin;
2156 uint_t mtype;
2157 page_t *ret_pp = NULL;
2158
2159 ASSERT(cur_szc != 0);
2160 ASSERT(new_szc < cur_szc);
2161
2162 pplist = page_numtopp_nolock(pfnum);
2163 ASSERT(pplist != NULL);
2164
2165 ASSERT(pplist->p_szc == cur_szc);
2166
2167 bin = PP_2_BIN(pplist);
2168 ASSERT(mnode == PP_2_MEM_NODE(pplist));
2169 mtype = PP_2_MTYPE(pplist);
2170 page_vpsub(&PAGE_FREELISTS(mnode, cur_szc, bin, mtype), pplist);
2171
2172 CHK_LPG(pplist, cur_szc);
2173 page_ctr_sub(mnode, mtype, pplist, PG_FREE_LIST);
2174
2175 /*
2176 * Number of PAGESIZE pages for smaller new_szc
2177 * page.
2178 */
2179 npgs = page_get_pagecnt(new_szc);
2180
2181 while (pplist) {
2182 pp = pplist;
2183
2184 ASSERT(pp->p_szc == cur_szc);
2185
2186 /*
2187 * We either break it up into PAGESIZE pages or larger.
2188 */
2189 if (npgs == 1) { /* PAGESIZE case */
2190 mach_page_sub(&pplist, pp);
2191 ASSERT(pp->p_szc == cur_szc);
2192 ASSERT(new_szc == 0);
2193 ASSERT(mnode == PP_2_MEM_NODE(pp));
2194 pp->p_szc = new_szc;
2195 bin = PP_2_BIN(pp);
2196 if ((bin == color) && (flags == PC_ALLOC) &&
2197 (ret_pp == NULL) && (pfnmax == 0 ||
2198 pp->p_pagenum < pfnmax) &&
2199 page_trylock_cons(pp, SE_EXCL)) {
2200 ret_pp = pp;
2201 } else {
2202 mtype = PP_2_MTYPE(pp);
2203 mach_page_add(&PAGE_FREELISTS(mnode, 0, bin,
2204 mtype), pp);
2205 page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
2206 }
2207 } else {
2208 page_t *try_to_return_this_page = NULL;
2209 int count = 0;
2210
2211 /*
2212 * Break down into smaller lists of pages.
2213 */
2214 page_list_break(&pplist, &npplist, npgs);
2215
2216 pp = pplist;
2217 n = npgs;
2218 while (n--) {
2219 ASSERT(pp->p_szc == cur_szc);
2220 /*
2221 * Check whether all the pages in this list
2222 * fit the request criteria.
2223 */
2224 if (pfnmax == 0 || pp->p_pagenum < pfnmax) {
2225 count++;
2226 }
2227 pp->p_szc = new_szc;
2228 pp = pp->p_next;
2229 }
2230
2231 if (count == npgs &&
2232 (pfnmax == 0 || pp->p_pagenum < pfnmax)) {
2233 try_to_return_this_page = pp;
2234 }
2235
2236 CHK_LPG(pplist, new_szc);
2237
2238 bin = PP_2_BIN(pplist);
2239 if (try_to_return_this_page)
2240 ASSERT(mnode ==
2241 PP_2_MEM_NODE(try_to_return_this_page));
2242 if ((bin == color) && (flags == PC_ALLOC) &&
2243 (ret_pp == NULL) && try_to_return_this_page &&
2244 page_trylock_cons(try_to_return_this_page,
2245 SE_EXCL)) {
2246 ret_pp = try_to_return_this_page;
2247 } else {
2248 mtype = PP_2_MTYPE(pp);
2249 page_vpadd(&PAGE_FREELISTS(mnode, new_szc,
2250 bin, mtype), pplist);
2251
2252 page_ctr_add(mnode, mtype, pplist,
2253 PG_FREE_LIST);
2254 }
2255 pplist = npplist;
2256 }
2257 }
2258 return (ret_pp);
2259 }
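
/*
 * For example, assuming new_szc != 0 and a cur_szc page built from 8
 * new_szc pages, page_demote() pulls the single cur_szc page off its
 * freelist, splits it into 8 new_szc sublists with page_list_break(),
 * retags every constituent p_szc, and requeues each new_szc page,
 * except possibly one of matching color and pfn range that is returned
 * exclusively locked when flags == PC_ALLOC.
 */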
2260
2261 int mpss_coalesce_disable = 0;
2262
2263 /*
2264 * Coalesce free pages into a page of the given szc and color if possible.
2265 * Return the pointer to the page created, otherwise, return NULL.
2266 *
2267 * If pfnhi is non-zero, search for large page with pfn range less than pfnhi.
2268 */
2269 page_t *
2270 page_freelist_coalesce(int mnode, uchar_t szc, uint_t color, uint_t ceq_mask,
2271 int mtype, pfn_t pfnhi)
2272 {
2273 int r = szc; /* region size */
2274 int mrange;
2275 uint_t full, bin, color_mask, wrap = 0;
2276 pfn_t pfnum, lo, hi;
2277 size_t len, idx, idx0;
2278 pgcnt_t cands = 0, szcpgcnt = page_get_pagecnt(szc);
2279 page_t *ret_pp;
2280 MEM_NODE_ITERATOR_DECL(it);
2281
2282 if (mpss_coalesce_disable) {
2283 ASSERT(szc < MMU_PAGE_SIZES);
2284 VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[szc][0]);
2285 return (NULL);
2286 }
2287
2288 ASSERT(szc < mmu_page_sizes);
2289 color_mask = PAGE_GET_PAGECOLORS(szc) - 1;
2290 ASSERT(ceq_mask <= color_mask);
2291 ASSERT(color <= color_mask);
2292 color &= ceq_mask;
2293
2294 /* Prevent page_counters dynamic memory from being freed */
2295 rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
2296
2297 mrange = MTYPE_2_MRANGE(mnode, mtype);
2298 ASSERT(mrange < mnode_nranges[mnode]);
2299 VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[r][mrange]);
2300
2301 /* get pfn range for mtype */
2302 len = PAGE_COUNTERS_ENTRIES(mnode, r);
2303 MNODETYPE_2_PFN(mnode, mtype, lo, hi);
2304 hi++;
2305
2306 /* use lower limit if given */
2307 if (pfnhi != PFNNULL && pfnhi < hi)
2308 hi = pfnhi;
2309
2310 /* round to szcpgcnt boundaries */
2311 lo = P2ROUNDUP(lo, szcpgcnt);
2312 MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it);
2313 if (lo == (pfn_t)-1) {
2314 rw_exit(&page_ctrs_rwlock[mnode]);
2315 return (NULL);
2316 }
2317 hi = hi & ~(szcpgcnt - 1);
2318
2319 /* set lo to the closest pfn of the right color */
2320 if (((PFN_2_COLOR(lo, szc, &it) ^ color) & ceq_mask) ||
2321 (interleaved_mnodes && PFN_2_MEM_NODE(lo) != mnode)) {
2322 PAGE_NEXT_PFN_FOR_COLOR(lo, szc, color, ceq_mask, color_mask,
2323 &it);
2324 }
2325
2326 if (hi <= lo) {
2327 rw_exit(&page_ctrs_rwlock[mnode]);
2328 return (NULL);
2329 }
2330
2331 full = FULL_REGION_CNT(r);
2332
2333 /* calculate the number of page candidates and initial search index */
2334 bin = color;
2335 idx0 = (size_t)(-1);
2336 do {
2337 pgcnt_t acand;
2338
2339 PGCTRS_CANDS_GETVALUECOLOR(mnode, mrange, r, bin, acand);
2340 if (acand) {
2341 idx = PAGE_COUNTERS_CURRENT_COLOR(mnode,
2342 r, bin, mrange);
2343 idx0 = MIN(idx0, idx);
2344 cands += acand;
2345 }
2346 bin = ADD_MASKED(bin, 1, ceq_mask, color_mask);
2347 } while (bin != color);
2348
2349 if (cands == 0) {
2350 VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip[r][mrange]);
2351 rw_exit(&page_ctrs_rwlock[mnode]);
2352 return (NULL);
2353 }
2354
2355 pfnum = IDX_TO_PNUM(mnode, r, idx0);
2356 if (pfnum < lo || pfnum >= hi) {
2357 pfnum = lo;
2358 } else {
2359 MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2360 if (pfnum == (pfn_t)-1) {
2361 pfnum = lo;
2362 MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2363 ASSERT(pfnum != (pfn_t)-1);
2364 } else if ((PFN_2_COLOR(pfnum, szc, &it) ^ color) & ceq_mask ||
2365 (interleaved_mnodes && PFN_2_MEM_NODE(pfnum) != mnode)) {
2366 /* invalid color, get the closest correct pfn */
2367 PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask,
2368 color_mask, &it);
2369 if (pfnum >= hi) {
2370 pfnum = lo;
2371 MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2372 }
2373 }
2374 }
2375
2376 /* set starting index */
2377 idx0 = PNUM_TO_IDX(mnode, r, pfnum);
2378 ASSERT(idx0 < len);
2379
2380 for (idx = idx0; wrap == 0 || (idx < idx0 && wrap < 2); ) {
2381
2382 if (PAGE_COUNTERS(mnode, r, idx) != full)
2383 goto next;
2384
2385 /*
2386 * RFE: For performance maybe we can do something less
2387 * brutal than locking the entire freelist. So far
2388 * this doesn't seem to be a performance problem?
2389 */
2390 page_freelist_lock(mnode);
2391 if (PAGE_COUNTERS(mnode, r, idx) == full) {
2392 ret_pp =
2393 page_promote(mnode, pfnum, r, PC_ALLOC, mtype);
2394 if (ret_pp != NULL) {
2395 VM_STAT_ADD(vmm_vmstats.pfc_coalok[r][mrange]);
2396 PAGE_COUNTERS_CURRENT_COLOR(mnode, r,
2397 PFN_2_COLOR(pfnum, szc, &it), mrange) = idx;
2398 page_freelist_unlock(mnode);
2399 rw_exit(&page_ctrs_rwlock[mnode]);
2400
2401 return (ret_pp);
2402 }
2403 } else {
2404 VM_STAT_ADD(vmm_vmstats.page_ctrs_changed[r][mrange]);
2405 }
2406
2407 page_freelist_unlock(mnode);
2408 /*
2409 * No point looking for another page if we've
2410 * already tried all of the ones that
2411 * page_ctr_cands indicated. Stash off where we left
2412 * off.
2413 * Note: this is not exact since we don't hold the
2414 * page_freelist_locks before we initially get the
2415 * value of cands for performance reasons, but should
2416 * be a decent approximation.
2417 */
2418 if (--cands == 0) {
2419 PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color, mrange) =
2420 idx;
2421 break;
2422 }
2423 next:
2424 PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask,
2425 color_mask, &it);
2426 idx = PNUM_TO_IDX(mnode, r, pfnum);
2427 if (idx >= len || pfnum >= hi) {
2428 pfnum = lo;
2429 MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
2430 idx = PNUM_TO_IDX(mnode, r, pfnum);
2431 wrap++;
2432 }
2433 }
2434
2435 rw_exit(&page_ctrs_rwlock[mnode]);
2436 VM_STAT_ADD(vmm_vmstats.page_ctrs_failed[r][mrange]);
2437 return (NULL);
2438 }
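
/*
 * A worked example of the pfn range set-up in the function above,
 * assuming szcpgcnt == 512 (0x200): a raw range of lo == 0x1234 and
 * hi == 0x5679 becomes lo == P2ROUNDUP(0x1234, 0x200) == 0x1400 and
 * hi == (0x5679 & ~0x1ff) == 0x5600, so every region the counter walk
 * inspects starts and ends on an szc boundary.
 */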
2439
2440 /*
2441 * For the given mnode, promote as many small pages to large pages as possible.
2442 * mnode can be -1, which means do them all
2443 */
2444 void
2445 page_freelist_coalesce_all(int mnode)
2446 {
2447 int r; /* region size */
2448 int idx, full;
2449 size_t len;
2450 int doall = interleaved_mnodes || mnode < 0;
2451 int mlo = doall ? 0 : mnode;
2452 int mhi = doall ? max_mem_nodes : (mnode + 1);
2453
2454 VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce_all);
2455
2456 if (mpss_coalesce_disable) {
2457 return;
2458 }
2459
2460 /*
2461 * Lock the entire freelist and coalesce what we can.
2462 *
2463 * Always promote to the largest page possible
2464 * first to reduce the number of page promotions.
2465 */
2466 for (mnode = mlo; mnode < mhi; mnode++) {
2467 rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
2468 page_freelist_lock(mnode);
2469 }
2470 for (r = mmu_page_sizes - 1; r > 0; r--) {
2471 for (mnode = mlo; mnode < mhi; mnode++) {
2472 pgcnt_t cands = 0;
2473 int mrange, nranges = mnode_nranges[mnode];
2474
2475 for (mrange = 0; mrange < nranges; mrange++) {
2476 PGCTRS_CANDS_GETVALUE(mnode, mrange, r, cands);
2477 if (cands != 0)
2478 break;
2479 }
2480 if (cands == 0) {
2481 VM_STAT_ADD(vmm_vmstats.
2482 page_ctrs_cands_skip_all);
2483 continue;
2484 }
2485
2486 full = FULL_REGION_CNT(r);
2487 len = PAGE_COUNTERS_ENTRIES(mnode, r);
2488
2489 for (idx = 0; idx < len; idx++) {
2490 if (PAGE_COUNTERS(mnode, r, idx) == full) {
2491 pfn_t pfnum =
2492 IDX_TO_PNUM(mnode, r, idx);
2493 int tmnode = interleaved_mnodes ?
2494 PFN_2_MEM_NODE(pfnum) : mnode;
2495
2496 ASSERT(pfnum >=
2497 mem_node_config[tmnode].physbase &&
2498 pfnum <
2499 mem_node_config[tmnode].physmax);
2500
2501 (void) page_promote(tmnode,
2502 pfnum, r, PC_FREE, PC_MTYPE_ANY);
2503 }
2504 }
2505 /* shared hpm_counters covers all mnodes, so we quit */
2506 if (interleaved_mnodes)
2507 break;
2508 }
2509 }
2510 for (mnode = mlo; mnode < mhi; mnode++) {
2511 page_freelist_unlock(mnode);
2512 rw_exit(&page_ctrs_rwlock[mnode]);
2513 }
2514 }
2515
2516 /*
2517 * This is where all the policies for moving pages around
2518 * to different page size free lists are implemented.
2519 * Returns 1 on success, 0 on failure.
2520 *
2521 * So far these are the priorities for this algorithm in descending
2522 * order:
2523 *
2524 * 1) When servicing a request try to do so with a free page
2525 * from next size up. Helps defer fragmentation as long
2526 * as possible.
2527 *
2528 * 2) Page coalesce on demand. Only when a freelist
2529 * larger than PAGESIZE is empty and step 1
2530 * will not work since all larger size lists are
2531 * also empty.
2532 *
2533 * If pfnhi is non-zero, search for large page with pfn range less than pfnhi.
2534 */
2535
2536 page_t *
2537 page_freelist_split(uchar_t szc, uint_t color, int mnode, int mtype,
2538 pfn_t pfnlo, pfn_t pfnhi, page_list_walker_t *plw)
2539 {
2540 uchar_t nszc = szc + 1;
2541 uint_t bin, sbin, bin_prev;
2542 page_t *pp, *firstpp;
2543 page_t *ret_pp = NULL;
2544 uint_t color_mask;
2545
2546 if (nszc == mmu_page_sizes)
2547 return (NULL);
2548
2549 ASSERT(nszc < mmu_page_sizes);
2550 color_mask = PAGE_GET_PAGECOLORS(nszc) - 1;
2551 bin = sbin = PAGE_GET_NSZ_COLOR(szc, color);
2552 bin_prev = (plw->plw_bin_split_prev == color) ? INVALID_COLOR :
2553 PAGE_GET_NSZ_COLOR(szc, plw->plw_bin_split_prev);
2554
2555 VM_STAT_ADD(vmm_vmstats.pfs_req[szc]);
2556 /*
2557 * First try to break up a larger page to fill current size freelist.
2558 */
2559 while (plw->plw_bins[nszc] != 0) {
2560
2561 ASSERT(nszc < mmu_page_sizes);
2562
2563 /*
2564 * If page found then demote it.
2565 */
2566 if (PAGE_FREELISTS(mnode, nszc, bin, mtype)) {
2567 page_freelist_lock(mnode);
2568 firstpp = pp = PAGE_FREELISTS(mnode, nszc, bin, mtype);
2569
2570 /*
2571 * If pfnhi is not PFNNULL, look for large page below
2572 * pfnhi. PFNNULL signifies no pfn requirement.
2573 */
2574 if (pp &&
2575 ((pfnhi != PFNNULL && pp->p_pagenum >= pfnhi) ||
2576 (pfnlo != PFNNULL && pp->p_pagenum < pfnlo))) {
2577 do {
2578 pp = pp->p_vpnext;
2579 if (pp == firstpp) {
2580 pp = NULL;
2581 break;
2582 }
2583 } while ((pfnhi != PFNNULL &&
2584 pp->p_pagenum >= pfnhi) ||
2585 (pfnlo != PFNNULL &&
2586 pp->p_pagenum < pfnlo));
2587
2588 if (pfnhi != PFNNULL && pp != NULL)
2589 ASSERT(pp->p_pagenum < pfnhi);
2590
2591 if (pfnlo != PFNNULL && pp != NULL)
2592 ASSERT(pp->p_pagenum >= pfnlo);
2593 }
2594 if (pp) {
2595 uint_t ccolor = page_correct_color(szc, nszc,
2596 color, bin, plw->plw_ceq_mask[szc]);
2597
2598 ASSERT(pp->p_szc == nszc);
2599 VM_STAT_ADD(vmm_vmstats.pfs_demote[nszc]);
2600 ret_pp = page_demote(mnode, pp->p_pagenum,
2601 pfnhi, pp->p_szc, szc, ccolor, PC_ALLOC);
2602 if (ret_pp) {
2603 page_freelist_unlock(mnode);
2604 #if defined(__sparc)
2605 if (PP_ISNORELOC(ret_pp)) {
2606 pgcnt_t npgs;
2607
2608 npgs = page_get_pagecnt(
2609 ret_pp->p_szc);
2610 kcage_freemem_sub(npgs);
2611 }
2612 #endif
2613 return (ret_pp);
2614 }
2615 }
2616 page_freelist_unlock(mnode);
2617 }
2618
2619 /* loop through next size bins */
2620 bin = ADD_MASKED(bin, 1, plw->plw_ceq_mask[nszc], color_mask);
2621 plw->plw_bins[nszc]--;
2622
2623 if (bin == sbin) {
2624 uchar_t nnszc = nszc + 1;
2625
2626 /* we are done with this page size - check next */
2627 if (plw->plw_bins[nnszc] == 0)
2628 /* we have already checked next size bins */
2629 break;
2630
2631 bin = sbin = PAGE_GET_NSZ_COLOR(nszc, bin);
2632 if (bin_prev != INVALID_COLOR) {
2633 bin_prev = PAGE_GET_NSZ_COLOR(nszc, bin_prev);
2634 if (!((bin ^ bin_prev) &
2635 plw->plw_ceq_mask[nnszc]))
2636 break;
2637 }
2638 ASSERT(nnszc < mmu_page_sizes);
2639 color_mask = PAGE_GET_PAGECOLORS(nnszc) - 1;
2640 nszc = nnszc;
2641 ASSERT(nszc < mmu_page_sizes);
2642 }
2643 }
2644
2645 return (ret_pp);
2646 }
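
/*
 * For example, with pfnlo == PFNNULL and pfnhi == 0x40000, the walk
 * above starts at the head of the next-size freelist for the current
 * bin and follows p_vpnext around the circular list, skipping any
 * large page whose base pfn is at or above 0x40000, until it either
 * wraps back to the head (no candidate) or finds a page it can hand
 * to page_demote().
 */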
2647
2648 /*
2649 * Helper routine used only by the freelist code to lock
2650 * a page. If the page is a large page then it succeeds in
2651 * locking all the constituent pages or none at all.
2652 * Returns 1 on success, 0 on failure.
2653 */
2654 static int
2655 page_trylock_cons(page_t *pp, se_t se)
2656 {
2657 page_t *tpp, *first_pp = pp;
2658
2659 /*
2660 * Fail if can't lock first or only page.
2661 */
2662 if (!page_trylock(pp, se)) {
2663 return (0);
2664 }
2665
2666 /*
2667 * PAGESIZE: common case.
2668 */
2669 if (pp->p_szc == 0) {
2670 return (1);
2671 }
2672
2673 /*
2674 * Large page case.
2675 */
2676 tpp = pp->p_next;
2677 while (tpp != pp) {
2678 if (!page_trylock(tpp, se)) {
2679 /*
2680 * On failure unlock what we have locked so far.
2681 * We want to avoid attempting to capture these
2682 * pages as the pcm mutex may be held which could
2683 * lead to a recursive mutex panic.
2684 */
2685 while (first_pp != tpp) {
2686 page_unlock_nocapture(first_pp);
2687 first_pp = first_pp->p_next;
2688 }
2689 return (0);
2690 }
2691 tpp = tpp->p_next;
2692 }
2693 return (1);
2694 }
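
/*
 * For example, when trylocking a large page made up of constituent
 * pages A-B-C-D and the lock on C cannot be taken, the loop above
 * walks first_pp from A through B releasing the locks it did get, so
 * the caller never sees a partially locked large page.
 */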
2695
2696 /*
2697 * init context for walking page lists
2698 * Called when a page of the given szc is unavailable. Sets markers
2699 * for the beginning of the search to detect when the search has
2700 * completed a full cycle. Sets flags for splitting larger pages
2701 * and coalescing smaller pages. Page walking proceeds until a page
2702 * of the desired equivalent color is found.
2703 */
2704 void
2705 page_list_walk_init(uchar_t szc, uint_t flags, uint_t bin, int can_split,
2706 int use_ceq, page_list_walker_t *plw)
2707 {
2708 uint_t nszc, ceq_mask, colors;
2709 uchar_t ceq = use_ceq ? colorequivszc[szc] : 0;
2710
2711 ASSERT(szc < mmu_page_sizes);
2712 colors = PAGE_GET_PAGECOLORS(szc);
2713
2714 plw->plw_colors = colors;
2715 plw->plw_color_mask = colors - 1;
2716 plw->plw_bin_marker = plw->plw_bin0 = bin;
2717 plw->plw_bin_split_prev = bin;
2718 plw->plw_bin_step = (szc == 0) ? vac_colors : 1;
2719
2720 /*
2721 * if vac aliasing is possible make sure lower order color
2722 * bits are never ignored
2723 */
2724 if (vac_colors > 1)
2725 ceq &= 0xf0;
2726
2727 /*
2728 * calculate the number of non-equivalent colors and
2729 * color equivalency mask
2730 */
2731 plw->plw_ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf));
2732 ASSERT(szc > 0 || plw->plw_ceq_dif >= vac_colors);
2733 ASSERT(plw->plw_ceq_dif > 0);
2734 plw->plw_ceq_mask[szc] = (plw->plw_ceq_dif - 1) << (ceq & 0xf);
2735
2736 if (flags & PG_MATCH_COLOR) {
2737 if (cpu_page_colors < 0) {
2738 /*
2739 * this is a heterogeneous machine with different CPUs
2740 * having different size e$ (not supported for ni2/rock)
2741 */
2742 uint_t cpucolors = CPUSETSIZE() >> PAGE_GET_SHIFT(szc);
2743 cpucolors = MAX(cpucolors, 1);
2744 ceq_mask = plw->plw_color_mask & (cpucolors - 1);
2745 plw->plw_ceq_mask[szc] =
2746 MIN(ceq_mask, plw->plw_ceq_mask[szc]);
2747 }
2748 plw->plw_ceq_dif = 1;
2749 }
2750
2751 /* we can split pages in the freelist, but not the cachelist */
2752 if (can_split) {
2753 plw->plw_do_split = (szc + 1 < mmu_page_sizes) ? 1 : 0;
2754
2755 /* set next szc color masks and number of free list bins */
2756 for (nszc = szc + 1; nszc < mmu_page_sizes; nszc++, szc++) {
2757 plw->plw_ceq_mask[nszc] = PAGE_GET_NSZ_MASK(szc,
2758 plw->plw_ceq_mask[szc]);
2759 plw->plw_bins[nszc] = PAGE_GET_PAGECOLORS(nszc);
2760 }
2761 plw->plw_ceq_mask[nszc] = INVALID_MASK;
2762 plw->plw_bins[nszc] = 0;
2763
2764 } else {
2765 ASSERT(szc == 0);
2766 plw->plw_do_split = 0;
2767 plw->plw_bins[1] = 0;
2768 plw->plw_ceq_mask[1] = INVALID_MASK;
2769 }
2770 }
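
/*
 * A worked example of the mask set-up above: with colors == 64 and
 * ceq == 0x21 (ignore two high-order and one low-order color bits),
 * plw_ceq_dif == 64 >> (2 + 1) == 8 non-equivalent color groups and
 * plw_ceq_mask[szc] == (8 - 1) << 1 == 0xe, i.e. only color bits 1-3
 * have to match when walking the freelist bins.
 */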
2771
2772 /*
2773 * set mark to flag where next split should occur
2774 */
2775 #define PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw) { \
2776 uint_t bin_nsz = PAGE_GET_NSZ_COLOR(szc, bin); \
2777 uint_t bin0_nsz = PAGE_GET_NSZ_COLOR(szc, plw->plw_bin0); \
2778 uint_t neq_mask = ~plw->plw_ceq_mask[nszc] & plw->plw_color_mask; \
2779 plw->plw_split_next = \
2780 INC_MASKED(bin_nsz, neq_mask, plw->plw_color_mask); \
2781 if (!((plw->plw_split_next ^ bin0_nsz) & plw->plw_ceq_mask[nszc])) { \
2782 plw->plw_split_next = \
2783 INC_MASKED(plw->plw_split_next, \
2784 neq_mask, plw->plw_color_mask); \
2785 } \
2786 }
2787
2788 uint_t
2789 page_list_walk_next_bin(uchar_t szc, uint_t bin, page_list_walker_t *plw)
2790 {
2791 uint_t neq_mask = ~plw->plw_ceq_mask[szc] & plw->plw_color_mask;
2792 uint_t bin0_nsz, nbin_nsz, nbin0, nbin;
2793 uchar_t nszc = szc + 1;
2794
2795 nbin = ADD_MASKED(bin,
2796 plw->plw_bin_step, neq_mask, plw->plw_color_mask);
2797
2798 if (plw->plw_do_split) {
2799 plw->plw_bin_split_prev = bin;
2800 PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw);
2801 plw->plw_do_split = 0;
2802 }
2803
2804 if (szc == 0) {
2805 if (plw->plw_count != 0 || plw->plw_ceq_dif == vac_colors) {
2806 if (nbin == plw->plw_bin0 &&
2807 (vac_colors == 1 || nbin != plw->plw_bin_marker)) {
2808 nbin = ADD_MASKED(nbin, plw->plw_bin_step,
2809 neq_mask, plw->plw_color_mask);
2810 plw->plw_bin_split_prev = plw->plw_bin0;
2811 }
2812
2813 if (vac_colors > 1 && nbin == plw->plw_bin_marker) {
2814 plw->plw_bin_marker =
2815 nbin = INC_MASKED(nbin, neq_mask,
2816 plw->plw_color_mask);
2817 plw->plw_bin_split_prev = plw->plw_bin0;
2818 /*
2819 * large pages all have the same vac color
2820 * so by now we should be done with next
2821 * size page splitting process
2822 */
2823 ASSERT(plw->plw_bins[1] == 0);
2824 plw->plw_do_split = 0;
2825 return (nbin);
2826 }
2827
2828 } else {
2829 uint_t bin_jump = (vac_colors == 1) ?
2830 (BIN_STEP & ~3) - (plw->plw_bin0 & 3) : BIN_STEP;
2831
2832 bin_jump &= ~(vac_colors - 1);
2833
2834 nbin0 = ADD_MASKED(plw->plw_bin0, bin_jump, neq_mask,
2835 plw->plw_color_mask);
2836
2837 if ((nbin0 ^ plw->plw_bin0) & plw->plw_ceq_mask[szc]) {
2838
2839 plw->plw_bin_marker = nbin = nbin0;
2840
2841 if (plw->plw_bins[nszc] != 0) {
2842 /*
2843 * check if next page size bin is the
2844 * same as the next page size bin for
2845 * bin0
2846 */
2847 nbin_nsz = PAGE_GET_NSZ_COLOR(szc,
2848 nbin);
2849 bin0_nsz = PAGE_GET_NSZ_COLOR(szc,
2850 plw->plw_bin0);
2851
2852 if ((bin0_nsz ^ nbin_nsz) &
2853 plw->plw_ceq_mask[nszc])
2854 plw->plw_do_split = 1;
2855 }
2856 return (nbin);
2857 }
2858 }
2859 }
2860
2861 if (plw->plw_bins[nszc] != 0) {
2862 nbin_nsz = PAGE_GET_NSZ_COLOR(szc, nbin);
2863 if (!((plw->plw_split_next ^ nbin_nsz) &
2864 plw->plw_ceq_mask[nszc]))
2865 plw->plw_do_split = 1;
2866 }
2867
2868 return (nbin);
2869 }
2870
2871 page_t *
2872 page_get_mnode_freelist(int mnode, uint_t bin, int mtype, uchar_t szc,
2873 uint_t flags)
2874 {
2875 kmutex_t *pcm;
2876 page_t *pp, *first_pp;
2877 uint_t sbin;
2878 int plw_initialized;
2879 page_list_walker_t plw;
2880
2881 ASSERT(szc < mmu_page_sizes);
2882
2883 VM_STAT_ADD(vmm_vmstats.pgmf_alloc[szc]);
2884
2885 MTYPE_START(mnode, mtype, flags);
2886 if (mtype < 0) { /* mnode does not have memory in mtype range */
2887 VM_STAT_ADD(vmm_vmstats.pgmf_allocempty[szc]);
2888 return (NULL);
2889 }
2890 try_again:
2891
2892 plw_initialized = 0;
2893 plw.plw_ceq_dif = 1;
2894
2895 /*
2896 * Only hold one freelist lock at a time, that way we
2897 * can start anywhere and not have to worry about lock
2898 * ordering.
2899 */
2900 for (plw.plw_count = 0;
2901 plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) {
2902 sbin = bin;
2903 do {
2904 if (!PAGE_FREELISTS(mnode, szc, bin, mtype))
2905 goto bin_empty_1;
2906
2907 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
2908 mutex_enter(pcm);
2909 pp = PAGE_FREELISTS(mnode, szc, bin, mtype);
2910 if (pp == NULL)
2911 goto bin_empty_0;
2912
2913 /*
2914 * These were set before the page
2915 * was put on the free list,
2916 * they must still be set.
2917 */
2918 ASSERT(PP_ISFREE(pp));
2919 ASSERT(PP_ISAGED(pp));
2920 ASSERT(pp->p_vnode == NULL);
2921 ASSERT(pp->p_hash == NULL);
2922 ASSERT(pp->p_offset == (u_offset_t)-1);
2923 ASSERT(pp->p_szc == szc);
2924 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
2925
2926 /*
2927 * Walk down the hash chain.
2928 * 8k pages are linked on p_next
2929 * and p_prev fields. Large pages
2930 * are a contiguous group of
2931 * constituent pages linked together
2932 * on their p_next and p_prev fields.
2933 * The large pages are linked together
2934 * on the hash chain using p_vpnext
2935 * and p_vpprev of the base constituent
2936 * page of each large page.
2937 */
2938 first_pp = pp;
2939 while (IS_DUMP_PAGE(pp) || !page_trylock_cons(pp,
2940 SE_EXCL)) {
2941 if (szc == 0) {
2942 pp = pp->p_next;
2943 } else {
2944 pp = pp->p_vpnext;
2945 }
2946
2947 ASSERT(PP_ISFREE(pp));
2948 ASSERT(PP_ISAGED(pp));
2949 ASSERT(pp->p_vnode == NULL);
2950 ASSERT(pp->p_hash == NULL);
2951 ASSERT(pp->p_offset == (u_offset_t)-1);
2952 ASSERT(pp->p_szc == szc);
2953 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
2954
2955 if (pp == first_pp)
2956 goto bin_empty_0;
2957 }
2958
2959 ASSERT(pp != NULL);
2960 ASSERT(mtype == PP_2_MTYPE(pp));
2961 ASSERT(pp->p_szc == szc);
2962 if (szc == 0) {
2963 page_sub(&PAGE_FREELISTS(mnode,
2964 szc, bin, mtype), pp);
2965 } else {
2966 page_vpsub(&PAGE_FREELISTS(mnode,
2967 szc, bin, mtype), pp);
2968 CHK_LPG(pp, szc);
2969 }
2970 page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);
2971
2972 if ((PP_ISFREE(pp) == 0) || (PP_ISAGED(pp) == 0))
2973 panic("free page is not. pp %p", (void *)pp);
2974 mutex_exit(pcm);
2975
2976 #if defined(__sparc)
2977 ASSERT(!kcage_on || PP_ISNORELOC(pp) ||
2978 (flags & PG_NORELOC) == 0);
2979
2980 if (PP_ISNORELOC(pp))
2981 kcage_freemem_sub(page_get_pagecnt(szc));
2982 #endif
2983 VM_STAT_ADD(vmm_vmstats.pgmf_allocok[szc]);
2984 return (pp);
2985
2986 bin_empty_0:
2987 mutex_exit(pcm);
2988 bin_empty_1:
2989 if (plw_initialized == 0) {
2990 page_list_walk_init(szc, flags, bin, 1, 1,
2991 &plw);
2992 plw_initialized = 1;
2993 ASSERT(plw.plw_colors <=
2994 PAGE_GET_PAGECOLORS(szc));
2995 ASSERT(plw.plw_colors > 0);
2996 ASSERT((plw.plw_colors &
2997 (plw.plw_colors - 1)) == 0);
2998 ASSERT(bin < plw.plw_colors);
2999 ASSERT(plw.plw_ceq_mask[szc] < plw.plw_colors);
3000 }
3001 /* calculate the next bin with equivalent color */
3002 bin = ADD_MASKED(bin, plw.plw_bin_step,
3003 plw.plw_ceq_mask[szc], plw.plw_color_mask);
3004 } while (sbin != bin);
3005
3006 /*
3007 * The equivalent color bins are all empty at this point. Try to
3008 * satisfy the request by breaking up or coalescing
3009 * pages from a different size freelist of the correct
3010 * color that satisfies the ORIGINAL color requested.
3011 * If that fails then try pages of the same size but
3012 * different colors assuming we are not called with
3013 * PG_MATCH_COLOR.
3014 */
3015 if (plw.plw_do_split &&
3016 (pp = page_freelist_split(szc, bin, mnode,
3017 mtype, PFNNULL, PFNNULL, &plw)) != NULL)
3018 return (pp);
3019
3020 if (szc > 0 && (pp = page_freelist_coalesce(mnode, szc,
3021 bin, plw.plw_ceq_mask[szc], mtype, PFNNULL)) != NULL)
3022 return (pp);
3023
3024 if (plw.plw_ceq_dif > 1)
3025 bin = page_list_walk_next_bin(szc, bin, &plw);
3026 }
3027
3028 /* if allowed, cycle through additional mtypes */
3029 MTYPE_NEXT(mnode, mtype, flags);
3030 if (mtype >= 0)
3031 goto try_again;
3032
3033 VM_STAT_ADD(vmm_vmstats.pgmf_allocfailed[szc]);
3034
3035 return (NULL);
3036 }
3037
3038 /*
3039 * Returns the count of free pages for 'pp' with size code 'szc'.
3040 * Note: This function does not return an exact value as the page freelist
3041 * locks are not held and thus the values in the page_counters may be
3042 * changing as we walk through the data.
3043 */
3044 static int
3045 page_freecnt(int mnode, page_t *pp, uchar_t szc)
3046 {
3047 pgcnt_t pgfree;
3048 pgcnt_t cnt;
3049 ssize_t r = szc; /* region size */
3050 ssize_t idx;
3051 int i;
3052 int full, range;
3053
3054 /* Make sure pagenum passed in is aligned properly */
3055 ASSERT((pp->p_pagenum & (PNUM_SIZE(szc) - 1)) == 0);
3056 ASSERT(szc > 0);
3057
3058 /* Prevent page_counters dynamic memory from being freed */
3059 rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
3060 idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum);
3061 cnt = PAGE_COUNTERS(mnode, r, idx);
3062 pgfree = cnt << PNUM_SHIFT(r - 1);
3063 range = FULL_REGION_CNT(szc);
3064
3065 /* Check for completely full region */
3066 if (cnt == range) {
3067 rw_exit(&page_ctrs_rwlock[mnode]);
3068 return (pgfree);
3069 }
3070
3071 while (--r > 0) {
3072 idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum);
3073 full = FULL_REGION_CNT(r);
3074 for (i = 0; i < range; i++, idx++) {
3075 cnt = PAGE_COUNTERS(mnode, r, idx);
3076 /*
3077 * If cnt here is full, that means we have already
3078 * accounted for these pages earlier.
3079 */
3080 if (cnt != full) {
3081 pgfree += (cnt << PNUM_SHIFT(r - 1));
3082 }
3083 }
3084 range *= full;
3085 }
3086 rw_exit(&page_ctrs_rwlock[mnode]);
3087 return (pgfree);
3088 }
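
/*
 * For example, assuming PNUM_SHIFT(r - 1) == 3 (a size r - 1 region
 * spans 8 PAGESIZE pages), a top level counter value of 5 contributes
 * 5 << 3 == 40 free pages for its 5 fully free sub-regions, and the
 * descending loop above then adds in sub-regions that are only
 * partially free and were therefore not counted at the level above.
 */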
3089
3090 /*
3091 * Called from page_geti_contig_pages to exclusively lock constituent pages
3092 * starting from 'spp' for page size code 'szc'.
3093 *
3094 * If 'ptcpthreshold' is set, the number of free pages needed in the 'szc'
3095 * region needs to be greater than or equal to the threshold.
3096 */
3097 static int
3098 page_trylock_contig_pages(int mnode, page_t *spp, uchar_t szc, int flags)
3099 {
3100 pgcnt_t pgcnt = PNUM_SIZE(szc);
3101 pgcnt_t pgfree, i;
3102 page_t *pp;
3103
3104 VM_STAT_ADD(vmm_vmstats.ptcp[szc]);
3105
3106
3107 if ((ptcpthreshold == 0) || (flags & PGI_PGCPHIPRI))
3108 goto skipptcpcheck;
3109 /*
3110 * check if there are sufficient free pages available before attempting
3111 * to trylock. Count is approximate as page counters can change.
3112 */
3113 pgfree = page_freecnt(mnode, spp, szc);
3114
3115 /* attempt to trylock if there are sufficient already free pages */
3116 if (pgfree < pgcnt/ptcpthreshold) {
3117 VM_STAT_ADD(vmm_vmstats.ptcpfreethresh[szc]);
3118 return (0);
3119 }
3120
3121 skipptcpcheck:
3122
3123 for (i = 0; i < pgcnt; i++) {
3124 pp = &spp[i];
3125 if (!page_trylock(pp, SE_EXCL)) {
3126 VM_STAT_ADD(vmm_vmstats.ptcpfailexcl[szc]);
3127 while (--i != (pgcnt_t)-1) {
3128 pp = &spp[i];
3129 ASSERT(PAGE_EXCL(pp));
3130 page_unlock_nocapture(pp);
3131 }
3132 return (0);
3133 }
3134 ASSERT(spp[i].p_pagenum == spp->p_pagenum + i);
3135 if ((pp->p_szc > szc || (szc && pp->p_szc == szc)) &&
3136 !PP_ISFREE(pp)) {
3137 VM_STAT_ADD(vmm_vmstats.ptcpfailszc[szc]);
3138 ASSERT(i == 0);
3139 page_unlock_nocapture(pp);
3140 return (0);
3141 }
3142
3143 /*
3144 * If a page has been marked non-relocatable or has been
3145 * explicitly locked in memory, we don't want to relocate it;
3146 * unlock the pages and fail the operation.
3147 */
3148 if (PP_ISNORELOC(pp) ||
3149 pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
3150 VM_STAT_ADD(vmm_vmstats.ptcpfailcage[szc]);
3151 while (i != (pgcnt_t)-1) {
3152 pp = &spp[i];
3153 ASSERT(PAGE_EXCL(pp));
3154 page_unlock_nocapture(pp);
3155 i--;
3156 }
3157 return (0);
3158 }
3159 }
3160 VM_STAT_ADD(vmm_vmstats.ptcpok[szc]);
3161 return (1);
3162 }
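
/*
 * For example, with ptcpthreshold == 2 and an szc spanning 512 pages,
 * the check above refuses to start the trylock pass unless
 * page_freecnt() reports at least 512 / 2 == 256 of the constituent
 * pages already free; callers passing PGI_PGCPHIPRI (or running with
 * ptcpthreshold == 0) skip the check entirely.
 */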
3163
3164 /*
3165 * Claim large page pointed to by 'pp'. 'pp' is the starting set
3166 * of 'szc' constituent pages that had been locked exclusively previously.
3167 * Will attempt to relocate constituent pages in use.
3168 */
3169 static page_t *
3170 page_claim_contig_pages(page_t *pp, uchar_t szc, int flags)
3171 {
3172 spgcnt_t pgcnt, npgs, i;
3173 page_t *targpp, *rpp, *hpp;
3174 page_t *replpp = NULL;
3175 page_t *pplist = NULL;
3176
3177 ASSERT(pp != NULL);
3178
3179 pgcnt = page_get_pagecnt(szc);
3180 while (pgcnt) {
3181 ASSERT(PAGE_EXCL(pp));
3182 ASSERT(!PP_ISNORELOC(pp));
3183 if (PP_ISFREE(pp)) {
3184 /*
3185 * If this is a PG_FREE_LIST page then its
3186 * size code can change underneath us due to
3187 * page promotion or demotion. As an optimization
3188 * use page_list_sub_pages() instead of
3189 * page_list_sub().
3190 */
3191 if (PP_ISAGED(pp)) {
3192 page_list_sub_pages(pp, szc);
3193 if (pp->p_szc == szc) {
3194 return (pp);
3195 }
3196 ASSERT(pp->p_szc < szc);
3197 npgs = page_get_pagecnt(pp->p_szc);
3198 hpp = pp;
3199 for (i = 0; i < npgs; i++, pp++) {
3200 pp->p_szc = szc;
3201 }
3202 page_list_concat(&pplist, &hpp);
3203 pgcnt -= npgs;
3204 continue;
3205 }
3206 ASSERT(!PP_ISAGED(pp));
3207 ASSERT(pp->p_szc == 0);
3208 page_list_sub(pp, PG_CACHE_LIST);
3209 page_hashout(pp, NULL);
3210 PP_SETAGED(pp);
3211 pp->p_szc = szc;
3212 page_list_concat(&pplist, &pp);
3213 pp++;
3214 pgcnt--;
3215 continue;
3216 }
3217 npgs = page_get_pagecnt(pp->p_szc);
3218
3219 /*
3220 * page_create_wait freemem accounting done by caller of
3221 * page_get_freelist and not necessary to call it prior to
3222 * calling page_get_replacement_page.
3223 *
3224 * page_get_replacement_page can call page_get_contig_pages
3225 * to acquire a large page (szc > 0); the replacement must be
3226 * smaller than the contig page size to avoid looping or
3227 * szc == 0 and PGI_PGCPSZC0 is set.
3228 */
3229 if (pp->p_szc < szc || (szc == 0 && (flags & PGI_PGCPSZC0))) {
3230 replpp = page_get_replacement_page(pp, NULL, 0);
3231 if (replpp) {
3232 npgs = page_get_pagecnt(pp->p_szc);
3233 ASSERT(npgs <= pgcnt);
3234 targpp = pp;
3235 }
3236 }
3237
3238 /*
3239 * If replacement is NULL or do_page_relocate fails, fail
3240 * coalescing of pages.
3241 */
3242 if (replpp == NULL || (do_page_relocate(&targpp, &replpp, 0,
3243 &npgs, NULL) != 0)) {
3244 /*
3245 * Unlock un-processed target list
3246 */
3247 while (pgcnt--) {
3248 ASSERT(PAGE_EXCL(pp));
3249 page_unlock_nocapture(pp);
3250 pp++;
3251 }
3252 /*
3253 * Free the processed target list.
3254 */
3255 while (pplist) {
3256 pp = pplist;
3257 page_sub(&pplist, pp);
3258 ASSERT(PAGE_EXCL(pp));
3259 ASSERT(pp->p_szc == szc);
3260 ASSERT(PP_ISFREE(pp));
3261 ASSERT(PP_ISAGED(pp));
3262 pp->p_szc = 0;
3263 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
3264 page_unlock_nocapture(pp);
3265 }
3266
3267 if (replpp != NULL)
3268 page_free_replacement_page(replpp);
3269
3270 return (NULL);
3271 }
3272 ASSERT(pp == targpp);
3273
3274 /* LINTED */
3275 ASSERT(hpp = pp); /* That's right, it's an assignment */
3276
3277 pp += npgs;
3278 pgcnt -= npgs;
3279
3280 while (npgs--) {
3281 ASSERT(PAGE_EXCL(targpp));
3282 ASSERT(!PP_ISFREE(targpp));
3283 ASSERT(!PP_ISNORELOC(targpp));
3284 PP_SETFREE(targpp);
3285 ASSERT(PP_ISAGED(targpp));
3286 ASSERT(targpp->p_szc < szc || (szc == 0 &&
3287 (flags & PGI_PGCPSZC0)));
3288 targpp->p_szc = szc;
3289 targpp = targpp->p_next;
3290
3291 rpp = replpp;
3292 ASSERT(rpp != NULL);
3293 page_sub(&replpp, rpp);
3294 ASSERT(PAGE_EXCL(rpp));
3295 ASSERT(!PP_ISFREE(rpp));
3296 page_unlock_nocapture(rpp);
3297 }
3298 ASSERT(targpp == hpp);
3299 ASSERT(replpp == NULL);
3300 page_list_concat(&pplist, &targpp);
3301 }
3302 CHK_LPG(pplist, szc);
3303 return (pplist);
3304 }
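
/*
 * For example, to claim a 512-page region in which 510 pages are
 * already free and 2 are in use, the loop above moves the free pages
 * straight onto pplist (retagging p_szc), relocates the 2 in-use pages
 * via page_get_replacement_page()/do_page_relocate(), and only returns
 * the assembled list once every constituent page is free, exclusively
 * locked and tagged with the target szc.
 */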
3305
3306 /*
3307 * Trim kernel cage from pfnlo-pfnhi and store result in lo-hi. Return code
3308 * of 0 means nothing left after trim.
3309 */
3310 int
3311 trimkcage(struct memseg *mseg, pfn_t *lo, pfn_t *hi, pfn_t pfnlo, pfn_t pfnhi)
3312 {
3313 pfn_t kcagepfn;
3314 int decr;
3315 int rc = 0;
3316
3317 if (PP_ISNORELOC(mseg->pages)) {
3318 if (PP_ISNORELOC(mseg->epages - 1) == 0) {
3319
3320 /* lower part of this mseg inside kernel cage */
3321 decr = kcage_current_pfn(&kcagepfn);
3322
3323 /* kernel cage may have transitioned past mseg */
3324 if (kcagepfn >= mseg->pages_base &&
3325 kcagepfn < mseg->pages_end) {
3326 ASSERT(decr == 0);
3327 *lo = MAX(kcagepfn, pfnlo);
3328 *hi = MIN(pfnhi, (mseg->pages_end - 1));
3329 rc = 1;
3330 }
3331 }
3332 /* else entire mseg in the cage */
3333 } else {
3334 if (PP_ISNORELOC(mseg->epages - 1)) {
3335
3336 /* upper part of this mseg inside kernel cage */
3337 decr = kcage_current_pfn(&kcagepfn);
3338
3339 /* kernel cage may have transitioned past mseg */
3340 if (kcagepfn >= mseg->pages_base &&
3341 kcagepfn < mseg->pages_end) {
3342 ASSERT(decr);
3343 *hi = MIN(kcagepfn, pfnhi);
3344 *lo = MAX(pfnlo, mseg->pages_base);
3345 rc = 1;
3346 }
3347 } else {
3348 /* entire mseg outside of kernel cage */
3349 *lo = MAX(pfnlo, mseg->pages_base);
3350 *hi = MIN(pfnhi, (mseg->pages_end - 1));
3351 rc = 1;
3352 }
3353 }
3354 return (rc);
3355 }
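
/*
 * For example, if an mseg spans pfns [0x10000, 0x20000) and only its
 * lower part is inside the kernel cage, with the cage currently grown
 * to kcagepfn == 0x18000, the first branch above trims the usable
 * range to lo == MAX(0x18000, pfnlo) and hi == MIN(pfnhi, 0x1ffff);
 * everything below 0x18000 is left to the cage.
 */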
3356
3357 /*
3358 * called from page_get_contig_pages to search 'pfnlo' thru 'pfnhi' to claim a
3359 * page with size code 'szc'. Claiming such a page requires acquiring
3360 * exclusive locks on all constituent pages (page_trylock_contig_pages),
3361 * relocating pages in use and concatenating these constituent pages into a
3362 * large page.
3363 *
3364 * The page lists do not have such a large page and page_freelist_split has
3365 * already failed to demote larger pages and/or coalesce smaller free pages.
3366 *
3367 * 'flags' may specify PG_COLOR_MATCH which would limit the search of large
3368 * pages with the same color as 'bin'.
3369 *
3370 * 'pfnflag' specifies the subset of the pfn range to search.
3371 */
3372
3373 static page_t *
3374 page_geti_contig_pages(int mnode, uint_t bin, uchar_t szc, int flags,
3375 pfn_t pfnlo, pfn_t pfnhi, pgcnt_t pfnflag)
3376 {
3377 struct memseg *mseg;
3378 pgcnt_t szcpgcnt = page_get_pagecnt(szc);
3379 pgcnt_t szcpgmask = szcpgcnt - 1;
3380 pfn_t randpfn;
3381 page_t *pp, *randpp, *endpp;
3382 uint_t colors, ceq_mask;
3383 /* LINTED : set but not used in function */
3384 uint_t color_mask __unused;
3385 pfn_t hi, lo;
3386 uint_t skip;
3387 MEM_NODE_ITERATOR_DECL(it);
3388
3389 ASSERT(szc != 0 || (flags & PGI_PGCPSZC0));
3390
3391 pfnlo = P2ROUNDUP(pfnlo, szcpgcnt);
3392
3393 if ((pfnhi - pfnlo) + 1 < szcpgcnt || pfnlo >= pfnhi)
3394 return (NULL);
3395
3396 ASSERT(szc < mmu_page_sizes);
3397
3398 colors = PAGE_GET_PAGECOLORS(szc);
3399 color_mask = colors - 1;
3400 if ((colors > 1) && (flags & PG_MATCH_COLOR)) {
3401 uchar_t ceq = colorequivszc[szc];
3402 uint_t ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf));
3403
3404 ASSERT(ceq_dif > 0);
3405 ceq_mask = (ceq_dif - 1) << (ceq & 0xf);
3406 } else {
3407 ceq_mask = 0;
3408 }
3409
3410 ASSERT(bin < colors);
3411
3412 /* clear "non-significant" color bits */
3413 bin &= ceq_mask;
3414
3415 /*
3416 * trim the pfn range to search based on pfnflag. pfnflag is set
3417 * when there have been previous page_get_contig_page failures to
3418 * limit the search.
3419 *
3420 * The high bit in pfnflag specifies the number of 'slots' in the
3421 * pfn range and the remainder of pfnflag specifies which slot.
3422 * For example, a value of 1010b would mean the second slot of
3423 * the pfn range that has been divided into 8 slots.
3424 */
3425 if (pfnflag > 1) {
3426 int slots = 1 << (highbit(pfnflag) - 1);
3427 int slotid = pfnflag & (slots - 1);
3428 pgcnt_t szcpages;
3429 int slotlen;
3430
3431 pfnhi = P2ALIGN((pfnhi + 1), szcpgcnt) - 1;
3432 szcpages = ((pfnhi - pfnlo) + 1) / szcpgcnt;
3433 slotlen = howmany(szcpages, slots);
3434 /* skip if 'slotid' slot is empty */
3435 if (slotid * slotlen >= szcpages)
3436 return (NULL);
3437 pfnlo = pfnlo + (((slotid * slotlen) % szcpages) * szcpgcnt);
3438 ASSERT(pfnlo < pfnhi);
3439 if (pfnhi > pfnlo + (slotlen * szcpgcnt))
3440 pfnhi = pfnlo + (slotlen * szcpgcnt) - 1;
3441 }
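
	/*
	 * For example, with pfnflag == 0x9 the computation above gives
	 * slots == 1 << (highbit(0x9) - 1) == 8 and slotid == 0x9 & 7 == 1,
	 * so only about one eighth of the szc-aligned [pfnlo, pfnhi] range
	 * (the slice starting slotlen szc-sized pages in) is searched on
	 * this attempt.
	 */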
3442
3443 /*
3444 * This routine can be called recursively, so we shouldn't
3445 * acquire a reader lock if a write request is pending. This
3446 * could lead to a deadlock with the DR thread.
3447 *
3448 * Returning NULL informs the caller that we could not get
3449 * a contig page with the required characteristics.
3450 */
3451
3452 if (!memsegs_trylock(0))
3453 return (NULL);
3454
3455 /*
3456 * loop through memsegs to look for contig page candidates
3457 */
3458
3459 for (mseg = memsegs; mseg != NULL; mseg = mseg->next) {
3460 if (pfnhi < mseg->pages_base || pfnlo >= mseg->pages_end) {
3461 /* no overlap */
3462 continue;
3463 }
3464
3465 if (mseg->pages_end - mseg->pages_base < szcpgcnt)
3466 /* mseg too small */
3467 continue;
3468
3469 /*
3470 * trim off kernel cage pages from pfn range and check for
3471 * a trimmed pfn range returned that does not span the
3472 * desired large page size.
3473 */
3474 if (kcage_on) {
3475 if (trimkcage(mseg, &lo, &hi, pfnlo, pfnhi) == 0 ||
3476 lo >= hi || ((hi - lo) + 1) < szcpgcnt)
3477 continue;
3478 } else {
3479 lo = MAX(pfnlo, mseg->pages_base);
3480 hi = MIN(pfnhi, (mseg->pages_end - 1));
3481 }
3482
3483 /* round to szcpgcnt boundaries */
3484 lo = P2ROUNDUP(lo, szcpgcnt);
3485
3486 MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it);
3487 hi = P2ALIGN((hi + 1), szcpgcnt) - 1;
3488
3489 if (hi <= lo)
3490 continue;
3491
3492 /*
3493 * set lo to point to the pfn for the desired bin. Large
3494 * page sizes may only have a single page color
3495 */
3496 skip = szcpgcnt;
3497 if (ceq_mask > 0 || interleaved_mnodes) {
3498 /* set lo to point at appropriate color */
3499 if (((PFN_2_COLOR(lo, szc, &it) ^ bin) & ceq_mask) ||
3500 (interleaved_mnodes &&
3501 PFN_2_MEM_NODE(lo) != mnode)) {
3502 PAGE_NEXT_PFN_FOR_COLOR(lo, szc, bin, ceq_mask,
3503 color_mask, &it);
3504 }
3505 if (hi <= lo)
3506 /* mseg cannot satisfy color request */
3507 continue;
3508 }
3509
3510 /* randomly choose a point between lo and hi to begin search */
3511
3512 randpfn = (pfn_t)GETTICK();
3513 randpfn = ((randpfn % (hi - lo)) + lo) & ~(skip - 1);
3514 MEM_NODE_ITERATOR_INIT(randpfn, mnode, szc, &it);
3515 if (ceq_mask || interleaved_mnodes || randpfn == (pfn_t)-1) {
3516 if (randpfn != (pfn_t)-1) {
3517 PAGE_NEXT_PFN_FOR_COLOR(randpfn, szc, bin,
3518 ceq_mask, color_mask, &it);
3519 }
3520 if (randpfn >= hi) {
3521 randpfn = lo;
3522 MEM_NODE_ITERATOR_INIT(randpfn, mnode, szc,
3523 &it);
3524 }
3525 }
3526 randpp = mseg->pages + (randpfn - mseg->pages_base);
3527
3528 ASSERT(randpp->p_pagenum == randpfn);
3529
3530 pp = randpp;
3531 endpp = mseg->pages + (hi - mseg->pages_base) + 1;
3532
3533 ASSERT(randpp + szcpgcnt <= endpp);
3534
3535 do {
3536 ASSERT(!(pp->p_pagenum & szcpgmask));
3537 ASSERT(((PP_2_BIN(pp) ^ bin) & ceq_mask) == 0);
3538
3539 if (page_trylock_contig_pages(mnode, pp, szc, flags)) {
3540 /* pages unlocked by page_claim on failure */
3541 if (page_claim_contig_pages(pp, szc, flags)) {
3542 memsegs_unlock(0);
3543 return (pp);
3544 }
3545 }
3546
3547 if (ceq_mask == 0 && !interleaved_mnodes) {
3548 pp += skip;
3549 } else {
3550 pfn_t pfn = pp->p_pagenum;
3551
3552 PAGE_NEXT_PFN_FOR_COLOR(pfn, szc, bin,
3553 ceq_mask, color_mask, &it);
3554 if (pfn == (pfn_t)-1) {
3555 pp = endpp;
3556 } else {
3557 pp = mseg->pages +
3558 (pfn - mseg->pages_base);
3559 }
3560 }
3561 if (pp >= endpp) {
3562 /* start from the beginning */
3563 MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it);
3564 pp = mseg->pages + (lo - mseg->pages_base);
3565 ASSERT(pp->p_pagenum == lo);
3566 ASSERT(pp + szcpgcnt <= endpp);
3567 }
3568 } while (pp != randpp);
3569 }
3570 memsegs_unlock(0);
3571 return (NULL);
3572 }
3573
3574
3575 /*
3576 * controlling routine that searches through physical memory in an attempt to
3577 * claim a large page, based on the input parameters, that is not available
3578 * on the page free lists.
3579 *
3580 * calls page_geti_contig_pages with an initial pfn range from the mnode
3581 * and mtype. page_geti_contig_pages will trim off the parts of the pfn range
3582 * that overlaps with the kernel cage or does not match the requested page
3583 * color if PG_MATCH_COLOR is set. Since this search is very expensive,
3584 * page_geti_contig_pages may further limit the search range based on
3585 * previous failure counts (pgcpfailcnt[]).
3586 *
3587 * for PGI_PGCPSZC0 requests, page_get_contig_pages will relocate a base
3588 * pagesize page that satisfies mtype.
3589 */
3590 page_t *
3591 page_get_contig_pages(int mnode, uint_t bin, int mtype, uchar_t szc,
3592 uint_t flags)
3593 {
3594 pfn_t pfnlo, pfnhi; /* contig pages pfn range */
3595 page_t *pp;
3596 pgcnt_t pfnflag = 0; /* no limit on search if 0 */
3597
3598 VM_STAT_ADD(vmm_vmstats.pgcp_alloc[szc]);
3599
3600 /* no allocations from cage */
3601 flags |= PGI_NOCAGE;
3602
3603 /* LINTED */
3604 MTYPE_START(mnode, mtype, flags);
3605 if (mtype < 0) { /* mnode does not have memory in mtype range */
3606 VM_STAT_ADD(vmm_vmstats.pgcp_allocempty[szc]);
3607 return (NULL);
3608 }
3609
3610 ASSERT(szc > 0 || (flags & PGI_PGCPSZC0));
3611
3612 /* do not limit search and ignore color if hi pri */
3613
3614 if (pgcplimitsearch && ((flags & PGI_PGCPHIPRI) == 0))
3615 pfnflag = pgcpfailcnt[szc];
3616
3617 /* remove color match to improve chances */
3618
3619 if (flags & PGI_PGCPHIPRI || pfnflag)
3620 flags &= ~PG_MATCH_COLOR;
3621
3622 do {
3623 /* get pfn range based on mnode and mtype */
3624 MNODETYPE_2_PFN(mnode, mtype, pfnlo, pfnhi);
3625
3626 ASSERT(pfnhi >= pfnlo);
3627
3628 pp = page_geti_contig_pages(mnode, bin, szc, flags,
3629 pfnlo, pfnhi, pfnflag);
3630
3631 if (pp != NULL) {
3632 pfnflag = pgcpfailcnt[szc];
3633 if (pfnflag) {
3634 /* double the search size */
3635 pgcpfailcnt[szc] = pfnflag >> 1;
3636 }
3637 VM_STAT_ADD(vmm_vmstats.pgcp_allocok[szc]);
3638 return (pp);
3639 }
3640 MTYPE_NEXT(mnode, mtype, flags);
3641 } while (mtype >= 0);
3642
3643 VM_STAT_ADD(vmm_vmstats.pgcp_allocfailed[szc]);
3644 return (NULL);
3645 }
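
/*
 * Note on the pfnflag feedback above: a successful claim halves
 * pgcpfailcnt[szc] (pfnflag >> 1), which, as the "double the search
 * size" comment notes, lets the next throttled call to
 * page_geti_contig_pages() search roughly twice as large a slice of
 * the pfn range.
 */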
3646
3647 #if defined(__x86)
3648 /*
3649 * Determine the likelihood of finding/coalescing a szc page.
3650 * Return 0 if the likelihood is small otherwise return 1.
3651 *
3652 * For now, be conservative and check only 1g pages and return 0
3653 * if there had been previous coalescing failures and the szc pages
3654 * needed to satisfy request would exhaust most of freemem.
3655 */
3656 int
3657 page_chk_freelist(uint_t szc)
3658 {
3659 pgcnt_t pgcnt;
3660
3661 if (szc <= 1)
3662 return (1);
3663
3664 pgcnt = page_get_pagecnt(szc);
3665 if (pgcpfailcnt[szc] && pgcnt + throttlefree >= freemem) {
3666 VM_STAT_ADD(vmm_vmstats.pcf_deny[szc]);
3667 return (0);
3668 }
3669 VM_STAT_ADD(vmm_vmstats.pcf_allow[szc]);
3670 return (1);
3671 }
3672 #endif
3673
3674 /*
3675 * Find the `best' page on the freelist for this (vp,off) (as,vaddr) pair.
3676 *
3677 * Does its own locking and accounting.
3678 * If PG_MATCH_COLOR is set, then NULL will be returned if there are no
3679 * pages of the proper color even if there are pages of a different color.
3680 *
3681 * Finds a page, removes it, THEN locks it.
3682 */
3683
3684 /*ARGSUSED*/
3685 page_t *
3686 page_get_freelist(struct vnode *vp, u_offset_t off, struct seg *seg,
3687 caddr_t vaddr, size_t size, uint_t flags, struct lgrp *lgrp)
3688 {
3689 struct as *as = seg->s_as;
3690 page_t *pp = NULL;
3691 ulong_t bin;
3692 uchar_t szc;
3693 int mnode;
3694 int mtype;
3695 page_t *(*page_get_func)(int, uint_t, int, uchar_t, uint_t);
3696 lgrp_mnode_cookie_t lgrp_cookie;
3697
3698 page_get_func = page_get_mnode_freelist;
3699
3700 /*
3701 * If we aren't passed a specific lgroup, or passed a freed lgrp,
3702 * assume we wish to allocate near to the current thread's home.
3703 */
3704 if (!LGRP_EXISTS(lgrp))
3705 lgrp = lgrp_home_lgrp();
3706
3707 if (kcage_on) {
3708 if ((flags & (PG_NORELOC | PG_PANIC)) == PG_NORELOC &&
3709 kcage_freemem < kcage_throttlefree + btop(size) &&
3710 curthread != kcage_cageout_thread) {
3711 /*
3712 * Set a "reserve" of kcage_throttlefree pages for
3713 * PG_PANIC and cageout thread allocations.
3714 *
3715 * Everybody else has to serialize in
3716 * page_create_get_something() to get a cage page, so
3717 * that we don't deadlock cageout!
3718 */
3719 return (NULL);
3720 }
3721 } else {
3722 flags &= ~PG_NORELOC;
3723 flags |= PGI_NOCAGE;
3724 }
3725
3726 /* LINTED */
3727 MTYPE_INIT(mtype, vp, vaddr, flags, size);
3728
3729 /*
3730 * Convert size to page size code.
3731 */
3732 if ((szc = page_szc(size)) == (uchar_t)-1)
3733 panic("page_get_freelist: illegal page size request");
3734 ASSERT(szc < mmu_page_sizes);
3735
3736 VM_STAT_ADD(vmm_vmstats.pgf_alloc[szc]);
3737
3738 /* LINTED */
3739 AS_2_BIN(as, seg, vp, vaddr, bin, szc);
3740
3741 ASSERT(bin < PAGE_GET_PAGECOLORS(szc));
3742
3743 /*
3744 * Try to get a local page first, but try remote if we can't
3745 * get a page of the right color.
3746 */
3747 pgretry:
3748 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL);
3749 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3750 pp = page_get_func(mnode, bin, mtype, szc, flags);
3751 if (pp != NULL) {
3752 VM_STAT_ADD(vmm_vmstats.pgf_allocok[szc]);
3753 DTRACE_PROBE4(page__get,
3754 lgrp_t *, lgrp,
3755 int, mnode,
3756 ulong_t, bin,
3757 uint_t, flags);
3758 return (pp);
3759 }
3760 }
3761 ASSERT(pp == NULL);
3762
3763 /*
3764 * for non-SZC0 PAGESIZE requests, check cachelist before checking
3765 * remote free lists. Caller expected to call page_get_cachelist which
3766 * will check local cache lists and remote free lists.
3767 */
3768 if (szc == 0 && ((flags & PGI_PGCPSZC0) == 0)) {
3769 VM_STAT_ADD(vmm_vmstats.pgf_allocdeferred);
3770 return (NULL);
3771 }
3772
3773 ASSERT(szc > 0 || (flags & PGI_PGCPSZC0));
3774
3775 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
3776
3777 if (!(flags & PG_LOCAL)) {
3778 /*
3779 * Try to get a non-local freelist page.
3780 */
3781 LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie);
3782 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3783 pp = page_get_func(mnode, bin, mtype, szc, flags);
3784 if (pp != NULL) {
3785 DTRACE_PROBE4(page__get,
3786 lgrp_t *, lgrp,
3787 int, mnode,
3788 ulong_t, bin,
3789 uint_t, flags);
3790 VM_STAT_ADD(vmm_vmstats.pgf_allocokrem[szc]);
3791 return (pp);
3792 }
3793 }
3794 ASSERT(pp == NULL);
3795 }
3796
3797 /*
3798 * When the cage is off, chances are page_get_contig_pages() will fail
3799 * to lock a large page chunk; therefore, when the cage is off it's not
3800 * called by default. This can be changed via /etc/system.
3801 *
3802 * page_get_contig_pages() also called to acquire a base pagesize page
3803 * for page_create_get_something().
3804 */
3805 if (!(flags & PG_NORELOC) && (pg_contig_disable == 0) &&
3806 (kcage_on || pg_lpgcreate_nocage || szc == 0) &&
3807 (page_get_func != page_get_contig_pages)) {
3808
3809 VM_STAT_ADD(vmm_vmstats.pgf_allocretry[szc]);
3810 page_get_func = page_get_contig_pages;
3811 goto pgretry;
3812 }
3813
3814 if (!(flags & PG_LOCAL) && pgcplimitsearch &&
3815 page_get_func == page_get_contig_pages)
3816 SETPGCPFAILCNT(szc);
3817
3818 VM_STAT_ADD(vmm_vmstats.pgf_allocfailed[szc]);
3819 return (NULL);
3820 }
3821
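/*
 * Caller-side sketch (illustrative only, not an actual caller): the usual
 * consumers of the freelist and cachelist routines are page_create_va() and
 * friends, which for PAGESIZE requests fall back from the freelist to the
 * cachelist roughly as follows:
 *
 *	pp = page_get_freelist(vp, off, seg, vaddr, MMU_PAGESIZE,
 *	    flags, lgrp);
 *	if (pp == NULL)
 *		pp = page_get_cachelist(vp, off, seg, vaddr, flags, lgrp);
 */
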
3822 /*
3823 * Find the `best' page on the cachelist for this (vp,off) (as,vaddr) pair.
3824 *
3825 * Does its own locking.
3826 * If PG_MATCH_COLOR is set, then NULL will be returned if there are no
3827 * pages of the proper color even if there are pages of a different color.
3828 * Otherwise, scan the bins for ones with pages. For each bin with pages,
3829 * try to lock one of them. If no page can be locked, try the
3830  * next bin.  Return NULL if a page cannot be found and locked.
3831  *
3832  * Finds a page, tries to lock it, then removes it.
3833 */
3834
3835 /*ARGSUSED*/
3836 page_t *
3837 page_get_cachelist(struct vnode *vp, u_offset_t off, struct seg *seg,
3838 caddr_t vaddr, uint_t flags, struct lgrp *lgrp)
3839 {
3840 page_t *pp;
3841 struct as *as = seg->s_as;
3842 ulong_t bin;
3843 /*LINTED*/
3844 int mnode;
3845 int mtype;
3846 lgrp_mnode_cookie_t lgrp_cookie;
3847
3848 /*
3849 	 * If we aren't passed a specific lgroup, or are passed a freed lgrp,
3850 	 * assume we wish to allocate near the current thread's home.
3851 */
3852 if (!LGRP_EXISTS(lgrp))
3853 lgrp = lgrp_home_lgrp();
3854
3855 if (!kcage_on) {
3856 flags &= ~PG_NORELOC;
3857 flags |= PGI_NOCAGE;
3858 }
3859
3860 if ((flags & (PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == PG_NORELOC &&
3861 kcage_freemem <= kcage_throttlefree) {
3862 /*
3863 * Reserve kcage_throttlefree pages for critical kernel
3864 * threads.
3865 *
3866 * Everybody else has to go to page_create_get_something()
3867 * to get a cage page, so we don't deadlock cageout.
3868 */
3869 return (NULL);
3870 }
3871
3872 /* LINTED */
3873 AS_2_BIN(as, seg, vp, vaddr, bin, 0);
3874
3875 ASSERT(bin < PAGE_GET_PAGECOLORS(0));
3876
3877 /* LINTED */
3878 MTYPE_INIT(mtype, vp, vaddr, flags, MMU_PAGESIZE);
3879
3880 VM_STAT_ADD(vmm_vmstats.pgc_alloc);
3881
3882 /*
3883 * Try local cachelists first
3884 */
3885 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL);
3886 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3887 pp = page_get_mnode_cachelist(bin, flags, mnode, mtype);
3888 if (pp != NULL) {
3889 VM_STAT_ADD(vmm_vmstats.pgc_allocok);
3890 DTRACE_PROBE4(page__get,
3891 lgrp_t *, lgrp,
3892 int, mnode,
3893 ulong_t, bin,
3894 uint_t, flags);
3895 return (pp);
3896 }
3897 }
3898
3899 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
3900
3901 /*
3902 * Try freelists/cachelists that are farther away
3903 * This is our only chance to allocate remote pages for PAGESIZE
3904 * requests.
3905 */
3906 LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie);
3907 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3908 pp = page_get_mnode_freelist(mnode, bin, mtype,
3909 0, flags);
3910 if (pp != NULL) {
3911 VM_STAT_ADD(vmm_vmstats.pgc_allocokdeferred);
3912 DTRACE_PROBE4(page__get,
3913 lgrp_t *, lgrp,
3914 int, mnode,
3915 ulong_t, bin,
3916 uint_t, flags);
3917 return (pp);
3918 }
3919 pp = page_get_mnode_cachelist(bin, flags, mnode, mtype);
3920 if (pp != NULL) {
3921 VM_STAT_ADD(vmm_vmstats.pgc_allocokrem);
3922 DTRACE_PROBE4(page__get,
3923 lgrp_t *, lgrp,
3924 int, mnode,
3925 ulong_t, bin,
3926 uint_t, flags);
3927 return (pp);
3928 }
3929 }
3930
3931 VM_STAT_ADD(vmm_vmstats.pgc_allocfailed);
3932 return (NULL);
3933 }
3934
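/*
 * Scan the cachelists of the given mnode/mtype for a free page in the
 * requested color bin or a color-equivalent bin, stepping through the bins
 * with a page_list_walker.  On success the page is returned locked SE_EXCL
 * and removed from the cachelist; it is still hashed in on its vnode, so
 * callers such as page_get_replacement_page() subsequently page_hashout()
 * it.  Returns NULL if no page could be locked.
 */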
3935 page_t *
3936 page_get_mnode_cachelist(uint_t bin, uint_t flags, int mnode, int mtype)
3937 {
3938 kmutex_t *pcm;
3939 page_t *pp, *first_pp;
3940 uint_t sbin;
3941 int plw_initialized;
3942 page_list_walker_t plw;
3943
3944 VM_STAT_ADD(vmm_vmstats.pgmc_alloc);
3945
3946 /* LINTED */
3947 MTYPE_START(mnode, mtype, flags);
3948 if (mtype < 0) { /* mnode does not have memory in mtype range */
3949 VM_STAT_ADD(vmm_vmstats.pgmc_allocempty);
3950 return (NULL);
3951 }
3952
3953 try_again:
3954
3955 plw_initialized = 0;
3956 plw.plw_ceq_dif = 1;
3957
3958 /*
3959 	 * Only hold one cachelist lock at a time; that way we
3960 * can start anywhere and not have to worry about lock
3961 * ordering.
3962 */
3963
3964 for (plw.plw_count = 0;
3965 plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) {
3966 sbin = bin;
3967 do {
3968
3969 if (!PAGE_CACHELISTS(mnode, bin, mtype))
3970 goto bin_empty_1;
3971 pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST);
3972 mutex_enter(pcm);
3973 pp = PAGE_CACHELISTS(mnode, bin, mtype);
3974 if (pp == NULL)
3975 goto bin_empty_0;
3976
3977 first_pp = pp;
3978 ASSERT(pp->p_vnode);
3979 ASSERT(PP_ISAGED(pp) == 0);
3980 ASSERT(pp->p_szc == 0);
3981 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
3982 while (IS_DUMP_PAGE(pp) || !page_trylock(pp, SE_EXCL)) {
3983 pp = pp->p_next;
3984 ASSERT(pp->p_szc == 0);
3985 if (pp == first_pp) {
3986 /*
3987 * We have searched the complete list!
3988 * And all of them (might only be one)
3989 * are locked. This can happen since
3990 * these pages can also be found via
3991 * the hash list. When found via the
3992 * hash list, they are locked first,
3993 * then removed. We give up to let the
3994 * other thread run.
3995 */
3996 pp = NULL;
3997 break;
3998 }
3999 ASSERT(pp->p_vnode);
4000 ASSERT(PP_ISFREE(pp));
4001 ASSERT(PP_ISAGED(pp) == 0);
4002 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) ==
4003 mnode);
4004 }
4005
4006 if (pp) {
4007 page_t **ppp;
4008 /*
4009 * Found and locked a page.
4010 * Pull it off the list.
4011 */
4012 ASSERT(mtype == PP_2_MTYPE(pp));
4013 ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
4014 page_sub(ppp, pp);
4015 /*
4016 * Subtract counters before releasing pcm mutex
4017 * to avoid a race with page_freelist_coalesce
4018 * and page_freelist_split.
4019 */
4020 page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST);
4021 mutex_exit(pcm);
4022 ASSERT(pp->p_vnode);
4023 ASSERT(PP_ISAGED(pp) == 0);
4024 #if defined(__sparc)
4025 ASSERT(!kcage_on ||
4026 (flags & PG_NORELOC) == 0 ||
4027 PP_ISNORELOC(pp));
4028 if (PP_ISNORELOC(pp)) {
4029 kcage_freemem_sub(1);
4030 }
4031 #endif
4032 				VM_STAT_ADD(vmm_vmstats.pgmc_allocok);
4033 return (pp);
4034 }
4035 bin_empty_0:
4036 mutex_exit(pcm);
4037 bin_empty_1:
4038 if (plw_initialized == 0) {
4039 page_list_walk_init(0, flags, bin, 0, 1, &plw);
4040 plw_initialized = 1;
4041 }
4042 /* calculate the next bin with equivalent color */
4043 bin = ADD_MASKED(bin, plw.plw_bin_step,
4044 plw.plw_ceq_mask[0], plw.plw_color_mask);
4045 } while (sbin != bin);
4046
4047 if (plw.plw_ceq_dif > 1)
4048 bin = page_list_walk_next_bin(0, bin, &plw);
4049 }
4050
4051 MTYPE_NEXT(mnode, mtype, flags);
4052 if (mtype >= 0)
4053 goto try_again;
4054
4055 VM_STAT_ADD(vmm_vmstats.pgmc_allocfailed);
4056 return (NULL);
4057 }
4058
4059 #ifdef DEBUG
4060 #define REPL_PAGE_STATS
4061 #endif /* DEBUG */
4062
4063 #ifdef REPL_PAGE_STATS
4064 struct repl_page_stats {
4065 uint_t ngets;
4066 uint_t ngets_noreloc;
4067 uint_t npgr_noreloc;
4068 uint_t nnopage_first;
4069 uint_t nnopage;
4070 uint_t nhashout;
4071 uint_t nnofree;
4072 uint_t nnext_pp;
4073 } repl_page_stats;
4074 #define REPL_STAT_INCR(v) atomic_inc_32(&repl_page_stats.v)
4075 #else /* REPL_PAGE_STATS */
4076 #define REPL_STAT_INCR(v)
4077 #endif /* REPL_PAGE_STATS */
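
/*
 * Note: when REPL_PAGE_STATS is in effect (DEBUG kernels), the counters
 * above can be inspected from a debugger; for example, an mdb invocation
 * along the lines of "repl_page_stats::print struct repl_page_stats" would
 * dump them (illustrative, not required by this code).
 */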
4078
4079 int pgrppgcp;
4080
4081 /*
4082 * The freemem accounting must be done by the caller.
4083  * First we try to get a replacement page of the same size as like_pp;
4084  * if that is not possible, we fall back to a set of discontiguous
4085  * PAGESIZE pages.
4086 */
4087 page_t *
4088 page_get_replacement_page(page_t *orig_like_pp, struct lgrp *lgrp_target,
4089 uint_t pgrflags)
4090 {
4091 page_t *like_pp;
4092 page_t *pp, *pplist;
4093 page_t *pl = NULL;
4094 ulong_t bin;
4095 int mnode, page_mnode;
4096 int szc;
4097 spgcnt_t npgs, pg_cnt;
4098 pfn_t pfnum;
4099 int mtype;
4100 int flags = 0;
4101 lgrp_mnode_cookie_t lgrp_cookie;
4102 lgrp_t *lgrp;
4103
4104 mnode = 0;
4105 lgrp = NULL;
4106 REPL_STAT_INCR(ngets);
4107 like_pp = orig_like_pp;
4108 ASSERT(PAGE_EXCL(like_pp));
4109
4110 szc = like_pp->p_szc;
4111 npgs = page_get_pagecnt(szc);
4112 /*
4113 * Now we reset like_pp to the base page_t.
4114 * That way, we won't walk past the end of this 'szc' page.
4115 */
4116 pfnum = PFN_BASE(like_pp->p_pagenum, szc);
4117 like_pp = page_numtopp_nolock(pfnum);
4118 ASSERT(like_pp->p_szc == szc);
4119
4120 if (PP_ISNORELOC(like_pp)) {
4121 ASSERT(kcage_on);
4122 REPL_STAT_INCR(ngets_noreloc);
4123 flags = PGI_RELOCONLY;
4124 } else if (pgrflags & PGR_NORELOC) {
4125 ASSERT(kcage_on);
4126 REPL_STAT_INCR(npgr_noreloc);
4127 flags = PG_NORELOC;
4128 }
4129
4130 /*
4131 * Kernel pages must always be replaced with the same size
4132 * pages, since we cannot properly handle demotion of kernel
4133 * pages.
4134 */
4135 if (PP_ISKAS(like_pp))
4136 pgrflags |= PGR_SAMESZC;
4137
4138 MTYPE_PGR_INIT(mtype, flags, like_pp, npgs);
4139
4140 while (npgs) {
4141 pplist = NULL;
4142 for (;;) {
4143 pg_cnt = page_get_pagecnt(szc);
4144 bin = PP_2_BIN(like_pp);
4145 ASSERT(like_pp->p_szc == orig_like_pp->p_szc);
4146 ASSERT(pg_cnt <= npgs);
4147
4148 /*
4149 * If an lgroup was specified, try to get the
4150 * page from that lgroup.
4151 * NOTE: Must be careful with code below because
4152 * lgroup may disappear and reappear since there
4153 * is no locking for lgroup here.
4154 */
4155 if (LGRP_EXISTS(lgrp_target)) {
4156 /*
4157 				 * Keep the local variable for the lgroup separate
4158 				 * from the lgroup argument, since this code should
4159 				 * only be exercised when the lgroup argument
4160 				 * exists.
4161 */
4162 lgrp = lgrp_target;
4163
4164 /* Try the lgroup's freelists first */
4165 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4166 LGRP_SRCH_LOCAL);
4167 while ((pplist == NULL) &&
4168 (mnode = lgrp_memnode_choose(&lgrp_cookie))
4169 != -1) {
4170 pplist =
4171 page_get_mnode_freelist(mnode, bin,
4172 mtype, szc, flags);
4173 }
4174
4175 /*
4176 				 * Now try its cachelists if this is a
4177 				 * small page.  There is no need to do it for
4178 				 * larger ones since page_freelist_coalesce()
4179 				 * already failed.
4180 */
4181 if (pplist != NULL || szc != 0)
4182 break;
4183
4184 				/* Now try its cachelists */
4185 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4186 LGRP_SRCH_LOCAL);
4187
4188 while ((pplist == NULL) &&
4189 (mnode = lgrp_memnode_choose(&lgrp_cookie))
4190 != -1) {
4191 pplist =
4192 page_get_mnode_cachelist(bin, flags,
4193 mnode, mtype);
4194 }
4195 if (pplist != NULL) {
4196 page_hashout(pplist, NULL);
4197 PP_SETAGED(pplist);
4198 REPL_STAT_INCR(nhashout);
4199 break;
4200 }
4201 /* Done looking in this lgroup. Bail out. */
4202 break;
4203 }
4204
4205 /*
4206 			 * No lgroup was specified (or the lgroup was removed
4207 			 * by DR), so just try to get the page as close to
4208 			 * like_pp's mnode as possible.
4209 * First try the local freelist...
4210 */
4211 mnode = PP_2_MEM_NODE(like_pp);
4212 pplist = page_get_mnode_freelist(mnode, bin,
4213 mtype, szc, flags);
4214 if (pplist != NULL)
4215 break;
4216
4217 REPL_STAT_INCR(nnofree);
4218
4219 /*
4220 			 * ...then the local cachelist.  There is no need to do it for
4221 			 * larger pages because page_freelist_coalesce() already
4222 * failed there anyway.
4223 */
4224 if (szc == 0) {
4225 pplist = page_get_mnode_cachelist(bin, flags,
4226 mnode, mtype);
4227 if (pplist != NULL) {
4228 page_hashout(pplist, NULL);
4229 PP_SETAGED(pplist);
4230 REPL_STAT_INCR(nhashout);
4231 break;
4232 }
4233 }
4234
4235 /* Now try remote freelists */
4236 page_mnode = mnode;
4237 lgrp =
4238 lgrp_hand_to_lgrp(MEM_NODE_2_LGRPHAND(page_mnode));
4239 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4240 LGRP_SRCH_HIER);
4241 while (pplist == NULL &&
4242 (mnode = lgrp_memnode_choose(&lgrp_cookie))
4243 != -1) {
4244 /*
4245 * Skip local mnode.
4246 */
4247 if ((mnode == page_mnode) ||
4248 (mem_node_config[mnode].exists == 0))
4249 continue;
4250
4251 pplist = page_get_mnode_freelist(mnode,
4252 bin, mtype, szc, flags);
4253 }
4254
4255 if (pplist != NULL)
4256 break;
4257
4258
4259 /* Now try remote cachelists */
4260 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4261 LGRP_SRCH_HIER);
4262 while (pplist == NULL && szc == 0) {
4263 mnode = lgrp_memnode_choose(&lgrp_cookie);
4264 if (mnode == -1)
4265 break;
4266 /*
4267 * Skip local mnode.
4268 */
4269 if ((mnode == page_mnode) ||
4270 (mem_node_config[mnode].exists == 0))
4271 continue;
4272
4273 pplist = page_get_mnode_cachelist(bin,
4274 flags, mnode, mtype);
4275
4276 if (pplist != NULL) {
4277 page_hashout(pplist, NULL);
4278 PP_SETAGED(pplist);
4279 REPL_STAT_INCR(nhashout);
4280 break;
4281 }
4282 }
4283
4284 /*
4285 * Break out of while loop under the following cases:
4286 * - If we successfully got a page.
4287 * - If pgrflags specified only returning a specific
4288 * page size and we could not find that page size.
4289 * - If we could not satisfy the request with PAGESIZE
4290 * or larger pages.
4291 */
4292 if (pplist != NULL || szc == 0)
4293 break;
4294
4295 if ((pgrflags & PGR_SAMESZC) || pgrppgcp) {
4296 /* try to find contig page */
4297
4298 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4299 LGRP_SRCH_HIER);
4300
4301 while ((pplist == NULL) &&
4302 (mnode =
4303 lgrp_memnode_choose(&lgrp_cookie))
4304 != -1) {
4305 pplist = page_get_contig_pages(
4306 mnode, bin, mtype, szc,
4307 flags | PGI_PGCPHIPRI);
4308 }
4309 break;
4310 }
4311
4312 /*
4313 * The correct thing to do here is try the next
4314 * page size down using szc--. Due to a bug
4315 * with the processing of HAT_RELOAD_SHARE
4316 * where the sfmmu_ttecnt arrays of all
4317 * hats sharing an ISM segment don't get updated,
4318 * using intermediate size pages for relocation
4319 * can lead to continuous page faults.
4320 */
4321 szc = 0;
4322 }
4323
4324 if (pplist != NULL) {
4325 DTRACE_PROBE4(page__get,
4326 lgrp_t *, lgrp,
4327 int, mnode,
4328 ulong_t, bin,
4329 uint_t, flags);
4330
4331 while (pplist != NULL && pg_cnt--) {
4332 ASSERT(pplist != NULL);
4333 pp = pplist;
4334 page_sub(&pplist, pp);
4335 PP_CLRFREE(pp);
4336 PP_CLRAGED(pp);
4337 page_list_concat(&pl, &pp);
4338 npgs--;
4339 like_pp = like_pp + 1;
4340 REPL_STAT_INCR(nnext_pp);
4341 }
4342 ASSERT(pg_cnt == 0);
4343 } else {
4344 break;
4345 }
4346 }
4347
4348 if (npgs) {
4349 /*
4350 * We were unable to allocate the necessary number
4351 * of pages.
4352 		 * We need to free up any pages already collected on pl.
4353 */
4354 REPL_STAT_INCR(nnopage);
4355 page_free_replacement_page(pl);
4356 return (NULL);
4357 } else {
4358 return (pl);
4359 }
4360 }
4361
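/*
 * Caller-side sketch (illustrative only): a relocation path typically holds
 * the original page SE_EXCL and then does roughly
 *
 *	ASSERT(PAGE_EXCL(pp));
 *	repl = page_get_replacement_page(pp, NULL, 0);
 *	if (repl != NULL)
 *		(void) page_relocate(&pp, &repl, 1, 1, &nrelocp, NULL);
 *
 * The freemem accounting noted above, and freeing of an unused replacement
 * list via page_free_replacement_page(), remain the caller's responsibility.
 */
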
4362 /*
4363  * Demote a free large page to its constituent pages.
4364 */
4365 void
4366 page_demote_free_pages(page_t *pp)
4367 {
4368
4369 int mnode;
4370
4371 ASSERT(pp != NULL);
4372 ASSERT(PAGE_LOCKED(pp));
4373 ASSERT(PP_ISFREE(pp));
4374 ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);
4375
4376 mnode = PP_2_MEM_NODE(pp);
4377 page_freelist_lock(mnode);
4378 if (pp->p_szc != 0) {
4379 (void) page_demote(mnode, PFN_BASE(pp->p_pagenum,
4380 pp->p_szc), 0, pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
4381 }
4382 page_freelist_unlock(mnode);
4383 ASSERT(pp->p_szc == 0);
4384 }
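/*
 * On return the original large page has been split in place: pp->p_szc is 0
 * and, since page_demote() is invoked with PC_FREE, the constituent pages
 * remain on the free lists of the same mnode.  The page lock held by the
 * caller is not released here.
 */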
4385
4386 /*
4387 * Factor in colorequiv to check additional 'equivalent' bins.
4388 * colorequiv may be set in /etc/system
4389 */
4390 void
4391 page_set_colorequiv_arr(void)
4392 {
4393 if (colorequiv > 1) {
4394 int i;
4395 uint_t sv_a = lowbit(colorequiv) - 1;
4396
4397 if (sv_a > 15)
4398 sv_a = 15;
4399
4400 for (i = 0; i < MMU_PAGE_SIZES; i++) {
4401 uint_t colors;
4402 uint_t a = sv_a;
4403
4404 if ((colors = hw_page_array[i].hp_colors) <= 1) {
4405 continue;
4406 }
4407 while ((colors >> a) == 0)
4408 a--;
4409 if ((a << 4) > colorequivszc[i]) {
4410 colorequivszc[i] = (a << 4);
4411 }
4412 }
4413 }
4414 }
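
/*
 * Worked example (values illustrative): with colorequiv set to 4 in
 * /etc/system, sv_a = lowbit(4) - 1 = 2.  For a page size with 128 hardware
 * colors, (128 >> 2) != 0, so 'a' stays 2 and colorequivszc[i] becomes
 * (2 << 4) = 0x20, widening the set of bins treated as color-equivalent for
 * that page size.
 */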
4415