/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989  AT&T	*/
/*	  All Rights Reserved	*/

/*
 * Portions of this source code were derived from Berkeley 4.3 BSD
 * under license from the Regents of the University of California.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * This file contains common functions to access and manage the page lists.
 * Many of these routines originated from platform dependent modules
 * (sun4/vm/vm_dep.c, i86pc/vm/vm_machdep.c) and were modified to function in
 * a platform independent manner.
 *
 * vm/vm_dep.h provides for platform specific support.
 */

#include <sys/types.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/systm.h>
#include <sys/atomic.h>
#include <sys/sysmacros.h>
#include <vm/as.h>
#include <vm/page.h>
#include <vm/seg_kmem.h>
#include <vm/seg_vn.h>
#include <sys/vmsystm.h>
#include <sys/memnode.h>
#include <vm/vm_dep.h>
#include <sys/lgrp.h>
#include <sys/mem_config.h>
#include <sys/callb.h>
#include <sys/mem_cage.h>
#include <sys/sdt.h>

extern uint_t	vac_colors;

#define	MAX_PRAGMA_ALIGN	128

/* vm_cpu_data0 for the boot cpu before kmem is initialized */

#if L2CACHE_ALIGN_MAX <= MAX_PRAGMA_ALIGN
#pragma align L2CACHE_ALIGN_MAX(vm_cpu_data0)
#else
#pragma align MAX_PRAGMA_ALIGN(vm_cpu_data0)
#endif
char		vm_cpu_data0[VM_CPU_DATA_PADSIZE];

/*
 * number of page colors equivalent to requested color in page_get routines.
 * If set, keeps large pages intact longer and keeps MPO allocation
 * from the local mnode in favor of acquiring the 'correct' page color from
 * a demoted large page or from a remote mnode.
 */
uint_t	colorequiv;

/*
 * color equivalency mask for each page size.
 * Mask is computed based on cpu L2$ way sizes and the colorequiv global.
 * High 4 bits determine the number of high order bits of the color to ignore.
 * Low 4 bits determine the number of low order bits of the color to ignore
 * (it's only relevant for hashed index based page coloring).
 */
uchar_t colorequivszc[MMU_PAGE_SIZES];
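
/*
 * A sketch of how the colorequivszc[] encoding above can be turned into an
 * equivalency mask (illustrative only; 'szc' is assumed to be a valid size
 * code and 'colors' its color count):
 *
 *	uint_t colors = PAGE_GET_PAGECOLORS(szc);
 *	uint_t nhi = colorequivszc[szc] >> 4;	(high order bits to ignore)
 *	uint_t nlo = colorequivszc[szc] & 0xf;	(low order bits to ignore)
 *	uint_t ceq_mask = ((colors - 1) >> nhi) & ~((1 << nlo) - 1);
 *
 * e.g. with 32 colors, nhi = 1 and nlo = 0 give ceq_mask = 0xf, so colors
 * that differ only in their top bit are treated as equivalent.
 */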

/*
 * if set, specifies the percentage of large pages that are free from within
 * a large page region before attempting to lock those pages for
 * page_get_contig_pages processing.
 *
 * Should be turned on when kpr is available when page_trylock_contig_pages
 * can be more selective.
 */

int	ptcpthreshold;

/*
 * Limit page get contig page search based on failure cnts in pgcpfailcnt[].
 * Enabled by default via pgcplimitsearch.
 *
 * pgcpfailcnt[] is bounded by PGCPFAILMAX (>= 1/2 of installed
 * memory). When reached, pgcpfailcnt[] is reset to 1/2 of this upper
 * bound. This upper bound range guarantees:
 *	- all large page 'slots' will be searched over time
 *	- the minimum (1) large page candidates considered on each pgcp call
 *	- count doesn't wrap around to 0
 */
pgcnt_t	pgcpfailcnt[MMU_PAGE_SIZES];
int	pgcplimitsearch = 1;

#define	PGCPFAILMAX		(1 << (highbit(physinstalled) - 1))
#define	SETPGCPFAILCNT(szc)					\
	if (++pgcpfailcnt[szc] >= PGCPFAILMAX)			\
		pgcpfailcnt[szc] = PGCPFAILMAX / 2;

#ifdef VM_STATS
struct vmm_vmstats_str  vmm_vmstats;

#endif /* VM_STATS */

#if defined(__sparc)
#define	LPGCREATE	0
#else
/* enable page_get_contig_pages */
#define	LPGCREATE	1
#endif

int pg_contig_disable;
int pg_lpgcreate_nocage = LPGCREATE;

/*
 * page_freelist_split pfn flag to signify no hi pfn requirement.
 */
#define	PFNNULL		0

/* Flags involved in promotion and demotion routines */
#define	PC_FREE		0x1	/* put page on freelist */
#define	PC_ALLOC	0x2	/* return page for allocation */

/*
 * Flag for page_demote to be used with PC_FREE to denote that we don't care
 * what the color is as the color parameter to the function is ignored.
 */
#define	PC_NO_COLOR	(-1)

/* mtype value for page_promote to use when mtype does not matter */
#define	PC_MTYPE_ANY	(-1)

/*
 * page counters candidates info
 * See page_ctrs_cands comment below for more details.
 * fields are as follows:
 *	pcc_pages_free:		# pages which freelist coalesce can create
 *	pcc_color_free:		pointer to page free counts per color
 */
typedef struct pcc_info {
	pgcnt_t	pcc_pages_free;
	pgcnt_t	*pcc_color_free;
} pcc_info_t;

/*
 * On big machines it can take a long time to check page_counters
 * arrays. page_ctrs_cands is a summary array whose elements are a dynamically
 * updated sum of all elements of the corresponding page_counters arrays.
 * page_freelist_coalesce() searches page_counters only if an appropriate
 * element of page_ctrs_cands array is greater than 0.
 *
 * page_ctrs_cands is indexed by mutex (i), region (r), mnode (m), mrange (g)
 */
pcc_info_t **page_ctrs_cands[NPC_MUTEX][MMU_PAGE_SIZES];

/*
 * Return in val the total number of free pages which can be created
 * for the given mnode (m), mrange (g), and region size (r)
 */
#define	PGCTRS_CANDS_GETVALUE(m, g, r, val) {				\
	int i;								\
	val = 0;							\
	for (i = 0; i < NPC_MUTEX; i++) {				\
	    val += page_ctrs_cands[i][(r)][(m)][(g)].pcc_pages_free;	\
	}								\
}

/*
 * Return in val the total number of free pages which can be created
 * for the given mnode (m), mrange (g), region size (r), and color (c)
 */
#define	PGCTRS_CANDS_GETVALUECOLOR(m, g, r, c, val) {			\
	int i;								\
	val = 0;							\
	ASSERT((c) < PAGE_GET_PAGECOLORS(r));				\
	for (i = 0; i < NPC_MUTEX; i++) {				\
	    val +=							\
		page_ctrs_cands[i][(r)][(m)][(g)].pcc_color_free[(c)];	\
	}								\
}
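
/*
 * A sketch of how the candidate counts above are typically consulted
 * (illustrative only, not a routine in this file): before paying the cost
 * of walking the page_counters arrays, a coalescer can bail out early when
 * no candidates exist for the given mnode, mrange and size code.
 *
 *	pgcnt_t pgfree;
 *
 *	PGCTRS_CANDS_GETVALUE(mnode, mrange, szc, pgfree);
 *	if (pgfree == 0)
 *		return (NULL);		(nothing can be coalesced at szc)
 */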

/*
 * We can only allow a single thread to update a counter within the physical
 * range of the largest supported page size. That is the finest granularity
 * possible since the counter values are dependent on each other
 * as you move across region sizes. PP_CTR_LOCK_INDX is used to determine the
 * ctr_mutex lock index for a particular physical range.
 */
static kmutex_t	*ctr_mutex[NPC_MUTEX];

#define	PP_CTR_LOCK_INDX(pp)						\
	(((pp)->p_pagenum >>						\
	    (PAGE_BSZS_SHIFT(mmu_page_sizes - 1))) & (NPC_MUTEX - 1))

#define	INVALID_COLOR	0xffffffff
#define	INVALID_MASK	0xffffffff

/*
 * Local functions prototypes.
 */

void page_ctr_add(int, int, page_t *, int);
void page_ctr_add_internal(int, int, page_t *, int);
void page_ctr_sub(int, int, page_t *, int);
void page_ctr_sub_internal(int, int, page_t *, int);
void page_freelist_lock(int);
void page_freelist_unlock(int);
page_t *page_promote(int, pfn_t, uchar_t, int, int);
page_t *page_demote(int, pfn_t, uchar_t, uchar_t, int, int);
page_t *page_freelist_split(uchar_t,
    uint_t, int, int, pfn_t, page_list_walker_t *);
page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int);
static int page_trylock_cons(page_t *pp, se_t se);

/*
 * The page_counters array below is used to keep track of free contiguous
 * physical memory.  A hw_page_map_t will be allocated per mnode per szc.
 * This contains an array of counters, the size of the array, a shift value
 * used to convert a pagenum into a counter array index or vice versa, as
 * well as a cache of the last successful index to be promoted to a larger
 * page size.  As an optimization, we keep track of the last successful index
 * to be promoted per page color for the given size region, and this is
 * allocated dynamically based upon the number of colors for a given
 * region size.
 *
 * Conceptually, the page counters are represented as:
 *
 *	page_counters[region_size][mnode]
 *
 *	region_size:	size code of a candidate larger page made up
 *			of contiguous free smaller pages.
 *
 *	page_counters[region_size][mnode].hpm_counters[index]:
 *		represents how many (region_size - 1) pages either
 *		exist or can be created within the given index range.
 *
 * Let's look at a sparc example:
 *	If we want to create a free 512k page, we look at region_size 2
 *	for the mnode we want. We calculate the index and look at a specific
 *	hpm_counters location. If we see 8 (FULL_REGION_CNT on sparc) at
 *	this location, it means that 8 64k pages either exist or can be created
 *	from 8K pages in order to make a single free 512k page at the given
 *	index. Note that when a region is full, it will contribute to the
 *	counts in the region above it. Thus we will not know what page
 *	size the free pages will be which can be promoted to this new free
 *	page unless we look at all regions below the current region.
 */
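
/*
 * A short worked example of the scheme described above, using the sparc
 * sizes from the comment (8K base pages, 64K at region_size 1, 512K at
 * region_size 2):
 *
 *	PNUM_TO_IDX(mnode, 2, pfn) selects the 512K-aligned run of 64 base
 *	pages that contains pfn.  When PAGE_COUNTERS(mnode, 2, idx) reaches
 *	FULL_REGION_CNT(2) == 8, all eight 64K constituents are free (or can
 *	be made free), and page_promote() can build one free 512K page
 *	starting at IDX_TO_PNUM(mnode, 2, idx).
 */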

/*
 * Note: hpmctr_t is defined in platform vm_dep.h
 * hw_page_map_t contains all the information needed for the page_counters
 * logic. The fields are as follows:
 *
 *	hpm_counters:	dynamically allocated array to hold counter data
 *	hpm_entries:	entries in hpm_counters
 *	hpm_shift:	shift for pnum/array index conv
 *	hpm_base:	PFN mapped to counter index 0
 *	hpm_color_current:	last index in counter array for this color at
 *				which we successfully created a large page
 */
typedef struct hw_page_map {
	hpmctr_t	*hpm_counters;
	size_t		hpm_entries;
	int		hpm_shift;
	pfn_t		hpm_base;
	size_t		*hpm_color_current[MAX_MNODE_MRANGES];
} hw_page_map_t;

/*
 * Element zero is not used, but is allocated for convenience.
 */
static hw_page_map_t *page_counters[MMU_PAGE_SIZES];

/*
 * Cached value of MNODE_RANGE_CNT(mnode).
 * This is a function call in x86.
 */
static int mnode_nranges[MAX_MEM_NODES];
static int mnode_maxmrange[MAX_MEM_NODES];

/*
 * The following macros are convenient ways to get access to the individual
 * elements of the page_counters arrays. They can be used on both
 * the left side and right side of equations.
 */
#define	PAGE_COUNTERS(mnode, rg_szc, idx)			\
	(page_counters[(rg_szc)][(mnode)].hpm_counters[(idx)])

#define	PAGE_COUNTERS_COUNTERS(mnode, rg_szc)			\
	(page_counters[(rg_szc)][(mnode)].hpm_counters)

#define	PAGE_COUNTERS_SHIFT(mnode, rg_szc)			\
	(page_counters[(rg_szc)][(mnode)].hpm_shift)

#define	PAGE_COUNTERS_ENTRIES(mnode, rg_szc)			\
	(page_counters[(rg_szc)][(mnode)].hpm_entries)

#define	PAGE_COUNTERS_BASE(mnode, rg_szc)			\
	(page_counters[(rg_szc)][(mnode)].hpm_base)

#define	PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, rg_szc, g)	\
	(page_counters[(rg_szc)][(mnode)].hpm_color_current[(g)])

#define	PAGE_COUNTERS_CURRENT_COLOR(mnode, rg_szc, color, mrange)	\
	(page_counters[(rg_szc)][(mnode)].				\
	hpm_color_current[(mrange)][(color)])

#define	PNUM_TO_IDX(mnode, rg_szc, pnum)			\
	(((pnum) - PAGE_COUNTERS_BASE((mnode), (rg_szc))) >>	\
	    PAGE_COUNTERS_SHIFT((mnode), (rg_szc)))

#define	IDX_TO_PNUM(mnode, rg_szc, index)			\
	(PAGE_COUNTERS_BASE((mnode), (rg_szc)) +		\
	    ((index) << PAGE_COUNTERS_SHIFT((mnode), (rg_szc))))

/*
 * Protects the hpm_counters and hpm_color_current memory from changing while
 * looking at page counters information.
 * Grab the write lock to modify what these fields point at.
 * Grab the read lock to prevent any pointers from changing.
 * The write lock can not be held during memory allocation due to a possible
 * recursion deadlock with trying to grab the read lock while the
 * write lock is already held.
 */
krwlock_t page_ctrs_rwlock[MAX_MEM_NODES];
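
/*
 * Sketch of the locking discipline implied by the comment above
 * (illustrative only): code that walks the counters takes the lock as
 * reader, while page_ctrs_adjust() preallocates replacement arrays and
 * takes it as writer only long enough to swap pointers.
 *
 *	rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
 *	... examine PAGE_COUNTERS(mnode, r, idx) ...
 *	rw_exit(&page_ctrs_rwlock[mnode]);
 */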

/*
 * initialize cpu_vm_data to point at cache aligned vm_cpu_data_t.
 */
void
cpu_vm_data_init(struct cpu *cp)
{
	if (cp == CPU0) {
		cp->cpu_vm_data = (void *)&vm_cpu_data0;
	} else {
		void	*kmptr;
		int	align;
		size_t	sz;

		align = (L2CACHE_ALIGN) ? L2CACHE_ALIGN : L2CACHE_ALIGN_MAX;
		sz = P2ROUNDUP(sizeof (vm_cpu_data_t), align) + align;
		kmptr = kmem_zalloc(sz, KM_SLEEP);
		cp->cpu_vm_data = (void *) P2ROUNDUP((uintptr_t)kmptr, align);
		((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr = kmptr;
		((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize = sz;
	}
}

/*
 * free cpu_vm_data
 */
void
cpu_vm_data_destroy(struct cpu *cp)
{
	if (cp->cpu_seqid && cp->cpu_vm_data) {
		ASSERT(cp != CPU0);
		kmem_free(((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr,
		    ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize);
	}
	cp->cpu_vm_data = NULL;
}


/*
 * page size to page size code
 */
int
page_szc(size_t pagesize)
{
	int	i = 0;

	while (hw_page_array[i].hp_size) {
		if (pagesize == hw_page_array[i].hp_size)
			return (i);
		i++;
	}
	return (-1);
}

/*
 * page size to page size code with the restriction that it be a supported
 * user page size.  If it's not a supported user page size, -1 will be
 * returned.
 */
int
page_szc_user_filtered(size_t pagesize)
{
	int szc = page_szc(pagesize);
	if ((szc != -1) && (SZC_2_USERSZC(szc) != -1)) {
		return (szc);
	}
	return (-1);
}

/*
 * Return how many page sizes are available for the user to use.  This is
 * what the hardware supports and not based upon how the OS implements the
 * support of different page sizes.
 *
 * If legacy is non-zero, return the number of pagesizes available to legacy
 * applications. The number of legacy page sizes might be less than the
 * exported user page sizes. This is to prevent legacy applications that
 * use the largest page size returned from getpagesizes(3c) from inadvertently
 * using the 'new' large pagesizes.
 */
uint_t
page_num_user_pagesizes(int legacy)
{
	if (legacy)
		return (mmu_legacy_page_sizes);
	return (mmu_exported_page_sizes);
}

uint_t
page_num_pagesizes(void)
{
	return (mmu_page_sizes);
}

/*
 * returns the count of the number of base pagesize pages associated with szc
 */
pgcnt_t
page_get_pagecnt(uint_t szc)
{
	if (szc >= mmu_page_sizes)
		panic("page_get_pagecnt: out of range %d", szc);
	return (hw_page_array[szc].hp_pgcnt);
}

size_t
page_get_pagesize(uint_t szc)
{
	if (szc >= mmu_page_sizes)
		panic("page_get_pagesize: out of range %d", szc);
	return (hw_page_array[szc].hp_size);
}

/*
 * Return the size of a page based upon the index passed in.  An index of
 * zero refers to the smallest page size in the system, and as index increases
 * it refers to the next larger supported page size in the system.
 * Note that szc and userszc may not be the same due to unsupported szc's on
 * some systems.
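 * (Illustrative example, not a statement about any particular platform:
 * if a platform defines szc values 0-5 but exports only 0, 3 and 5 to
 * applications, USERSZC_2_SZC() maps userszc 1 to szc 3.)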
469 */ 470 size_t 471 page_get_user_pagesize(uint_t userszc) 472 { 473 uint_t szc = USERSZC_2_SZC(userszc); 474 475 if (szc >= mmu_page_sizes) 476 panic("page_get_user_pagesize: out of range %d", szc); 477 return (hw_page_array[szc].hp_size); 478 } 479 480 uint_t 481 page_get_shift(uint_t szc) 482 { 483 if (szc >= mmu_page_sizes) 484 panic("page_get_shift: out of range %d", szc); 485 return (PAGE_GET_SHIFT(szc)); 486 } 487 488 uint_t 489 page_get_pagecolors(uint_t szc) 490 { 491 if (szc >= mmu_page_sizes) 492 panic("page_get_pagecolors: out of range %d", szc); 493 return (PAGE_GET_PAGECOLORS(szc)); 494 } 495 496 /* 497 * this assigns the desired equivalent color after a split 498 */ 499 uint_t 500 page_correct_color(uchar_t szc, uchar_t nszc, uint_t color, 501 uint_t ncolor, uint_t ceq_mask) 502 { 503 ASSERT(nszc > szc); 504 ASSERT(szc < mmu_page_sizes); 505 ASSERT(color < PAGE_GET_PAGECOLORS(szc)); 506 ASSERT(ncolor < PAGE_GET_PAGECOLORS(nszc)); 507 508 color &= ceq_mask; 509 ncolor = PAGE_CONVERT_COLOR(ncolor, szc, nszc); 510 return (color | (ncolor & ~ceq_mask)); 511 } 512 513 /* 514 * The interleaved_mnodes flag is set when mnodes overlap in 515 * the physbase..physmax range, but have disjoint slices. 516 * In this case hpm_counters is shared by all mnodes. 517 * This flag is set dynamically by the platform. 518 */ 519 int interleaved_mnodes = 0; 520 521 /* 522 * Called by startup(). 523 * Size up the per page size free list counters based on physmax 524 * of each node and max_mem_nodes. 525 * 526 * If interleaved_mnodes is set we need to find the first mnode that 527 * exists. hpm_counters for the first mnode will then be shared by 528 * all other mnodes. If interleaved_mnodes is not set, just set 529 * first=mnode each time. That means there will be no sharing. 530 */ 531 size_t 532 page_ctrs_sz(void) 533 { 534 int r; /* region size */ 535 int mnode; 536 int firstmn; /* first mnode that exists */ 537 int nranges; 538 pfn_t physbase; 539 pfn_t physmax; 540 uint_t ctrs_sz = 0; 541 int i; 542 pgcnt_t colors_per_szc[MMU_PAGE_SIZES]; 543 544 /* 545 * We need to determine how many page colors there are for each 546 * page size in order to allocate memory for any color specific 547 * arrays. 548 */ 549 for (i = 0; i < mmu_page_sizes; i++) { 550 colors_per_szc[i] = PAGE_GET_PAGECOLORS(i); 551 } 552 553 for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) { 554 555 pgcnt_t r_pgcnt; 556 pfn_t r_base; 557 pgcnt_t r_align; 558 559 if (mem_node_config[mnode].exists == 0) 560 continue; 561 562 HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn); 563 nranges = MNODE_RANGE_CNT(mnode); 564 mnode_nranges[mnode] = nranges; 565 mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode); 566 567 /* 568 * determine size needed for page counter arrays with 569 * base aligned to large page size. 570 */ 571 for (r = 1; r < mmu_page_sizes; r++) { 572 /* add in space for hpm_color_current */ 573 ctrs_sz += sizeof (size_t) * 574 colors_per_szc[r] * nranges; 575 576 if (firstmn != mnode) 577 continue; 578 579 /* add in space for hpm_counters */ 580 r_align = page_get_pagecnt(r); 581 r_base = physbase; 582 r_base &= ~(r_align - 1); 583 r_pgcnt = howmany(physmax - r_base + 1, r_align); 584 585 /* 586 * Round up to always allocate on pointer sized 587 * boundaries. 
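 * (For example, if hpmctr_t is one byte and pointers are eight bytes,
 * 13 counters round up to 16 bytes.)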
588 */ 589 ctrs_sz += P2ROUNDUP((r_pgcnt * sizeof (hpmctr_t)), 590 sizeof (hpmctr_t *)); 591 } 592 } 593 594 for (r = 1; r < mmu_page_sizes; r++) { 595 ctrs_sz += (max_mem_nodes * sizeof (hw_page_map_t)); 596 } 597 598 /* add in space for page_ctrs_cands and pcc_color_free */ 599 ctrs_sz += sizeof (pcc_info_t *) * max_mem_nodes * 600 mmu_page_sizes * NPC_MUTEX; 601 602 for (mnode = 0; mnode < max_mem_nodes; mnode++) { 603 604 if (mem_node_config[mnode].exists == 0) 605 continue; 606 607 nranges = mnode_nranges[mnode]; 608 ctrs_sz += sizeof (pcc_info_t) * nranges * 609 mmu_page_sizes * NPC_MUTEX; 610 for (r = 1; r < mmu_page_sizes; r++) { 611 ctrs_sz += sizeof (pgcnt_t) * nranges * 612 colors_per_szc[r] * NPC_MUTEX; 613 } 614 } 615 616 /* ctr_mutex */ 617 ctrs_sz += (max_mem_nodes * NPC_MUTEX * sizeof (kmutex_t)); 618 619 /* size for page list counts */ 620 PLCNT_SZ(ctrs_sz); 621 622 /* 623 * add some slop for roundups. page_ctrs_alloc will roundup the start 624 * address of the counters to ecache_alignsize boundary for every 625 * memory node. 626 */ 627 return (ctrs_sz + max_mem_nodes * L2CACHE_ALIGN); 628 } 629 630 caddr_t 631 page_ctrs_alloc(caddr_t alloc_base) 632 { 633 int mnode; 634 int mrange, nranges; 635 int r; /* region size */ 636 int i; 637 int firstmn; /* first mnode that exists */ 638 pfn_t physbase; 639 pfn_t physmax; 640 pgcnt_t colors_per_szc[MMU_PAGE_SIZES]; 641 642 /* 643 * We need to determine how many page colors there are for each 644 * page size in order to allocate memory for any color specific 645 * arrays. 646 */ 647 for (i = 0; i < mmu_page_sizes; i++) { 648 colors_per_szc[i] = PAGE_GET_PAGECOLORS(i); 649 } 650 651 for (r = 1; r < mmu_page_sizes; r++) { 652 page_counters[r] = (hw_page_map_t *)alloc_base; 653 alloc_base += (max_mem_nodes * sizeof (hw_page_map_t)); 654 } 655 656 /* page_ctrs_cands and pcc_color_free array */ 657 for (i = 0; i < NPC_MUTEX; i++) { 658 for (r = 1; r < mmu_page_sizes; r++) { 659 660 page_ctrs_cands[i][r] = (pcc_info_t **)alloc_base; 661 alloc_base += sizeof (pcc_info_t *) * max_mem_nodes; 662 663 for (mnode = 0; mnode < max_mem_nodes; mnode++) { 664 pcc_info_t *pi; 665 666 if (mem_node_config[mnode].exists == 0) 667 continue; 668 669 nranges = mnode_nranges[mnode]; 670 671 pi = (pcc_info_t *)alloc_base; 672 alloc_base += sizeof (pcc_info_t) * nranges; 673 page_ctrs_cands[i][r][mnode] = pi; 674 675 for (mrange = 0; mrange < nranges; mrange++) { 676 pi->pcc_color_free = 677 (pgcnt_t *)alloc_base; 678 alloc_base += sizeof (pgcnt_t) * 679 colors_per_szc[r]; 680 pi++; 681 } 682 } 683 } 684 } 685 686 /* ctr_mutex */ 687 for (i = 0; i < NPC_MUTEX; i++) { 688 ctr_mutex[i] = (kmutex_t *)alloc_base; 689 alloc_base += (max_mem_nodes * sizeof (kmutex_t)); 690 } 691 692 /* initialize page list counts */ 693 PLCNT_INIT(alloc_base); 694 695 for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) { 696 697 pgcnt_t r_pgcnt; 698 pfn_t r_base; 699 pgcnt_t r_align; 700 int r_shift; 701 int nranges = mnode_nranges[mnode]; 702 703 if (mem_node_config[mnode].exists == 0) 704 continue; 705 706 HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn); 707 708 for (r = 1; r < mmu_page_sizes; r++) { 709 /* 710 * the page_counters base has to be aligned to the 711 * page count of page size code r otherwise the counts 712 * will cross large page boundaries. 
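 * (Worked example, assuming 8K base pages and a 4M region size: r_align is
 * then 512 base pages, so a physbase of pfn 1000 is lowered to pfn 512 and
 * counter index 0 covers pfns 512 through 1023.)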
713 */ 714 r_align = page_get_pagecnt(r); 715 r_base = physbase; 716 /* base needs to be aligned - lower to aligned value */ 717 r_base &= ~(r_align - 1); 718 r_pgcnt = howmany(physmax - r_base + 1, r_align); 719 r_shift = PAGE_BSZS_SHIFT(r); 720 721 PAGE_COUNTERS_SHIFT(mnode, r) = r_shift; 722 PAGE_COUNTERS_ENTRIES(mnode, r) = r_pgcnt; 723 PAGE_COUNTERS_BASE(mnode, r) = r_base; 724 for (mrange = 0; mrange < nranges; mrange++) { 725 PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, 726 r, mrange) = (size_t *)alloc_base; 727 alloc_base += sizeof (size_t) * 728 colors_per_szc[r]; 729 } 730 for (i = 0; i < colors_per_szc[r]; i++) { 731 uint_t color_mask = colors_per_szc[r] - 1; 732 pfn_t pfnum = r_base; 733 size_t idx; 734 int mrange; 735 MEM_NODE_ITERATOR_DECL(it); 736 737 MEM_NODE_ITERATOR_INIT(pfnum, mnode, r, &it); 738 if (pfnum == (pfn_t)-1) { 739 idx = 0; 740 } else { 741 PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i, 742 color_mask, color_mask, &it); 743 idx = PNUM_TO_IDX(mnode, r, pfnum); 744 idx = (idx >= r_pgcnt) ? 0 : idx; 745 } 746 for (mrange = 0; mrange < nranges; mrange++) { 747 PAGE_COUNTERS_CURRENT_COLOR(mnode, 748 r, i, mrange) = idx; 749 } 750 } 751 752 /* hpm_counters may be shared by all mnodes */ 753 if (firstmn == mnode) { 754 PAGE_COUNTERS_COUNTERS(mnode, r) = 755 (hpmctr_t *)alloc_base; 756 alloc_base += 757 P2ROUNDUP((sizeof (hpmctr_t) * r_pgcnt), 758 sizeof (hpmctr_t *)); 759 } else { 760 PAGE_COUNTERS_COUNTERS(mnode, r) = 761 PAGE_COUNTERS_COUNTERS(firstmn, r); 762 } 763 764 /* 765 * Verify that PNUM_TO_IDX and IDX_TO_PNUM 766 * satisfy the identity requirement. 767 * We should be able to go from one to the other 768 * and get consistent values. 769 */ 770 ASSERT(PNUM_TO_IDX(mnode, r, 771 (IDX_TO_PNUM(mnode, r, 0))) == 0); 772 ASSERT(IDX_TO_PNUM(mnode, r, 773 (PNUM_TO_IDX(mnode, r, r_base))) == r_base); 774 } 775 /* 776 * Roundup the start address of the page_counters to 777 * cache aligned boundary for every memory node. 778 * page_ctrs_sz() has added some slop for these roundups. 779 */ 780 alloc_base = (caddr_t)P2ROUNDUP((uintptr_t)alloc_base, 781 L2CACHE_ALIGN); 782 } 783 784 /* Initialize other page counter specific data structures. */ 785 for (mnode = 0; mnode < MAX_MEM_NODES; mnode++) { 786 rw_init(&page_ctrs_rwlock[mnode], NULL, RW_DEFAULT, NULL); 787 } 788 789 return (alloc_base); 790 } 791 792 /* 793 * Functions to adjust region counters for each size free list. 794 * Caller is responsible to acquire the ctr_mutex lock if necessary and 795 * thus can be called during startup without locks. 796 */ 797 /* ARGSUSED */ 798 void 799 page_ctr_add_internal(int mnode, int mtype, page_t *pp, int flags) 800 { 801 ssize_t r; /* region size */ 802 ssize_t idx; 803 pfn_t pfnum; 804 int lckidx; 805 806 ASSERT(mnode == PP_2_MEM_NODE(pp)); 807 ASSERT(mtype == PP_2_MTYPE(pp)); 808 809 ASSERT(pp->p_szc < mmu_page_sizes); 810 811 PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags); 812 813 /* no counter update needed for largest page size */ 814 if (pp->p_szc >= mmu_page_sizes - 1) { 815 return; 816 } 817 818 r = pp->p_szc + 1; 819 pfnum = pp->p_pagenum; 820 lckidx = PP_CTR_LOCK_INDX(pp); 821 822 /* 823 * Increment the count of free pages for the current 824 * region. Continue looping up in region size incrementing 825 * count if the preceeding region is full. 
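 * (Using the sparc sizes from the example near the top of this file:
 * freeing the eighth and last free 8K page of a 64K region brings that
 * region's counter to FULL_REGION_CNT, so the loop moves up and also
 * increments the counter of the enclosing 512K region.)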
826 */ 827 while (r < mmu_page_sizes) { 828 idx = PNUM_TO_IDX(mnode, r, pfnum); 829 830 ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r)); 831 ASSERT(PAGE_COUNTERS(mnode, r, idx) < FULL_REGION_CNT(r)); 832 833 if (++PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r)) { 834 break; 835 } else { 836 int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r)); 837 pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode] 838 [MTYPE_2_MRANGE(mnode, root_mtype)]; 839 840 cand->pcc_pages_free++; 841 cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]++; 842 } 843 r++; 844 } 845 } 846 847 void 848 page_ctr_add(int mnode, int mtype, page_t *pp, int flags) 849 { 850 int lckidx = PP_CTR_LOCK_INDX(pp); 851 kmutex_t *lock = &ctr_mutex[lckidx][mnode]; 852 853 mutex_enter(lock); 854 page_ctr_add_internal(mnode, mtype, pp, flags); 855 mutex_exit(lock); 856 } 857 858 void 859 page_ctr_sub_internal(int mnode, int mtype, page_t *pp, int flags) 860 { 861 int lckidx; 862 ssize_t r; /* region size */ 863 ssize_t idx; 864 pfn_t pfnum; 865 866 ASSERT(mnode == PP_2_MEM_NODE(pp)); 867 ASSERT(mtype == PP_2_MTYPE(pp)); 868 869 ASSERT(pp->p_szc < mmu_page_sizes); 870 871 PLCNT_DECR(pp, mnode, mtype, pp->p_szc, flags); 872 873 /* no counter update needed for largest page size */ 874 if (pp->p_szc >= mmu_page_sizes - 1) { 875 return; 876 } 877 878 r = pp->p_szc + 1; 879 pfnum = pp->p_pagenum; 880 lckidx = PP_CTR_LOCK_INDX(pp); 881 882 /* 883 * Decrement the count of free pages for the current 884 * region. Continue looping up in region size decrementing 885 * count if the preceeding region was full. 886 */ 887 while (r < mmu_page_sizes) { 888 idx = PNUM_TO_IDX(mnode, r, pfnum); 889 890 ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r)); 891 ASSERT(PAGE_COUNTERS(mnode, r, idx) > 0); 892 893 if (--PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r) - 1) { 894 break; 895 } else { 896 int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r)); 897 pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode] 898 [MTYPE_2_MRANGE(mnode, root_mtype)]; 899 900 ASSERT(cand->pcc_pages_free != 0); 901 ASSERT(cand->pcc_color_free[PP_2_BIN_SZC(pp, r)] != 0); 902 903 cand->pcc_pages_free--; 904 cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]--; 905 } 906 r++; 907 } 908 } 909 910 void 911 page_ctr_sub(int mnode, int mtype, page_t *pp, int flags) 912 { 913 int lckidx = PP_CTR_LOCK_INDX(pp); 914 kmutex_t *lock = &ctr_mutex[lckidx][mnode]; 915 916 mutex_enter(lock); 917 page_ctr_sub_internal(mnode, mtype, pp, flags); 918 mutex_exit(lock); 919 } 920 921 /* 922 * Adjust page counters following a memory attach, since typically the 923 * size of the array needs to change, and the PFN to counter index 924 * mapping needs to change. 925 * 926 * It is possible this mnode did not exist at startup. In that case 927 * allocate pcc_info_t and pcc_color_free arrays. Also, allow for nranges 928 * to change (a theoretical possibility on x86), which means pcc_color_free 929 * arrays must be extended. 
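 *
 * The overall strategy below: preallocate every replacement array with
 * KM_NOSLEEP, take the page_ctrs_rwlock writer lock only long enough to
 * swap pointers (copying the intersecting range of old counter values),
 * then drop the lock and free the old or unused memory cached on the way.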
930 */ 931 uint_t 932 page_ctrs_adjust(int mnode) 933 { 934 pgcnt_t npgs; 935 int r; /* region size */ 936 int i; 937 size_t pcsz, old_csz; 938 hpmctr_t *new_ctr, *old_ctr; 939 pfn_t oldbase, newbase; 940 pfn_t physbase, physmax; 941 size_t old_npgs; 942 hpmctr_t *ctr_cache[MMU_PAGE_SIZES]; 943 size_t size_cache[MMU_PAGE_SIZES]; 944 size_t *color_cache[MMU_PAGE_SIZES][MAX_MNODE_MRANGES]; 945 size_t *old_color_array[MAX_MNODE_MRANGES]; 946 pgcnt_t colors_per_szc[MMU_PAGE_SIZES]; 947 pcc_info_t **cands_cache; 948 pcc_info_t *old_pi, *pi; 949 pgcnt_t *pgcntp; 950 int nr, old_nranges, mrange, nranges = MNODE_RANGE_CNT(mnode); 951 int cands_cache_nranges; 952 int old_maxmrange, new_maxmrange; 953 int rc = 0; 954 955 cands_cache = kmem_zalloc(sizeof (pcc_info_t *) * NPC_MUTEX * 956 MMU_PAGE_SIZES, KM_NOSLEEP); 957 if (cands_cache == NULL) 958 return (ENOMEM); 959 960 i = -1; 961 HPM_COUNTERS_LIMITS(mnode, physbase, physmax, i); 962 963 newbase = physbase & ~PC_BASE_ALIGN_MASK; 964 npgs = roundup(physmax, PC_BASE_ALIGN) - newbase; 965 966 /* prepare to free non-null pointers on the way out */ 967 cands_cache_nranges = nranges; 968 bzero(ctr_cache, sizeof (ctr_cache)); 969 bzero(color_cache, sizeof (color_cache)); 970 971 /* 972 * We need to determine how many page colors there are for each 973 * page size in order to allocate memory for any color specific 974 * arrays. 975 */ 976 for (r = 0; r < mmu_page_sizes; r++) { 977 colors_per_szc[r] = PAGE_GET_PAGECOLORS(r); 978 } 979 980 /* 981 * Preallocate all of the new hpm_counters arrays as we can't 982 * hold the page_ctrs_rwlock as a writer and allocate memory. 983 * If we can't allocate all of the arrays, undo our work so far 984 * and return failure. 985 */ 986 for (r = 1; r < mmu_page_sizes; r++) { 987 pcsz = npgs >> PAGE_BSZS_SHIFT(r); 988 size_cache[r] = pcsz; 989 ctr_cache[r] = kmem_zalloc(pcsz * 990 sizeof (hpmctr_t), KM_NOSLEEP); 991 if (ctr_cache[r] == NULL) { 992 rc = ENOMEM; 993 goto cleanup; 994 } 995 } 996 997 /* 998 * Preallocate all of the new color current arrays as we can't 999 * hold the page_ctrs_rwlock as a writer and allocate memory. 1000 * If we can't allocate all of the arrays, undo our work so far 1001 * and return failure. 1002 */ 1003 for (r = 1; r < mmu_page_sizes; r++) { 1004 for (mrange = 0; mrange < nranges; mrange++) { 1005 color_cache[r][mrange] = kmem_zalloc(sizeof (size_t) * 1006 colors_per_szc[r], KM_NOSLEEP); 1007 if (color_cache[r][mrange] == NULL) { 1008 rc = ENOMEM; 1009 goto cleanup; 1010 } 1011 } 1012 } 1013 1014 /* 1015 * Preallocate all of the new pcc_info_t arrays as we can't 1016 * hold the page_ctrs_rwlock as a writer and allocate memory. 1017 * If we can't allocate all of the arrays, undo our work so far 1018 * and return failure. 1019 */ 1020 for (r = 1; r < mmu_page_sizes; r++) { 1021 for (i = 0; i < NPC_MUTEX; i++) { 1022 pi = kmem_zalloc(nranges * sizeof (pcc_info_t), 1023 KM_NOSLEEP); 1024 if (pi == NULL) { 1025 rc = ENOMEM; 1026 goto cleanup; 1027 } 1028 cands_cache[i * MMU_PAGE_SIZES + r] = pi; 1029 1030 for (mrange = 0; mrange < nranges; mrange++, pi++) { 1031 pgcntp = kmem_zalloc(colors_per_szc[r] * 1032 sizeof (pgcnt_t), KM_NOSLEEP); 1033 if (pgcntp == NULL) { 1034 rc = ENOMEM; 1035 goto cleanup; 1036 } 1037 pi->pcc_color_free = pgcntp; 1038 } 1039 } 1040 } 1041 1042 /* 1043 * Grab the write lock to prevent others from walking these arrays 1044 * while we are modifying them. 
1045 */ 1046 PAGE_CTRS_WRITE_LOCK(mnode); 1047 1048 old_nranges = mnode_nranges[mnode]; 1049 cands_cache_nranges = old_nranges; 1050 mnode_nranges[mnode] = nranges; 1051 old_maxmrange = mnode_maxmrange[mnode]; 1052 mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode); 1053 new_maxmrange = mnode_maxmrange[mnode]; 1054 1055 for (r = 1; r < mmu_page_sizes; r++) { 1056 PAGE_COUNTERS_SHIFT(mnode, r) = PAGE_BSZS_SHIFT(r); 1057 old_ctr = PAGE_COUNTERS_COUNTERS(mnode, r); 1058 old_csz = PAGE_COUNTERS_ENTRIES(mnode, r); 1059 oldbase = PAGE_COUNTERS_BASE(mnode, r); 1060 old_npgs = old_csz << PAGE_COUNTERS_SHIFT(mnode, r); 1061 for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) { 1062 old_color_array[mrange] = 1063 PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, 1064 r, mrange); 1065 } 1066 1067 pcsz = npgs >> PAGE_COUNTERS_SHIFT(mnode, r); 1068 new_ctr = ctr_cache[r]; 1069 ctr_cache[r] = NULL; 1070 if (old_ctr != NULL && 1071 (oldbase + old_npgs > newbase) && 1072 (newbase + npgs > oldbase)) { 1073 /* 1074 * Map the intersection of the old and new 1075 * counters into the new array. 1076 */ 1077 size_t offset; 1078 if (newbase > oldbase) { 1079 offset = (newbase - oldbase) >> 1080 PAGE_COUNTERS_SHIFT(mnode, r); 1081 bcopy(old_ctr + offset, new_ctr, 1082 MIN(pcsz, (old_csz - offset)) * 1083 sizeof (hpmctr_t)); 1084 } else { 1085 offset = (oldbase - newbase) >> 1086 PAGE_COUNTERS_SHIFT(mnode, r); 1087 bcopy(old_ctr, new_ctr + offset, 1088 MIN(pcsz - offset, old_csz) * 1089 sizeof (hpmctr_t)); 1090 } 1091 } 1092 1093 PAGE_COUNTERS_COUNTERS(mnode, r) = new_ctr; 1094 PAGE_COUNTERS_ENTRIES(mnode, r) = pcsz; 1095 PAGE_COUNTERS_BASE(mnode, r) = newbase; 1096 1097 /* update shared hpm_counters in other mnodes */ 1098 if (interleaved_mnodes) { 1099 for (i = 0; i < max_mem_nodes; i++) { 1100 if (i == mnode) 1101 continue; 1102 if (mem_node_config[i].exists == 0) 1103 continue; 1104 ASSERT(PAGE_COUNTERS_COUNTERS(i, r) == old_ctr); 1105 PAGE_COUNTERS_COUNTERS(i, r) = new_ctr; 1106 PAGE_COUNTERS_ENTRIES(i, r) = pcsz; 1107 PAGE_COUNTERS_BASE(i, r) = newbase; 1108 } 1109 } 1110 1111 for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) { 1112 PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r, mrange) = 1113 color_cache[r][mrange]; 1114 color_cache[r][mrange] = NULL; 1115 } 1116 /* 1117 * for now, just reset on these events as it's probably 1118 * not worthwhile to try and optimize this. 1119 */ 1120 for (i = 0; i < colors_per_szc[r]; i++) { 1121 uint_t color_mask = colors_per_szc[r] - 1; 1122 int mlo = interleaved_mnodes ? 0 : mnode; 1123 int mhi = interleaved_mnodes ? max_mem_nodes : 1124 (mnode + 1); 1125 int m; 1126 pfn_t pfnum = newbase; 1127 size_t idx; 1128 MEM_NODE_ITERATOR_DECL(it); 1129 1130 for (m = mlo; m < mhi; m++) { 1131 if (mem_node_config[m].exists == 0) 1132 continue; 1133 MEM_NODE_ITERATOR_INIT(pfnum, m, r, &it); 1134 if (pfnum == (pfn_t)-1) { 1135 idx = 0; 1136 } else { 1137 PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i, 1138 color_mask, color_mask, &it); 1139 idx = PNUM_TO_IDX(m, r, pfnum); 1140 idx = (idx < pcsz) ? 
idx : 0; 1141 } 1142 for (mrange = 0; mrange < nranges; mrange++) { 1143 PAGE_COUNTERS_CURRENT_COLOR(m, 1144 r, i, mrange) = idx; 1145 } 1146 } 1147 } 1148 1149 /* cache info for freeing out of the critical path */ 1150 if ((caddr_t)old_ctr >= kernelheap && 1151 (caddr_t)old_ctr < ekernelheap) { 1152 ctr_cache[r] = old_ctr; 1153 size_cache[r] = old_csz; 1154 } 1155 for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) { 1156 size_t *tmp = old_color_array[mrange]; 1157 if ((caddr_t)tmp >= kernelheap && 1158 (caddr_t)tmp < ekernelheap) { 1159 color_cache[r][mrange] = tmp; 1160 } 1161 } 1162 /* 1163 * Verify that PNUM_TO_IDX and IDX_TO_PNUM 1164 * satisfy the identity requirement. 1165 * We should be able to go from one to the other 1166 * and get consistent values. 1167 */ 1168 ASSERT(PNUM_TO_IDX(mnode, r, 1169 (IDX_TO_PNUM(mnode, r, 0))) == 0); 1170 ASSERT(IDX_TO_PNUM(mnode, r, 1171 (PNUM_TO_IDX(mnode, r, newbase))) == newbase); 1172 1173 /* pcc_info_t and pcc_color_free */ 1174 for (i = 0; i < NPC_MUTEX; i++) { 1175 pcc_info_t *epi; 1176 pcc_info_t *eold_pi; 1177 1178 pi = cands_cache[i * MMU_PAGE_SIZES + r]; 1179 old_pi = page_ctrs_cands[i][r][mnode]; 1180 page_ctrs_cands[i][r][mnode] = pi; 1181 cands_cache[i * MMU_PAGE_SIZES + r] = old_pi; 1182 1183 /* preserve old pcc_color_free values, if any */ 1184 if (old_pi == NULL) 1185 continue; 1186 1187 /* 1188 * when/if x86 does DR, must account for 1189 * possible change in range index when 1190 * preserving pcc_info 1191 */ 1192 epi = &pi[nranges]; 1193 eold_pi = &old_pi[old_nranges]; 1194 if (new_maxmrange > old_maxmrange) { 1195 pi += new_maxmrange - old_maxmrange; 1196 } else if (new_maxmrange < old_maxmrange) { 1197 old_pi += old_maxmrange - new_maxmrange; 1198 } 1199 for (; pi < epi && old_pi < eold_pi; pi++, old_pi++) { 1200 pcc_info_t tmp = *pi; 1201 *pi = *old_pi; 1202 *old_pi = tmp; 1203 } 1204 } 1205 } 1206 PAGE_CTRS_WRITE_UNLOCK(mnode); 1207 1208 /* 1209 * Now that we have dropped the write lock, it is safe to free all 1210 * of the memory we have cached above. 1211 * We come thru here to free memory when pre-alloc fails, and also to 1212 * free old pointers which were recorded while locked. 
1213 */ 1214 cleanup: 1215 for (r = 1; r < mmu_page_sizes; r++) { 1216 if (ctr_cache[r] != NULL) { 1217 kmem_free(ctr_cache[r], 1218 size_cache[r] * sizeof (hpmctr_t)); 1219 } 1220 for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) { 1221 if (color_cache[r][mrange] != NULL) { 1222 kmem_free(color_cache[r][mrange], 1223 colors_per_szc[r] * sizeof (size_t)); 1224 } 1225 } 1226 for (i = 0; i < NPC_MUTEX; i++) { 1227 pi = cands_cache[i * MMU_PAGE_SIZES + r]; 1228 if (pi == NULL) 1229 continue; 1230 nr = cands_cache_nranges; 1231 for (mrange = 0; mrange < nr; mrange++, pi++) { 1232 pgcntp = pi->pcc_color_free; 1233 if (pgcntp == NULL) 1234 continue; 1235 if ((caddr_t)pgcntp >= kernelheap && 1236 (caddr_t)pgcntp < ekernelheap) { 1237 kmem_free(pgcntp, 1238 colors_per_szc[r] * 1239 sizeof (pgcnt_t)); 1240 } 1241 } 1242 pi = cands_cache[i * MMU_PAGE_SIZES + r]; 1243 if ((caddr_t)pi >= kernelheap && 1244 (caddr_t)pi < ekernelheap) { 1245 kmem_free(pi, nr * sizeof (pcc_info_t)); 1246 } 1247 } 1248 } 1249 1250 kmem_free(cands_cache, 1251 sizeof (pcc_info_t *) * NPC_MUTEX * MMU_PAGE_SIZES); 1252 return (rc); 1253 } 1254 1255 1256 #ifdef DEBUG 1257 1258 /* 1259 * confirm pp is a large page corresponding to szc 1260 */ 1261 void 1262 chk_lpg(page_t *pp, uchar_t szc) 1263 { 1264 spgcnt_t npgs = page_get_pagecnt(pp->p_szc); 1265 uint_t noreloc; 1266 1267 if (npgs == 1) { 1268 ASSERT(pp->p_szc == 0); 1269 ASSERT(pp->p_next == pp); 1270 ASSERT(pp->p_prev == pp); 1271 return; 1272 } 1273 1274 ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL); 1275 ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL); 1276 1277 ASSERT(IS_P2ALIGNED(pp->p_pagenum, npgs)); 1278 ASSERT(pp->p_pagenum == (pp->p_next->p_pagenum - 1)); 1279 ASSERT(pp->p_prev->p_pagenum == (pp->p_pagenum + (npgs - 1))); 1280 ASSERT(pp->p_prev == (pp + (npgs - 1))); 1281 1282 /* 1283 * Check list of pages. 1284 */ 1285 noreloc = PP_ISNORELOC(pp); 1286 while (npgs--) { 1287 if (npgs != 0) { 1288 ASSERT(pp->p_pagenum == pp->p_next->p_pagenum - 1); 1289 ASSERT(pp->p_next == (pp + 1)); 1290 } 1291 ASSERT(pp->p_szc == szc); 1292 ASSERT(PP_ISFREE(pp)); 1293 ASSERT(PP_ISAGED(pp)); 1294 ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL); 1295 ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL); 1296 ASSERT(pp->p_vnode == NULL); 1297 ASSERT(PP_ISNORELOC(pp) == noreloc); 1298 1299 pp = pp->p_next; 1300 } 1301 } 1302 #endif /* DEBUG */ 1303 1304 void 1305 page_freelist_lock(int mnode) 1306 { 1307 int i; 1308 for (i = 0; i < NPC_MUTEX; i++) { 1309 mutex_enter(FPC_MUTEX(mnode, i)); 1310 mutex_enter(CPC_MUTEX(mnode, i)); 1311 } 1312 } 1313 1314 void 1315 page_freelist_unlock(int mnode) 1316 { 1317 int i; 1318 for (i = 0; i < NPC_MUTEX; i++) { 1319 mutex_exit(FPC_MUTEX(mnode, i)); 1320 mutex_exit(CPC_MUTEX(mnode, i)); 1321 } 1322 } 1323 1324 /* 1325 * add pp to the specified page list. Defaults to head of the page list 1326 * unless PG_LIST_TAIL is specified. 1327 */ 1328 void 1329 page_list_add(page_t *pp, int flags) 1330 { 1331 page_t **ppp; 1332 kmutex_t *pcm; 1333 uint_t bin, mtype; 1334 int mnode; 1335 1336 ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT)); 1337 ASSERT(PP_ISFREE(pp)); 1338 ASSERT(!hat_page_is_mapped(pp)); 1339 ASSERT(hat_page_getshare(pp) == 0); 1340 1341 /* 1342 * Large pages should be freed via page_list_add_pages(). 1343 */ 1344 ASSERT(pp->p_szc == 0); 1345 1346 /* 1347 * Don't need to lock the freelist first here 1348 * because the page isn't on the freelist yet. 1349 * This means p_szc can't change on us. 
1350 */ 1351 1352 bin = PP_2_BIN(pp); 1353 mnode = PP_2_MEM_NODE(pp); 1354 mtype = PP_2_MTYPE(pp); 1355 1356 if (flags & PG_LIST_ISINIT) { 1357 /* 1358 * PG_LIST_ISINIT is set during system startup (ie. single 1359 * threaded), add a page to the free list and add to the 1360 * the free region counters w/o any locking 1361 */ 1362 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype); 1363 1364 /* inline version of page_add() */ 1365 if (*ppp != NULL) { 1366 pp->p_next = *ppp; 1367 pp->p_prev = (*ppp)->p_prev; 1368 (*ppp)->p_prev = pp; 1369 pp->p_prev->p_next = pp; 1370 } else 1371 *ppp = pp; 1372 1373 page_ctr_add_internal(mnode, mtype, pp, flags); 1374 VM_STAT_ADD(vmm_vmstats.pladd_free[0]); 1375 } else { 1376 pcm = PC_BIN_MUTEX(mnode, bin, flags); 1377 1378 if (flags & PG_FREE_LIST) { 1379 VM_STAT_ADD(vmm_vmstats.pladd_free[0]); 1380 ASSERT(PP_ISAGED(pp)); 1381 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype); 1382 1383 } else { 1384 VM_STAT_ADD(vmm_vmstats.pladd_cache); 1385 ASSERT(pp->p_vnode); 1386 ASSERT((pp->p_offset & PAGEOFFSET) == 0); 1387 ppp = &PAGE_CACHELISTS(mnode, bin, mtype); 1388 } 1389 mutex_enter(pcm); 1390 page_add(ppp, pp); 1391 1392 if (flags & PG_LIST_TAIL) 1393 *ppp = (*ppp)->p_next; 1394 /* 1395 * Add counters before releasing pcm mutex to avoid a race with 1396 * page_freelist_coalesce and page_freelist_split. 1397 */ 1398 page_ctr_add(mnode, mtype, pp, flags); 1399 mutex_exit(pcm); 1400 } 1401 1402 1403 #if defined(__sparc) 1404 if (PP_ISNORELOC(pp)) { 1405 kcage_freemem_add(1); 1406 } 1407 #endif 1408 /* 1409 * It is up to the caller to unlock the page! 1410 */ 1411 ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT)); 1412 } 1413 1414 1415 #ifdef __sparc 1416 /* 1417 * This routine is only used by kcage_init during system startup. 1418 * It performs the function of page_list_sub/PP_SETNORELOC/page_list_add 1419 * without the overhead of taking locks and updating counters. 1420 */ 1421 void 1422 page_list_noreloc_startup(page_t *pp) 1423 { 1424 page_t **ppp; 1425 uint_t bin; 1426 int mnode; 1427 int mtype; 1428 int flags = 0; 1429 1430 /* 1431 * If this is a large page on the freelist then 1432 * break it up into smaller pages. 1433 */ 1434 if (pp->p_szc != 0) 1435 page_boot_demote(pp); 1436 1437 /* 1438 * Get list page is currently on. 1439 */ 1440 bin = PP_2_BIN(pp); 1441 mnode = PP_2_MEM_NODE(pp); 1442 mtype = PP_2_MTYPE(pp); 1443 ASSERT(mtype == MTYPE_RELOC); 1444 ASSERT(pp->p_szc == 0); 1445 1446 if (PP_ISAGED(pp)) { 1447 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype); 1448 flags |= PG_FREE_LIST; 1449 } else { 1450 ppp = &PAGE_CACHELISTS(mnode, bin, mtype); 1451 flags |= PG_CACHE_LIST; 1452 } 1453 1454 ASSERT(*ppp != NULL); 1455 1456 /* 1457 * Delete page from current list. 1458 */ 1459 if (*ppp == pp) 1460 *ppp = pp->p_next; /* go to next page */ 1461 if (*ppp == pp) { 1462 *ppp = NULL; /* page list is gone */ 1463 } else { 1464 pp->p_prev->p_next = pp->p_next; 1465 pp->p_next->p_prev = pp->p_prev; 1466 } 1467 1468 /* 1469 * Decrement page counters 1470 */ 1471 page_ctr_sub_internal(mnode, mtype, pp, flags); 1472 1473 /* 1474 * Set no reloc for cage initted pages. 1475 */ 1476 PP_SETNORELOC(pp); 1477 1478 mtype = PP_2_MTYPE(pp); 1479 ASSERT(mtype == MTYPE_NORELOC); 1480 1481 /* 1482 * Get new list for page. 1483 */ 1484 if (PP_ISAGED(pp)) { 1485 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype); 1486 } else { 1487 ppp = &PAGE_CACHELISTS(mnode, bin, mtype); 1488 } 1489 1490 /* 1491 * Insert page on new list. 
1492 */ 1493 if (*ppp == NULL) { 1494 *ppp = pp; 1495 pp->p_next = pp->p_prev = pp; 1496 } else { 1497 pp->p_next = *ppp; 1498 pp->p_prev = (*ppp)->p_prev; 1499 (*ppp)->p_prev = pp; 1500 pp->p_prev->p_next = pp; 1501 } 1502 1503 /* 1504 * Increment page counters 1505 */ 1506 page_ctr_add_internal(mnode, mtype, pp, flags); 1507 1508 /* 1509 * Update cage freemem counter 1510 */ 1511 atomic_add_long(&kcage_freemem, 1); 1512 } 1513 #else /* __sparc */ 1514 1515 /* ARGSUSED */ 1516 void 1517 page_list_noreloc_startup(page_t *pp) 1518 { 1519 panic("page_list_noreloc_startup: should be here only for sparc"); 1520 } 1521 #endif 1522 1523 void 1524 page_list_add_pages(page_t *pp, int flags) 1525 { 1526 kmutex_t *pcm; 1527 pgcnt_t pgcnt; 1528 uint_t bin, mtype, i; 1529 int mnode; 1530 1531 /* default to freelist/head */ 1532 ASSERT((flags & (PG_CACHE_LIST | PG_LIST_TAIL)) == 0); 1533 1534 CHK_LPG(pp, pp->p_szc); 1535 VM_STAT_ADD(vmm_vmstats.pladd_free[pp->p_szc]); 1536 1537 bin = PP_2_BIN(pp); 1538 mnode = PP_2_MEM_NODE(pp); 1539 mtype = PP_2_MTYPE(pp); 1540 1541 if (flags & PG_LIST_ISINIT) { 1542 ASSERT(pp->p_szc == mmu_page_sizes - 1); 1543 page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp); 1544 ASSERT(!PP_ISNORELOC(pp)); 1545 PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags); 1546 } else { 1547 1548 ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes); 1549 1550 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST); 1551 1552 mutex_enter(pcm); 1553 page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp); 1554 page_ctr_add(mnode, mtype, pp, PG_FREE_LIST); 1555 mutex_exit(pcm); 1556 1557 pgcnt = page_get_pagecnt(pp->p_szc); 1558 #if defined(__sparc) 1559 if (PP_ISNORELOC(pp)) 1560 kcage_freemem_add(pgcnt); 1561 #endif 1562 for (i = 0; i < pgcnt; i++, pp++) 1563 page_unlock_nocapture(pp); 1564 } 1565 } 1566 1567 /* 1568 * During boot, need to demote a large page to base 1569 * pagesize pages for seg_kmem for use in boot_alloc() 1570 */ 1571 void 1572 page_boot_demote(page_t *pp) 1573 { 1574 ASSERT(pp->p_szc != 0); 1575 ASSERT(PP_ISFREE(pp)); 1576 ASSERT(PP_ISAGED(pp)); 1577 1578 (void) page_demote(PP_2_MEM_NODE(pp), 1579 PFN_BASE(pp->p_pagenum, pp->p_szc), pp->p_szc, 0, PC_NO_COLOR, 1580 PC_FREE); 1581 1582 ASSERT(PP_ISFREE(pp)); 1583 ASSERT(PP_ISAGED(pp)); 1584 ASSERT(pp->p_szc == 0); 1585 } 1586 1587 /* 1588 * Take a particular page off of whatever freelist the page 1589 * is claimed to be on. 1590 * 1591 * NOTE: Only used for PAGESIZE pages. 1592 */ 1593 void 1594 page_list_sub(page_t *pp, int flags) 1595 { 1596 int bin; 1597 uint_t mtype; 1598 int mnode; 1599 kmutex_t *pcm; 1600 page_t **ppp; 1601 1602 ASSERT(PAGE_EXCL(pp)); 1603 ASSERT(PP_ISFREE(pp)); 1604 1605 /* 1606 * The p_szc field can only be changed by page_promote() 1607 * and page_demote(). Only free pages can be promoted and 1608 * demoted and the free list MUST be locked during these 1609 * operations. So to prevent a race in page_list_sub() 1610 * between computing which bin of the freelist lock to 1611 * grab and actually grabing the lock we check again that 1612 * the bin we locked is still the correct one. Notice that 1613 * the p_szc field could have actually changed on us but 1614 * if the bin happens to still be the same we are safe. 
1615 */ 1616 try_again: 1617 bin = PP_2_BIN(pp); 1618 mnode = PP_2_MEM_NODE(pp); 1619 pcm = PC_BIN_MUTEX(mnode, bin, flags); 1620 mutex_enter(pcm); 1621 if (PP_2_BIN(pp) != bin) { 1622 mutex_exit(pcm); 1623 goto try_again; 1624 } 1625 mtype = PP_2_MTYPE(pp); 1626 1627 if (flags & PG_FREE_LIST) { 1628 VM_STAT_ADD(vmm_vmstats.plsub_free[0]); 1629 ASSERT(PP_ISAGED(pp)); 1630 ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype); 1631 } else { 1632 VM_STAT_ADD(vmm_vmstats.plsub_cache); 1633 ASSERT(!PP_ISAGED(pp)); 1634 ppp = &PAGE_CACHELISTS(mnode, bin, mtype); 1635 } 1636 1637 /* 1638 * Common PAGESIZE case. 1639 * 1640 * Note that we locked the freelist. This prevents 1641 * any page promotion/demotion operations. Therefore 1642 * the p_szc will not change until we drop pcm mutex. 1643 */ 1644 if (pp->p_szc == 0) { 1645 page_sub(ppp, pp); 1646 /* 1647 * Subtract counters before releasing pcm mutex 1648 * to avoid race with page_freelist_coalesce. 1649 */ 1650 page_ctr_sub(mnode, mtype, pp, flags); 1651 mutex_exit(pcm); 1652 1653 #if defined(__sparc) 1654 if (PP_ISNORELOC(pp)) { 1655 kcage_freemem_sub(1); 1656 } 1657 #endif 1658 return; 1659 } 1660 1661 /* 1662 * Large pages on the cache list are not supported. 1663 */ 1664 if (flags & PG_CACHE_LIST) 1665 panic("page_list_sub: large page on cachelist"); 1666 1667 /* 1668 * Slow but rare. 1669 * 1670 * Somebody wants this particular page which is part 1671 * of a large page. In this case we just demote the page 1672 * if it's on the freelist. 1673 * 1674 * We have to drop pcm before locking the entire freelist. 1675 * Once we have re-locked the freelist check to make sure 1676 * the page hasn't already been demoted or completely 1677 * freed. 1678 */ 1679 mutex_exit(pcm); 1680 page_freelist_lock(mnode); 1681 if (pp->p_szc != 0) { 1682 /* 1683 * Large page is on freelist. 1684 */ 1685 (void) page_demote(mnode, PFN_BASE(pp->p_pagenum, pp->p_szc), 1686 pp->p_szc, 0, PC_NO_COLOR, PC_FREE); 1687 } 1688 ASSERT(PP_ISFREE(pp)); 1689 ASSERT(PP_ISAGED(pp)); 1690 ASSERT(pp->p_szc == 0); 1691 1692 /* 1693 * Subtract counters before releasing pcm mutex 1694 * to avoid race with page_freelist_coalesce. 1695 */ 1696 bin = PP_2_BIN(pp); 1697 mtype = PP_2_MTYPE(pp); 1698 ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype); 1699 1700 page_sub(ppp, pp); 1701 page_ctr_sub(mnode, mtype, pp, flags); 1702 page_freelist_unlock(mnode); 1703 1704 #if defined(__sparc) 1705 if (PP_ISNORELOC(pp)) { 1706 kcage_freemem_sub(1); 1707 } 1708 #endif 1709 } 1710 1711 void 1712 page_list_sub_pages(page_t *pp, uint_t szc) 1713 { 1714 kmutex_t *pcm; 1715 uint_t bin, mtype; 1716 int mnode; 1717 1718 ASSERT(PAGE_EXCL(pp)); 1719 ASSERT(PP_ISFREE(pp)); 1720 ASSERT(PP_ISAGED(pp)); 1721 1722 /* 1723 * See comment in page_list_sub(). 1724 */ 1725 try_again: 1726 bin = PP_2_BIN(pp); 1727 mnode = PP_2_MEM_NODE(pp); 1728 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST); 1729 mutex_enter(pcm); 1730 if (PP_2_BIN(pp) != bin) { 1731 mutex_exit(pcm); 1732 goto try_again; 1733 } 1734 1735 /* 1736 * If we're called with a page larger than szc or it got 1737 * promoted above szc before we locked the freelist then 1738 * drop pcm and re-lock entire freelist. If page still larger 1739 * than szc then demote it. 
1740 */ 1741 if (pp->p_szc > szc) { 1742 mutex_exit(pcm); 1743 pcm = NULL; 1744 page_freelist_lock(mnode); 1745 if (pp->p_szc > szc) { 1746 VM_STAT_ADD(vmm_vmstats.plsubpages_szcbig); 1747 (void) page_demote(mnode, 1748 PFN_BASE(pp->p_pagenum, pp->p_szc), 1749 pp->p_szc, szc, PC_NO_COLOR, PC_FREE); 1750 } 1751 bin = PP_2_BIN(pp); 1752 } 1753 ASSERT(PP_ISFREE(pp)); 1754 ASSERT(PP_ISAGED(pp)); 1755 ASSERT(pp->p_szc <= szc); 1756 ASSERT(pp == PP_PAGEROOT(pp)); 1757 1758 VM_STAT_ADD(vmm_vmstats.plsub_free[pp->p_szc]); 1759 1760 mtype = PP_2_MTYPE(pp); 1761 if (pp->p_szc != 0) { 1762 page_vpsub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp); 1763 CHK_LPG(pp, pp->p_szc); 1764 } else { 1765 VM_STAT_ADD(vmm_vmstats.plsubpages_szc0); 1766 page_sub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp); 1767 } 1768 page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST); 1769 1770 if (pcm != NULL) { 1771 mutex_exit(pcm); 1772 } else { 1773 page_freelist_unlock(mnode); 1774 } 1775 1776 #if defined(__sparc) 1777 if (PP_ISNORELOC(pp)) { 1778 pgcnt_t pgcnt; 1779 1780 pgcnt = page_get_pagecnt(pp->p_szc); 1781 kcage_freemem_sub(pgcnt); 1782 } 1783 #endif 1784 } 1785 1786 /* 1787 * Add the page to the front of a linked list of pages 1788 * using the p_next & p_prev pointers for the list. 1789 * The caller is responsible for protecting the list pointers. 1790 */ 1791 void 1792 mach_page_add(page_t **ppp, page_t *pp) 1793 { 1794 if (*ppp == NULL) { 1795 pp->p_next = pp->p_prev = pp; 1796 } else { 1797 pp->p_next = *ppp; 1798 pp->p_prev = (*ppp)->p_prev; 1799 (*ppp)->p_prev = pp; 1800 pp->p_prev->p_next = pp; 1801 } 1802 *ppp = pp; 1803 } 1804 1805 /* 1806 * Remove this page from a linked list of pages 1807 * using the p_next & p_prev pointers for the list. 1808 * 1809 * The caller is responsible for protecting the list pointers. 1810 */ 1811 void 1812 mach_page_sub(page_t **ppp, page_t *pp) 1813 { 1814 ASSERT(PP_ISFREE(pp)); 1815 1816 if (*ppp == NULL || pp == NULL) 1817 panic("mach_page_sub"); 1818 1819 if (*ppp == pp) 1820 *ppp = pp->p_next; /* go to next page */ 1821 1822 if (*ppp == pp) 1823 *ppp = NULL; /* page list is gone */ 1824 else { 1825 pp->p_prev->p_next = pp->p_next; 1826 pp->p_next->p_prev = pp->p_prev; 1827 } 1828 pp->p_prev = pp->p_next = pp; /* make pp a list of one */ 1829 } 1830 1831 /* 1832 * Routine fsflush uses to gradually coalesce the free list into larger pages. 1833 */ 1834 void 1835 page_promote_size(page_t *pp, uint_t cur_szc) 1836 { 1837 pfn_t pfn; 1838 int mnode; 1839 int idx; 1840 int new_szc = cur_szc + 1; 1841 int full = FULL_REGION_CNT(new_szc); 1842 1843 pfn = page_pptonum(pp); 1844 mnode = PFN_2_MEM_NODE(pfn); 1845 1846 page_freelist_lock(mnode); 1847 1848 idx = PNUM_TO_IDX(mnode, new_szc, pfn); 1849 if (PAGE_COUNTERS(mnode, new_szc, idx) == full) 1850 (void) page_promote(mnode, pfn, new_szc, PC_FREE, PC_MTYPE_ANY); 1851 1852 page_freelist_unlock(mnode); 1853 } 1854 1855 static uint_t page_promote_err; 1856 static uint_t page_promote_noreloc_err; 1857 1858 /* 1859 * Create a single larger page (of szc new_szc) from smaller contiguous pages 1860 * for the given mnode starting at pfnum. Pages involved are on the freelist 1861 * before the call and may be returned to the caller if requested, otherwise 1862 * they will be placed back on the freelist. 1863 * If flags is PC_ALLOC, then the large page will be returned to the user in 1864 * a state which is consistent with a page being taken off the freelist. 
If we failed to lock the new large page, then we will return NULL to the
 * caller and put the large page on the freelist instead.
 * If flags is PC_FREE, then the large page will be placed on the freelist,
 * and NULL will be returned.
 * The caller is responsible for locking the freelist as well as any other
 * accounting which needs to be done for a returned page.
 *
 * RFE: For performance pass in pp instead of pfnum so
 *	we can avoid excessive calls to page_numtopp_nolock().
 *	This would depend on an assumption that all contiguous
 *	pages are in the same memseg so we can just add/dec
 *	our pp.
 *
 * Lock ordering:
 *
 *	There is a potential but rare deadlock situation
 *	for page promotion and demotion operations. The problem
 *	is there are two paths into the freelist manager and
 *	they have different lock orders:
 *
 *	page_create()
 *		lock freelist
 *		page_lock(EXCL)
 *		unlock freelist
 *		return
 *		caller drops page_lock
 *
 *	page_free() and page_reclaim()
 *		caller grabs page_lock(EXCL)
 *
 *		lock freelist
 *		unlock freelist
 *		drop page_lock
 *
 *	What prevents a thread in page_create() from deadlocking
 *	with a thread freeing or reclaiming the same page is the
 *	page_trylock() in page_get_freelist(). If the trylock fails
 *	it skips the page.
 *
 *	The lock ordering for promotion and demotion is the same as
 *	for page_create(). Since the same deadlock could occur during
 *	page promotion and freeing or reclaiming of a page on the
 *	cache list we might have to fail the operation and undo what
 *	we have done so far. Again this is rare.
 */
page_t *
page_promote(int mnode, pfn_t pfnum, uchar_t new_szc, int flags, int mtype)
{
	page_t		*pp, *pplist, *tpp, *start_pp;
	pgcnt_t		new_npgs, npgs;
	uint_t		bin;
	pgcnt_t		tmpnpgs, pages_left;
	uint_t		noreloc;
	int		which_list;
	ulong_t		index;
	kmutex_t	*phm;

	/*
	 * General algorithm:
	 * Find the starting page
	 * Walk each page struct removing it from the freelist,
	 * and linking it to all the other pages removed.
	 * Once all pages are off the freelist,
	 * walk the list, modifying p_szc to new_szc and what
	 * ever other info needs to be done to create a large free page.
	 * According to the flags, either return the page or put it
	 * on the freelist.
	 */

	start_pp = page_numtopp_nolock(pfnum);
	ASSERT(start_pp && (start_pp->p_pagenum == pfnum));
	new_npgs = page_get_pagecnt(new_szc);
	ASSERT(IS_P2ALIGNED(pfnum, new_npgs));

	/* don't return page of the wrong mtype */
	if (mtype != PC_MTYPE_ANY && mtype != PP_2_MTYPE(start_pp))
		return (NULL);

	/*
	 * Loop through smaller pages to confirm that all pages
	 * give the same result for PP_ISNORELOC().
	 * We can check this reliably here as the protocol for setting
	 * P_NORELOC requires pages to be taken off the free list first.
	 */
	noreloc = PP_ISNORELOC(start_pp);
	for (pp = start_pp + new_npgs; --pp > start_pp; ) {
		if (noreloc != PP_ISNORELOC(pp)) {
			page_promote_noreloc_err++;
			page_promote_err++;
			return (NULL);
		}
	}

	pages_left = new_npgs;
	pplist = NULL;
	pp = start_pp;
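
	/*
	 * Tying back to the "Lock ordering" discussion above: the
	 * page_trylock()/mutex_tryenter() calls in the cachelist branch of
	 * the loop below are what keep this routine from deadlocking with
	 * page_lookup() or with a thread freeing the same page; when either
	 * trylock fails we jump to fail_promote and undo the work so far.
	 */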
*/ 1963 while (pages_left) { 1964 /* 1965 * Remove from the freelist. 1966 */ 1967 ASSERT(PP_ISFREE(pp)); 1968 bin = PP_2_BIN(pp); 1969 ASSERT(mnode == PP_2_MEM_NODE(pp)); 1970 mtype = PP_2_MTYPE(pp); 1971 if (PP_ISAGED(pp)) { 1972 1973 /* 1974 * PG_FREE_LIST 1975 */ 1976 if (pp->p_szc) { 1977 page_vpsub(&PAGE_FREELISTS(mnode, 1978 pp->p_szc, bin, mtype), pp); 1979 } else { 1980 mach_page_sub(&PAGE_FREELISTS(mnode, 0, 1981 bin, mtype), pp); 1982 } 1983 which_list = PG_FREE_LIST; 1984 } else { 1985 ASSERT(pp->p_szc == 0); 1986 1987 /* 1988 * PG_CACHE_LIST 1989 * 1990 * Since this page comes from the 1991 * cachelist, we must destroy the 1992 * vnode association. 1993 */ 1994 if (!page_trylock(pp, SE_EXCL)) { 1995 goto fail_promote; 1996 } 1997 1998 /* 1999 * We need to be careful not to deadlock 2000 * with another thread in page_lookup(). 2001 * The page_lookup() thread could be holding 2002 * the same phm that we need if the two 2003 * pages happen to hash to the same phm lock. 2004 * At this point we have locked the entire 2005 * freelist and page_lookup() could be trying 2006 * to grab a freelist lock. 2007 */ 2008 index = PAGE_HASH_FUNC(pp->p_vnode, pp->p_offset); 2009 phm = PAGE_HASH_MUTEX(index); 2010 if (!mutex_tryenter(phm)) { 2011 page_unlock_nocapture(pp); 2012 goto fail_promote; 2013 } 2014 2015 mach_page_sub(&PAGE_CACHELISTS(mnode, bin, mtype), pp); 2016 page_hashout(pp, phm); 2017 mutex_exit(phm); 2018 PP_SETAGED(pp); 2019 page_unlock_nocapture(pp); 2020 which_list = PG_CACHE_LIST; 2021 } 2022 page_ctr_sub(mnode, mtype, pp, which_list); 2023 2024 /* 2025 * Concatenate the smaller page(s) onto 2026 * the large page list. 2027 */ 2028 tmpnpgs = npgs = page_get_pagecnt(pp->p_szc); 2029 pages_left -= npgs; 2030 tpp = pp; 2031 while (npgs--) { 2032 tpp->p_szc = new_szc; 2033 tpp = tpp->p_next; 2034 } 2035 page_list_concat(&pplist, &pp); 2036 pp += tmpnpgs; 2037 } 2038 CHK_LPG(pplist, new_szc); 2039 2040 /* 2041 * return the page to the user if requested 2042 * in the properly locked state. 2043 */ 2044 if (flags == PC_ALLOC && (page_trylock_cons(pplist, SE_EXCL))) { 2045 return (pplist); 2046 } 2047 2048 /* 2049 * Otherwise place the new large page on the freelist 2050 */ 2051 bin = PP_2_BIN(pplist); 2052 mnode = PP_2_MEM_NODE(pplist); 2053 mtype = PP_2_MTYPE(pplist); 2054 page_vpadd(&PAGE_FREELISTS(mnode, new_szc, bin, mtype), pplist); 2055 2056 page_ctr_add(mnode, mtype, pplist, PG_FREE_LIST); 2057 return (NULL); 2058 2059 fail_promote: 2060 /* 2061 * A thread must have still been freeing or 2062 * reclaiming the page on the cachelist. 2063 * To prevent a deadlock undo what we have 2064 * done sofar and return failure. This 2065 * situation can only happen while promoting 2066 * PAGESIZE pages. 2067 */ 2068 page_promote_err++; 2069 while (pplist) { 2070 pp = pplist; 2071 mach_page_sub(&pplist, pp); 2072 pp->p_szc = 0; 2073 bin = PP_2_BIN(pp); 2074 mtype = PP_2_MTYPE(pp); 2075 mach_page_add(&PAGE_FREELISTS(mnode, 0, bin, mtype), pp); 2076 page_ctr_add(mnode, mtype, pp, PG_FREE_LIST); 2077 } 2078 return (NULL); 2079 2080 } 2081 2082 /* 2083 * Break up a large page into smaller size pages. 2084 * Pages involved are on the freelist before the call and may 2085 * be returned to the caller if requested, otherwise they will 2086 * be placed back on the freelist. 2087 * The caller is responsible for locking the freelist as well as any other 2088 * accounting which needs to be done for a returned page. 
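 *
 * In outline: the cur_szc page is pulled off its freelist and then
 * carved with page_list_break() into new_szc sized chunks; each chunk
 * either goes back onto the new_szc freelist or, if it has the
 * requested color and PC_ALLOC is set, is locked and returned (at most
 * one chunk is returned this way).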
2089 * If flags is not PC_ALLOC, the color argument is ignored, and thus 2090 * technically, any value may be passed in but PC_NO_COLOR is the standard 2091 * which should be followed for clarity's sake. 2092 */ 2093 page_t * 2094 page_demote(int mnode, pfn_t pfnum, uchar_t cur_szc, uchar_t new_szc, 2095 int color, int flags) 2096 { 2097 page_t *pp, *pplist, *npplist; 2098 pgcnt_t npgs, n; 2099 uint_t bin; 2100 uint_t mtype; 2101 page_t *ret_pp = NULL; 2102 2103 ASSERT(cur_szc != 0); 2104 ASSERT(new_szc < cur_szc); 2105 2106 pplist = page_numtopp_nolock(pfnum); 2107 ASSERT(pplist != NULL); 2108 2109 ASSERT(pplist->p_szc == cur_szc); 2110 2111 bin = PP_2_BIN(pplist); 2112 ASSERT(mnode == PP_2_MEM_NODE(pplist)); 2113 mtype = PP_2_MTYPE(pplist); 2114 page_vpsub(&PAGE_FREELISTS(mnode, cur_szc, bin, mtype), pplist); 2115 2116 CHK_LPG(pplist, cur_szc); 2117 page_ctr_sub(mnode, mtype, pplist, PG_FREE_LIST); 2118 2119 /* 2120 * Number of PAGESIZE pages for smaller new_szc 2121 * page. 2122 */ 2123 npgs = page_get_pagecnt(new_szc); 2124 2125 while (pplist) { 2126 pp = pplist; 2127 2128 ASSERT(pp->p_szc == cur_szc); 2129 2130 /* 2131 * We either break it up into PAGESIZE pages or larger. 2132 */ 2133 if (npgs == 1) { /* PAGESIZE case */ 2134 mach_page_sub(&pplist, pp); 2135 ASSERT(pp->p_szc == cur_szc); 2136 ASSERT(new_szc == 0); 2137 ASSERT(mnode == PP_2_MEM_NODE(pp)); 2138 pp->p_szc = new_szc; 2139 bin = PP_2_BIN(pp); 2140 if ((bin == color) && (flags == PC_ALLOC) && 2141 (ret_pp == NULL) && 2142 page_trylock_cons(pp, SE_EXCL)) { 2143 ret_pp = pp; 2144 } else { 2145 mtype = PP_2_MTYPE(pp); 2146 mach_page_add(&PAGE_FREELISTS(mnode, 0, bin, 2147 mtype), pp); 2148 page_ctr_add(mnode, mtype, pp, PG_FREE_LIST); 2149 } 2150 } else { 2151 2152 /* 2153 * Break down into smaller lists of pages. 2154 */ 2155 page_list_break(&pplist, &npplist, npgs); 2156 2157 pp = pplist; 2158 n = npgs; 2159 while (n--) { 2160 ASSERT(pp->p_szc == cur_szc); 2161 pp->p_szc = new_szc; 2162 pp = pp->p_next; 2163 } 2164 2165 CHK_LPG(pplist, new_szc); 2166 2167 bin = PP_2_BIN(pplist); 2168 ASSERT(mnode == PP_2_MEM_NODE(pp)); 2169 if ((bin == color) && (flags == PC_ALLOC) && 2170 (ret_pp == NULL) && 2171 page_trylock_cons(pp, SE_EXCL)) { 2172 ret_pp = pp; 2173 } else { 2174 mtype = PP_2_MTYPE(pp); 2175 page_vpadd(&PAGE_FREELISTS(mnode, new_szc, 2176 bin, mtype), pplist); 2177 2178 page_ctr_add(mnode, mtype, pplist, 2179 PG_FREE_LIST); 2180 } 2181 pplist = npplist; 2182 } 2183 } 2184 return (ret_pp); 2185 } 2186 2187 int mpss_coalesce_disable = 0; 2188 2189 /* 2190 * Coalesce free pages into a page of the given szc and color if possible. 2191 * Return the pointer to the page created, otherwise, return NULL. 2192 * 2193 * If pfnhi is non-zero, search for large page with pfn range less than pfnhi. 
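 *
 * In outline, the search below proceeds roughly as follows: count the
 * candidate regions recorded in page_ctrs_cands for each equivalent
 * color (bailing out early if there are none), then walk the properly
 * colored, szc-aligned pfns in [lo, hi); whenever a region's
 * PAGE_COUNTERS value equals FULL_REGION_CNT(szc), take the freelist
 * lock, re-check the counter and page_promote() the region.  The walk
 * stops once all candidates have been visited.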
2194 */ 2195 page_t * 2196 page_freelist_coalesce(int mnode, uchar_t szc, uint_t color, uint_t ceq_mask, 2197 int mtype, pfn_t pfnhi) 2198 { 2199 int r = szc; /* region size */ 2200 int mrange; 2201 uint_t full, bin, color_mask, wrap = 0; 2202 pfn_t pfnum, lo, hi; 2203 size_t len, idx, idx0; 2204 pgcnt_t cands = 0, szcpgcnt = page_get_pagecnt(szc); 2205 page_t *ret_pp; 2206 MEM_NODE_ITERATOR_DECL(it); 2207 #if defined(__sparc) 2208 pfn_t pfnum0, nlo, nhi; 2209 #endif 2210 2211 if (mpss_coalesce_disable) { 2212 ASSERT(szc < MMU_PAGE_SIZES); 2213 VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[szc][0]); 2214 return (NULL); 2215 } 2216 2217 ASSERT(szc < mmu_page_sizes); 2218 color_mask = PAGE_GET_PAGECOLORS(szc) - 1; 2219 ASSERT(ceq_mask <= color_mask); 2220 ASSERT(color <= color_mask); 2221 color &= ceq_mask; 2222 2223 /* Prevent page_counters dynamic memory from being freed */ 2224 rw_enter(&page_ctrs_rwlock[mnode], RW_READER); 2225 2226 mrange = MTYPE_2_MRANGE(mnode, mtype); 2227 ASSERT(mrange < mnode_nranges[mnode]); 2228 VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[r][mrange]); 2229 2230 /* get pfn range for mtype */ 2231 len = PAGE_COUNTERS_ENTRIES(mnode, r); 2232 #if defined(__sparc) 2233 lo = PAGE_COUNTERS_BASE(mnode, r); 2234 hi = IDX_TO_PNUM(mnode, r, len); 2235 #else 2236 MNODETYPE_2_PFN(mnode, mtype, lo, hi); 2237 hi++; 2238 #endif 2239 2240 /* use lower limit if given */ 2241 if (pfnhi != PFNNULL && pfnhi < hi) 2242 hi = pfnhi; 2243 2244 /* round to szcpgcnt boundaries */ 2245 lo = P2ROUNDUP(lo, szcpgcnt); 2246 MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it); 2247 if (lo == (pfn_t)-1) { 2248 rw_exit(&page_ctrs_rwlock[mnode]); 2249 return (NULL); 2250 } 2251 hi = hi & ~(szcpgcnt - 1); 2252 2253 /* set lo to the closest pfn of the right color */ 2254 if (((PFN_2_COLOR(lo, szc, &it) ^ color) & ceq_mask) || 2255 (interleaved_mnodes && PFN_2_MEM_NODE(lo) != mnode)) { 2256 PAGE_NEXT_PFN_FOR_COLOR(lo, szc, color, ceq_mask, color_mask, 2257 &it); 2258 } 2259 2260 if (hi <= lo) { 2261 rw_exit(&page_ctrs_rwlock[mnode]); 2262 return (NULL); 2263 } 2264 2265 full = FULL_REGION_CNT(r); 2266 2267 /* calculate the number of page candidates and initial search index */ 2268 bin = color; 2269 idx0 = (size_t)(-1); 2270 do { 2271 pgcnt_t acand; 2272 2273 PGCTRS_CANDS_GETVALUECOLOR(mnode, mrange, r, bin, acand); 2274 if (acand) { 2275 idx = PAGE_COUNTERS_CURRENT_COLOR(mnode, 2276 r, bin, mrange); 2277 idx0 = MIN(idx0, idx); 2278 cands += acand; 2279 } 2280 bin = ADD_MASKED(bin, 1, ceq_mask, color_mask); 2281 } while (bin != color); 2282 2283 if (cands == 0) { 2284 VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip[r][mrange]); 2285 rw_exit(&page_ctrs_rwlock[mnode]); 2286 return (NULL); 2287 } 2288 2289 pfnum = IDX_TO_PNUM(mnode, r, idx0); 2290 if (pfnum < lo || pfnum >= hi) { 2291 pfnum = lo; 2292 } else { 2293 MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it); 2294 if (pfnum == (pfn_t)-1) { 2295 pfnum = lo; 2296 MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it); 2297 ASSERT(pfnum != (pfn_t)-1); 2298 } else if ((PFN_2_COLOR(pfnum, szc, &it) ^ color) & ceq_mask || 2299 (interleaved_mnodes && PFN_2_MEM_NODE(pfnum) != mnode)) { 2300 /* invalid color, get the closest correct pfn */ 2301 PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask, 2302 color_mask, &it); 2303 if (pfnum >= hi) { 2304 pfnum = lo; 2305 MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it); 2306 } 2307 } 2308 } 2309 2310 /* set starting index */ 2311 idx0 = PNUM_TO_IDX(mnode, r, pfnum); 2312 ASSERT(idx0 < len); 2313 2314 #if defined(__sparc) 2315 pfnum0 = pfnum; 
/* page corresponding to idx0 */ 2316 nhi = 0; /* search kcage ranges */ 2317 #endif 2318 2319 for (idx = idx0; wrap == 0 || (idx < idx0 && wrap < 2); ) { 2320 2321 #if defined(__sparc) 2322 /* 2323 * Find lowest intersection of kcage ranges and mnode. 2324 * MTYPE_NORELOC means look in the cage, otherwise outside. 2325 */ 2326 if (nhi <= pfnum) { 2327 if (kcage_next_range(mtype == MTYPE_NORELOC, pfnum, 2328 (wrap == 0 ? hi : pfnum0), &nlo, &nhi)) 2329 goto wrapit; 2330 2331 /* jump to the next page in the range */ 2332 if (pfnum < nlo) { 2333 pfnum = P2ROUNDUP(nlo, szcpgcnt); 2334 MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it); 2335 idx = PNUM_TO_IDX(mnode, r, pfnum); 2336 if (idx >= len || pfnum >= hi) 2337 goto wrapit; 2338 if ((PFN_2_COLOR(pfnum, szc, &it) ^ color) & 2339 ceq_mask) 2340 goto next; 2341 if (interleaved_mnodes && 2342 PFN_2_MEM_NODE(pfnum) != mnode) 2343 goto next; 2344 } 2345 } 2346 #endif 2347 2348 if (PAGE_COUNTERS(mnode, r, idx) != full) 2349 goto next; 2350 2351 /* 2352 * RFE: For performance maybe we can do something less 2353 * brutal than locking the entire freelist. So far 2354 * this doesn't seem to be a performance problem? 2355 */ 2356 page_freelist_lock(mnode); 2357 if (PAGE_COUNTERS(mnode, r, idx) == full) { 2358 ret_pp = 2359 page_promote(mnode, pfnum, r, PC_ALLOC, mtype); 2360 if (ret_pp != NULL) { 2361 VM_STAT_ADD(vmm_vmstats.pfc_coalok[r][mrange]); 2362 PAGE_COUNTERS_CURRENT_COLOR(mnode, r, 2363 PFN_2_COLOR(pfnum, szc, &it), mrange) = idx; 2364 page_freelist_unlock(mnode); 2365 rw_exit(&page_ctrs_rwlock[mnode]); 2366 #if defined(__sparc) 2367 if (PP_ISNORELOC(ret_pp)) { 2368 pgcnt_t npgs; 2369 2370 npgs = page_get_pagecnt(ret_pp->p_szc); 2371 kcage_freemem_sub(npgs); 2372 } 2373 #endif 2374 return (ret_pp); 2375 } 2376 } else { 2377 VM_STAT_ADD(vmm_vmstats.page_ctrs_changed[r][mrange]); 2378 } 2379 2380 page_freelist_unlock(mnode); 2381 /* 2382 * No point looking for another page if we've 2383 * already tried all of the ones that 2384 * page_ctr_cands indicated. Stash off where we left 2385 * off. 2386 * Note: this is not exact since we don't hold the 2387 * page_freelist_locks before we initially get the 2388 * value of cands for performance reasons, but should 2389 * be a decent approximation. 2390 */ 2391 if (--cands == 0) { 2392 PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color, mrange) = 2393 idx; 2394 break; 2395 } 2396 next: 2397 PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask, 2398 color_mask, &it); 2399 idx = PNUM_TO_IDX(mnode, r, pfnum); 2400 if (idx >= len || pfnum >= hi) { 2401 wrapit: 2402 pfnum = lo; 2403 MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it); 2404 idx = PNUM_TO_IDX(mnode, r, pfnum); 2405 wrap++; 2406 #if defined(__sparc) 2407 nhi = 0; /* search kcage ranges */ 2408 #endif 2409 } 2410 } 2411 2412 rw_exit(&page_ctrs_rwlock[mnode]); 2413 VM_STAT_ADD(vmm_vmstats.page_ctrs_failed[r][mrange]); 2414 return (NULL); 2415 } 2416 2417 /* 2418 * For the given mnode, promote as many small pages to large pages as possible. 2419 * mnode can be -1, which means do them all 2420 */ 2421 void 2422 page_freelist_coalesce_all(int mnode) 2423 { 2424 int r; /* region size */ 2425 int idx, full; 2426 size_t len; 2427 int doall = interleaved_mnodes || mnode < 0; 2428 int mlo = doall ? 0 : mnode; 2429 int mhi = doall ? max_mem_nodes : (mnode + 1); 2430 2431 VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce_all); 2432 2433 if (mpss_coalesce_disable) { 2434 return; 2435 } 2436 2437 /* 2438 * Lock the entire freelist and coalesce what we can. 
2439 * 2440 * Always promote to the largest page possible 2441 * first to reduce the number of page promotions. 2442 */ 2443 for (mnode = mlo; mnode < mhi; mnode++) { 2444 rw_enter(&page_ctrs_rwlock[mnode], RW_READER); 2445 page_freelist_lock(mnode); 2446 } 2447 for (r = mmu_page_sizes - 1; r > 0; r--) { 2448 for (mnode = mlo; mnode < mhi; mnode++) { 2449 pgcnt_t cands = 0; 2450 int mrange, nranges = mnode_nranges[mnode]; 2451 2452 for (mrange = 0; mrange < nranges; mrange++) { 2453 PGCTRS_CANDS_GETVALUE(mnode, mrange, r, cands); 2454 if (cands != 0) 2455 break; 2456 } 2457 if (cands == 0) { 2458 VM_STAT_ADD(vmm_vmstats. 2459 page_ctrs_cands_skip_all); 2460 continue; 2461 } 2462 2463 full = FULL_REGION_CNT(r); 2464 len = PAGE_COUNTERS_ENTRIES(mnode, r); 2465 2466 for (idx = 0; idx < len; idx++) { 2467 if (PAGE_COUNTERS(mnode, r, idx) == full) { 2468 pfn_t pfnum = 2469 IDX_TO_PNUM(mnode, r, idx); 2470 int tmnode = interleaved_mnodes ? 2471 PFN_2_MEM_NODE(pfnum) : mnode; 2472 2473 ASSERT(pfnum >= 2474 mem_node_config[tmnode].physbase && 2475 pfnum < 2476 mem_node_config[tmnode].physmax); 2477 2478 (void) page_promote(tmnode, 2479 pfnum, r, PC_FREE, PC_MTYPE_ANY); 2480 } 2481 } 2482 /* shared hpm_counters covers all mnodes, so we quit */ 2483 if (interleaved_mnodes) 2484 break; 2485 } 2486 } 2487 for (mnode = mlo; mnode < mhi; mnode++) { 2488 page_freelist_unlock(mnode); 2489 rw_exit(&page_ctrs_rwlock[mnode]); 2490 } 2491 } 2492 2493 /* 2494 * This is where all polices for moving pages around 2495 * to different page size free lists is implemented. 2496 * Returns 1 on success, 0 on failure. 2497 * 2498 * So far these are the priorities for this algorithm in descending 2499 * order: 2500 * 2501 * 1) When servicing a request try to do so with a free page 2502 * from next size up. Helps defer fragmentation as long 2503 * as possible. 2504 * 2505 * 2) Page coalesce on demand. Only when a freelist 2506 * larger than PAGESIZE is empty and step 1 2507 * will not work since all larger size lists are 2508 * also empty. 2509 * 2510 * If pfnhi is non-zero, search for large page with pfn range less than pfnhi. 2511 */ 2512 2513 page_t * 2514 page_freelist_split(uchar_t szc, uint_t color, int mnode, int mtype, 2515 pfn_t pfnhi, page_list_walker_t *plw) 2516 { 2517 uchar_t nszc = szc + 1; 2518 uint_t bin, sbin, bin_prev; 2519 page_t *pp, *firstpp; 2520 page_t *ret_pp = NULL; 2521 uint_t color_mask; 2522 2523 if (nszc == mmu_page_sizes) 2524 return (NULL); 2525 2526 ASSERT(nszc < mmu_page_sizes); 2527 color_mask = PAGE_GET_PAGECOLORS(nszc) - 1; 2528 bin = sbin = PAGE_GET_NSZ_COLOR(szc, color); 2529 bin_prev = (plw->plw_bin_split_prev == color) ? INVALID_COLOR : 2530 PAGE_GET_NSZ_COLOR(szc, plw->plw_bin_split_prev); 2531 2532 VM_STAT_ADD(vmm_vmstats.pfs_req[szc]); 2533 /* 2534 * First try to break up a larger page to fill current size freelist. 2535 */ 2536 while (plw->plw_bins[nszc] != 0) { 2537 2538 ASSERT(nszc < mmu_page_sizes); 2539 2540 /* 2541 * If page found then demote it. 2542 */ 2543 if (PAGE_FREELISTS(mnode, nszc, bin, mtype)) { 2544 page_freelist_lock(mnode); 2545 firstpp = pp = PAGE_FREELISTS(mnode, nszc, bin, mtype); 2546 2547 /* 2548 * If pfnhi is not PFNNULL, look for large page below 2549 * pfnhi. PFNNULL signifies no pfn requirement. 
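			 * The list is walked via p_vpnext until a page below
			 * pfnhi is found; if the walk wraps back around to
			 * the first page, no suitable page exists in this bin.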
2550 */ 2551 if (pfnhi != PFNNULL && pp->p_pagenum >= pfnhi) { 2552 do { 2553 pp = pp->p_vpnext; 2554 if (pp == firstpp) { 2555 pp = NULL; 2556 break; 2557 } 2558 } while (pp->p_pagenum >= pfnhi); 2559 } 2560 if (pp) { 2561 uint_t ccolor = page_correct_color(szc, nszc, 2562 color, bin, plw->plw_ceq_mask[szc]); 2563 2564 ASSERT(pp->p_szc == nszc); 2565 VM_STAT_ADD(vmm_vmstats.pfs_demote[nszc]); 2566 ret_pp = page_demote(mnode, pp->p_pagenum, 2567 pp->p_szc, szc, ccolor, PC_ALLOC); 2568 if (ret_pp) { 2569 page_freelist_unlock(mnode); 2570 #if defined(__sparc) 2571 if (PP_ISNORELOC(ret_pp)) { 2572 pgcnt_t npgs; 2573 2574 npgs = page_get_pagecnt( 2575 ret_pp->p_szc); 2576 kcage_freemem_sub(npgs); 2577 } 2578 #endif 2579 return (ret_pp); 2580 } 2581 } 2582 page_freelist_unlock(mnode); 2583 } 2584 2585 /* loop through next size bins */ 2586 bin = ADD_MASKED(bin, 1, plw->plw_ceq_mask[nszc], color_mask); 2587 plw->plw_bins[nszc]--; 2588 2589 if (bin == sbin) { 2590 uchar_t nnszc = nszc + 1; 2591 2592 /* we are done with this page size - check next */ 2593 if (plw->plw_bins[nnszc] == 0) 2594 /* we have already checked next size bins */ 2595 break; 2596 2597 bin = sbin = PAGE_GET_NSZ_COLOR(nszc, bin); 2598 if (bin_prev != INVALID_COLOR) { 2599 bin_prev = PAGE_GET_NSZ_COLOR(nszc, bin_prev); 2600 if (!((bin ^ bin_prev) & 2601 plw->plw_ceq_mask[nnszc])) 2602 break; 2603 } 2604 ASSERT(nnszc < mmu_page_sizes); 2605 color_mask = PAGE_GET_PAGECOLORS(nnszc) - 1; 2606 nszc = nnszc; 2607 ASSERT(nszc < mmu_page_sizes); 2608 } 2609 } 2610 2611 return (ret_pp); 2612 } 2613 2614 /* 2615 * Helper routine used only by the freelist code to lock 2616 * a page. If the page is a large page then it succeeds in 2617 * locking all the constituent pages or none at all. 2618 * Returns 1 on sucess, 0 on failure. 2619 */ 2620 static int 2621 page_trylock_cons(page_t *pp, se_t se) 2622 { 2623 page_t *tpp, *first_pp = pp; 2624 2625 /* 2626 * Fail if can't lock first or only page. 2627 */ 2628 if (!page_trylock(pp, se)) { 2629 return (0); 2630 } 2631 2632 /* 2633 * PAGESIZE: common case. 2634 */ 2635 if (pp->p_szc == 0) { 2636 return (1); 2637 } 2638 2639 /* 2640 * Large page case. 2641 */ 2642 tpp = pp->p_next; 2643 while (tpp != pp) { 2644 if (!page_trylock(tpp, se)) { 2645 /* 2646 * On failure unlock what we have locked so far. 2647 * We want to avoid attempting to capture these 2648 * pages as the pcm mutex may be held which could 2649 * lead to a recursive mutex panic. 2650 */ 2651 while (first_pp != tpp) { 2652 page_unlock_nocapture(first_pp); 2653 first_pp = first_pp->p_next; 2654 } 2655 return (0); 2656 } 2657 tpp = tpp->p_next; 2658 } 2659 return (1); 2660 } 2661 2662 /* 2663 * init context for walking page lists 2664 * Called when a page of the given szc in unavailable. Sets markers 2665 * for the beginning of the search to detect when search has 2666 * completed a full cycle. Sets flags for splitting larger pages 2667 * and coalescing smaller pages. Page walking procedes until a page 2668 * of the desired equivalent color is found. 2669 */ 2670 void 2671 page_list_walk_init(uchar_t szc, uint_t flags, uint_t bin, int can_split, 2672 int use_ceq, page_list_walker_t *plw) 2673 { 2674 uint_t nszc, ceq_mask, colors; 2675 uchar_t ceq = use_ceq ? 
colorequivszc[szc] : 0; 2676 2677 ASSERT(szc < mmu_page_sizes); 2678 colors = PAGE_GET_PAGECOLORS(szc); 2679 2680 plw->plw_colors = colors; 2681 plw->plw_color_mask = colors - 1; 2682 plw->plw_bin_marker = plw->plw_bin0 = bin; 2683 plw->plw_bin_split_prev = bin; 2684 plw->plw_bin_step = (szc == 0) ? vac_colors : 1; 2685 2686 /* 2687 * if vac aliasing is possible make sure lower order color 2688 * bits are never ignored 2689 */ 2690 if (vac_colors > 1) 2691 ceq &= 0xf0; 2692 2693 /* 2694 * calculate the number of non-equivalent colors and 2695 * color equivalency mask 2696 */ 2697 plw->plw_ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf)); 2698 ASSERT(szc > 0 || plw->plw_ceq_dif >= vac_colors); 2699 ASSERT(plw->plw_ceq_dif > 0); 2700 plw->plw_ceq_mask[szc] = (plw->plw_ceq_dif - 1) << (ceq & 0xf); 2701 2702 if (flags & PG_MATCH_COLOR) { 2703 if (cpu_page_colors < 0) { 2704 /* 2705 * this is a heterogeneous machine with different CPUs 2706 * having different size e$ (not supported for ni2/rock 2707 */ 2708 uint_t cpucolors = CPUSETSIZE() >> PAGE_GET_SHIFT(szc); 2709 cpucolors = MAX(cpucolors, 1); 2710 ceq_mask = plw->plw_color_mask & (cpucolors - 1); 2711 plw->plw_ceq_mask[szc] = 2712 MIN(ceq_mask, plw->plw_ceq_mask[szc]); 2713 } 2714 plw->plw_ceq_dif = 1; 2715 } 2716 2717 /* we can split pages in the freelist, but not the cachelist */ 2718 if (can_split) { 2719 plw->plw_do_split = (szc + 1 < mmu_page_sizes) ? 1 : 0; 2720 2721 /* set next szc color masks and number of free list bins */ 2722 for (nszc = szc + 1; nszc < mmu_page_sizes; nszc++, szc++) { 2723 plw->plw_ceq_mask[nszc] = PAGE_GET_NSZ_MASK(szc, 2724 plw->plw_ceq_mask[szc]); 2725 plw->plw_bins[nszc] = PAGE_GET_PAGECOLORS(nszc); 2726 } 2727 plw->plw_ceq_mask[nszc] = INVALID_MASK; 2728 plw->plw_bins[nszc] = 0; 2729 2730 } else { 2731 ASSERT(szc == 0); 2732 plw->plw_do_split = 0; 2733 plw->plw_bins[1] = 0; 2734 plw->plw_ceq_mask[1] = INVALID_MASK; 2735 } 2736 } 2737 2738 /* 2739 * set mark to flag where next split should occur 2740 */ 2741 #define PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw) { \ 2742 uint_t bin_nsz = PAGE_GET_NSZ_COLOR(szc, bin); \ 2743 uint_t bin0_nsz = PAGE_GET_NSZ_COLOR(szc, plw->plw_bin0); \ 2744 uint_t neq_mask = ~plw->plw_ceq_mask[nszc] & plw->plw_color_mask; \ 2745 plw->plw_split_next = \ 2746 INC_MASKED(bin_nsz, neq_mask, plw->plw_color_mask); \ 2747 if (!((plw->plw_split_next ^ bin0_nsz) & plw->plw_ceq_mask[nszc])) { \ 2748 plw->plw_split_next = \ 2749 INC_MASKED(plw->plw_split_next, \ 2750 neq_mask, plw->plw_color_mask); \ 2751 } \ 2752 } 2753 2754 uint_t 2755 page_list_walk_next_bin(uchar_t szc, uint_t bin, page_list_walker_t *plw) 2756 { 2757 uint_t neq_mask = ~plw->plw_ceq_mask[szc] & plw->plw_color_mask; 2758 uint_t bin0_nsz, nbin_nsz, nbin0, nbin; 2759 uchar_t nszc = szc + 1; 2760 2761 nbin = ADD_MASKED(bin, 2762 plw->plw_bin_step, neq_mask, plw->plw_color_mask); 2763 2764 if (plw->plw_do_split) { 2765 plw->plw_bin_split_prev = bin; 2766 PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw); 2767 plw->plw_do_split = 0; 2768 } 2769 2770 if (szc == 0) { 2771 if (plw->plw_count != 0 || plw->plw_ceq_dif == vac_colors) { 2772 if (nbin == plw->plw_bin0 && 2773 (vac_colors == 1 || nbin != plw->plw_bin_marker)) { 2774 nbin = ADD_MASKED(nbin, plw->plw_bin_step, 2775 neq_mask, plw->plw_color_mask); 2776 plw->plw_bin_split_prev = plw->plw_bin0; 2777 } 2778 2779 if (vac_colors > 1 && nbin == plw->plw_bin_marker) { 2780 plw->plw_bin_marker = 2781 nbin = INC_MASKED(nbin, neq_mask, 2782 plw->plw_color_mask); 2783 
plw->plw_bin_split_prev = plw->plw_bin0; 2784 /* 2785 * large pages all have the same vac color 2786 * so by now we should be done with next 2787 * size page splitting process 2788 */ 2789 ASSERT(plw->plw_bins[1] == 0); 2790 plw->plw_do_split = 0; 2791 return (nbin); 2792 } 2793 2794 } else { 2795 uint_t bin_jump = (vac_colors == 1) ? 2796 (BIN_STEP & ~3) - (plw->plw_bin0 & 3) : BIN_STEP; 2797 2798 bin_jump &= ~(vac_colors - 1); 2799 2800 nbin0 = ADD_MASKED(plw->plw_bin0, bin_jump, neq_mask, 2801 plw->plw_color_mask); 2802 2803 if ((nbin0 ^ plw->plw_bin0) & plw->plw_ceq_mask[szc]) { 2804 2805 plw->plw_bin_marker = nbin = nbin0; 2806 2807 if (plw->plw_bins[nszc] != 0) { 2808 /* 2809 * check if next page size bin is the 2810 * same as the next page size bin for 2811 * bin0 2812 */ 2813 nbin_nsz = PAGE_GET_NSZ_COLOR(szc, 2814 nbin); 2815 bin0_nsz = PAGE_GET_NSZ_COLOR(szc, 2816 plw->plw_bin0); 2817 2818 if ((bin0_nsz ^ nbin_nsz) & 2819 plw->plw_ceq_mask[nszc]) 2820 plw->plw_do_split = 1; 2821 } 2822 return (nbin); 2823 } 2824 } 2825 } 2826 2827 if (plw->plw_bins[nszc] != 0) { 2828 nbin_nsz = PAGE_GET_NSZ_COLOR(szc, nbin); 2829 if (!((plw->plw_split_next ^ nbin_nsz) & 2830 plw->plw_ceq_mask[nszc])) 2831 plw->plw_do_split = 1; 2832 } 2833 2834 return (nbin); 2835 } 2836 2837 page_t * 2838 page_get_mnode_freelist(int mnode, uint_t bin, int mtype, uchar_t szc, 2839 uint_t flags) 2840 { 2841 kmutex_t *pcm; 2842 page_t *pp, *first_pp; 2843 uint_t sbin; 2844 int plw_initialized; 2845 page_list_walker_t plw; 2846 2847 ASSERT(szc < mmu_page_sizes); 2848 2849 VM_STAT_ADD(vmm_vmstats.pgmf_alloc[szc]); 2850 2851 MTYPE_START(mnode, mtype, flags); 2852 if (mtype < 0) { /* mnode does not have memory in mtype range */ 2853 VM_STAT_ADD(vmm_vmstats.pgmf_allocempty[szc]); 2854 return (NULL); 2855 } 2856 try_again: 2857 2858 plw_initialized = 0; 2859 plw.plw_ceq_dif = 1; 2860 2861 /* 2862 * Only hold one freelist lock at a time, that way we 2863 * can start anywhere and not have to worry about lock 2864 * ordering. 2865 */ 2866 for (plw.plw_count = 0; 2867 plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) { 2868 sbin = bin; 2869 do { 2870 if (!PAGE_FREELISTS(mnode, szc, bin, mtype)) 2871 goto bin_empty_1; 2872 2873 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST); 2874 mutex_enter(pcm); 2875 pp = PAGE_FREELISTS(mnode, szc, bin, mtype); 2876 if (pp == NULL) 2877 goto bin_empty_0; 2878 2879 /* 2880 * These were set before the page 2881 * was put on the free list, 2882 * they must still be set. 2883 */ 2884 ASSERT(PP_ISFREE(pp)); 2885 ASSERT(PP_ISAGED(pp)); 2886 ASSERT(pp->p_vnode == NULL); 2887 ASSERT(pp->p_hash == NULL); 2888 ASSERT(pp->p_offset == (u_offset_t)-1); 2889 ASSERT(pp->p_szc == szc); 2890 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode); 2891 2892 /* 2893 * Walk down the hash chain. 2894 * 8k pages are linked on p_next 2895 * and p_prev fields. Large pages 2896 * are a contiguous group of 2897 * constituent pages linked together 2898 * on their p_next and p_prev fields. 2899 * The large pages are linked together 2900 * on the hash chain using p_vpnext 2901 * p_vpprev of the base constituent 2902 * page of each large page. 
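			 *
			 * As an illustration: three 8k pages A, B and C in a
			 * bin form a simple ring A-B-C through p_next/p_prev.
			 * Two large pages are each a ring of their own
			 * constituent pages on p_next/p_prev, while the large
			 * pages themselves are chained to one another through
			 * the p_vpnext/p_vpprev fields of their base
			 * constituent pages.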
2903 */ 2904 first_pp = pp; 2905 while (!page_trylock_cons(pp, SE_EXCL)) { 2906 if (szc == 0) { 2907 pp = pp->p_next; 2908 } else { 2909 pp = pp->p_vpnext; 2910 } 2911 2912 ASSERT(PP_ISFREE(pp)); 2913 ASSERT(PP_ISAGED(pp)); 2914 ASSERT(pp->p_vnode == NULL); 2915 ASSERT(pp->p_hash == NULL); 2916 ASSERT(pp->p_offset == (u_offset_t)-1); 2917 ASSERT(pp->p_szc == szc); 2918 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode); 2919 2920 if (pp == first_pp) 2921 goto bin_empty_0; 2922 } 2923 2924 ASSERT(pp != NULL); 2925 ASSERT(mtype == PP_2_MTYPE(pp)); 2926 ASSERT(pp->p_szc == szc); 2927 if (szc == 0) { 2928 page_sub(&PAGE_FREELISTS(mnode, 2929 szc, bin, mtype), pp); 2930 } else { 2931 page_vpsub(&PAGE_FREELISTS(mnode, 2932 szc, bin, mtype), pp); 2933 CHK_LPG(pp, szc); 2934 } 2935 page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST); 2936 2937 if ((PP_ISFREE(pp) == 0) || (PP_ISAGED(pp) == 0)) 2938 panic("free page is not. pp %p", (void *)pp); 2939 mutex_exit(pcm); 2940 2941 #if defined(__sparc) 2942 ASSERT(!kcage_on || PP_ISNORELOC(pp) || 2943 (flags & PG_NORELOC) == 0); 2944 2945 if (PP_ISNORELOC(pp)) 2946 kcage_freemem_sub(page_get_pagecnt(szc)); 2947 #endif 2948 VM_STAT_ADD(vmm_vmstats.pgmf_allocok[szc]); 2949 return (pp); 2950 2951 bin_empty_0: 2952 mutex_exit(pcm); 2953 bin_empty_1: 2954 if (plw_initialized == 0) { 2955 page_list_walk_init(szc, flags, bin, 1, 1, 2956 &plw); 2957 plw_initialized = 1; 2958 ASSERT(plw.plw_colors <= 2959 PAGE_GET_PAGECOLORS(szc)); 2960 ASSERT(plw.plw_colors > 0); 2961 ASSERT((plw.plw_colors & 2962 (plw.plw_colors - 1)) == 0); 2963 ASSERT(bin < plw.plw_colors); 2964 ASSERT(plw.plw_ceq_mask[szc] < plw.plw_colors); 2965 } 2966 /* calculate the next bin with equivalent color */ 2967 bin = ADD_MASKED(bin, plw.plw_bin_step, 2968 plw.plw_ceq_mask[szc], plw.plw_color_mask); 2969 } while (sbin != bin); 2970 2971 /* 2972 * color bins are all empty if color match. Try and 2973 * satisfy the request by breaking up or coalescing 2974 * pages from a different size freelist of the correct 2975 * color that satisfies the ORIGINAL color requested. 2976 * If that fails then try pages of the same size but 2977 * different colors assuming we are not called with 2978 * PG_MATCH_COLOR. 2979 */ 2980 if (plw.plw_do_split && 2981 (pp = page_freelist_split(szc, bin, mnode, 2982 mtype, PFNNULL, &plw)) != NULL) 2983 return (pp); 2984 2985 if (szc > 0 && (pp = page_freelist_coalesce(mnode, szc, 2986 bin, plw.plw_ceq_mask[szc], mtype, PFNNULL)) != NULL) 2987 return (pp); 2988 2989 if (plw.plw_ceq_dif > 1) 2990 bin = page_list_walk_next_bin(szc, bin, &plw); 2991 } 2992 2993 /* if allowed, cycle through additional mtypes */ 2994 MTYPE_NEXT(mnode, mtype, flags); 2995 if (mtype >= 0) 2996 goto try_again; 2997 2998 VM_STAT_ADD(vmm_vmstats.pgmf_allocfailed[szc]); 2999 3000 return (NULL); 3001 } 3002 3003 /* 3004 * Returns the count of free pages for 'pp' with size code 'szc'. 3005 * Note: This function does not return an exact value as the page freelist 3006 * locks are not held and thus the values in the page_counters may be 3007 * changing as we walk through the data. 
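 *
 * For example: a counter value of cnt at region size r accounts for
 * cnt completely free regions of size (r - 1), i.e.
 * cnt << PNUM_SHIFT(r - 1) PAGESIZE pages.  Sub-regions that already
 * read full at a larger size are skipped at the smaller sizes so that
 * they are not counted twice.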
3008 */ 3009 static int 3010 page_freecnt(int mnode, page_t *pp, uchar_t szc) 3011 { 3012 pgcnt_t pgfree; 3013 pgcnt_t cnt; 3014 ssize_t r = szc; /* region size */ 3015 ssize_t idx; 3016 int i; 3017 int full, range; 3018 3019 /* Make sure pagenum passed in is aligned properly */ 3020 ASSERT((pp->p_pagenum & (PNUM_SIZE(szc) - 1)) == 0); 3021 ASSERT(szc > 0); 3022 3023 /* Prevent page_counters dynamic memory from being freed */ 3024 rw_enter(&page_ctrs_rwlock[mnode], RW_READER); 3025 idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum); 3026 cnt = PAGE_COUNTERS(mnode, r, idx); 3027 pgfree = cnt << PNUM_SHIFT(r - 1); 3028 range = FULL_REGION_CNT(szc); 3029 3030 /* Check for completely full region */ 3031 if (cnt == range) { 3032 rw_exit(&page_ctrs_rwlock[mnode]); 3033 return (pgfree); 3034 } 3035 3036 while (--r > 0) { 3037 idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum); 3038 full = FULL_REGION_CNT(r); 3039 for (i = 0; i < range; i++, idx++) { 3040 cnt = PAGE_COUNTERS(mnode, r, idx); 3041 /* 3042 * If cnt here is full, that means we have already 3043 * accounted for these pages earlier. 3044 */ 3045 if (cnt != full) { 3046 pgfree += (cnt << PNUM_SHIFT(r - 1)); 3047 } 3048 } 3049 range *= full; 3050 } 3051 rw_exit(&page_ctrs_rwlock[mnode]); 3052 return (pgfree); 3053 } 3054 3055 /* 3056 * Called from page_geti_contig_pages to exclusively lock constituent pages 3057 * starting from 'spp' for page size code 'szc'. 3058 * 3059 * If 'ptcpthreshold' is set, the number of free pages needed in the 'szc' 3060 * region needs to be greater than or equal to the threshold. 3061 */ 3062 static int 3063 page_trylock_contig_pages(int mnode, page_t *spp, uchar_t szc, int flags) 3064 { 3065 pgcnt_t pgcnt = PNUM_SIZE(szc); 3066 pgcnt_t pgfree, i; 3067 page_t *pp; 3068 3069 VM_STAT_ADD(vmm_vmstats.ptcp[szc]); 3070 3071 3072 if ((ptcpthreshold == 0) || (flags & PGI_PGCPHIPRI)) 3073 goto skipptcpcheck; 3074 /* 3075 * check if there are sufficient free pages available before attempting 3076 * to trylock. Count is approximate as page counters can change. 3077 */ 3078 pgfree = page_freecnt(mnode, spp, szc); 3079 3080 /* attempt to trylock if there are sufficient already free pages */ 3081 if (pgfree < pgcnt/ptcpthreshold) { 3082 VM_STAT_ADD(vmm_vmstats.ptcpfreethresh[szc]); 3083 return (0); 3084 } 3085 3086 skipptcpcheck: 3087 3088 for (i = 0; i < pgcnt; i++) { 3089 pp = &spp[i]; 3090 if (!page_trylock(pp, SE_EXCL)) { 3091 VM_STAT_ADD(vmm_vmstats.ptcpfailexcl[szc]); 3092 while (--i != (pgcnt_t)-1) { 3093 pp = &spp[i]; 3094 ASSERT(PAGE_EXCL(pp)); 3095 page_unlock_nocapture(pp); 3096 } 3097 return (0); 3098 } 3099 ASSERT(spp[i].p_pagenum == spp->p_pagenum + i); 3100 if ((pp->p_szc > szc || (szc && pp->p_szc == szc)) && 3101 !PP_ISFREE(pp)) { 3102 VM_STAT_ADD(vmm_vmstats.ptcpfailszc[szc]); 3103 ASSERT(i == 0); 3104 page_unlock_nocapture(pp); 3105 return (0); 3106 } 3107 if (PP_ISNORELOC(pp)) { 3108 VM_STAT_ADD(vmm_vmstats.ptcpfailcage[szc]); 3109 while (i != (pgcnt_t)-1) { 3110 pp = &spp[i]; 3111 ASSERT(PAGE_EXCL(pp)); 3112 page_unlock_nocapture(pp); 3113 i--; 3114 } 3115 return (0); 3116 } 3117 } 3118 VM_STAT_ADD(vmm_vmstats.ptcpok[szc]); 3119 return (1); 3120 } 3121 3122 /* 3123 * Claim large page pointed to by 'pp'. 'pp' is the starting set 3124 * of 'szc' constituent pages that had been locked exclusively previously. 3125 * Will attempt to relocate constituent pages in use. 
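 *
 * In outline: constituent pages that are already free are pulled
 * straight off the free or cache lists and concatenated onto the
 * result; pages that are in use get a replacement page via
 * page_get_replacement_page() and are moved with do_page_relocate().
 * If a replacement cannot be found or the relocation fails, everything
 * processed so far is returned to the freelist and NULL is returned.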
3126 */ 3127 static page_t * 3128 page_claim_contig_pages(page_t *pp, uchar_t szc, int flags) 3129 { 3130 spgcnt_t pgcnt, npgs, i; 3131 page_t *targpp, *rpp, *hpp; 3132 page_t *replpp = NULL; 3133 page_t *pplist = NULL; 3134 3135 ASSERT(pp != NULL); 3136 3137 pgcnt = page_get_pagecnt(szc); 3138 while (pgcnt) { 3139 ASSERT(PAGE_EXCL(pp)); 3140 ASSERT(!PP_ISNORELOC(pp)); 3141 if (PP_ISFREE(pp)) { 3142 /* 3143 * If this is a PG_FREE_LIST page then its 3144 * size code can change underneath us due to 3145 * page promotion or demotion. As an optimzation 3146 * use page_list_sub_pages() instead of 3147 * page_list_sub(). 3148 */ 3149 if (PP_ISAGED(pp)) { 3150 page_list_sub_pages(pp, szc); 3151 if (pp->p_szc == szc) { 3152 return (pp); 3153 } 3154 ASSERT(pp->p_szc < szc); 3155 npgs = page_get_pagecnt(pp->p_szc); 3156 hpp = pp; 3157 for (i = 0; i < npgs; i++, pp++) { 3158 pp->p_szc = szc; 3159 } 3160 page_list_concat(&pplist, &hpp); 3161 pgcnt -= npgs; 3162 continue; 3163 } 3164 ASSERT(!PP_ISAGED(pp)); 3165 ASSERT(pp->p_szc == 0); 3166 page_list_sub(pp, PG_CACHE_LIST); 3167 page_hashout(pp, NULL); 3168 PP_SETAGED(pp); 3169 pp->p_szc = szc; 3170 page_list_concat(&pplist, &pp); 3171 pp++; 3172 pgcnt--; 3173 continue; 3174 } 3175 npgs = page_get_pagecnt(pp->p_szc); 3176 3177 /* 3178 * page_create_wait freemem accounting done by caller of 3179 * page_get_freelist and not necessary to call it prior to 3180 * calling page_get_replacement_page. 3181 * 3182 * page_get_replacement_page can call page_get_contig_pages 3183 * to acquire a large page (szc > 0); the replacement must be 3184 * smaller than the contig page size to avoid looping or 3185 * szc == 0 and PGI_PGCPSZC0 is set. 3186 */ 3187 if (pp->p_szc < szc || (szc == 0 && (flags & PGI_PGCPSZC0))) { 3188 replpp = page_get_replacement_page(pp, NULL, 0); 3189 if (replpp) { 3190 npgs = page_get_pagecnt(pp->p_szc); 3191 ASSERT(npgs <= pgcnt); 3192 targpp = pp; 3193 } 3194 } 3195 3196 /* 3197 * If replacement is NULL or do_page_relocate fails, fail 3198 * coalescing of pages. 3199 */ 3200 if (replpp == NULL || (do_page_relocate(&targpp, &replpp, 0, 3201 &npgs, NULL) != 0)) { 3202 /* 3203 * Unlock un-processed target list 3204 */ 3205 while (pgcnt--) { 3206 ASSERT(PAGE_EXCL(pp)); 3207 page_unlock_nocapture(pp); 3208 pp++; 3209 } 3210 /* 3211 * Free the processed target list. 
3212 */ 3213 while (pplist) { 3214 pp = pplist; 3215 page_sub(&pplist, pp); 3216 ASSERT(PAGE_EXCL(pp)); 3217 ASSERT(pp->p_szc == szc); 3218 ASSERT(PP_ISFREE(pp)); 3219 ASSERT(PP_ISAGED(pp)); 3220 pp->p_szc = 0; 3221 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL); 3222 page_unlock_nocapture(pp); 3223 } 3224 3225 if (replpp != NULL) 3226 page_free_replacement_page(replpp); 3227 3228 return (NULL); 3229 } 3230 ASSERT(pp == targpp); 3231 3232 /* LINTED */ 3233 ASSERT(hpp = pp); /* That's right, it's an assignment */ 3234 3235 pp += npgs; 3236 pgcnt -= npgs; 3237 3238 while (npgs--) { 3239 ASSERT(PAGE_EXCL(targpp)); 3240 ASSERT(!PP_ISFREE(targpp)); 3241 ASSERT(!PP_ISNORELOC(targpp)); 3242 PP_SETFREE(targpp); 3243 ASSERT(PP_ISAGED(targpp)); 3244 ASSERT(targpp->p_szc < szc || (szc == 0 && 3245 (flags & PGI_PGCPSZC0))); 3246 targpp->p_szc = szc; 3247 targpp = targpp->p_next; 3248 3249 rpp = replpp; 3250 ASSERT(rpp != NULL); 3251 page_sub(&replpp, rpp); 3252 ASSERT(PAGE_EXCL(rpp)); 3253 ASSERT(!PP_ISFREE(rpp)); 3254 page_unlock_nocapture(rpp); 3255 } 3256 ASSERT(targpp == hpp); 3257 ASSERT(replpp == NULL); 3258 page_list_concat(&pplist, &targpp); 3259 } 3260 CHK_LPG(pplist, szc); 3261 return (pplist); 3262 } 3263 3264 /* 3265 * Trim kernel cage from pfnlo-pfnhi and store result in lo-hi. Return code 3266 * of 0 means nothing left after trim. 3267 */ 3268 int 3269 trimkcage(struct memseg *mseg, pfn_t *lo, pfn_t *hi, pfn_t pfnlo, pfn_t pfnhi) 3270 { 3271 pfn_t kcagepfn; 3272 int decr; 3273 int rc = 0; 3274 3275 if (PP_ISNORELOC(mseg->pages)) { 3276 if (PP_ISNORELOC(mseg->epages - 1) == 0) { 3277 3278 /* lower part of this mseg inside kernel cage */ 3279 decr = kcage_current_pfn(&kcagepfn); 3280 3281 /* kernel cage may have transitioned past mseg */ 3282 if (kcagepfn >= mseg->pages_base && 3283 kcagepfn < mseg->pages_end) { 3284 ASSERT(decr == 0); 3285 *lo = MAX(kcagepfn, pfnlo); 3286 *hi = MIN(pfnhi, (mseg->pages_end - 1)); 3287 rc = 1; 3288 } 3289 } 3290 /* else entire mseg in the cage */ 3291 } else { 3292 if (PP_ISNORELOC(mseg->epages - 1)) { 3293 3294 /* upper part of this mseg inside kernel cage */ 3295 decr = kcage_current_pfn(&kcagepfn); 3296 3297 /* kernel cage may have transitioned past mseg */ 3298 if (kcagepfn >= mseg->pages_base && 3299 kcagepfn < mseg->pages_end) { 3300 ASSERT(decr); 3301 *hi = MIN(kcagepfn, pfnhi); 3302 *lo = MAX(pfnlo, mseg->pages_base); 3303 rc = 1; 3304 } 3305 } else { 3306 /* entire mseg outside of kernel cage */ 3307 *lo = MAX(pfnlo, mseg->pages_base); 3308 *hi = MIN(pfnhi, (mseg->pages_end - 1)); 3309 rc = 1; 3310 } 3311 } 3312 return (rc); 3313 } 3314 3315 /* 3316 * called from page_get_contig_pages to search 'pfnlo' thru 'pfnhi' to claim a 3317 * page with size code 'szc'. Claiming such a page requires acquiring 3318 * exclusive locks on all constituent pages (page_trylock_contig_pages), 3319 * relocating pages in use and concatenating these constituent pages into a 3320 * large page. 3321 * 3322 * The page lists do not have such a large page and page_freelist_split has 3323 * already failed to demote larger pages and/or coalesce smaller free pages. 3324 * 3325 * 'flags' may specify PG_COLOR_MATCH which would limit the search of large 3326 * pages with the same color as 'bin'. 3327 * 3328 * 'pfnflag' specifies the subset of the pfn range to search. 
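 *
 * Roughly: for each memseg overlapping the (possibly trimmed) pfn
 * range, the kernel cage is trimmed off with trimkcage(), the range is
 * aligned to the large page size, a random properly colored starting
 * pfn is chosen, and candidates are tried with
 * page_trylock_contig_pages()/page_claim_contig_pages() until one
 * succeeds or the walk wraps back around to its starting point.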
3329 */ 3330 3331 static page_t * 3332 page_geti_contig_pages(int mnode, uint_t bin, uchar_t szc, int flags, 3333 pfn_t pfnlo, pfn_t pfnhi, pgcnt_t pfnflag) 3334 { 3335 struct memseg *mseg; 3336 pgcnt_t szcpgcnt = page_get_pagecnt(szc); 3337 pgcnt_t szcpgmask = szcpgcnt - 1; 3338 pfn_t randpfn; 3339 page_t *pp, *randpp, *endpp; 3340 uint_t colors, ceq_mask; 3341 /* LINTED : set but not used in function */ 3342 uint_t color_mask; 3343 pfn_t hi, lo; 3344 uint_t skip; 3345 MEM_NODE_ITERATOR_DECL(it); 3346 3347 ASSERT(szc != 0 || (flags & PGI_PGCPSZC0)); 3348 3349 pfnlo = P2ROUNDUP(pfnlo, szcpgcnt); 3350 3351 if ((pfnhi - pfnlo) + 1 < szcpgcnt || pfnlo >= pfnhi) 3352 return (NULL); 3353 3354 ASSERT(szc < mmu_page_sizes); 3355 3356 colors = PAGE_GET_PAGECOLORS(szc); 3357 color_mask = colors - 1; 3358 if ((colors > 1) && (flags & PG_MATCH_COLOR)) { 3359 uchar_t ceq = colorequivszc[szc]; 3360 uint_t ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf)); 3361 3362 ASSERT(ceq_dif > 0); 3363 ceq_mask = (ceq_dif - 1) << (ceq & 0xf); 3364 } else { 3365 ceq_mask = 0; 3366 } 3367 3368 ASSERT(bin < colors); 3369 3370 /* clear "non-significant" color bits */ 3371 bin &= ceq_mask; 3372 3373 /* 3374 * trim the pfn range to search based on pfnflag. pfnflag is set 3375 * when there have been previous page_get_contig_page failures to 3376 * limit the search. 3377 * 3378 * The high bit in pfnflag specifies the number of 'slots' in the 3379 * pfn range and the remainder of pfnflag specifies which slot. 3380 * For example, a value of 1010b would mean the second slot of 3381 * the pfn range that has been divided into 8 slots. 3382 */ 3383 if (pfnflag > 1) { 3384 int slots = 1 << (highbit(pfnflag) - 1); 3385 int slotid = pfnflag & (slots - 1); 3386 pgcnt_t szcpages; 3387 int slotlen; 3388 3389 pfnhi = P2ALIGN((pfnhi + 1), szcpgcnt) - 1; 3390 szcpages = ((pfnhi - pfnlo) + 1) / szcpgcnt; 3391 slotlen = howmany(szcpages, slots); 3392 /* skip if 'slotid' slot is empty */ 3393 if (slotid * slotlen >= szcpages) 3394 return (NULL); 3395 pfnlo = pfnlo + (((slotid * slotlen) % szcpages) * szcpgcnt); 3396 ASSERT(pfnlo < pfnhi); 3397 if (pfnhi > pfnlo + (slotlen * szcpgcnt)) 3398 pfnhi = pfnlo + (slotlen * szcpgcnt) - 1; 3399 } 3400 3401 memsegs_lock(0); 3402 3403 /* 3404 * loop through memsegs to look for contig page candidates 3405 */ 3406 3407 for (mseg = memsegs; mseg != NULL; mseg = mseg->next) { 3408 if (pfnhi < mseg->pages_base || pfnlo >= mseg->pages_end) { 3409 /* no overlap */ 3410 continue; 3411 } 3412 3413 if (mseg->pages_end - mseg->pages_base < szcpgcnt) 3414 /* mseg too small */ 3415 continue; 3416 3417 /* 3418 * trim off kernel cage pages from pfn range and check for 3419 * a trimmed pfn range returned that does not span the 3420 * desired large page size. 3421 */ 3422 if (kcage_on) { 3423 if (trimkcage(mseg, &lo, &hi, pfnlo, pfnhi) == 0 || 3424 lo >= hi || ((hi - lo) + 1) < szcpgcnt) 3425 continue; 3426 } else { 3427 lo = MAX(pfnlo, mseg->pages_base); 3428 hi = MIN(pfnhi, (mseg->pages_end - 1)); 3429 } 3430 3431 /* round to szcpgcnt boundaries */ 3432 lo = P2ROUNDUP(lo, szcpgcnt); 3433 3434 MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it); 3435 hi = P2ALIGN((hi + 1), szcpgcnt) - 1; 3436 3437 if (hi <= lo) 3438 continue; 3439 3440 /* 3441 * set lo to point to the pfn for the desired bin. 
Large 3442 * page sizes may only have a single page color 3443 */ 3444 skip = szcpgcnt; 3445 if (ceq_mask > 0 || interleaved_mnodes) { 3446 /* set lo to point at appropriate color */ 3447 if (((PFN_2_COLOR(lo, szc, &it) ^ bin) & ceq_mask) || 3448 (interleaved_mnodes && 3449 PFN_2_MEM_NODE(lo) != mnode)) { 3450 PAGE_NEXT_PFN_FOR_COLOR(lo, szc, bin, ceq_mask, 3451 color_mask, &it); 3452 } 3453 if (hi <= lo) 3454 /* mseg cannot satisfy color request */ 3455 continue; 3456 } 3457 3458 /* randomly choose a point between lo and hi to begin search */ 3459 3460 randpfn = (pfn_t)GETTICK(); 3461 randpfn = ((randpfn % (hi - lo)) + lo) & ~(skip - 1); 3462 MEM_NODE_ITERATOR_INIT(randpfn, mnode, szc, &it); 3463 if (ceq_mask || interleaved_mnodes || randpfn == (pfn_t)-1) { 3464 if (randpfn != (pfn_t)-1) { 3465 PAGE_NEXT_PFN_FOR_COLOR(randpfn, szc, bin, 3466 ceq_mask, color_mask, &it); 3467 } 3468 if (randpfn >= hi) { 3469 randpfn = lo; 3470 MEM_NODE_ITERATOR_INIT(randpfn, mnode, szc, 3471 &it); 3472 } 3473 } 3474 randpp = mseg->pages + (randpfn - mseg->pages_base); 3475 3476 ASSERT(randpp->p_pagenum == randpfn); 3477 3478 pp = randpp; 3479 endpp = mseg->pages + (hi - mseg->pages_base) + 1; 3480 3481 ASSERT(randpp + szcpgcnt <= endpp); 3482 3483 do { 3484 ASSERT(!(pp->p_pagenum & szcpgmask)); 3485 ASSERT(((PP_2_BIN(pp) ^ bin) & ceq_mask) == 0); 3486 3487 if (page_trylock_contig_pages(mnode, pp, szc, flags)) { 3488 /* pages unlocked by page_claim on failure */ 3489 if (page_claim_contig_pages(pp, szc, flags)) { 3490 memsegs_unlock(0); 3491 return (pp); 3492 } 3493 } 3494 3495 if (ceq_mask == 0 && !interleaved_mnodes) { 3496 pp += skip; 3497 } else { 3498 pfn_t pfn = pp->p_pagenum; 3499 3500 PAGE_NEXT_PFN_FOR_COLOR(pfn, szc, bin, 3501 ceq_mask, color_mask, &it); 3502 if (pfn == (pfn_t)-1) { 3503 pp = endpp; 3504 } else { 3505 pp = mseg->pages + 3506 (pfn - mseg->pages_base); 3507 } 3508 } 3509 if (pp >= endpp) { 3510 /* start from the beginning */ 3511 MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it); 3512 pp = mseg->pages + (lo - mseg->pages_base); 3513 ASSERT(pp->p_pagenum == lo); 3514 ASSERT(pp + szcpgcnt <= endpp); 3515 } 3516 } while (pp != randpp); 3517 } 3518 memsegs_unlock(0); 3519 return (NULL); 3520 } 3521 3522 3523 /* 3524 * controlling routine that searches through physical memory in an attempt to 3525 * claim a large page based on the input parameters. 3526 * on the page free lists. 3527 * 3528 * calls page_geti_contig_pages with an initial pfn range from the mnode 3529 * and mtype. page_geti_contig_pages will trim off the parts of the pfn range 3530 * that overlaps with the kernel cage or does not match the requested page 3531 * color if PG_MATCH_COLOR is set. Since this search is very expensive, 3532 * page_geti_contig_pages may further limit the search range based on 3533 * previous failure counts (pgcpfailcnt[]). 3534 * 3535 * for PGI_PGCPSZC0 requests, page_get_contig_pages will relocate a base 3536 * pagesize page that satisfies mtype. 
3537 */ 3538 page_t * 3539 page_get_contig_pages(int mnode, uint_t bin, int mtype, uchar_t szc, 3540 uint_t flags) 3541 { 3542 pfn_t pfnlo, pfnhi; /* contig pages pfn range */ 3543 page_t *pp; 3544 pgcnt_t pfnflag = 0; /* no limit on search if 0 */ 3545 3546 VM_STAT_ADD(vmm_vmstats.pgcp_alloc[szc]); 3547 3548 /* no allocations from cage */ 3549 flags |= PGI_NOCAGE; 3550 3551 /* LINTED */ 3552 MTYPE_START(mnode, mtype, flags); 3553 if (mtype < 0) { /* mnode does not have memory in mtype range */ 3554 VM_STAT_ADD(vmm_vmstats.pgcp_allocempty[szc]); 3555 return (NULL); 3556 } 3557 3558 ASSERT(szc > 0 || (flags & PGI_PGCPSZC0)); 3559 3560 /* do not limit search and ignore color if hi pri */ 3561 3562 if (pgcplimitsearch && ((flags & PGI_PGCPHIPRI) == 0)) 3563 pfnflag = pgcpfailcnt[szc]; 3564 3565 /* remove color match to improve chances */ 3566 3567 if (flags & PGI_PGCPHIPRI || pfnflag) 3568 flags &= ~PG_MATCH_COLOR; 3569 3570 do { 3571 /* get pfn range based on mnode and mtype */ 3572 MNODETYPE_2_PFN(mnode, mtype, pfnlo, pfnhi); 3573 3574 ASSERT(pfnhi >= pfnlo); 3575 3576 pp = page_geti_contig_pages(mnode, bin, szc, flags, 3577 pfnlo, pfnhi, pfnflag); 3578 3579 if (pp != NULL) { 3580 pfnflag = pgcpfailcnt[szc]; 3581 if (pfnflag) { 3582 /* double the search size */ 3583 pgcpfailcnt[szc] = pfnflag >> 1; 3584 } 3585 VM_STAT_ADD(vmm_vmstats.pgcp_allocok[szc]); 3586 return (pp); 3587 } 3588 MTYPE_NEXT(mnode, mtype, flags); 3589 } while (mtype >= 0); 3590 3591 VM_STAT_ADD(vmm_vmstats.pgcp_allocfailed[szc]); 3592 return (NULL); 3593 } 3594 3595 #if defined(__i386) || defined(__amd64) 3596 /* 3597 * Determine the likelihood of finding/coalescing a szc page. 3598 * Return 0 if the likelihood is small otherwise return 1. 3599 * 3600 * For now, be conservative and check only 1g pages and return 0 3601 * if there had been previous coalescing failures and the szc pages 3602 * needed to satisfy request would exhaust most of freemem. 3603 */ 3604 int 3605 page_chk_freelist(uint_t szc) 3606 { 3607 pgcnt_t pgcnt; 3608 3609 if (szc <= 1) 3610 return (1); 3611 3612 pgcnt = page_get_pagecnt(szc); 3613 if (pgcpfailcnt[szc] && pgcnt + throttlefree >= freemem) { 3614 VM_STAT_ADD(vmm_vmstats.pcf_deny[szc]); 3615 return (0); 3616 } 3617 VM_STAT_ADD(vmm_vmstats.pcf_allow[szc]); 3618 return (1); 3619 } 3620 #endif 3621 3622 /* 3623 * Find the `best' page on the freelist for this (vp,off) (as,vaddr) pair. 3624 * 3625 * Does its own locking and accounting. 3626 * If PG_MATCH_COLOR is set, then NULL will be returned if there are no 3627 * pages of the proper color even if there are pages of a different color. 3628 * 3629 * Finds a page, removes it, THEN locks it. 3630 */ 3631 3632 /*ARGSUSED*/ 3633 page_t * 3634 page_get_freelist(struct vnode *vp, u_offset_t off, struct seg *seg, 3635 caddr_t vaddr, size_t size, uint_t flags, struct lgrp *lgrp) 3636 { 3637 struct as *as = seg->s_as; 3638 page_t *pp = NULL; 3639 ulong_t bin; 3640 uchar_t szc; 3641 int mnode; 3642 int mtype; 3643 page_t *(*page_get_func)(int, uint_t, int, uchar_t, uint_t); 3644 lgrp_mnode_cookie_t lgrp_cookie; 3645 3646 page_get_func = page_get_mnode_freelist; 3647 3648 /* 3649 * If we aren't passed a specific lgroup, or passed a freed lgrp 3650 * assume we wish to allocate near to the current thread's home. 
3651 */ 3652 if (!LGRP_EXISTS(lgrp)) 3653 lgrp = lgrp_home_lgrp(); 3654 3655 if (kcage_on) { 3656 if ((flags & (PG_NORELOC | PG_PANIC)) == PG_NORELOC && 3657 kcage_freemem < kcage_throttlefree + btop(size) && 3658 curthread != kcage_cageout_thread) { 3659 /* 3660 * Set a "reserve" of kcage_throttlefree pages for 3661 * PG_PANIC and cageout thread allocations. 3662 * 3663 * Everybody else has to serialize in 3664 * page_create_get_something() to get a cage page, so 3665 * that we don't deadlock cageout! 3666 */ 3667 return (NULL); 3668 } 3669 } else { 3670 flags &= ~PG_NORELOC; 3671 flags |= PGI_NOCAGE; 3672 } 3673 3674 /* LINTED */ 3675 MTYPE_INIT(mtype, vp, vaddr, flags, size); 3676 3677 /* 3678 * Convert size to page size code. 3679 */ 3680 if ((szc = page_szc(size)) == (uchar_t)-1) 3681 panic("page_get_freelist: illegal page size request"); 3682 ASSERT(szc < mmu_page_sizes); 3683 3684 VM_STAT_ADD(vmm_vmstats.pgf_alloc[szc]); 3685 3686 /* LINTED */ 3687 AS_2_BIN(as, seg, vp, vaddr, bin, szc); 3688 3689 ASSERT(bin < PAGE_GET_PAGECOLORS(szc)); 3690 3691 /* 3692 * Try to get a local page first, but try remote if we can't 3693 * get a page of the right color. 3694 */ 3695 pgretry: 3696 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL); 3697 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { 3698 pp = page_get_func(mnode, bin, mtype, szc, flags); 3699 if (pp != NULL) { 3700 VM_STAT_ADD(vmm_vmstats.pgf_allocok[szc]); 3701 DTRACE_PROBE4(page__get, 3702 lgrp_t *, lgrp, 3703 int, mnode, 3704 ulong_t, bin, 3705 uint_t, flags); 3706 return (pp); 3707 } 3708 } 3709 ASSERT(pp == NULL); 3710 3711 /* 3712 * for non-SZC0 PAGESIZE requests, check cachelist before checking 3713 * remote free lists. Caller expected to call page_get_cachelist which 3714 * will check local cache lists and remote free lists. 3715 */ 3716 if (szc == 0 && ((flags & PGI_PGCPSZC0) == 0)) { 3717 VM_STAT_ADD(vmm_vmstats.pgf_allocdeferred); 3718 return (NULL); 3719 } 3720 3721 ASSERT(szc > 0 || (flags & PGI_PGCPSZC0)); 3722 3723 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1); 3724 3725 if (!(flags & PG_LOCAL)) { 3726 /* 3727 * Try to get a non-local freelist page. 3728 */ 3729 LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie); 3730 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { 3731 pp = page_get_func(mnode, bin, mtype, szc, flags); 3732 if (pp != NULL) { 3733 DTRACE_PROBE4(page__get, 3734 lgrp_t *, lgrp, 3735 int, mnode, 3736 ulong_t, bin, 3737 uint_t, flags); 3738 VM_STAT_ADD(vmm_vmstats.pgf_allocokrem[szc]); 3739 return (pp); 3740 } 3741 } 3742 ASSERT(pp == NULL); 3743 } 3744 3745 /* 3746 * when the cage is off chances are page_get_contig_pages() will fail 3747 * to lock a large page chunk therefore when the cage is off it's not 3748 * called by default. this can be changed via /etc/system. 3749 * 3750 * page_get_contig_pages() also called to acquire a base pagesize page 3751 * for page_create_get_something(). 3752 */ 3753 if (!(flags & PG_NORELOC) && (pg_contig_disable == 0) && 3754 (kcage_on || pg_lpgcreate_nocage || szc == 0) && 3755 (page_get_func != page_get_contig_pages)) { 3756 3757 VM_STAT_ADD(vmm_vmstats.pgf_allocretry[szc]); 3758 page_get_func = page_get_contig_pages; 3759 goto pgretry; 3760 } 3761 3762 if (!(flags & PG_LOCAL) && pgcplimitsearch && 3763 page_get_func == page_get_contig_pages) 3764 SETPGCPFAILCNT(szc); 3765 3766 VM_STAT_ADD(vmm_vmstats.pgf_allocfailed[szc]); 3767 return (NULL); 3768 } 3769 3770 /* 3771 * Find the `best' page on the cachelist for this (vp,off) (as,vaddr) pair. 
 *
 * Does its own locking.
 * If PG_MATCH_COLOR is set, then NULL will be returned if there are no
 * pages of the proper color even if there are pages of a different color.
 * Otherwise, scan the bins for ones with pages.  For each bin with pages,
 * try to lock one of them.  If no page can be locked, try the
 * next bin.  Return NULL if a page cannot be found and locked.
 *
 * Finds a page, tries to lock it, then removes it.
 */

/*ARGSUSED*/
page_t *
page_get_cachelist(struct vnode *vp, u_offset_t off, struct seg *seg,
    caddr_t vaddr, uint_t flags, struct lgrp *lgrp)
{
	page_t		*pp;
	struct as	*as = seg->s_as;
	ulong_t		bin;
	/*LINTED*/
	int		mnode;
	int		mtype;
	lgrp_mnode_cookie_t	lgrp_cookie;

	/*
	 * If we aren't passed a specific lgroup, or passed a freed lgrp,
	 * assume we wish to allocate near to the current thread's home.
	 */
	if (!LGRP_EXISTS(lgrp))
		lgrp = lgrp_home_lgrp();

	if (!kcage_on) {
		flags &= ~PG_NORELOC;
		flags |= PGI_NOCAGE;
	}

	if ((flags & (PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == PG_NORELOC &&
	    kcage_freemem <= kcage_throttlefree) {
		/*
		 * Reserve kcage_throttlefree pages for critical kernel
		 * threads.
		 *
		 * Everybody else has to go to page_create_get_something()
		 * to get a cage page, so we don't deadlock cageout.
		 */
		return (NULL);
	}

	/* LINTED */
	AS_2_BIN(as, seg, vp, vaddr, bin, 0);

	ASSERT(bin < PAGE_GET_PAGECOLORS(0));

	/* LINTED */
	MTYPE_INIT(mtype, vp, vaddr, flags, MMU_PAGESIZE);

	VM_STAT_ADD(vmm_vmstats.pgc_alloc);

	/*
	 * Try local cachelists first.
	 */
	LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL);
	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
		pp = page_get_mnode_cachelist(bin, flags, mnode, mtype);
		if (pp != NULL) {
			VM_STAT_ADD(vmm_vmstats.pgc_allocok);
			DTRACE_PROBE4(page__get,
			    lgrp_t *, lgrp,
			    int, mnode,
			    ulong_t, bin,
			    uint_t, flags);
			return (pp);
		}
	}

	lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);

	/*
	 * Try freelists/cachelists that are farther away.
	 * This is our only chance to allocate remote pages for PAGESIZE
	 * requests.
3853 */ 3854 LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie); 3855 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { 3856 pp = page_get_mnode_freelist(mnode, bin, mtype, 3857 0, flags); 3858 if (pp != NULL) { 3859 VM_STAT_ADD(vmm_vmstats.pgc_allocokdeferred); 3860 DTRACE_PROBE4(page__get, 3861 lgrp_t *, lgrp, 3862 int, mnode, 3863 ulong_t, bin, 3864 uint_t, flags); 3865 return (pp); 3866 } 3867 pp = page_get_mnode_cachelist(bin, flags, mnode, mtype); 3868 if (pp != NULL) { 3869 VM_STAT_ADD(vmm_vmstats.pgc_allocokrem); 3870 DTRACE_PROBE4(page__get, 3871 lgrp_t *, lgrp, 3872 int, mnode, 3873 ulong_t, bin, 3874 uint_t, flags); 3875 return (pp); 3876 } 3877 } 3878 3879 VM_STAT_ADD(vmm_vmstats.pgc_allocfailed); 3880 return (NULL); 3881 } 3882 3883 page_t * 3884 page_get_mnode_cachelist(uint_t bin, uint_t flags, int mnode, int mtype) 3885 { 3886 kmutex_t *pcm; 3887 page_t *pp, *first_pp; 3888 uint_t sbin; 3889 int plw_initialized; 3890 page_list_walker_t plw; 3891 3892 VM_STAT_ADD(vmm_vmstats.pgmc_alloc); 3893 3894 /* LINTED */ 3895 MTYPE_START(mnode, mtype, flags); 3896 if (mtype < 0) { /* mnode does not have memory in mtype range */ 3897 VM_STAT_ADD(vmm_vmstats.pgmc_allocempty); 3898 return (NULL); 3899 } 3900 3901 try_again: 3902 3903 plw_initialized = 0; 3904 plw.plw_ceq_dif = 1; 3905 3906 /* 3907 * Only hold one cachelist lock at a time, that way we 3908 * can start anywhere and not have to worry about lock 3909 * ordering. 3910 */ 3911 3912 for (plw.plw_count = 0; 3913 plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) { 3914 sbin = bin; 3915 do { 3916 3917 if (!PAGE_CACHELISTS(mnode, bin, mtype)) 3918 goto bin_empty_1; 3919 pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST); 3920 mutex_enter(pcm); 3921 pp = PAGE_CACHELISTS(mnode, bin, mtype); 3922 if (pp == NULL) 3923 goto bin_empty_0; 3924 3925 first_pp = pp; 3926 ASSERT(pp->p_vnode); 3927 ASSERT(PP_ISAGED(pp) == 0); 3928 ASSERT(pp->p_szc == 0); 3929 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode); 3930 while (!page_trylock(pp, SE_EXCL)) { 3931 pp = pp->p_next; 3932 ASSERT(pp->p_szc == 0); 3933 if (pp == first_pp) { 3934 /* 3935 * We have searched the complete list! 3936 * And all of them (might only be one) 3937 * are locked. This can happen since 3938 * these pages can also be found via 3939 * the hash list. When found via the 3940 * hash list, they are locked first, 3941 * then removed. We give up to let the 3942 * other thread run. 3943 */ 3944 pp = NULL; 3945 break; 3946 } 3947 ASSERT(pp->p_vnode); 3948 ASSERT(PP_ISFREE(pp)); 3949 ASSERT(PP_ISAGED(pp) == 0); 3950 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == 3951 mnode); 3952 } 3953 3954 if (pp) { 3955 page_t **ppp; 3956 /* 3957 * Found and locked a page. 3958 * Pull it off the list. 3959 */ 3960 ASSERT(mtype == PP_2_MTYPE(pp)); 3961 ppp = &PAGE_CACHELISTS(mnode, bin, mtype); 3962 page_sub(ppp, pp); 3963 /* 3964 * Subtract counters before releasing pcm mutex 3965 * to avoid a race with page_freelist_coalesce 3966 * and page_freelist_split. 3967 */ 3968 page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST); 3969 mutex_exit(pcm); 3970 ASSERT(pp->p_vnode); 3971 ASSERT(PP_ISAGED(pp) == 0); 3972 #if defined(__sparc) 3973 ASSERT(!kcage_on || 3974 (flags & PG_NORELOC) == 0 || 3975 PP_ISNORELOC(pp)); 3976 if (PP_ISNORELOC(pp)) { 3977 kcage_freemem_sub(1); 3978 } 3979 #endif 3980 VM_STAT_ADD(vmm_vmstats. 

#ifdef DEBUG
#define	REPL_PAGE_STATS
#endif /* DEBUG */

#ifdef REPL_PAGE_STATS
struct repl_page_stats {
	uint_t	ngets;
	uint_t	ngets_noreloc;
	uint_t	npgr_noreloc;
	uint_t	nnopage_first;
	uint_t	nnopage;
	uint_t	nhashout;
	uint_t	nnofree;
	uint_t	nnext_pp;
} repl_page_stats;
#define	REPL_STAT_INCR(v)	atomic_add_32(&repl_page_stats.v, 1)
#else /* REPL_PAGE_STATS */
#define	REPL_STAT_INCR(v)
#endif /* REPL_PAGE_STATS */

int	pgrppgcp;
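
/*
 * For illustration, a statement such as
 *	REPL_STAT_INCR(ngets);
 * expands to
 *	atomic_add_32(&repl_page_stats.ngets, 1);
 * in a DEBUG build (where REPL_PAGE_STATS is defined above) and to an
 * empty statement otherwise, so the counters add no cost to non-DEBUG
 * kernels.
 */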

/*
 * The freemem accounting must be done by the caller.
 * First we try to get a replacement page of the same size as like_pp;
 * if that is not possible, then we just get a set of discontiguous
 * PAGESIZE pages.
 */
page_t *
page_get_replacement_page(page_t *orig_like_pp, struct lgrp *lgrp_target,
    uint_t pgrflags)
{
	page_t		*like_pp;
	page_t		*pp, *pplist;
	page_t		*pl = NULL;
	ulong_t		bin;
	int		mnode, page_mnode;
	int		szc;
	spgcnt_t	npgs, pg_cnt;
	pfn_t		pfnum;
	int		mtype;
	int		flags = 0;
	lgrp_mnode_cookie_t	lgrp_cookie;
	lgrp_t		*lgrp;

	REPL_STAT_INCR(ngets);
	like_pp = orig_like_pp;
	ASSERT(PAGE_EXCL(like_pp));

	szc = like_pp->p_szc;
	npgs = page_get_pagecnt(szc);
	/*
	 * Now we reset like_pp to the base page_t.
	 * That way, we won't walk past the end of this 'szc' page.
	 */
	pfnum = PFN_BASE(like_pp->p_pagenum, szc);
	like_pp = page_numtopp_nolock(pfnum);
	ASSERT(like_pp->p_szc == szc);

	if (PP_ISNORELOC(like_pp)) {
		ASSERT(kcage_on);
		REPL_STAT_INCR(ngets_noreloc);
		flags = PGI_RELOCONLY;
	} else if (pgrflags & PGR_NORELOC) {
		ASSERT(kcage_on);
		REPL_STAT_INCR(npgr_noreloc);
		flags = PG_NORELOC;
	}

	/*
	 * Kernel pages must always be replaced with the same size
	 * pages, since we cannot properly handle demotion of kernel
	 * pages.
	 */
	if (PP_ISKAS(like_pp))
		pgrflags |= PGR_SAMESZC;

	/* LINTED */
	MTYPE_PGR_INIT(mtype, flags, like_pp, page_mnode, npgs);

	while (npgs) {
		pplist = NULL;
		for (;;) {
			pg_cnt = page_get_pagecnt(szc);
			bin = PP_2_BIN(like_pp);
			ASSERT(like_pp->p_szc == orig_like_pp->p_szc);
			ASSERT(pg_cnt <= npgs);

			/*
			 * If an lgroup was specified, try to get the
			 * page from that lgroup.
			 * NOTE: Must be careful with code below because
			 * lgroup may disappear and reappear since there
			 * is no locking for lgroup here.
			 */
			if (LGRP_EXISTS(lgrp_target)) {
				/*
				 * Keep local variable for lgroup separate
				 * from lgroup argument since this code should
				 * only be exercised when lgroup argument
				 * exists....
				 */
				lgrp = lgrp_target;

				/* Try the lgroup's freelists first */
				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
				    LGRP_SRCH_LOCAL);
				while ((pplist == NULL) &&
				    (mnode = lgrp_memnode_choose(&lgrp_cookie))
				    != -1) {
					pplist =
					    page_get_mnode_freelist(mnode, bin,
					    mtype, szc, flags);
				}

				/*
				 * Now try its cachelists if this is a
				 * small page. Don't need to do it for
				 * larger ones since page_freelist_coalesce()
				 * already failed.
				 */
				if (pplist != NULL || szc != 0)
					break;

				/* Now try its cachelists */
				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
				    LGRP_SRCH_LOCAL);

				while ((pplist == NULL) &&
				    (mnode = lgrp_memnode_choose(&lgrp_cookie))
				    != -1) {
					pplist =
					    page_get_mnode_cachelist(bin, flags,
					    mnode, mtype);
				}
				if (pplist != NULL) {
					page_hashout(pplist, NULL);
					PP_SETAGED(pplist);
					REPL_STAT_INCR(nhashout);
					break;
				}
				/* Done looking in this lgroup. Bail out. */
				break;
			}

			/*
			 * No lgroup was specified (or the lgroup was removed
			 * by DR), so just try to get the page as close to
			 * like_pp's mnode as possible.
			 * First try the local freelist...
			 */
			mnode = PP_2_MEM_NODE(like_pp);
			pplist = page_get_mnode_freelist(mnode, bin,
			    mtype, szc, flags);
			if (pplist != NULL)
				break;

			REPL_STAT_INCR(nnofree);

			/*
			 * ...then the local cachelist. Don't need to do it for
			 * larger pages because page_freelist_coalesce()
			 * already failed there anyway.
			 */
			if (szc == 0) {
				pplist = page_get_mnode_cachelist(bin, flags,
				    mnode, mtype);
				if (pplist != NULL) {
					page_hashout(pplist, NULL);
					PP_SETAGED(pplist);
					REPL_STAT_INCR(nhashout);
					break;
				}
			}

			/* Now try remote freelists */
			page_mnode = mnode;
			lgrp =
			    lgrp_hand_to_lgrp(MEM_NODE_2_LGRPHAND(page_mnode));
			LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
			    LGRP_SRCH_HIER);
			while (pplist == NULL &&
			    (mnode = lgrp_memnode_choose(&lgrp_cookie))
			    != -1) {
				/*
				 * Skip local mnode.
				 */
				if ((mnode == page_mnode) ||
				    (mem_node_config[mnode].exists == 0))
					continue;

				pplist = page_get_mnode_freelist(mnode,
				    bin, mtype, szc, flags);
			}

			if (pplist != NULL)
				break;

			/* Now try remote cachelists */
			LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
			    LGRP_SRCH_HIER);
			while (pplist == NULL && szc == 0) {
				mnode = lgrp_memnode_choose(&lgrp_cookie);
				if (mnode == -1)
					break;
				/*
				 * Skip local mnode.
				 */
				if ((mnode == page_mnode) ||
				    (mem_node_config[mnode].exists == 0))
					continue;

				pplist = page_get_mnode_cachelist(bin,
				    flags, mnode, mtype);

				if (pplist != NULL) {
					page_hashout(pplist, NULL);
					PP_SETAGED(pplist);
					REPL_STAT_INCR(nhashout);
					break;
				}
			}

			/*
			 * Break out of while loop under the following cases:
			 * - If we successfully got a page.
			 * - If pgrflags specified only returning a specific
			 *   page size and we could not find that page size.
			 * - If we could not satisfy the request with PAGESIZE
			 *   or larger pages.
			 */
			if (pplist != NULL || szc == 0)
				break;

			if ((pgrflags & PGR_SAMESZC) || pgrppgcp) {
				/* try to find contig page */

				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
				    LGRP_SRCH_HIER);

				while ((pplist == NULL) &&
				    (mnode =
				    lgrp_memnode_choose(&lgrp_cookie))
				    != -1) {
					pplist = page_get_contig_pages(
					    mnode, bin, mtype, szc,
					    flags | PGI_PGCPHIPRI);
				}
				break;
			}

			/*
			 * The correct thing to do here is try the next
			 * page size down using szc--. Due to a bug
			 * with the processing of HAT_RELOAD_SHARE
			 * where the sfmmu_ttecnt arrays of all
			 * hats sharing an ISM segment don't get updated,
			 * using intermediate size pages for relocation
			 * can lead to continuous page faults.
			 */
			szc = 0;
		}

		if (pplist != NULL) {
			DTRACE_PROBE4(page__get,
			    lgrp_t *, lgrp,
			    int, mnode,
			    ulong_t, bin,
			    uint_t, flags);

			while (pplist != NULL && pg_cnt--) {
				ASSERT(pplist != NULL);
				pp = pplist;
				page_sub(&pplist, pp);
				PP_CLRFREE(pp);
				PP_CLRAGED(pp);
				page_list_concat(&pl, &pp);
				npgs--;
				like_pp = like_pp + 1;
				REPL_STAT_INCR(nnext_pp);
			}
			ASSERT(pg_cnt == 0);
		} else {
			break;
		}
	}

	if (npgs) {
		/*
		 * We were unable to allocate the necessary number
		 * of pages.
		 * We need to free up any pl.
		 */
		REPL_STAT_INCR(nnopage);
		page_free_replacement_page(pl);
		return (NULL);
	} else {
		return (pl);
	}
}
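
/*
 * A rough sketch of the expected calling pattern, with the relocation step
 * left abstract since it lives elsewhere (page_relocate() and friends); the
 * page name `pp' and the error handling are purely illustrative:
 *
 *	if (page_trylock(pp, SE_EXCL)) {
 *		page_t *repl = page_get_replacement_page(pp, NULL, 0);
 *
 *		if (repl != NULL) {
 *			... relocate pp onto repl; if that fails ...
 *			page_free_replacement_page(repl);
 *		}
 *		page_unlock(pp);
 *	}
 *
 * As noted in the comment above the function, the caller, not this code,
 * is responsible for the freemem accounting.
 */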

/*
 * demote a free large page to its constituent pages
 */
void
page_demote_free_pages(page_t *pp)
{

	int mnode;

	ASSERT(pp != NULL);
	ASSERT(PAGE_LOCKED(pp));
	ASSERT(PP_ISFREE(pp));
	ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);

	mnode = PP_2_MEM_NODE(pp);
	page_freelist_lock(mnode);
	if (pp->p_szc != 0) {
		(void) page_demote(mnode, PFN_BASE(pp->p_pagenum,
		    pp->p_szc), pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
	}
	page_freelist_unlock(mnode);
	ASSERT(pp->p_szc == 0);
}

/*
 * Factor in colorequiv to check additional 'equivalent' bins.
 * colorequiv may be set in /etc/system.
 */
void
page_set_colorequiv_arr(void)
{
	if (colorequiv > 1) {
		int i;
		uint_t sv_a = lowbit(colorequiv) - 1;

		if (sv_a > 15)
			sv_a = 15;

		for (i = 0; i < MMU_PAGE_SIZES; i++) {
			uint_t colors;
			uint_t a = sv_a;

			if ((colors = hw_page_array[i].hp_colors) <= 1) {
				continue;
			}
			while ((colors >> a) == 0)
				a--;
			if ((a << 4) > colorequivszc[i]) {
				colorequivszc[i] = (a << 4);
			}
		}
	}
}
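
/*
 * Worked example for page_set_colorequiv_arr(), using hypothetical values:
 * with colorequiv set to 4 in /etc/system, lowbit(4) is 3, so sv_a is 2.
 * For a page size with, say, hw_page_array[i].hp_colors == 32, (32 >> 2)
 * is nonzero, so a stays 2 and colorequivszc[i] becomes (2 << 4) == 0x20,
 * meaning two high-order bits of the color are ignored when matching
 * 'equivalent' bins for that page size. A non-power-of-two colorequiv
 * only contributes its lowest set bit, since only lowbit() is consulted.
 */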