/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * Copyright 2012 Joyent, Inc. All rights reserved.
 */

/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/*	  All Rights Reserved	*/

/*
 * Portions of this source code were derived from Berkeley 4.3 BSD
 * under license from the Regents of the University of California.
 */


/*
 * This file contains common functions to access and manage the page lists.
 * Many of these routines originated from platform dependent modules
 * (sun4/vm/vm_dep.c, i86pc/vm/vm_machdep.c) and were modified to function in
 * a platform independent manner.
 *
 * vm/vm_dep.h provides for platform specific support.
 */

#include <sys/types.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/systm.h>
#include <sys/atomic.h>
#include <sys/sysmacros.h>
#include <vm/as.h>
#include <vm/page.h>
#include <vm/seg_kmem.h>
#include <vm/seg_vn.h>
#include <sys/vmsystm.h>
#include <sys/memnode.h>
#include <vm/vm_dep.h>
#include <sys/lgrp.h>
#include <sys/mem_config.h>
#include <sys/callb.h>
#include <sys/mem_cage.h>
#include <sys/sdt.h>
#include <sys/dumphdr.h>
#include <sys/swap.h>

extern uint_t	vac_colors;

#define	MAX_PRAGMA_ALIGN	128

/* vm_cpu_data0 for the boot cpu before kmem is initialized */

#if L2CACHE_ALIGN_MAX <= MAX_PRAGMA_ALIGN
#pragma align L2CACHE_ALIGN_MAX(vm_cpu_data0)
#else
#pragma align MAX_PRAGMA_ALIGN(vm_cpu_data0)
#endif
char		vm_cpu_data0[VM_CPU_DATA_PADSIZE];

/*
 * number of page colors equivalent to requested color in page_get routines.
 * If set, keeps large pages intact longer and keeps MPO allocation
 * from the local mnode in favor of acquiring the 'correct' page color from
 * a demoted large page or from a remote mnode.
 */
uint_t	colorequiv;

/*
 * color equivalency mask for each page size.
 * Mask is computed based on cpu L2$ way sizes and colorequiv global.
 * High 4 bits determine the number of high order bits of the color to ignore.
 * Low 4 bits determine the number of low order bits of the color to ignore
 * (this is only relevant for hashed index based page coloring).
 */
uchar_t colorequivszc[MMU_PAGE_SIZES];
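
/*
 * Worked example (illustrative sketch, not part of the original source):
 * suppose a page size has 16 colors (a 4-bit color) and colorequivszc[szc]
 * holds the hypothetical value 0x11. The high nibble (1) drops the top
 * color bit and the low nibble (1) drops the bottom color bit, so only
 * the middle two bits have to match: colors 0b0110 and 0b1111 share
 * middle bits 11 and are treated as equivalent for a page_get request,
 * while 0b0100 (middle bits 10) is not.
 */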
/*
 * if set, specifies the percentage of large pages that are free from within
 * a large page region before attempting to lock those pages for
 * page_get_contig_pages processing.
 *
 * Should be turned on when kpr is available, since
 * page_trylock_contig_pages can then be more selective.
 */

int	ptcpthreshold;

/*
 * Limit page get contig page search based on failure cnts in pgcpfailcnt[].
 * Enabled by default via pgcplimitsearch.
 *
 * pgcpfailcnt[] is bounded by PGCPFAILMAX (>= 1/2 of installed
 * memory). When reached, pgcpfailcnt[] is reset to 1/2 of this upper
 * bound. This upper bound range guarantees:
 *	- all large page 'slots' will be searched over time
 *	- the minimum (1) large page candidates considered on each pgcp call
 *	- count doesn't wrap around to 0
 */
pgcnt_t	pgcpfailcnt[MMU_PAGE_SIZES];
int	pgcplimitsearch = 1;

#define	PGCPFAILMAX		(1 << (highbit(physinstalled) - 1))
#define	SETPGCPFAILCNT(szc)						\
	if (++pgcpfailcnt[szc] >= PGCPFAILMAX)				\
		pgcpfailcnt[szc] = PGCPFAILMAX / 2;

#ifdef VM_STATS
struct vmm_vmstats_str  vmm_vmstats;

#endif /* VM_STATS */

#if defined(__sparc)
#define	LPGCREATE	0
#else
/* enable page_get_contig_pages */
#define	LPGCREATE	1
#endif

int pg_contig_disable;
int pg_lpgcreate_nocage = LPGCREATE;

/*
 * page_freelist_split pfn flag to signify no lo or hi pfn requirement.
 */
#define	PFNNULL		0

/* Flags involved in promotion and demotion routines */
#define	PC_FREE		0x1	/* put page on freelist */
#define	PC_ALLOC	0x2	/* return page for allocation */

/*
 * Flag for page_demote to be used with PC_FREE to denote that we don't care
 * what the color is as the color parameter to the function is ignored.
 */
#define	PC_NO_COLOR	(-1)

/* mtype value for page_promote to use when mtype does not matter */
#define	PC_MTYPE_ANY	(-1)

/*
 * page counters candidates info
 * See page_ctrs_cands comment below for more details.
 * fields are as follows:
 *	pcc_pages_free:		# pages which freelist coalesce can create
 *	pcc_color_free:		pointer to page free counts per color
 */
typedef struct pcc_info {
	pgcnt_t	pcc_pages_free;
	pgcnt_t	*pcc_color_free;
	uint_t	pad[12];
} pcc_info_t;

/*
 * On big machines it can take a long time to check page_counters
 * arrays. page_ctrs_cands is a summary array whose elements are a dynamically
 * updated sum of all elements of the corresponding page_counters arrays.
 * page_freelist_coalesce() searches page_counters only if an appropriate
 * element of page_ctrs_cands array is greater than 0.
 *
 * page_ctrs_cands is indexed by mutex (i), region (r), mnode (m), mrange (g)
 */
pcc_info_t **page_ctrs_cands[NPC_MUTEX][MMU_PAGE_SIZES];

/*
 * Return in val the total number of free pages which can be created
 * for the given mnode (m), mrange (g), and region size (r)
 */
#define	PGCTRS_CANDS_GETVALUE(m, g, r, val) {				\
	int i;								\
	val = 0;							\
	for (i = 0; i < NPC_MUTEX; i++) {				\
	    val += page_ctrs_cands[i][(r)][(m)][(g)].pcc_pages_free;	\
	}								\
}

/*
 * Return in val the total number of free pages which can be created
 * for the given mnode (m), mrange (g), region size (r), and color (c)
 */
#define	PGCTRS_CANDS_GETVALUECOLOR(m, g, r, c, val) {			\
	int i;								\
	val = 0;							\
	ASSERT((c) < PAGE_GET_PAGECOLORS(r));				\
	for (i = 0; i < NPC_MUTEX; i++) {				\
	    val +=							\
		page_ctrs_cands[i][(r)][(m)][(g)].pcc_color_free[(c)];	\
	}								\
}
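
/*
 * Usage sketch (illustrative, not part of the original source): a cheap
 * candidate check along the lines of what page_freelist_coalesce() does,
 * assuming locals mnode, mrange and szc are in scope:
 *
 *	pgcnt_t pgfree;
 *
 *	PGCTRS_CANDS_GETVALUE(mnode, mrange, szc, pgfree);
 *	if (pgfree == 0)
 *		return (NULL);	(nothing can be created at this size;
 *				skip the expensive page_counters scan)
 */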
/*
 * We can only allow a single thread to update a counter within the physical
 * range of the largest supported page size. That is the finest granularity
 * possible, since the counter values are dependent on each other
 * as you move across region sizes. PP_CTR_LOCK_INDX is used to determine the
 * ctr_mutex lock index for a particular physical range.
 */
static kmutex_t	*ctr_mutex[NPC_MUTEX];

#define	PP_CTR_LOCK_INDX(pp)						\
	(((pp)->p_pagenum >>						\
	    (PAGE_BSZS_SHIFT(mmu_page_sizes - 1))) & (NPC_MUTEX - 1))

#define	INVALID_COLOR	0xffffffff
#define	INVALID_MASK	0xffffffff

/*
 * Local functions prototypes.
 */

void page_ctr_add(int, int, page_t *, int);
void page_ctr_add_internal(int, int, page_t *, int);
void page_ctr_sub(int, int, page_t *, int);
void page_ctr_sub_internal(int, int, page_t *, int);
void page_freelist_lock(int);
void page_freelist_unlock(int);
page_t *page_promote(int, pfn_t, uchar_t, int, int);
page_t *page_demote(int, pfn_t, pfn_t, uchar_t, uchar_t, int, int);
page_t *page_freelist_split(uchar_t,
    uint_t, int, int, pfn_t, pfn_t, page_list_walker_t *);
page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int);
static int page_trylock_cons(page_t *pp, se_t se);

/*
 * The page_counters array below is used to keep track of free contiguous
 * physical memory. A hw_page_map_t will be allocated per mnode per szc.
 * This contains an array of counters, the size of the array, a shift value
 * used to convert a pagenum into a counter array index or vice versa, as
 * well as a cache of the last successful index to be promoted to a larger
 * page size. As an optimization, we keep track of the last successful index
 * to be promoted per page color for the given size region, and this is
 * allocated dynamically based upon the number of colors for a given
 * region size.
 *
 * Conceptually, the page counters are represented as:
 *
 *	page_counters[region_size][mnode]
 *
 *	region_size:	size code of a candidate larger page made up
 *			of contiguous free smaller pages.
 *
 *	page_counters[region_size][mnode].hpm_counters[index]:
 *		represents how many (region_size - 1) pages either
 *		exist or can be created within the given index range.
 *
 * Let's look at a sparc example:
 *	If we want to create a free 512k page, we look at region_size 2
 *	for the mnode we want. We calculate the index and look at a specific
 *	hpm_counters location. If we see 8 (FULL_REGION_CNT on sparc) at
 *	this location, it means that 8 64k pages either exist or can be
 *	created from 8K pages in order to make a single free 512k page at
 *	the given index. Note that when a region is full, it will contribute
 *	to the counts in the region above it. Thus we will not know what page
 *	size the free pages will be which can be promoted to this new free
 *	page unless we look at all regions below the current region.
 */

/*
 * Note: hpmctr_t is defined in platform vm_dep.h
 * hw_page_map_t contains all the information needed for the page_counters
 * logic.
 * The fields are as follows:
 *
 *	hpm_counters:	dynamically allocated array to hold counter data
 *	hpm_entries:	entries in hpm_counters
 *	hpm_shift:	shift for pnum/array index conv
 *	hpm_base:	PFN mapped to counter index 0
 *	hpm_color_current:	last index in counter array for this color at
 *				which we successfully created a large page
 */
typedef struct hw_page_map {
	hpmctr_t	*hpm_counters;
	size_t		hpm_entries;
	int		hpm_shift;
	pfn_t		hpm_base;
	size_t		*hpm_color_current[MAX_MNODE_MRANGES];
#if defined(__sparc)
	uint_t		pad[4];
#endif
} hw_page_map_t;

/*
 * Element zero is not used, but is allocated for convenience.
 */
static hw_page_map_t *page_counters[MMU_PAGE_SIZES];

/*
 * Cached value of MNODE_RANGE_CNT(mnode).
 * This is a function call on x86.
 */
static int mnode_nranges[MAX_MEM_NODES];
static int mnode_maxmrange[MAX_MEM_NODES];

/*
 * The following macros are convenient ways to get access to the individual
 * elements of the page_counters arrays. They can be used on both
 * the left side and right side of equations.
 */
#define	PAGE_COUNTERS(mnode, rg_szc, idx)			\
	(page_counters[(rg_szc)][(mnode)].hpm_counters[(idx)])

#define	PAGE_COUNTERS_COUNTERS(mnode, rg_szc)			\
	(page_counters[(rg_szc)][(mnode)].hpm_counters)

#define	PAGE_COUNTERS_SHIFT(mnode, rg_szc)			\
	(page_counters[(rg_szc)][(mnode)].hpm_shift)

#define	PAGE_COUNTERS_ENTRIES(mnode, rg_szc)			\
	(page_counters[(rg_szc)][(mnode)].hpm_entries)

#define	PAGE_COUNTERS_BASE(mnode, rg_szc)			\
	(page_counters[(rg_szc)][(mnode)].hpm_base)

#define	PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, rg_szc, g)	\
	(page_counters[(rg_szc)][(mnode)].hpm_color_current[(g)])

#define	PAGE_COUNTERS_CURRENT_COLOR(mnode, rg_szc, color, mrange)	\
	(page_counters[(rg_szc)][(mnode)].				\
	hpm_color_current[(mrange)][(color)])

#define	PNUM_TO_IDX(mnode, rg_szc, pnum)			\
	(((pnum) - PAGE_COUNTERS_BASE((mnode), (rg_szc))) >>	\
		PAGE_COUNTERS_SHIFT((mnode), (rg_szc)))

#define	IDX_TO_PNUM(mnode, rg_szc, index)			\
	(PAGE_COUNTERS_BASE((mnode), (rg_szc)) +		\
		((index) << PAGE_COUNTERS_SHIFT((mnode), (rg_szc))))

/*
 * Protects the hpm_counters and hpm_color_current memory from changing while
 * looking at page counters information.
 * Grab the write lock to modify what these fields point at.
 * Grab the read lock to prevent any pointers from changing.
 * The write lock can not be held during memory allocation due to a possible
 * recursion deadlock with trying to grab the read lock while the
 * write lock is already held.
 */
krwlock_t	page_ctrs_rwlock[MAX_MEM_NODES];
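
/*
 * Worked example (illustrative, not part of the original source): if
 * PAGE_COUNTERS_BASE(mnode, r) is pfn 0x80000 and
 * PAGE_COUNTERS_SHIFT(mnode, r) is 3 (8 base pages per region), then
 * pfn 0x80028 maps to index (0x80028 - 0x80000) >> 3 == 5, and
 * IDX_TO_PNUM(mnode, r, 5) recovers 0x80000 + (5 << 3) == 0x80028.
 * The round trip is exact only for region-aligned pfns, which is what
 * the PNUM_TO_IDX/IDX_TO_PNUM identity ASSERTs further below rely on.
 */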

/*
 * initialize cpu_vm_data to point at cache aligned vm_cpu_data_t.
 */
void
cpu_vm_data_init(struct cpu *cp)
{
	if (cp == CPU0) {
		cp->cpu_vm_data = (void *)&vm_cpu_data0;
	} else {
		void	*kmptr;
		int	align;
		size_t	sz;

		align = (L2CACHE_ALIGN) ? L2CACHE_ALIGN : L2CACHE_ALIGN_MAX;
		sz = P2ROUNDUP(sizeof (vm_cpu_data_t), align) + align;
		kmptr = kmem_zalloc(sz, KM_SLEEP);
		cp->cpu_vm_data = (void *)P2ROUNDUP((uintptr_t)kmptr, align);
		((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr = kmptr;
		((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize = sz;
	}
}

/*
 * free cpu_vm_data
 */
void
cpu_vm_data_destroy(struct cpu *cp)
{
	if (cp->cpu_seqid && cp->cpu_vm_data) {
		ASSERT(cp != CPU0);
		kmem_free(((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr,
		    ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize);
	}
	cp->cpu_vm_data = NULL;
}


/*
 * page size to page size code
 */
int
page_szc(size_t pagesize)
{
	int	i = 0;

	while (hw_page_array[i].hp_size) {
		if (pagesize == hw_page_array[i].hp_size)
			return (i);
		i++;
	}
	return (-1);
}

/*
 * page size to page size code with the restriction that it be a supported
 * user page size. If it's not a supported user page size, -1 will be returned.
 */
int
page_szc_user_filtered(size_t pagesize)
{
	int szc = page_szc(pagesize);
	if ((szc != -1) && (SZC_2_USERSZC(szc) != -1)) {
		return (szc);
	}
	return (-1);
}

/*
 * Return how many page sizes are available for the user to use. This is
 * what the hardware supports and not based upon how the OS implements the
 * support of different page sizes.
 *
 * If legacy is non-zero, return the number of pagesizes available to legacy
 * applications. The number of legacy page sizes might be less than the
 * exported user page sizes. This is to prevent legacy applications that
 * use the largest page size returned from getpagesizes(3c) from inadvertently
 * using the 'new' large pagesizes.
 */
uint_t
page_num_user_pagesizes(int legacy)
{
	if (legacy)
		return (mmu_legacy_page_sizes);
	return (mmu_exported_page_sizes);
}

uint_t
page_num_pagesizes(void)
{
	return (mmu_page_sizes);
}

/*
 * returns the count of the number of base pagesize pages associated with szc
 */
pgcnt_t
page_get_pagecnt(uint_t szc)
{
	if (szc >= mmu_page_sizes)
		panic("page_get_pagecnt: out of range %d", szc);
	return (hw_page_array[szc].hp_pgcnt);
}

size_t
page_get_pagesize(uint_t szc)
{
	if (szc >= mmu_page_sizes)
		panic("page_get_pagesize: out of range %d", szc);
	return (hw_page_array[szc].hp_size);
}
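
/*
 * Usage sketch (illustrative, not part of the original source): page size
 * codes and byte sizes round-trip through these routines, so on a platform
 * whose hw_page_array holds, say, 4K and 2M pages the following would hold:
 *
 *	ASSERT(page_szc(page_get_pagesize(1)) == 1);
 *	ASSERT(page_get_pagecnt(1) == page_get_pagesize(1) / PAGESIZE);
 *
 * page_szc() returns -1 for a size the hardware does not support, so
 * callers must check the result before using it as an index.
 */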

/*
 * Return the size of a page based upon the index passed in. An index of
 * zero refers to the smallest page size in the system, and as the index
 * increases it refers to the next larger supported page size in the system.
 * Note that szc and userszc may not be the same due to unsupported szc's on
 * some systems.
 */
size_t
page_get_user_pagesize(uint_t userszc)
{
	uint_t szc = USERSZC_2_SZC(userszc);

	if (szc >= mmu_page_sizes)
		panic("page_get_user_pagesize: out of range %d", szc);
	return (hw_page_array[szc].hp_size);
}

uint_t
page_get_shift(uint_t szc)
{
	if (szc >= mmu_page_sizes)
		panic("page_get_shift: out of range %d", szc);
	return (PAGE_GET_SHIFT(szc));
}

uint_t
page_get_pagecolors(uint_t szc)
{
	if (szc >= mmu_page_sizes)
		panic("page_get_pagecolors: out of range %d", szc);
	return (PAGE_GET_PAGECOLORS(szc));
}

/*
 * this assigns the desired equivalent color after a split
 */
uint_t
page_correct_color(uchar_t szc, uchar_t nszc, uint_t color,
    uint_t ncolor, uint_t ceq_mask)
{
	ASSERT(nszc > szc);
	ASSERT(szc < mmu_page_sizes);
	ASSERT(color < PAGE_GET_PAGECOLORS(szc));
	ASSERT(ncolor < PAGE_GET_PAGECOLORS(nszc));

	color &= ceq_mask;
	ncolor = PAGE_CONVERT_COLOR(ncolor, szc, nszc);
	return (color | (ncolor & ~ceq_mask));
}

/*
 * The interleaved_mnodes flag is set when mnodes overlap in
 * the physbase..physmax range, but have disjoint slices.
 * In this case hpm_counters is shared by all mnodes.
 * This flag is set dynamically by the platform.
 */
int interleaved_mnodes = 0;
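
/*
 * Worked example (illustrative, not part of the original source): the new
 * smaller page keeps the requested color bits selected by ceq_mask and
 * inherits the rest from the (converted) parent color. With color == 0x15,
 * ceq_mask == 0x0f and a converted parent color of 0x32,
 * page_correct_color() returns
 * (0x15 & 0x0f) | (0x32 & ~0x0f) == 0x05 | 0x30 == 0x35.
 */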

/*
 * Called by startup().
 * Size up the per page size free list counters based on physmax
 * of each node and max_mem_nodes.
 *
 * If interleaved_mnodes is set we need to find the first mnode that
 * exists. hpm_counters for the first mnode will then be shared by
 * all other mnodes. If interleaved_mnodes is not set, just set
 * first=mnode each time. That means there will be no sharing.
 */
size_t
page_ctrs_sz(void)
{
	int	r;		/* region size */
	int	mnode;
	int	firstmn;	/* first mnode that exists */
	int	nranges;
	pfn_t	physbase;
	pfn_t	physmax;
	uint_t	ctrs_sz = 0;
	int	i;
	pgcnt_t colors_per_szc[MMU_PAGE_SIZES];

	/*
	 * We need to determine how many page colors there are for each
	 * page size in order to allocate memory for any color specific
	 * arrays.
	 */
	for (i = 0; i < mmu_page_sizes; i++) {
		colors_per_szc[i] = PAGE_GET_PAGECOLORS(i);
	}

	for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) {

		pgcnt_t r_pgcnt;
		pfn_t   r_base;
		pgcnt_t r_align;

		if (mem_node_config[mnode].exists == 0)
			continue;

		HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn);
		nranges = MNODE_RANGE_CNT(mnode);
		mnode_nranges[mnode] = nranges;
		mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode);

		/*
		 * determine size needed for page counter arrays with
		 * base aligned to large page size.
		 */
		for (r = 1; r < mmu_page_sizes; r++) {
			/* add in space for hpm_color_current */
			ctrs_sz += sizeof (size_t) *
			    colors_per_szc[r] * nranges;

			if (firstmn != mnode)
				continue;

			/* add in space for hpm_counters */
			r_align = page_get_pagecnt(r);
			r_base = physbase;
			r_base &= ~(r_align - 1);
			r_pgcnt = howmany(physmax - r_base + 1, r_align);

			/*
			 * Round up to always allocate on pointer sized
			 * boundaries.
			 */
			ctrs_sz += P2ROUNDUP((r_pgcnt * sizeof (hpmctr_t)),
			    sizeof (hpmctr_t *));
		}
	}

	for (r = 1; r < mmu_page_sizes; r++) {
		ctrs_sz += (max_mem_nodes * sizeof (hw_page_map_t));
	}

	/* add in space for page_ctrs_cands and pcc_color_free */
	ctrs_sz += sizeof (pcc_info_t *) * max_mem_nodes *
	    mmu_page_sizes * NPC_MUTEX;

	for (mnode = 0; mnode < max_mem_nodes; mnode++) {

		if (mem_node_config[mnode].exists == 0)
			continue;

		nranges = mnode_nranges[mnode];
		ctrs_sz += sizeof (pcc_info_t) * nranges *
		    mmu_page_sizes * NPC_MUTEX;
		for (r = 1; r < mmu_page_sizes; r++) {
			ctrs_sz += sizeof (pgcnt_t) * nranges *
			    colors_per_szc[r] * NPC_MUTEX;
		}
	}

	/* ctr_mutex */
	ctrs_sz += (max_mem_nodes * NPC_MUTEX * sizeof (kmutex_t));

	/* size for page list counts */
	PLCNT_SZ(ctrs_sz);

	/*
	 * add some slop for roundups. page_ctrs_alloc will round up the
	 * start address of the counters to ecache_alignsize boundary for
	 * every memory node.
	 */
	return (ctrs_sz + max_mem_nodes * L2CACHE_ALIGN);
}
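
/*
 * Usage sketch (illustrative, not part of the original source): the
 * platform startup path is expected to size and then carve the counter
 * arena in two steps, along these lines:
 *
 *	size_t ctrs_sz = page_ctrs_sz();
 *	caddr_t base = (bootstrap allocation of ctrs_sz bytes);
 *	caddr_t end = page_ctrs_alloc(base);
 *	ASSERT(end <= base + ctrs_sz);
 *
 * page_ctrs_sz() intentionally over-estimates (the L2CACHE_ALIGN slop)
 * so that page_ctrs_alloc() can round each mnode's counters up to a
 * cache-aligned boundary without overrunning the arena.
 */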

caddr_t
page_ctrs_alloc(caddr_t alloc_base)
{
	int	mnode;
	int	mrange, nranges;
	int	r;		/* region size */
	int	i;
	int	firstmn;	/* first mnode that exists */
	pfn_t	physbase;
	pfn_t	physmax;
	pgcnt_t colors_per_szc[MMU_PAGE_SIZES];

	/*
	 * We need to determine how many page colors there are for each
	 * page size in order to allocate memory for any color specific
	 * arrays.
	 */
	for (i = 0; i < mmu_page_sizes; i++) {
		colors_per_szc[i] = PAGE_GET_PAGECOLORS(i);
	}

	for (r = 1; r < mmu_page_sizes; r++) {
		page_counters[r] = (hw_page_map_t *)alloc_base;
		alloc_base += (max_mem_nodes * sizeof (hw_page_map_t));
	}

	/* page_ctrs_cands and pcc_color_free array */
	for (i = 0; i < NPC_MUTEX; i++) {
		for (r = 1; r < mmu_page_sizes; r++) {

			page_ctrs_cands[i][r] = (pcc_info_t **)alloc_base;
			alloc_base += sizeof (pcc_info_t *) * max_mem_nodes;

			for (mnode = 0; mnode < max_mem_nodes; mnode++) {
				pcc_info_t	*pi;

				if (mem_node_config[mnode].exists == 0)
					continue;

				nranges = mnode_nranges[mnode];

				pi = (pcc_info_t *)alloc_base;
				alloc_base += sizeof (pcc_info_t) * nranges;
				page_ctrs_cands[i][r][mnode] = pi;

				for (mrange = 0; mrange < nranges; mrange++) {
					pi->pcc_color_free =
					    (pgcnt_t *)alloc_base;
					alloc_base += sizeof (pgcnt_t) *
					    colors_per_szc[r];
					pi++;
				}
			}
		}
	}

	/* ctr_mutex */
	for (i = 0; i < NPC_MUTEX; i++) {
		ctr_mutex[i] = (kmutex_t *)alloc_base;
		alloc_base += (max_mem_nodes * sizeof (kmutex_t));
	}

	/* initialize page list counts */
	PLCNT_INIT(alloc_base);

	for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) {

		pgcnt_t r_pgcnt;
		pfn_t	r_base;
		pgcnt_t r_align;
		int	r_shift;
		int	nranges = mnode_nranges[mnode];

		if (mem_node_config[mnode].exists == 0)
			continue;

		HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn);

		for (r = 1; r < mmu_page_sizes; r++) {
			/*
			 * the page_counters base has to be aligned to the
			 * page count of page size code r otherwise the counts
			 * will cross large page boundaries.
			 */
			r_align = page_get_pagecnt(r);
			r_base = physbase;
			/* base needs to be aligned - lower to aligned value */
			r_base &= ~(r_align - 1);
			r_pgcnt = howmany(physmax - r_base + 1, r_align);
			r_shift = PAGE_BSZS_SHIFT(r);

			PAGE_COUNTERS_SHIFT(mnode, r) = r_shift;
			PAGE_COUNTERS_ENTRIES(mnode, r) = r_pgcnt;
			PAGE_COUNTERS_BASE(mnode, r) = r_base;
			for (mrange = 0; mrange < nranges; mrange++) {
				PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode,
				    r, mrange) = (size_t *)alloc_base;
				alloc_base += sizeof (size_t) *
				    colors_per_szc[r];
			}
			for (i = 0; i < colors_per_szc[r]; i++) {
				uint_t color_mask = colors_per_szc[r] - 1;
				pfn_t  pfnum = r_base;
				size_t idx;
				int mrange;
				MEM_NODE_ITERATOR_DECL(it);

				MEM_NODE_ITERATOR_INIT(pfnum, mnode, r, &it);
				if (pfnum == (pfn_t)-1) {
					idx = 0;
				} else {
					PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i,
					    color_mask, color_mask, &it);
					idx = PNUM_TO_IDX(mnode, r, pfnum);
					idx = (idx >= r_pgcnt) ? 0 : idx;
				}
				for (mrange = 0; mrange < nranges; mrange++) {
					PAGE_COUNTERS_CURRENT_COLOR(mnode,
					    r, i, mrange) = idx;
				}
			}

			/* hpm_counters may be shared by all mnodes */
			if (firstmn == mnode) {
				PAGE_COUNTERS_COUNTERS(mnode, r) =
				    (hpmctr_t *)alloc_base;
				alloc_base +=
				    P2ROUNDUP((sizeof (hpmctr_t) * r_pgcnt),
				    sizeof (hpmctr_t *));
			} else {
				PAGE_COUNTERS_COUNTERS(mnode, r) =
				    PAGE_COUNTERS_COUNTERS(firstmn, r);
			}

			/*
			 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
			 * satisfy the identity requirement.
			 * We should be able to go from one to the other
			 * and get consistent values.
			 */
			ASSERT(PNUM_TO_IDX(mnode, r,
			    (IDX_TO_PNUM(mnode, r, 0))) == 0);
			ASSERT(IDX_TO_PNUM(mnode, r,
			    (PNUM_TO_IDX(mnode, r, r_base))) == r_base);
		}
		/*
		 * Round up the start address of the page_counters to
		 * cache aligned boundary for every memory node.
		 * page_ctrs_sz() has added some slop for these roundups.
		 */
		alloc_base = (caddr_t)P2ROUNDUP((uintptr_t)alloc_base,
		    L2CACHE_ALIGN);
	}

	/* Initialize other page counter specific data structures. */
	for (mnode = 0; mnode < MAX_MEM_NODES; mnode++) {
		rw_init(&page_ctrs_rwlock[mnode], NULL, RW_DEFAULT, NULL);
	}

	return (alloc_base);
}
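
/*
 * Worked example (illustrative, not part of the original source) of the
 * counter cascade implemented by page_ctr_add_internal() below: on sparc,
 * freeing an 8K page increments the region 1 (64K) counter for its group;
 * if that counter reaches FULL_REGION_CNT(1) == 8, the pcc_pages_free
 * candidate summary for that (mutex, region, mnode, mrange) slot is bumped
 * and the walk continues by incrementing the region 2 (512K) counter,
 * stopping at the first region whose counter does not become full.
 * page_ctr_sub_internal() runs the same cascade in reverse.
 */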

/*
 * Functions to adjust region counters for each size free list.
 * The caller is responsible for acquiring the ctr_mutex lock if necessary;
 * these functions can thus be called during startup without locks.
 */
/* ARGSUSED */
void
page_ctr_add_internal(int mnode, int mtype, page_t *pp, int flags)
{
	ssize_t		r;	/* region size */
	ssize_t		idx;
	pfn_t		pfnum;
	int		lckidx;

	ASSERT(mnode == PP_2_MEM_NODE(pp));
	ASSERT(mtype == PP_2_MTYPE(pp));

	ASSERT(pp->p_szc < mmu_page_sizes);

	PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);

	/* no counter update needed for largest page size */
	if (pp->p_szc >= mmu_page_sizes - 1) {
		return;
	}

	r = pp->p_szc + 1;
	pfnum = pp->p_pagenum;
	lckidx = PP_CTR_LOCK_INDX(pp);

	/*
	 * Increment the count of free pages for the current
	 * region. Continue looping up in region size, incrementing the
	 * count, if the preceding region is full.
	 */
	while (r < mmu_page_sizes) {
		idx = PNUM_TO_IDX(mnode, r, pfnum);

		ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
		ASSERT(PAGE_COUNTERS(mnode, r, idx) < FULL_REGION_CNT(r));

		if (++PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r)) {
			break;
		} else {
			int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r));
			pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode]
			    [MTYPE_2_MRANGE(mnode, root_mtype)];

			cand->pcc_pages_free++;
			cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]++;
		}
		r++;
	}
}

void
page_ctr_add(int mnode, int mtype, page_t *pp, int flags)
{
	int		lckidx = PP_CTR_LOCK_INDX(pp);
	kmutex_t	*lock = &ctr_mutex[lckidx][mnode];

	mutex_enter(lock);
	page_ctr_add_internal(mnode, mtype, pp, flags);
	mutex_exit(lock);
}

void
page_ctr_sub_internal(int mnode, int mtype, page_t *pp, int flags)
{
	int		lckidx;
	ssize_t		r;	/* region size */
	ssize_t		idx;
	pfn_t		pfnum;

	ASSERT(mnode == PP_2_MEM_NODE(pp));
	ASSERT(mtype == PP_2_MTYPE(pp));

	ASSERT(pp->p_szc < mmu_page_sizes);

	PLCNT_DECR(pp, mnode, mtype, pp->p_szc, flags);

	/* no counter update needed for largest page size */
	if (pp->p_szc >= mmu_page_sizes - 1) {
		return;
	}

	r = pp->p_szc + 1;
	pfnum = pp->p_pagenum;
	lckidx = PP_CTR_LOCK_INDX(pp);

	/*
	 * Decrement the count of free pages for the current
	 * region. Continue looping up in region size, decrementing the
	 * count, if the preceding region was full.
	 */
	while (r < mmu_page_sizes) {
		idx = PNUM_TO_IDX(mnode, r, pfnum);

		ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
		ASSERT(PAGE_COUNTERS(mnode, r, idx) > 0);

		if (--PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r) - 1) {
			break;
		} else {
			int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r));
			pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode]
			    [MTYPE_2_MRANGE(mnode, root_mtype)];

			ASSERT(cand->pcc_pages_free != 0);
			ASSERT(cand->pcc_color_free[PP_2_BIN_SZC(pp, r)] != 0);

			cand->pcc_pages_free--;
			cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]--;
		}
		r++;
	}
}

void
page_ctr_sub(int mnode, int mtype, page_t *pp, int flags)
{
	int		lckidx = PP_CTR_LOCK_INDX(pp);
	kmutex_t	*lock = &ctr_mutex[lckidx][mnode];

	mutex_enter(lock);
	page_ctr_sub_internal(mnode, mtype, pp, flags);
	mutex_exit(lock);
}

/*
 * Adjust page counters following a memory attach, since typically the
 * size of the array needs to change, and the PFN to counter index
 * mapping needs to change.
 *
 * It is possible this mnode did not exist at startup. In that case
 * allocate pcc_info_t and pcc_color_free arrays. Also, allow for nranges
 * to change (a theoretical possibility on x86), which means pcc_color_free
 * arrays must be extended.
 */
uint_t
page_ctrs_adjust(int mnode)
{
	pgcnt_t npgs;
	int	r;		/* region size */
	int	i;
	size_t	pcsz, old_csz;
	hpmctr_t *new_ctr, *old_ctr;
	pfn_t	oldbase, newbase;
	pfn_t	physbase, physmax;
	size_t	old_npgs;
	hpmctr_t *ctr_cache[MMU_PAGE_SIZES];
	size_t	size_cache[MMU_PAGE_SIZES];
	size_t	*color_cache[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
	size_t	*old_color_array[MAX_MNODE_MRANGES];
	pgcnt_t	colors_per_szc[MMU_PAGE_SIZES];
	pcc_info_t **cands_cache;
	pcc_info_t *old_pi, *pi;
	pgcnt_t *pgcntp;
	int nr, old_nranges, mrange, nranges = MNODE_RANGE_CNT(mnode);
	int cands_cache_nranges;
	int old_maxmrange, new_maxmrange;
	int rc = 0;
	int oldmnode;

	cands_cache = kmem_zalloc(sizeof (pcc_info_t *) * NPC_MUTEX *
	    MMU_PAGE_SIZES, KM_NOSLEEP);
	if (cands_cache == NULL)
		return (ENOMEM);

	i = -1;
	HPM_COUNTERS_LIMITS(mnode, physbase, physmax, i);

	newbase = physbase & ~PC_BASE_ALIGN_MASK;
	npgs = roundup(physmax, PC_BASE_ALIGN) - newbase;

	/* prepare to free non-null pointers on the way out */
	cands_cache_nranges = nranges;
	bzero(ctr_cache, sizeof (ctr_cache));
	bzero(color_cache, sizeof (color_cache));

	/*
	 * We need to determine how many page colors there are for each
	 * page size in order to allocate memory for any color specific
	 * arrays.
	 */
	for (r = 0; r < mmu_page_sizes; r++) {
		colors_per_szc[r] = PAGE_GET_PAGECOLORS(r);
	}

	/*
	 * Preallocate all of the new hpm_counters arrays as we can't
	 * hold the page_ctrs_rwlock as a writer and allocate memory.
	 * If we can't allocate all of the arrays, undo our work so far
	 * and return failure.
	 */
	for (r = 1; r < mmu_page_sizes; r++) {
		pcsz = npgs >> PAGE_BSZS_SHIFT(r);
		size_cache[r] = pcsz;
		ctr_cache[r] = kmem_zalloc(pcsz *
		    sizeof (hpmctr_t), KM_NOSLEEP);
		if (ctr_cache[r] == NULL) {
			rc = ENOMEM;
			goto cleanup;
		}
	}

	/*
	 * Preallocate all of the new color current arrays as we can't
	 * hold the page_ctrs_rwlock as a writer and allocate memory.
	 * If we can't allocate all of the arrays, undo our work so far
	 * and return failure.
	 */
	for (r = 1; r < mmu_page_sizes; r++) {
		for (mrange = 0; mrange < nranges; mrange++) {
			color_cache[r][mrange] = kmem_zalloc(sizeof (size_t) *
			    colors_per_szc[r], KM_NOSLEEP);
			if (color_cache[r][mrange] == NULL) {
				rc = ENOMEM;
				goto cleanup;
			}
		}
	}

	/*
	 * Preallocate all of the new pcc_info_t arrays as we can't
	 * hold the page_ctrs_rwlock as a writer and allocate memory.
	 * If we can't allocate all of the arrays, undo our work so far
	 * and return failure.
	 */
	for (r = 1; r < mmu_page_sizes; r++) {
		for (i = 0; i < NPC_MUTEX; i++) {
			pi = kmem_zalloc(nranges * sizeof (pcc_info_t),
			    KM_NOSLEEP);
			if (pi == NULL) {
				rc = ENOMEM;
				goto cleanup;
			}
			cands_cache[i * MMU_PAGE_SIZES + r] = pi;

			for (mrange = 0; mrange < nranges; mrange++, pi++) {
				pgcntp = kmem_zalloc(colors_per_szc[r] *
				    sizeof (pgcnt_t), KM_NOSLEEP);
				if (pgcntp == NULL) {
					rc = ENOMEM;
					goto cleanup;
				}
				pi->pcc_color_free = pgcntp;
			}
		}
	}

	/*
	 * Grab the write lock to prevent others from walking these arrays
	 * while we are modifying them.
	 */
	PAGE_CTRS_WRITE_LOCK(mnode);

	/*
	 * For interleaved mnodes, find the first mnode
	 * with valid page counters since the current
	 * mnode may have just been added and not have
	 * valid page counters.
	 */
	if (interleaved_mnodes) {
		for (i = 0; i < max_mem_nodes; i++)
			if (PAGE_COUNTERS_COUNTERS(i, 1) != NULL)
				break;
		ASSERT(i < max_mem_nodes);
		oldmnode = i;
	} else
		oldmnode = mnode;

	old_nranges = mnode_nranges[mnode];
	cands_cache_nranges = old_nranges;
	mnode_nranges[mnode] = nranges;
	old_maxmrange = mnode_maxmrange[mnode];
	mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode);
	new_maxmrange = mnode_maxmrange[mnode];

	for (r = 1; r < mmu_page_sizes; r++) {
		PAGE_COUNTERS_SHIFT(mnode, r) = PAGE_BSZS_SHIFT(r);
		old_ctr = PAGE_COUNTERS_COUNTERS(oldmnode, r);
		old_csz = PAGE_COUNTERS_ENTRIES(oldmnode, r);
		oldbase = PAGE_COUNTERS_BASE(oldmnode, r);
		old_npgs = old_csz << PAGE_COUNTERS_SHIFT(oldmnode, r);
		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
			old_color_array[mrange] =
			    PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode,
			    r, mrange);
		}

		pcsz = npgs >> PAGE_COUNTERS_SHIFT(mnode, r);
		new_ctr = ctr_cache[r];
		ctr_cache[r] = NULL;
		if (old_ctr != NULL &&
		    (oldbase + old_npgs > newbase) &&
		    (newbase + npgs > oldbase)) {
			/*
			 * Map the intersection of the old and new
			 * counters into the new array.
			 */
			size_t offset;
			if (newbase > oldbase) {
				offset = (newbase - oldbase) >>
				    PAGE_COUNTERS_SHIFT(mnode, r);
				bcopy(old_ctr + offset, new_ctr,
				    MIN(pcsz, (old_csz - offset)) *
				    sizeof (hpmctr_t));
			} else {
				offset = (oldbase - newbase) >>
				    PAGE_COUNTERS_SHIFT(mnode, r);
				bcopy(old_ctr, new_ctr + offset,
				    MIN(pcsz - offset, old_csz) *
				    sizeof (hpmctr_t));
			}
		}

		PAGE_COUNTERS_COUNTERS(mnode, r) = new_ctr;
		PAGE_COUNTERS_ENTRIES(mnode, r) = pcsz;
		PAGE_COUNTERS_BASE(mnode, r) = newbase;

		/* update shared hpm_counters in other mnodes */
		if (interleaved_mnodes) {
			for (i = 0; i < max_mem_nodes; i++) {
				if ((i == mnode) ||
				    (mem_node_config[i].exists == 0))
					continue;
				ASSERT(
				    PAGE_COUNTERS_COUNTERS(i, r) == old_ctr ||
				    PAGE_COUNTERS_COUNTERS(i, r) == NULL);
				PAGE_COUNTERS_COUNTERS(i, r) = new_ctr;
				PAGE_COUNTERS_ENTRIES(i, r) = pcsz;
				PAGE_COUNTERS_BASE(i, r) = newbase;
			}
		}

		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
			PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r, mrange) =
			    color_cache[r][mrange];
			color_cache[r][mrange] = NULL;
		}
		/*
		 * for now, just reset on these events as it's probably
		 * not worthwhile to try and optimize this.
		 */
		for (i = 0; i < colors_per_szc[r]; i++) {
			uint_t color_mask = colors_per_szc[r] - 1;
			int mlo = interleaved_mnodes ? 0 : mnode;
			int mhi = interleaved_mnodes ? max_mem_nodes :
			    (mnode + 1);
			int m;
			pfn_t  pfnum;
			size_t idx;
			MEM_NODE_ITERATOR_DECL(it);

			for (m = mlo; m < mhi; m++) {
				if (mem_node_config[m].exists == 0)
					continue;
				pfnum = newbase;
				MEM_NODE_ITERATOR_INIT(pfnum, m, r, &it);
				if (pfnum == (pfn_t)-1) {
					idx = 0;
				} else {
					PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i,
					    color_mask, color_mask, &it);
					idx = PNUM_TO_IDX(m, r, pfnum);
					idx = (idx < pcsz) ? idx : 0;
				}
				for (mrange = 0; mrange < nranges; mrange++) {
					if (PAGE_COUNTERS_CURRENT_COLOR_ARRAY(m,
					    r, mrange) != NULL)
						PAGE_COUNTERS_CURRENT_COLOR(m,
						    r, i, mrange) = idx;
				}
			}
		}

		/* cache info for freeing out of the critical path */
		if ((caddr_t)old_ctr >= kernelheap &&
		    (caddr_t)old_ctr < ekernelheap) {
			ctr_cache[r] = old_ctr;
			size_cache[r] = old_csz;
		}
		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
			size_t *tmp = old_color_array[mrange];
			if ((caddr_t)tmp >= kernelheap &&
			    (caddr_t)tmp < ekernelheap) {
				color_cache[r][mrange] = tmp;
			}
		}
		/*
		 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
		 * satisfy the identity requirement.
		 * We should be able to go from one to the other
		 * and get consistent values.
		 */
		ASSERT(PNUM_TO_IDX(mnode, r,
		    (IDX_TO_PNUM(mnode, r, 0))) == 0);
		ASSERT(IDX_TO_PNUM(mnode, r,
		    (PNUM_TO_IDX(mnode, r, newbase))) == newbase);

		/* pcc_info_t and pcc_color_free */
		for (i = 0; i < NPC_MUTEX; i++) {
			pcc_info_t *epi;
			pcc_info_t *eold_pi;

			pi = cands_cache[i * MMU_PAGE_SIZES + r];
			old_pi = page_ctrs_cands[i][r][mnode];
			page_ctrs_cands[i][r][mnode] = pi;
			cands_cache[i * MMU_PAGE_SIZES + r] = old_pi;

			/* preserve old pcc_color_free values, if any */
			if (old_pi == NULL)
				continue;

			/*
			 * when/if x86 does DR, must account for
			 * possible change in range index when
			 * preserving pcc_info
			 */
			epi = &pi[nranges];
			eold_pi = &old_pi[old_nranges];
			if (new_maxmrange > old_maxmrange) {
				pi += new_maxmrange - old_maxmrange;
			} else if (new_maxmrange < old_maxmrange) {
				old_pi += old_maxmrange - new_maxmrange;
			}
			for (; pi < epi && old_pi < eold_pi; pi++, old_pi++) {
				pcc_info_t tmp = *pi;
				*pi = *old_pi;
				*old_pi = tmp;
			}
		}
	}
	PAGE_CTRS_WRITE_UNLOCK(mnode);

	/*
	 * Now that we have dropped the write lock, it is safe to free all
	 * of the memory we have cached above.
	 * We come through here to free memory when pre-alloc fails, and also
	 * to free old pointers which were recorded while locked.
	 */
cleanup:
	for (r = 1; r < mmu_page_sizes; r++) {
		if (ctr_cache[r] != NULL) {
			kmem_free(ctr_cache[r],
			    size_cache[r] * sizeof (hpmctr_t));
		}
		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
			if (color_cache[r][mrange] != NULL) {
				kmem_free(color_cache[r][mrange],
				    colors_per_szc[r] * sizeof (size_t));
			}
		}
		for (i = 0; i < NPC_MUTEX; i++) {
			pi = cands_cache[i * MMU_PAGE_SIZES + r];
			if (pi == NULL)
				continue;
			nr = cands_cache_nranges;
			for (mrange = 0; mrange < nr; mrange++, pi++) {
				pgcntp = pi->pcc_color_free;
				if (pgcntp == NULL)
					continue;
				if ((caddr_t)pgcntp >= kernelheap &&
				    (caddr_t)pgcntp < ekernelheap) {
					kmem_free(pgcntp,
					    colors_per_szc[r] *
					    sizeof (pgcnt_t));
				}
			}
			pi = cands_cache[i * MMU_PAGE_SIZES + r];
			if ((caddr_t)pi >= kernelheap &&
			    (caddr_t)pi < ekernelheap) {
				kmem_free(pi, nr * sizeof (pcc_info_t));
			}
		}
	}

	kmem_free(cands_cache,
	    sizeof (pcc_info_t *) * NPC_MUTEX * MMU_PAGE_SIZES);
	return (rc);
}

/*
 * Cleanup the hpm_counters field in the page counters
 * array.
 */
void
page_ctrs_cleanup(void)
{
	int r;	/* region size */
	int i;	/* mnode index */

	/*
	 * Get the page counters write lock while we are
	 * setting the page hpm_counters field to NULL
	 * for non-existent mnodes.
	 */
	for (i = 0; i < max_mem_nodes; i++) {
		PAGE_CTRS_WRITE_LOCK(i);
		if (mem_node_config[i].exists) {
			PAGE_CTRS_WRITE_UNLOCK(i);
			continue;
		}
		for (r = 1; r < mmu_page_sizes; r++) {
			PAGE_COUNTERS_COUNTERS(i, r) = NULL;
		}
		PAGE_CTRS_WRITE_UNLOCK(i);
	}
}

#ifdef DEBUG

/*
 * confirm pp is a large page corresponding to szc
 */
void
chk_lpg(page_t *pp, uchar_t szc)
{
	spgcnt_t npgs = page_get_pagecnt(pp->p_szc);
	uint_t noreloc;

	if (npgs == 1) {
		ASSERT(pp->p_szc == 0);
		ASSERT(pp->p_next == pp);
		ASSERT(pp->p_prev == pp);
		return;
	}

	ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL);
	ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL);

	ASSERT(IS_P2ALIGNED(pp->p_pagenum, npgs));
	ASSERT(pp->p_pagenum == (pp->p_next->p_pagenum - 1));
	ASSERT(pp->p_prev->p_pagenum == (pp->p_pagenum + (npgs - 1)));
	ASSERT(pp->p_prev == (pp + (npgs - 1)));

	/*
	 * Check list of pages.
	 */
	noreloc = PP_ISNORELOC(pp);
	while (npgs--) {
		if (npgs != 0) {
			ASSERT(pp->p_pagenum == pp->p_next->p_pagenum - 1);
			ASSERT(pp->p_next == (pp + 1));
		}
		ASSERT(pp->p_szc == szc);
		ASSERT(PP_ISFREE(pp));
		ASSERT(PP_ISAGED(pp));
		ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL);
		ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL);
		ASSERT(pp->p_vnode == NULL);
		ASSERT(PP_ISNORELOC(pp) == noreloc);

		pp = pp->p_next;
	}
}
#endif /* DEBUG */

void
page_freelist_lock(int mnode)
{
	int i;
	for (i = 0; i < NPC_MUTEX; i++) {
		mutex_enter(FPC_MUTEX(mnode, i));
		mutex_enter(CPC_MUTEX(mnode, i));
	}
}

void
page_freelist_unlock(int mnode)
{
	int i;
	for (i = 0; i < NPC_MUTEX; i++) {
		mutex_exit(FPC_MUTEX(mnode, i));
		mutex_exit(CPC_MUTEX(mnode, i));
	}
}
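
/*
 * Usage sketch (illustrative, not part of the original source): callers
 * that must fence off all promotion/demotion activity for an mnode take
 * every per-mutex freelist and cachelist lock through the wrappers above:
 *
 *	page_freelist_lock(mnode);
 *	(void) page_demote(mnode, pfnum, 0, szc, 0, PC_NO_COLOR, PC_FREE);
 *	page_freelist_unlock(mnode);
 *
 * as page_list_sub() and page_promote_size() below do. The FPC/CPC
 * mutexes are always acquired in the same index order, so the wrappers
 * compose safely with each other.
 */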

/*
 * add pp to the specified page list. Defaults to head of the page list
 * unless PG_LIST_TAIL is specified.
 */
void
page_list_add(page_t *pp, int flags)
{
	page_t		**ppp;
	kmutex_t	*pcm;
	uint_t		bin, mtype;
	int		mnode;

	ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
	ASSERT(PP_ISFREE(pp));
	ASSERT(!hat_page_is_mapped(pp));
	ASSERT(hat_page_getshare(pp) == 0);

	/*
	 * Large pages should be freed via page_list_add_pages().
	 */
	ASSERT(pp->p_szc == 0);

	/*
	 * Don't need to lock the freelist first here
	 * because the page isn't on the freelist yet.
	 * This means p_szc can't change on us.
	 */

	bin = PP_2_BIN(pp);
	mnode = PP_2_MEM_NODE(pp);
	mtype = PP_2_MTYPE(pp);

	if (flags & PG_LIST_ISINIT) {
		/*
		 * PG_LIST_ISINIT is set during system startup (i.e.,
		 * single threaded); add a page to the free list and add
		 * to the free region counters w/o any locking
		 */
		ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);

		/* inline version of page_add() */
		if (*ppp != NULL) {
			pp->p_next = *ppp;
			pp->p_prev = (*ppp)->p_prev;
			(*ppp)->p_prev = pp;
			pp->p_prev->p_next = pp;
		} else
			*ppp = pp;

		page_ctr_add_internal(mnode, mtype, pp, flags);
		VM_STAT_ADD(vmm_vmstats.pladd_free[0]);
	} else {
		pcm = PC_BIN_MUTEX(mnode, bin, flags);

		if (flags & PG_FREE_LIST) {
			VM_STAT_ADD(vmm_vmstats.pladd_free[0]);
			ASSERT(PP_ISAGED(pp));
			ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);

		} else {
			VM_STAT_ADD(vmm_vmstats.pladd_cache);
			ASSERT(pp->p_vnode);
			ASSERT((pp->p_offset & PAGEOFFSET) == 0);
			ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
		}
		mutex_enter(pcm);
		page_add(ppp, pp);

		if (flags & PG_LIST_TAIL)
			*ppp = (*ppp)->p_next;
		/*
		 * Add counters before releasing pcm mutex to avoid a race with
		 * page_freelist_coalesce and page_freelist_split.
		 */
		page_ctr_add(mnode, mtype, pp, flags);
		mutex_exit(pcm);
	}


#if defined(__sparc)
	if (PP_ISNORELOC(pp)) {
		kcage_freemem_add(1);
	}
#endif
	/*
	 * It is up to the caller to unlock the page!
	 */
	ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
}
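
/*
 * Usage sketch (illustrative, not part of the original source): a typical
 * single-page free path holds the page exclusively locked and pushes it
 * on the tail of its freelist:
 *
 *	ASSERT(PAGE_EXCL(pp) && PP_ISFREE(pp) && pp->p_szc == 0);
 *	page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
 *	page_unlock(pp);
 *
 * PG_CACHE_LIST is used instead for pages that still carry a vnode
 * association, and PG_LIST_ISINIT is reserved for single-threaded startup.
 */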

#ifdef __sparc
/*
 * This routine is only used by kcage_init during system startup.
 * It performs the function of page_list_sub/PP_SETNORELOC/page_list_add
 * without the overhead of taking locks and updating counters.
 */
void
page_list_noreloc_startup(page_t *pp)
{
	page_t		**ppp;
	uint_t		bin;
	int		mnode;
	int		mtype;
	int		flags = 0;

	/*
	 * If this is a large page on the freelist then
	 * break it up into smaller pages.
	 */
	if (pp->p_szc != 0)
		page_boot_demote(pp);

	/*
	 * Get list page is currently on.
	 */
	bin = PP_2_BIN(pp);
	mnode = PP_2_MEM_NODE(pp);
	mtype = PP_2_MTYPE(pp);
	ASSERT(mtype == MTYPE_RELOC);
	ASSERT(pp->p_szc == 0);

	if (PP_ISAGED(pp)) {
		ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
		flags |= PG_FREE_LIST;
	} else {
		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
		flags |= PG_CACHE_LIST;
	}

	ASSERT(*ppp != NULL);

	/*
	 * Delete page from current list.
	 */
	if (*ppp == pp)
		*ppp = pp->p_next;		/* go to next page */
	if (*ppp == pp) {
		*ppp = NULL;			/* page list is gone */
	} else {
		pp->p_prev->p_next = pp->p_next;
		pp->p_next->p_prev = pp->p_prev;
	}

	/*
	 * Decrement page counters
	 */
	page_ctr_sub_internal(mnode, mtype, pp, flags);

	/*
	 * Set no reloc for cage initted pages.
	 */
	PP_SETNORELOC(pp);

	mtype = PP_2_MTYPE(pp);
	ASSERT(mtype == MTYPE_NORELOC);

	/*
	 * Get new list for page.
	 */
	if (PP_ISAGED(pp)) {
		ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
	} else {
		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
	}

	/*
	 * Insert page on new list.
	 */
	if (*ppp == NULL) {
		*ppp = pp;
		pp->p_next = pp->p_prev = pp;
	} else {
		pp->p_next = *ppp;
		pp->p_prev = (*ppp)->p_prev;
		(*ppp)->p_prev = pp;
		pp->p_prev->p_next = pp;
	}

	/*
	 * Increment page counters
	 */
	page_ctr_add_internal(mnode, mtype, pp, flags);

	/*
	 * Update cage freemem counter
	 */
	atomic_inc_ulong(&kcage_freemem);
}
#else /* __sparc */

/* ARGSUSED */
void
page_list_noreloc_startup(page_t *pp)
{
	panic("page_list_noreloc_startup: should be here only for sparc");
}
#endif

void
page_list_add_pages(page_t *pp, int flags)
{
	kmutex_t *pcm;
	pgcnt_t	pgcnt;
	uint_t	bin, mtype, i;
	int	mnode;

	/* default to freelist/head */
	ASSERT((flags & (PG_CACHE_LIST | PG_LIST_TAIL)) == 0);

	CHK_LPG(pp, pp->p_szc);
	VM_STAT_ADD(vmm_vmstats.pladd_free[pp->p_szc]);

	bin = PP_2_BIN(pp);
	mnode = PP_2_MEM_NODE(pp);
	mtype = PP_2_MTYPE(pp);

	if (flags & PG_LIST_ISINIT) {
		ASSERT(pp->p_szc == mmu_page_sizes - 1);
		page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
		ASSERT(!PP_ISNORELOC(pp));
		PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);
	} else {

		ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);

		pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);

		mutex_enter(pcm);
		page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
		page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
		mutex_exit(pcm);

		pgcnt = page_get_pagecnt(pp->p_szc);
#if defined(__sparc)
		if (PP_ISNORELOC(pp))
			kcage_freemem_add(pgcnt);
#endif
		for (i = 0; i < pgcnt; i++, pp++)
			page_unlock_nocapture(pp);
	}
}

/*
 * During boot, need to demote a large page to base
 * pagesize pages for seg_kmem for use in boot_alloc()
 */
void
page_boot_demote(page_t *pp)
{
	ASSERT(pp->p_szc != 0);
	ASSERT(PP_ISFREE(pp));
	ASSERT(PP_ISAGED(pp));

	(void) page_demote(PP_2_MEM_NODE(pp),
	    PFN_BASE(pp->p_pagenum, pp->p_szc), 0, pp->p_szc, 0, PC_NO_COLOR,
	    PC_FREE);

	ASSERT(PP_ISFREE(pp));
	ASSERT(PP_ISAGED(pp));
	ASSERT(pp->p_szc == 0);
}

/*
 * Take a particular page off of whatever freelist the page
 * is claimed to be on.
 *
 * NOTE: Only used for PAGESIZE pages.
 */
void
page_list_sub(page_t *pp, int flags)
{
	int		bin;
	uint_t		mtype;
	int		mnode;
	kmutex_t	*pcm;
	page_t		**ppp;

	ASSERT(PAGE_EXCL(pp));
	ASSERT(PP_ISFREE(pp));

	/*
	 * The p_szc field can only be changed by page_promote()
	 * and page_demote(). Only free pages can be promoted and
	 * demoted and the free list MUST be locked during these
	 * operations. So to prevent a race in page_list_sub()
	 * between computing which bin of the freelist lock to
	 * grab and actually grabbing the lock, we check again that
	 * the bin we locked is still the correct one. Notice that
	 * the p_szc field could have actually changed on us, but
	 * if the bin happens to still be the same we are safe.
	 */
try_again:
	bin = PP_2_BIN(pp);
	mnode = PP_2_MEM_NODE(pp);
	pcm = PC_BIN_MUTEX(mnode, bin, flags);
	mutex_enter(pcm);
	if (PP_2_BIN(pp) != bin) {
		mutex_exit(pcm);
		goto try_again;
	}
	mtype = PP_2_MTYPE(pp);

	if (flags & PG_FREE_LIST) {
		VM_STAT_ADD(vmm_vmstats.plsub_free[0]);
		ASSERT(PP_ISAGED(pp));
		ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);
	} else {
		VM_STAT_ADD(vmm_vmstats.plsub_cache);
		ASSERT(!PP_ISAGED(pp));
		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
	}

	/*
	 * Common PAGESIZE case.
	 *
	 * Note that we locked the freelist. This prevents
	 * any page promotion/demotion operations. Therefore
	 * the p_szc will not change until we drop pcm mutex.
	 */
	if (pp->p_szc == 0) {
		page_sub(ppp, pp);
		/*
		 * Subtract counters before releasing pcm mutex
		 * to avoid race with page_freelist_coalesce.
		 */
		page_ctr_sub(mnode, mtype, pp, flags);
		mutex_exit(pcm);

#if defined(__sparc)
		if (PP_ISNORELOC(pp)) {
			kcage_freemem_sub(1);
		}
#endif
		return;
	}

	/*
	 * Large pages on the cache list are not supported.
	 */
	if (flags & PG_CACHE_LIST)
		panic("page_list_sub: large page on cachelist");

	/*
	 * Slow but rare.
	 *
	 * Somebody wants this particular page which is part
	 * of a large page. In this case we just demote the page
	 * if it's on the freelist.
	 *
	 * We have to drop pcm before locking the entire freelist.
	 * Once we have re-locked the freelist check to make sure
	 * the page hasn't already been demoted or completely
	 * freed.
	 */
	mutex_exit(pcm);
	page_freelist_lock(mnode);
	if (pp->p_szc != 0) {
		/*
		 * Large page is on freelist.
		 */
		(void) page_demote(mnode, PFN_BASE(pp->p_pagenum, pp->p_szc),
		    0, pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
	}
	ASSERT(PP_ISFREE(pp));
	ASSERT(PP_ISAGED(pp));
	ASSERT(pp->p_szc == 0);

	/*
	 * Subtract counters before releasing pcm mutex
	 * to avoid race with page_freelist_coalesce.
	 */
	bin = PP_2_BIN(pp);
	mtype = PP_2_MTYPE(pp);
	ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);

	page_sub(ppp, pp);
	page_ctr_sub(mnode, mtype, pp, flags);
	page_freelist_unlock(mnode);

#if defined(__sparc)
	if (PP_ISNORELOC(pp)) {
		kcage_freemem_sub(1);
	}
#endif
}

void
page_list_sub_pages(page_t *pp, uint_t szc)
{
	kmutex_t *pcm;
	uint_t	bin, mtype;
	int	mnode;

	ASSERT(PAGE_EXCL(pp));
	ASSERT(PP_ISFREE(pp));
	ASSERT(PP_ISAGED(pp));

	/*
	 * See comment in page_list_sub().
	 */
try_again:
	bin = PP_2_BIN(pp);
	mnode = PP_2_MEM_NODE(pp);
	pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
	mutex_enter(pcm);
	if (PP_2_BIN(pp) != bin) {
		mutex_exit(pcm);
		goto try_again;
	}

	/*
	 * If we're called with a page larger than szc, or it got
	 * promoted above szc before we locked the freelist, then
	 * drop pcm and re-lock the entire freelist. If the page is
	 * still larger than szc then demote it.
	 */
	if (pp->p_szc > szc) {
		mutex_exit(pcm);
		pcm = NULL;
		page_freelist_lock(mnode);
		if (pp->p_szc > szc) {
			VM_STAT_ADD(vmm_vmstats.plsubpages_szcbig);
			(void) page_demote(mnode,
			    PFN_BASE(pp->p_pagenum, pp->p_szc), 0,
			    pp->p_szc, szc, PC_NO_COLOR, PC_FREE);
		}
		bin = PP_2_BIN(pp);
	}
	ASSERT(PP_ISFREE(pp));
	ASSERT(PP_ISAGED(pp));
	ASSERT(pp->p_szc <= szc);
	ASSERT(pp == PP_PAGEROOT(pp));

	VM_STAT_ADD(vmm_vmstats.plsub_free[pp->p_szc]);

	mtype = PP_2_MTYPE(pp);
	if (pp->p_szc != 0) {
		page_vpsub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
		CHK_LPG(pp, pp->p_szc);
	} else {
		VM_STAT_ADD(vmm_vmstats.plsubpages_szc0);
		page_sub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
	}
	page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);

	if (pcm != NULL) {
		mutex_exit(pcm);
	} else {
		page_freelist_unlock(mnode);
	}

#if defined(__sparc)
	if (PP_ISNORELOC(pp)) {
		pgcnt_t	pgcnt;

		pgcnt = page_get_pagecnt(pp->p_szc);
		kcage_freemem_sub(pgcnt);
	}
#endif
}

/*
 * Add the page to the front of a linked list of pages
 * using the p_next & p_prev pointers for the list.
 * The caller is responsible for protecting the list pointers.
 */
void
mach_page_add(page_t **ppp, page_t *pp)
{
	if (*ppp == NULL) {
		pp->p_next = pp->p_prev = pp;
	} else {
		pp->p_next = *ppp;
		pp->p_prev = (*ppp)->p_prev;
		(*ppp)->p_prev = pp;
		pp->p_prev->p_next = pp;
	}
	*ppp = pp;
}

/*
 * Remove this page from a linked list of pages
 * using the p_next & p_prev pointers for the list.
 *
 * The caller is responsible for protecting the list pointers.
 */
void
mach_page_sub(page_t **ppp, page_t *pp)
{
	ASSERT(pp != NULL && PP_ISFREE(pp));

	if (*ppp == NULL || pp == NULL)
		panic("mach_page_sub");

	if (*ppp == pp)
		*ppp = pp->p_next;		/* go to next page */

	if (*ppp == pp)
		*ppp = NULL;			/* page list is gone */
	else {
		pp->p_prev->p_next = pp->p_next;
		pp->p_next->p_prev = pp->p_prev;
	}
	pp->p_prev = pp->p_next = pp;		/* make pp a list of one */
}

/*
 * Routine fsflush uses to gradually coalesce the free list into larger pages.
 */
void
page_promote_size(page_t *pp, uint_t cur_szc)
{
	pfn_t pfn;
	int mnode;
	int idx;
	int new_szc = cur_szc + 1;
	int full = FULL_REGION_CNT(new_szc);

	pfn = page_pptonum(pp);
	mnode = PFN_2_MEM_NODE(pfn);

	page_freelist_lock(mnode);

	idx = PNUM_TO_IDX(mnode, new_szc, pfn);
	if (PAGE_COUNTERS(mnode, new_szc, idx) == full)
		(void) page_promote(mnode, pfn, new_szc, PC_FREE, PC_MTYPE_ANY);

	page_freelist_unlock(mnode);
}
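
/*
 * Usage sketch (illustrative, not part of the original source): fsflush
 * can nudge a region up one size code at a time, e.g.
 *
 *	if (pp->p_szc < mmu_page_sizes - 1)
 *		page_promote_size(pp, pp->p_szc);
 *
 * page_promote_size() only calls page_promote() when the region counter
 * shows the candidate region is completely free, so the common case is a
 * cheap counter lookup under the freelist lock.
 */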
static uint_t page_promote_err;
static uint_t page_promote_noreloc_err;

/*
 * Create a single larger page (of szc new_szc) from smaller contiguous pages
 * for the given mnode starting at pfnum. Pages involved are on the freelist
 * before the call and may be returned to the caller if requested, otherwise
 * they will be placed back on the freelist.
 * If flags is PC_ALLOC, then the large page will be returned to the user in
 * a state which is consistent with a page being taken off the freelist. If
 * we failed to lock the new large page, then we will return NULL to the
 * caller and put the large page on the freelist instead.
 * If flags is PC_FREE, then the large page will be placed on the freelist,
 * and NULL will be returned.
 * The caller is responsible for locking the freelist as well as any other
 * accounting which needs to be done for a returned page.
 *
 * RFE: For performance pass in pp instead of pfnum so
 *	we can avoid excessive calls to page_numtopp_nolock().
 *	This would depend on an assumption that all contiguous
 *	pages are in the same memseg so we can just add/dec
 *	our pp.
 *
 * Lock ordering:
 *
 *	There is a potential but rare deadlock situation
 *	for page promotion and demotion operations. The problem
 *	is there are two paths into the freelist manager and
 *	they have different lock orders:
 *
 *	page_create()
 *		lock freelist
 *		page_lock(EXCL)
 *		unlock freelist
 *		return
 *		caller drops page_lock
 *
 *	page_free() and page_reclaim()
 *		caller grabs page_lock(EXCL)
 *
 *		lock freelist
 *		unlock freelist
 *		drop page_lock
 *
 *	What prevents a thread in page_create() from deadlocking
 *	with a thread freeing or reclaiming the same page is the
 *	page_trylock() in page_get_freelist(). If the trylock fails
 *	it skips the page.
 *
 *	The lock ordering for promotion and demotion is the same as
 *	for page_create(). Since the same deadlock could occur during
 *	page promotion and freeing or reclaiming of a page on the
 *	cache list, we might have to fail the operation and undo what
 *	we have done so far. Again, this is rare.
 */
page_t *
page_promote(int mnode, pfn_t pfnum, uchar_t new_szc, int flags, int mtype)
{
	page_t		*pp, *pplist, *tpp, *start_pp;
	pgcnt_t		new_npgs, npgs;
	uint_t		bin;
	pgcnt_t		tmpnpgs, pages_left;
	uint_t		noreloc;
	int		which_list;
	ulong_t		index;
	kmutex_t	*phm;

	/*
	 * General algorithm:
	 * Find the starting page.
	 * Walk each page struct removing it from the freelist,
	 * and linking it to all the other pages removed.
	 * Once all pages are off the freelist,
	 * walk the list, modifying p_szc to new_szc and whatever
	 * other info needs to be done to create a large free page.
	 * According to the flags, either return the page or put it
	 * on the freelist.
	 */

	start_pp = page_numtopp_nolock(pfnum);
	ASSERT(start_pp && (start_pp->p_pagenum == pfnum));
	new_npgs = page_get_pagecnt(new_szc);
	ASSERT(IS_P2ALIGNED(pfnum, new_npgs));

	/* don't return page of the wrong mtype */
	if (mtype != PC_MTYPE_ANY && mtype != PP_2_MTYPE(start_pp))
		return (NULL);

	/*
	 * Loop through smaller pages to confirm that all pages
	 * give the same result for PP_ISNORELOC().
	 * We can check this reliably here as the protocol for setting
	 * P_NORELOC requires pages to be taken off the free list first.
	 */
	noreloc = PP_ISNORELOC(start_pp);
	for (pp = start_pp + new_npgs; --pp > start_pp; ) {
		if (noreloc != PP_ISNORELOC(pp)) {
			page_promote_noreloc_err++;
			page_promote_err++;
			return (NULL);
		}
	}

	pages_left = new_npgs;
	pplist = NULL;
	pp = start_pp;

	/* Loop around coalescing the smaller pages into a big page. */
*/ 2018 while (pages_left) { 2019 /* 2020 * Remove from the freelist. 2021 */ 2022 ASSERT(PP_ISFREE(pp)); 2023 bin = PP_2_BIN(pp); 2024 ASSERT(mnode == PP_2_MEM_NODE(pp)); 2025 mtype = PP_2_MTYPE(pp); 2026 if (PP_ISAGED(pp)) { 2027 2028 /* 2029 * PG_FREE_LIST 2030 */ 2031 if (pp->p_szc) { 2032 page_vpsub(&PAGE_FREELISTS(mnode, 2033 pp->p_szc, bin, mtype), pp); 2034 } else { 2035 mach_page_sub(&PAGE_FREELISTS(mnode, 0, 2036 bin, mtype), pp); 2037 } 2038 which_list = PG_FREE_LIST; 2039 } else { 2040 ASSERT(pp->p_szc == 0); 2041 2042 /* 2043 * PG_CACHE_LIST 2044 * 2045 * Since this page comes from the 2046 * cachelist, we must destroy the 2047 * vnode association. 2048 */ 2049 if (!page_trylock(pp, SE_EXCL)) { 2050 goto fail_promote; 2051 } 2052 2053 /* 2054 * We need to be careful not to deadlock 2055 * with another thread in page_lookup(). 2056 * The page_lookup() thread could be holding 2057 * the same phm that we need if the two 2058 * pages happen to hash to the same phm lock. 2059 * At this point we have locked the entire 2060 * freelist and page_lookup() could be trying 2061 * to grab a freelist lock. 2062 */ 2063 index = PAGE_HASH_FUNC(pp->p_vnode, pp->p_offset); 2064 phm = PAGE_HASH_MUTEX(index); 2065 if (!mutex_tryenter(phm)) { 2066 page_unlock_nocapture(pp); 2067 goto fail_promote; 2068 } 2069 2070 mach_page_sub(&PAGE_CACHELISTS(mnode, bin, mtype), pp); 2071 page_hashout(pp, phm); 2072 mutex_exit(phm); 2073 PP_SETAGED(pp); 2074 page_unlock_nocapture(pp); 2075 which_list = PG_CACHE_LIST; 2076 } 2077 page_ctr_sub(mnode, mtype, pp, which_list); 2078 2079 /* 2080 * Concatenate the smaller page(s) onto 2081 * the large page list. 2082 */ 2083 tmpnpgs = npgs = page_get_pagecnt(pp->p_szc); 2084 pages_left -= npgs; 2085 tpp = pp; 2086 while (npgs--) { 2087 tpp->p_szc = new_szc; 2088 tpp = tpp->p_next; 2089 } 2090 page_list_concat(&pplist, &pp); 2091 pp += tmpnpgs; 2092 } 2093 CHK_LPG(pplist, new_szc); 2094 2095 /* 2096 * return the page to the user if requested 2097 * in the properly locked state. 2098 */ 2099 if (flags == PC_ALLOC && (page_trylock_cons(pplist, SE_EXCL))) { 2100 return (pplist); 2101 } 2102 2103 /* 2104 * Otherwise place the new large page on the freelist 2105 */ 2106 bin = PP_2_BIN(pplist); 2107 mnode = PP_2_MEM_NODE(pplist); 2108 mtype = PP_2_MTYPE(pplist); 2109 page_vpadd(&PAGE_FREELISTS(mnode, new_szc, bin, mtype), pplist); 2110 2111 page_ctr_add(mnode, mtype, pplist, PG_FREE_LIST); 2112 return (NULL); 2113 2114 fail_promote: 2115 /* 2116 * A thread must have still been freeing or 2117 * reclaiming the page on the cachelist. 2118 * To prevent a deadlock undo what we have 2119 * done so far and return failure. This 2120 * situation can only happen while promoting 2121 * PAGESIZE pages. 2122 */ 2123 page_promote_err++; 2124 while (pplist) { 2125 pp = pplist; 2126 mach_page_sub(&pplist, pp); 2127 pp->p_szc = 0; 2128 bin = PP_2_BIN(pp); 2129 mtype = PP_2_MTYPE(pp); 2130 mach_page_add(&PAGE_FREELISTS(mnode, 0, bin, mtype), pp); 2131 page_ctr_add(mnode, mtype, pp, PG_FREE_LIST); 2132 } 2133 return (NULL); 2134 2135 } 2136 2137 /* 2138 * Break up a large page into smaller size pages. 2139 * Pages involved are on the freelist before the call and may 2140 * be returned to the caller if requested, otherwise they will 2141 * be placed back on the freelist. 2142 * The caller is responsible for locking the freelist as well as any other 2143 * accounting which needs to be done for a returned page.
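 * A minimal PC_FREE sketch, mirroring the call made from page_list_sub()
 * above (names are from this file; the framing is illustrative):
 *
 *	page_freelist_lock(mnode);
 *	(void) page_demote(mnode, PFN_BASE(pp->p_pagenum, pp->p_szc), 0,
 *	    pp->p_szc, szc, PC_NO_COLOR, PC_FREE);
 *	page_freelist_unlock(mnode);
 *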
2144 * If flags is not PC_ALLOC, the color argument is ignored, and thus 2145 * technically, any value may be passed in but PC_NO_COLOR is the standard 2146 * which should be followed for clarity's sake. 2147 * Returns a page whose pfn is < pfnmax 2148 */ 2149 page_t * 2150 page_demote(int mnode, pfn_t pfnum, pfn_t pfnmax, uchar_t cur_szc, 2151 uchar_t new_szc, int color, int flags) 2152 { 2153 page_t *pp, *pplist, *npplist; 2154 pgcnt_t npgs, n; 2155 uint_t bin; 2156 uint_t mtype; 2157 page_t *ret_pp = NULL; 2158 2159 ASSERT(cur_szc != 0); 2160 ASSERT(new_szc < cur_szc); 2161 2162 pplist = page_numtopp_nolock(pfnum); 2163 ASSERT(pplist != NULL); 2164 2165 ASSERT(pplist->p_szc == cur_szc); 2166 2167 bin = PP_2_BIN(pplist); 2168 ASSERT(mnode == PP_2_MEM_NODE(pplist)); 2169 mtype = PP_2_MTYPE(pplist); 2170 page_vpsub(&PAGE_FREELISTS(mnode, cur_szc, bin, mtype), pplist); 2171 2172 CHK_LPG(pplist, cur_szc); 2173 page_ctr_sub(mnode, mtype, pplist, PG_FREE_LIST); 2174 2175 /* 2176 * Number of PAGESIZE pages for smaller new_szc 2177 * page. 2178 */ 2179 npgs = page_get_pagecnt(new_szc); 2180 2181 while (pplist) { 2182 pp = pplist; 2183 2184 ASSERT(pp->p_szc == cur_szc); 2185 2186 /* 2187 * We either break it up into PAGESIZE pages or larger. 2188 */ 2189 if (npgs == 1) { /* PAGESIZE case */ 2190 mach_page_sub(&pplist, pp); 2191 ASSERT(pp->p_szc == cur_szc); 2192 ASSERT(new_szc == 0); 2193 ASSERT(mnode == PP_2_MEM_NODE(pp)); 2194 pp->p_szc = new_szc; 2195 bin = PP_2_BIN(pp); 2196 if ((bin == color) && (flags == PC_ALLOC) && 2197 (ret_pp == NULL) && (pfnmax == 0 || 2198 pp->p_pagenum < pfnmax) && 2199 page_trylock_cons(pp, SE_EXCL)) { 2200 ret_pp = pp; 2201 } else { 2202 mtype = PP_2_MTYPE(pp); 2203 mach_page_add(&PAGE_FREELISTS(mnode, 0, bin, 2204 mtype), pp); 2205 page_ctr_add(mnode, mtype, pp, PG_FREE_LIST); 2206 } 2207 } else { 2208 page_t *try_to_return_this_page = NULL; 2209 int count = 0; 2210 2211 /* 2212 * Break down into smaller lists of pages. 2213 */ 2214 page_list_break(&pplist, &npplist, npgs); 2215 2216 pp = pplist; 2217 n = npgs; 2218 while (n--) { 2219 ASSERT(pp->p_szc == cur_szc); 2220 /* 2221 * Check whether all the pages in this list 2222 * fit the request criteria. 2223 */ 2224 if (pfnmax == 0 || pp->p_pagenum < pfnmax) { 2225 count++; 2226 } 2227 pp->p_szc = new_szc; 2228 pp = pp->p_next; 2229 } 2230 2231 if (count == npgs && 2232 (pfnmax == 0 || pp->p_pagenum < pfnmax)) { 2233 try_to_return_this_page = pp; 2234 } 2235 2236 CHK_LPG(pplist, new_szc); 2237 2238 bin = PP_2_BIN(pplist); 2239 if (try_to_return_this_page) 2240 ASSERT(mnode == 2241 PP_2_MEM_NODE(try_to_return_this_page)); 2242 if ((bin == color) && (flags == PC_ALLOC) && 2243 (ret_pp == NULL) && try_to_return_this_page && 2244 page_trylock_cons(try_to_return_this_page, 2245 SE_EXCL)) { 2246 ret_pp = try_to_return_this_page; 2247 } else { 2248 mtype = PP_2_MTYPE(pp); 2249 page_vpadd(&PAGE_FREELISTS(mnode, new_szc, 2250 bin, mtype), pplist); 2251 2252 page_ctr_add(mnode, mtype, pplist, 2253 PG_FREE_LIST); 2254 } 2255 pplist = npplist; 2256 } 2257 } 2258 return (ret_pp); 2259 } 2260 2261 int mpss_coalesce_disable = 0; 2262 2263 /* 2264 * Coalesce free pages into a page of the given szc and color if possible. 2265 * Return the pointer to the page created, otherwise, return NULL. 2266 * 2267 * If pfnhi is non-zero, search for large page with pfn range less than pfnhi. 
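 * The ceq_mask argument selects which color bits are significant: a
 * candidate bin b is treated as equivalent to the request when
 * ((b ^ color) & ceq_mask) == 0. For example (illustrative values only),
 * with ceq_mask 0x6 and color 0x2, bins 2, 3, 0xa and 0xb all qualify.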
2268 */ 2269 page_t * 2270 page_freelist_coalesce(int mnode, uchar_t szc, uint_t color, uint_t ceq_mask, 2271 int mtype, pfn_t pfnhi) 2272 { 2273 int r = szc; /* region size */ 2274 int mrange; 2275 uint_t full, bin, color_mask, wrap = 0; 2276 pfn_t pfnum, lo, hi; 2277 size_t len, idx, idx0; 2278 pgcnt_t cands = 0, szcpgcnt = page_get_pagecnt(szc); 2279 page_t *ret_pp; 2280 MEM_NODE_ITERATOR_DECL(it); 2281 2282 if (mpss_coalesce_disable) { 2283 ASSERT(szc < MMU_PAGE_SIZES); 2284 VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[szc][0]); 2285 return (NULL); 2286 } 2287 2288 ASSERT(szc < mmu_page_sizes); 2289 color_mask = PAGE_GET_PAGECOLORS(szc) - 1; 2290 ASSERT(ceq_mask <= color_mask); 2291 ASSERT(color <= color_mask); 2292 color &= ceq_mask; 2293 2294 /* Prevent page_counters dynamic memory from being freed */ 2295 rw_enter(&page_ctrs_rwlock[mnode], RW_READER); 2296 2297 mrange = MTYPE_2_MRANGE(mnode, mtype); 2298 ASSERT(mrange < mnode_nranges[mnode]); 2299 VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[r][mrange]); 2300 2301 /* get pfn range for mtype */ 2302 len = PAGE_COUNTERS_ENTRIES(mnode, r); 2303 MNODETYPE_2_PFN(mnode, mtype, lo, hi); 2304 hi++; 2305 2306 /* use lower limit if given */ 2307 if (pfnhi != PFNNULL && pfnhi < hi) 2308 hi = pfnhi; 2309 2310 /* round to szcpgcnt boundaries */ 2311 lo = P2ROUNDUP(lo, szcpgcnt); 2312 MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it); 2313 if (lo == (pfn_t)-1) { 2314 rw_exit(&page_ctrs_rwlock[mnode]); 2315 return (NULL); 2316 } 2317 hi = hi & ~(szcpgcnt - 1); 2318 2319 /* set lo to the closest pfn of the right color */ 2320 if (((PFN_2_COLOR(lo, szc, &it) ^ color) & ceq_mask) || 2321 (interleaved_mnodes && PFN_2_MEM_NODE(lo) != mnode)) { 2322 PAGE_NEXT_PFN_FOR_COLOR(lo, szc, color, ceq_mask, color_mask, 2323 &it); 2324 } 2325 2326 if (hi <= lo) { 2327 rw_exit(&page_ctrs_rwlock[mnode]); 2328 return (NULL); 2329 } 2330 2331 full = FULL_REGION_CNT(r); 2332 2333 /* calculate the number of page candidates and initial search index */ 2334 bin = color; 2335 idx0 = (size_t)(-1); 2336 do { 2337 pgcnt_t acand; 2338 2339 PGCTRS_CANDS_GETVALUECOLOR(mnode, mrange, r, bin, acand); 2340 if (acand) { 2341 idx = PAGE_COUNTERS_CURRENT_COLOR(mnode, 2342 r, bin, mrange); 2343 idx0 = MIN(idx0, idx); 2344 cands += acand; 2345 } 2346 bin = ADD_MASKED(bin, 1, ceq_mask, color_mask); 2347 } while (bin != color); 2348 2349 if (cands == 0) { 2350 VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip[r][mrange]); 2351 rw_exit(&page_ctrs_rwlock[mnode]); 2352 return (NULL); 2353 } 2354 2355 pfnum = IDX_TO_PNUM(mnode, r, idx0); 2356 if (pfnum < lo || pfnum >= hi) { 2357 pfnum = lo; 2358 } else { 2359 MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it); 2360 if (pfnum == (pfn_t)-1) { 2361 pfnum = lo; 2362 MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it); 2363 ASSERT(pfnum != (pfn_t)-1); 2364 } else if ((PFN_2_COLOR(pfnum, szc, &it) ^ color) & ceq_mask || 2365 (interleaved_mnodes && PFN_2_MEM_NODE(pfnum) != mnode)) { 2366 /* invalid color, get the closest correct pfn */ 2367 PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask, 2368 color_mask, &it); 2369 if (pfnum >= hi) { 2370 pfnum = lo; 2371 MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it); 2372 } 2373 } 2374 } 2375 2376 /* set starting index */ 2377 idx0 = PNUM_TO_IDX(mnode, r, pfnum); 2378 ASSERT(idx0 < len); 2379 2380 for (idx = idx0; wrap == 0 || (idx < idx0 && wrap < 2); ) { 2381 2382 if (PAGE_COUNTERS(mnode, r, idx) != full) 2383 goto next; 2384 2385 /* 2386 * RFE: For performance maybe we can do something less 2387 * brutal than locking the entire 
freelist. So far 2388 * this doesn't seem to be a performance problem? 2389 */ 2390 page_freelist_lock(mnode); 2391 if (PAGE_COUNTERS(mnode, r, idx) == full) { 2392 ret_pp = 2393 page_promote(mnode, pfnum, r, PC_ALLOC, mtype); 2394 if (ret_pp != NULL) { 2395 VM_STAT_ADD(vmm_vmstats.pfc_coalok[r][mrange]); 2396 PAGE_COUNTERS_CURRENT_COLOR(mnode, r, 2397 PFN_2_COLOR(pfnum, szc, &it), mrange) = idx; 2398 page_freelist_unlock(mnode); 2399 rw_exit(&page_ctrs_rwlock[mnode]); 2400 2401 return (ret_pp); 2402 } 2403 } else { 2404 VM_STAT_ADD(vmm_vmstats.page_ctrs_changed[r][mrange]); 2405 } 2406 2407 page_freelist_unlock(mnode); 2408 /* 2409 * No point looking for another page if we've 2410 * already tried all of the ones that 2411 * page_ctr_cands indicated. Stash off where we left 2412 * off. 2413 * Note: this is not exact since we don't hold the 2414 * page_freelist_locks before we initially get the 2415 * value of cands for performance reasons, but should 2416 * be a decent approximation. 2417 */ 2418 if (--cands == 0) { 2419 PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color, mrange) = 2420 idx; 2421 break; 2422 } 2423 next: 2424 PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask, 2425 color_mask, &it); 2426 idx = PNUM_TO_IDX(mnode, r, pfnum); 2427 if (idx >= len || pfnum >= hi) { 2428 pfnum = lo; 2429 MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it); 2430 idx = PNUM_TO_IDX(mnode, r, pfnum); 2431 wrap++; 2432 } 2433 } 2434 2435 rw_exit(&page_ctrs_rwlock[mnode]); 2436 VM_STAT_ADD(vmm_vmstats.page_ctrs_failed[r][mrange]); 2437 return (NULL); 2438 } 2439 2440 /* 2441 * For the given mnode, promote as many small pages to large pages as possible. 2442 * mnode can be -1, which means do them all 2443 */ 2444 void 2445 page_freelist_coalesce_all(int mnode) 2446 { 2447 int r; /* region size */ 2448 int idx, full; 2449 size_t len; 2450 int doall = interleaved_mnodes || mnode < 0; 2451 int mlo = doall ? 0 : mnode; 2452 int mhi = doall ? max_mem_nodes : (mnode + 1); 2453 2454 VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce_all); 2455 2456 if (mpss_coalesce_disable) { 2457 return; 2458 } 2459 2460 /* 2461 * Lock the entire freelist and coalesce what we can. 2462 * 2463 * Always promote to the largest page possible 2464 * first to reduce the number of page promotions. 2465 */ 2466 for (mnode = mlo; mnode < mhi; mnode++) { 2467 rw_enter(&page_ctrs_rwlock[mnode], RW_READER); 2468 page_freelist_lock(mnode); 2469 } 2470 for (r = mmu_page_sizes - 1; r > 0; r--) { 2471 for (mnode = mlo; mnode < mhi; mnode++) { 2472 pgcnt_t cands = 0; 2473 int mrange, nranges = mnode_nranges[mnode]; 2474 2475 for (mrange = 0; mrange < nranges; mrange++) { 2476 PGCTRS_CANDS_GETVALUE(mnode, mrange, r, cands); 2477 if (cands != 0) 2478 break; 2479 } 2480 if (cands == 0) { 2481 VM_STAT_ADD(vmm_vmstats. 2482 page_ctrs_cands_skip_all); 2483 continue; 2484 } 2485 2486 full = FULL_REGION_CNT(r); 2487 len = PAGE_COUNTERS_ENTRIES(mnode, r); 2488 2489 for (idx = 0; idx < len; idx++) { 2490 if (PAGE_COUNTERS(mnode, r, idx) == full) { 2491 pfn_t pfnum = 2492 IDX_TO_PNUM(mnode, r, idx); 2493 int tmnode = interleaved_mnodes ? 
2494 PFN_2_MEM_NODE(pfnum) : mnode; 2495 2496 ASSERT(pfnum >= 2497 mem_node_config[tmnode].physbase && 2498 pfnum < 2499 mem_node_config[tmnode].physmax); 2500 2501 (void) page_promote(tmnode, 2502 pfnum, r, PC_FREE, PC_MTYPE_ANY); 2503 } 2504 } 2505 /* shared hpm_counters covers all mnodes, so we quit */ 2506 if (interleaved_mnodes) 2507 break; 2508 } 2509 } 2510 for (mnode = mlo; mnode < mhi; mnode++) { 2511 page_freelist_unlock(mnode); 2512 rw_exit(&page_ctrs_rwlock[mnode]); 2513 } 2514 } 2515 2516 /* 2517 * This is where all policies for moving pages around 2518 * to different page size free lists are implemented. 2519 * Returns a page of the requested size on success, NULL on failure. 2520 * 2521 * So far these are the priorities for this algorithm in descending 2522 * order: 2523 * 2524 * 1) When servicing a request try to do so with a free page 2525 * from next size up. Helps defer fragmentation as long 2526 * as possible. 2527 * 2528 * 2) Page coalesce on demand. Only when a freelist 2529 * larger than PAGESIZE is empty and step 1 2530 * will not work since all larger size lists are 2531 * also empty. 2532 * 2533 * If pfnhi is non-zero, search for a large page with pfn range less than pfnhi; likewise, if pfnlo is non-zero, pfns below pfnlo are skipped. 2534 */ 2535 2536 page_t * 2537 page_freelist_split(uchar_t szc, uint_t color, int mnode, int mtype, 2538 pfn_t pfnlo, pfn_t pfnhi, page_list_walker_t *plw) 2539 { 2540 uchar_t nszc = szc + 1; 2541 uint_t bin, sbin, bin_prev; 2542 page_t *pp, *firstpp; 2543 page_t *ret_pp = NULL; 2544 uint_t color_mask; 2545 2546 if (nszc == mmu_page_sizes) 2547 return (NULL); 2548 2549 ASSERT(nszc < mmu_page_sizes); 2550 color_mask = PAGE_GET_PAGECOLORS(nszc) - 1; 2551 bin = sbin = PAGE_GET_NSZ_COLOR(szc, color); 2552 bin_prev = (plw->plw_bin_split_prev == color) ? INVALID_COLOR : 2553 PAGE_GET_NSZ_COLOR(szc, plw->plw_bin_split_prev); 2554 2555 VM_STAT_ADD(vmm_vmstats.pfs_req[szc]); 2556 /* 2557 * First try to break up a larger page to fill current size freelist. 2558 */ 2559 while (plw->plw_bins[nszc] != 0) { 2560 2561 ASSERT(nszc < mmu_page_sizes); 2562 2563 /* 2564 * If page found then demote it. 2565 */ 2566 if (PAGE_FREELISTS(mnode, nszc, bin, mtype)) { 2567 page_freelist_lock(mnode); 2568 firstpp = pp = PAGE_FREELISTS(mnode, nszc, bin, mtype); 2569 2570 /* 2571 * If pfnhi is not PFNNULL, look for large page below 2572 * pfnhi. PFNNULL signifies no pfn requirement.
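 * (The candidate list is circular on p_vpnext, so the scan below gives
 * up once it wraps back around to firstpp.)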
2573 */ 2574 if (pp && 2575 ((pfnhi != PFNNULL && pp->p_pagenum >= pfnhi) || 2576 (pfnlo != PFNNULL && pp->p_pagenum < pfnlo))) { 2577 do { 2578 pp = pp->p_vpnext; 2579 if (pp == firstpp) { 2580 pp = NULL; 2581 break; 2582 } 2583 } while ((pfnhi != PFNNULL && 2584 pp->p_pagenum >= pfnhi) || 2585 (pfnlo != PFNNULL && 2586 pp->p_pagenum < pfnlo)); 2587 2588 if (pfnhi != PFNNULL && pp != NULL) 2589 ASSERT(pp->p_pagenum < pfnhi); 2590 2591 if (pfnlo != PFNNULL && pp != NULL) 2592 ASSERT(pp->p_pagenum >= pfnlo); 2593 } 2594 if (pp) { 2595 uint_t ccolor = page_correct_color(szc, nszc, 2596 color, bin, plw->plw_ceq_mask[szc]); 2597 2598 ASSERT(pp->p_szc == nszc); 2599 VM_STAT_ADD(vmm_vmstats.pfs_demote[nszc]); 2600 ret_pp = page_demote(mnode, pp->p_pagenum, 2601 pfnhi, pp->p_szc, szc, ccolor, PC_ALLOC); 2602 if (ret_pp) { 2603 page_freelist_unlock(mnode); 2604 #if defined(__sparc) 2605 if (PP_ISNORELOC(ret_pp)) { 2606 pgcnt_t npgs; 2607 2608 npgs = page_get_pagecnt( 2609 ret_pp->p_szc); 2610 kcage_freemem_sub(npgs); 2611 } 2612 #endif 2613 return (ret_pp); 2614 } 2615 } 2616 page_freelist_unlock(mnode); 2617 } 2618 2619 /* loop through next size bins */ 2620 bin = ADD_MASKED(bin, 1, plw->plw_ceq_mask[nszc], color_mask); 2621 plw->plw_bins[nszc]--; 2622 2623 if (bin == sbin) { 2624 uchar_t nnszc = nszc + 1; 2625 2626 /* we are done with this page size - check next */ 2627 if (plw->plw_bins[nnszc] == 0) 2628 /* we have already checked next size bins */ 2629 break; 2630 2631 bin = sbin = PAGE_GET_NSZ_COLOR(nszc, bin); 2632 if (bin_prev != INVALID_COLOR) { 2633 bin_prev = PAGE_GET_NSZ_COLOR(nszc, bin_prev); 2634 if (!((bin ^ bin_prev) & 2635 plw->plw_ceq_mask[nnszc])) 2636 break; 2637 } 2638 ASSERT(nnszc < mmu_page_sizes); 2639 color_mask = PAGE_GET_PAGECOLORS(nnszc) - 1; 2640 nszc = nnszc; 2641 ASSERT(nszc < mmu_page_sizes); 2642 } 2643 } 2644 2645 return (ret_pp); 2646 } 2647 2648 /* 2649 * Helper routine used only by the freelist code to lock 2650 * a page. If the page is a large page then it succeeds in 2651 * locking all the constituent pages or none at all. 2652 * Returns 1 on sucess, 0 on failure. 2653 */ 2654 static int 2655 page_trylock_cons(page_t *pp, se_t se) 2656 { 2657 page_t *tpp, *first_pp = pp; 2658 2659 /* 2660 * Fail if can't lock first or only page. 2661 */ 2662 if (!page_trylock(pp, se)) { 2663 return (0); 2664 } 2665 2666 /* 2667 * PAGESIZE: common case. 2668 */ 2669 if (pp->p_szc == 0) { 2670 return (1); 2671 } 2672 2673 /* 2674 * Large page case. 2675 */ 2676 tpp = pp->p_next; 2677 while (tpp != pp) { 2678 if (!page_trylock(tpp, se)) { 2679 /* 2680 * On failure unlock what we have locked so far. 2681 * We want to avoid attempting to capture these 2682 * pages as the pcm mutex may be held which could 2683 * lead to a recursive mutex panic. 2684 */ 2685 while (first_pp != tpp) { 2686 page_unlock_nocapture(first_pp); 2687 first_pp = first_pp->p_next; 2688 } 2689 return (0); 2690 } 2691 tpp = tpp->p_next; 2692 } 2693 return (1); 2694 } 2695 2696 /* 2697 * init context for walking page lists 2698 * Called when a page of the given szc in unavailable. Sets markers 2699 * for the beginning of the search to detect when search has 2700 * completed a full cycle. Sets flags for splitting larger pages 2701 * and coalescing smaller pages. Page walking procedes until a page 2702 * of the desired equivalent color is found. 
2703 */ 2704 void 2705 page_list_walk_init(uchar_t szc, uint_t flags, uint_t bin, int can_split, 2706 int use_ceq, page_list_walker_t *plw) 2707 { 2708 uint_t nszc, ceq_mask, colors; 2709 uchar_t ceq = use_ceq ? colorequivszc[szc] : 0; 2710 2711 ASSERT(szc < mmu_page_sizes); 2712 colors = PAGE_GET_PAGECOLORS(szc); 2713 2714 plw->plw_colors = colors; 2715 plw->plw_color_mask = colors - 1; 2716 plw->plw_bin_marker = plw->plw_bin0 = bin; 2717 plw->plw_bin_split_prev = bin; 2718 plw->plw_bin_step = (szc == 0) ? vac_colors : 1; 2719 2720 /* 2721 * if vac aliasing is possible make sure lower order color 2722 * bits are never ignored 2723 */ 2724 if (vac_colors > 1) 2725 ceq &= 0xf0; 2726 2727 /* 2728 * calculate the number of non-equivalent colors and 2729 * color equivalency mask 2730 */ 2731 plw->plw_ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf)); 2732 ASSERT(szc > 0 || plw->plw_ceq_dif >= vac_colors); 2733 ASSERT(plw->plw_ceq_dif > 0); 2734 plw->plw_ceq_mask[szc] = (plw->plw_ceq_dif - 1) << (ceq & 0xf); 2735 2736 if (flags & PG_MATCH_COLOR) { 2737 if (cpu_page_colors < 0) { 2738 /* 2739 * this is a heterogeneous machine with different CPUs 2740 * having different size e$ (not supported for ni2/rock 2741 */ 2742 uint_t cpucolors = CPUSETSIZE() >> PAGE_GET_SHIFT(szc); 2743 cpucolors = MAX(cpucolors, 1); 2744 ceq_mask = plw->plw_color_mask & (cpucolors - 1); 2745 plw->plw_ceq_mask[szc] = 2746 MIN(ceq_mask, plw->plw_ceq_mask[szc]); 2747 } 2748 plw->plw_ceq_dif = 1; 2749 } 2750 2751 /* we can split pages in the freelist, but not the cachelist */ 2752 if (can_split) { 2753 plw->plw_do_split = (szc + 1 < mmu_page_sizes) ? 1 : 0; 2754 2755 /* set next szc color masks and number of free list bins */ 2756 for (nszc = szc + 1; nszc < mmu_page_sizes; nszc++, szc++) { 2757 plw->plw_ceq_mask[nszc] = PAGE_GET_NSZ_MASK(szc, 2758 plw->plw_ceq_mask[szc]); 2759 plw->plw_bins[nszc] = PAGE_GET_PAGECOLORS(nszc); 2760 } 2761 plw->plw_ceq_mask[nszc] = INVALID_MASK; 2762 plw->plw_bins[nszc] = 0; 2763 2764 } else { 2765 ASSERT(szc == 0); 2766 plw->plw_do_split = 0; 2767 plw->plw_bins[1] = 0; 2768 plw->plw_ceq_mask[1] = INVALID_MASK; 2769 } 2770 } 2771 2772 /* 2773 * set mark to flag where next split should occur 2774 */ 2775 #define PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw) { \ 2776 uint_t bin_nsz = PAGE_GET_NSZ_COLOR(szc, bin); \ 2777 uint_t bin0_nsz = PAGE_GET_NSZ_COLOR(szc, plw->plw_bin0); \ 2778 uint_t neq_mask = ~plw->plw_ceq_mask[nszc] & plw->plw_color_mask; \ 2779 plw->plw_split_next = \ 2780 INC_MASKED(bin_nsz, neq_mask, plw->plw_color_mask); \ 2781 if (!((plw->plw_split_next ^ bin0_nsz) & plw->plw_ceq_mask[nszc])) { \ 2782 plw->plw_split_next = \ 2783 INC_MASKED(plw->plw_split_next, \ 2784 neq_mask, plw->plw_color_mask); \ 2785 } \ 2786 } 2787 2788 uint_t 2789 page_list_walk_next_bin(uchar_t szc, uint_t bin, page_list_walker_t *plw) 2790 { 2791 uint_t neq_mask = ~plw->plw_ceq_mask[szc] & plw->plw_color_mask; 2792 uint_t bin0_nsz, nbin_nsz, nbin0, nbin; 2793 uchar_t nszc = szc + 1; 2794 2795 nbin = ADD_MASKED(bin, 2796 plw->plw_bin_step, neq_mask, plw->plw_color_mask); 2797 2798 if (plw->plw_do_split) { 2799 plw->plw_bin_split_prev = bin; 2800 PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw); 2801 plw->plw_do_split = 0; 2802 } 2803 2804 if (szc == 0) { 2805 if (plw->plw_count != 0 || plw->plw_ceq_dif == vac_colors) { 2806 if (nbin == plw->plw_bin0 && 2807 (vac_colors == 1 || nbin != plw->plw_bin_marker)) { 2808 nbin = ADD_MASKED(nbin, plw->plw_bin_step, 2809 neq_mask, plw->plw_color_mask); 2810 
plw->plw_bin_split_prev = plw->plw_bin0; 2811 } 2812 2813 if (vac_colors > 1 && nbin == plw->plw_bin_marker) { 2814 plw->plw_bin_marker = 2815 nbin = INC_MASKED(nbin, neq_mask, 2816 plw->plw_color_mask); 2817 plw->plw_bin_split_prev = plw->plw_bin0; 2818 /* 2819 * large pages all have the same vac color 2820 * so by now we should be done with next 2821 * size page splitting process 2822 */ 2823 ASSERT(plw->plw_bins[1] == 0); 2824 plw->plw_do_split = 0; 2825 return (nbin); 2826 } 2827 2828 } else { 2829 uint_t bin_jump = (vac_colors == 1) ? 2830 (BIN_STEP & ~3) - (plw->plw_bin0 & 3) : BIN_STEP; 2831 2832 bin_jump &= ~(vac_colors - 1); 2833 2834 nbin0 = ADD_MASKED(plw->plw_bin0, bin_jump, neq_mask, 2835 plw->plw_color_mask); 2836 2837 if ((nbin0 ^ plw->plw_bin0) & plw->plw_ceq_mask[szc]) { 2838 2839 plw->plw_bin_marker = nbin = nbin0; 2840 2841 if (plw->plw_bins[nszc] != 0) { 2842 /* 2843 * check if next page size bin is the 2844 * same as the next page size bin for 2845 * bin0 2846 */ 2847 nbin_nsz = PAGE_GET_NSZ_COLOR(szc, 2848 nbin); 2849 bin0_nsz = PAGE_GET_NSZ_COLOR(szc, 2850 plw->plw_bin0); 2851 2852 if ((bin0_nsz ^ nbin_nsz) & 2853 plw->plw_ceq_mask[nszc]) 2854 plw->plw_do_split = 1; 2855 } 2856 return (nbin); 2857 } 2858 } 2859 } 2860 2861 if (plw->plw_bins[nszc] != 0) { 2862 nbin_nsz = PAGE_GET_NSZ_COLOR(szc, nbin); 2863 if (!((plw->plw_split_next ^ nbin_nsz) & 2864 plw->plw_ceq_mask[nszc])) 2865 plw->plw_do_split = 1; 2866 } 2867 2868 return (nbin); 2869 } 2870 2871 page_t * 2872 page_get_mnode_freelist(int mnode, uint_t bin, int mtype, uchar_t szc, 2873 uint_t flags) 2874 { 2875 kmutex_t *pcm; 2876 page_t *pp, *first_pp; 2877 uint_t sbin; 2878 int plw_initialized; 2879 page_list_walker_t plw; 2880 2881 ASSERT(szc < mmu_page_sizes); 2882 2883 VM_STAT_ADD(vmm_vmstats.pgmf_alloc[szc]); 2884 2885 MTYPE_START(mnode, mtype, flags); 2886 if (mtype < 0) { /* mnode does not have memory in mtype range */ 2887 VM_STAT_ADD(vmm_vmstats.pgmf_allocempty[szc]); 2888 return (NULL); 2889 } 2890 try_again: 2891 2892 plw_initialized = 0; 2893 plw.plw_ceq_dif = 1; 2894 2895 /* 2896 * Only hold one freelist lock at a time, that way we 2897 * can start anywhere and not have to worry about lock 2898 * ordering. 2899 */ 2900 for (plw.plw_count = 0; 2901 plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) { 2902 sbin = bin; 2903 do { 2904 if (!PAGE_FREELISTS(mnode, szc, bin, mtype)) 2905 goto bin_empty_1; 2906 2907 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST); 2908 mutex_enter(pcm); 2909 pp = PAGE_FREELISTS(mnode, szc, bin, mtype); 2910 if (pp == NULL) 2911 goto bin_empty_0; 2912 2913 /* 2914 * These were set before the page 2915 * was put on the free list, 2916 * they must still be set. 2917 */ 2918 ASSERT(PP_ISFREE(pp)); 2919 ASSERT(PP_ISAGED(pp)); 2920 ASSERT(pp->p_vnode == NULL); 2921 ASSERT(pp->p_hash == NULL); 2922 ASSERT(pp->p_offset == (u_offset_t)-1); 2923 ASSERT(pp->p_szc == szc); 2924 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode); 2925 2926 /* 2927 * Walk down the hash chain. 2928 * 8k pages are linked on p_next 2929 * and p_prev fields. Large pages 2930 * are a contiguous group of 2931 * constituent pages linked together 2932 * on their p_next and p_prev fields. 2933 * The large pages are linked together 2934 * on the hash chain using p_vpnext 2935 * p_vpprev of the base constituent 2936 * page of each large page. 
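 * (Hence the walk below follows p_next when szc is 0 and p_vpnext
 * otherwise, stepping from one large page root to the next.)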
2937 */ 2938 first_pp = pp; 2939 while (IS_DUMP_PAGE(pp) || !page_trylock_cons(pp, 2940 SE_EXCL)) { 2941 if (szc == 0) { 2942 pp = pp->p_next; 2943 } else { 2944 pp = pp->p_vpnext; 2945 } 2946 2947 ASSERT(PP_ISFREE(pp)); 2948 ASSERT(PP_ISAGED(pp)); 2949 ASSERT(pp->p_vnode == NULL); 2950 ASSERT(pp->p_hash == NULL); 2951 ASSERT(pp->p_offset == (u_offset_t)-1); 2952 ASSERT(pp->p_szc == szc); 2953 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode); 2954 2955 if (pp == first_pp) 2956 goto bin_empty_0; 2957 } 2958 2959 ASSERT(pp != NULL); 2960 ASSERT(mtype == PP_2_MTYPE(pp)); 2961 ASSERT(pp->p_szc == szc); 2962 if (szc == 0) { 2963 page_sub(&PAGE_FREELISTS(mnode, 2964 szc, bin, mtype), pp); 2965 } else { 2966 page_vpsub(&PAGE_FREELISTS(mnode, 2967 szc, bin, mtype), pp); 2968 CHK_LPG(pp, szc); 2969 } 2970 page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST); 2971 2972 if ((PP_ISFREE(pp) == 0) || (PP_ISAGED(pp) == 0)) 2973 panic("free page is not. pp %p", (void *)pp); 2974 mutex_exit(pcm); 2975 2976 #if defined(__sparc) 2977 ASSERT(!kcage_on || PP_ISNORELOC(pp) || 2978 (flags & PG_NORELOC) == 0); 2979 2980 if (PP_ISNORELOC(pp)) 2981 kcage_freemem_sub(page_get_pagecnt(szc)); 2982 #endif 2983 VM_STAT_ADD(vmm_vmstats.pgmf_allocok[szc]); 2984 return (pp); 2985 2986 bin_empty_0: 2987 mutex_exit(pcm); 2988 bin_empty_1: 2989 if (plw_initialized == 0) { 2990 page_list_walk_init(szc, flags, bin, 1, 1, 2991 &plw); 2992 plw_initialized = 1; 2993 ASSERT(plw.plw_colors <= 2994 PAGE_GET_PAGECOLORS(szc)); 2995 ASSERT(plw.plw_colors > 0); 2996 ASSERT((plw.plw_colors & 2997 (plw.plw_colors - 1)) == 0); 2998 ASSERT(bin < plw.plw_colors); 2999 ASSERT(plw.plw_ceq_mask[szc] < plw.plw_colors); 3000 } 3001 /* calculate the next bin with equivalent color */ 3002 bin = ADD_MASKED(bin, plw.plw_bin_step, 3003 plw.plw_ceq_mask[szc], plw.plw_color_mask); 3004 } while (sbin != bin); 3005 3006 /* 3007 * color bins are all empty if color match. Try and 3008 * satisfy the request by breaking up or coalescing 3009 * pages from a different size freelist of the correct 3010 * color that satisfies the ORIGINAL color requested. 3011 * If that fails then try pages of the same size but 3012 * different colors assuming we are not called with 3013 * PG_MATCH_COLOR. 3014 */ 3015 if (plw.plw_do_split && 3016 (pp = page_freelist_split(szc, bin, mnode, 3017 mtype, PFNNULL, PFNNULL, &plw)) != NULL) 3018 return (pp); 3019 3020 if (szc > 0 && (pp = page_freelist_coalesce(mnode, szc, 3021 bin, plw.plw_ceq_mask[szc], mtype, PFNNULL)) != NULL) 3022 return (pp); 3023 3024 if (plw.plw_ceq_dif > 1) 3025 bin = page_list_walk_next_bin(szc, bin, &plw); 3026 } 3027 3028 /* if allowed, cycle through additional mtypes */ 3029 MTYPE_NEXT(mnode, mtype, flags); 3030 if (mtype >= 0) 3031 goto try_again; 3032 3033 VM_STAT_ADD(vmm_vmstats.pgmf_allocfailed[szc]); 3034 3035 return (NULL); 3036 } 3037 3038 /* 3039 * Returns the count of free pages for 'pp' with size code 'szc'. 3040 * Note: This function does not return an exact value as the page freelist 3041 * locks are not held and thus the values in the page_counters may be 3042 * changing as we walk through the data. 
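 * Sketch of the accounting: a sub-region whose counter at region size r
 * equals FULL_REGION_CNT(r) was already counted one level up, so the
 * loop below only adds counters still below 'full' at each smaller
 * region size, scaling each by PNUM_SHIFT(r - 1) to convert to pages.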
3043 */ 3044 static int 3045 page_freecnt(int mnode, page_t *pp, uchar_t szc) 3046 { 3047 pgcnt_t pgfree; 3048 pgcnt_t cnt; 3049 ssize_t r = szc; /* region size */ 3050 ssize_t idx; 3051 int i; 3052 int full, range; 3053 3054 /* Make sure pagenum passed in is aligned properly */ 3055 ASSERT((pp->p_pagenum & (PNUM_SIZE(szc) - 1)) == 0); 3056 ASSERT(szc > 0); 3057 3058 /* Prevent page_counters dynamic memory from being freed */ 3059 rw_enter(&page_ctrs_rwlock[mnode], RW_READER); 3060 idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum); 3061 cnt = PAGE_COUNTERS(mnode, r, idx); 3062 pgfree = cnt << PNUM_SHIFT(r - 1); 3063 range = FULL_REGION_CNT(szc); 3064 3065 /* Check for completely full region */ 3066 if (cnt == range) { 3067 rw_exit(&page_ctrs_rwlock[mnode]); 3068 return (pgfree); 3069 } 3070 3071 while (--r > 0) { 3072 idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum); 3073 full = FULL_REGION_CNT(r); 3074 for (i = 0; i < range; i++, idx++) { 3075 cnt = PAGE_COUNTERS(mnode, r, idx); 3076 /* 3077 * If cnt here is full, that means we have already 3078 * accounted for these pages earlier. 3079 */ 3080 if (cnt != full) { 3081 pgfree += (cnt << PNUM_SHIFT(r - 1)); 3082 } 3083 } 3084 range *= full; 3085 } 3086 rw_exit(&page_ctrs_rwlock[mnode]); 3087 return (pgfree); 3088 } 3089 3090 /* 3091 * Called from page_geti_contig_pages to exclusively lock constituent pages 3092 * starting from 'spp' for page size code 'szc'. 3093 * 3094 * If 'ptcpthreshold' is set, the number of free pages needed in the 'szc' 3095 * region needs to be greater than or equal to the threshold. 3096 */ 3097 static int 3098 page_trylock_contig_pages(int mnode, page_t *spp, uchar_t szc, int flags) 3099 { 3100 pgcnt_t pgcnt = PNUM_SIZE(szc); 3101 pgcnt_t pgfree, i; 3102 page_t *pp; 3103 3104 VM_STAT_ADD(vmm_vmstats.ptcp[szc]); 3105 3106 3107 if ((ptcpthreshold == 0) || (flags & PGI_PGCPHIPRI)) 3108 goto skipptcpcheck; 3109 /* 3110 * check if there are sufficient free pages available before attempting 3111 * to trylock. Count is approximate as page counters can change. 3112 */ 3113 pgfree = page_freecnt(mnode, spp, szc); 3114 3115 /* attempt to trylock if there are sufficient already free pages */ 3116 if (pgfree < pgcnt/ptcpthreshold) { 3117 VM_STAT_ADD(vmm_vmstats.ptcpfreethresh[szc]); 3118 return (0); 3119 } 3120 3121 skipptcpcheck: 3122 3123 for (i = 0; i < pgcnt; i++) { 3124 pp = &spp[i]; 3125 if (!page_trylock(pp, SE_EXCL)) { 3126 VM_STAT_ADD(vmm_vmstats.ptcpfailexcl[szc]); 3127 while (--i != (pgcnt_t)-1) { 3128 pp = &spp[i]; 3129 ASSERT(PAGE_EXCL(pp)); 3130 page_unlock_nocapture(pp); 3131 } 3132 return (0); 3133 } 3134 ASSERT(spp[i].p_pagenum == spp->p_pagenum + i); 3135 if ((pp->p_szc > szc || (szc && pp->p_szc == szc)) && 3136 !PP_ISFREE(pp)) { 3137 VM_STAT_ADD(vmm_vmstats.ptcpfailszc[szc]); 3138 ASSERT(i == 0); 3139 page_unlock_nocapture(pp); 3140 return (0); 3141 } 3142 3143 /* 3144 * If a page has been marked non-relocatable or has been 3145 * explicitly locked in memory, we don't want to relocate it; 3146 * unlock the pages and fail the operation. 3147 */ 3148 if (PP_ISNORELOC(pp) || 3149 pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { 3150 VM_STAT_ADD(vmm_vmstats.ptcpfailcage[szc]); 3151 while (i != (pgcnt_t)-1) { 3152 pp = &spp[i]; 3153 ASSERT(PAGE_EXCL(pp)); 3154 page_unlock_nocapture(pp); 3155 i--; 3156 } 3157 return (0); 3158 } 3159 } 3160 VM_STAT_ADD(vmm_vmstats.ptcpok[szc]); 3161 return (1); 3162 } 3163 3164 /* 3165 * Claim large page pointed to by 'pp'. 
'pp' is the starting set 3166 * of 'szc' constituent pages that had been locked exclusively previously. 3167 * Will attempt to relocate constituent pages in use. 3168 */ 3169 static page_t * 3170 page_claim_contig_pages(page_t *pp, uchar_t szc, int flags) 3171 { 3172 spgcnt_t pgcnt, npgs, i; 3173 page_t *targpp, *rpp, *hpp; 3174 page_t *replpp = NULL; 3175 page_t *pplist = NULL; 3176 3177 ASSERT(pp != NULL); 3178 3179 pgcnt = page_get_pagecnt(szc); 3180 while (pgcnt) { 3181 ASSERT(PAGE_EXCL(pp)); 3182 ASSERT(!PP_ISNORELOC(pp)); 3183 if (PP_ISFREE(pp)) { 3184 /* 3185 * If this is a PG_FREE_LIST page then its 3186 * size code can change underneath us due to 3187 * page promotion or demotion. As an optimization 3188 * use page_list_sub_pages() instead of 3189 * page_list_sub(). 3190 */ 3191 if (PP_ISAGED(pp)) { 3192 page_list_sub_pages(pp, szc); 3193 if (pp->p_szc == szc) { 3194 return (pp); 3195 } 3196 ASSERT(pp->p_szc < szc); 3197 npgs = page_get_pagecnt(pp->p_szc); 3198 hpp = pp; 3199 for (i = 0; i < npgs; i++, pp++) { 3200 pp->p_szc = szc; 3201 } 3202 page_list_concat(&pplist, &hpp); 3203 pgcnt -= npgs; 3204 continue; 3205 } 3206 ASSERT(!PP_ISAGED(pp)); 3207 ASSERT(pp->p_szc == 0); 3208 page_list_sub(pp, PG_CACHE_LIST); 3209 page_hashout(pp, NULL); 3210 PP_SETAGED(pp); 3211 pp->p_szc = szc; 3212 page_list_concat(&pplist, &pp); 3213 pp++; 3214 pgcnt--; 3215 continue; 3216 } 3217 npgs = page_get_pagecnt(pp->p_szc); 3218 3219 /* 3220 * page_create_wait freemem accounting done by caller of 3221 * page_get_freelist and not necessary to call it prior to 3222 * calling page_get_replacement_page. 3223 * 3224 * page_get_replacement_page can call page_get_contig_pages 3225 * to acquire a large page (szc > 0); the replacement must be 3226 * smaller than the contig page size to avoid looping or 3227 * szc == 0 and PGI_PGCPSZC0 is set. 3228 */ 3229 if (pp->p_szc < szc || (szc == 0 && (flags & PGI_PGCPSZC0))) { 3230 replpp = page_get_replacement_page(pp, NULL, 0); 3231 if (replpp) { 3232 npgs = page_get_pagecnt(pp->p_szc); 3233 ASSERT(npgs <= pgcnt); 3234 targpp = pp; 3235 } 3236 } 3237 3238 /* 3239 * If replacement is NULL or do_page_relocate fails, fail 3240 * coalescing of pages. 3241 */ 3242 if (replpp == NULL || (do_page_relocate(&targpp, &replpp, 0, 3243 &npgs, NULL) != 0)) { 3244 /* 3245 * Unlock un-processed target list 3246 */ 3247 while (pgcnt--) { 3248 ASSERT(PAGE_EXCL(pp)); 3249 page_unlock_nocapture(pp); 3250 pp++; 3251 } 3252 /* 3253 * Free the processed target list.
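 * (Each page on pplist already had p_szc raised to szc, so it is reset
 * to 0 below before being returned to the PAGESIZE freelist.)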
3254 */ 3255 while (pplist) { 3256 pp = pplist; 3257 page_sub(&pplist, pp); 3258 ASSERT(PAGE_EXCL(pp)); 3259 ASSERT(pp->p_szc == szc); 3260 ASSERT(PP_ISFREE(pp)); 3261 ASSERT(PP_ISAGED(pp)); 3262 pp->p_szc = 0; 3263 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL); 3264 page_unlock_nocapture(pp); 3265 } 3266 3267 if (replpp != NULL) 3268 page_free_replacement_page(replpp); 3269 3270 return (NULL); 3271 } 3272 ASSERT(pp == targpp); 3273 3274 /* LINTED */ 3275 ASSERT(hpp = pp); /* That's right, it's an assignment */ 3276 3277 pp += npgs; 3278 pgcnt -= npgs; 3279 3280 while (npgs--) { 3281 ASSERT(PAGE_EXCL(targpp)); 3282 ASSERT(!PP_ISFREE(targpp)); 3283 ASSERT(!PP_ISNORELOC(targpp)); 3284 PP_SETFREE(targpp); 3285 ASSERT(PP_ISAGED(targpp)); 3286 ASSERT(targpp->p_szc < szc || (szc == 0 && 3287 (flags & PGI_PGCPSZC0))); 3288 targpp->p_szc = szc; 3289 targpp = targpp->p_next; 3290 3291 rpp = replpp; 3292 ASSERT(rpp != NULL); 3293 page_sub(&replpp, rpp); 3294 ASSERT(PAGE_EXCL(rpp)); 3295 ASSERT(!PP_ISFREE(rpp)); 3296 page_unlock_nocapture(rpp); 3297 } 3298 ASSERT(targpp == hpp); 3299 ASSERT(replpp == NULL); 3300 page_list_concat(&pplist, &targpp); 3301 } 3302 CHK_LPG(pplist, szc); 3303 return (pplist); 3304 } 3305 3306 /* 3307 * Trim kernel cage from pfnlo-pfnhi and store result in lo-hi. Return code 3308 * of 0 means nothing left after trim. 3309 */ 3310 int 3311 trimkcage(struct memseg *mseg, pfn_t *lo, pfn_t *hi, pfn_t pfnlo, pfn_t pfnhi) 3312 { 3313 pfn_t kcagepfn; 3314 int decr; 3315 int rc = 0; 3316 3317 if (PP_ISNORELOC(mseg->pages)) { 3318 if (PP_ISNORELOC(mseg->epages - 1) == 0) { 3319 3320 /* lower part of this mseg inside kernel cage */ 3321 decr = kcage_current_pfn(&kcagepfn); 3322 3323 /* kernel cage may have transitioned past mseg */ 3324 if (kcagepfn >= mseg->pages_base && 3325 kcagepfn < mseg->pages_end) { 3326 ASSERT(decr == 0); 3327 *lo = MAX(kcagepfn, pfnlo); 3328 *hi = MIN(pfnhi, (mseg->pages_end - 1)); 3329 rc = 1; 3330 } 3331 } 3332 /* else entire mseg in the cage */ 3333 } else { 3334 if (PP_ISNORELOC(mseg->epages - 1)) { 3335 3336 /* upper part of this mseg inside kernel cage */ 3337 decr = kcage_current_pfn(&kcagepfn); 3338 3339 /* kernel cage may have transitioned past mseg */ 3340 if (kcagepfn >= mseg->pages_base && 3341 kcagepfn < mseg->pages_end) { 3342 ASSERT(decr); 3343 *hi = MIN(kcagepfn, pfnhi); 3344 *lo = MAX(pfnlo, mseg->pages_base); 3345 rc = 1; 3346 } 3347 } else { 3348 /* entire mseg outside of kernel cage */ 3349 *lo = MAX(pfnlo, mseg->pages_base); 3350 *hi = MIN(pfnhi, (mseg->pages_end - 1)); 3351 rc = 1; 3352 } 3353 } 3354 return (rc); 3355 } 3356 3357 /* 3358 * called from page_get_contig_pages to search 'pfnlo' thru 'pfnhi' to claim a 3359 * page with size code 'szc'. Claiming such a page requires acquiring 3360 * exclusive locks on all constituent pages (page_trylock_contig_pages), 3361 * relocating pages in use and concatenating these constituent pages into a 3362 * large page. 3363 * 3364 * The page lists do not have such a large page and page_freelist_split has 3365 * already failed to demote larger pages and/or coalesce smaller free pages. 3366 * 3367 * 'flags' may specify PG_COLOR_MATCH which would limit the search of large 3368 * pages with the same color as 'bin'. 3369 * 3370 * 'pfnflag' specifies the subset of the pfn range to search. 
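 * The search begins at a pseudo-random szc-aligned pfn derived from
 * GETTICK() and wraps around each memseg at most once, so repeated
 * calls do not always hammer the same part of the range.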
3371 */ 3372 3373 static page_t * 3374 page_geti_contig_pages(int mnode, uint_t bin, uchar_t szc, int flags, 3375 pfn_t pfnlo, pfn_t pfnhi, pgcnt_t pfnflag) 3376 { 3377 struct memseg *mseg; 3378 pgcnt_t szcpgcnt = page_get_pagecnt(szc); 3379 pgcnt_t szcpgmask = szcpgcnt - 1; 3380 pfn_t randpfn; 3381 page_t *pp, *randpp, *endpp; 3382 uint_t colors, ceq_mask; 3383 /* LINTED : set but not used in function */ 3384 uint_t color_mask __unused; 3385 pfn_t hi, lo; 3386 uint_t skip; 3387 MEM_NODE_ITERATOR_DECL(it); 3388 3389 ASSERT(szc != 0 || (flags & PGI_PGCPSZC0)); 3390 3391 pfnlo = P2ROUNDUP(pfnlo, szcpgcnt); 3392 3393 if ((pfnhi - pfnlo) + 1 < szcpgcnt || pfnlo >= pfnhi) 3394 return (NULL); 3395 3396 ASSERT(szc < mmu_page_sizes); 3397 3398 colors = PAGE_GET_PAGECOLORS(szc); 3399 color_mask = colors - 1; 3400 if ((colors > 1) && (flags & PG_MATCH_COLOR)) { 3401 uchar_t ceq = colorequivszc[szc]; 3402 uint_t ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf)); 3403 3404 ASSERT(ceq_dif > 0); 3405 ceq_mask = (ceq_dif - 1) << (ceq & 0xf); 3406 } else { 3407 ceq_mask = 0; 3408 } 3409 3410 ASSERT(bin < colors); 3411 3412 /* clear "non-significant" color bits */ 3413 bin &= ceq_mask; 3414 3415 /* 3416 * trim the pfn range to search based on pfnflag. pfnflag is set 3417 * when there have been previous page_get_contig_pages failures to 3418 * limit the search. 3419 * 3420 * The high bit in pfnflag specifies the number of 'slots' in the 3421 * pfn range and the remainder of pfnflag specifies which slot. 3422 * For example, a value of 1010b selects slot 2 of a pfn range 3423 * that has been divided into 8 slots. 3424 */ 3425 if (pfnflag > 1) { 3426 int slots = 1 << (highbit(pfnflag) - 1); 3427 int slotid = pfnflag & (slots - 1); 3428 pgcnt_t szcpages; 3429 int slotlen; 3430 3431 pfnhi = P2ALIGN((pfnhi + 1), szcpgcnt) - 1; 3432 szcpages = ((pfnhi - pfnlo) + 1) / szcpgcnt; 3433 slotlen = howmany(szcpages, slots); 3434 /* skip if 'slotid' slot is empty */ 3435 if (slotid * slotlen >= szcpages) 3436 return (NULL); 3437 pfnlo = pfnlo + (((slotid * slotlen) % szcpages) * szcpgcnt); 3438 ASSERT(pfnlo < pfnhi); 3439 if (pfnhi > pfnlo + (slotlen * szcpgcnt)) 3440 pfnhi = pfnlo + (slotlen * szcpgcnt) - 1; 3441 } 3442 3443 /* 3444 * This routine can be called recursively so we shouldn't 3445 * acquire a reader lock if a write request is pending. This 3446 * could lead to a deadlock with the DR thread. 3447 * 3448 * Returning NULL informs the caller that we could not get 3449 * a contig page with the required characteristics. 3450 */ 3451 3452 if (!memsegs_trylock(0)) 3453 return (NULL); 3454 3455 /* 3456 * loop through memsegs to look for contig page candidates 3457 */ 3458 3459 for (mseg = memsegs; mseg != NULL; mseg = mseg->next) { 3460 if (pfnhi < mseg->pages_base || pfnlo >= mseg->pages_end) { 3461 /* no overlap */ 3462 continue; 3463 } 3464 3465 if (mseg->pages_end - mseg->pages_base < szcpgcnt) 3466 /* mseg too small */ 3467 continue; 3468 3469 /* 3470 * trim off kernel cage pages from pfn range and check for 3471 * a trimmed pfn range returned that does not span the 3472 * desired large page size.
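 * Illustrative example (values assumed): if trimkcage() yields lo
 * 0x1000 and hi 0x10ff while szcpgcnt is 0x200, the remaining 0x100
 * pages cannot hold a single szc page and the memseg is skipped.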
3473 */ 3474 if (kcage_on) { 3475 if (trimkcage(mseg, &lo, &hi, pfnlo, pfnhi) == 0 || 3476 lo >= hi || ((hi - lo) + 1) < szcpgcnt) 3477 continue; 3478 } else { 3479 lo = MAX(pfnlo, mseg->pages_base); 3480 hi = MIN(pfnhi, (mseg->pages_end - 1)); 3481 } 3482 3483 /* round to szcpgcnt boundaries */ 3484 lo = P2ROUNDUP(lo, szcpgcnt); 3485 3486 MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it); 3487 hi = P2ALIGN((hi + 1), szcpgcnt) - 1; 3488 3489 if (hi <= lo) 3490 continue; 3491 3492 /* 3493 * set lo to point to the pfn for the desired bin. Large 3494 * page sizes may only have a single page color 3495 */ 3496 skip = szcpgcnt; 3497 if (ceq_mask > 0 || interleaved_mnodes) { 3498 /* set lo to point at appropriate color */ 3499 if (((PFN_2_COLOR(lo, szc, &it) ^ bin) & ceq_mask) || 3500 (interleaved_mnodes && 3501 PFN_2_MEM_NODE(lo) != mnode)) { 3502 PAGE_NEXT_PFN_FOR_COLOR(lo, szc, bin, ceq_mask, 3503 color_mask, &it); 3504 } 3505 if (hi <= lo) 3506 /* mseg cannot satisfy color request */ 3507 continue; 3508 } 3509 3510 /* randomly choose a point between lo and hi to begin search */ 3511 3512 randpfn = (pfn_t)GETTICK(); 3513 randpfn = ((randpfn % (hi - lo)) + lo) & ~(skip - 1); 3514 MEM_NODE_ITERATOR_INIT(randpfn, mnode, szc, &it); 3515 if (ceq_mask || interleaved_mnodes || randpfn == (pfn_t)-1) { 3516 if (randpfn != (pfn_t)-1) { 3517 PAGE_NEXT_PFN_FOR_COLOR(randpfn, szc, bin, 3518 ceq_mask, color_mask, &it); 3519 } 3520 if (randpfn >= hi) { 3521 randpfn = lo; 3522 MEM_NODE_ITERATOR_INIT(randpfn, mnode, szc, 3523 &it); 3524 } 3525 } 3526 randpp = mseg->pages + (randpfn - mseg->pages_base); 3527 3528 ASSERT(randpp->p_pagenum == randpfn); 3529 3530 pp = randpp; 3531 endpp = mseg->pages + (hi - mseg->pages_base) + 1; 3532 3533 ASSERT(randpp + szcpgcnt <= endpp); 3534 3535 do { 3536 ASSERT(!(pp->p_pagenum & szcpgmask)); 3537 ASSERT(((PP_2_BIN(pp) ^ bin) & ceq_mask) == 0); 3538 3539 if (page_trylock_contig_pages(mnode, pp, szc, flags)) { 3540 /* pages unlocked by page_claim on failure */ 3541 if (page_claim_contig_pages(pp, szc, flags)) { 3542 memsegs_unlock(0); 3543 return (pp); 3544 } 3545 } 3546 3547 if (ceq_mask == 0 && !interleaved_mnodes) { 3548 pp += skip; 3549 } else { 3550 pfn_t pfn = pp->p_pagenum; 3551 3552 PAGE_NEXT_PFN_FOR_COLOR(pfn, szc, bin, 3553 ceq_mask, color_mask, &it); 3554 if (pfn == (pfn_t)-1) { 3555 pp = endpp; 3556 } else { 3557 pp = mseg->pages + 3558 (pfn - mseg->pages_base); 3559 } 3560 } 3561 if (pp >= endpp) { 3562 /* start from the beginning */ 3563 MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it); 3564 pp = mseg->pages + (lo - mseg->pages_base); 3565 ASSERT(pp->p_pagenum == lo); 3566 ASSERT(pp + szcpgcnt <= endpp); 3567 } 3568 } while (pp != randpp); 3569 } 3570 memsegs_unlock(0); 3571 return (NULL); 3572 } 3573 3574 3575 /* 3576 * controlling routine that searches through physical memory in an attempt to 3577 * claim a large page, based on the input parameters, that is not currently 3578 * available on the page free lists. 3579 * 3580 * calls page_geti_contig_pages with an initial pfn range from the mnode 3581 * and mtype. page_geti_contig_pages will trim off the parts of the pfn range 3582 * that overlap with the kernel cage or does not match the requested page 3583 * color if PG_MATCH_COLOR is set. Since this search is very expensive, 3584 * page_geti_contig_pages may further limit the search range based on 3585 * previous failure counts (pgcpfailcnt[]). 3586 * 3587 * for PGI_PGCPSZC0 requests, page_get_contig_pages will relocate a base 3588 * pagesize page that satisfies mtype.
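 * Note the adaptive search window: a success below halves
 * pgcpfailcnt[szc] (doubling the next search size), while failures in
 * page_get_freelist() grow it again via SETPGCPFAILCNT().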
3589 */ 3590 page_t * 3591 page_get_contig_pages(int mnode, uint_t bin, int mtype, uchar_t szc, 3592 uint_t flags) 3593 { 3594 pfn_t pfnlo, pfnhi; /* contig pages pfn range */ 3595 page_t *pp; 3596 pgcnt_t pfnflag = 0; /* no limit on search if 0 */ 3597 3598 VM_STAT_ADD(vmm_vmstats.pgcp_alloc[szc]); 3599 3600 /* no allocations from cage */ 3601 flags |= PGI_NOCAGE; 3602 3603 /* LINTED */ 3604 MTYPE_START(mnode, mtype, flags); 3605 if (mtype < 0) { /* mnode does not have memory in mtype range */ 3606 VM_STAT_ADD(vmm_vmstats.pgcp_allocempty[szc]); 3607 return (NULL); 3608 } 3609 3610 ASSERT(szc > 0 || (flags & PGI_PGCPSZC0)); 3611 3612 /* do not limit search and ignore color if hi pri */ 3613 3614 if (pgcplimitsearch && ((flags & PGI_PGCPHIPRI) == 0)) 3615 pfnflag = pgcpfailcnt[szc]; 3616 3617 /* remove color match to improve chances */ 3618 3619 if (flags & PGI_PGCPHIPRI || pfnflag) 3620 flags &= ~PG_MATCH_COLOR; 3621 3622 do { 3623 /* get pfn range based on mnode and mtype */ 3624 MNODETYPE_2_PFN(mnode, mtype, pfnlo, pfnhi); 3625 3626 ASSERT(pfnhi >= pfnlo); 3627 3628 pp = page_geti_contig_pages(mnode, bin, szc, flags, 3629 pfnlo, pfnhi, pfnflag); 3630 3631 if (pp != NULL) { 3632 pfnflag = pgcpfailcnt[szc]; 3633 if (pfnflag) { 3634 /* double the search size */ 3635 pgcpfailcnt[szc] = pfnflag >> 1; 3636 } 3637 VM_STAT_ADD(vmm_vmstats.pgcp_allocok[szc]); 3638 return (pp); 3639 } 3640 MTYPE_NEXT(mnode, mtype, flags); 3641 } while (mtype >= 0); 3642 3643 VM_STAT_ADD(vmm_vmstats.pgcp_allocfailed[szc]); 3644 return (NULL); 3645 } 3646 3647 #if defined(__x86) 3648 /* 3649 * Determine the likelihood of finding/coalescing a szc page. 3650 * Return 0 if the likelihood is small otherwise return 1. 3651 * 3652 * For now, be conservative and check only 1g pages and return 0 3653 * if there had been previous coalescing failures and the szc pages 3654 * needed to satisfy request would exhaust most of freemem. 3655 */ 3656 int 3657 page_chk_freelist(uint_t szc) 3658 { 3659 pgcnt_t pgcnt; 3660 3661 if (szc <= 1) 3662 return (1); 3663 3664 pgcnt = page_get_pagecnt(szc); 3665 if (pgcpfailcnt[szc] && pgcnt + throttlefree >= freemem) { 3666 VM_STAT_ADD(vmm_vmstats.pcf_deny[szc]); 3667 return (0); 3668 } 3669 VM_STAT_ADD(vmm_vmstats.pcf_allow[szc]); 3670 return (1); 3671 } 3672 #endif 3673 3674 /* 3675 * Find the `best' page on the freelist for this (vp,off) (as,vaddr) pair. 3676 * 3677 * Does its own locking and accounting. 3678 * If PG_MATCH_COLOR is set, then NULL will be returned if there are no 3679 * pages of the proper color even if there are pages of a different color. 3680 * 3681 * Finds a page, removes it, THEN locks it. 3682 */ 3683 3684 /*ARGSUSED*/ 3685 page_t * 3686 page_get_freelist(struct vnode *vp, u_offset_t off, struct seg *seg, 3687 caddr_t vaddr, size_t size, uint_t flags, struct lgrp *lgrp) 3688 { 3689 struct as *as = seg->s_as; 3690 page_t *pp = NULL; 3691 ulong_t bin; 3692 uchar_t szc; 3693 int mnode; 3694 int mtype; 3695 page_t *(*page_get_func)(int, uint_t, int, uchar_t, uint_t); 3696 lgrp_mnode_cookie_t lgrp_cookie; 3697 3698 page_get_func = page_get_mnode_freelist; 3699 3700 /* 3701 * If we aren't passed a specific lgroup, or passed a freed lgrp 3702 * assume we wish to allocate near to the current thread's home. 
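 * (lgrp_home_lgrp() below supplies that home lgroup; a freed lgrp fails
 * the LGRP_EXISTS() check.)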
3703 */ 3704 if (!LGRP_EXISTS(lgrp)) 3705 lgrp = lgrp_home_lgrp(); 3706 3707 if (kcage_on) { 3708 if ((flags & (PG_NORELOC | PG_PANIC)) == PG_NORELOC && 3709 kcage_freemem < kcage_throttlefree + btop(size) && 3710 curthread != kcage_cageout_thread) { 3711 /* 3712 * Set a "reserve" of kcage_throttlefree pages for 3713 * PG_PANIC and cageout thread allocations. 3714 * 3715 * Everybody else has to serialize in 3716 * page_create_get_something() to get a cage page, so 3717 * that we don't deadlock cageout! 3718 */ 3719 return (NULL); 3720 } 3721 } else { 3722 flags &= ~PG_NORELOC; 3723 flags |= PGI_NOCAGE; 3724 } 3725 3726 /* LINTED */ 3727 MTYPE_INIT(mtype, vp, vaddr, flags, size); 3728 3729 /* 3730 * Convert size to page size code. 3731 */ 3732 if ((szc = page_szc(size)) == (uchar_t)-1) 3733 panic("page_get_freelist: illegal page size request"); 3734 ASSERT(szc < mmu_page_sizes); 3735 3736 VM_STAT_ADD(vmm_vmstats.pgf_alloc[szc]); 3737 3738 /* LINTED */ 3739 AS_2_BIN(as, seg, vp, vaddr, bin, szc); 3740 3741 ASSERT(bin < PAGE_GET_PAGECOLORS(szc)); 3742 3743 /* 3744 * Try to get a local page first, but try remote if we can't 3745 * get a page of the right color. 3746 */ 3747 pgretry: 3748 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL); 3749 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { 3750 pp = page_get_func(mnode, bin, mtype, szc, flags); 3751 if (pp != NULL) { 3752 VM_STAT_ADD(vmm_vmstats.pgf_allocok[szc]); 3753 DTRACE_PROBE4(page__get, 3754 lgrp_t *, lgrp, 3755 int, mnode, 3756 ulong_t, bin, 3757 uint_t, flags); 3758 return (pp); 3759 } 3760 } 3761 ASSERT(pp == NULL); 3762 3763 /* 3764 * for non-SZC0 PAGESIZE requests, check cachelist before checking 3765 * remote free lists. Caller expected to call page_get_cachelist which 3766 * will check local cache lists and remote free lists. 3767 */ 3768 if (szc == 0 && ((flags & PGI_PGCPSZC0) == 0)) { 3769 VM_STAT_ADD(vmm_vmstats.pgf_allocdeferred); 3770 return (NULL); 3771 } 3772 3773 ASSERT(szc > 0 || (flags & PGI_PGCPSZC0)); 3774 3775 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1); 3776 3777 if (!(flags & PG_LOCAL)) { 3778 /* 3779 * Try to get a non-local freelist page. 3780 */ 3781 LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie); 3782 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { 3783 pp = page_get_func(mnode, bin, mtype, szc, flags); 3784 if (pp != NULL) { 3785 DTRACE_PROBE4(page__get, 3786 lgrp_t *, lgrp, 3787 int, mnode, 3788 ulong_t, bin, 3789 uint_t, flags); 3790 VM_STAT_ADD(vmm_vmstats.pgf_allocokrem[szc]); 3791 return (pp); 3792 } 3793 } 3794 ASSERT(pp == NULL); 3795 } 3796 3797 /* 3798 * when the cage is off chances are page_get_contig_pages() will fail 3799 * to lock a large page chunk therefore when the cage is off it's not 3800 * called by default. this can be changed via /etc/system. 3801 * 3802 * page_get_contig_pages() also called to acquire a base pagesize page 3803 * for page_create_get_something(). 3804 */ 3805 if (!(flags & PG_NORELOC) && (pg_contig_disable == 0) && 3806 (kcage_on || pg_lpgcreate_nocage || szc == 0) && 3807 (page_get_func != page_get_contig_pages)) { 3808 3809 VM_STAT_ADD(vmm_vmstats.pgf_allocretry[szc]); 3810 page_get_func = page_get_contig_pages; 3811 goto pgretry; 3812 } 3813 3814 if (!(flags & PG_LOCAL) && pgcplimitsearch && 3815 page_get_func == page_get_contig_pages) 3816 SETPGCPFAILCNT(szc); 3817 3818 VM_STAT_ADD(vmm_vmstats.pgf_allocfailed[szc]); 3819 return (NULL); 3820 } 3821 3822 /* 3823 * Find the `best' page on the cachelist for this (vp,off) (as,vaddr) pair. 
3824 * 3825 * Does its own locking. 3826 * If PG_MATCH_COLOR is set, then NULL will be returned if there are no 3827 * pages of the proper color even if there are pages of a different color. 3828 * Otherwise, scan the bins for ones with pages. For each bin with pages, 3829 * try to lock one of them. If no page can be locked, try the 3830 * next bin. Return NULL if a page cannot be found and locked. 3831 * 3832 * Finds a page, tries to lock it, then removes it. 3833 */ 3834 3835 /*ARGSUSED*/ 3836 page_t * 3837 page_get_cachelist(struct vnode *vp, u_offset_t off, struct seg *seg, 3838 caddr_t vaddr, uint_t flags, struct lgrp *lgrp) 3839 { 3840 page_t *pp; 3841 struct as *as = seg->s_as; 3842 ulong_t bin; 3843 /*LINTED*/ 3844 int mnode; 3845 int mtype; 3846 lgrp_mnode_cookie_t lgrp_cookie; 3847 3848 /* 3849 * If we aren't passed a specific lgroup, or passed a freed lgrp, 3850 * assume we wish to allocate near to the current thread's home. 3851 */ 3852 if (!LGRP_EXISTS(lgrp)) 3853 lgrp = lgrp_home_lgrp(); 3854 3855 if (!kcage_on) { 3856 flags &= ~PG_NORELOC; 3857 flags |= PGI_NOCAGE; 3858 } 3859 3860 if ((flags & (PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == PG_NORELOC && 3861 kcage_freemem <= kcage_throttlefree) { 3862 /* 3863 * Reserve kcage_throttlefree pages for critical kernel 3864 * threads. 3865 * 3866 * Everybody else has to go to page_create_get_something() 3867 * to get a cage page, so we don't deadlock cageout. 3868 */ 3869 return (NULL); 3870 } 3871 3872 /* LINTED */ 3873 AS_2_BIN(as, seg, vp, vaddr, bin, 0); 3874 3875 ASSERT(bin < PAGE_GET_PAGECOLORS(0)); 3876 3877 /* LINTED */ 3878 MTYPE_INIT(mtype, vp, vaddr, flags, MMU_PAGESIZE); 3879 3880 VM_STAT_ADD(vmm_vmstats.pgc_alloc); 3881 3882 /* 3883 * Try local cachelists first 3884 */ 3885 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL); 3886 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { 3887 pp = page_get_mnode_cachelist(bin, flags, mnode, mtype); 3888 if (pp != NULL) { 3889 VM_STAT_ADD(vmm_vmstats.pgc_allocok); 3890 DTRACE_PROBE4(page__get, 3891 lgrp_t *, lgrp, 3892 int, mnode, 3893 ulong_t, bin, 3894 uint_t, flags); 3895 return (pp); 3896 } 3897 } 3898 3899 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1); 3900 3901 /* 3902 * Try freelists/cachelists that are farther away. 3903 * This is our only chance to allocate remote pages for PAGESIZE 3904 * requests.
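 * (For each remote mnode the lgrp cookie hands back, the freelist is
 * tried first and then the cachelist, as coded below.)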
3905 */ 3906 LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie); 3907 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { 3908 pp = page_get_mnode_freelist(mnode, bin, mtype, 3909 0, flags); 3910 if (pp != NULL) { 3911 VM_STAT_ADD(vmm_vmstats.pgc_allocokdeferred); 3912 DTRACE_PROBE4(page__get, 3913 lgrp_t *, lgrp, 3914 int, mnode, 3915 ulong_t, bin, 3916 uint_t, flags); 3917 return (pp); 3918 } 3919 pp = page_get_mnode_cachelist(bin, flags, mnode, mtype); 3920 if (pp != NULL) { 3921 VM_STAT_ADD(vmm_vmstats.pgc_allocokrem); 3922 DTRACE_PROBE4(page__get, 3923 lgrp_t *, lgrp, 3924 int, mnode, 3925 ulong_t, bin, 3926 uint_t, flags); 3927 return (pp); 3928 } 3929 } 3930 3931 VM_STAT_ADD(vmm_vmstats.pgc_allocfailed); 3932 return (NULL); 3933 } 3934 3935 page_t * 3936 page_get_mnode_cachelist(uint_t bin, uint_t flags, int mnode, int mtype) 3937 { 3938 kmutex_t *pcm; 3939 page_t *pp, *first_pp; 3940 uint_t sbin; 3941 int plw_initialized; 3942 page_list_walker_t plw; 3943 3944 VM_STAT_ADD(vmm_vmstats.pgmc_alloc); 3945 3946 /* LINTED */ 3947 MTYPE_START(mnode, mtype, flags); 3948 if (mtype < 0) { /* mnode does not have memory in mtype range */ 3949 VM_STAT_ADD(vmm_vmstats.pgmc_allocempty); 3950 return (NULL); 3951 } 3952 3953 try_again: 3954 3955 plw_initialized = 0; 3956 plw.plw_ceq_dif = 1; 3957 3958 /* 3959 * Only hold one cachelist lock at a time, that way we 3960 * can start anywhere and not have to worry about lock 3961 * ordering. 3962 */ 3963 3964 for (plw.plw_count = 0; 3965 plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) { 3966 sbin = bin; 3967 do { 3968 3969 if (!PAGE_CACHELISTS(mnode, bin, mtype)) 3970 goto bin_empty_1; 3971 pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST); 3972 mutex_enter(pcm); 3973 pp = PAGE_CACHELISTS(mnode, bin, mtype); 3974 if (pp == NULL) 3975 goto bin_empty_0; 3976 3977 first_pp = pp; 3978 ASSERT(pp->p_vnode); 3979 ASSERT(PP_ISAGED(pp) == 0); 3980 ASSERT(pp->p_szc == 0); 3981 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode); 3982 while (IS_DUMP_PAGE(pp) || !page_trylock(pp, SE_EXCL)) { 3983 pp = pp->p_next; 3984 ASSERT(pp->p_szc == 0); 3985 if (pp == first_pp) { 3986 /* 3987 * We have searched the complete list! 3988 * And all of them (might only be one) 3989 * are locked. This can happen since 3990 * these pages can also be found via 3991 * the hash list. When found via the 3992 * hash list, they are locked first, 3993 * then removed. We give up to let the 3994 * other thread run. 3995 */ 3996 pp = NULL; 3997 break; 3998 } 3999 ASSERT(pp->p_vnode); 4000 ASSERT(PP_ISFREE(pp)); 4001 ASSERT(PP_ISAGED(pp) == 0); 4002 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == 4003 mnode); 4004 } 4005 4006 if (pp) { 4007 page_t **ppp; 4008 /* 4009 * Found and locked a page. 4010 * Pull it off the list. 4011 */ 4012 ASSERT(mtype == PP_2_MTYPE(pp)); 4013 ppp = &PAGE_CACHELISTS(mnode, bin, mtype); 4014 page_sub(ppp, pp); 4015 /* 4016 * Subtract counters before releasing pcm mutex 4017 * to avoid a race with page_freelist_coalesce 4018 * and page_freelist_split. 4019 */ 4020 page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST); 4021 mutex_exit(pcm); 4022 ASSERT(pp->p_vnode); 4023 ASSERT(PP_ISAGED(pp) == 0); 4024 #if defined(__sparc) 4025 ASSERT(!kcage_on || 4026 (flags & PG_NORELOC) == 0 || 4027 PP_ISNORELOC(pp)); 4028 if (PP_ISNORELOC(pp)) { 4029 kcage_freemem_sub(1); 4030 } 4031 #endif 4032 VM_STAT_ADD(vmm_vmstats. 
page_t *
page_get_mnode_cachelist(uint_t bin, uint_t flags, int mnode, int mtype)
{
	kmutex_t *pcm;
	page_t *pp, *first_pp;
	uint_t sbin;
	int plw_initialized;
	page_list_walker_t plw;

	VM_STAT_ADD(vmm_vmstats.pgmc_alloc);

	/* LINTED */
	MTYPE_START(mnode, mtype, flags);
	if (mtype < 0) {	/* mnode does not have memory in mtype range */
		VM_STAT_ADD(vmm_vmstats.pgmc_allocempty);
		return (NULL);
	}

try_again:

	plw_initialized = 0;
	plw.plw_ceq_dif = 1;

	/*
	 * Only hold one cachelist lock at a time, that way we
	 * can start anywhere and not have to worry about lock
	 * ordering.
	 */

	for (plw.plw_count = 0;
	    plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) {
		sbin = bin;
		do {

			if (!PAGE_CACHELISTS(mnode, bin, mtype))
				goto bin_empty_1;
			pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST);
			mutex_enter(pcm);
			pp = PAGE_CACHELISTS(mnode, bin, mtype);
			if (pp == NULL)
				goto bin_empty_0;

			first_pp = pp;
			ASSERT(pp->p_vnode);
			ASSERT(PP_ISAGED(pp) == 0);
			ASSERT(pp->p_szc == 0);
			ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
			while (IS_DUMP_PAGE(pp) || !page_trylock(pp, SE_EXCL)) {
				pp = pp->p_next;
				ASSERT(pp->p_szc == 0);
				if (pp == first_pp) {
					/*
					 * We have searched the complete list!
					 * And all of them (might only be one)
					 * are locked. This can happen since
					 * these pages can also be found via
					 * the hash list. When found via the
					 * hash list, they are locked first,
					 * then removed. We give up to let the
					 * other thread run.
					 */
					pp = NULL;
					break;
				}
				ASSERT(pp->p_vnode);
				ASSERT(PP_ISFREE(pp));
				ASSERT(PP_ISAGED(pp) == 0);
				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) ==
				    mnode);
			}

			if (pp) {
				page_t **ppp;
				/*
				 * Found and locked a page.
				 * Pull it off the list.
				 */
				ASSERT(mtype == PP_2_MTYPE(pp));
				ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
				page_sub(ppp, pp);
				/*
				 * Subtract counters before releasing pcm mutex
				 * to avoid a race with page_freelist_coalesce
				 * and page_freelist_split.
				 */
				page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST);
				mutex_exit(pcm);
				ASSERT(pp->p_vnode);
				ASSERT(PP_ISAGED(pp) == 0);
#if defined(__sparc)
				ASSERT(!kcage_on ||
				    (flags & PG_NORELOC) == 0 ||
				    PP_ISNORELOC(pp));
				if (PP_ISNORELOC(pp)) {
					kcage_freemem_sub(1);
				}
#endif
				VM_STAT_ADD(vmm_vmstats.pgmc_allocok);
				return (pp);
			}
bin_empty_0:
			mutex_exit(pcm);
bin_empty_1:
			if (plw_initialized == 0) {
				page_list_walk_init(0, flags, bin, 0, 1, &plw);
				plw_initialized = 1;
			}
			/* calculate the next bin with equivalent color */
			bin = ADD_MASKED(bin, plw.plw_bin_step,
			    plw.plw_ceq_mask[0], plw.plw_color_mask);
		} while (sbin != bin);

		if (plw.plw_ceq_dif > 1)
			bin = page_list_walk_next_bin(0, bin, &plw);
	}

	MTYPE_NEXT(mnode, mtype, flags);
	if (mtype >= 0)
		goto try_again;

	VM_STAT_ADD(vmm_vmstats.pgmc_allocfailed);
	return (NULL);
}
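/*
 * A short sketch of the bin walk above (assuming the page_list_walker_t
 * fields set up by page_list_walk_init()): the inner do/while loop
 * steps through bins of equivalent color with
 *
 *	bin = ADD_MASKED(bin, plw.plw_bin_step,
 *	    plw.plw_ceq_mask[0], plw.plw_color_mask);
 *
 * until it wraps back to the starting bin (sbin); only when an entire
 * equivalence class has been exhausted does page_list_walk_next_bin()
 * advance to the next class, and only when all classes fail does
 * MTYPE_NEXT() move on to the next mtype range.
 */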
#ifdef DEBUG
#define	REPL_PAGE_STATS
#endif /* DEBUG */

#ifdef REPL_PAGE_STATS
struct repl_page_stats {
	uint_t	ngets;
	uint_t	ngets_noreloc;
	uint_t	npgr_noreloc;
	uint_t	nnopage_first;
	uint_t	nnopage;
	uint_t	nhashout;
	uint_t	nnofree;
	uint_t	nnext_pp;
} repl_page_stats;
#define	REPL_STAT_INCR(v)	atomic_inc_32(&repl_page_stats.v)
#else /* REPL_PAGE_STATS */
#define	REPL_STAT_INCR(v)
#endif /* REPL_PAGE_STATS */

int	pgrppgcp;

/*
 * The freemem accounting must be done by the caller.
 * First we try to get a replacement page of the same size as like_pp;
 * if that is not possible, then we just get a set of discontiguous
 * PAGESIZE pages.
 */
page_t *
page_get_replacement_page(page_t *orig_like_pp, struct lgrp *lgrp_target,
    uint_t pgrflags)
{
	page_t *like_pp;
	page_t *pp, *pplist;
	page_t *pl = NULL;
	ulong_t bin;
	int mnode, page_mnode;
	int szc;
	spgcnt_t npgs, pg_cnt;
	pfn_t pfnum;
	int mtype;
	int flags = 0;
	lgrp_mnode_cookie_t lgrp_cookie;
	lgrp_t *lgrp;

	mnode = 0;
	lgrp = NULL;
	REPL_STAT_INCR(ngets);
	like_pp = orig_like_pp;
	ASSERT(PAGE_EXCL(like_pp));

	szc = like_pp->p_szc;
	npgs = page_get_pagecnt(szc);
	/*
	 * Now we reset like_pp to the base page_t.
	 * That way, we won't walk past the end of this 'szc' page.
	 */
	pfnum = PFN_BASE(like_pp->p_pagenum, szc);
	like_pp = page_numtopp_nolock(pfnum);
	ASSERT(like_pp->p_szc == szc);

	if (PP_ISNORELOC(like_pp)) {
		ASSERT(kcage_on);
		REPL_STAT_INCR(ngets_noreloc);
		flags = PGI_RELOCONLY;
	} else if (pgrflags & PGR_NORELOC) {
		ASSERT(kcage_on);
		REPL_STAT_INCR(npgr_noreloc);
		flags = PG_NORELOC;
	}

	/*
	 * Kernel pages must always be replaced with the same size
	 * pages, since we cannot properly handle demotion of kernel
	 * pages.
	 */
	if (PP_ISKAS(like_pp))
		pgrflags |= PGR_SAMESZC;

	MTYPE_PGR_INIT(mtype, flags, like_pp, npgs);

	while (npgs) {
		pplist = NULL;
		for (;;) {
			pg_cnt = page_get_pagecnt(szc);
			bin = PP_2_BIN(like_pp);
			ASSERT(like_pp->p_szc == orig_like_pp->p_szc);
			ASSERT(pg_cnt <= npgs);

			/*
			 * If an lgroup was specified, try to get the
			 * page from that lgroup.
			 * NOTE: Must be careful with code below because
			 * lgroup may disappear and reappear since there
			 * is no locking for lgroup here.
			 */
			if (LGRP_EXISTS(lgrp_target)) {
				/*
				 * Keep local variable for lgroup separate
				 * from lgroup argument since this code should
				 * only be exercised when lgroup argument
				 * exists....
				 */
				lgrp = lgrp_target;

				/* Try the lgroup's freelists first */
				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
				    LGRP_SRCH_LOCAL);
				while ((pplist == NULL) &&
				    (mnode = lgrp_memnode_choose(&lgrp_cookie))
				    != -1) {
					pplist =
					    page_get_mnode_freelist(mnode, bin,
					    mtype, szc, flags);
				}

				/*
				 * Now try its cachelists if this is a
				 * small page. Don't need to do it for
				 * larger ones since page_freelist_coalesce()
				 * already failed.
				 */
				if (pplist != NULL || szc != 0)
					break;

				/* Now try its cachelists */
				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
				    LGRP_SRCH_LOCAL);

				while ((pplist == NULL) &&
				    (mnode = lgrp_memnode_choose(&lgrp_cookie))
				    != -1) {
					pplist =
					    page_get_mnode_cachelist(bin, flags,
					    mnode, mtype);
				}
				if (pplist != NULL) {
					page_hashout(pplist, NULL);
					PP_SETAGED(pplist);
					REPL_STAT_INCR(nhashout);
					break;
				}
				/* Done looking in this lgroup. Bail out. */
				break;
			}

			/*
			 * No lgroup was specified (or the lgroup was removed
			 * by DR), so just try to get the page as close to
			 * like_pp's mnode as possible.
			 * First try the local freelist...
			 */
			mnode = PP_2_MEM_NODE(like_pp);
			pplist = page_get_mnode_freelist(mnode, bin,
			    mtype, szc, flags);
			if (pplist != NULL)
				break;

			REPL_STAT_INCR(nnofree);

			/*
			 * ...then the local cachelist. Don't need to do it
			 * for larger pages because page_freelist_coalesce()
			 * already failed there anyway.
			 */
			if (szc == 0) {
				pplist = page_get_mnode_cachelist(bin, flags,
				    mnode, mtype);
				if (pplist != NULL) {
					page_hashout(pplist, NULL);
					PP_SETAGED(pplist);
					REPL_STAT_INCR(nhashout);
					break;
				}
			}

			/* Now try remote freelists */
			page_mnode = mnode;
			lgrp =
			    lgrp_hand_to_lgrp(MEM_NODE_2_LGRPHAND(page_mnode));
			LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
			    LGRP_SRCH_HIER);
			while (pplist == NULL &&
			    (mnode = lgrp_memnode_choose(&lgrp_cookie))
			    != -1) {
				/*
				 * Skip local mnode.
				 */
				if ((mnode == page_mnode) ||
				    (mem_node_config[mnode].exists == 0))
					continue;

				pplist = page_get_mnode_freelist(mnode,
				    bin, mtype, szc, flags);
			}

			if (pplist != NULL)
				break;

			/* Now try remote cachelists */
			LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
			    LGRP_SRCH_HIER);
			while (pplist == NULL && szc == 0) {
				mnode = lgrp_memnode_choose(&lgrp_cookie);
				if (mnode == -1)
					break;
				/*
				 * Skip local mnode.
				 */
				if ((mnode == page_mnode) ||
				    (mem_node_config[mnode].exists == 0))
					continue;

				pplist = page_get_mnode_cachelist(bin,
				    flags, mnode, mtype);

				if (pplist != NULL) {
					page_hashout(pplist, NULL);
					PP_SETAGED(pplist);
					REPL_STAT_INCR(nhashout);
					break;
				}
			}

			/*
			 * Break out of while loop under the following cases:
			 * - If we successfully got a page.
			 * - If pgrflags specified only returning a specific
			 *   page size and we could not find that page size.
			 * - If we could not satisfy the request with PAGESIZE
			 *   or larger pages.
			 */
			if (pplist != NULL || szc == 0)
				break;

			if ((pgrflags & PGR_SAMESZC) || pgrppgcp) {
				/* try to find contig page */

				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
				    LGRP_SRCH_HIER);

				while ((pplist == NULL) &&
				    (mnode =
				    lgrp_memnode_choose(&lgrp_cookie))
				    != -1) {
					pplist = page_get_contig_pages(
					    mnode, bin, mtype, szc,
					    flags | PGI_PGCPHIPRI);
				}
				break;
			}

			/*
			 * The correct thing to do here is try the next
			 * page size down using szc--. Due to a bug
			 * with the processing of HAT_RELOAD_SHARE
			 * where the sfmmu_ttecnt arrays of all
			 * hats sharing an ISM segment don't get updated,
			 * using intermediate size pages for relocation
			 * can lead to continuous page faults.
			 */
			szc = 0;
		}

		if (pplist != NULL) {
			DTRACE_PROBE4(page__get,
			    lgrp_t *, lgrp,
			    int, mnode,
			    ulong_t, bin,
			    uint_t, flags);

			while (pplist != NULL && pg_cnt--) {
				ASSERT(pplist != NULL);
				pp = pplist;
				page_sub(&pplist, pp);
				PP_CLRFREE(pp);
				PP_CLRAGED(pp);
				page_list_concat(&pl, &pp);
				npgs--;
				like_pp = like_pp + 1;
				REPL_STAT_INCR(nnext_pp);
			}
			ASSERT(pg_cnt == 0);
		} else {
			break;
		}
	}

	if (npgs) {
		/*
		 * We were unable to allocate the necessary number
		 * of pages, so free any pages already gathered on pl.
		 */
		REPL_STAT_INCR(nnopage);
		page_free_replacement_page(pl);
		return (NULL);
	} else {
		return (pl);
	}
}
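/*
 * A sketch of the expected calling pattern (hypothetical caller; the
 * real callers, such as the page relocation code, also perform the
 * freemem accounting noted above):
 *
 *	ASSERT(PAGE_EXCL(pp));
 *	repl = page_get_replacement_page(pp, NULL, 0);
 *	if (repl == NULL)
 *		return (ENOMEM);	(hypothetical error handling)
 *
 * On failure any pages already gathered on the internal list have been
 * freed via page_free_replacement_page(), so the caller never sees a
 * partial allocation.
 */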
/*
 * Demote a free large page to its constituent pages.
 */
void
page_demote_free_pages(page_t *pp)
{

	int mnode;

	ASSERT(pp != NULL);
	ASSERT(PAGE_LOCKED(pp));
	ASSERT(PP_ISFREE(pp));
	ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);

	mnode = PP_2_MEM_NODE(pp);
	page_freelist_lock(mnode);
	if (pp->p_szc != 0) {
		(void) page_demote(mnode, PFN_BASE(pp->p_pagenum,
		    pp->p_szc), 0, pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
	}
	page_freelist_unlock(mnode);
	ASSERT(pp->p_szc == 0);
}

/*
 * Factor in colorequiv to check additional 'equivalent' bins.
 * colorequiv may be set in /etc/system.
 */
void
page_set_colorequiv_arr(void)
{
	if (colorequiv > 1) {
		int i;
		uint_t sv_a = lowbit(colorequiv) - 1;

		if (sv_a > 15)
			sv_a = 15;

		for (i = 0; i < MMU_PAGE_SIZES; i++) {
			uint_t colors;
			uint_t a = sv_a;

			if ((colors = hw_page_array[i].hp_colors) <= 1) {
				continue;
			}
			while ((colors >> a) == 0)
				a--;
			if ((a << 4) > colorequivszc[i]) {
				colorequivszc[i] = (a << 4);
			}
		}
	}
}
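/*
 * Worked example (illustrative): with colorequiv set to 4 in
 * /etc/system, sv_a = lowbit(4) - 1 = 2. For a page size with 32
 * colors, (colors >> 2) is nonzero, so colorequivszc[i] becomes
 * (2 << 4) = 0x20: the top two bits of the color are ignored and
 * groups of four colors are treated as equivalent.
 */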