/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * Portions of this source code were derived from Berkeley 4.3 BSD
 * under license from the Regents of the University of California.
 */


/*
 * This file contains common functions to access and manage the page lists.
 * Many of these routines originated from platform dependent modules
 * (sun4/vm/vm_dep.c, i86pc/vm/vm_machdep.c) and were modified to function in
 * a platform independent manner.
 *
 * vm/vm_dep.h provides for platform specific support.
 */

#include <sys/types.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/systm.h>
#include <sys/atomic.h>
#include <sys/sysmacros.h>
#include <vm/as.h>
#include <vm/page.h>
#include <vm/seg_kmem.h>
#include <vm/seg_vn.h>
#include <sys/vmsystm.h>
#include <sys/memnode.h>
#include <vm/vm_dep.h>
#include <sys/lgrp.h>
#include <sys/mem_config.h>
#include <sys/callb.h>
#include <sys/mem_cage.h>
#include <sys/sdt.h>
#include <sys/dumphdr.h>

extern uint_t	vac_colors;

#define	MAX_PRAGMA_ALIGN	128

/* vm_cpu_data0 for the boot cpu before kmem is initialized */

#if L2CACHE_ALIGN_MAX <= MAX_PRAGMA_ALIGN
#pragma align L2CACHE_ALIGN_MAX(vm_cpu_data0)
#else
#pragma align MAX_PRAGMA_ALIGN(vm_cpu_data0)
#endif
char		vm_cpu_data0[VM_CPU_DATA_PADSIZE];

/*
 * Number of page colors equivalent to the requested color in page_get
 * routines.  If set, keeps large pages intact longer and keeps MPO allocation
 * from the local mnode in favor of acquiring the 'correct' page color from
 * a demoted large page or from a remote mnode.
 */
uint_t	colorequiv;

/*
 * Color equivalency mask for each page size.
 * Mask is computed based on cpu L2$ way sizes and colorequiv global.
 * High 4 bits determine the number of high order bits of the color to ignore.
 * Low 4 bits determine the number of low order bits of the color to ignore
 * (only relevant for hashed index based page coloring).
 */
uchar_t colorequivszc[MMU_PAGE_SIZES];

/*
 * If set, specifies the percentage of large pages that are free from within
 * a large page region before attempting to lock those pages for
 * page_get_contig_pages processing.
 *
 * Should be turned on when kpr is available, since page_trylock_contig_pages
 * can then be more selective.
 */

int	ptcpthreshold;

/*
 * Limit page get contig page search based on failure cnts in pgcpfailcnt[].
 * Enabled by default via pgcplimitsearch.
 *
 * pgcpfailcnt[] is bounded by PGCPFAILMAX (>= 1/2 of installed
 * memory). When reached, pgcpfailcnt[] is reset to 1/2 of this upper
 * bound. This upper bound range guarantees:
 *	- all large page 'slots' will be searched over time
 *	- a minimum of one large page candidate is considered on each pgcp call
 *	- the count doesn't wrap around to 0
 */
pgcnt_t	pgcpfailcnt[MMU_PAGE_SIZES];
int	pgcplimitsearch = 1;

#define	PGCPFAILMAX		(1 << (highbit(physinstalled) - 1))
#define	SETPGCPFAILCNT(szc)						\
	if (++pgcpfailcnt[szc] >= PGCPFAILMAX)				\
		pgcpfailcnt[szc] = PGCPFAILMAX / 2;

#ifdef VM_STATS
struct vmm_vmstats_str  vmm_vmstats;

#endif /* VM_STATS */

#if defined(__sparc)
#define	LPGCREATE	0
#else
/* enable page_get_contig_pages */
#define	LPGCREATE	1
#endif

int pg_contig_disable;
int pg_lpgcreate_nocage = LPGCREATE;

/*
 * page_freelist_split pfn flag to signify no lo or hi pfn requirement.
 */
#define	PFNNULL		0

/* Flags involved in promotion and demotion routines */
#define	PC_FREE		0x1	/* put page on freelist */
#define	PC_ALLOC	0x2	/* return page for allocation */

/*
 * Flag for page_demote to be used with PC_FREE to denote that we don't care
 * what the color is as the color parameter to the function is ignored.
 */
#define	PC_NO_COLOR	(-1)

/* mtype value for page_promote to use when mtype does not matter */
#define	PC_MTYPE_ANY	(-1)

/*
 * page counters candidates info
 * See page_ctrs_cands comment below for more details.
 * fields are as follows:
 *	pcc_pages_free:		# pages which freelist coalesce can create
 *	pcc_color_free:		pointer to page free counts per color
 */
typedef struct pcc_info {
	pgcnt_t	pcc_pages_free;
	pgcnt_t	*pcc_color_free;
	uint_t	pad[12];
} pcc_info_t;

/*
 * On big machines it can take a long time to check page_counters
 * arrays. page_ctrs_cands is a summary array whose elements are a dynamically
 * updated sum of all elements of the corresponding page_counters arrays.
 * page_freelist_coalesce() searches page_counters only if an appropriate
 * element of page_ctrs_cands array is greater than 0.
 *
 * page_ctrs_cands is indexed by mutex (i), region (r), mnode (m), mrange (g)
 */
pcc_info_t **page_ctrs_cands[NPC_MUTEX][MMU_PAGE_SIZES];

/*
 * Return in val the total number of free pages which can be created
 * for the given mnode (m), mrange (g), and region size (r)
 */
#define	PGCTRS_CANDS_GETVALUE(m, g, r, val) {				\
	int i;								\
	val = 0;							\
	for (i = 0; i < NPC_MUTEX; i++) {				\
	    val += page_ctrs_cands[i][(r)][(m)][(g)].pcc_pages_free;	\
	}								\
}

/*
 * Return in val the total number of free pages which can be created
 * for the given mnode (m), mrange (g), region size (r), and color (c)
 */
#define	PGCTRS_CANDS_GETVALUECOLOR(m, g, r, c, val) {			\
	int i;								\
	val = 0;							\
	ASSERT((c) < PAGE_GET_PAGECOLORS(r));				\
	for (i = 0; i < NPC_MUTEX; i++) {				\
	    val +=							\
		page_ctrs_cands[i][(r)][(m)][(g)].pcc_color_free[(c)];	\
	}								\
}

/*
 * We can only allow a single thread to update a counter within the physical
 * range of the largest supported page size. That is the finest granularity
 * possible since the counter values are dependent on each other
 * as you move across region sizes. PP_CTR_LOCK_INDX is used to determine the
 * ctr_mutex lock index for a particular physical range.
 */
static kmutex_t	*ctr_mutex[NPC_MUTEX];

#define	PP_CTR_LOCK_INDX(pp)						\
	(((pp)->p_pagenum >>						\
	    (PAGE_BSZS_SHIFT(mmu_page_sizes - 1))) & (NPC_MUTEX - 1))

#define	INVALID_COLOR	0xffffffff
#define	INVALID_MASK	0xffffffff

/*
 * Local functions prototypes.
 */

void page_ctr_add(int, int, page_t *, int);
void page_ctr_add_internal(int, int, page_t *, int);
void page_ctr_sub(int, int, page_t *, int);
void page_ctr_sub_internal(int, int, page_t *, int);
void page_freelist_lock(int);
void page_freelist_unlock(int);
page_t *page_promote(int, pfn_t, uchar_t, int, int);
page_t *page_demote(int, pfn_t, pfn_t, uchar_t, uchar_t, int, int);
page_t *page_freelist_split(uchar_t,
    uint_t, int, int, pfn_t, pfn_t, page_list_walker_t *);
page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int);
static int page_trylock_cons(page_t *pp, se_t se);

/*
 * The page_counters array below is used to keep track of free contiguous
 * physical memory.  A hw_page_map_t will be allocated per mnode per szc.
 * This contains an array of counters, the size of the array, a shift value
 * used to convert a pagenum into a counter array index or vice versa, as
 * well as a cache of the last successful index to be promoted to a larger
 * page size.  As an optimization, we keep track of the last successful index
 * to be promoted per page color for the given size region, and this is
 * allocated dynamically based upon the number of colors for a given
 * region size.
 *
 * Conceptually, the page counters are represented as:
 *
 *	page_counters[region_size][mnode]
 *
 *	region_size:	size code of a candidate larger page made up
 *			of contiguous free smaller pages.
 *
 *	page_counters[region_size][mnode].hpm_counters[index]:
 *		represents how many (region_size - 1) pages either
 *		exist or can be created within the given index range.
 *
 * Let's look at a sparc example:
 *	If we want to create a free 512k page, we look at region_size 2
 *	for the mnode we want.  We calculate the index and look at a specific
 *	hpm_counters location.  If we see 8 (FULL_REGION_CNT on sparc) at
 *	this location, it means that 8 64k pages either exist or can be created
 *	from 8K pages in order to make a single free 512k page at the given
 *	index.  Note that when a region is full, it will contribute to the
 *	counts in the region above it.  Thus we will not know what page
 *	size the free pages will be which can be promoted to this new free
 *	page unless we look at all regions below the current region.
 */

/*
 * Note: hpmctr_t is defined in platform vm_dep.h
 * hw_page_map_t contains all the information needed for the page_counters
 * logic.
The fields are as follows: 278 * 279 * hpm_counters: dynamically allocated array to hold counter data 280 * hpm_entries: entries in hpm_counters 281 * hpm_shift: shift for pnum/array index conv 282 * hpm_base: PFN mapped to counter index 0 283 * hpm_color_current: last index in counter array for this color at 284 * which we successfully created a large page 285 */ 286 typedef struct hw_page_map { 287 hpmctr_t *hpm_counters; 288 size_t hpm_entries; 289 int hpm_shift; 290 pfn_t hpm_base; 291 size_t *hpm_color_current[MAX_MNODE_MRANGES]; 292 #if defined(__sparc) 293 uint_t pad[4]; 294 #endif 295 } hw_page_map_t; 296 297 /* 298 * Element zero is not used, but is allocated for convenience. 299 */ 300 static hw_page_map_t *page_counters[MMU_PAGE_SIZES]; 301 302 /* 303 * Cached value of MNODE_RANGE_CNT(mnode). 304 * This is a function call in x86. 305 */ 306 static int mnode_nranges[MAX_MEM_NODES]; 307 static int mnode_maxmrange[MAX_MEM_NODES]; 308 309 /* 310 * The following macros are convenient ways to get access to the individual 311 * elements of the page_counters arrays. They can be used on both 312 * the left side and right side of equations. 313 */ 314 #define PAGE_COUNTERS(mnode, rg_szc, idx) \ 315 (page_counters[(rg_szc)][(mnode)].hpm_counters[(idx)]) 316 317 #define PAGE_COUNTERS_COUNTERS(mnode, rg_szc) \ 318 (page_counters[(rg_szc)][(mnode)].hpm_counters) 319 320 #define PAGE_COUNTERS_SHIFT(mnode, rg_szc) \ 321 (page_counters[(rg_szc)][(mnode)].hpm_shift) 322 323 #define PAGE_COUNTERS_ENTRIES(mnode, rg_szc) \ 324 (page_counters[(rg_szc)][(mnode)].hpm_entries) 325 326 #define PAGE_COUNTERS_BASE(mnode, rg_szc) \ 327 (page_counters[(rg_szc)][(mnode)].hpm_base) 328 329 #define PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, rg_szc, g) \ 330 (page_counters[(rg_szc)][(mnode)].hpm_color_current[(g)]) 331 332 #define PAGE_COUNTERS_CURRENT_COLOR(mnode, rg_szc, color, mrange) \ 333 (page_counters[(rg_szc)][(mnode)]. \ 334 hpm_color_current[(mrange)][(color)]) 335 336 #define PNUM_TO_IDX(mnode, rg_szc, pnum) \ 337 (((pnum) - PAGE_COUNTERS_BASE((mnode), (rg_szc))) >> \ 338 PAGE_COUNTERS_SHIFT((mnode), (rg_szc))) 339 340 #define IDX_TO_PNUM(mnode, rg_szc, index) \ 341 (PAGE_COUNTERS_BASE((mnode), (rg_szc)) + \ 342 ((index) << PAGE_COUNTERS_SHIFT((mnode), (rg_szc)))) 343 344 /* 345 * Protects the hpm_counters and hpm_color_current memory from changing while 346 * looking at page counters information. 347 * Grab the write lock to modify what these fields point at. 348 * Grab the read lock to prevent any pointers from changing. 349 * The write lock can not be held during memory allocation due to a possible 350 * recursion deadlock with trying to grab the read lock while the 351 * write lock is already held. 352 */ 353 krwlock_t page_ctrs_rwlock[MAX_MEM_NODES]; 354 355 356 /* 357 * initialize cpu_vm_data to point at cache aligned vm_cpu_data_t. 358 */ 359 void 360 cpu_vm_data_init(struct cpu *cp) 361 { 362 if (cp == CPU0) { 363 cp->cpu_vm_data = (void *)&vm_cpu_data0; 364 } else { 365 void *kmptr; 366 int align; 367 size_t sz; 368 369 align = (L2CACHE_ALIGN) ? 
		    L2CACHE_ALIGN : L2CACHE_ALIGN_MAX;
		sz = P2ROUNDUP(sizeof (vm_cpu_data_t), align) + align;
		kmptr = kmem_zalloc(sz, KM_SLEEP);
		cp->cpu_vm_data = (void *) P2ROUNDUP((uintptr_t)kmptr, align);
		((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr = kmptr;
		((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize = sz;
	}
}

/*
 * free cpu_vm_data
 */
void
cpu_vm_data_destroy(struct cpu *cp)
{
	if (cp->cpu_seqid && cp->cpu_vm_data) {
		ASSERT(cp != CPU0);
		kmem_free(((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr,
		    ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize);
	}
	cp->cpu_vm_data = NULL;
}


/*
 * page size to page size code
 */
int
page_szc(size_t pagesize)
{
	int	i = 0;

	while (hw_page_array[i].hp_size) {
		if (pagesize == hw_page_array[i].hp_size)
			return (i);
		i++;
	}
	return (-1);
}

/*
 * page size to page size code with the restriction that it be a supported
 * user page size.  If it's not a supported user page size, -1 will be
 * returned.
 */
int
page_szc_user_filtered(size_t pagesize)
{
	int szc = page_szc(pagesize);
	if ((szc != -1) && (SZC_2_USERSZC(szc) != -1)) {
		return (szc);
	}
	return (-1);
}

/*
 * Return how many page sizes are available for the user to use.  This is
 * what the hardware supports and not based upon how the OS implements the
 * support of different page sizes.
 *
 * If legacy is non-zero, return the number of pagesizes available to legacy
 * applications.  The number of legacy page sizes might be less than the
 * exported user page sizes.  This is to prevent legacy applications that
 * use the largest page size returned from getpagesizes(3c) from inadvertently
 * using the 'new' large pagesizes.
 */
uint_t
page_num_user_pagesizes(int legacy)
{
	if (legacy)
		return (mmu_legacy_page_sizes);
	return (mmu_exported_page_sizes);
}

uint_t
page_num_pagesizes(void)
{
	return (mmu_page_sizes);
}

/*
 * returns the count of the number of base pagesize pages associated with szc
 */
pgcnt_t
page_get_pagecnt(uint_t szc)
{
	if (szc >= mmu_page_sizes)
		panic("page_get_pagecnt: out of range %d", szc);
	return (hw_page_array[szc].hp_pgcnt);
}

size_t
page_get_pagesize(uint_t szc)
{
	if (szc >= mmu_page_sizes)
		panic("page_get_pagesize: out of range %d", szc);
	return (hw_page_array[szc].hp_size);
}

/*
 * Return the size of a page based upon the index passed in.  An index of
 * zero refers to the smallest page size in the system, and as index increases
 * it refers to the next larger supported page size in the system.
 * Note that szc and userszc may not be the same due to unsupported szc's on
 * some systems.
473 */ 474 size_t 475 page_get_user_pagesize(uint_t userszc) 476 { 477 uint_t szc = USERSZC_2_SZC(userszc); 478 479 if (szc >= mmu_page_sizes) 480 panic("page_get_user_pagesize: out of range %d", szc); 481 return (hw_page_array[szc].hp_size); 482 } 483 484 uint_t 485 page_get_shift(uint_t szc) 486 { 487 if (szc >= mmu_page_sizes) 488 panic("page_get_shift: out of range %d", szc); 489 return (PAGE_GET_SHIFT(szc)); 490 } 491 492 uint_t 493 page_get_pagecolors(uint_t szc) 494 { 495 if (szc >= mmu_page_sizes) 496 panic("page_get_pagecolors: out of range %d", szc); 497 return (PAGE_GET_PAGECOLORS(szc)); 498 } 499 500 /* 501 * this assigns the desired equivalent color after a split 502 */ 503 uint_t 504 page_correct_color(uchar_t szc, uchar_t nszc, uint_t color, 505 uint_t ncolor, uint_t ceq_mask) 506 { 507 ASSERT(nszc > szc); 508 ASSERT(szc < mmu_page_sizes); 509 ASSERT(color < PAGE_GET_PAGECOLORS(szc)); 510 ASSERT(ncolor < PAGE_GET_PAGECOLORS(nszc)); 511 512 color &= ceq_mask; 513 ncolor = PAGE_CONVERT_COLOR(ncolor, szc, nszc); 514 return (color | (ncolor & ~ceq_mask)); 515 } 516 517 /* 518 * The interleaved_mnodes flag is set when mnodes overlap in 519 * the physbase..physmax range, but have disjoint slices. 520 * In this case hpm_counters is shared by all mnodes. 521 * This flag is set dynamically by the platform. 522 */ 523 int interleaved_mnodes = 0; 524 525 /* 526 * Called by startup(). 527 * Size up the per page size free list counters based on physmax 528 * of each node and max_mem_nodes. 529 * 530 * If interleaved_mnodes is set we need to find the first mnode that 531 * exists. hpm_counters for the first mnode will then be shared by 532 * all other mnodes. If interleaved_mnodes is not set, just set 533 * first=mnode each time. That means there will be no sharing. 534 */ 535 size_t 536 page_ctrs_sz(void) 537 { 538 int r; /* region size */ 539 int mnode; 540 int firstmn; /* first mnode that exists */ 541 int nranges; 542 pfn_t physbase; 543 pfn_t physmax; 544 uint_t ctrs_sz = 0; 545 int i; 546 pgcnt_t colors_per_szc[MMU_PAGE_SIZES]; 547 548 /* 549 * We need to determine how many page colors there are for each 550 * page size in order to allocate memory for any color specific 551 * arrays. 552 */ 553 for (i = 0; i < mmu_page_sizes; i++) { 554 colors_per_szc[i] = PAGE_GET_PAGECOLORS(i); 555 } 556 557 for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) { 558 559 pgcnt_t r_pgcnt; 560 pfn_t r_base; 561 pgcnt_t r_align; 562 563 if (mem_node_config[mnode].exists == 0) 564 continue; 565 566 HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn); 567 nranges = MNODE_RANGE_CNT(mnode); 568 mnode_nranges[mnode] = nranges; 569 mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode); 570 571 /* 572 * determine size needed for page counter arrays with 573 * base aligned to large page size. 574 */ 575 for (r = 1; r < mmu_page_sizes; r++) { 576 /* add in space for hpm_color_current */ 577 ctrs_sz += sizeof (size_t) * 578 colors_per_szc[r] * nranges; 579 580 if (firstmn != mnode) 581 continue; 582 583 /* add in space for hpm_counters */ 584 r_align = page_get_pagecnt(r); 585 r_base = physbase; 586 r_base &= ~(r_align - 1); 587 r_pgcnt = howmany(physmax - r_base + 1, r_align); 588 589 /* 590 * Round up to always allocate on pointer sized 591 * boundaries. 
592 */ 593 ctrs_sz += P2ROUNDUP((r_pgcnt * sizeof (hpmctr_t)), 594 sizeof (hpmctr_t *)); 595 } 596 } 597 598 for (r = 1; r < mmu_page_sizes; r++) { 599 ctrs_sz += (max_mem_nodes * sizeof (hw_page_map_t)); 600 } 601 602 /* add in space for page_ctrs_cands and pcc_color_free */ 603 ctrs_sz += sizeof (pcc_info_t *) * max_mem_nodes * 604 mmu_page_sizes * NPC_MUTEX; 605 606 for (mnode = 0; mnode < max_mem_nodes; mnode++) { 607 608 if (mem_node_config[mnode].exists == 0) 609 continue; 610 611 nranges = mnode_nranges[mnode]; 612 ctrs_sz += sizeof (pcc_info_t) * nranges * 613 mmu_page_sizes * NPC_MUTEX; 614 for (r = 1; r < mmu_page_sizes; r++) { 615 ctrs_sz += sizeof (pgcnt_t) * nranges * 616 colors_per_szc[r] * NPC_MUTEX; 617 } 618 } 619 620 /* ctr_mutex */ 621 ctrs_sz += (max_mem_nodes * NPC_MUTEX * sizeof (kmutex_t)); 622 623 /* size for page list counts */ 624 PLCNT_SZ(ctrs_sz); 625 626 /* 627 * add some slop for roundups. page_ctrs_alloc will roundup the start 628 * address of the counters to ecache_alignsize boundary for every 629 * memory node. 630 */ 631 return (ctrs_sz + max_mem_nodes * L2CACHE_ALIGN); 632 } 633 634 caddr_t 635 page_ctrs_alloc(caddr_t alloc_base) 636 { 637 int mnode; 638 int mrange, nranges; 639 int r; /* region size */ 640 int i; 641 int firstmn; /* first mnode that exists */ 642 pfn_t physbase; 643 pfn_t physmax; 644 pgcnt_t colors_per_szc[MMU_PAGE_SIZES]; 645 646 /* 647 * We need to determine how many page colors there are for each 648 * page size in order to allocate memory for any color specific 649 * arrays. 650 */ 651 for (i = 0; i < mmu_page_sizes; i++) { 652 colors_per_szc[i] = PAGE_GET_PAGECOLORS(i); 653 } 654 655 for (r = 1; r < mmu_page_sizes; r++) { 656 page_counters[r] = (hw_page_map_t *)alloc_base; 657 alloc_base += (max_mem_nodes * sizeof (hw_page_map_t)); 658 } 659 660 /* page_ctrs_cands and pcc_color_free array */ 661 for (i = 0; i < NPC_MUTEX; i++) { 662 for (r = 1; r < mmu_page_sizes; r++) { 663 664 page_ctrs_cands[i][r] = (pcc_info_t **)alloc_base; 665 alloc_base += sizeof (pcc_info_t *) * max_mem_nodes; 666 667 for (mnode = 0; mnode < max_mem_nodes; mnode++) { 668 pcc_info_t *pi; 669 670 if (mem_node_config[mnode].exists == 0) 671 continue; 672 673 nranges = mnode_nranges[mnode]; 674 675 pi = (pcc_info_t *)alloc_base; 676 alloc_base += sizeof (pcc_info_t) * nranges; 677 page_ctrs_cands[i][r][mnode] = pi; 678 679 for (mrange = 0; mrange < nranges; mrange++) { 680 pi->pcc_color_free = 681 (pgcnt_t *)alloc_base; 682 alloc_base += sizeof (pgcnt_t) * 683 colors_per_szc[r]; 684 pi++; 685 } 686 } 687 } 688 } 689 690 /* ctr_mutex */ 691 for (i = 0; i < NPC_MUTEX; i++) { 692 ctr_mutex[i] = (kmutex_t *)alloc_base; 693 alloc_base += (max_mem_nodes * sizeof (kmutex_t)); 694 } 695 696 /* initialize page list counts */ 697 PLCNT_INIT(alloc_base); 698 699 for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) { 700 701 pgcnt_t r_pgcnt; 702 pfn_t r_base; 703 pgcnt_t r_align; 704 int r_shift; 705 int nranges = mnode_nranges[mnode]; 706 707 if (mem_node_config[mnode].exists == 0) 708 continue; 709 710 HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn); 711 712 for (r = 1; r < mmu_page_sizes; r++) { 713 /* 714 * the page_counters base has to be aligned to the 715 * page count of page size code r otherwise the counts 716 * will cross large page boundaries. 
			 */
			r_align = page_get_pagecnt(r);
			r_base = physbase;
			/* base needs to be aligned - lower to aligned value */
			r_base &= ~(r_align - 1);
			r_pgcnt = howmany(physmax - r_base + 1, r_align);
			r_shift = PAGE_BSZS_SHIFT(r);

			PAGE_COUNTERS_SHIFT(mnode, r) = r_shift;
			PAGE_COUNTERS_ENTRIES(mnode, r) = r_pgcnt;
			PAGE_COUNTERS_BASE(mnode, r) = r_base;
			for (mrange = 0; mrange < nranges; mrange++) {
				PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode,
				    r, mrange) = (size_t *)alloc_base;
				alloc_base += sizeof (size_t) *
				    colors_per_szc[r];
			}
			for (i = 0; i < colors_per_szc[r]; i++) {
				uint_t color_mask = colors_per_szc[r] - 1;
				pfn_t  pfnum = r_base;
				size_t idx;
				int mrange;
				MEM_NODE_ITERATOR_DECL(it);

				MEM_NODE_ITERATOR_INIT(pfnum, mnode, r, &it);
				if (pfnum == (pfn_t)-1) {
					idx = 0;
				} else {
					PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i,
					    color_mask, color_mask, &it);
					idx = PNUM_TO_IDX(mnode, r, pfnum);
					idx = (idx >= r_pgcnt) ? 0 : idx;
				}
				for (mrange = 0; mrange < nranges; mrange++) {
					PAGE_COUNTERS_CURRENT_COLOR(mnode,
					    r, i, mrange) = idx;
				}
			}

			/* hpm_counters may be shared by all mnodes */
			if (firstmn == mnode) {
				PAGE_COUNTERS_COUNTERS(mnode, r) =
				    (hpmctr_t *)alloc_base;
				alloc_base +=
				    P2ROUNDUP((sizeof (hpmctr_t) * r_pgcnt),
				    sizeof (hpmctr_t *));
			} else {
				PAGE_COUNTERS_COUNTERS(mnode, r) =
				    PAGE_COUNTERS_COUNTERS(firstmn, r);
			}

			/*
			 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
			 * satisfy the identity requirement.
			 * We should be able to go from one to the other
			 * and get consistent values.
			 */
			ASSERT(PNUM_TO_IDX(mnode, r,
			    (IDX_TO_PNUM(mnode, r, 0))) == 0);
			ASSERT(IDX_TO_PNUM(mnode, r,
			    (PNUM_TO_IDX(mnode, r, r_base))) == r_base);
		}
		/*
		 * Round up the start address of the page_counters to
		 * a cache aligned boundary for every memory node.
		 * page_ctrs_sz() has added some slop for these roundups.
		 */
		alloc_base = (caddr_t)P2ROUNDUP((uintptr_t)alloc_base,
		    L2CACHE_ALIGN);
	}

	/* Initialize other page counter specific data structures. */
	for (mnode = 0; mnode < MAX_MEM_NODES; mnode++) {
		rw_init(&page_ctrs_rwlock[mnode], NULL, RW_DEFAULT, NULL);
	}

	return (alloc_base);
}

/*
 * Functions to adjust region counters for each size free list.
 * Caller is responsible for acquiring the ctr_mutex lock if necessary and
 * thus these can be called during startup without locks.
 */
/* ARGSUSED */
void
page_ctr_add_internal(int mnode, int mtype, page_t *pp, int flags)
{
	ssize_t		r;	/* region size */
	ssize_t		idx;
	pfn_t		pfnum;
	int		lckidx;

	ASSERT(mnode == PP_2_MEM_NODE(pp));
	ASSERT(mtype == PP_2_MTYPE(pp));

	ASSERT(pp->p_szc < mmu_page_sizes);

	PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);

	/* no counter update needed for largest page size */
	if (pp->p_szc >= mmu_page_sizes - 1) {
		return;
	}

	r = pp->p_szc + 1;
	pfnum = pp->p_pagenum;
	lckidx = PP_CTR_LOCK_INDX(pp);

	/*
	 * Increment the count of free pages for the current
	 * region. Continue looping up in region size, incrementing the
	 * count if the preceding region is full.
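	 *
	 * For example, with the sparc sizes from the page_counters comment
	 * above: freeing the 8K page that brings a 64K region's count up to
	 * FULL_REGION_CNT also increments the count of the enclosing 512K
	 * region, and the now-full 64K region is recorded as a coalesce
	 * candidate in page_ctrs_cands.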
	 */
	while (r < mmu_page_sizes) {
		idx = PNUM_TO_IDX(mnode, r, pfnum);

		ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
		ASSERT(PAGE_COUNTERS(mnode, r, idx) < FULL_REGION_CNT(r));

		if (++PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r)) {
			break;
		} else {
			int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r));
			pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode]
			    [MTYPE_2_MRANGE(mnode, root_mtype)];

			cand->pcc_pages_free++;
			cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]++;
		}
		r++;
	}
}

void
page_ctr_add(int mnode, int mtype, page_t *pp, int flags)
{
	int		lckidx = PP_CTR_LOCK_INDX(pp);
	kmutex_t	*lock = &ctr_mutex[lckidx][mnode];

	mutex_enter(lock);
	page_ctr_add_internal(mnode, mtype, pp, flags);
	mutex_exit(lock);
}

void
page_ctr_sub_internal(int mnode, int mtype, page_t *pp, int flags)
{
	int		lckidx;
	ssize_t		r;	/* region size */
	ssize_t		idx;
	pfn_t		pfnum;

	ASSERT(mnode == PP_2_MEM_NODE(pp));
	ASSERT(mtype == PP_2_MTYPE(pp));

	ASSERT(pp->p_szc < mmu_page_sizes);

	PLCNT_DECR(pp, mnode, mtype, pp->p_szc, flags);

	/* no counter update needed for largest page size */
	if (pp->p_szc >= mmu_page_sizes - 1) {
		return;
	}

	r = pp->p_szc + 1;
	pfnum = pp->p_pagenum;
	lckidx = PP_CTR_LOCK_INDX(pp);

	/*
	 * Decrement the count of free pages for the current
	 * region. Continue looping up in region size, decrementing the
	 * count if the preceding region was full.
	 */
	while (r < mmu_page_sizes) {
		idx = PNUM_TO_IDX(mnode, r, pfnum);

		ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
		ASSERT(PAGE_COUNTERS(mnode, r, idx) > 0);

		if (--PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r) - 1) {
			break;
		} else {
			int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r));
			pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode]
			    [MTYPE_2_MRANGE(mnode, root_mtype)];

			ASSERT(cand->pcc_pages_free != 0);
			ASSERT(cand->pcc_color_free[PP_2_BIN_SZC(pp, r)] != 0);

			cand->pcc_pages_free--;
			cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]--;
		}
		r++;
	}
}

void
page_ctr_sub(int mnode, int mtype, page_t *pp, int flags)
{
	int		lckidx = PP_CTR_LOCK_INDX(pp);
	kmutex_t	*lock = &ctr_mutex[lckidx][mnode];

	mutex_enter(lock);
	page_ctr_sub_internal(mnode, mtype, pp, flags);
	mutex_exit(lock);
}

/*
 * Adjust page counters following a memory attach, since typically the
 * size of the array needs to change, and the PFN to counter index
 * mapping needs to change.
 *
 * It is possible this mnode did not exist at startup. In that case
 * allocate pcc_info_t and pcc_color_free arrays. Also, allow for nranges
 * to change (a theoretical possibility on x86), which means pcc_color_free
 * arrays must be extended.
934 */ 935 uint_t 936 page_ctrs_adjust(int mnode) 937 { 938 pgcnt_t npgs; 939 int r; /* region size */ 940 int i; 941 size_t pcsz, old_csz; 942 hpmctr_t *new_ctr, *old_ctr; 943 pfn_t oldbase, newbase; 944 pfn_t physbase, physmax; 945 size_t old_npgs; 946 hpmctr_t *ctr_cache[MMU_PAGE_SIZES]; 947 size_t size_cache[MMU_PAGE_SIZES]; 948 size_t *color_cache[MMU_PAGE_SIZES][MAX_MNODE_MRANGES]; 949 size_t *old_color_array[MAX_MNODE_MRANGES]; 950 pgcnt_t colors_per_szc[MMU_PAGE_SIZES]; 951 pcc_info_t **cands_cache; 952 pcc_info_t *old_pi, *pi; 953 pgcnt_t *pgcntp; 954 int nr, old_nranges, mrange, nranges = MNODE_RANGE_CNT(mnode); 955 int cands_cache_nranges; 956 int old_maxmrange, new_maxmrange; 957 int rc = 0; 958 int oldmnode; 959 960 cands_cache = kmem_zalloc(sizeof (pcc_info_t *) * NPC_MUTEX * 961 MMU_PAGE_SIZES, KM_NOSLEEP); 962 if (cands_cache == NULL) 963 return (ENOMEM); 964 965 i = -1; 966 HPM_COUNTERS_LIMITS(mnode, physbase, physmax, i); 967 968 newbase = physbase & ~PC_BASE_ALIGN_MASK; 969 npgs = roundup(physmax, PC_BASE_ALIGN) - newbase; 970 971 /* prepare to free non-null pointers on the way out */ 972 cands_cache_nranges = nranges; 973 bzero(ctr_cache, sizeof (ctr_cache)); 974 bzero(color_cache, sizeof (color_cache)); 975 976 /* 977 * We need to determine how many page colors there are for each 978 * page size in order to allocate memory for any color specific 979 * arrays. 980 */ 981 for (r = 0; r < mmu_page_sizes; r++) { 982 colors_per_szc[r] = PAGE_GET_PAGECOLORS(r); 983 } 984 985 /* 986 * Preallocate all of the new hpm_counters arrays as we can't 987 * hold the page_ctrs_rwlock as a writer and allocate memory. 988 * If we can't allocate all of the arrays, undo our work so far 989 * and return failure. 990 */ 991 for (r = 1; r < mmu_page_sizes; r++) { 992 pcsz = npgs >> PAGE_BSZS_SHIFT(r); 993 size_cache[r] = pcsz; 994 ctr_cache[r] = kmem_zalloc(pcsz * 995 sizeof (hpmctr_t), KM_NOSLEEP); 996 if (ctr_cache[r] == NULL) { 997 rc = ENOMEM; 998 goto cleanup; 999 } 1000 } 1001 1002 /* 1003 * Preallocate all of the new color current arrays as we can't 1004 * hold the page_ctrs_rwlock as a writer and allocate memory. 1005 * If we can't allocate all of the arrays, undo our work so far 1006 * and return failure. 1007 */ 1008 for (r = 1; r < mmu_page_sizes; r++) { 1009 for (mrange = 0; mrange < nranges; mrange++) { 1010 color_cache[r][mrange] = kmem_zalloc(sizeof (size_t) * 1011 colors_per_szc[r], KM_NOSLEEP); 1012 if (color_cache[r][mrange] == NULL) { 1013 rc = ENOMEM; 1014 goto cleanup; 1015 } 1016 } 1017 } 1018 1019 /* 1020 * Preallocate all of the new pcc_info_t arrays as we can't 1021 * hold the page_ctrs_rwlock as a writer and allocate memory. 1022 * If we can't allocate all of the arrays, undo our work so far 1023 * and return failure. 1024 */ 1025 for (r = 1; r < mmu_page_sizes; r++) { 1026 for (i = 0; i < NPC_MUTEX; i++) { 1027 pi = kmem_zalloc(nranges * sizeof (pcc_info_t), 1028 KM_NOSLEEP); 1029 if (pi == NULL) { 1030 rc = ENOMEM; 1031 goto cleanup; 1032 } 1033 cands_cache[i * MMU_PAGE_SIZES + r] = pi; 1034 1035 for (mrange = 0; mrange < nranges; mrange++, pi++) { 1036 pgcntp = kmem_zalloc(colors_per_szc[r] * 1037 sizeof (pgcnt_t), KM_NOSLEEP); 1038 if (pgcntp == NULL) { 1039 rc = ENOMEM; 1040 goto cleanup; 1041 } 1042 pi->pcc_color_free = pgcntp; 1043 } 1044 } 1045 } 1046 1047 /* 1048 * Grab the write lock to prevent others from walking these arrays 1049 * while we are modifying them. 
1050 */ 1051 PAGE_CTRS_WRITE_LOCK(mnode); 1052 1053 /* 1054 * For interleaved mnodes, find the first mnode 1055 * with valid page counters since the current 1056 * mnode may have just been added and not have 1057 * valid page counters. 1058 */ 1059 if (interleaved_mnodes) { 1060 for (i = 0; i < max_mem_nodes; i++) 1061 if (PAGE_COUNTERS_COUNTERS(i, 1) != NULL) 1062 break; 1063 ASSERT(i < max_mem_nodes); 1064 oldmnode = i; 1065 } else 1066 oldmnode = mnode; 1067 1068 old_nranges = mnode_nranges[mnode]; 1069 cands_cache_nranges = old_nranges; 1070 mnode_nranges[mnode] = nranges; 1071 old_maxmrange = mnode_maxmrange[mnode]; 1072 mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode); 1073 new_maxmrange = mnode_maxmrange[mnode]; 1074 1075 for (r = 1; r < mmu_page_sizes; r++) { 1076 PAGE_COUNTERS_SHIFT(mnode, r) = PAGE_BSZS_SHIFT(r); 1077 old_ctr = PAGE_COUNTERS_COUNTERS(oldmnode, r); 1078 old_csz = PAGE_COUNTERS_ENTRIES(oldmnode, r); 1079 oldbase = PAGE_COUNTERS_BASE(oldmnode, r); 1080 old_npgs = old_csz << PAGE_COUNTERS_SHIFT(oldmnode, r); 1081 for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) { 1082 old_color_array[mrange] = 1083 PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, 1084 r, mrange); 1085 } 1086 1087 pcsz = npgs >> PAGE_COUNTERS_SHIFT(mnode, r); 1088 new_ctr = ctr_cache[r]; 1089 ctr_cache[r] = NULL; 1090 if (old_ctr != NULL && 1091 (oldbase + old_npgs > newbase) && 1092 (newbase + npgs > oldbase)) { 1093 /* 1094 * Map the intersection of the old and new 1095 * counters into the new array. 1096 */ 1097 size_t offset; 1098 if (newbase > oldbase) { 1099 offset = (newbase - oldbase) >> 1100 PAGE_COUNTERS_SHIFT(mnode, r); 1101 bcopy(old_ctr + offset, new_ctr, 1102 MIN(pcsz, (old_csz - offset)) * 1103 sizeof (hpmctr_t)); 1104 } else { 1105 offset = (oldbase - newbase) >> 1106 PAGE_COUNTERS_SHIFT(mnode, r); 1107 bcopy(old_ctr, new_ctr + offset, 1108 MIN(pcsz - offset, old_csz) * 1109 sizeof (hpmctr_t)); 1110 } 1111 } 1112 1113 PAGE_COUNTERS_COUNTERS(mnode, r) = new_ctr; 1114 PAGE_COUNTERS_ENTRIES(mnode, r) = pcsz; 1115 PAGE_COUNTERS_BASE(mnode, r) = newbase; 1116 1117 /* update shared hpm_counters in other mnodes */ 1118 if (interleaved_mnodes) { 1119 for (i = 0; i < max_mem_nodes; i++) { 1120 if (i == mnode) 1121 continue; 1122 ASSERT( 1123 PAGE_COUNTERS_COUNTERS(i, r) == old_ctr || 1124 PAGE_COUNTERS_COUNTERS(i, r) == NULL); 1125 if (mem_node_config[i].exists == 0) 1126 continue; 1127 PAGE_COUNTERS_COUNTERS(i, r) = new_ctr; 1128 PAGE_COUNTERS_ENTRIES(i, r) = pcsz; 1129 PAGE_COUNTERS_BASE(i, r) = newbase; 1130 } 1131 } 1132 1133 for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) { 1134 PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r, mrange) = 1135 color_cache[r][mrange]; 1136 color_cache[r][mrange] = NULL; 1137 } 1138 /* 1139 * for now, just reset on these events as it's probably 1140 * not worthwhile to try and optimize this. 1141 */ 1142 for (i = 0; i < colors_per_szc[r]; i++) { 1143 uint_t color_mask = colors_per_szc[r] - 1; 1144 int mlo = interleaved_mnodes ? 0 : mnode; 1145 int mhi = interleaved_mnodes ? max_mem_nodes : 1146 (mnode + 1); 1147 int m; 1148 pfn_t pfnum; 1149 size_t idx; 1150 MEM_NODE_ITERATOR_DECL(it); 1151 1152 for (m = mlo; m < mhi; m++) { 1153 if (mem_node_config[m].exists == 0) 1154 continue; 1155 pfnum = newbase; 1156 MEM_NODE_ITERATOR_INIT(pfnum, m, r, &it); 1157 if (pfnum == (pfn_t)-1) { 1158 idx = 0; 1159 } else { 1160 PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i, 1161 color_mask, color_mask, &it); 1162 idx = PNUM_TO_IDX(m, r, pfnum); 1163 idx = (idx < pcsz) ? 
idx : 0; 1164 } 1165 for (mrange = 0; mrange < nranges; mrange++) { 1166 if (PAGE_COUNTERS_CURRENT_COLOR_ARRAY(m, 1167 r, mrange) != NULL) 1168 PAGE_COUNTERS_CURRENT_COLOR(m, 1169 r, i, mrange) = idx; 1170 } 1171 } 1172 } 1173 1174 /* cache info for freeing out of the critical path */ 1175 if ((caddr_t)old_ctr >= kernelheap && 1176 (caddr_t)old_ctr < ekernelheap) { 1177 ctr_cache[r] = old_ctr; 1178 size_cache[r] = old_csz; 1179 } 1180 for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) { 1181 size_t *tmp = old_color_array[mrange]; 1182 if ((caddr_t)tmp >= kernelheap && 1183 (caddr_t)tmp < ekernelheap) { 1184 color_cache[r][mrange] = tmp; 1185 } 1186 } 1187 /* 1188 * Verify that PNUM_TO_IDX and IDX_TO_PNUM 1189 * satisfy the identity requirement. 1190 * We should be able to go from one to the other 1191 * and get consistent values. 1192 */ 1193 ASSERT(PNUM_TO_IDX(mnode, r, 1194 (IDX_TO_PNUM(mnode, r, 0))) == 0); 1195 ASSERT(IDX_TO_PNUM(mnode, r, 1196 (PNUM_TO_IDX(mnode, r, newbase))) == newbase); 1197 1198 /* pcc_info_t and pcc_color_free */ 1199 for (i = 0; i < NPC_MUTEX; i++) { 1200 pcc_info_t *epi; 1201 pcc_info_t *eold_pi; 1202 1203 pi = cands_cache[i * MMU_PAGE_SIZES + r]; 1204 old_pi = page_ctrs_cands[i][r][mnode]; 1205 page_ctrs_cands[i][r][mnode] = pi; 1206 cands_cache[i * MMU_PAGE_SIZES + r] = old_pi; 1207 1208 /* preserve old pcc_color_free values, if any */ 1209 if (old_pi == NULL) 1210 continue; 1211 1212 /* 1213 * when/if x86 does DR, must account for 1214 * possible change in range index when 1215 * preserving pcc_info 1216 */ 1217 epi = &pi[nranges]; 1218 eold_pi = &old_pi[old_nranges]; 1219 if (new_maxmrange > old_maxmrange) { 1220 pi += new_maxmrange - old_maxmrange; 1221 } else if (new_maxmrange < old_maxmrange) { 1222 old_pi += old_maxmrange - new_maxmrange; 1223 } 1224 for (; pi < epi && old_pi < eold_pi; pi++, old_pi++) { 1225 pcc_info_t tmp = *pi; 1226 *pi = *old_pi; 1227 *old_pi = tmp; 1228 } 1229 } 1230 } 1231 PAGE_CTRS_WRITE_UNLOCK(mnode); 1232 1233 /* 1234 * Now that we have dropped the write lock, it is safe to free all 1235 * of the memory we have cached above. 1236 * We come thru here to free memory when pre-alloc fails, and also to 1237 * free old pointers which were recorded while locked. 
1238 */ 1239 cleanup: 1240 for (r = 1; r < mmu_page_sizes; r++) { 1241 if (ctr_cache[r] != NULL) { 1242 kmem_free(ctr_cache[r], 1243 size_cache[r] * sizeof (hpmctr_t)); 1244 } 1245 for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) { 1246 if (color_cache[r][mrange] != NULL) { 1247 kmem_free(color_cache[r][mrange], 1248 colors_per_szc[r] * sizeof (size_t)); 1249 } 1250 } 1251 for (i = 0; i < NPC_MUTEX; i++) { 1252 pi = cands_cache[i * MMU_PAGE_SIZES + r]; 1253 if (pi == NULL) 1254 continue; 1255 nr = cands_cache_nranges; 1256 for (mrange = 0; mrange < nr; mrange++, pi++) { 1257 pgcntp = pi->pcc_color_free; 1258 if (pgcntp == NULL) 1259 continue; 1260 if ((caddr_t)pgcntp >= kernelheap && 1261 (caddr_t)pgcntp < ekernelheap) { 1262 kmem_free(pgcntp, 1263 colors_per_szc[r] * 1264 sizeof (pgcnt_t)); 1265 } 1266 } 1267 pi = cands_cache[i * MMU_PAGE_SIZES + r]; 1268 if ((caddr_t)pi >= kernelheap && 1269 (caddr_t)pi < ekernelheap) { 1270 kmem_free(pi, nr * sizeof (pcc_info_t)); 1271 } 1272 } 1273 } 1274 1275 kmem_free(cands_cache, 1276 sizeof (pcc_info_t *) * NPC_MUTEX * MMU_PAGE_SIZES); 1277 return (rc); 1278 } 1279 1280 1281 #ifdef DEBUG 1282 1283 /* 1284 * confirm pp is a large page corresponding to szc 1285 */ 1286 void 1287 chk_lpg(page_t *pp, uchar_t szc) 1288 { 1289 spgcnt_t npgs = page_get_pagecnt(pp->p_szc); 1290 uint_t noreloc; 1291 1292 if (npgs == 1) { 1293 ASSERT(pp->p_szc == 0); 1294 ASSERT(pp->p_next == pp); 1295 ASSERT(pp->p_prev == pp); 1296 return; 1297 } 1298 1299 ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL); 1300 ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL); 1301 1302 ASSERT(IS_P2ALIGNED(pp->p_pagenum, npgs)); 1303 ASSERT(pp->p_pagenum == (pp->p_next->p_pagenum - 1)); 1304 ASSERT(pp->p_prev->p_pagenum == (pp->p_pagenum + (npgs - 1))); 1305 ASSERT(pp->p_prev == (pp + (npgs - 1))); 1306 1307 /* 1308 * Check list of pages. 1309 */ 1310 noreloc = PP_ISNORELOC(pp); 1311 while (npgs--) { 1312 if (npgs != 0) { 1313 ASSERT(pp->p_pagenum == pp->p_next->p_pagenum - 1); 1314 ASSERT(pp->p_next == (pp + 1)); 1315 } 1316 ASSERT(pp->p_szc == szc); 1317 ASSERT(PP_ISFREE(pp)); 1318 ASSERT(PP_ISAGED(pp)); 1319 ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL); 1320 ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL); 1321 ASSERT(pp->p_vnode == NULL); 1322 ASSERT(PP_ISNORELOC(pp) == noreloc); 1323 1324 pp = pp->p_next; 1325 } 1326 } 1327 #endif /* DEBUG */ 1328 1329 void 1330 page_freelist_lock(int mnode) 1331 { 1332 int i; 1333 for (i = 0; i < NPC_MUTEX; i++) { 1334 mutex_enter(FPC_MUTEX(mnode, i)); 1335 mutex_enter(CPC_MUTEX(mnode, i)); 1336 } 1337 } 1338 1339 void 1340 page_freelist_unlock(int mnode) 1341 { 1342 int i; 1343 for (i = 0; i < NPC_MUTEX; i++) { 1344 mutex_exit(FPC_MUTEX(mnode, i)); 1345 mutex_exit(CPC_MUTEX(mnode, i)); 1346 } 1347 } 1348 1349 /* 1350 * add pp to the specified page list. Defaults to head of the page list 1351 * unless PG_LIST_TAIL is specified. 1352 */ 1353 void 1354 page_list_add(page_t *pp, int flags) 1355 { 1356 page_t **ppp; 1357 kmutex_t *pcm; 1358 uint_t bin, mtype; 1359 int mnode; 1360 1361 ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT)); 1362 ASSERT(PP_ISFREE(pp)); 1363 ASSERT(!hat_page_is_mapped(pp)); 1364 ASSERT(hat_page_getshare(pp) == 0); 1365 1366 /* 1367 * Large pages should be freed via page_list_add_pages(). 1368 */ 1369 ASSERT(pp->p_szc == 0); 1370 1371 /* 1372 * Don't need to lock the freelist first here 1373 * because the page isn't on the freelist yet. 1374 * This means p_szc can't change on us. 
1375 */ 1376 1377 bin = PP_2_BIN(pp); 1378 mnode = PP_2_MEM_NODE(pp); 1379 mtype = PP_2_MTYPE(pp); 1380 1381 if (flags & PG_LIST_ISINIT) { 1382 /* 1383 * PG_LIST_ISINIT is set during system startup (ie. single 1384 * threaded), add a page to the free list and add to the 1385 * the free region counters w/o any locking 1386 */ 1387 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype); 1388 1389 /* inline version of page_add() */ 1390 if (*ppp != NULL) { 1391 pp->p_next = *ppp; 1392 pp->p_prev = (*ppp)->p_prev; 1393 (*ppp)->p_prev = pp; 1394 pp->p_prev->p_next = pp; 1395 } else 1396 *ppp = pp; 1397 1398 page_ctr_add_internal(mnode, mtype, pp, flags); 1399 VM_STAT_ADD(vmm_vmstats.pladd_free[0]); 1400 } else { 1401 pcm = PC_BIN_MUTEX(mnode, bin, flags); 1402 1403 if (flags & PG_FREE_LIST) { 1404 VM_STAT_ADD(vmm_vmstats.pladd_free[0]); 1405 ASSERT(PP_ISAGED(pp)); 1406 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype); 1407 1408 } else { 1409 VM_STAT_ADD(vmm_vmstats.pladd_cache); 1410 ASSERT(pp->p_vnode); 1411 ASSERT((pp->p_offset & PAGEOFFSET) == 0); 1412 ppp = &PAGE_CACHELISTS(mnode, bin, mtype); 1413 } 1414 mutex_enter(pcm); 1415 page_add(ppp, pp); 1416 1417 if (flags & PG_LIST_TAIL) 1418 *ppp = (*ppp)->p_next; 1419 /* 1420 * Add counters before releasing pcm mutex to avoid a race with 1421 * page_freelist_coalesce and page_freelist_split. 1422 */ 1423 page_ctr_add(mnode, mtype, pp, flags); 1424 mutex_exit(pcm); 1425 } 1426 1427 1428 #if defined(__sparc) 1429 if (PP_ISNORELOC(pp)) { 1430 kcage_freemem_add(1); 1431 } 1432 #endif 1433 /* 1434 * It is up to the caller to unlock the page! 1435 */ 1436 ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT)); 1437 } 1438 1439 1440 #ifdef __sparc 1441 /* 1442 * This routine is only used by kcage_init during system startup. 1443 * It performs the function of page_list_sub/PP_SETNORELOC/page_list_add 1444 * without the overhead of taking locks and updating counters. 1445 */ 1446 void 1447 page_list_noreloc_startup(page_t *pp) 1448 { 1449 page_t **ppp; 1450 uint_t bin; 1451 int mnode; 1452 int mtype; 1453 int flags = 0; 1454 1455 /* 1456 * If this is a large page on the freelist then 1457 * break it up into smaller pages. 1458 */ 1459 if (pp->p_szc != 0) 1460 page_boot_demote(pp); 1461 1462 /* 1463 * Get list page is currently on. 1464 */ 1465 bin = PP_2_BIN(pp); 1466 mnode = PP_2_MEM_NODE(pp); 1467 mtype = PP_2_MTYPE(pp); 1468 ASSERT(mtype == MTYPE_RELOC); 1469 ASSERT(pp->p_szc == 0); 1470 1471 if (PP_ISAGED(pp)) { 1472 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype); 1473 flags |= PG_FREE_LIST; 1474 } else { 1475 ppp = &PAGE_CACHELISTS(mnode, bin, mtype); 1476 flags |= PG_CACHE_LIST; 1477 } 1478 1479 ASSERT(*ppp != NULL); 1480 1481 /* 1482 * Delete page from current list. 1483 */ 1484 if (*ppp == pp) 1485 *ppp = pp->p_next; /* go to next page */ 1486 if (*ppp == pp) { 1487 *ppp = NULL; /* page list is gone */ 1488 } else { 1489 pp->p_prev->p_next = pp->p_next; 1490 pp->p_next->p_prev = pp->p_prev; 1491 } 1492 1493 /* 1494 * Decrement page counters 1495 */ 1496 page_ctr_sub_internal(mnode, mtype, pp, flags); 1497 1498 /* 1499 * Set no reloc for cage initted pages. 1500 */ 1501 PP_SETNORELOC(pp); 1502 1503 mtype = PP_2_MTYPE(pp); 1504 ASSERT(mtype == MTYPE_NORELOC); 1505 1506 /* 1507 * Get new list for page. 1508 */ 1509 if (PP_ISAGED(pp)) { 1510 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype); 1511 } else { 1512 ppp = &PAGE_CACHELISTS(mnode, bin, mtype); 1513 } 1514 1515 /* 1516 * Insert page on new list. 
	 */
	if (*ppp == NULL) {
		*ppp = pp;
		pp->p_next = pp->p_prev = pp;
	} else {
		pp->p_next = *ppp;
		pp->p_prev = (*ppp)->p_prev;
		(*ppp)->p_prev = pp;
		pp->p_prev->p_next = pp;
	}

	/*
	 * Increment page counters
	 */
	page_ctr_add_internal(mnode, mtype, pp, flags);

	/*
	 * Update cage freemem counter
	 */
	atomic_add_long(&kcage_freemem, 1);
}
#else /* __sparc */

/* ARGSUSED */
void
page_list_noreloc_startup(page_t *pp)
{
	panic("page_list_noreloc_startup: should be here only for sparc");
}
#endif

void
page_list_add_pages(page_t *pp, int flags)
{
	kmutex_t *pcm;
	pgcnt_t	pgcnt;
	uint_t	bin, mtype, i;
	int	mnode;

	/* default to freelist/head */
	ASSERT((flags & (PG_CACHE_LIST | PG_LIST_TAIL)) == 0);

	CHK_LPG(pp, pp->p_szc);
	VM_STAT_ADD(vmm_vmstats.pladd_free[pp->p_szc]);

	bin = PP_2_BIN(pp);
	mnode = PP_2_MEM_NODE(pp);
	mtype = PP_2_MTYPE(pp);

	if (flags & PG_LIST_ISINIT) {
		ASSERT(pp->p_szc == mmu_page_sizes - 1);
		page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
		ASSERT(!PP_ISNORELOC(pp));
		PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);
	} else {

		ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);

		pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);

		mutex_enter(pcm);
		page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
		page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
		mutex_exit(pcm);

		pgcnt = page_get_pagecnt(pp->p_szc);
#if defined(__sparc)
		if (PP_ISNORELOC(pp))
			kcage_freemem_add(pgcnt);
#endif
		for (i = 0; i < pgcnt; i++, pp++)
			page_unlock_nocapture(pp);
	}
}

/*
 * During boot, need to demote a large page to base
 * pagesize pages for seg_kmem for use in boot_alloc()
 */
void
page_boot_demote(page_t *pp)
{
	ASSERT(pp->p_szc != 0);
	ASSERT(PP_ISFREE(pp));
	ASSERT(PP_ISAGED(pp));

	(void) page_demote(PP_2_MEM_NODE(pp),
	    PFN_BASE(pp->p_pagenum, pp->p_szc), 0, pp->p_szc, 0, PC_NO_COLOR,
	    PC_FREE);

	ASSERT(PP_ISFREE(pp));
	ASSERT(PP_ISAGED(pp));
	ASSERT(pp->p_szc == 0);
}

/*
 * Take a particular page off of whatever freelist the page
 * is claimed to be on.
 *
 * NOTE: Only used for PAGESIZE pages.
 */
void
page_list_sub(page_t *pp, int flags)
{
	int		bin;
	uint_t		mtype;
	int		mnode;
	kmutex_t	*pcm;
	page_t		**ppp;

	ASSERT(PAGE_EXCL(pp));
	ASSERT(PP_ISFREE(pp));

	/*
	 * The p_szc field can only be changed by page_promote()
	 * and page_demote(). Only free pages can be promoted and
	 * demoted and the free list MUST be locked during these
	 * operations. So to prevent a race in page_list_sub()
	 * between computing which bin of the freelist lock to
	 * grab and actually grabbing the lock we check again that
	 * the bin we locked is still the correct one. Notice that
	 * the p_szc field could have actually changed on us but
	 * if the bin happens to still be the same we are safe.
1640 */ 1641 try_again: 1642 bin = PP_2_BIN(pp); 1643 mnode = PP_2_MEM_NODE(pp); 1644 pcm = PC_BIN_MUTEX(mnode, bin, flags); 1645 mutex_enter(pcm); 1646 if (PP_2_BIN(pp) != bin) { 1647 mutex_exit(pcm); 1648 goto try_again; 1649 } 1650 mtype = PP_2_MTYPE(pp); 1651 1652 if (flags & PG_FREE_LIST) { 1653 VM_STAT_ADD(vmm_vmstats.plsub_free[0]); 1654 ASSERT(PP_ISAGED(pp)); 1655 ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype); 1656 } else { 1657 VM_STAT_ADD(vmm_vmstats.plsub_cache); 1658 ASSERT(!PP_ISAGED(pp)); 1659 ppp = &PAGE_CACHELISTS(mnode, bin, mtype); 1660 } 1661 1662 /* 1663 * Common PAGESIZE case. 1664 * 1665 * Note that we locked the freelist. This prevents 1666 * any page promotion/demotion operations. Therefore 1667 * the p_szc will not change until we drop pcm mutex. 1668 */ 1669 if (pp->p_szc == 0) { 1670 page_sub(ppp, pp); 1671 /* 1672 * Subtract counters before releasing pcm mutex 1673 * to avoid race with page_freelist_coalesce. 1674 */ 1675 page_ctr_sub(mnode, mtype, pp, flags); 1676 mutex_exit(pcm); 1677 1678 #if defined(__sparc) 1679 if (PP_ISNORELOC(pp)) { 1680 kcage_freemem_sub(1); 1681 } 1682 #endif 1683 return; 1684 } 1685 1686 /* 1687 * Large pages on the cache list are not supported. 1688 */ 1689 if (flags & PG_CACHE_LIST) 1690 panic("page_list_sub: large page on cachelist"); 1691 1692 /* 1693 * Slow but rare. 1694 * 1695 * Somebody wants this particular page which is part 1696 * of a large page. In this case we just demote the page 1697 * if it's on the freelist. 1698 * 1699 * We have to drop pcm before locking the entire freelist. 1700 * Once we have re-locked the freelist check to make sure 1701 * the page hasn't already been demoted or completely 1702 * freed. 1703 */ 1704 mutex_exit(pcm); 1705 page_freelist_lock(mnode); 1706 if (pp->p_szc != 0) { 1707 /* 1708 * Large page is on freelist. 1709 */ 1710 (void) page_demote(mnode, PFN_BASE(pp->p_pagenum, pp->p_szc), 1711 0, pp->p_szc, 0, PC_NO_COLOR, PC_FREE); 1712 } 1713 ASSERT(PP_ISFREE(pp)); 1714 ASSERT(PP_ISAGED(pp)); 1715 ASSERT(pp->p_szc == 0); 1716 1717 /* 1718 * Subtract counters before releasing pcm mutex 1719 * to avoid race with page_freelist_coalesce. 1720 */ 1721 bin = PP_2_BIN(pp); 1722 mtype = PP_2_MTYPE(pp); 1723 ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype); 1724 1725 page_sub(ppp, pp); 1726 page_ctr_sub(mnode, mtype, pp, flags); 1727 page_freelist_unlock(mnode); 1728 1729 #if defined(__sparc) 1730 if (PP_ISNORELOC(pp)) { 1731 kcage_freemem_sub(1); 1732 } 1733 #endif 1734 } 1735 1736 void 1737 page_list_sub_pages(page_t *pp, uint_t szc) 1738 { 1739 kmutex_t *pcm; 1740 uint_t bin, mtype; 1741 int mnode; 1742 1743 ASSERT(PAGE_EXCL(pp)); 1744 ASSERT(PP_ISFREE(pp)); 1745 ASSERT(PP_ISAGED(pp)); 1746 1747 /* 1748 * See comment in page_list_sub(). 1749 */ 1750 try_again: 1751 bin = PP_2_BIN(pp); 1752 mnode = PP_2_MEM_NODE(pp); 1753 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST); 1754 mutex_enter(pcm); 1755 if (PP_2_BIN(pp) != bin) { 1756 mutex_exit(pcm); 1757 goto try_again; 1758 } 1759 1760 /* 1761 * If we're called with a page larger than szc or it got 1762 * promoted above szc before we locked the freelist then 1763 * drop pcm and re-lock entire freelist. If page still larger 1764 * than szc then demote it. 
	 */
	if (pp->p_szc > szc) {
		mutex_exit(pcm);
		pcm = NULL;
		page_freelist_lock(mnode);
		if (pp->p_szc > szc) {
			VM_STAT_ADD(vmm_vmstats.plsubpages_szcbig);
			(void) page_demote(mnode,
			    PFN_BASE(pp->p_pagenum, pp->p_szc), 0,
			    pp->p_szc, szc, PC_NO_COLOR, PC_FREE);
		}
		bin = PP_2_BIN(pp);
	}
	ASSERT(PP_ISFREE(pp));
	ASSERT(PP_ISAGED(pp));
	ASSERT(pp->p_szc <= szc);
	ASSERT(pp == PP_PAGEROOT(pp));

	VM_STAT_ADD(vmm_vmstats.plsub_free[pp->p_szc]);

	mtype = PP_2_MTYPE(pp);
	if (pp->p_szc != 0) {
		page_vpsub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
		CHK_LPG(pp, pp->p_szc);
	} else {
		VM_STAT_ADD(vmm_vmstats.plsubpages_szc0);
		page_sub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
	}
	page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);

	if (pcm != NULL) {
		mutex_exit(pcm);
	} else {
		page_freelist_unlock(mnode);
	}

#if defined(__sparc)
	if (PP_ISNORELOC(pp)) {
		pgcnt_t	pgcnt;

		pgcnt = page_get_pagecnt(pp->p_szc);
		kcage_freemem_sub(pgcnt);
	}
#endif
}

/*
 * Add the page to the front of a linked list of pages
 * using the p_next & p_prev pointers for the list.
 * The caller is responsible for protecting the list pointers.
 */
void
mach_page_add(page_t **ppp, page_t *pp)
{
	if (*ppp == NULL) {
		pp->p_next = pp->p_prev = pp;
	} else {
		pp->p_next = *ppp;
		pp->p_prev = (*ppp)->p_prev;
		(*ppp)->p_prev = pp;
		pp->p_prev->p_next = pp;
	}
	*ppp = pp;
}

/*
 * Remove this page from a linked list of pages
 * using the p_next & p_prev pointers for the list.
 *
 * The caller is responsible for protecting the list pointers.
 */
void
mach_page_sub(page_t **ppp, page_t *pp)
{
	ASSERT(PP_ISFREE(pp));

	if (*ppp == NULL || pp == NULL)
		panic("mach_page_sub");

	if (*ppp == pp)
		*ppp = pp->p_next;		/* go to next page */

	if (*ppp == pp)
		*ppp = NULL;			/* page list is gone */
	else {
		pp->p_prev->p_next = pp->p_next;
		pp->p_next->p_prev = pp->p_prev;
	}
	pp->p_prev = pp->p_next = pp;		/* make pp a list of one */
}

/*
 * Routine fsflush uses to gradually coalesce the free list into larger pages.
 */
void
page_promote_size(page_t *pp, uint_t cur_szc)
{
	pfn_t pfn;
	int mnode;
	int idx;
	int new_szc = cur_szc + 1;
	int full = FULL_REGION_CNT(new_szc);

	pfn = page_pptonum(pp);
	mnode = PFN_2_MEM_NODE(pfn);

	page_freelist_lock(mnode);

	idx = PNUM_TO_IDX(mnode, new_szc, pfn);
	if (PAGE_COUNTERS(mnode, new_szc, idx) == full)
		(void) page_promote(mnode, pfn, new_szc, PC_FREE, PC_MTYPE_ANY);

	page_freelist_unlock(mnode);
}

static uint_t page_promote_err;
static uint_t page_promote_noreloc_err;

/*
 * Create a single larger page (of szc new_szc) from smaller contiguous pages
 * for the given mnode starting at pfnum. Pages involved are on the freelist
 * before the call and may be returned to the caller if requested, otherwise
 * they will be placed back on the freelist.
 * If flags is PC_ALLOC, then the large page will be returned to the user in
 * a state which is consistent with a page being taken off the freelist.  If
 * we failed to lock the new large page, then we will return NULL to the
 * caller and put the large page on the freelist instead.
 * If flags is PC_FREE, then the large page will be placed on the freelist,
 * and NULL will be returned.
 * The caller is responsible for locking the freelist as well as any other
 * accounting which needs to be done for a returned page.
 *
 * RFE: For performance pass in pp instead of pfnum so
 *	we can avoid excessive calls to page_numtopp_nolock().
 *	This would depend on an assumption that all contiguous
 *	pages are in the same memseg so we can just add/dec
 *	our pp.
 *
 * Lock ordering:
 *
 *	There is a potential but rare deadlock situation
 *	for page promotion and demotion operations. The problem
 *	is there are two paths into the freelist manager and
 *	they have different lock orders:
 *
 *	page_create()
 *		lock freelist
 *		page_lock(EXCL)
 *		unlock freelist
 *		return
 *		caller drops page_lock
 *
 *	page_free() and page_reclaim()
 *		caller grabs page_lock(EXCL)
 *
 *		lock freelist
 *		unlock freelist
 *		drop page_lock
 *
 *	What prevents a thread in page_create() from deadlocking
 *	with a thread freeing or reclaiming the same page is the
 *	page_trylock() in page_get_freelist(). If the trylock fails
 *	it skips the page.
 *
 *	The lock ordering for promotion and demotion is the same as
 *	for page_create(). Since the same deadlock could occur during
 *	page promotion and freeing or reclaiming of a page on the
 *	cache list we might have to fail the operation and undo what
 *	we have done so far. Again this is rare.
 */
page_t *
page_promote(int mnode, pfn_t pfnum, uchar_t new_szc, int flags, int mtype)
{
	page_t		*pp, *pplist, *tpp, *start_pp;
	pgcnt_t		new_npgs, npgs;
	uint_t		bin;
	pgcnt_t		tmpnpgs, pages_left;
	uint_t		noreloc;
	int		which_list;
	ulong_t		index;
	kmutex_t	*phm;

	/*
	 * General algorithm:
	 * Find the starting page
	 * Walk each page struct removing it from the freelist,
	 * and linking it to all the other pages removed.
	 * Once all pages are off the freelist,
	 * walk the list, modifying p_szc to new_szc and what
	 * ever other info needs to be done to create a large free page.
	 * According to the flags, either return the page or put it
	 * on the freelist.
	 */

	start_pp = page_numtopp_nolock(pfnum);
	ASSERT(start_pp && (start_pp->p_pagenum == pfnum));
	new_npgs = page_get_pagecnt(new_szc);
	ASSERT(IS_P2ALIGNED(pfnum, new_npgs));

	/* don't return page of the wrong mtype */
	if (mtype != PC_MTYPE_ANY && mtype != PP_2_MTYPE(start_pp))
		return (NULL);

	/*
	 * Loop through smaller pages to confirm that all pages
	 * give the same result for PP_ISNORELOC().
	 * We can check this reliably here as the protocol for setting
	 * P_NORELOC requires pages to be taken off the free list first.
	 */
	noreloc = PP_ISNORELOC(start_pp);
	for (pp = start_pp + new_npgs; --pp > start_pp; ) {
		if (noreloc != PP_ISNORELOC(pp)) {
			page_promote_noreloc_err++;
			page_promote_err++;
			return (NULL);
		}
	}

	pages_left = new_npgs;
	pplist = NULL;
	pp = start_pp;

	/* Loop around coalescing the smaller pages into a big page. */
	while (pages_left) {
		/*
		 * Remove from the freelist.
		 */
		ASSERT(PP_ISFREE(pp));
		bin = PP_2_BIN(pp);
		ASSERT(mnode == PP_2_MEM_NODE(pp));
		mtype = PP_2_MTYPE(pp);
		if (PP_ISAGED(pp)) {

			/*
			 * PG_FREE_LIST
			 */
			if (pp->p_szc) {
				page_vpsub(&PAGE_FREELISTS(mnode,
				    pp->p_szc, bin, mtype), pp);
			} else {
				mach_page_sub(&PAGE_FREELISTS(mnode, 0,
				    bin, mtype), pp);
			}
			which_list = PG_FREE_LIST;
		} else {
			ASSERT(pp->p_szc == 0);

			/*
			 * PG_CACHE_LIST
			 *
			 * Since this page comes from the
			 * cachelist, we must destroy the
			 * vnode association.
			 */
			if (!page_trylock(pp, SE_EXCL)) {
				goto fail_promote;
			}

			/*
			 * We need to be careful not to deadlock
			 * with another thread in page_lookup().
			 * The page_lookup() thread could be holding
			 * the same phm that we need if the two
			 * pages happen to hash to the same phm lock.
			 * At this point we have locked the entire
			 * freelist and page_lookup() could be trying
			 * to grab a freelist lock.
			 */
			index = PAGE_HASH_FUNC(pp->p_vnode, pp->p_offset);
			phm = PAGE_HASH_MUTEX(index);
			if (!mutex_tryenter(phm)) {
				page_unlock_nocapture(pp);
				goto fail_promote;
			}

			mach_page_sub(&PAGE_CACHELISTS(mnode, bin, mtype), pp);
			page_hashout(pp, phm);
			mutex_exit(phm);
			PP_SETAGED(pp);
			page_unlock_nocapture(pp);
			which_list = PG_CACHE_LIST;
		}
		page_ctr_sub(mnode, mtype, pp, which_list);

		/*
		 * Concatenate the smaller page(s) onto
		 * the large page list.
		 */
		tmpnpgs = npgs = page_get_pagecnt(pp->p_szc);
		pages_left -= npgs;
		tpp = pp;
		while (npgs--) {
			tpp->p_szc = new_szc;
			tpp = tpp->p_next;
		}
		page_list_concat(&pplist, &pp);
		pp += tmpnpgs;
	}
	CHK_LPG(pplist, new_szc);

	/*
	 * return the page to the user if requested
	 * in the properly locked state.
	 */
	if (flags == PC_ALLOC && (page_trylock_cons(pplist, SE_EXCL))) {
		return (pplist);
	}

	/*
	 * Otherwise place the new large page on the freelist
	 */
	bin = PP_2_BIN(pplist);
	mnode = PP_2_MEM_NODE(pplist);
	mtype = PP_2_MTYPE(pplist);
	page_vpadd(&PAGE_FREELISTS(mnode, new_szc, bin, mtype), pplist);

	page_ctr_add(mnode, mtype, pplist, PG_FREE_LIST);
	return (NULL);

fail_promote:
	/*
	 * A thread must have still been freeing or
	 * reclaiming the page on the cachelist.
	 * To prevent a deadlock undo what we have
	 * done so far and return failure. This
	 * situation can only happen while promoting
	 * PAGESIZE pages.
	 */
	page_promote_err++;
	while (pplist) {
		pp = pplist;
		mach_page_sub(&pplist, pp);
		pp->p_szc = 0;
		bin = PP_2_BIN(pp);
		mtype = PP_2_MTYPE(pp);
		mach_page_add(&PAGE_FREELISTS(mnode, 0, bin, mtype), pp);
		page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
	}
	return (NULL);

}

/*
 * Break up a large page into smaller size pages.
 * Pages involved are on the freelist before the call and may
 * be returned to the caller if requested, otherwise they will
 * be placed back on the freelist.
 * The caller is responsible for locking the freelist as well as any other
 * accounting which needs to be done for a returned page.
2114 * If flags is not PC_ALLOC, the color argument is ignored, and thus 2115 * technically, any value may be passed in but PC_NO_COLOR is the standard 2116 * which should be followed for clarity's sake. 2117 * Returns a page whose pfn is < pfnmax 2118 */ 2119 page_t * 2120 page_demote(int mnode, pfn_t pfnum, pfn_t pfnmax, uchar_t cur_szc, 2121 uchar_t new_szc, int color, int flags) 2122 { 2123 page_t *pp, *pplist, *npplist; 2124 pgcnt_t npgs, n; 2125 uint_t bin; 2126 uint_t mtype; 2127 page_t *ret_pp = NULL; 2128 2129 ASSERT(cur_szc != 0); 2130 ASSERT(new_szc < cur_szc); 2131 2132 pplist = page_numtopp_nolock(pfnum); 2133 ASSERT(pplist != NULL); 2134 2135 ASSERT(pplist->p_szc == cur_szc); 2136 2137 bin = PP_2_BIN(pplist); 2138 ASSERT(mnode == PP_2_MEM_NODE(pplist)); 2139 mtype = PP_2_MTYPE(pplist); 2140 page_vpsub(&PAGE_FREELISTS(mnode, cur_szc, bin, mtype), pplist); 2141 2142 CHK_LPG(pplist, cur_szc); 2143 page_ctr_sub(mnode, mtype, pplist, PG_FREE_LIST); 2144 2145 /* 2146 * Number of PAGESIZE pages for smaller new_szc 2147 * page. 2148 */ 2149 npgs = page_get_pagecnt(new_szc); 2150 2151 while (pplist) { 2152 pp = pplist; 2153 2154 ASSERT(pp->p_szc == cur_szc); 2155 2156 /* 2157 * We either break it up into PAGESIZE pages or larger. 2158 */ 2159 if (npgs == 1) { /* PAGESIZE case */ 2160 mach_page_sub(&pplist, pp); 2161 ASSERT(pp->p_szc == cur_szc); 2162 ASSERT(new_szc == 0); 2163 ASSERT(mnode == PP_2_MEM_NODE(pp)); 2164 pp->p_szc = new_szc; 2165 bin = PP_2_BIN(pp); 2166 if ((bin == color) && (flags == PC_ALLOC) && 2167 (ret_pp == NULL) && (pfnmax == 0 || 2168 pp->p_pagenum < pfnmax) && 2169 page_trylock_cons(pp, SE_EXCL)) { 2170 ret_pp = pp; 2171 } else { 2172 mtype = PP_2_MTYPE(pp); 2173 mach_page_add(&PAGE_FREELISTS(mnode, 0, bin, 2174 mtype), pp); 2175 page_ctr_add(mnode, mtype, pp, PG_FREE_LIST); 2176 } 2177 } else { 2178 page_t *try_to_return_this_page = NULL; 2179 int count = 0; 2180 2181 /* 2182 * Break down into smaller lists of pages. 2183 */ 2184 page_list_break(&pplist, &npplist, npgs); 2185 2186 pp = pplist; 2187 n = npgs; 2188 while (n--) { 2189 ASSERT(pp->p_szc == cur_szc); 2190 /* 2191 * Check whether all the pages in this list 2192 * fit the request criteria. 2193 */ 2194 if (pfnmax == 0 || pp->p_pagenum < pfnmax) { 2195 count++; 2196 } 2197 pp->p_szc = new_szc; 2198 pp = pp->p_next; 2199 } 2200 2201 if (count == npgs && 2202 (pfnmax == 0 || pp->p_pagenum < pfnmax)) { 2203 try_to_return_this_page = pp; 2204 } 2205 2206 CHK_LPG(pplist, new_szc); 2207 2208 bin = PP_2_BIN(pplist); 2209 if (try_to_return_this_page) 2210 ASSERT(mnode == 2211 PP_2_MEM_NODE(try_to_return_this_page)); 2212 if ((bin == color) && (flags == PC_ALLOC) && 2213 (ret_pp == NULL) && try_to_return_this_page && 2214 page_trylock_cons(try_to_return_this_page, 2215 SE_EXCL)) { 2216 ret_pp = try_to_return_this_page; 2217 } else { 2218 mtype = PP_2_MTYPE(pp); 2219 page_vpadd(&PAGE_FREELISTS(mnode, new_szc, 2220 bin, mtype), pplist); 2221 2222 page_ctr_add(mnode, mtype, pplist, 2223 PG_FREE_LIST); 2224 } 2225 pplist = npplist; 2226 } 2227 } 2228 return (ret_pp); 2229 } 2230 2231 int mpss_coalesce_disable = 0; 2232 2233 /* 2234 * Coalesce free pages into a page of the given szc and color if possible. 2235 * Return the pointer to the page created, otherwise, return NULL. 2236 * 2237 * If pfnhi is non-zero, search for large page with pfn range less than pfnhi. 
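page_freelist_coalesce() below starts by clipping the mtype pfn range to any caller-supplied upper bound and aligning both ends to the large-page size, so that only whole, properly aligned candidate regions are scanned. A standalone sketch of that clipping arithmetic, with local round-up/round-down helpers standing in for P2ROUNDUP and the mask operation, and made-up example values:

#include <stdio.h>

/* Round a pfn up/down to a power-of-two page count, as the code below does. */
static unsigned long
round_up(unsigned long pfn, unsigned long pgcnt)
{
	return ((pfn + pgcnt - 1) & ~(pgcnt - 1));
}

static unsigned long
round_down(unsigned long pfn, unsigned long pgcnt)
{
	return (pfn & ~(pgcnt - 1));
}

int
main(void)
{
	unsigned long lo = 0x1003, hi = 0x5ffe;	/* raw pfn range (made up) */
	unsigned long szcpgcnt = 0x200;	/* pages per large page (made up) */

	lo = round_up(lo, szcpgcnt);	/* 0x1200: first aligned candidate */
	hi = round_down(hi, szcpgcnt);	/* 0x5e00: end of last full region */

	if (hi <= lo)
		printf("range too small for this page size\n");
	else
		printf("search [0x%lx, 0x%lx), %lu candidates\n",
		    lo, hi, (hi - lo) / szcpgcnt);
	return (0);
}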
2238 */ 2239 page_t * 2240 page_freelist_coalesce(int mnode, uchar_t szc, uint_t color, uint_t ceq_mask, 2241 int mtype, pfn_t pfnhi) 2242 { 2243 int r = szc; /* region size */ 2244 int mrange; 2245 uint_t full, bin, color_mask, wrap = 0; 2246 pfn_t pfnum, lo, hi; 2247 size_t len, idx, idx0; 2248 pgcnt_t cands = 0, szcpgcnt = page_get_pagecnt(szc); 2249 page_t *ret_pp; 2250 MEM_NODE_ITERATOR_DECL(it); 2251 #if defined(__sparc) 2252 pfn_t pfnum0, nlo, nhi; 2253 #endif 2254 2255 if (mpss_coalesce_disable) { 2256 ASSERT(szc < MMU_PAGE_SIZES); 2257 VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[szc][0]); 2258 return (NULL); 2259 } 2260 2261 ASSERT(szc < mmu_page_sizes); 2262 color_mask = PAGE_GET_PAGECOLORS(szc) - 1; 2263 ASSERT(ceq_mask <= color_mask); 2264 ASSERT(color <= color_mask); 2265 color &= ceq_mask; 2266 2267 /* Prevent page_counters dynamic memory from being freed */ 2268 rw_enter(&page_ctrs_rwlock[mnode], RW_READER); 2269 2270 mrange = MTYPE_2_MRANGE(mnode, mtype); 2271 ASSERT(mrange < mnode_nranges[mnode]); 2272 VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[r][mrange]); 2273 2274 /* get pfn range for mtype */ 2275 len = PAGE_COUNTERS_ENTRIES(mnode, r); 2276 MNODETYPE_2_PFN(mnode, mtype, lo, hi); 2277 hi++; 2278 2279 /* use lower limit if given */ 2280 if (pfnhi != PFNNULL && pfnhi < hi) 2281 hi = pfnhi; 2282 2283 /* round to szcpgcnt boundaries */ 2284 lo = P2ROUNDUP(lo, szcpgcnt); 2285 MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it); 2286 if (lo == (pfn_t)-1) { 2287 rw_exit(&page_ctrs_rwlock[mnode]); 2288 return (NULL); 2289 } 2290 hi = hi & ~(szcpgcnt - 1); 2291 2292 /* set lo to the closest pfn of the right color */ 2293 if (((PFN_2_COLOR(lo, szc, &it) ^ color) & ceq_mask) || 2294 (interleaved_mnodes && PFN_2_MEM_NODE(lo) != mnode)) { 2295 PAGE_NEXT_PFN_FOR_COLOR(lo, szc, color, ceq_mask, color_mask, 2296 &it); 2297 } 2298 2299 if (hi <= lo) { 2300 rw_exit(&page_ctrs_rwlock[mnode]); 2301 return (NULL); 2302 } 2303 2304 full = FULL_REGION_CNT(r); 2305 2306 /* calculate the number of page candidates and initial search index */ 2307 bin = color; 2308 idx0 = (size_t)(-1); 2309 do { 2310 pgcnt_t acand; 2311 2312 PGCTRS_CANDS_GETVALUECOLOR(mnode, mrange, r, bin, acand); 2313 if (acand) { 2314 idx = PAGE_COUNTERS_CURRENT_COLOR(mnode, 2315 r, bin, mrange); 2316 idx0 = MIN(idx0, idx); 2317 cands += acand; 2318 } 2319 bin = ADD_MASKED(bin, 1, ceq_mask, color_mask); 2320 } while (bin != color); 2321 2322 if (cands == 0) { 2323 VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip[r][mrange]); 2324 rw_exit(&page_ctrs_rwlock[mnode]); 2325 return (NULL); 2326 } 2327 2328 pfnum = IDX_TO_PNUM(mnode, r, idx0); 2329 if (pfnum < lo || pfnum >= hi) { 2330 pfnum = lo; 2331 } else { 2332 MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it); 2333 if (pfnum == (pfn_t)-1) { 2334 pfnum = lo; 2335 MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it); 2336 ASSERT(pfnum != (pfn_t)-1); 2337 } else if ((PFN_2_COLOR(pfnum, szc, &it) ^ color) & ceq_mask || 2338 (interleaved_mnodes && PFN_2_MEM_NODE(pfnum) != mnode)) { 2339 /* invalid color, get the closest correct pfn */ 2340 PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask, 2341 color_mask, &it); 2342 if (pfnum >= hi) { 2343 pfnum = lo; 2344 MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it); 2345 } 2346 } 2347 } 2348 2349 /* set starting index */ 2350 idx0 = PNUM_TO_IDX(mnode, r, pfnum); 2351 ASSERT(idx0 < len); 2352 2353 #if defined(__sparc) 2354 pfnum0 = pfnum; /* page corresponding to idx0 */ 2355 nhi = 0; /* search kcage ranges */ 2356 #endif 2357 2358 for (idx = idx0; wrap == 0 || 
(idx < idx0 && wrap < 2); ) { 2359 2360 #if defined(__sparc) 2361 /* 2362 * Find lowest intersection of kcage ranges and mnode. 2363 * MTYPE_NORELOC means look in the cage, otherwise outside. 2364 */ 2365 if (nhi <= pfnum) { 2366 if (kcage_next_range(mtype == MTYPE_NORELOC, pfnum, 2367 (wrap == 0 ? hi : pfnum0), &nlo, &nhi)) 2368 goto wrapit; 2369 2370 /* jump to the next page in the range */ 2371 if (pfnum < nlo) { 2372 pfnum = P2ROUNDUP(nlo, szcpgcnt); 2373 MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it); 2374 idx = PNUM_TO_IDX(mnode, r, pfnum); 2375 if (idx >= len || pfnum >= hi) 2376 goto wrapit; 2377 if ((PFN_2_COLOR(pfnum, szc, &it) ^ color) & 2378 ceq_mask) 2379 goto next; 2380 if (interleaved_mnodes && 2381 PFN_2_MEM_NODE(pfnum) != mnode) 2382 goto next; 2383 } 2384 } 2385 #endif 2386 2387 if (PAGE_COUNTERS(mnode, r, idx) != full) 2388 goto next; 2389 2390 /* 2391 * RFE: For performance maybe we can do something less 2392 * brutal than locking the entire freelist. So far 2393 * this doesn't seem to be a performance problem? 2394 */ 2395 page_freelist_lock(mnode); 2396 if (PAGE_COUNTERS(mnode, r, idx) == full) { 2397 ret_pp = 2398 page_promote(mnode, pfnum, r, PC_ALLOC, mtype); 2399 if (ret_pp != NULL) { 2400 VM_STAT_ADD(vmm_vmstats.pfc_coalok[r][mrange]); 2401 PAGE_COUNTERS_CURRENT_COLOR(mnode, r, 2402 PFN_2_COLOR(pfnum, szc, &it), mrange) = idx; 2403 page_freelist_unlock(mnode); 2404 rw_exit(&page_ctrs_rwlock[mnode]); 2405 #if defined(__sparc) 2406 if (PP_ISNORELOC(ret_pp)) { 2407 pgcnt_t npgs; 2408 2409 npgs = page_get_pagecnt(ret_pp->p_szc); 2410 kcage_freemem_sub(npgs); 2411 } 2412 #endif 2413 return (ret_pp); 2414 } 2415 } else { 2416 VM_STAT_ADD(vmm_vmstats.page_ctrs_changed[r][mrange]); 2417 } 2418 2419 page_freelist_unlock(mnode); 2420 /* 2421 * No point looking for another page if we've 2422 * already tried all of the ones that 2423 * page_ctr_cands indicated. Stash off where we left 2424 * off. 2425 * Note: this is not exact since we don't hold the 2426 * page_freelist_locks before we initially get the 2427 * value of cands for performance reasons, but should 2428 * be a decent approximation. 2429 */ 2430 if (--cands == 0) { 2431 PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color, mrange) = 2432 idx; 2433 break; 2434 } 2435 next: 2436 PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask, 2437 color_mask, &it); 2438 idx = PNUM_TO_IDX(mnode, r, pfnum); 2439 if (idx >= len || pfnum >= hi) { 2440 wrapit: 2441 pfnum = lo; 2442 MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it); 2443 idx = PNUM_TO_IDX(mnode, r, pfnum); 2444 wrap++; 2445 #if defined(__sparc) 2446 nhi = 0; /* search kcage ranges */ 2447 #endif 2448 } 2449 } 2450 2451 rw_exit(&page_ctrs_rwlock[mnode]); 2452 VM_STAT_ADD(vmm_vmstats.page_ctrs_failed[r][mrange]); 2453 return (NULL); 2454 } 2455 2456 /* 2457 * For the given mnode, promote as many small pages to large pages as possible. 2458 * mnode can be -1, which means do them all 2459 */ 2460 void 2461 page_freelist_coalesce_all(int mnode) 2462 { 2463 int r; /* region size */ 2464 int idx, full; 2465 size_t len; 2466 int doall = interleaved_mnodes || mnode < 0; 2467 int mlo = doall ? 0 : mnode; 2468 int mhi = doall ? max_mem_nodes : (mnode + 1); 2469 2470 VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce_all); 2471 2472 if (mpss_coalesce_disable) { 2473 return; 2474 } 2475 2476 /* 2477 * Lock the entire freelist and coalesce what we can. 2478 * 2479 * Always promote to the largest page possible 2480 * first to reduce the number of page promotions. 
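The candidate scan in page_freelist_coalesce() above resumes at a remembered index, runs to the end of the range, wraps to the beginning once, and gives up when it is about to revisit its starting point. Stripped of the page-counter and color details, the control flow is roughly the following standalone sketch (slot numbers are made up):

#include <stdio.h>

#define	NSLOTS	16

int
main(void)
{
	int counters[NSLOTS] = { 0 };
	int idx0 = 11;			/* remembered starting slot (made up) */
	int idx, wrap = 0, visited = 0;

	counters[3] = 1;		/* pretend slot 3 holds a full region */

	for (idx = idx0; wrap == 0 || (idx < idx0 && wrap < 2); ) {
		visited++;
		if (counters[idx] != 0) {
			printf("hit at slot %d after %d probes\n",
			    idx, visited);
			break;
		}
		if (++idx >= NSLOTS) {	/* wrap to the start of the range */
			idx = 0;
			wrap++;
		}
	}
	return (0);
}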
2481 */ 2482 for (mnode = mlo; mnode < mhi; mnode++) { 2483 rw_enter(&page_ctrs_rwlock[mnode], RW_READER); 2484 page_freelist_lock(mnode); 2485 } 2486 for (r = mmu_page_sizes - 1; r > 0; r--) { 2487 for (mnode = mlo; mnode < mhi; mnode++) { 2488 pgcnt_t cands = 0; 2489 int mrange, nranges = mnode_nranges[mnode]; 2490 2491 for (mrange = 0; mrange < nranges; mrange++) { 2492 PGCTRS_CANDS_GETVALUE(mnode, mrange, r, cands); 2493 if (cands != 0) 2494 break; 2495 } 2496 if (cands == 0) { 2497 VM_STAT_ADD(vmm_vmstats. 2498 page_ctrs_cands_skip_all); 2499 continue; 2500 } 2501 2502 full = FULL_REGION_CNT(r); 2503 len = PAGE_COUNTERS_ENTRIES(mnode, r); 2504 2505 for (idx = 0; idx < len; idx++) { 2506 if (PAGE_COUNTERS(mnode, r, idx) == full) { 2507 pfn_t pfnum = 2508 IDX_TO_PNUM(mnode, r, idx); 2509 int tmnode = interleaved_mnodes ? 2510 PFN_2_MEM_NODE(pfnum) : mnode; 2511 2512 ASSERT(pfnum >= 2513 mem_node_config[tmnode].physbase && 2514 pfnum < 2515 mem_node_config[tmnode].physmax); 2516 2517 (void) page_promote(tmnode, 2518 pfnum, r, PC_FREE, PC_MTYPE_ANY); 2519 } 2520 } 2521 /* shared hpm_counters covers all mnodes, so we quit */ 2522 if (interleaved_mnodes) 2523 break; 2524 } 2525 } 2526 for (mnode = mlo; mnode < mhi; mnode++) { 2527 page_freelist_unlock(mnode); 2528 rw_exit(&page_ctrs_rwlock[mnode]); 2529 } 2530 } 2531 2532 /* 2533 * This is where all polices for moving pages around 2534 * to different page size free lists is implemented. 2535 * Returns 1 on success, 0 on failure. 2536 * 2537 * So far these are the priorities for this algorithm in descending 2538 * order: 2539 * 2540 * 1) When servicing a request try to do so with a free page 2541 * from next size up. Helps defer fragmentation as long 2542 * as possible. 2543 * 2544 * 2) Page coalesce on demand. Only when a freelist 2545 * larger than PAGESIZE is empty and step 1 2546 * will not work since all larger size lists are 2547 * also empty. 2548 * 2549 * If pfnhi is non-zero, search for large page with pfn range less than pfnhi. 2550 */ 2551 2552 page_t * 2553 page_freelist_split(uchar_t szc, uint_t color, int mnode, int mtype, 2554 pfn_t pfnlo, pfn_t pfnhi, page_list_walker_t *plw) 2555 { 2556 uchar_t nszc = szc + 1; 2557 uint_t bin, sbin, bin_prev; 2558 page_t *pp, *firstpp; 2559 page_t *ret_pp = NULL; 2560 uint_t color_mask; 2561 2562 if (nszc == mmu_page_sizes) 2563 return (NULL); 2564 2565 ASSERT(nszc < mmu_page_sizes); 2566 color_mask = PAGE_GET_PAGECOLORS(nszc) - 1; 2567 bin = sbin = PAGE_GET_NSZ_COLOR(szc, color); 2568 bin_prev = (plw->plw_bin_split_prev == color) ? INVALID_COLOR : 2569 PAGE_GET_NSZ_COLOR(szc, plw->plw_bin_split_prev); 2570 2571 VM_STAT_ADD(vmm_vmstats.pfs_req[szc]); 2572 /* 2573 * First try to break up a larger page to fill current size freelist. 2574 */ 2575 while (plw->plw_bins[nszc] != 0) { 2576 2577 ASSERT(nszc < mmu_page_sizes); 2578 2579 /* 2580 * If page found then demote it. 2581 */ 2582 if (PAGE_FREELISTS(mnode, nszc, bin, mtype)) { 2583 page_freelist_lock(mnode); 2584 firstpp = pp = PAGE_FREELISTS(mnode, nszc, bin, mtype); 2585 2586 /* 2587 * If pfnhi is not PFNNULL, look for large page below 2588 * pfnhi. PFNNULL signifies no pfn requirement. 
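The policy comment above is what page_freelist_split() carries out: a request for a given size is first serviced by demoting a page from the next size up, and only when that cannot work does the caller fall back to coalescing smaller pages. A toy sketch of that decision, using plain counters instead of real freelists and an arbitrary 1-to-8 split ratio:

#include <stdio.h>

#define	NSIZES	3			/* toy size codes: 0, 1, 2 */

static unsigned freecnt[NSIZES] = { 0, 2, 1 };	/* free pages per size */

/* Toy ratio: one page of size (szc + 1) yields 8 pages of size szc. */
static int
split_from_above(int szc)
{
	if (szc + 1 >= NSIZES || freecnt[szc + 1] == 0)
		return (0);
	freecnt[szc + 1]--;
	freecnt[szc] += 8;
	return (1);
}

static int
alloc_page(int szc)
{
	if (freecnt[szc] == 0 && !split_from_above(szc)) {
		/* here the real code would try to coalesce smaller pages */
		return (0);
	}
	freecnt[szc]--;
	return (1);
}

int
main(void)
{
	printf("got size-0 page: %d (size-1 pages left: %u)\n",
	    alloc_page(0), freecnt[1]);
	return (0);
}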
2589 */ 2590 if (pp && 2591 ((pfnhi != PFNNULL && pp->p_pagenum >= pfnhi) || 2592 (pfnlo != PFNNULL && pp->p_pagenum < pfnlo))) { 2593 do { 2594 pp = pp->p_vpnext; 2595 if (pp == firstpp) { 2596 pp = NULL; 2597 break; 2598 } 2599 } while ((pfnhi != PFNNULL && 2600 pp->p_pagenum >= pfnhi) || 2601 (pfnlo != PFNNULL && 2602 pp->p_pagenum < pfnlo)); 2603 2604 if (pfnhi != PFNNULL && pp != NULL) 2605 ASSERT(pp->p_pagenum < pfnhi); 2606 2607 if (pfnlo != PFNNULL && pp != NULL) 2608 ASSERT(pp->p_pagenum >= pfnlo); 2609 } 2610 if (pp) { 2611 uint_t ccolor = page_correct_color(szc, nszc, 2612 color, bin, plw->plw_ceq_mask[szc]); 2613 2614 ASSERT(pp->p_szc == nszc); 2615 VM_STAT_ADD(vmm_vmstats.pfs_demote[nszc]); 2616 ret_pp = page_demote(mnode, pp->p_pagenum, 2617 pfnhi, pp->p_szc, szc, ccolor, PC_ALLOC); 2618 if (ret_pp) { 2619 page_freelist_unlock(mnode); 2620 #if defined(__sparc) 2621 if (PP_ISNORELOC(ret_pp)) { 2622 pgcnt_t npgs; 2623 2624 npgs = page_get_pagecnt( 2625 ret_pp->p_szc); 2626 kcage_freemem_sub(npgs); 2627 } 2628 #endif 2629 return (ret_pp); 2630 } 2631 } 2632 page_freelist_unlock(mnode); 2633 } 2634 2635 /* loop through next size bins */ 2636 bin = ADD_MASKED(bin, 1, plw->plw_ceq_mask[nszc], color_mask); 2637 plw->plw_bins[nszc]--; 2638 2639 if (bin == sbin) { 2640 uchar_t nnszc = nszc + 1; 2641 2642 /* we are done with this page size - check next */ 2643 if (plw->plw_bins[nnszc] == 0) 2644 /* we have already checked next size bins */ 2645 break; 2646 2647 bin = sbin = PAGE_GET_NSZ_COLOR(nszc, bin); 2648 if (bin_prev != INVALID_COLOR) { 2649 bin_prev = PAGE_GET_NSZ_COLOR(nszc, bin_prev); 2650 if (!((bin ^ bin_prev) & 2651 plw->plw_ceq_mask[nnszc])) 2652 break; 2653 } 2654 ASSERT(nnszc < mmu_page_sizes); 2655 color_mask = PAGE_GET_PAGECOLORS(nnszc) - 1; 2656 nszc = nnszc; 2657 ASSERT(nszc < mmu_page_sizes); 2658 } 2659 } 2660 2661 return (ret_pp); 2662 } 2663 2664 /* 2665 * Helper routine used only by the freelist code to lock 2666 * a page. If the page is a large page then it succeeds in 2667 * locking all the constituent pages or none at all. 2668 * Returns 1 on sucess, 0 on failure. 2669 */ 2670 static int 2671 page_trylock_cons(page_t *pp, se_t se) 2672 { 2673 page_t *tpp, *first_pp = pp; 2674 2675 /* 2676 * Fail if can't lock first or only page. 2677 */ 2678 if (!page_trylock(pp, se)) { 2679 return (0); 2680 } 2681 2682 /* 2683 * PAGESIZE: common case. 2684 */ 2685 if (pp->p_szc == 0) { 2686 return (1); 2687 } 2688 2689 /* 2690 * Large page case. 2691 */ 2692 tpp = pp->p_next; 2693 while (tpp != pp) { 2694 if (!page_trylock(tpp, se)) { 2695 /* 2696 * On failure unlock what we have locked so far. 2697 * We want to avoid attempting to capture these 2698 * pages as the pcm mutex may be held which could 2699 * lead to a recursive mutex panic. 2700 */ 2701 while (first_pp != tpp) { 2702 page_unlock_nocapture(first_pp); 2703 first_pp = first_pp->p_next; 2704 } 2705 return (0); 2706 } 2707 tpp = tpp->p_next; 2708 } 2709 return (1); 2710 } 2711 2712 /* 2713 * init context for walking page lists 2714 * Called when a page of the given szc in unavailable. Sets markers 2715 * for the beginning of the search to detect when search has 2716 * completed a full cycle. Sets flags for splitting larger pages 2717 * and coalescing smaller pages. Page walking procedes until a page 2718 * of the desired equivalent color is found. 
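page_list_walk_init() below turns the colorequivszc[] byte into a count of distinct color groups and an equivalency mask: the high nibble says how many high-order color bits to ignore, the low nibble how many low-order bits, and the low nibble is set aside when VAC aliasing is possible. A standalone recalculation of that arithmetic with example numbers (32 colors, ceq byte 0x12):

#include <stdio.h>

int
main(void)
{
	unsigned colors = 32;		/* page colors at this size (example) */
	unsigned ceq = 0x12;		/* example colorequivszc[] byte */
	unsigned vac_colors = 1;	/* no VAC aliasing in this example */
	unsigned ceq_dif, ceq_mask;

	/* with VAC aliasing the low-order color bits must stay significant */
	if (vac_colors > 1)
		ceq &= 0xf0;

	/* same formulas as the walker init code below */
	ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf));
	ceq_mask = (ceq_dif - 1) << (ceq & 0xf);

	/* ceq = 0x12: ignore 1 high bit and 2 low bits of a 5-bit color */
	printf("color groups: %u, equivalency mask: 0x%x\n",
	    ceq_dif, ceq_mask);		/* 4 groups, mask 0xc */
	return (0);
}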
2719 */ 2720 void 2721 page_list_walk_init(uchar_t szc, uint_t flags, uint_t bin, int can_split, 2722 int use_ceq, page_list_walker_t *plw) 2723 { 2724 uint_t nszc, ceq_mask, colors; 2725 uchar_t ceq = use_ceq ? colorequivszc[szc] : 0; 2726 2727 ASSERT(szc < mmu_page_sizes); 2728 colors = PAGE_GET_PAGECOLORS(szc); 2729 2730 plw->plw_colors = colors; 2731 plw->plw_color_mask = colors - 1; 2732 plw->plw_bin_marker = plw->plw_bin0 = bin; 2733 plw->plw_bin_split_prev = bin; 2734 plw->plw_bin_step = (szc == 0) ? vac_colors : 1; 2735 2736 /* 2737 * if vac aliasing is possible make sure lower order color 2738 * bits are never ignored 2739 */ 2740 if (vac_colors > 1) 2741 ceq &= 0xf0; 2742 2743 /* 2744 * calculate the number of non-equivalent colors and 2745 * color equivalency mask 2746 */ 2747 plw->plw_ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf)); 2748 ASSERT(szc > 0 || plw->plw_ceq_dif >= vac_colors); 2749 ASSERT(plw->plw_ceq_dif > 0); 2750 plw->plw_ceq_mask[szc] = (plw->plw_ceq_dif - 1) << (ceq & 0xf); 2751 2752 if (flags & PG_MATCH_COLOR) { 2753 if (cpu_page_colors < 0) { 2754 /* 2755 * this is a heterogeneous machine with different CPUs 2756 * having different size e$ (not supported for ni2/rock 2757 */ 2758 uint_t cpucolors = CPUSETSIZE() >> PAGE_GET_SHIFT(szc); 2759 cpucolors = MAX(cpucolors, 1); 2760 ceq_mask = plw->plw_color_mask & (cpucolors - 1); 2761 plw->plw_ceq_mask[szc] = 2762 MIN(ceq_mask, plw->plw_ceq_mask[szc]); 2763 } 2764 plw->plw_ceq_dif = 1; 2765 } 2766 2767 /* we can split pages in the freelist, but not the cachelist */ 2768 if (can_split) { 2769 plw->plw_do_split = (szc + 1 < mmu_page_sizes) ? 1 : 0; 2770 2771 /* set next szc color masks and number of free list bins */ 2772 for (nszc = szc + 1; nszc < mmu_page_sizes; nszc++, szc++) { 2773 plw->plw_ceq_mask[nszc] = PAGE_GET_NSZ_MASK(szc, 2774 plw->plw_ceq_mask[szc]); 2775 plw->plw_bins[nszc] = PAGE_GET_PAGECOLORS(nszc); 2776 } 2777 plw->plw_ceq_mask[nszc] = INVALID_MASK; 2778 plw->plw_bins[nszc] = 0; 2779 2780 } else { 2781 ASSERT(szc == 0); 2782 plw->plw_do_split = 0; 2783 plw->plw_bins[1] = 0; 2784 plw->plw_ceq_mask[1] = INVALID_MASK; 2785 } 2786 } 2787 2788 /* 2789 * set mark to flag where next split should occur 2790 */ 2791 #define PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw) { \ 2792 uint_t bin_nsz = PAGE_GET_NSZ_COLOR(szc, bin); \ 2793 uint_t bin0_nsz = PAGE_GET_NSZ_COLOR(szc, plw->plw_bin0); \ 2794 uint_t neq_mask = ~plw->plw_ceq_mask[nszc] & plw->plw_color_mask; \ 2795 plw->plw_split_next = \ 2796 INC_MASKED(bin_nsz, neq_mask, plw->plw_color_mask); \ 2797 if (!((plw->plw_split_next ^ bin0_nsz) & plw->plw_ceq_mask[nszc])) { \ 2798 plw->plw_split_next = \ 2799 INC_MASKED(plw->plw_split_next, \ 2800 neq_mask, plw->plw_color_mask); \ 2801 } \ 2802 } 2803 2804 uint_t 2805 page_list_walk_next_bin(uchar_t szc, uint_t bin, page_list_walker_t *plw) 2806 { 2807 uint_t neq_mask = ~plw->plw_ceq_mask[szc] & plw->plw_color_mask; 2808 uint_t bin0_nsz, nbin_nsz, nbin0, nbin; 2809 uchar_t nszc = szc + 1; 2810 2811 nbin = ADD_MASKED(bin, 2812 plw->plw_bin_step, neq_mask, plw->plw_color_mask); 2813 2814 if (plw->plw_do_split) { 2815 plw->plw_bin_split_prev = bin; 2816 PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw); 2817 plw->plw_do_split = 0; 2818 } 2819 2820 if (szc == 0) { 2821 if (plw->plw_count != 0 || plw->plw_ceq_dif == vac_colors) { 2822 if (nbin == plw->plw_bin0 && 2823 (vac_colors == 1 || nbin != plw->plw_bin_marker)) { 2824 nbin = ADD_MASKED(nbin, plw->plw_bin_step, 2825 neq_mask, plw->plw_color_mask); 2826 
plw->plw_bin_split_prev = plw->plw_bin0; 2827 } 2828 2829 if (vac_colors > 1 && nbin == plw->plw_bin_marker) { 2830 plw->plw_bin_marker = 2831 nbin = INC_MASKED(nbin, neq_mask, 2832 plw->plw_color_mask); 2833 plw->plw_bin_split_prev = plw->plw_bin0; 2834 /* 2835 * large pages all have the same vac color 2836 * so by now we should be done with next 2837 * size page splitting process 2838 */ 2839 ASSERT(plw->plw_bins[1] == 0); 2840 plw->plw_do_split = 0; 2841 return (nbin); 2842 } 2843 2844 } else { 2845 uint_t bin_jump = (vac_colors == 1) ? 2846 (BIN_STEP & ~3) - (plw->plw_bin0 & 3) : BIN_STEP; 2847 2848 bin_jump &= ~(vac_colors - 1); 2849 2850 nbin0 = ADD_MASKED(plw->plw_bin0, bin_jump, neq_mask, 2851 plw->plw_color_mask); 2852 2853 if ((nbin0 ^ plw->plw_bin0) & plw->plw_ceq_mask[szc]) { 2854 2855 plw->plw_bin_marker = nbin = nbin0; 2856 2857 if (plw->plw_bins[nszc] != 0) { 2858 /* 2859 * check if next page size bin is the 2860 * same as the next page size bin for 2861 * bin0 2862 */ 2863 nbin_nsz = PAGE_GET_NSZ_COLOR(szc, 2864 nbin); 2865 bin0_nsz = PAGE_GET_NSZ_COLOR(szc, 2866 plw->plw_bin0); 2867 2868 if ((bin0_nsz ^ nbin_nsz) & 2869 plw->plw_ceq_mask[nszc]) 2870 plw->plw_do_split = 1; 2871 } 2872 return (nbin); 2873 } 2874 } 2875 } 2876 2877 if (plw->plw_bins[nszc] != 0) { 2878 nbin_nsz = PAGE_GET_NSZ_COLOR(szc, nbin); 2879 if (!((plw->plw_split_next ^ nbin_nsz) & 2880 plw->plw_ceq_mask[nszc])) 2881 plw->plw_do_split = 1; 2882 } 2883 2884 return (nbin); 2885 } 2886 2887 page_t * 2888 page_get_mnode_freelist(int mnode, uint_t bin, int mtype, uchar_t szc, 2889 uint_t flags) 2890 { 2891 kmutex_t *pcm; 2892 page_t *pp, *first_pp; 2893 uint_t sbin; 2894 int plw_initialized; 2895 page_list_walker_t plw; 2896 2897 ASSERT(szc < mmu_page_sizes); 2898 2899 VM_STAT_ADD(vmm_vmstats.pgmf_alloc[szc]); 2900 2901 MTYPE_START(mnode, mtype, flags); 2902 if (mtype < 0) { /* mnode does not have memory in mtype range */ 2903 VM_STAT_ADD(vmm_vmstats.pgmf_allocempty[szc]); 2904 return (NULL); 2905 } 2906 try_again: 2907 2908 plw_initialized = 0; 2909 plw.plw_ceq_dif = 1; 2910 2911 /* 2912 * Only hold one freelist lock at a time, that way we 2913 * can start anywhere and not have to worry about lock 2914 * ordering. 2915 */ 2916 for (plw.plw_count = 0; 2917 plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) { 2918 sbin = bin; 2919 do { 2920 if (!PAGE_FREELISTS(mnode, szc, bin, mtype)) 2921 goto bin_empty_1; 2922 2923 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST); 2924 mutex_enter(pcm); 2925 pp = PAGE_FREELISTS(mnode, szc, bin, mtype); 2926 if (pp == NULL) 2927 goto bin_empty_0; 2928 2929 /* 2930 * These were set before the page 2931 * was put on the free list, 2932 * they must still be set. 2933 */ 2934 ASSERT(PP_ISFREE(pp)); 2935 ASSERT(PP_ISAGED(pp)); 2936 ASSERT(pp->p_vnode == NULL); 2937 ASSERT(pp->p_hash == NULL); 2938 ASSERT(pp->p_offset == (u_offset_t)-1); 2939 ASSERT(pp->p_szc == szc); 2940 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode); 2941 2942 /* 2943 * Walk down the hash chain. 2944 * 8k pages are linked on p_next 2945 * and p_prev fields. Large pages 2946 * are a contiguous group of 2947 * constituent pages linked together 2948 * on their p_next and p_prev fields. 2949 * The large pages are linked together 2950 * on the hash chain using p_vpnext 2951 * p_vpprev of the base constituent 2952 * page of each large page. 
2953 */ 2954 first_pp = pp; 2955 while (!page_trylock_cons(pp, SE_EXCL) || 2956 IS_DUMP_PAGE(pp)) { 2957 if (szc == 0) { 2958 pp = pp->p_next; 2959 } else { 2960 pp = pp->p_vpnext; 2961 } 2962 2963 ASSERT(PP_ISFREE(pp)); 2964 ASSERT(PP_ISAGED(pp)); 2965 ASSERT(pp->p_vnode == NULL); 2966 ASSERT(pp->p_hash == NULL); 2967 ASSERT(pp->p_offset == (u_offset_t)-1); 2968 ASSERT(pp->p_szc == szc); 2969 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode); 2970 2971 if (pp == first_pp) 2972 goto bin_empty_0; 2973 } 2974 2975 ASSERT(pp != NULL); 2976 ASSERT(mtype == PP_2_MTYPE(pp)); 2977 ASSERT(pp->p_szc == szc); 2978 if (szc == 0) { 2979 page_sub(&PAGE_FREELISTS(mnode, 2980 szc, bin, mtype), pp); 2981 } else { 2982 page_vpsub(&PAGE_FREELISTS(mnode, 2983 szc, bin, mtype), pp); 2984 CHK_LPG(pp, szc); 2985 } 2986 page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST); 2987 2988 if ((PP_ISFREE(pp) == 0) || (PP_ISAGED(pp) == 0)) 2989 panic("free page is not. pp %p", (void *)pp); 2990 mutex_exit(pcm); 2991 2992 #if defined(__sparc) 2993 ASSERT(!kcage_on || PP_ISNORELOC(pp) || 2994 (flags & PG_NORELOC) == 0); 2995 2996 if (PP_ISNORELOC(pp)) 2997 kcage_freemem_sub(page_get_pagecnt(szc)); 2998 #endif 2999 VM_STAT_ADD(vmm_vmstats.pgmf_allocok[szc]); 3000 return (pp); 3001 3002 bin_empty_0: 3003 mutex_exit(pcm); 3004 bin_empty_1: 3005 if (plw_initialized == 0) { 3006 page_list_walk_init(szc, flags, bin, 1, 1, 3007 &plw); 3008 plw_initialized = 1; 3009 ASSERT(plw.plw_colors <= 3010 PAGE_GET_PAGECOLORS(szc)); 3011 ASSERT(plw.plw_colors > 0); 3012 ASSERT((plw.plw_colors & 3013 (plw.plw_colors - 1)) == 0); 3014 ASSERT(bin < plw.plw_colors); 3015 ASSERT(plw.plw_ceq_mask[szc] < plw.plw_colors); 3016 } 3017 /* calculate the next bin with equivalent color */ 3018 bin = ADD_MASKED(bin, plw.plw_bin_step, 3019 plw.plw_ceq_mask[szc], plw.plw_color_mask); 3020 } while (sbin != bin); 3021 3022 /* 3023 * color bins are all empty if color match. Try and 3024 * satisfy the request by breaking up or coalescing 3025 * pages from a different size freelist of the correct 3026 * color that satisfies the ORIGINAL color requested. 3027 * If that fails then try pages of the same size but 3028 * different colors assuming we are not called with 3029 * PG_MATCH_COLOR. 3030 */ 3031 if (plw.plw_do_split && 3032 (pp = page_freelist_split(szc, bin, mnode, 3033 mtype, PFNNULL, PFNNULL, &plw)) != NULL) 3034 return (pp); 3035 3036 if (szc > 0 && (pp = page_freelist_coalesce(mnode, szc, 3037 bin, plw.plw_ceq_mask[szc], mtype, PFNNULL)) != NULL) 3038 return (pp); 3039 3040 if (plw.plw_ceq_dif > 1) 3041 bin = page_list_walk_next_bin(szc, bin, &plw); 3042 } 3043 3044 /* if allowed, cycle through additional mtypes */ 3045 MTYPE_NEXT(mnode, mtype, flags); 3046 if (mtype >= 0) 3047 goto try_again; 3048 3049 VM_STAT_ADD(vmm_vmstats.pgmf_allocfailed[szc]); 3050 3051 return (NULL); 3052 } 3053 3054 /* 3055 * Returns the count of free pages for 'pp' with size code 'szc'. 3056 * Note: This function does not return an exact value as the page freelist 3057 * locks are not held and thus the values in the page_counters may be 3058 * changing as we walk through the data. 
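page_freecnt() below adds up the region counters level by level: a counter at level r says how many (r-1)-sized subregions are completely free, and subregions already counted as full at a higher level are skipped so nothing is counted twice. A simplified two-level version of that accounting, with toy region sizes:

#include <stdio.h>

#define	FULL1	4	/* base pages per level-1 region (toy value) */
#define	FULL2	4	/* level-1 regions per level-2 region (toy value) */

int
main(void)
{
	/*
	 * Toy counters for one level-2 region: counters2 counts fully
	 * free level-1 regions; counters1[] counts free base pages in
	 * each of its level-1 regions.
	 */
	unsigned counters2 = 2;
	unsigned counters1[FULL2] = { 4, 1, 4, 3 };
	unsigned pgfree, i;

	/* pages contributed by the completely free level-1 regions */
	pgfree = counters2 * FULL1;

	/* add partially free regions; full ones were already counted */
	for (i = 0; i < FULL2; i++) {
		if (counters1[i] != FULL1)
			pgfree += counters1[i];
	}

	printf("estimated free base pages: %u\n", pgfree);	/* 12 */
	return (0);
}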
3059 */ 3060 static int 3061 page_freecnt(int mnode, page_t *pp, uchar_t szc) 3062 { 3063 pgcnt_t pgfree; 3064 pgcnt_t cnt; 3065 ssize_t r = szc; /* region size */ 3066 ssize_t idx; 3067 int i; 3068 int full, range; 3069 3070 /* Make sure pagenum passed in is aligned properly */ 3071 ASSERT((pp->p_pagenum & (PNUM_SIZE(szc) - 1)) == 0); 3072 ASSERT(szc > 0); 3073 3074 /* Prevent page_counters dynamic memory from being freed */ 3075 rw_enter(&page_ctrs_rwlock[mnode], RW_READER); 3076 idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum); 3077 cnt = PAGE_COUNTERS(mnode, r, idx); 3078 pgfree = cnt << PNUM_SHIFT(r - 1); 3079 range = FULL_REGION_CNT(szc); 3080 3081 /* Check for completely full region */ 3082 if (cnt == range) { 3083 rw_exit(&page_ctrs_rwlock[mnode]); 3084 return (pgfree); 3085 } 3086 3087 while (--r > 0) { 3088 idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum); 3089 full = FULL_REGION_CNT(r); 3090 for (i = 0; i < range; i++, idx++) { 3091 cnt = PAGE_COUNTERS(mnode, r, idx); 3092 /* 3093 * If cnt here is full, that means we have already 3094 * accounted for these pages earlier. 3095 */ 3096 if (cnt != full) { 3097 pgfree += (cnt << PNUM_SHIFT(r - 1)); 3098 } 3099 } 3100 range *= full; 3101 } 3102 rw_exit(&page_ctrs_rwlock[mnode]); 3103 return (pgfree); 3104 } 3105 3106 /* 3107 * Called from page_geti_contig_pages to exclusively lock constituent pages 3108 * starting from 'spp' for page size code 'szc'. 3109 * 3110 * If 'ptcpthreshold' is set, the number of free pages needed in the 'szc' 3111 * region needs to be greater than or equal to the threshold. 3112 */ 3113 static int 3114 page_trylock_contig_pages(int mnode, page_t *spp, uchar_t szc, int flags) 3115 { 3116 pgcnt_t pgcnt = PNUM_SIZE(szc); 3117 pgcnt_t pgfree, i; 3118 page_t *pp; 3119 3120 VM_STAT_ADD(vmm_vmstats.ptcp[szc]); 3121 3122 3123 if ((ptcpthreshold == 0) || (flags & PGI_PGCPHIPRI)) 3124 goto skipptcpcheck; 3125 /* 3126 * check if there are sufficient free pages available before attempting 3127 * to trylock. Count is approximate as page counters can change. 3128 */ 3129 pgfree = page_freecnt(mnode, spp, szc); 3130 3131 /* attempt to trylock if there are sufficient already free pages */ 3132 if (pgfree < pgcnt/ptcpthreshold) { 3133 VM_STAT_ADD(vmm_vmstats.ptcpfreethresh[szc]); 3134 return (0); 3135 } 3136 3137 skipptcpcheck: 3138 3139 for (i = 0; i < pgcnt; i++) { 3140 pp = &spp[i]; 3141 if (!page_trylock(pp, SE_EXCL)) { 3142 VM_STAT_ADD(vmm_vmstats.ptcpfailexcl[szc]); 3143 while (--i != (pgcnt_t)-1) { 3144 pp = &spp[i]; 3145 ASSERT(PAGE_EXCL(pp)); 3146 page_unlock_nocapture(pp); 3147 } 3148 return (0); 3149 } 3150 ASSERT(spp[i].p_pagenum == spp->p_pagenum + i); 3151 if ((pp->p_szc > szc || (szc && pp->p_szc == szc)) && 3152 !PP_ISFREE(pp)) { 3153 VM_STAT_ADD(vmm_vmstats.ptcpfailszc[szc]); 3154 ASSERT(i == 0); 3155 page_unlock_nocapture(pp); 3156 return (0); 3157 } 3158 if (PP_ISNORELOC(pp)) { 3159 VM_STAT_ADD(vmm_vmstats.ptcpfailcage[szc]); 3160 while (i != (pgcnt_t)-1) { 3161 pp = &spp[i]; 3162 ASSERT(PAGE_EXCL(pp)); 3163 page_unlock_nocapture(pp); 3164 i--; 3165 } 3166 return (0); 3167 } 3168 } 3169 VM_STAT_ADD(vmm_vmstats.ptcpok[szc]); 3170 return (1); 3171 } 3172 3173 /* 3174 * Claim large page pointed to by 'pp'. 'pp' is the starting set 3175 * of 'szc' constituent pages that had been locked exclusively previously. 3176 * Will attempt to relocate constituent pages in use. 
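page_trylock_cons() earlier and page_trylock_contig_pages() above share the same all-or-nothing shape: try-lock each constituent page in turn, and on the first failure release everything acquired so far and report failure. The skeleton of that pattern, shown here with pthread mutexes purely for illustration:

#include <pthread.h>
#include <stdio.h>

#define	NPAGES	8

static pthread_mutex_t locks[NPAGES];

/* Lock all NPAGES "pages" or none; returns 1 on success, 0 on failure. */
static int
trylock_all(void)
{
	int i;

	for (i = 0; i < NPAGES; i++) {
		if (pthread_mutex_trylock(&locks[i]) != 0) {
			/* roll back everything acquired so far */
			while (--i >= 0)
				(void) pthread_mutex_unlock(&locks[i]);
			return (0);
		}
	}
	return (1);
}

static void
unlock_all(void)
{
	int i;

	for (i = 0; i < NPAGES; i++)
		(void) pthread_mutex_unlock(&locks[i]);
}

int
main(void)
{
	int i;

	for (i = 0; i < NPAGES; i++)
		(void) pthread_mutex_init(&locks[i], NULL);

	if (trylock_all()) {
		printf("claimed all %d pages\n", NPAGES);
		unlock_all();
	}
	return (0);
}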
3177 */ 3178 static page_t * 3179 page_claim_contig_pages(page_t *pp, uchar_t szc, int flags) 3180 { 3181 spgcnt_t pgcnt, npgs, i; 3182 page_t *targpp, *rpp, *hpp; 3183 page_t *replpp = NULL; 3184 page_t *pplist = NULL; 3185 3186 ASSERT(pp != NULL); 3187 3188 pgcnt = page_get_pagecnt(szc); 3189 while (pgcnt) { 3190 ASSERT(PAGE_EXCL(pp)); 3191 ASSERT(!PP_ISNORELOC(pp)); 3192 if (PP_ISFREE(pp)) { 3193 /* 3194 * If this is a PG_FREE_LIST page then its 3195 * size code can change underneath us due to 3196 * page promotion or demotion. As an optimzation 3197 * use page_list_sub_pages() instead of 3198 * page_list_sub(). 3199 */ 3200 if (PP_ISAGED(pp)) { 3201 page_list_sub_pages(pp, szc); 3202 if (pp->p_szc == szc) { 3203 return (pp); 3204 } 3205 ASSERT(pp->p_szc < szc); 3206 npgs = page_get_pagecnt(pp->p_szc); 3207 hpp = pp; 3208 for (i = 0; i < npgs; i++, pp++) { 3209 pp->p_szc = szc; 3210 } 3211 page_list_concat(&pplist, &hpp); 3212 pgcnt -= npgs; 3213 continue; 3214 } 3215 ASSERT(!PP_ISAGED(pp)); 3216 ASSERT(pp->p_szc == 0); 3217 page_list_sub(pp, PG_CACHE_LIST); 3218 page_hashout(pp, NULL); 3219 PP_SETAGED(pp); 3220 pp->p_szc = szc; 3221 page_list_concat(&pplist, &pp); 3222 pp++; 3223 pgcnt--; 3224 continue; 3225 } 3226 npgs = page_get_pagecnt(pp->p_szc); 3227 3228 /* 3229 * page_create_wait freemem accounting done by caller of 3230 * page_get_freelist and not necessary to call it prior to 3231 * calling page_get_replacement_page. 3232 * 3233 * page_get_replacement_page can call page_get_contig_pages 3234 * to acquire a large page (szc > 0); the replacement must be 3235 * smaller than the contig page size to avoid looping or 3236 * szc == 0 and PGI_PGCPSZC0 is set. 3237 */ 3238 if (pp->p_szc < szc || (szc == 0 && (flags & PGI_PGCPSZC0))) { 3239 replpp = page_get_replacement_page(pp, NULL, 0); 3240 if (replpp) { 3241 npgs = page_get_pagecnt(pp->p_szc); 3242 ASSERT(npgs <= pgcnt); 3243 targpp = pp; 3244 } 3245 } 3246 3247 /* 3248 * If replacement is NULL or do_page_relocate fails, fail 3249 * coalescing of pages. 3250 */ 3251 if (replpp == NULL || (do_page_relocate(&targpp, &replpp, 0, 3252 &npgs, NULL) != 0)) { 3253 /* 3254 * Unlock un-processed target list 3255 */ 3256 while (pgcnt--) { 3257 ASSERT(PAGE_EXCL(pp)); 3258 page_unlock_nocapture(pp); 3259 pp++; 3260 } 3261 /* 3262 * Free the processed target list. 
3263 */ 3264 while (pplist) { 3265 pp = pplist; 3266 page_sub(&pplist, pp); 3267 ASSERT(PAGE_EXCL(pp)); 3268 ASSERT(pp->p_szc == szc); 3269 ASSERT(PP_ISFREE(pp)); 3270 ASSERT(PP_ISAGED(pp)); 3271 pp->p_szc = 0; 3272 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL); 3273 page_unlock_nocapture(pp); 3274 } 3275 3276 if (replpp != NULL) 3277 page_free_replacement_page(replpp); 3278 3279 return (NULL); 3280 } 3281 ASSERT(pp == targpp); 3282 3283 /* LINTED */ 3284 ASSERT(hpp = pp); /* That's right, it's an assignment */ 3285 3286 pp += npgs; 3287 pgcnt -= npgs; 3288 3289 while (npgs--) { 3290 ASSERT(PAGE_EXCL(targpp)); 3291 ASSERT(!PP_ISFREE(targpp)); 3292 ASSERT(!PP_ISNORELOC(targpp)); 3293 PP_SETFREE(targpp); 3294 ASSERT(PP_ISAGED(targpp)); 3295 ASSERT(targpp->p_szc < szc || (szc == 0 && 3296 (flags & PGI_PGCPSZC0))); 3297 targpp->p_szc = szc; 3298 targpp = targpp->p_next; 3299 3300 rpp = replpp; 3301 ASSERT(rpp != NULL); 3302 page_sub(&replpp, rpp); 3303 ASSERT(PAGE_EXCL(rpp)); 3304 ASSERT(!PP_ISFREE(rpp)); 3305 page_unlock_nocapture(rpp); 3306 } 3307 ASSERT(targpp == hpp); 3308 ASSERT(replpp == NULL); 3309 page_list_concat(&pplist, &targpp); 3310 } 3311 CHK_LPG(pplist, szc); 3312 return (pplist); 3313 } 3314 3315 /* 3316 * Trim kernel cage from pfnlo-pfnhi and store result in lo-hi. Return code 3317 * of 0 means nothing left after trim. 3318 */ 3319 int 3320 trimkcage(struct memseg *mseg, pfn_t *lo, pfn_t *hi, pfn_t pfnlo, pfn_t pfnhi) 3321 { 3322 pfn_t kcagepfn; 3323 int decr; 3324 int rc = 0; 3325 3326 if (PP_ISNORELOC(mseg->pages)) { 3327 if (PP_ISNORELOC(mseg->epages - 1) == 0) { 3328 3329 /* lower part of this mseg inside kernel cage */ 3330 decr = kcage_current_pfn(&kcagepfn); 3331 3332 /* kernel cage may have transitioned past mseg */ 3333 if (kcagepfn >= mseg->pages_base && 3334 kcagepfn < mseg->pages_end) { 3335 ASSERT(decr == 0); 3336 *lo = MAX(kcagepfn, pfnlo); 3337 *hi = MIN(pfnhi, (mseg->pages_end - 1)); 3338 rc = 1; 3339 } 3340 } 3341 /* else entire mseg in the cage */ 3342 } else { 3343 if (PP_ISNORELOC(mseg->epages - 1)) { 3344 3345 /* upper part of this mseg inside kernel cage */ 3346 decr = kcage_current_pfn(&kcagepfn); 3347 3348 /* kernel cage may have transitioned past mseg */ 3349 if (kcagepfn >= mseg->pages_base && 3350 kcagepfn < mseg->pages_end) { 3351 ASSERT(decr); 3352 *hi = MIN(kcagepfn, pfnhi); 3353 *lo = MAX(pfnlo, mseg->pages_base); 3354 rc = 1; 3355 } 3356 } else { 3357 /* entire mseg outside of kernel cage */ 3358 *lo = MAX(pfnlo, mseg->pages_base); 3359 *hi = MIN(pfnhi, (mseg->pages_end - 1)); 3360 rc = 1; 3361 } 3362 } 3363 return (rc); 3364 } 3365 3366 /* 3367 * called from page_get_contig_pages to search 'pfnlo' thru 'pfnhi' to claim a 3368 * page with size code 'szc'. Claiming such a page requires acquiring 3369 * exclusive locks on all constituent pages (page_trylock_contig_pages), 3370 * relocating pages in use and concatenating these constituent pages into a 3371 * large page. 3372 * 3373 * The page lists do not have such a large page and page_freelist_split has 3374 * already failed to demote larger pages and/or coalesce smaller free pages. 3375 * 3376 * 'flags' may specify PG_COLOR_MATCH which would limit the search of large 3377 * pages with the same color as 'bin'. 3378 * 3379 * 'pfnflag' specifies the subset of the pfn range to search. 
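trimkcage() above clips a caller's pfn window against a memseg whose lower or upper portion lies inside the kernel cage. A simplified standalone version of the same clipping for the lower-part-caged case; the segment bounds, cage boundary and pfn window are made-up numbers:

#include <stdio.h>

int
main(void)
{
	/* made-up example values */
	unsigned long seg_base = 0x10000, seg_end = 0x20000;
	unsigned long cagepfn = 0x14000;	/* cage extends up to here */
	unsigned long pfnlo = 0x08000, pfnhi = 0x1f000;
	unsigned long lo, hi;

	if (cagepfn >= seg_base && cagepfn < seg_end) {
		/*
		 * Lower part of the segment is caged: the usable
		 * window starts at the cage boundary.
		 */
		lo = (cagepfn > pfnlo) ? cagepfn : pfnlo;
		hi = (pfnhi < seg_end - 1) ? pfnhi : seg_end - 1;
		printf("usable range [0x%lx, 0x%lx]\n", lo, hi);
	} else {
		printf("cage boundary not inside this segment\n");
	}
	return (0);
}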
3380 */ 3381 3382 static page_t * 3383 page_geti_contig_pages(int mnode, uint_t bin, uchar_t szc, int flags, 3384 pfn_t pfnlo, pfn_t pfnhi, pgcnt_t pfnflag) 3385 { 3386 struct memseg *mseg; 3387 pgcnt_t szcpgcnt = page_get_pagecnt(szc); 3388 pgcnt_t szcpgmask = szcpgcnt - 1; 3389 pfn_t randpfn; 3390 page_t *pp, *randpp, *endpp; 3391 uint_t colors, ceq_mask; 3392 /* LINTED : set but not used in function */ 3393 uint_t color_mask; 3394 pfn_t hi, lo; 3395 uint_t skip; 3396 MEM_NODE_ITERATOR_DECL(it); 3397 3398 ASSERT(szc != 0 || (flags & PGI_PGCPSZC0)); 3399 3400 pfnlo = P2ROUNDUP(pfnlo, szcpgcnt); 3401 3402 if ((pfnhi - pfnlo) + 1 < szcpgcnt || pfnlo >= pfnhi) 3403 return (NULL); 3404 3405 ASSERT(szc < mmu_page_sizes); 3406 3407 colors = PAGE_GET_PAGECOLORS(szc); 3408 color_mask = colors - 1; 3409 if ((colors > 1) && (flags & PG_MATCH_COLOR)) { 3410 uchar_t ceq = colorequivszc[szc]; 3411 uint_t ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf)); 3412 3413 ASSERT(ceq_dif > 0); 3414 ceq_mask = (ceq_dif - 1) << (ceq & 0xf); 3415 } else { 3416 ceq_mask = 0; 3417 } 3418 3419 ASSERT(bin < colors); 3420 3421 /* clear "non-significant" color bits */ 3422 bin &= ceq_mask; 3423 3424 /* 3425 * trim the pfn range to search based on pfnflag. pfnflag is set 3426 * when there have been previous page_get_contig_page failures to 3427 * limit the search. 3428 * 3429 * The high bit in pfnflag specifies the number of 'slots' in the 3430 * pfn range and the remainder of pfnflag specifies which slot. 3431 * For example, a value of 1010b would mean the second slot of 3432 * the pfn range that has been divided into 8 slots. 3433 */ 3434 if (pfnflag > 1) { 3435 int slots = 1 << (highbit(pfnflag) - 1); 3436 int slotid = pfnflag & (slots - 1); 3437 pgcnt_t szcpages; 3438 int slotlen; 3439 3440 pfnhi = P2ALIGN((pfnhi + 1), szcpgcnt) - 1; 3441 szcpages = ((pfnhi - pfnlo) + 1) / szcpgcnt; 3442 slotlen = howmany(szcpages, slots); 3443 /* skip if 'slotid' slot is empty */ 3444 if (slotid * slotlen >= szcpages) 3445 return (NULL); 3446 pfnlo = pfnlo + (((slotid * slotlen) % szcpages) * szcpgcnt); 3447 ASSERT(pfnlo < pfnhi); 3448 if (pfnhi > pfnlo + (slotlen * szcpgcnt)) 3449 pfnhi = pfnlo + (slotlen * szcpgcnt) - 1; 3450 } 3451 3452 memsegs_lock(0); 3453 3454 /* 3455 * loop through memsegs to look for contig page candidates 3456 */ 3457 3458 for (mseg = memsegs; mseg != NULL; mseg = mseg->next) { 3459 if (pfnhi < mseg->pages_base || pfnlo >= mseg->pages_end) { 3460 /* no overlap */ 3461 continue; 3462 } 3463 3464 if (mseg->pages_end - mseg->pages_base < szcpgcnt) 3465 /* mseg too small */ 3466 continue; 3467 3468 /* 3469 * trim off kernel cage pages from pfn range and check for 3470 * a trimmed pfn range returned that does not span the 3471 * desired large page size. 3472 */ 3473 if (kcage_on) { 3474 if (trimkcage(mseg, &lo, &hi, pfnlo, pfnhi) == 0 || 3475 lo >= hi || ((hi - lo) + 1) < szcpgcnt) 3476 continue; 3477 } else { 3478 lo = MAX(pfnlo, mseg->pages_base); 3479 hi = MIN(pfnhi, (mseg->pages_end - 1)); 3480 } 3481 3482 /* round to szcpgcnt boundaries */ 3483 lo = P2ROUNDUP(lo, szcpgcnt); 3484 3485 MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it); 3486 hi = P2ALIGN((hi + 1), szcpgcnt) - 1; 3487 3488 if (hi <= lo) 3489 continue; 3490 3491 /* 3492 * set lo to point to the pfn for the desired bin. 
Large 3493 * page sizes may only have a single page color 3494 */ 3495 skip = szcpgcnt; 3496 if (ceq_mask > 0 || interleaved_mnodes) { 3497 /* set lo to point at appropriate color */ 3498 if (((PFN_2_COLOR(lo, szc, &it) ^ bin) & ceq_mask) || 3499 (interleaved_mnodes && 3500 PFN_2_MEM_NODE(lo) != mnode)) { 3501 PAGE_NEXT_PFN_FOR_COLOR(lo, szc, bin, ceq_mask, 3502 color_mask, &it); 3503 } 3504 if (hi <= lo) 3505 /* mseg cannot satisfy color request */ 3506 continue; 3507 } 3508 3509 /* randomly choose a point between lo and hi to begin search */ 3510 3511 randpfn = (pfn_t)GETTICK(); 3512 randpfn = ((randpfn % (hi - lo)) + lo) & ~(skip - 1); 3513 MEM_NODE_ITERATOR_INIT(randpfn, mnode, szc, &it); 3514 if (ceq_mask || interleaved_mnodes || randpfn == (pfn_t)-1) { 3515 if (randpfn != (pfn_t)-1) { 3516 PAGE_NEXT_PFN_FOR_COLOR(randpfn, szc, bin, 3517 ceq_mask, color_mask, &it); 3518 } 3519 if (randpfn >= hi) { 3520 randpfn = lo; 3521 MEM_NODE_ITERATOR_INIT(randpfn, mnode, szc, 3522 &it); 3523 } 3524 } 3525 randpp = mseg->pages + (randpfn - mseg->pages_base); 3526 3527 ASSERT(randpp->p_pagenum == randpfn); 3528 3529 pp = randpp; 3530 endpp = mseg->pages + (hi - mseg->pages_base) + 1; 3531 3532 ASSERT(randpp + szcpgcnt <= endpp); 3533 3534 do { 3535 ASSERT(!(pp->p_pagenum & szcpgmask)); 3536 ASSERT(((PP_2_BIN(pp) ^ bin) & ceq_mask) == 0); 3537 3538 if (page_trylock_contig_pages(mnode, pp, szc, flags)) { 3539 /* pages unlocked by page_claim on failure */ 3540 if (page_claim_contig_pages(pp, szc, flags)) { 3541 memsegs_unlock(0); 3542 return (pp); 3543 } 3544 } 3545 3546 if (ceq_mask == 0 && !interleaved_mnodes) { 3547 pp += skip; 3548 } else { 3549 pfn_t pfn = pp->p_pagenum; 3550 3551 PAGE_NEXT_PFN_FOR_COLOR(pfn, szc, bin, 3552 ceq_mask, color_mask, &it); 3553 if (pfn == (pfn_t)-1) { 3554 pp = endpp; 3555 } else { 3556 pp = mseg->pages + 3557 (pfn - mseg->pages_base); 3558 } 3559 } 3560 if (pp >= endpp) { 3561 /* start from the beginning */ 3562 MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it); 3563 pp = mseg->pages + (lo - mseg->pages_base); 3564 ASSERT(pp->p_pagenum == lo); 3565 ASSERT(pp + szcpgcnt <= endpp); 3566 } 3567 } while (pp != randpp); 3568 } 3569 memsegs_unlock(0); 3570 return (NULL); 3571 } 3572 3573 3574 /* 3575 * controlling routine that searches through physical memory in an attempt to 3576 * claim a large page based on the input parameters. 3577 * on the page free lists. 3578 * 3579 * calls page_geti_contig_pages with an initial pfn range from the mnode 3580 * and mtype. page_geti_contig_pages will trim off the parts of the pfn range 3581 * that overlaps with the kernel cage or does not match the requested page 3582 * color if PG_MATCH_COLOR is set. Since this search is very expensive, 3583 * page_geti_contig_pages may further limit the search range based on 3584 * previous failure counts (pgcpfailcnt[]). 3585 * 3586 * for PGI_PGCPSZC0 requests, page_get_contig_pages will relocate a base 3587 * pagesize page that satisfies mtype. 
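page_get_contig_pages() below passes pgcpfailcnt[szc] in as pfnflag, and page_geti_contig_pages() decodes it as described earlier: the highest set bit gives the number of slots the pfn range is divided into, and the remaining low bits select which slot to search. A standalone recalculation with local stand-ins for the kernel's highbit() and howmany():

#include <stdio.h>

/* local stand-ins for the kernel's highbit() and howmany() */
static int
highbit_local(unsigned long v)
{
	int h = 0;

	while (v != 0) {
		h++;
		v >>= 1;
	}
	return (h);		/* 1-based index of the highest set bit */
}

#define	HOWMANY(x, y)	(((x) + ((y) - 1)) / (y))

int
main(void)
{
	unsigned long pfnflag = 0xa;	/* 1010b, as in the comment above */
	unsigned long szcpages = 1000;	/* large-page candidates in range */
	unsigned long slots, slotid, slotlen;

	slots = 1UL << (highbit_local(pfnflag) - 1);
	slotid = pfnflag & (slots - 1);
	slotlen = HOWMANY(szcpages, slots);

	/* 8 slots of 125 candidates each; slot index 2 is searched */
	printf("slots=%lu slotid=%lu candidates per slot=%lu\n",
	    slots, slotid, slotlen);
	return (0);
}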
3588 */ 3589 page_t * 3590 page_get_contig_pages(int mnode, uint_t bin, int mtype, uchar_t szc, 3591 uint_t flags) 3592 { 3593 pfn_t pfnlo, pfnhi; /* contig pages pfn range */ 3594 page_t *pp; 3595 pgcnt_t pfnflag = 0; /* no limit on search if 0 */ 3596 3597 VM_STAT_ADD(vmm_vmstats.pgcp_alloc[szc]); 3598 3599 /* no allocations from cage */ 3600 flags |= PGI_NOCAGE; 3601 3602 /* LINTED */ 3603 MTYPE_START(mnode, mtype, flags); 3604 if (mtype < 0) { /* mnode does not have memory in mtype range */ 3605 VM_STAT_ADD(vmm_vmstats.pgcp_allocempty[szc]); 3606 return (NULL); 3607 } 3608 3609 ASSERT(szc > 0 || (flags & PGI_PGCPSZC0)); 3610 3611 /* do not limit search and ignore color if hi pri */ 3612 3613 if (pgcplimitsearch && ((flags & PGI_PGCPHIPRI) == 0)) 3614 pfnflag = pgcpfailcnt[szc]; 3615 3616 /* remove color match to improve chances */ 3617 3618 if (flags & PGI_PGCPHIPRI || pfnflag) 3619 flags &= ~PG_MATCH_COLOR; 3620 3621 do { 3622 /* get pfn range based on mnode and mtype */ 3623 MNODETYPE_2_PFN(mnode, mtype, pfnlo, pfnhi); 3624 3625 ASSERT(pfnhi >= pfnlo); 3626 3627 pp = page_geti_contig_pages(mnode, bin, szc, flags, 3628 pfnlo, pfnhi, pfnflag); 3629 3630 if (pp != NULL) { 3631 pfnflag = pgcpfailcnt[szc]; 3632 if (pfnflag) { 3633 /* double the search size */ 3634 pgcpfailcnt[szc] = pfnflag >> 1; 3635 } 3636 VM_STAT_ADD(vmm_vmstats.pgcp_allocok[szc]); 3637 return (pp); 3638 } 3639 MTYPE_NEXT(mnode, mtype, flags); 3640 } while (mtype >= 0); 3641 3642 VM_STAT_ADD(vmm_vmstats.pgcp_allocfailed[szc]); 3643 return (NULL); 3644 } 3645 3646 #if defined(__i386) || defined(__amd64) 3647 /* 3648 * Determine the likelihood of finding/coalescing a szc page. 3649 * Return 0 if the likelihood is small otherwise return 1. 3650 * 3651 * For now, be conservative and check only 1g pages and return 0 3652 * if there had been previous coalescing failures and the szc pages 3653 * needed to satisfy request would exhaust most of freemem. 3654 */ 3655 int 3656 page_chk_freelist(uint_t szc) 3657 { 3658 pgcnt_t pgcnt; 3659 3660 if (szc <= 1) 3661 return (1); 3662 3663 pgcnt = page_get_pagecnt(szc); 3664 if (pgcpfailcnt[szc] && pgcnt + throttlefree >= freemem) { 3665 VM_STAT_ADD(vmm_vmstats.pcf_deny[szc]); 3666 return (0); 3667 } 3668 VM_STAT_ADD(vmm_vmstats.pcf_allow[szc]); 3669 return (1); 3670 } 3671 #endif 3672 3673 /* 3674 * Find the `best' page on the freelist for this (vp,off) (as,vaddr) pair. 3675 * 3676 * Does its own locking and accounting. 3677 * If PG_MATCH_COLOR is set, then NULL will be returned if there are no 3678 * pages of the proper color even if there are pages of a different color. 3679 * 3680 * Finds a page, removes it, THEN locks it. 3681 */ 3682 3683 /*ARGSUSED*/ 3684 page_t * 3685 page_get_freelist(struct vnode *vp, u_offset_t off, struct seg *seg, 3686 caddr_t vaddr, size_t size, uint_t flags, struct lgrp *lgrp) 3687 { 3688 struct as *as = seg->s_as; 3689 page_t *pp = NULL; 3690 ulong_t bin; 3691 uchar_t szc; 3692 int mnode; 3693 int mtype; 3694 page_t *(*page_get_func)(int, uint_t, int, uchar_t, uint_t); 3695 lgrp_mnode_cookie_t lgrp_cookie; 3696 3697 page_get_func = page_get_mnode_freelist; 3698 3699 /* 3700 * If we aren't passed a specific lgroup, or passed a freed lgrp 3701 * assume we wish to allocate near to the current thread's home. 
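The cage check just below keeps a reserve of kcage_throttlefree pages for PG_PANIC and cageout-thread allocations; an ordinary PG_NORELOC request is refused once cage freemem would dip under that reserve plus the request size. The shape of that admission test in isolation (counter names and values here are illustrative, not the kernel's):

#include <stdio.h>

/* illustrative counters, not the kernel's */
static unsigned long cage_freemem = 120;
static unsigned long cage_throttlefree = 128;

/* Return 1 if a non-critical request for npgs cage pages may proceed. */
static int
cage_admit(unsigned long npgs, int critical)
{
	if (critical)
		return (1);	/* critical callers may dip into the reserve */
	return (cage_freemem >= cage_throttlefree + npgs);
}

int
main(void)
{
	printf("normal request for 4 pages admitted: %d\n", cage_admit(4, 0));
	printf("critical request for 4 pages admitted: %d\n", cage_admit(4, 1));
	return (0);
}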
3702 */ 3703 if (!LGRP_EXISTS(lgrp)) 3704 lgrp = lgrp_home_lgrp(); 3705 3706 if (kcage_on) { 3707 if ((flags & (PG_NORELOC | PG_PANIC)) == PG_NORELOC && 3708 kcage_freemem < kcage_throttlefree + btop(size) && 3709 curthread != kcage_cageout_thread) { 3710 /* 3711 * Set a "reserve" of kcage_throttlefree pages for 3712 * PG_PANIC and cageout thread allocations. 3713 * 3714 * Everybody else has to serialize in 3715 * page_create_get_something() to get a cage page, so 3716 * that we don't deadlock cageout! 3717 */ 3718 return (NULL); 3719 } 3720 } else { 3721 flags &= ~PG_NORELOC; 3722 flags |= PGI_NOCAGE; 3723 } 3724 3725 /* LINTED */ 3726 MTYPE_INIT(mtype, vp, vaddr, flags, size); 3727 3728 /* 3729 * Convert size to page size code. 3730 */ 3731 if ((szc = page_szc(size)) == (uchar_t)-1) 3732 panic("page_get_freelist: illegal page size request"); 3733 ASSERT(szc < mmu_page_sizes); 3734 3735 VM_STAT_ADD(vmm_vmstats.pgf_alloc[szc]); 3736 3737 /* LINTED */ 3738 AS_2_BIN(as, seg, vp, vaddr, bin, szc); 3739 3740 ASSERT(bin < PAGE_GET_PAGECOLORS(szc)); 3741 3742 /* 3743 * Try to get a local page first, but try remote if we can't 3744 * get a page of the right color. 3745 */ 3746 pgretry: 3747 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL); 3748 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { 3749 pp = page_get_func(mnode, bin, mtype, szc, flags); 3750 if (pp != NULL) { 3751 VM_STAT_ADD(vmm_vmstats.pgf_allocok[szc]); 3752 DTRACE_PROBE4(page__get, 3753 lgrp_t *, lgrp, 3754 int, mnode, 3755 ulong_t, bin, 3756 uint_t, flags); 3757 return (pp); 3758 } 3759 } 3760 ASSERT(pp == NULL); 3761 3762 /* 3763 * for non-SZC0 PAGESIZE requests, check cachelist before checking 3764 * remote free lists. Caller expected to call page_get_cachelist which 3765 * will check local cache lists and remote free lists. 3766 */ 3767 if (szc == 0 && ((flags & PGI_PGCPSZC0) == 0)) { 3768 VM_STAT_ADD(vmm_vmstats.pgf_allocdeferred); 3769 return (NULL); 3770 } 3771 3772 ASSERT(szc > 0 || (flags & PGI_PGCPSZC0)); 3773 3774 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1); 3775 3776 if (!(flags & PG_LOCAL)) { 3777 /* 3778 * Try to get a non-local freelist page. 3779 */ 3780 LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie); 3781 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { 3782 pp = page_get_func(mnode, bin, mtype, szc, flags); 3783 if (pp != NULL) { 3784 DTRACE_PROBE4(page__get, 3785 lgrp_t *, lgrp, 3786 int, mnode, 3787 ulong_t, bin, 3788 uint_t, flags); 3789 VM_STAT_ADD(vmm_vmstats.pgf_allocokrem[szc]); 3790 return (pp); 3791 } 3792 } 3793 ASSERT(pp == NULL); 3794 } 3795 3796 /* 3797 * when the cage is off chances are page_get_contig_pages() will fail 3798 * to lock a large page chunk therefore when the cage is off it's not 3799 * called by default. this can be changed via /etc/system. 3800 * 3801 * page_get_contig_pages() also called to acquire a base pagesize page 3802 * for page_create_get_something(). 3803 */ 3804 if (!(flags & PG_NORELOC) && (pg_contig_disable == 0) && 3805 (kcage_on || pg_lpgcreate_nocage || szc == 0) && 3806 (page_get_func != page_get_contig_pages)) { 3807 3808 VM_STAT_ADD(vmm_vmstats.pgf_allocretry[szc]); 3809 page_get_func = page_get_contig_pages; 3810 goto pgretry; 3811 } 3812 3813 if (!(flags & PG_LOCAL) && pgcplimitsearch && 3814 page_get_func == page_get_contig_pages) 3815 SETPGCPFAILCNT(szc); 3816 3817 VM_STAT_ADD(vmm_vmstats.pgf_allocfailed[szc]); 3818 return (NULL); 3819 } 3820 3821 /* 3822 * Find the `best' page on the cachelist for this (vp,off) (as,vaddr) pair. 
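page_get_freelist() above restarts its whole lgroup walk with page_get_contig_pages substituted for page_get_mnode_freelist once the per-mnode freelists come up empty; the allocator is a function pointer that is swapped at most once before the search is retried. A stripped-down sketch of that retry-with-a-different-allocator structure (the helper functions are made up):

#include <stddef.h>
#include <stdio.h>

typedef int *(*get_func_t)(int mnode);

static int *
get_from_freelists(int mnode)
{
	(void) mnode;
	return (NULL);		/* pretend the freelists are empty */
}

static int dummy_page = 42;

static int *
get_contiguous(int mnode)
{
	(void) mnode;
	return (&dummy_page);	/* pretend the expensive path succeeds */
}

int
main(void)
{
	get_func_t get_func = get_from_freelists;
	int *pp = NULL;
	int mnode;

retry:
	for (mnode = 0; mnode < 4; mnode++) {
		if ((pp = get_func(mnode)) != NULL)
			break;
	}
	if (pp == NULL && get_func != get_contiguous) {
		get_func = get_contiguous;	/* switch allocators once */
		goto retry;
	}
	printf("%s\n", pp != NULL ? "allocated" : "failed");
	return (0);
}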
3823 * 3824 * Does its own locking. 3825 * If PG_MATCH_COLOR is set, then NULL will be returned if there are no 3826 * pages of the proper color even if there are pages of a different color. 3827 * Otherwise, scan the bins for ones with pages. For each bin with pages, 3828 * try to lock one of them. If no page can be locked, try the 3829 * next bin. Return NULL if a page can not be found and locked. 3830 * 3831 * Finds a pages, trys to lock it, then removes it. 3832 */ 3833 3834 /*ARGSUSED*/ 3835 page_t * 3836 page_get_cachelist(struct vnode *vp, u_offset_t off, struct seg *seg, 3837 caddr_t vaddr, uint_t flags, struct lgrp *lgrp) 3838 { 3839 page_t *pp; 3840 struct as *as = seg->s_as; 3841 ulong_t bin; 3842 /*LINTED*/ 3843 int mnode; 3844 int mtype; 3845 lgrp_mnode_cookie_t lgrp_cookie; 3846 3847 /* 3848 * If we aren't passed a specific lgroup, or pasased a freed lgrp 3849 * assume we wish to allocate near to the current thread's home. 3850 */ 3851 if (!LGRP_EXISTS(lgrp)) 3852 lgrp = lgrp_home_lgrp(); 3853 3854 if (!kcage_on) { 3855 flags &= ~PG_NORELOC; 3856 flags |= PGI_NOCAGE; 3857 } 3858 3859 if ((flags & (PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == PG_NORELOC && 3860 kcage_freemem <= kcage_throttlefree) { 3861 /* 3862 * Reserve kcage_throttlefree pages for critical kernel 3863 * threads. 3864 * 3865 * Everybody else has to go to page_create_get_something() 3866 * to get a cage page, so we don't deadlock cageout. 3867 */ 3868 return (NULL); 3869 } 3870 3871 /* LINTED */ 3872 AS_2_BIN(as, seg, vp, vaddr, bin, 0); 3873 3874 ASSERT(bin < PAGE_GET_PAGECOLORS(0)); 3875 3876 /* LINTED */ 3877 MTYPE_INIT(mtype, vp, vaddr, flags, MMU_PAGESIZE); 3878 3879 VM_STAT_ADD(vmm_vmstats.pgc_alloc); 3880 3881 /* 3882 * Try local cachelists first 3883 */ 3884 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL); 3885 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { 3886 pp = page_get_mnode_cachelist(bin, flags, mnode, mtype); 3887 if (pp != NULL) { 3888 VM_STAT_ADD(vmm_vmstats.pgc_allocok); 3889 DTRACE_PROBE4(page__get, 3890 lgrp_t *, lgrp, 3891 int, mnode, 3892 ulong_t, bin, 3893 uint_t, flags); 3894 return (pp); 3895 } 3896 } 3897 3898 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1); 3899 3900 /* 3901 * Try freelists/cachelists that are farther away 3902 * This is our only chance to allocate remote pages for PAGESIZE 3903 * requests. 
3904 */ 3905 LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie); 3906 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { 3907 pp = page_get_mnode_freelist(mnode, bin, mtype, 3908 0, flags); 3909 if (pp != NULL) { 3910 VM_STAT_ADD(vmm_vmstats.pgc_allocokdeferred); 3911 DTRACE_PROBE4(page__get, 3912 lgrp_t *, lgrp, 3913 int, mnode, 3914 ulong_t, bin, 3915 uint_t, flags); 3916 return (pp); 3917 } 3918 pp = page_get_mnode_cachelist(bin, flags, mnode, mtype); 3919 if (pp != NULL) { 3920 VM_STAT_ADD(vmm_vmstats.pgc_allocokrem); 3921 DTRACE_PROBE4(page__get, 3922 lgrp_t *, lgrp, 3923 int, mnode, 3924 ulong_t, bin, 3925 uint_t, flags); 3926 return (pp); 3927 } 3928 } 3929 3930 VM_STAT_ADD(vmm_vmstats.pgc_allocfailed); 3931 return (NULL); 3932 } 3933 3934 page_t * 3935 page_get_mnode_cachelist(uint_t bin, uint_t flags, int mnode, int mtype) 3936 { 3937 kmutex_t *pcm; 3938 page_t *pp, *first_pp; 3939 uint_t sbin; 3940 int plw_initialized; 3941 page_list_walker_t plw; 3942 3943 VM_STAT_ADD(vmm_vmstats.pgmc_alloc); 3944 3945 /* LINTED */ 3946 MTYPE_START(mnode, mtype, flags); 3947 if (mtype < 0) { /* mnode does not have memory in mtype range */ 3948 VM_STAT_ADD(vmm_vmstats.pgmc_allocempty); 3949 return (NULL); 3950 } 3951 3952 try_again: 3953 3954 plw_initialized = 0; 3955 plw.plw_ceq_dif = 1; 3956 3957 /* 3958 * Only hold one cachelist lock at a time, that way we 3959 * can start anywhere and not have to worry about lock 3960 * ordering. 3961 */ 3962 3963 for (plw.plw_count = 0; 3964 plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) { 3965 sbin = bin; 3966 do { 3967 3968 if (!PAGE_CACHELISTS(mnode, bin, mtype)) 3969 goto bin_empty_1; 3970 pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST); 3971 mutex_enter(pcm); 3972 pp = PAGE_CACHELISTS(mnode, bin, mtype); 3973 if (pp == NULL) 3974 goto bin_empty_0; 3975 3976 first_pp = pp; 3977 ASSERT(pp->p_vnode); 3978 ASSERT(PP_ISAGED(pp) == 0); 3979 ASSERT(pp->p_szc == 0); 3980 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode); 3981 while (!page_trylock(pp, SE_EXCL)) { 3982 pp = pp->p_next; 3983 ASSERT(pp->p_szc == 0); 3984 if (pp == first_pp) { 3985 /* 3986 * We have searched the complete list! 3987 * And all of them (might only be one) 3988 * are locked. This can happen since 3989 * these pages can also be found via 3990 * the hash list. When found via the 3991 * hash list, they are locked first, 3992 * then removed. We give up to let the 3993 * other thread run. 3994 */ 3995 pp = NULL; 3996 break; 3997 } 3998 ASSERT(pp->p_vnode); 3999 ASSERT(PP_ISFREE(pp)); 4000 ASSERT(PP_ISAGED(pp) == 0); 4001 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == 4002 mnode); 4003 } 4004 4005 if (pp) { 4006 page_t **ppp; 4007 /* 4008 * Found and locked a page. 4009 * Pull it off the list. 4010 */ 4011 ASSERT(mtype == PP_2_MTYPE(pp)); 4012 ppp = &PAGE_CACHELISTS(mnode, bin, mtype); 4013 page_sub(ppp, pp); 4014 /* 4015 * Subtract counters before releasing pcm mutex 4016 * to avoid a race with page_freelist_coalesce 4017 * and page_freelist_split. 4018 */ 4019 page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST); 4020 mutex_exit(pcm); 4021 ASSERT(pp->p_vnode); 4022 ASSERT(PP_ISAGED(pp) == 0); 4023 #if defined(__sparc) 4024 ASSERT(!kcage_on || 4025 (flags & PG_NORELOC) == 0 || 4026 PP_ISNORELOC(pp)); 4027 if (PP_ISNORELOC(pp)) { 4028 kcage_freemem_sub(1); 4029 } 4030 #endif 4031 VM_STAT_ADD(vmm_vmstats. 

#ifdef DEBUG
#define REPL_PAGE_STATS
#endif /* DEBUG */

#ifdef REPL_PAGE_STATS
struct repl_page_stats {
        uint_t  ngets;
        uint_t  ngets_noreloc;
        uint_t  npgr_noreloc;
        uint_t  nnopage_first;
        uint_t  nnopage;
        uint_t  nhashout;
        uint_t  nnofree;
        uint_t  nnext_pp;
} repl_page_stats;
#define REPL_STAT_INCR(v)       atomic_add_32(&repl_page_stats.v, 1)
#else /* REPL_PAGE_STATS */
#define REPL_STAT_INCR(v)
#endif /* REPL_PAGE_STATS */

int     pgrppgcp;
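
/*
 * Observability note (illustrative): on a DEBUG kernel the counters in
 * repl_page_stats are live and can be read from the running system or from
 * a crash dump, for example with mdb:
 *
 *      # mdb -k
 *      > repl_page_stats::print
 *
 * On a non-DEBUG kernel REPL_STAT_INCR() compiles away and the structure
 * is not defined at all.
 */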

/*
 * The freemem accounting must be done by the caller.
 * First we try to get a replacement page of the same size as like_pp;
 * if that is not possible, then we just get a set of discontiguous
 * PAGESIZE pages.
 */
page_t *
page_get_replacement_page(page_t *orig_like_pp, struct lgrp *lgrp_target,
    uint_t pgrflags)
{
        page_t  *like_pp;
        page_t  *pp, *pplist;
        page_t  *pl = NULL;
        ulong_t bin;
        int     mnode, page_mnode;
        int     szc;
        spgcnt_t npgs, pg_cnt;
        pfn_t   pfnum;
        int     mtype;
        int     flags = 0;
        lgrp_mnode_cookie_t     lgrp_cookie;
        lgrp_t  *lgrp;

        REPL_STAT_INCR(ngets);
        like_pp = orig_like_pp;
        ASSERT(PAGE_EXCL(like_pp));

        szc = like_pp->p_szc;
        npgs = page_get_pagecnt(szc);
        /*
         * Now we reset like_pp to the base page_t.
         * That way, we won't walk past the end of this 'szc' page.
         */
        pfnum = PFN_BASE(like_pp->p_pagenum, szc);
        like_pp = page_numtopp_nolock(pfnum);
        ASSERT(like_pp->p_szc == szc);

        if (PP_ISNORELOC(like_pp)) {
                ASSERT(kcage_on);
                REPL_STAT_INCR(ngets_noreloc);
                flags = PGI_RELOCONLY;
        } else if (pgrflags & PGR_NORELOC) {
                ASSERT(kcage_on);
                REPL_STAT_INCR(npgr_noreloc);
                flags = PG_NORELOC;
        }

        /*
         * Kernel pages must always be replaced with the same size
         * pages, since we cannot properly handle demotion of kernel
         * pages.
         */
        if (PP_ISKAS(like_pp))
                pgrflags |= PGR_SAMESZC;

        /* LINTED */
        MTYPE_PGR_INIT(mtype, flags, like_pp, page_mnode, npgs);

        while (npgs) {
                pplist = NULL;
                for (;;) {
                        pg_cnt = page_get_pagecnt(szc);
                        bin = PP_2_BIN(like_pp);
                        ASSERT(like_pp->p_szc == orig_like_pp->p_szc);
                        ASSERT(pg_cnt <= npgs);

                        /*
                         * If an lgroup was specified, try to get the
                         * page from that lgroup.
                         * NOTE: Must be careful with code below because
                         * lgroup may disappear and reappear since there
                         * is no locking for lgroup here.
                         */
                        if (LGRP_EXISTS(lgrp_target)) {
                                /*
                                 * Keep local variable for lgroup separate
                                 * from lgroup argument since this code should
                                 * only be exercised when lgroup argument
                                 * exists....
                                 */
                                lgrp = lgrp_target;

                                /* Try the lgroup's freelists first */
                                LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
                                    LGRP_SRCH_LOCAL);
                                while ((pplist == NULL) &&
                                    (mnode = lgrp_memnode_choose(&lgrp_cookie))
                                    != -1) {
                                        pplist =
                                            page_get_mnode_freelist(mnode, bin,
                                            mtype, szc, flags);
                                }

                                /*
                                 * Now try its cachelists if this is a
                                 * small page. Don't need to do it for
                                 * larger ones since page_freelist_coalesce()
                                 * already failed.
                                 */
                                if (pplist != NULL || szc != 0)
                                        break;

                                /* Now try its cachelists */
                                LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
                                    LGRP_SRCH_LOCAL);

                                while ((pplist == NULL) &&
                                    (mnode = lgrp_memnode_choose(&lgrp_cookie))
                                    != -1) {
                                        pplist =
                                            page_get_mnode_cachelist(bin, flags,
                                            mnode, mtype);
                                }
                                if (pplist != NULL) {
                                        page_hashout(pplist, NULL);
                                        PP_SETAGED(pplist);
                                        REPL_STAT_INCR(nhashout);
                                        break;
                                }
                                /* Done looking in this lgroup. Bail out. */
                                break;
                        }

                        /*
                         * No lgroup was specified (or the lgroup was removed
                         * by DR), so just try to get the page as close to
                         * like_pp's mnode as possible.
                         * First try the local freelist...
                         */
                        mnode = PP_2_MEM_NODE(like_pp);
                        pplist = page_get_mnode_freelist(mnode, bin,
                            mtype, szc, flags);
                        if (pplist != NULL)
                                break;

                        REPL_STAT_INCR(nnofree);

                        /*
                         * ...then the local cachelist. Don't need to do it for
                         * larger pages because page_freelist_coalesce()
                         * already failed there anyway.
                         */
                        if (szc == 0) {
                                pplist = page_get_mnode_cachelist(bin, flags,
                                    mnode, mtype);
                                if (pplist != NULL) {
                                        page_hashout(pplist, NULL);
                                        PP_SETAGED(pplist);
                                        REPL_STAT_INCR(nhashout);
                                        break;
                                }
                        }

                        /* Now try remote freelists */
                        page_mnode = mnode;
                        lgrp =
                            lgrp_hand_to_lgrp(MEM_NODE_2_LGRPHAND(page_mnode));
                        LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
                            LGRP_SRCH_HIER);
                        while (pplist == NULL &&
                            (mnode = lgrp_memnode_choose(&lgrp_cookie))
                            != -1) {
                                /*
                                 * Skip local mnode.
                                 */
                                if ((mnode == page_mnode) ||
                                    (mem_node_config[mnode].exists == 0))
                                        continue;

                                pplist = page_get_mnode_freelist(mnode,
                                    bin, mtype, szc, flags);
                        }

                        if (pplist != NULL)
                                break;

                        /* Now try remote cachelists */
                        LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
                            LGRP_SRCH_HIER);
                        while (pplist == NULL && szc == 0) {
                                mnode = lgrp_memnode_choose(&lgrp_cookie);
                                if (mnode == -1)
                                        break;
                                /*
                                 * Skip local mnode.
                                 */
                                if ((mnode == page_mnode) ||
                                    (mem_node_config[mnode].exists == 0))
                                        continue;

                                pplist = page_get_mnode_cachelist(bin,
                                    flags, mnode, mtype);

                                if (pplist != NULL) {
                                        page_hashout(pplist, NULL);
                                        PP_SETAGED(pplist);
                                        REPL_STAT_INCR(nhashout);
                                        break;
                                }
                        }

                        /*
                         * Break out of the loop under the following cases:
                         * - If we successfully got a page.
                         * - If pgrflags specified only returning a specific
                         *   page size and we could not find that page size.
                         * - If we could not satisfy the request with PAGESIZE
                         *   or larger pages.
                         */
                        if (pplist != NULL || szc == 0)
                                break;

                        if ((pgrflags & PGR_SAMESZC) || pgrppgcp) {
                                /* try to find contig page */

                                LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
                                    LGRP_SRCH_HIER);

                                while ((pplist == NULL) &&
                                    (mnode =
                                    lgrp_memnode_choose(&lgrp_cookie))
                                    != -1) {
                                        pplist = page_get_contig_pages(
                                            mnode, bin, mtype, szc,
                                            flags | PGI_PGCPHIPRI);
                                }
                                break;
                        }

                        /*
                         * The correct thing to do here is try the next
                         * page size down using szc--. Due to a bug
                         * with the processing of HAT_RELOAD_SHARE
                         * where the sfmmu_ttecnt arrays of all
                         * hats sharing an ISM segment don't get updated,
                         * using intermediate size pages for relocation
                         * can lead to continuous page faults.
                         */
                        szc = 0;
                }

                if (pplist != NULL) {
                        DTRACE_PROBE4(page__get,
                            lgrp_t *, lgrp,
                            int, mnode,
                            ulong_t, bin,
                            uint_t, flags);

                        while (pplist != NULL && pg_cnt--) {
                                ASSERT(pplist != NULL);
                                pp = pplist;
                                page_sub(&pplist, pp);
                                PP_CLRFREE(pp);
                                PP_CLRAGED(pp);
                                page_list_concat(&pl, &pp);
                                npgs--;
                                like_pp = like_pp + 1;
                                REPL_STAT_INCR(nnext_pp);
                        }
                        ASSERT(pg_cnt == 0);
                } else {
                        break;
                }
        }

        if (npgs) {
                /*
                 * We were unable to allocate the necessary number
                 * of pages.
                 * We need to free up any pl.
                 */
                REPL_STAT_INCR(nnopage);
                page_free_replacement_page(pl);
                return (NULL);
        } else {
                return (pl);
        }
}
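
/*
 * Illustrative sketch only; the helper below is hypothetical and not an
 * existing interface. It shows the basic calling pattern for
 * page_get_replacement_page(): the target page must already be held
 * SE_EXCL, freemem accounting is left to the caller (see the block comment
 * above), and an unused replacement list is handed back with
 * page_free_replacement_page().
 */
static page_t *
example_alloc_replacement(page_t *targ)
{
        page_t  *repl;

        /* The caller must already hold the target exclusively. */
        ASSERT(PAGE_EXCL(targ));

        /*
         * Ask for a same-size replacement so a large target is not
         * satisfied with a set of loose PAGESIZE pages; passing a NULL
         * lgroup places the replacement as close to targ's mnode as
         * possible.
         */
        repl = page_get_replacement_page(targ, NULL, PGR_SAMESZC);

        /*
         * The pages in 'repl' would normally be handed to page_relocate();
         * a caller that backs out instead releases them with
         * page_free_replacement_page(repl).
         */
        return (repl);
}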

/*
 * demote a free large page to its constituent pages
 */
void
page_demote_free_pages(page_t *pp)
{
        int mnode;

        ASSERT(pp != NULL);
        ASSERT(PAGE_LOCKED(pp));
        ASSERT(PP_ISFREE(pp));
        ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);

        mnode = PP_2_MEM_NODE(pp);
        page_freelist_lock(mnode);
        if (pp->p_szc != 0) {
                (void) page_demote(mnode, PFN_BASE(pp->p_pagenum,
                    pp->p_szc), 0, pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
        }
        page_freelist_unlock(mnode);
        ASSERT(pp->p_szc == 0);
}

/*
 * Factor in colorequiv to check additional 'equivalent' bins.
 * colorequiv may be set in /etc/system.
 */
void
page_set_colorequiv_arr(void)
{
        if (colorequiv > 1) {
                int i;
                uint_t sv_a = lowbit(colorequiv) - 1;

                if (sv_a > 15)
                        sv_a = 15;

                for (i = 0; i < MMU_PAGE_SIZES; i++) {
                        uint_t colors;
                        uint_t a = sv_a;

                        if ((colors = hw_page_array[i].hp_colors) <= 1) {
                                continue;
                        }
                        while ((colors >> a) == 0)
                                a--;
                        if ((a << 4) > colorequivszc[i]) {
                                colorequivszc[i] = (a << 4);
                        }
                }
        }
}
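
/*
 * Worked example (illustrative): suppose colorequiv is set to 4 in
 * /etc/system. lowbit(4) == 3, so sv_a == 2. For a page size with
 * hw_page_array[i].hp_colors == 8, (8 >> 2) is nonzero, so a stays 2 and
 * colorequivszc[i] becomes (2 << 4) == 0x20 (assuming the platform had not
 * already set a larger value). For a page size with only 2 colors, the
 * while loop drops a to 1 and colorequivszc[i] becomes 0x10. See the
 * comment above colorequivszc[] for how the encoded value is interpreted.
 */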