1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 27 /* All Rights Reserved */ 28 29 /* 30 * Portions of this source code were derived from Berkeley 4.3 BSD 31 * under license from the Regents of the University of California. 32 */ 33 34 35 /* 36 * This file contains common functions to access and manage the page lists. 37 * Many of these routines originated from platform dependent modules 38 * (sun4/vm/vm_dep.c, i86pc/vm/vm_machdep.c) and modified to function in 39 * a platform independent manner. 40 * 41 * vm/vm_dep.h provides for platform specific support. 42 */ 43 44 #include <sys/types.h> 45 #include <sys/debug.h> 46 #include <sys/cmn_err.h> 47 #include <sys/systm.h> 48 #include <sys/atomic.h> 49 #include <sys/sysmacros.h> 50 #include <vm/as.h> 51 #include <vm/page.h> 52 #include <vm/seg_kmem.h> 53 #include <vm/seg_vn.h> 54 #include <sys/vmsystm.h> 55 #include <sys/memnode.h> 56 #include <vm/vm_dep.h> 57 #include <sys/lgrp.h> 58 #include <sys/mem_config.h> 59 #include <sys/callb.h> 60 #include <sys/mem_cage.h> 61 #include <sys/sdt.h> 62 63 extern uint_t vac_colors; 64 65 #define MAX_PRAGMA_ALIGN 128 66 67 /* vm_cpu_data0 for the boot cpu before kmem is initialized */ 68 69 #if L2CACHE_ALIGN_MAX <= MAX_PRAGMA_ALIGN 70 #pragma align L2CACHE_ALIGN_MAX(vm_cpu_data0) 71 #else 72 #pragma align MAX_PRAGMA_ALIGN(vm_cpu_data0) 73 #endif 74 char vm_cpu_data0[VM_CPU_DATA_PADSIZE]; 75 76 /* 77 * number of page colors equivalent to reqested color in page_get routines. 78 * If set, keeps large pages intact longer and keeps MPO allocation 79 * from the local mnode in favor of acquiring the 'correct' page color from 80 * a demoted large page or from a remote mnode. 81 */ 82 uint_t colorequiv; 83 84 /* 85 * color equivalency mask for each page size. 86 * Mask is computed based on cpu L2$ way sizes and colorequiv global. 87 * High 4 bits determine the number of high order bits of the color to ignore. 88 * Low 4 bits determines number of low order bits of color to ignore (it's only 89 * relevant for hashed index based page coloring). 90 */ 91 uchar_t colorequivszc[MMU_PAGE_SIZES]; 92 93 /* 94 * if set, specifies the percentage of large pages that are free from within 95 * a large page region before attempting to lock those pages for 96 * page_get_contig_pages processing. 97 * 98 * Should be turned on when kpr is available when page_trylock_contig_pages 99 * can be more selective. 100 */ 101 102 int ptcpthreshold; 103 104 /* 105 * Limit page get contig page search based on failure cnts in pgcpfailcnt[]. 
106 * Enabled by default via pgcplimitsearch. 107 * 108 * pgcpfailcnt[] is bounded by PGCPFAILMAX (>= 1/2 of installed 109 * memory). When reached, pgcpfailcnt[] is reset to 1/2 of this upper 110 * bound. This upper bound range guarantees: 111 * - all large page 'slots' will be searched over time 112 * - the minimum (1) large page candidates considered on each pgcp call 113 * - count doesn't wrap around to 0 114 */ 115 pgcnt_t pgcpfailcnt[MMU_PAGE_SIZES]; 116 int pgcplimitsearch = 1; 117 118 #define PGCPFAILMAX (1 << (highbit(physinstalled) - 1)) 119 #define SETPGCPFAILCNT(szc) \ 120 if (++pgcpfailcnt[szc] >= PGCPFAILMAX) \ 121 pgcpfailcnt[szc] = PGCPFAILMAX / 2; 122 123 #ifdef VM_STATS 124 struct vmm_vmstats_str vmm_vmstats; 125 126 #endif /* VM_STATS */ 127 128 #if defined(__sparc) 129 #define LPGCREATE 0 130 #else 131 /* enable page_get_contig_pages */ 132 #define LPGCREATE 1 133 #endif 134 135 int pg_contig_disable; 136 int pg_lpgcreate_nocage = LPGCREATE; 137 138 /* 139 * page_freelist_split pfn flag to signify no lo or hi pfn requirement. 140 */ 141 #define PFNNULL 0 142 143 /* Flags involved in promotion and demotion routines */ 144 #define PC_FREE 0x1 /* put page on freelist */ 145 #define PC_ALLOC 0x2 /* return page for allocation */ 146 147 /* 148 * Flag for page_demote to be used with PC_FREE to denote that we don't care 149 * what the color is as the color parameter to the function is ignored. 150 */ 151 #define PC_NO_COLOR (-1) 152 153 /* mtype value for page_promote to use when mtype does not matter */ 154 #define PC_MTYPE_ANY (-1) 155 156 /* 157 * page counters candidates info 158 * See page_ctrs_cands comment below for more details. 159 * fields are as follows: 160 * pcc_pages_free: # pages which freelist coalesce can create 161 * pcc_color_free: pointer to page free counts per color 162 */ 163 typedef struct pcc_info { 164 pgcnt_t pcc_pages_free; 165 pgcnt_t *pcc_color_free; 166 uint_t pad[12]; 167 } pcc_info_t; 168 169 /* 170 * On big machines it can take a long time to check page_counters 171 * arrays. page_ctrs_cands is a summary array whose elements are a dynamically 172 * updated sum of all elements of the corresponding page_counters arrays. 173 * page_freelist_coalesce() searches page_counters only if an appropriate 174 * element of page_ctrs_cands array is greater than 0. 175 * 176 * page_ctrs_cands is indexed by mutex (i), region (r), mnode (m), mrange (g) 177 */ 178 pcc_info_t **page_ctrs_cands[NPC_MUTEX][MMU_PAGE_SIZES]; 179 180 /* 181 * Return in val the total number of free pages which can be created 182 * for the given mnode (m), mrange (g), and region size (r) 183 */ 184 #define PGCTRS_CANDS_GETVALUE(m, g, r, val) { \ 185 int i; \ 186 val = 0; \ 187 for (i = 0; i < NPC_MUTEX; i++) { \ 188 val += page_ctrs_cands[i][(r)][(m)][(g)].pcc_pages_free; \ 189 } \ 190 } 191 192 /* 193 * Return in val the total number of free pages which can be created 194 * for the given mnode (m), mrange (g), region size (r), and color (c) 195 */ 196 #define PGCTRS_CANDS_GETVALUECOLOR(m, g, r, c, val) { \ 197 int i; \ 198 val = 0; \ 199 ASSERT((c) < PAGE_GET_PAGECOLORS(r)); \ 200 for (i = 0; i < NPC_MUTEX; i++) { \ 201 val += \ 202 page_ctrs_cands[i][(r)][(m)][(g)].pcc_color_free[(c)]; \ 203 } \ 204 } 205 206 /* 207 * We can only allow a single thread to update a counter within the physical 208 * range of the largest supported page size. That is the finest granularity 209 * possible since the counter values are dependent on each other 210 * as you move accross region sizes. 
PP_CTR_LOCK_INDX is used to determine the 211 * ctr_mutex lock index for a particular physical range. 212 */ 213 static kmutex_t *ctr_mutex[NPC_MUTEX]; 214 215 #define PP_CTR_LOCK_INDX(pp) \ 216 (((pp)->p_pagenum >> \ 217 (PAGE_BSZS_SHIFT(mmu_page_sizes - 1))) & (NPC_MUTEX - 1)) 218 219 #define INVALID_COLOR 0xffffffff 220 #define INVALID_MASK 0xffffffff 221 222 /* 223 * Local functions prototypes. 224 */ 225 226 void page_ctr_add(int, int, page_t *, int); 227 void page_ctr_add_internal(int, int, page_t *, int); 228 void page_ctr_sub(int, int, page_t *, int); 229 void page_ctr_sub_internal(int, int, page_t *, int); 230 void page_freelist_lock(int); 231 void page_freelist_unlock(int); 232 page_t *page_promote(int, pfn_t, uchar_t, int, int); 233 page_t *page_demote(int, pfn_t, pfn_t, uchar_t, uchar_t, int, int); 234 page_t *page_freelist_split(uchar_t, 235 uint_t, int, int, pfn_t, pfn_t, page_list_walker_t *); 236 page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int); 237 static int page_trylock_cons(page_t *pp, se_t se); 238 239 /* 240 * The page_counters array below is used to keep track of free contiguous 241 * physical memory. A hw_page_map_t will be allocated per mnode per szc. 242 * This contains an array of counters, the size of the array, a shift value 243 * used to convert a pagenum into a counter array index or vice versa, as 244 * well as a cache of the last successful index to be promoted to a larger 245 * page size. As an optimization, we keep track of the last successful index 246 * to be promoted per page color for the given size region, and this is 247 * allocated dynamically based upon the number of colors for a given 248 * region size. 249 * 250 * Conceptually, the page counters are represented as: 251 * 252 * page_counters[region_size][mnode] 253 * 254 * region_size: size code of a candidate larger page made up 255 * of contiguous free smaller pages. 256 * 257 * page_counters[region_size][mnode].hpm_counters[index]: 258 * represents how many (region_size - 1) pages either 259 * exist or can be created within the given index range. 260 * 261 * Let's look at a sparc example: 262 * If we want to create a free 512k page, we look at region_size 2 263 * for the mnode we want. We calculate the index and look at a specific 264 * hpm_counters location. If we see 8 (FULL_REGION_CNT on sparc) at 265 * this location, it means that 8 64k pages either exist or can be created 266 * from 8K pages in order to make a single free 512k page at the given 267 * index. Note that when a region is full, it will contribute to the 268 * counts in the region above it. Thus we will not know what page 269 * size the free pages will be which can be promoted to this new free 270 * page unless we look at all regions below the current region. 271 */ 272 273 /* 274 * Note: hpmctr_t is defined in platform vm_dep.h 275 * hw_page_map_t contains all the information needed for the page_counters 276 * logic. 
The fields are as follows: 277 * 278 * hpm_counters: dynamically allocated array to hold counter data 279 * hpm_entries: entries in hpm_counters 280 * hpm_shift: shift for pnum/array index conv 281 * hpm_base: PFN mapped to counter index 0 282 * hpm_color_current: last index in counter array for this color at 283 * which we successfully created a large page 284 */ 285 typedef struct hw_page_map { 286 hpmctr_t *hpm_counters; 287 size_t hpm_entries; 288 int hpm_shift; 289 pfn_t hpm_base; 290 size_t *hpm_color_current[MAX_MNODE_MRANGES]; 291 #if defined(__sparc) 292 uint_t pad[4]; 293 #endif 294 } hw_page_map_t; 295 296 /* 297 * Element zero is not used, but is allocated for convenience. 298 */ 299 static hw_page_map_t *page_counters[MMU_PAGE_SIZES]; 300 301 /* 302 * Cached value of MNODE_RANGE_CNT(mnode). 303 * This is a function call in x86. 304 */ 305 static int mnode_nranges[MAX_MEM_NODES]; 306 static int mnode_maxmrange[MAX_MEM_NODES]; 307 308 /* 309 * The following macros are convenient ways to get access to the individual 310 * elements of the page_counters arrays. They can be used on both 311 * the left side and right side of equations. 312 */ 313 #define PAGE_COUNTERS(mnode, rg_szc, idx) \ 314 (page_counters[(rg_szc)][(mnode)].hpm_counters[(idx)]) 315 316 #define PAGE_COUNTERS_COUNTERS(mnode, rg_szc) \ 317 (page_counters[(rg_szc)][(mnode)].hpm_counters) 318 319 #define PAGE_COUNTERS_SHIFT(mnode, rg_szc) \ 320 (page_counters[(rg_szc)][(mnode)].hpm_shift) 321 322 #define PAGE_COUNTERS_ENTRIES(mnode, rg_szc) \ 323 (page_counters[(rg_szc)][(mnode)].hpm_entries) 324 325 #define PAGE_COUNTERS_BASE(mnode, rg_szc) \ 326 (page_counters[(rg_szc)][(mnode)].hpm_base) 327 328 #define PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, rg_szc, g) \ 329 (page_counters[(rg_szc)][(mnode)].hpm_color_current[(g)]) 330 331 #define PAGE_COUNTERS_CURRENT_COLOR(mnode, rg_szc, color, mrange) \ 332 (page_counters[(rg_szc)][(mnode)]. \ 333 hpm_color_current[(mrange)][(color)]) 334 335 #define PNUM_TO_IDX(mnode, rg_szc, pnum) \ 336 (((pnum) - PAGE_COUNTERS_BASE((mnode), (rg_szc))) >> \ 337 PAGE_COUNTERS_SHIFT((mnode), (rg_szc))) 338 339 #define IDX_TO_PNUM(mnode, rg_szc, index) \ 340 (PAGE_COUNTERS_BASE((mnode), (rg_szc)) + \ 341 ((index) << PAGE_COUNTERS_SHIFT((mnode), (rg_szc)))) 342 343 /* 344 * Protects the hpm_counters and hpm_color_current memory from changing while 345 * looking at page counters information. 346 * Grab the write lock to modify what these fields point at. 347 * Grab the read lock to prevent any pointers from changing. 348 * The write lock can not be held during memory allocation due to a possible 349 * recursion deadlock with trying to grab the read lock while the 350 * write lock is already held. 351 */ 352 krwlock_t page_ctrs_rwlock[MAX_MEM_NODES]; 353 354 355 /* 356 * initialize cpu_vm_data to point at cache aligned vm_cpu_data_t. 357 */ 358 void 359 cpu_vm_data_init(struct cpu *cp) 360 { 361 if (cp == CPU0) { 362 cp->cpu_vm_data = (void *)&vm_cpu_data0; 363 } else { 364 void *kmptr; 365 int align; 366 size_t sz; 367 368 align = (L2CACHE_ALIGN) ? 
L2CACHE_ALIGN : L2CACHE_ALIGN_MAX; 369 sz = P2ROUNDUP(sizeof (vm_cpu_data_t), align) + align; 370 kmptr = kmem_zalloc(sz, KM_SLEEP); 371 cp->cpu_vm_data = (void *) P2ROUNDUP((uintptr_t)kmptr, align); 372 ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr = kmptr; 373 ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize = sz; 374 } 375 } 376 377 /* 378 * free cpu_vm_data 379 */ 380 void 381 cpu_vm_data_destroy(struct cpu *cp) 382 { 383 if (cp->cpu_seqid && cp->cpu_vm_data) { 384 ASSERT(cp != CPU0); 385 kmem_free(((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr, 386 ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize); 387 } 388 cp->cpu_vm_data = NULL; 389 } 390 391 392 /* 393 * page size to page size code 394 */ 395 int 396 page_szc(size_t pagesize) 397 { 398 int i = 0; 399 400 while (hw_page_array[i].hp_size) { 401 if (pagesize == hw_page_array[i].hp_size) 402 return (i); 403 i++; 404 } 405 return (-1); 406 } 407 408 /* 409 * page size to page size code with the restriction that it be a supported 410 * user page size. If it's not a supported user page size, -1 will be returned. 411 */ 412 int 413 page_szc_user_filtered(size_t pagesize) 414 { 415 int szc = page_szc(pagesize); 416 if ((szc != -1) && (SZC_2_USERSZC(szc) != -1)) { 417 return (szc); 418 } 419 return (-1); 420 } 421 422 /* 423 * Return how many page sizes are available for the user to use. This is 424 * what the hardware supports and not based upon how the OS implements the 425 * support of different page sizes. 426 * 427 * If legacy is non-zero, return the number of pagesizes available to legacy 428 * applications. The number of legacy page sizes might be less than the 429 * exported user page sizes. This is to prevent legacy applications that 430 * use the largest page size returned from getpagesizes(3c) from inadvertantly 431 * using the 'new' large pagesizes. 432 */ 433 uint_t 434 page_num_user_pagesizes(int legacy) 435 { 436 if (legacy) 437 return (mmu_legacy_page_sizes); 438 return (mmu_exported_page_sizes); 439 } 440 441 uint_t 442 page_num_pagesizes(void) 443 { 444 return (mmu_page_sizes); 445 } 446 447 /* 448 * returns the count of the number of base pagesize pages associated with szc 449 */ 450 pgcnt_t 451 page_get_pagecnt(uint_t szc) 452 { 453 if (szc >= mmu_page_sizes) 454 panic("page_get_pagecnt: out of range %d", szc); 455 return (hw_page_array[szc].hp_pgcnt); 456 } 457 458 size_t 459 page_get_pagesize(uint_t szc) 460 { 461 if (szc >= mmu_page_sizes) 462 panic("page_get_pagesize: out of range %d", szc); 463 return (hw_page_array[szc].hp_size); 464 } 465 466 /* 467 * Return the size of a page based upon the index passed in. An index of 468 * zero refers to the smallest page size in the system, and as index increases 469 * it refers to the next larger supported page size in the system. 470 * Note that szc and userszc may not be the same due to unsupported szc's on 471 * some systems. 
472 */ 473 size_t 474 page_get_user_pagesize(uint_t userszc) 475 { 476 uint_t szc = USERSZC_2_SZC(userszc); 477 478 if (szc >= mmu_page_sizes) 479 panic("page_get_user_pagesize: out of range %d", szc); 480 return (hw_page_array[szc].hp_size); 481 } 482 483 uint_t 484 page_get_shift(uint_t szc) 485 { 486 if (szc >= mmu_page_sizes) 487 panic("page_get_shift: out of range %d", szc); 488 return (PAGE_GET_SHIFT(szc)); 489 } 490 491 uint_t 492 page_get_pagecolors(uint_t szc) 493 { 494 if (szc >= mmu_page_sizes) 495 panic("page_get_pagecolors: out of range %d", szc); 496 return (PAGE_GET_PAGECOLORS(szc)); 497 } 498 499 /* 500 * this assigns the desired equivalent color after a split 501 */ 502 uint_t 503 page_correct_color(uchar_t szc, uchar_t nszc, uint_t color, 504 uint_t ncolor, uint_t ceq_mask) 505 { 506 ASSERT(nszc > szc); 507 ASSERT(szc < mmu_page_sizes); 508 ASSERT(color < PAGE_GET_PAGECOLORS(szc)); 509 ASSERT(ncolor < PAGE_GET_PAGECOLORS(nszc)); 510 511 color &= ceq_mask; 512 ncolor = PAGE_CONVERT_COLOR(ncolor, szc, nszc); 513 return (color | (ncolor & ~ceq_mask)); 514 } 515 516 /* 517 * The interleaved_mnodes flag is set when mnodes overlap in 518 * the physbase..physmax range, but have disjoint slices. 519 * In this case hpm_counters is shared by all mnodes. 520 * This flag is set dynamically by the platform. 521 */ 522 int interleaved_mnodes = 0; 523 524 /* 525 * Called by startup(). 526 * Size up the per page size free list counters based on physmax 527 * of each node and max_mem_nodes. 528 * 529 * If interleaved_mnodes is set we need to find the first mnode that 530 * exists. hpm_counters for the first mnode will then be shared by 531 * all other mnodes. If interleaved_mnodes is not set, just set 532 * first=mnode each time. That means there will be no sharing. 533 */ 534 size_t 535 page_ctrs_sz(void) 536 { 537 int r; /* region size */ 538 int mnode; 539 int firstmn; /* first mnode that exists */ 540 int nranges; 541 pfn_t physbase; 542 pfn_t physmax; 543 uint_t ctrs_sz = 0; 544 int i; 545 pgcnt_t colors_per_szc[MMU_PAGE_SIZES]; 546 547 /* 548 * We need to determine how many page colors there are for each 549 * page size in order to allocate memory for any color specific 550 * arrays. 551 */ 552 for (i = 0; i < mmu_page_sizes; i++) { 553 colors_per_szc[i] = PAGE_GET_PAGECOLORS(i); 554 } 555 556 for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) { 557 558 pgcnt_t r_pgcnt; 559 pfn_t r_base; 560 pgcnt_t r_align; 561 562 if (mem_node_config[mnode].exists == 0) 563 continue; 564 565 HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn); 566 nranges = MNODE_RANGE_CNT(mnode); 567 mnode_nranges[mnode] = nranges; 568 mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode); 569 570 /* 571 * determine size needed for page counter arrays with 572 * base aligned to large page size. 573 */ 574 for (r = 1; r < mmu_page_sizes; r++) { 575 /* add in space for hpm_color_current */ 576 ctrs_sz += sizeof (size_t) * 577 colors_per_szc[r] * nranges; 578 579 if (firstmn != mnode) 580 continue; 581 582 /* add in space for hpm_counters */ 583 r_align = page_get_pagecnt(r); 584 r_base = physbase; 585 r_base &= ~(r_align - 1); 586 r_pgcnt = howmany(physmax - r_base + 1, r_align); 587 588 /* 589 * Round up to always allocate on pointer sized 590 * boundaries. 
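 *
 * Illustrative example (numbers are assumptions, not taken from the
 * code): with 8K base pages and a region size r spanning 512 base
 * pages (r_align == 512), physbase == 0x10123 and physmax == 0x4ffff
 * give r_base == 0x10000 and r_pgcnt == howmany(0x40000, 512) == 512,
 * i.e. 512 hpmctr_t counters for this mnode and region size, rounded
 * up to a pointer-sized boundary.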
591 */ 592 ctrs_sz += P2ROUNDUP((r_pgcnt * sizeof (hpmctr_t)), 593 sizeof (hpmctr_t *)); 594 } 595 } 596 597 for (r = 1; r < mmu_page_sizes; r++) { 598 ctrs_sz += (max_mem_nodes * sizeof (hw_page_map_t)); 599 } 600 601 /* add in space for page_ctrs_cands and pcc_color_free */ 602 ctrs_sz += sizeof (pcc_info_t *) * max_mem_nodes * 603 mmu_page_sizes * NPC_MUTEX; 604 605 for (mnode = 0; mnode < max_mem_nodes; mnode++) { 606 607 if (mem_node_config[mnode].exists == 0) 608 continue; 609 610 nranges = mnode_nranges[mnode]; 611 ctrs_sz += sizeof (pcc_info_t) * nranges * 612 mmu_page_sizes * NPC_MUTEX; 613 for (r = 1; r < mmu_page_sizes; r++) { 614 ctrs_sz += sizeof (pgcnt_t) * nranges * 615 colors_per_szc[r] * NPC_MUTEX; 616 } 617 } 618 619 /* ctr_mutex */ 620 ctrs_sz += (max_mem_nodes * NPC_MUTEX * sizeof (kmutex_t)); 621 622 /* size for page list counts */ 623 PLCNT_SZ(ctrs_sz); 624 625 /* 626 * add some slop for roundups. page_ctrs_alloc will roundup the start 627 * address of the counters to ecache_alignsize boundary for every 628 * memory node. 629 */ 630 return (ctrs_sz + max_mem_nodes * L2CACHE_ALIGN); 631 } 632 633 caddr_t 634 page_ctrs_alloc(caddr_t alloc_base) 635 { 636 int mnode; 637 int mrange, nranges; 638 int r; /* region size */ 639 int i; 640 int firstmn; /* first mnode that exists */ 641 pfn_t physbase; 642 pfn_t physmax; 643 pgcnt_t colors_per_szc[MMU_PAGE_SIZES]; 644 645 /* 646 * We need to determine how many page colors there are for each 647 * page size in order to allocate memory for any color specific 648 * arrays. 649 */ 650 for (i = 0; i < mmu_page_sizes; i++) { 651 colors_per_szc[i] = PAGE_GET_PAGECOLORS(i); 652 } 653 654 for (r = 1; r < mmu_page_sizes; r++) { 655 page_counters[r] = (hw_page_map_t *)alloc_base; 656 alloc_base += (max_mem_nodes * sizeof (hw_page_map_t)); 657 } 658 659 /* page_ctrs_cands and pcc_color_free array */ 660 for (i = 0; i < NPC_MUTEX; i++) { 661 for (r = 1; r < mmu_page_sizes; r++) { 662 663 page_ctrs_cands[i][r] = (pcc_info_t **)alloc_base; 664 alloc_base += sizeof (pcc_info_t *) * max_mem_nodes; 665 666 for (mnode = 0; mnode < max_mem_nodes; mnode++) { 667 pcc_info_t *pi; 668 669 if (mem_node_config[mnode].exists == 0) 670 continue; 671 672 nranges = mnode_nranges[mnode]; 673 674 pi = (pcc_info_t *)alloc_base; 675 alloc_base += sizeof (pcc_info_t) * nranges; 676 page_ctrs_cands[i][r][mnode] = pi; 677 678 for (mrange = 0; mrange < nranges; mrange++) { 679 pi->pcc_color_free = 680 (pgcnt_t *)alloc_base; 681 alloc_base += sizeof (pgcnt_t) * 682 colors_per_szc[r]; 683 pi++; 684 } 685 } 686 } 687 } 688 689 /* ctr_mutex */ 690 for (i = 0; i < NPC_MUTEX; i++) { 691 ctr_mutex[i] = (kmutex_t *)alloc_base; 692 alloc_base += (max_mem_nodes * sizeof (kmutex_t)); 693 } 694 695 /* initialize page list counts */ 696 PLCNT_INIT(alloc_base); 697 698 for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) { 699 700 pgcnt_t r_pgcnt; 701 pfn_t r_base; 702 pgcnt_t r_align; 703 int r_shift; 704 int nranges = mnode_nranges[mnode]; 705 706 if (mem_node_config[mnode].exists == 0) 707 continue; 708 709 HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn); 710 711 for (r = 1; r < mmu_page_sizes; r++) { 712 /* 713 * the page_counters base has to be aligned to the 714 * page count of page size code r otherwise the counts 715 * will cross large page boundaries. 
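 *
 * Illustrative example (assumed numbers): for a region spanning 512
 * base pages (r_align == 512, r_shift == 9), physbase == 0x12345 is
 * lowered to r_base == 0x12200, so pfn 0x12400 later maps to counter
 * index (0x12400 - 0x12200) >> 9 == 1 via PNUM_TO_IDX().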
716 */ 717 r_align = page_get_pagecnt(r); 718 r_base = physbase; 719 /* base needs to be aligned - lower to aligned value */ 720 r_base &= ~(r_align - 1); 721 r_pgcnt = howmany(physmax - r_base + 1, r_align); 722 r_shift = PAGE_BSZS_SHIFT(r); 723 724 PAGE_COUNTERS_SHIFT(mnode, r) = r_shift; 725 PAGE_COUNTERS_ENTRIES(mnode, r) = r_pgcnt; 726 PAGE_COUNTERS_BASE(mnode, r) = r_base; 727 for (mrange = 0; mrange < nranges; mrange++) { 728 PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, 729 r, mrange) = (size_t *)alloc_base; 730 alloc_base += sizeof (size_t) * 731 colors_per_szc[r]; 732 } 733 for (i = 0; i < colors_per_szc[r]; i++) { 734 uint_t color_mask = colors_per_szc[r] - 1; 735 pfn_t pfnum = r_base; 736 size_t idx; 737 int mrange; 738 MEM_NODE_ITERATOR_DECL(it); 739 740 MEM_NODE_ITERATOR_INIT(pfnum, mnode, r, &it); 741 if (pfnum == (pfn_t)-1) { 742 idx = 0; 743 } else { 744 PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i, 745 color_mask, color_mask, &it); 746 idx = PNUM_TO_IDX(mnode, r, pfnum); 747 idx = (idx >= r_pgcnt) ? 0 : idx; 748 } 749 for (mrange = 0; mrange < nranges; mrange++) { 750 PAGE_COUNTERS_CURRENT_COLOR(mnode, 751 r, i, mrange) = idx; 752 } 753 } 754 755 /* hpm_counters may be shared by all mnodes */ 756 if (firstmn == mnode) { 757 PAGE_COUNTERS_COUNTERS(mnode, r) = 758 (hpmctr_t *)alloc_base; 759 alloc_base += 760 P2ROUNDUP((sizeof (hpmctr_t) * r_pgcnt), 761 sizeof (hpmctr_t *)); 762 } else { 763 PAGE_COUNTERS_COUNTERS(mnode, r) = 764 PAGE_COUNTERS_COUNTERS(firstmn, r); 765 } 766 767 /* 768 * Verify that PNUM_TO_IDX and IDX_TO_PNUM 769 * satisfy the identity requirement. 770 * We should be able to go from one to the other 771 * and get consistent values. 772 */ 773 ASSERT(PNUM_TO_IDX(mnode, r, 774 (IDX_TO_PNUM(mnode, r, 0))) == 0); 775 ASSERT(IDX_TO_PNUM(mnode, r, 776 (PNUM_TO_IDX(mnode, r, r_base))) == r_base); 777 } 778 /* 779 * Roundup the start address of the page_counters to 780 * cache aligned boundary for every memory node. 781 * page_ctrs_sz() has added some slop for these roundups. 782 */ 783 alloc_base = (caddr_t)P2ROUNDUP((uintptr_t)alloc_base, 784 L2CACHE_ALIGN); 785 } 786 787 /* Initialize other page counter specific data structures. */ 788 for (mnode = 0; mnode < MAX_MEM_NODES; mnode++) { 789 rw_init(&page_ctrs_rwlock[mnode], NULL, RW_DEFAULT, NULL); 790 } 791 792 return (alloc_base); 793 } 794 795 /* 796 * Functions to adjust region counters for each size free list. 797 * Caller is responsible to acquire the ctr_mutex lock if necessary and 798 * thus can be called during startup without locks. 799 */ 800 /* ARGSUSED */ 801 void 802 page_ctr_add_internal(int mnode, int mtype, page_t *pp, int flags) 803 { 804 ssize_t r; /* region size */ 805 ssize_t idx; 806 pfn_t pfnum; 807 int lckidx; 808 809 ASSERT(mnode == PP_2_MEM_NODE(pp)); 810 ASSERT(mtype == PP_2_MTYPE(pp)); 811 812 ASSERT(pp->p_szc < mmu_page_sizes); 813 814 PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags); 815 816 /* no counter update needed for largest page size */ 817 if (pp->p_szc >= mmu_page_sizes - 1) { 818 return; 819 } 820 821 r = pp->p_szc + 1; 822 pfnum = pp->p_pagenum; 823 lckidx = PP_CTR_LOCK_INDX(pp); 824 825 /* 826 * Increment the count of free pages for the current 827 * region. Continue looping up in region size incrementing 828 * count if the preceeding region is full. 
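 *
 * For example (using the sparc sizes from the page_counters comment
 * above), freeing the 8K page that brings a 64K region's counter up
 * to FULL_REGION_CNT also bumps that region's pcc_pages_free and
 * pcc_color_free candidate counts and then moves up to increment the
 * enclosing 512K region's counter; the walk stops at the first
 * region that is still not full.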
829 */ 830 while (r < mmu_page_sizes) { 831 idx = PNUM_TO_IDX(mnode, r, pfnum); 832 833 ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r)); 834 ASSERT(PAGE_COUNTERS(mnode, r, idx) < FULL_REGION_CNT(r)); 835 836 if (++PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r)) { 837 break; 838 } else { 839 int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r)); 840 pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode] 841 [MTYPE_2_MRANGE(mnode, root_mtype)]; 842 843 cand->pcc_pages_free++; 844 cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]++; 845 } 846 r++; 847 } 848 } 849 850 void 851 page_ctr_add(int mnode, int mtype, page_t *pp, int flags) 852 { 853 int lckidx = PP_CTR_LOCK_INDX(pp); 854 kmutex_t *lock = &ctr_mutex[lckidx][mnode]; 855 856 mutex_enter(lock); 857 page_ctr_add_internal(mnode, mtype, pp, flags); 858 mutex_exit(lock); 859 } 860 861 void 862 page_ctr_sub_internal(int mnode, int mtype, page_t *pp, int flags) 863 { 864 int lckidx; 865 ssize_t r; /* region size */ 866 ssize_t idx; 867 pfn_t pfnum; 868 869 ASSERT(mnode == PP_2_MEM_NODE(pp)); 870 ASSERT(mtype == PP_2_MTYPE(pp)); 871 872 ASSERT(pp->p_szc < mmu_page_sizes); 873 874 PLCNT_DECR(pp, mnode, mtype, pp->p_szc, flags); 875 876 /* no counter update needed for largest page size */ 877 if (pp->p_szc >= mmu_page_sizes - 1) { 878 return; 879 } 880 881 r = pp->p_szc + 1; 882 pfnum = pp->p_pagenum; 883 lckidx = PP_CTR_LOCK_INDX(pp); 884 885 /* 886 * Decrement the count of free pages for the current 887 * region. Continue looping up in region size decrementing 888 * count if the preceeding region was full. 889 */ 890 while (r < mmu_page_sizes) { 891 idx = PNUM_TO_IDX(mnode, r, pfnum); 892 893 ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r)); 894 ASSERT(PAGE_COUNTERS(mnode, r, idx) > 0); 895 896 if (--PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r) - 1) { 897 break; 898 } else { 899 int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r)); 900 pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode] 901 [MTYPE_2_MRANGE(mnode, root_mtype)]; 902 903 ASSERT(cand->pcc_pages_free != 0); 904 ASSERT(cand->pcc_color_free[PP_2_BIN_SZC(pp, r)] != 0); 905 906 cand->pcc_pages_free--; 907 cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]--; 908 } 909 r++; 910 } 911 } 912 913 void 914 page_ctr_sub(int mnode, int mtype, page_t *pp, int flags) 915 { 916 int lckidx = PP_CTR_LOCK_INDX(pp); 917 kmutex_t *lock = &ctr_mutex[lckidx][mnode]; 918 919 mutex_enter(lock); 920 page_ctr_sub_internal(mnode, mtype, pp, flags); 921 mutex_exit(lock); 922 } 923 924 /* 925 * Adjust page counters following a memory attach, since typically the 926 * size of the array needs to change, and the PFN to counter index 927 * mapping needs to change. 928 * 929 * It is possible this mnode did not exist at startup. In that case 930 * allocate pcc_info_t and pcc_color_free arrays. Also, allow for nranges 931 * to change (a theoretical possibility on x86), which means pcc_color_free 932 * arrays must be extended. 
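 *
 * For example, an attach that raises physmax grows npgs, so larger
 * hpm_counters arrays are preallocated below and the overlapping
 * portion of the old counts is copied across (see the bcopy of the
 * intersection) before the old arrays are freed in the cleanup path.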
933 */ 934 uint_t 935 page_ctrs_adjust(int mnode) 936 { 937 pgcnt_t npgs; 938 int r; /* region size */ 939 int i; 940 size_t pcsz, old_csz; 941 hpmctr_t *new_ctr, *old_ctr; 942 pfn_t oldbase, newbase; 943 pfn_t physbase, physmax; 944 size_t old_npgs; 945 hpmctr_t *ctr_cache[MMU_PAGE_SIZES]; 946 size_t size_cache[MMU_PAGE_SIZES]; 947 size_t *color_cache[MMU_PAGE_SIZES][MAX_MNODE_MRANGES]; 948 size_t *old_color_array[MAX_MNODE_MRANGES]; 949 pgcnt_t colors_per_szc[MMU_PAGE_SIZES]; 950 pcc_info_t **cands_cache; 951 pcc_info_t *old_pi, *pi; 952 pgcnt_t *pgcntp; 953 int nr, old_nranges, mrange, nranges = MNODE_RANGE_CNT(mnode); 954 int cands_cache_nranges; 955 int old_maxmrange, new_maxmrange; 956 int rc = 0; 957 int oldmnode; 958 959 cands_cache = kmem_zalloc(sizeof (pcc_info_t *) * NPC_MUTEX * 960 MMU_PAGE_SIZES, KM_NOSLEEP); 961 if (cands_cache == NULL) 962 return (ENOMEM); 963 964 i = -1; 965 HPM_COUNTERS_LIMITS(mnode, physbase, physmax, i); 966 967 newbase = physbase & ~PC_BASE_ALIGN_MASK; 968 npgs = roundup(physmax, PC_BASE_ALIGN) - newbase; 969 970 /* prepare to free non-null pointers on the way out */ 971 cands_cache_nranges = nranges; 972 bzero(ctr_cache, sizeof (ctr_cache)); 973 bzero(color_cache, sizeof (color_cache)); 974 975 /* 976 * We need to determine how many page colors there are for each 977 * page size in order to allocate memory for any color specific 978 * arrays. 979 */ 980 for (r = 0; r < mmu_page_sizes; r++) { 981 colors_per_szc[r] = PAGE_GET_PAGECOLORS(r); 982 } 983 984 /* 985 * Preallocate all of the new hpm_counters arrays as we can't 986 * hold the page_ctrs_rwlock as a writer and allocate memory. 987 * If we can't allocate all of the arrays, undo our work so far 988 * and return failure. 989 */ 990 for (r = 1; r < mmu_page_sizes; r++) { 991 pcsz = npgs >> PAGE_BSZS_SHIFT(r); 992 size_cache[r] = pcsz; 993 ctr_cache[r] = kmem_zalloc(pcsz * 994 sizeof (hpmctr_t), KM_NOSLEEP); 995 if (ctr_cache[r] == NULL) { 996 rc = ENOMEM; 997 goto cleanup; 998 } 999 } 1000 1001 /* 1002 * Preallocate all of the new color current arrays as we can't 1003 * hold the page_ctrs_rwlock as a writer and allocate memory. 1004 * If we can't allocate all of the arrays, undo our work so far 1005 * and return failure. 1006 */ 1007 for (r = 1; r < mmu_page_sizes; r++) { 1008 for (mrange = 0; mrange < nranges; mrange++) { 1009 color_cache[r][mrange] = kmem_zalloc(sizeof (size_t) * 1010 colors_per_szc[r], KM_NOSLEEP); 1011 if (color_cache[r][mrange] == NULL) { 1012 rc = ENOMEM; 1013 goto cleanup; 1014 } 1015 } 1016 } 1017 1018 /* 1019 * Preallocate all of the new pcc_info_t arrays as we can't 1020 * hold the page_ctrs_rwlock as a writer and allocate memory. 1021 * If we can't allocate all of the arrays, undo our work so far 1022 * and return failure. 1023 */ 1024 for (r = 1; r < mmu_page_sizes; r++) { 1025 for (i = 0; i < NPC_MUTEX; i++) { 1026 pi = kmem_zalloc(nranges * sizeof (pcc_info_t), 1027 KM_NOSLEEP); 1028 if (pi == NULL) { 1029 rc = ENOMEM; 1030 goto cleanup; 1031 } 1032 cands_cache[i * MMU_PAGE_SIZES + r] = pi; 1033 1034 for (mrange = 0; mrange < nranges; mrange++, pi++) { 1035 pgcntp = kmem_zalloc(colors_per_szc[r] * 1036 sizeof (pgcnt_t), KM_NOSLEEP); 1037 if (pgcntp == NULL) { 1038 rc = ENOMEM; 1039 goto cleanup; 1040 } 1041 pi->pcc_color_free = pgcntp; 1042 } 1043 } 1044 } 1045 1046 /* 1047 * Grab the write lock to prevent others from walking these arrays 1048 * while we are modifying them. 
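 *
 * All of the replacement arrays were preallocated above because
 * memory cannot be allocated while holding this lock as a writer
 * (see the comment at page_ctrs_rwlock).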
1049 */ 1050 PAGE_CTRS_WRITE_LOCK(mnode); 1051 1052 /* 1053 * For interleaved mnodes, find the first mnode 1054 * with valid page counters since the current 1055 * mnode may have just been added and not have 1056 * valid page counters. 1057 */ 1058 if (interleaved_mnodes) { 1059 for (i = 0; i < max_mem_nodes; i++) 1060 if (PAGE_COUNTERS_COUNTERS(i, 1) != NULL) 1061 break; 1062 ASSERT(i < max_mem_nodes); 1063 oldmnode = i; 1064 } else 1065 oldmnode = mnode; 1066 1067 old_nranges = mnode_nranges[mnode]; 1068 cands_cache_nranges = old_nranges; 1069 mnode_nranges[mnode] = nranges; 1070 old_maxmrange = mnode_maxmrange[mnode]; 1071 mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode); 1072 new_maxmrange = mnode_maxmrange[mnode]; 1073 1074 for (r = 1; r < mmu_page_sizes; r++) { 1075 PAGE_COUNTERS_SHIFT(mnode, r) = PAGE_BSZS_SHIFT(r); 1076 old_ctr = PAGE_COUNTERS_COUNTERS(oldmnode, r); 1077 old_csz = PAGE_COUNTERS_ENTRIES(oldmnode, r); 1078 oldbase = PAGE_COUNTERS_BASE(oldmnode, r); 1079 old_npgs = old_csz << PAGE_COUNTERS_SHIFT(oldmnode, r); 1080 for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) { 1081 old_color_array[mrange] = 1082 PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, 1083 r, mrange); 1084 } 1085 1086 pcsz = npgs >> PAGE_COUNTERS_SHIFT(mnode, r); 1087 new_ctr = ctr_cache[r]; 1088 ctr_cache[r] = NULL; 1089 if (old_ctr != NULL && 1090 (oldbase + old_npgs > newbase) && 1091 (newbase + npgs > oldbase)) { 1092 /* 1093 * Map the intersection of the old and new 1094 * counters into the new array. 1095 */ 1096 size_t offset; 1097 if (newbase > oldbase) { 1098 offset = (newbase - oldbase) >> 1099 PAGE_COUNTERS_SHIFT(mnode, r); 1100 bcopy(old_ctr + offset, new_ctr, 1101 MIN(pcsz, (old_csz - offset)) * 1102 sizeof (hpmctr_t)); 1103 } else { 1104 offset = (oldbase - newbase) >> 1105 PAGE_COUNTERS_SHIFT(mnode, r); 1106 bcopy(old_ctr, new_ctr + offset, 1107 MIN(pcsz - offset, old_csz) * 1108 sizeof (hpmctr_t)); 1109 } 1110 } 1111 1112 PAGE_COUNTERS_COUNTERS(mnode, r) = new_ctr; 1113 PAGE_COUNTERS_ENTRIES(mnode, r) = pcsz; 1114 PAGE_COUNTERS_BASE(mnode, r) = newbase; 1115 1116 /* update shared hpm_counters in other mnodes */ 1117 if (interleaved_mnodes) { 1118 for (i = 0; i < max_mem_nodes; i++) { 1119 if (i == mnode) 1120 continue; 1121 ASSERT( 1122 PAGE_COUNTERS_COUNTERS(i, r) == old_ctr || 1123 PAGE_COUNTERS_COUNTERS(i, r) == NULL); 1124 if (mem_node_config[i].exists == 0) 1125 continue; 1126 PAGE_COUNTERS_COUNTERS(i, r) = new_ctr; 1127 PAGE_COUNTERS_ENTRIES(i, r) = pcsz; 1128 PAGE_COUNTERS_BASE(i, r) = newbase; 1129 } 1130 } 1131 1132 for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) { 1133 PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r, mrange) = 1134 color_cache[r][mrange]; 1135 color_cache[r][mrange] = NULL; 1136 } 1137 /* 1138 * for now, just reset on these events as it's probably 1139 * not worthwhile to try and optimize this. 1140 */ 1141 for (i = 0; i < colors_per_szc[r]; i++) { 1142 uint_t color_mask = colors_per_szc[r] - 1; 1143 int mlo = interleaved_mnodes ? 0 : mnode; 1144 int mhi = interleaved_mnodes ? max_mem_nodes : 1145 (mnode + 1); 1146 int m; 1147 pfn_t pfnum; 1148 size_t idx; 1149 MEM_NODE_ITERATOR_DECL(it); 1150 1151 for (m = mlo; m < mhi; m++) { 1152 if (mem_node_config[m].exists == 0) 1153 continue; 1154 pfnum = newbase; 1155 MEM_NODE_ITERATOR_INIT(pfnum, m, r, &it); 1156 if (pfnum == (pfn_t)-1) { 1157 idx = 0; 1158 } else { 1159 PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i, 1160 color_mask, color_mask, &it); 1161 idx = PNUM_TO_IDX(m, r, pfnum); 1162 idx = (idx < pcsz) ? 
idx : 0; 1163 } 1164 for (mrange = 0; mrange < nranges; mrange++) { 1165 if (PAGE_COUNTERS_CURRENT_COLOR_ARRAY(m, 1166 r, mrange) != NULL) 1167 PAGE_COUNTERS_CURRENT_COLOR(m, 1168 r, i, mrange) = idx; 1169 } 1170 } 1171 } 1172 1173 /* cache info for freeing out of the critical path */ 1174 if ((caddr_t)old_ctr >= kernelheap && 1175 (caddr_t)old_ctr < ekernelheap) { 1176 ctr_cache[r] = old_ctr; 1177 size_cache[r] = old_csz; 1178 } 1179 for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) { 1180 size_t *tmp = old_color_array[mrange]; 1181 if ((caddr_t)tmp >= kernelheap && 1182 (caddr_t)tmp < ekernelheap) { 1183 color_cache[r][mrange] = tmp; 1184 } 1185 } 1186 /* 1187 * Verify that PNUM_TO_IDX and IDX_TO_PNUM 1188 * satisfy the identity requirement. 1189 * We should be able to go from one to the other 1190 * and get consistent values. 1191 */ 1192 ASSERT(PNUM_TO_IDX(mnode, r, 1193 (IDX_TO_PNUM(mnode, r, 0))) == 0); 1194 ASSERT(IDX_TO_PNUM(mnode, r, 1195 (PNUM_TO_IDX(mnode, r, newbase))) == newbase); 1196 1197 /* pcc_info_t and pcc_color_free */ 1198 for (i = 0; i < NPC_MUTEX; i++) { 1199 pcc_info_t *epi; 1200 pcc_info_t *eold_pi; 1201 1202 pi = cands_cache[i * MMU_PAGE_SIZES + r]; 1203 old_pi = page_ctrs_cands[i][r][mnode]; 1204 page_ctrs_cands[i][r][mnode] = pi; 1205 cands_cache[i * MMU_PAGE_SIZES + r] = old_pi; 1206 1207 /* preserve old pcc_color_free values, if any */ 1208 if (old_pi == NULL) 1209 continue; 1210 1211 /* 1212 * when/if x86 does DR, must account for 1213 * possible change in range index when 1214 * preserving pcc_info 1215 */ 1216 epi = &pi[nranges]; 1217 eold_pi = &old_pi[old_nranges]; 1218 if (new_maxmrange > old_maxmrange) { 1219 pi += new_maxmrange - old_maxmrange; 1220 } else if (new_maxmrange < old_maxmrange) { 1221 old_pi += old_maxmrange - new_maxmrange; 1222 } 1223 for (; pi < epi && old_pi < eold_pi; pi++, old_pi++) { 1224 pcc_info_t tmp = *pi; 1225 *pi = *old_pi; 1226 *old_pi = tmp; 1227 } 1228 } 1229 } 1230 PAGE_CTRS_WRITE_UNLOCK(mnode); 1231 1232 /* 1233 * Now that we have dropped the write lock, it is safe to free all 1234 * of the memory we have cached above. 1235 * We come thru here to free memory when pre-alloc fails, and also to 1236 * free old pointers which were recorded while locked. 
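 *
 * The kernelheap..ekernelheap checks below ensure that only
 * kmem_alloc()ed arrays are freed; pointers that were carved out at
 * startup by page_ctrs_alloc() do not come from the kernel heap and
 * are skipped.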
1237 */ 1238 cleanup: 1239 for (r = 1; r < mmu_page_sizes; r++) { 1240 if (ctr_cache[r] != NULL) { 1241 kmem_free(ctr_cache[r], 1242 size_cache[r] * sizeof (hpmctr_t)); 1243 } 1244 for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) { 1245 if (color_cache[r][mrange] != NULL) { 1246 kmem_free(color_cache[r][mrange], 1247 colors_per_szc[r] * sizeof (size_t)); 1248 } 1249 } 1250 for (i = 0; i < NPC_MUTEX; i++) { 1251 pi = cands_cache[i * MMU_PAGE_SIZES + r]; 1252 if (pi == NULL) 1253 continue; 1254 nr = cands_cache_nranges; 1255 for (mrange = 0; mrange < nr; mrange++, pi++) { 1256 pgcntp = pi->pcc_color_free; 1257 if (pgcntp == NULL) 1258 continue; 1259 if ((caddr_t)pgcntp >= kernelheap && 1260 (caddr_t)pgcntp < ekernelheap) { 1261 kmem_free(pgcntp, 1262 colors_per_szc[r] * 1263 sizeof (pgcnt_t)); 1264 } 1265 } 1266 pi = cands_cache[i * MMU_PAGE_SIZES + r]; 1267 if ((caddr_t)pi >= kernelheap && 1268 (caddr_t)pi < ekernelheap) { 1269 kmem_free(pi, nr * sizeof (pcc_info_t)); 1270 } 1271 } 1272 } 1273 1274 kmem_free(cands_cache, 1275 sizeof (pcc_info_t *) * NPC_MUTEX * MMU_PAGE_SIZES); 1276 return (rc); 1277 } 1278 1279 1280 #ifdef DEBUG 1281 1282 /* 1283 * confirm pp is a large page corresponding to szc 1284 */ 1285 void 1286 chk_lpg(page_t *pp, uchar_t szc) 1287 { 1288 spgcnt_t npgs = page_get_pagecnt(pp->p_szc); 1289 uint_t noreloc; 1290 1291 if (npgs == 1) { 1292 ASSERT(pp->p_szc == 0); 1293 ASSERT(pp->p_next == pp); 1294 ASSERT(pp->p_prev == pp); 1295 return; 1296 } 1297 1298 ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL); 1299 ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL); 1300 1301 ASSERT(IS_P2ALIGNED(pp->p_pagenum, npgs)); 1302 ASSERT(pp->p_pagenum == (pp->p_next->p_pagenum - 1)); 1303 ASSERT(pp->p_prev->p_pagenum == (pp->p_pagenum + (npgs - 1))); 1304 ASSERT(pp->p_prev == (pp + (npgs - 1))); 1305 1306 /* 1307 * Check list of pages. 1308 */ 1309 noreloc = PP_ISNORELOC(pp); 1310 while (npgs--) { 1311 if (npgs != 0) { 1312 ASSERT(pp->p_pagenum == pp->p_next->p_pagenum - 1); 1313 ASSERT(pp->p_next == (pp + 1)); 1314 } 1315 ASSERT(pp->p_szc == szc); 1316 ASSERT(PP_ISFREE(pp)); 1317 ASSERT(PP_ISAGED(pp)); 1318 ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL); 1319 ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL); 1320 ASSERT(pp->p_vnode == NULL); 1321 ASSERT(PP_ISNORELOC(pp) == noreloc); 1322 1323 pp = pp->p_next; 1324 } 1325 } 1326 #endif /* DEBUG */ 1327 1328 void 1329 page_freelist_lock(int mnode) 1330 { 1331 int i; 1332 for (i = 0; i < NPC_MUTEX; i++) { 1333 mutex_enter(FPC_MUTEX(mnode, i)); 1334 mutex_enter(CPC_MUTEX(mnode, i)); 1335 } 1336 } 1337 1338 void 1339 page_freelist_unlock(int mnode) 1340 { 1341 int i; 1342 for (i = 0; i < NPC_MUTEX; i++) { 1343 mutex_exit(FPC_MUTEX(mnode, i)); 1344 mutex_exit(CPC_MUTEX(mnode, i)); 1345 } 1346 } 1347 1348 /* 1349 * add pp to the specified page list. Defaults to head of the page list 1350 * unless PG_LIST_TAIL is specified. 1351 */ 1352 void 1353 page_list_add(page_t *pp, int flags) 1354 { 1355 page_t **ppp; 1356 kmutex_t *pcm; 1357 uint_t bin, mtype; 1358 int mnode; 1359 1360 ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT)); 1361 ASSERT(PP_ISFREE(pp)); 1362 ASSERT(!hat_page_is_mapped(pp)); 1363 ASSERT(hat_page_getshare(pp) == 0); 1364 1365 /* 1366 * Large pages should be freed via page_list_add_pages(). 1367 */ 1368 ASSERT(pp->p_szc == 0); 1369 1370 /* 1371 * Don't need to lock the freelist first here 1372 * because the page isn't on the freelist yet. 1373 * This means p_szc can't change on us. 
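 *
 * (p_szc is changed only by page_promote()/page_demote(), which act
 * on pages that are already free and hold the freelist locks; see
 * the comment in page_list_sub().)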
1374 */ 1375 1376 bin = PP_2_BIN(pp); 1377 mnode = PP_2_MEM_NODE(pp); 1378 mtype = PP_2_MTYPE(pp); 1379 1380 if (flags & PG_LIST_ISINIT) { 1381 /* 1382 * PG_LIST_ISINIT is set during system startup (ie. single 1383 * threaded), add a page to the free list and add to the 1384 * the free region counters w/o any locking 1385 */ 1386 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype); 1387 1388 /* inline version of page_add() */ 1389 if (*ppp != NULL) { 1390 pp->p_next = *ppp; 1391 pp->p_prev = (*ppp)->p_prev; 1392 (*ppp)->p_prev = pp; 1393 pp->p_prev->p_next = pp; 1394 } else 1395 *ppp = pp; 1396 1397 page_ctr_add_internal(mnode, mtype, pp, flags); 1398 VM_STAT_ADD(vmm_vmstats.pladd_free[0]); 1399 } else { 1400 pcm = PC_BIN_MUTEX(mnode, bin, flags); 1401 1402 if (flags & PG_FREE_LIST) { 1403 VM_STAT_ADD(vmm_vmstats.pladd_free[0]); 1404 ASSERT(PP_ISAGED(pp)); 1405 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype); 1406 1407 } else { 1408 VM_STAT_ADD(vmm_vmstats.pladd_cache); 1409 ASSERT(pp->p_vnode); 1410 ASSERT((pp->p_offset & PAGEOFFSET) == 0); 1411 ppp = &PAGE_CACHELISTS(mnode, bin, mtype); 1412 } 1413 mutex_enter(pcm); 1414 page_add(ppp, pp); 1415 1416 if (flags & PG_LIST_TAIL) 1417 *ppp = (*ppp)->p_next; 1418 /* 1419 * Add counters before releasing pcm mutex to avoid a race with 1420 * page_freelist_coalesce and page_freelist_split. 1421 */ 1422 page_ctr_add(mnode, mtype, pp, flags); 1423 mutex_exit(pcm); 1424 } 1425 1426 1427 #if defined(__sparc) 1428 if (PP_ISNORELOC(pp)) { 1429 kcage_freemem_add(1); 1430 } 1431 #endif 1432 /* 1433 * It is up to the caller to unlock the page! 1434 */ 1435 ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT)); 1436 } 1437 1438 1439 #ifdef __sparc 1440 /* 1441 * This routine is only used by kcage_init during system startup. 1442 * It performs the function of page_list_sub/PP_SETNORELOC/page_list_add 1443 * without the overhead of taking locks and updating counters. 1444 */ 1445 void 1446 page_list_noreloc_startup(page_t *pp) 1447 { 1448 page_t **ppp; 1449 uint_t bin; 1450 int mnode; 1451 int mtype; 1452 int flags = 0; 1453 1454 /* 1455 * If this is a large page on the freelist then 1456 * break it up into smaller pages. 1457 */ 1458 if (pp->p_szc != 0) 1459 page_boot_demote(pp); 1460 1461 /* 1462 * Get list page is currently on. 1463 */ 1464 bin = PP_2_BIN(pp); 1465 mnode = PP_2_MEM_NODE(pp); 1466 mtype = PP_2_MTYPE(pp); 1467 ASSERT(mtype == MTYPE_RELOC); 1468 ASSERT(pp->p_szc == 0); 1469 1470 if (PP_ISAGED(pp)) { 1471 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype); 1472 flags |= PG_FREE_LIST; 1473 } else { 1474 ppp = &PAGE_CACHELISTS(mnode, bin, mtype); 1475 flags |= PG_CACHE_LIST; 1476 } 1477 1478 ASSERT(*ppp != NULL); 1479 1480 /* 1481 * Delete page from current list. 1482 */ 1483 if (*ppp == pp) 1484 *ppp = pp->p_next; /* go to next page */ 1485 if (*ppp == pp) { 1486 *ppp = NULL; /* page list is gone */ 1487 } else { 1488 pp->p_prev->p_next = pp->p_next; 1489 pp->p_next->p_prev = pp->p_prev; 1490 } 1491 1492 /* 1493 * Decrement page counters 1494 */ 1495 page_ctr_sub_internal(mnode, mtype, pp, flags); 1496 1497 /* 1498 * Set no reloc for cage initted pages. 1499 */ 1500 PP_SETNORELOC(pp); 1501 1502 mtype = PP_2_MTYPE(pp); 1503 ASSERT(mtype == MTYPE_NORELOC); 1504 1505 /* 1506 * Get new list for page. 1507 */ 1508 if (PP_ISAGED(pp)) { 1509 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype); 1510 } else { 1511 ppp = &PAGE_CACHELISTS(mnode, bin, mtype); 1512 } 1513 1514 /* 1515 * Insert page on new list. 
1516 */ 1517 if (*ppp == NULL) { 1518 *ppp = pp; 1519 pp->p_next = pp->p_prev = pp; 1520 } else { 1521 pp->p_next = *ppp; 1522 pp->p_prev = (*ppp)->p_prev; 1523 (*ppp)->p_prev = pp; 1524 pp->p_prev->p_next = pp; 1525 } 1526 1527 /* 1528 * Increment page counters 1529 */ 1530 page_ctr_add_internal(mnode, mtype, pp, flags); 1531 1532 /* 1533 * Update cage freemem counter 1534 */ 1535 atomic_add_long(&kcage_freemem, 1); 1536 } 1537 #else /* __sparc */ 1538 1539 /* ARGSUSED */ 1540 void 1541 page_list_noreloc_startup(page_t *pp) 1542 { 1543 panic("page_list_noreloc_startup: should be here only for sparc"); 1544 } 1545 #endif 1546 1547 void 1548 page_list_add_pages(page_t *pp, int flags) 1549 { 1550 kmutex_t *pcm; 1551 pgcnt_t pgcnt; 1552 uint_t bin, mtype, i; 1553 int mnode; 1554 1555 /* default to freelist/head */ 1556 ASSERT((flags & (PG_CACHE_LIST | PG_LIST_TAIL)) == 0); 1557 1558 CHK_LPG(pp, pp->p_szc); 1559 VM_STAT_ADD(vmm_vmstats.pladd_free[pp->p_szc]); 1560 1561 bin = PP_2_BIN(pp); 1562 mnode = PP_2_MEM_NODE(pp); 1563 mtype = PP_2_MTYPE(pp); 1564 1565 if (flags & PG_LIST_ISINIT) { 1566 ASSERT(pp->p_szc == mmu_page_sizes - 1); 1567 page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp); 1568 ASSERT(!PP_ISNORELOC(pp)); 1569 PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags); 1570 } else { 1571 1572 ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes); 1573 1574 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST); 1575 1576 mutex_enter(pcm); 1577 page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp); 1578 page_ctr_add(mnode, mtype, pp, PG_FREE_LIST); 1579 mutex_exit(pcm); 1580 1581 pgcnt = page_get_pagecnt(pp->p_szc); 1582 #if defined(__sparc) 1583 if (PP_ISNORELOC(pp)) 1584 kcage_freemem_add(pgcnt); 1585 #endif 1586 for (i = 0; i < pgcnt; i++, pp++) 1587 page_unlock_nocapture(pp); 1588 } 1589 } 1590 1591 /* 1592 * During boot, need to demote a large page to base 1593 * pagesize pages for seg_kmem for use in boot_alloc() 1594 */ 1595 void 1596 page_boot_demote(page_t *pp) 1597 { 1598 ASSERT(pp->p_szc != 0); 1599 ASSERT(PP_ISFREE(pp)); 1600 ASSERT(PP_ISAGED(pp)); 1601 1602 (void) page_demote(PP_2_MEM_NODE(pp), 1603 PFN_BASE(pp->p_pagenum, pp->p_szc), 0, pp->p_szc, 0, PC_NO_COLOR, 1604 PC_FREE); 1605 1606 ASSERT(PP_ISFREE(pp)); 1607 ASSERT(PP_ISAGED(pp)); 1608 ASSERT(pp->p_szc == 0); 1609 } 1610 1611 /* 1612 * Take a particular page off of whatever freelist the page 1613 * is claimed to be on. 1614 * 1615 * NOTE: Only used for PAGESIZE pages. 1616 */ 1617 void 1618 page_list_sub(page_t *pp, int flags) 1619 { 1620 int bin; 1621 uint_t mtype; 1622 int mnode; 1623 kmutex_t *pcm; 1624 page_t **ppp; 1625 1626 ASSERT(PAGE_EXCL(pp)); 1627 ASSERT(PP_ISFREE(pp)); 1628 1629 /* 1630 * The p_szc field can only be changed by page_promote() 1631 * and page_demote(). Only free pages can be promoted and 1632 * demoted and the free list MUST be locked during these 1633 * operations. So to prevent a race in page_list_sub() 1634 * between computing which bin of the freelist lock to 1635 * grab and actually grabing the lock we check again that 1636 * the bin we locked is still the correct one. Notice that 1637 * the p_szc field could have actually changed on us but 1638 * if the bin happens to still be the same we are safe. 
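 *
 * For example, between computing PP_2_BIN(pp) and acquiring pcm,
 * another thread holding the freelist locks could promote this free
 * page into (or demote it out of) a large page, changing p_szc and
 * possibly the bin; re-checking PP_2_BIN(pp) under pcm catches that
 * case and we simply retry.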
1639 */ 1640 try_again: 1641 bin = PP_2_BIN(pp); 1642 mnode = PP_2_MEM_NODE(pp); 1643 pcm = PC_BIN_MUTEX(mnode, bin, flags); 1644 mutex_enter(pcm); 1645 if (PP_2_BIN(pp) != bin) { 1646 mutex_exit(pcm); 1647 goto try_again; 1648 } 1649 mtype = PP_2_MTYPE(pp); 1650 1651 if (flags & PG_FREE_LIST) { 1652 VM_STAT_ADD(vmm_vmstats.plsub_free[0]); 1653 ASSERT(PP_ISAGED(pp)); 1654 ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype); 1655 } else { 1656 VM_STAT_ADD(vmm_vmstats.plsub_cache); 1657 ASSERT(!PP_ISAGED(pp)); 1658 ppp = &PAGE_CACHELISTS(mnode, bin, mtype); 1659 } 1660 1661 /* 1662 * Common PAGESIZE case. 1663 * 1664 * Note that we locked the freelist. This prevents 1665 * any page promotion/demotion operations. Therefore 1666 * the p_szc will not change until we drop pcm mutex. 1667 */ 1668 if (pp->p_szc == 0) { 1669 page_sub(ppp, pp); 1670 /* 1671 * Subtract counters before releasing pcm mutex 1672 * to avoid race with page_freelist_coalesce. 1673 */ 1674 page_ctr_sub(mnode, mtype, pp, flags); 1675 mutex_exit(pcm); 1676 1677 #if defined(__sparc) 1678 if (PP_ISNORELOC(pp)) { 1679 kcage_freemem_sub(1); 1680 } 1681 #endif 1682 return; 1683 } 1684 1685 /* 1686 * Large pages on the cache list are not supported. 1687 */ 1688 if (flags & PG_CACHE_LIST) 1689 panic("page_list_sub: large page on cachelist"); 1690 1691 /* 1692 * Slow but rare. 1693 * 1694 * Somebody wants this particular page which is part 1695 * of a large page. In this case we just demote the page 1696 * if it's on the freelist. 1697 * 1698 * We have to drop pcm before locking the entire freelist. 1699 * Once we have re-locked the freelist check to make sure 1700 * the page hasn't already been demoted or completely 1701 * freed. 1702 */ 1703 mutex_exit(pcm); 1704 page_freelist_lock(mnode); 1705 if (pp->p_szc != 0) { 1706 /* 1707 * Large page is on freelist. 1708 */ 1709 (void) page_demote(mnode, PFN_BASE(pp->p_pagenum, pp->p_szc), 1710 0, pp->p_szc, 0, PC_NO_COLOR, PC_FREE); 1711 } 1712 ASSERT(PP_ISFREE(pp)); 1713 ASSERT(PP_ISAGED(pp)); 1714 ASSERT(pp->p_szc == 0); 1715 1716 /* 1717 * Subtract counters before releasing pcm mutex 1718 * to avoid race with page_freelist_coalesce. 1719 */ 1720 bin = PP_2_BIN(pp); 1721 mtype = PP_2_MTYPE(pp); 1722 ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype); 1723 1724 page_sub(ppp, pp); 1725 page_ctr_sub(mnode, mtype, pp, flags); 1726 page_freelist_unlock(mnode); 1727 1728 #if defined(__sparc) 1729 if (PP_ISNORELOC(pp)) { 1730 kcage_freemem_sub(1); 1731 } 1732 #endif 1733 } 1734 1735 void 1736 page_list_sub_pages(page_t *pp, uint_t szc) 1737 { 1738 kmutex_t *pcm; 1739 uint_t bin, mtype; 1740 int mnode; 1741 1742 ASSERT(PAGE_EXCL(pp)); 1743 ASSERT(PP_ISFREE(pp)); 1744 ASSERT(PP_ISAGED(pp)); 1745 1746 /* 1747 * See comment in page_list_sub(). 1748 */ 1749 try_again: 1750 bin = PP_2_BIN(pp); 1751 mnode = PP_2_MEM_NODE(pp); 1752 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST); 1753 mutex_enter(pcm); 1754 if (PP_2_BIN(pp) != bin) { 1755 mutex_exit(pcm); 1756 goto try_again; 1757 } 1758 1759 /* 1760 * If we're called with a page larger than szc or it got 1761 * promoted above szc before we locked the freelist then 1762 * drop pcm and re-lock entire freelist. If page still larger 1763 * than szc then demote it. 
1764 */ 1765 if (pp->p_szc > szc) { 1766 mutex_exit(pcm); 1767 pcm = NULL; 1768 page_freelist_lock(mnode); 1769 if (pp->p_szc > szc) { 1770 VM_STAT_ADD(vmm_vmstats.plsubpages_szcbig); 1771 (void) page_demote(mnode, 1772 PFN_BASE(pp->p_pagenum, pp->p_szc), 0, 1773 pp->p_szc, szc, PC_NO_COLOR, PC_FREE); 1774 } 1775 bin = PP_2_BIN(pp); 1776 } 1777 ASSERT(PP_ISFREE(pp)); 1778 ASSERT(PP_ISAGED(pp)); 1779 ASSERT(pp->p_szc <= szc); 1780 ASSERT(pp == PP_PAGEROOT(pp)); 1781 1782 VM_STAT_ADD(vmm_vmstats.plsub_free[pp->p_szc]); 1783 1784 mtype = PP_2_MTYPE(pp); 1785 if (pp->p_szc != 0) { 1786 page_vpsub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp); 1787 CHK_LPG(pp, pp->p_szc); 1788 } else { 1789 VM_STAT_ADD(vmm_vmstats.plsubpages_szc0); 1790 page_sub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp); 1791 } 1792 page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST); 1793 1794 if (pcm != NULL) { 1795 mutex_exit(pcm); 1796 } else { 1797 page_freelist_unlock(mnode); 1798 } 1799 1800 #if defined(__sparc) 1801 if (PP_ISNORELOC(pp)) { 1802 pgcnt_t pgcnt; 1803 1804 pgcnt = page_get_pagecnt(pp->p_szc); 1805 kcage_freemem_sub(pgcnt); 1806 } 1807 #endif 1808 } 1809 1810 /* 1811 * Add the page to the front of a linked list of pages 1812 * using the p_next & p_prev pointers for the list. 1813 * The caller is responsible for protecting the list pointers. 1814 */ 1815 void 1816 mach_page_add(page_t **ppp, page_t *pp) 1817 { 1818 if (*ppp == NULL) { 1819 pp->p_next = pp->p_prev = pp; 1820 } else { 1821 pp->p_next = *ppp; 1822 pp->p_prev = (*ppp)->p_prev; 1823 (*ppp)->p_prev = pp; 1824 pp->p_prev->p_next = pp; 1825 } 1826 *ppp = pp; 1827 } 1828 1829 /* 1830 * Remove this page from a linked list of pages 1831 * using the p_next & p_prev pointers for the list. 1832 * 1833 * The caller is responsible for protecting the list pointers. 1834 */ 1835 void 1836 mach_page_sub(page_t **ppp, page_t *pp) 1837 { 1838 ASSERT(PP_ISFREE(pp)); 1839 1840 if (*ppp == NULL || pp == NULL) 1841 panic("mach_page_sub"); 1842 1843 if (*ppp == pp) 1844 *ppp = pp->p_next; /* go to next page */ 1845 1846 if (*ppp == pp) 1847 *ppp = NULL; /* page list is gone */ 1848 else { 1849 pp->p_prev->p_next = pp->p_next; 1850 pp->p_next->p_prev = pp->p_prev; 1851 } 1852 pp->p_prev = pp->p_next = pp; /* make pp a list of one */ 1853 } 1854 1855 /* 1856 * Routine fsflush uses to gradually coalesce the free list into larger pages. 1857 */ 1858 void 1859 page_promote_size(page_t *pp, uint_t cur_szc) 1860 { 1861 pfn_t pfn; 1862 int mnode; 1863 int idx; 1864 int new_szc = cur_szc + 1; 1865 int full = FULL_REGION_CNT(new_szc); 1866 1867 pfn = page_pptonum(pp); 1868 mnode = PFN_2_MEM_NODE(pfn); 1869 1870 page_freelist_lock(mnode); 1871 1872 idx = PNUM_TO_IDX(mnode, new_szc, pfn); 1873 if (PAGE_COUNTERS(mnode, new_szc, idx) == full) 1874 (void) page_promote(mnode, pfn, new_szc, PC_FREE, PC_MTYPE_ANY); 1875 1876 page_freelist_unlock(mnode); 1877 } 1878 1879 static uint_t page_promote_err; 1880 static uint_t page_promote_noreloc_err; 1881 1882 /* 1883 * Create a single larger page (of szc new_szc) from smaller contiguous pages 1884 * for the given mnode starting at pfnum. Pages involved are on the freelist 1885 * before the call and may be returned to the caller if requested, otherwise 1886 * they will be placed back on the freelist. 1887 * If flags is PC_ALLOC, then the large page will be returned to the user in 1888 * a state which is consistent with a page being taken off the freelist. 
If 1889 * we failed to lock the new large page, then we will return NULL to the 1890 * caller and put the large page on the freelist instead. 1891 * If flags is PC_FREE, then the large page will be placed on the freelist, 1892 * and NULL will be returned. 1893 * The caller is responsible for locking the freelist as well as any other 1894 * accounting which needs to be done for a returned page. 1895 * 1896 * RFE: For performance pass in pp instead of pfnum so 1897 * we can avoid excessive calls to page_numtopp_nolock(). 1898 * This would depend on an assumption that all contiguous 1899 * pages are in the same memseg so we can just add/dec 1900 * our pp. 1901 * 1902 * Lock ordering: 1903 * 1904 * There is a potential but rare deadlock situation 1905 * for page promotion and demotion operations. The problem 1906 * is there are two paths into the freelist manager and 1907 * they have different lock orders: 1908 * 1909 * page_create() 1910 * lock freelist 1911 * page_lock(EXCL) 1912 * unlock freelist 1913 * return 1914 * caller drops page_lock 1915 * 1916 * page_free() and page_reclaim() 1917 * caller grabs page_lock(EXCL) 1918 * 1919 * lock freelist 1920 * unlock freelist 1921 * drop page_lock 1922 * 1923 * What prevents a thread in page_create() from deadlocking 1924 * with a thread freeing or reclaiming the same page is the 1925 * page_trylock() in page_get_freelist(). If the trylock fails 1926 * it skips the page. 1927 * 1928 * The lock ordering for promotion and demotion is the same as 1929 * for page_create(). Since the same deadlock could occur during 1930 * page promotion and freeing or reclaiming of a page on the 1931 * cache list we might have to fail the operation and undo what 1932 * have done so far. Again this is rare. 1933 */ 1934 page_t * 1935 page_promote(int mnode, pfn_t pfnum, uchar_t new_szc, int flags, int mtype) 1936 { 1937 page_t *pp, *pplist, *tpp, *start_pp; 1938 pgcnt_t new_npgs, npgs; 1939 uint_t bin; 1940 pgcnt_t tmpnpgs, pages_left; 1941 uint_t noreloc; 1942 int which_list; 1943 ulong_t index; 1944 kmutex_t *phm; 1945 1946 /* 1947 * General algorithm: 1948 * Find the starting page 1949 * Walk each page struct removing it from the freelist, 1950 * and linking it to all the other pages removed. 1951 * Once all pages are off the freelist, 1952 * walk the list, modifying p_szc to new_szc and what 1953 * ever other info needs to be done to create a large free page. 1954 * According to the flags, either return the page or put it 1955 * on the freelist. 1956 */ 1957 1958 start_pp = page_numtopp_nolock(pfnum); 1959 ASSERT(start_pp && (start_pp->p_pagenum == pfnum)); 1960 new_npgs = page_get_pagecnt(new_szc); 1961 ASSERT(IS_P2ALIGNED(pfnum, new_npgs)); 1962 1963 /* don't return page of the wrong mtype */ 1964 if (mtype != PC_MTYPE_ANY && mtype != PP_2_MTYPE(start_pp)) 1965 return (NULL); 1966 1967 /* 1968 * Loop through smaller pages to confirm that all pages 1969 * give the same result for PP_ISNORELOC(). 1970 * We can check this reliably here as the protocol for setting 1971 * P_NORELOC requires pages to be taken off the free list first. 1972 */ 1973 noreloc = PP_ISNORELOC(start_pp); 1974 for (pp = start_pp + new_npgs; --pp > start_pp; ) { 1975 if (noreloc != PP_ISNORELOC(pp)) { 1976 page_promote_noreloc_err++; 1977 page_promote_err++; 1978 return (NULL); 1979 } 1980 } 1981 1982 pages_left = new_npgs; 1983 pplist = NULL; 1984 pp = start_pp; 1985 1986 /* Loop around coalescing the smaller pages into a big page. 
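 * Each pass unlinks one free constituent (a PAGESIZE page from the
 * free or cache list, or an already-free smaller large page from the
 * free list), retags its pages with new_szc and concatenates them
 * onto pplist, until pages_left reaches zero.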
*/ 1987 while (pages_left) { 1988 /* 1989 * Remove from the freelist. 1990 */ 1991 ASSERT(PP_ISFREE(pp)); 1992 bin = PP_2_BIN(pp); 1993 ASSERT(mnode == PP_2_MEM_NODE(pp)); 1994 mtype = PP_2_MTYPE(pp); 1995 if (PP_ISAGED(pp)) { 1996 1997 /* 1998 * PG_FREE_LIST 1999 */ 2000 if (pp->p_szc) { 2001 page_vpsub(&PAGE_FREELISTS(mnode, 2002 pp->p_szc, bin, mtype), pp); 2003 } else { 2004 mach_page_sub(&PAGE_FREELISTS(mnode, 0, 2005 bin, mtype), pp); 2006 } 2007 which_list = PG_FREE_LIST; 2008 } else { 2009 ASSERT(pp->p_szc == 0); 2010 2011 /* 2012 * PG_CACHE_LIST 2013 * 2014 * Since this page comes from the 2015 * cachelist, we must destroy the 2016 * vnode association. 2017 */ 2018 if (!page_trylock(pp, SE_EXCL)) { 2019 goto fail_promote; 2020 } 2021 2022 /* 2023 * We need to be careful not to deadlock 2024 * with another thread in page_lookup(). 2025 * The page_lookup() thread could be holding 2026 * the same phm that we need if the two 2027 * pages happen to hash to the same phm lock. 2028 * At this point we have locked the entire 2029 * freelist and page_lookup() could be trying 2030 * to grab a freelist lock. 2031 */ 2032 index = PAGE_HASH_FUNC(pp->p_vnode, pp->p_offset); 2033 phm = PAGE_HASH_MUTEX(index); 2034 if (!mutex_tryenter(phm)) { 2035 page_unlock_nocapture(pp); 2036 goto fail_promote; 2037 } 2038 2039 mach_page_sub(&PAGE_CACHELISTS(mnode, bin, mtype), pp); 2040 page_hashout(pp, phm); 2041 mutex_exit(phm); 2042 PP_SETAGED(pp); 2043 page_unlock_nocapture(pp); 2044 which_list = PG_CACHE_LIST; 2045 } 2046 page_ctr_sub(mnode, mtype, pp, which_list); 2047 2048 /* 2049 * Concatenate the smaller page(s) onto 2050 * the large page list. 2051 */ 2052 tmpnpgs = npgs = page_get_pagecnt(pp->p_szc); 2053 pages_left -= npgs; 2054 tpp = pp; 2055 while (npgs--) { 2056 tpp->p_szc = new_szc; 2057 tpp = tpp->p_next; 2058 } 2059 page_list_concat(&pplist, &pp); 2060 pp += tmpnpgs; 2061 } 2062 CHK_LPG(pplist, new_szc); 2063 2064 /* 2065 * return the page to the user if requested 2066 * in the properly locked state. 2067 */ 2068 if (flags == PC_ALLOC && (page_trylock_cons(pplist, SE_EXCL))) { 2069 return (pplist); 2070 } 2071 2072 /* 2073 * Otherwise place the new large page on the freelist 2074 */ 2075 bin = PP_2_BIN(pplist); 2076 mnode = PP_2_MEM_NODE(pplist); 2077 mtype = PP_2_MTYPE(pplist); 2078 page_vpadd(&PAGE_FREELISTS(mnode, new_szc, bin, mtype), pplist); 2079 2080 page_ctr_add(mnode, mtype, pplist, PG_FREE_LIST); 2081 return (NULL); 2082 2083 fail_promote: 2084 /* 2085 * A thread must have still been freeing or 2086 * reclaiming the page on the cachelist. 2087 * To prevent a deadlock undo what we have 2088 * done sofar and return failure. This 2089 * situation can only happen while promoting 2090 * PAGESIZE pages. 2091 */ 2092 page_promote_err++; 2093 while (pplist) { 2094 pp = pplist; 2095 mach_page_sub(&pplist, pp); 2096 pp->p_szc = 0; 2097 bin = PP_2_BIN(pp); 2098 mtype = PP_2_MTYPE(pp); 2099 mach_page_add(&PAGE_FREELISTS(mnode, 0, bin, mtype), pp); 2100 page_ctr_add(mnode, mtype, pp, PG_FREE_LIST); 2101 } 2102 return (NULL); 2103 2104 } 2105 2106 /* 2107 * Break up a large page into smaller size pages. 2108 * Pages involved are on the freelist before the call and may 2109 * be returned to the caller if requested, otherwise they will 2110 * be placed back on the freelist. 2111 * The caller is responsible for locking the freelist as well as any other 2112 * accounting which needs to be done for a returned page. 
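 * Illustration (counts are examples only, they vary with platform page
 * sizes): if cur_szc spans 512 PAGESIZE pages and new_szc spans 8, the loop
 * below produces 64 new_szc pages; with PC_ALLOC at most one of them (the
 * one matching 'color', and 'pfnmax' when given) is returned locked, the
 * rest are placed back on the freelist.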
2113 * If flags is not PC_ALLOC, the color argument is ignored, and thus 2114 * technically, any value may be passed in but PC_NO_COLOR is the standard 2115 * which should be followed for clarity's sake. 2116 * Returns a page whose pfn is < pfnmax 2117 */ 2118 page_t * 2119 page_demote(int mnode, pfn_t pfnum, pfn_t pfnmax, uchar_t cur_szc, 2120 uchar_t new_szc, int color, int flags) 2121 { 2122 page_t *pp, *pplist, *npplist; 2123 pgcnt_t npgs, n; 2124 uint_t bin; 2125 uint_t mtype; 2126 page_t *ret_pp = NULL; 2127 2128 ASSERT(cur_szc != 0); 2129 ASSERT(new_szc < cur_szc); 2130 2131 pplist = page_numtopp_nolock(pfnum); 2132 ASSERT(pplist != NULL); 2133 2134 ASSERT(pplist->p_szc == cur_szc); 2135 2136 bin = PP_2_BIN(pplist); 2137 ASSERT(mnode == PP_2_MEM_NODE(pplist)); 2138 mtype = PP_2_MTYPE(pplist); 2139 page_vpsub(&PAGE_FREELISTS(mnode, cur_szc, bin, mtype), pplist); 2140 2141 CHK_LPG(pplist, cur_szc); 2142 page_ctr_sub(mnode, mtype, pplist, PG_FREE_LIST); 2143 2144 /* 2145 * Number of PAGESIZE pages for smaller new_szc 2146 * page. 2147 */ 2148 npgs = page_get_pagecnt(new_szc); 2149 2150 while (pplist) { 2151 pp = pplist; 2152 2153 ASSERT(pp->p_szc == cur_szc); 2154 2155 /* 2156 * We either break it up into PAGESIZE pages or larger. 2157 */ 2158 if (npgs == 1) { /* PAGESIZE case */ 2159 mach_page_sub(&pplist, pp); 2160 ASSERT(pp->p_szc == cur_szc); 2161 ASSERT(new_szc == 0); 2162 ASSERT(mnode == PP_2_MEM_NODE(pp)); 2163 pp->p_szc = new_szc; 2164 bin = PP_2_BIN(pp); 2165 if ((bin == color) && (flags == PC_ALLOC) && 2166 (ret_pp == NULL) && (pfnmax == 0 || 2167 pp->p_pagenum < pfnmax) && 2168 page_trylock_cons(pp, SE_EXCL)) { 2169 ret_pp = pp; 2170 } else { 2171 mtype = PP_2_MTYPE(pp); 2172 mach_page_add(&PAGE_FREELISTS(mnode, 0, bin, 2173 mtype), pp); 2174 page_ctr_add(mnode, mtype, pp, PG_FREE_LIST); 2175 } 2176 } else { 2177 page_t *try_to_return_this_page = NULL; 2178 int count = 0; 2179 2180 /* 2181 * Break down into smaller lists of pages. 2182 */ 2183 page_list_break(&pplist, &npplist, npgs); 2184 2185 pp = pplist; 2186 n = npgs; 2187 while (n--) { 2188 ASSERT(pp->p_szc == cur_szc); 2189 /* 2190 * Check whether all the pages in this list 2191 * fit the request criteria. 2192 */ 2193 if (pfnmax == 0 || pp->p_pagenum < pfnmax) { 2194 count++; 2195 } 2196 pp->p_szc = new_szc; 2197 pp = pp->p_next; 2198 } 2199 2200 if (count == npgs && 2201 (pfnmax == 0 || pp->p_pagenum < pfnmax)) { 2202 try_to_return_this_page = pp; 2203 } 2204 2205 CHK_LPG(pplist, new_szc); 2206 2207 bin = PP_2_BIN(pplist); 2208 if (try_to_return_this_page) 2209 ASSERT(mnode == 2210 PP_2_MEM_NODE(try_to_return_this_page)); 2211 if ((bin == color) && (flags == PC_ALLOC) && 2212 (ret_pp == NULL) && try_to_return_this_page && 2213 page_trylock_cons(try_to_return_this_page, 2214 SE_EXCL)) { 2215 ret_pp = try_to_return_this_page; 2216 } else { 2217 mtype = PP_2_MTYPE(pp); 2218 page_vpadd(&PAGE_FREELISTS(mnode, new_szc, 2219 bin, mtype), pplist); 2220 2221 page_ctr_add(mnode, mtype, pplist, 2222 PG_FREE_LIST); 2223 } 2224 pplist = npplist; 2225 } 2226 } 2227 return (ret_pp); 2228 } 2229 2230 int mpss_coalesce_disable = 0; 2231 2232 /* 2233 * Coalesce free pages into a page of the given szc and color if possible. 2234 * Return the pointer to the page created, otherwise, return NULL. 2235 * 2236 * If pfnhi is non-zero, search for large page with pfn range less than pfnhi. 
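 * The 'color' argument is masked with 'ceq_mask', so only the color bits the
 * caller marked as significant have to match; e.g. (values are illustrative)
 * with ceq_mask 0x1c and color 0x0b, any candidate whose color c satisfies
 * (c & 0x1c) == 0x08 is acceptable.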
2237 */ 2238 page_t * 2239 page_freelist_coalesce(int mnode, uchar_t szc, uint_t color, uint_t ceq_mask, 2240 int mtype, pfn_t pfnhi) 2241 { 2242 int r = szc; /* region size */ 2243 int mrange; 2244 uint_t full, bin, color_mask, wrap = 0; 2245 pfn_t pfnum, lo, hi; 2246 size_t len, idx, idx0; 2247 pgcnt_t cands = 0, szcpgcnt = page_get_pagecnt(szc); 2248 page_t *ret_pp; 2249 MEM_NODE_ITERATOR_DECL(it); 2250 #if defined(__sparc) 2251 pfn_t pfnum0, nlo, nhi; 2252 #endif 2253 2254 if (mpss_coalesce_disable) { 2255 ASSERT(szc < MMU_PAGE_SIZES); 2256 VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[szc][0]); 2257 return (NULL); 2258 } 2259 2260 ASSERT(szc < mmu_page_sizes); 2261 color_mask = PAGE_GET_PAGECOLORS(szc) - 1; 2262 ASSERT(ceq_mask <= color_mask); 2263 ASSERT(color <= color_mask); 2264 color &= ceq_mask; 2265 2266 /* Prevent page_counters dynamic memory from being freed */ 2267 rw_enter(&page_ctrs_rwlock[mnode], RW_READER); 2268 2269 mrange = MTYPE_2_MRANGE(mnode, mtype); 2270 ASSERT(mrange < mnode_nranges[mnode]); 2271 VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[r][mrange]); 2272 2273 /* get pfn range for mtype */ 2274 len = PAGE_COUNTERS_ENTRIES(mnode, r); 2275 MNODETYPE_2_PFN(mnode, mtype, lo, hi); 2276 hi++; 2277 2278 /* use lower limit if given */ 2279 if (pfnhi != PFNNULL && pfnhi < hi) 2280 hi = pfnhi; 2281 2282 /* round to szcpgcnt boundaries */ 2283 lo = P2ROUNDUP(lo, szcpgcnt); 2284 MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it); 2285 if (lo == (pfn_t)-1) { 2286 rw_exit(&page_ctrs_rwlock[mnode]); 2287 return (NULL); 2288 } 2289 hi = hi & ~(szcpgcnt - 1); 2290 2291 /* set lo to the closest pfn of the right color */ 2292 if (((PFN_2_COLOR(lo, szc, &it) ^ color) & ceq_mask) || 2293 (interleaved_mnodes && PFN_2_MEM_NODE(lo) != mnode)) { 2294 PAGE_NEXT_PFN_FOR_COLOR(lo, szc, color, ceq_mask, color_mask, 2295 &it); 2296 } 2297 2298 if (hi <= lo) { 2299 rw_exit(&page_ctrs_rwlock[mnode]); 2300 return (NULL); 2301 } 2302 2303 full = FULL_REGION_CNT(r); 2304 2305 /* calculate the number of page candidates and initial search index */ 2306 bin = color; 2307 idx0 = (size_t)(-1); 2308 do { 2309 pgcnt_t acand; 2310 2311 PGCTRS_CANDS_GETVALUECOLOR(mnode, mrange, r, bin, acand); 2312 if (acand) { 2313 idx = PAGE_COUNTERS_CURRENT_COLOR(mnode, 2314 r, bin, mrange); 2315 idx0 = MIN(idx0, idx); 2316 cands += acand; 2317 } 2318 bin = ADD_MASKED(bin, 1, ceq_mask, color_mask); 2319 } while (bin != color); 2320 2321 if (cands == 0) { 2322 VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip[r][mrange]); 2323 rw_exit(&page_ctrs_rwlock[mnode]); 2324 return (NULL); 2325 } 2326 2327 pfnum = IDX_TO_PNUM(mnode, r, idx0); 2328 if (pfnum < lo || pfnum >= hi) { 2329 pfnum = lo; 2330 } else { 2331 MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it); 2332 if (pfnum == (pfn_t)-1) { 2333 pfnum = lo; 2334 MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it); 2335 ASSERT(pfnum != (pfn_t)-1); 2336 } else if ((PFN_2_COLOR(pfnum, szc, &it) ^ color) & ceq_mask || 2337 (interleaved_mnodes && PFN_2_MEM_NODE(pfnum) != mnode)) { 2338 /* invalid color, get the closest correct pfn */ 2339 PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask, 2340 color_mask, &it); 2341 if (pfnum >= hi) { 2342 pfnum = lo; 2343 MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it); 2344 } 2345 } 2346 } 2347 2348 /* set starting index */ 2349 idx0 = PNUM_TO_IDX(mnode, r, pfnum); 2350 ASSERT(idx0 < len); 2351 2352 #if defined(__sparc) 2353 pfnum0 = pfnum; /* page corresponding to idx0 */ 2354 nhi = 0; /* search kcage ranges */ 2355 #endif 2356 2357 for (idx = idx0; wrap == 0 || 
(idx < idx0 && wrap < 2); ) { 2358 2359 #if defined(__sparc) 2360 /* 2361 * Find lowest intersection of kcage ranges and mnode. 2362 * MTYPE_NORELOC means look in the cage, otherwise outside. 2363 */ 2364 if (nhi <= pfnum) { 2365 if (kcage_next_range(mtype == MTYPE_NORELOC, pfnum, 2366 (wrap == 0 ? hi : pfnum0), &nlo, &nhi)) 2367 goto wrapit; 2368 2369 /* jump to the next page in the range */ 2370 if (pfnum < nlo) { 2371 pfnum = P2ROUNDUP(nlo, szcpgcnt); 2372 MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it); 2373 idx = PNUM_TO_IDX(mnode, r, pfnum); 2374 if (idx >= len || pfnum >= hi) 2375 goto wrapit; 2376 if ((PFN_2_COLOR(pfnum, szc, &it) ^ color) & 2377 ceq_mask) 2378 goto next; 2379 if (interleaved_mnodes && 2380 PFN_2_MEM_NODE(pfnum) != mnode) 2381 goto next; 2382 } 2383 } 2384 #endif 2385 2386 if (PAGE_COUNTERS(mnode, r, idx) != full) 2387 goto next; 2388 2389 /* 2390 * RFE: For performance maybe we can do something less 2391 * brutal than locking the entire freelist. So far 2392 * this doesn't seem to be a performance problem? 2393 */ 2394 page_freelist_lock(mnode); 2395 if (PAGE_COUNTERS(mnode, r, idx) == full) { 2396 ret_pp = 2397 page_promote(mnode, pfnum, r, PC_ALLOC, mtype); 2398 if (ret_pp != NULL) { 2399 VM_STAT_ADD(vmm_vmstats.pfc_coalok[r][mrange]); 2400 PAGE_COUNTERS_CURRENT_COLOR(mnode, r, 2401 PFN_2_COLOR(pfnum, szc, &it), mrange) = idx; 2402 page_freelist_unlock(mnode); 2403 rw_exit(&page_ctrs_rwlock[mnode]); 2404 #if defined(__sparc) 2405 if (PP_ISNORELOC(ret_pp)) { 2406 pgcnt_t npgs; 2407 2408 npgs = page_get_pagecnt(ret_pp->p_szc); 2409 kcage_freemem_sub(npgs); 2410 } 2411 #endif 2412 return (ret_pp); 2413 } 2414 } else { 2415 VM_STAT_ADD(vmm_vmstats.page_ctrs_changed[r][mrange]); 2416 } 2417 2418 page_freelist_unlock(mnode); 2419 /* 2420 * No point looking for another page if we've 2421 * already tried all of the ones that 2422 * page_ctr_cands indicated. Stash off where we left 2423 * off. 2424 * Note: this is not exact since we don't hold the 2425 * page_freelist_locks before we initially get the 2426 * value of cands for performance reasons, but should 2427 * be a decent approximation. 2428 */ 2429 if (--cands == 0) { 2430 PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color, mrange) = 2431 idx; 2432 break; 2433 } 2434 next: 2435 PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask, 2436 color_mask, &it); 2437 idx = PNUM_TO_IDX(mnode, r, pfnum); 2438 if (idx >= len || pfnum >= hi) { 2439 wrapit: 2440 pfnum = lo; 2441 MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it); 2442 idx = PNUM_TO_IDX(mnode, r, pfnum); 2443 wrap++; 2444 #if defined(__sparc) 2445 nhi = 0; /* search kcage ranges */ 2446 #endif 2447 } 2448 } 2449 2450 rw_exit(&page_ctrs_rwlock[mnode]); 2451 VM_STAT_ADD(vmm_vmstats.page_ctrs_failed[r][mrange]); 2452 return (NULL); 2453 } 2454 2455 /* 2456 * For the given mnode, promote as many small pages to large pages as possible. 2457 * mnode can be -1, which means do them all 2458 */ 2459 void 2460 page_freelist_coalesce_all(int mnode) 2461 { 2462 int r; /* region size */ 2463 int idx, full; 2464 size_t len; 2465 int doall = interleaved_mnodes || mnode < 0; 2466 int mlo = doall ? 0 : mnode; 2467 int mhi = doall ? max_mem_nodes : (mnode + 1); 2468 2469 VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce_all); 2470 2471 if (mpss_coalesce_disable) { 2472 return; 2473 } 2474 2475 /* 2476 * Lock the entire freelist and coalesce what we can. 2477 * 2478 * Always promote to the largest page possible 2479 * first to reduce the number of page promotions. 
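 * (Starting from the largest region size means a region that is already
 * completely free is promoted in one call instead of being promoted once
 * per intermediate size.)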
2480 */ 2481 for (mnode = mlo; mnode < mhi; mnode++) { 2482 rw_enter(&page_ctrs_rwlock[mnode], RW_READER); 2483 page_freelist_lock(mnode); 2484 } 2485 for (r = mmu_page_sizes - 1; r > 0; r--) { 2486 for (mnode = mlo; mnode < mhi; mnode++) { 2487 pgcnt_t cands = 0; 2488 int mrange, nranges = mnode_nranges[mnode]; 2489 2490 for (mrange = 0; mrange < nranges; mrange++) { 2491 PGCTRS_CANDS_GETVALUE(mnode, mrange, r, cands); 2492 if (cands != 0) 2493 break; 2494 } 2495 if (cands == 0) { 2496 VM_STAT_ADD(vmm_vmstats. 2497 page_ctrs_cands_skip_all); 2498 continue; 2499 } 2500 2501 full = FULL_REGION_CNT(r); 2502 len = PAGE_COUNTERS_ENTRIES(mnode, r); 2503 2504 for (idx = 0; idx < len; idx++) { 2505 if (PAGE_COUNTERS(mnode, r, idx) == full) { 2506 pfn_t pfnum = 2507 IDX_TO_PNUM(mnode, r, idx); 2508 int tmnode = interleaved_mnodes ? 2509 PFN_2_MEM_NODE(pfnum) : mnode; 2510 2511 ASSERT(pfnum >= 2512 mem_node_config[tmnode].physbase && 2513 pfnum < 2514 mem_node_config[tmnode].physmax); 2515 2516 (void) page_promote(tmnode, 2517 pfnum, r, PC_FREE, PC_MTYPE_ANY); 2518 } 2519 } 2520 /* shared hpm_counters covers all mnodes, so we quit */ 2521 if (interleaved_mnodes) 2522 break; 2523 } 2524 } 2525 for (mnode = mlo; mnode < mhi; mnode++) { 2526 page_freelist_unlock(mnode); 2527 rw_exit(&page_ctrs_rwlock[mnode]); 2528 } 2529 } 2530 2531 /* 2532 * This is where all polices for moving pages around 2533 * to different page size free lists is implemented. 2534 * Returns 1 on success, 0 on failure. 2535 * 2536 * So far these are the priorities for this algorithm in descending 2537 * order: 2538 * 2539 * 1) When servicing a request try to do so with a free page 2540 * from next size up. Helps defer fragmentation as long 2541 * as possible. 2542 * 2543 * 2) Page coalesce on demand. Only when a freelist 2544 * larger than PAGESIZE is empty and step 1 2545 * will not work since all larger size lists are 2546 * also empty. 2547 * 2548 * If pfnhi is non-zero, search for large page with pfn range less than pfnhi. 2549 */ 2550 2551 page_t * 2552 page_freelist_split(uchar_t szc, uint_t color, int mnode, int mtype, 2553 pfn_t pfnlo, pfn_t pfnhi, page_list_walker_t *plw) 2554 { 2555 uchar_t nszc = szc + 1; 2556 uint_t bin, sbin, bin_prev; 2557 page_t *pp, *firstpp; 2558 page_t *ret_pp = NULL; 2559 uint_t color_mask; 2560 2561 if (nszc == mmu_page_sizes) 2562 return (NULL); 2563 2564 ASSERT(nszc < mmu_page_sizes); 2565 color_mask = PAGE_GET_PAGECOLORS(nszc) - 1; 2566 bin = sbin = PAGE_GET_NSZ_COLOR(szc, color); 2567 bin_prev = (plw->plw_bin_split_prev == color) ? INVALID_COLOR : 2568 PAGE_GET_NSZ_COLOR(szc, plw->plw_bin_split_prev); 2569 2570 VM_STAT_ADD(vmm_vmstats.pfs_req[szc]); 2571 /* 2572 * First try to break up a larger page to fill current size freelist. 2573 */ 2574 while (plw->plw_bins[nszc] != 0) { 2575 2576 ASSERT(nszc < mmu_page_sizes); 2577 2578 /* 2579 * If page found then demote it. 2580 */ 2581 if (PAGE_FREELISTS(mnode, nszc, bin, mtype)) { 2582 page_freelist_lock(mnode); 2583 firstpp = pp = PAGE_FREELISTS(mnode, nszc, bin, mtype); 2584 2585 /* 2586 * If pfnhi is not PFNNULL, look for large page below 2587 * pfnhi. PFNNULL signifies no pfn requirement. 
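 * The freelist is circular and linked through p_vpnext, so the walk below
 * terminates when it gets back to 'firstpp'.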
2588 */ 2589 if (pp && 2590 ((pfnhi != PFNNULL && pp->p_pagenum >= pfnhi) || 2591 (pfnlo != PFNNULL && pp->p_pagenum < pfnlo))) { 2592 do { 2593 pp = pp->p_vpnext; 2594 if (pp == firstpp) { 2595 pp = NULL; 2596 break; 2597 } 2598 } while ((pfnhi != PFNNULL && 2599 pp->p_pagenum >= pfnhi) || 2600 (pfnlo != PFNNULL && 2601 pp->p_pagenum < pfnlo)); 2602 2603 if (pfnhi != PFNNULL && pp != NULL) 2604 ASSERT(pp->p_pagenum < pfnhi); 2605 2606 if (pfnlo != PFNNULL && pp != NULL) 2607 ASSERT(pp->p_pagenum >= pfnlo); 2608 } 2609 if (pp) { 2610 uint_t ccolor = page_correct_color(szc, nszc, 2611 color, bin, plw->plw_ceq_mask[szc]); 2612 2613 ASSERT(pp->p_szc == nszc); 2614 VM_STAT_ADD(vmm_vmstats.pfs_demote[nszc]); 2615 ret_pp = page_demote(mnode, pp->p_pagenum, 2616 pfnhi, pp->p_szc, szc, ccolor, PC_ALLOC); 2617 if (ret_pp) { 2618 page_freelist_unlock(mnode); 2619 #if defined(__sparc) 2620 if (PP_ISNORELOC(ret_pp)) { 2621 pgcnt_t npgs; 2622 2623 npgs = page_get_pagecnt( 2624 ret_pp->p_szc); 2625 kcage_freemem_sub(npgs); 2626 } 2627 #endif 2628 return (ret_pp); 2629 } 2630 } 2631 page_freelist_unlock(mnode); 2632 } 2633 2634 /* loop through next size bins */ 2635 bin = ADD_MASKED(bin, 1, plw->plw_ceq_mask[nszc], color_mask); 2636 plw->plw_bins[nszc]--; 2637 2638 if (bin == sbin) { 2639 uchar_t nnszc = nszc + 1; 2640 2641 /* we are done with this page size - check next */ 2642 if (plw->plw_bins[nnszc] == 0) 2643 /* we have already checked next size bins */ 2644 break; 2645 2646 bin = sbin = PAGE_GET_NSZ_COLOR(nszc, bin); 2647 if (bin_prev != INVALID_COLOR) { 2648 bin_prev = PAGE_GET_NSZ_COLOR(nszc, bin_prev); 2649 if (!((bin ^ bin_prev) & 2650 plw->plw_ceq_mask[nnszc])) 2651 break; 2652 } 2653 ASSERT(nnszc < mmu_page_sizes); 2654 color_mask = PAGE_GET_PAGECOLORS(nnszc) - 1; 2655 nszc = nnszc; 2656 ASSERT(nszc < mmu_page_sizes); 2657 } 2658 } 2659 2660 return (ret_pp); 2661 } 2662 2663 /* 2664 * Helper routine used only by the freelist code to lock 2665 * a page. If the page is a large page then it succeeds in 2666 * locking all the constituent pages or none at all. 2667 * Returns 1 on sucess, 0 on failure. 2668 */ 2669 static int 2670 page_trylock_cons(page_t *pp, se_t se) 2671 { 2672 page_t *tpp, *first_pp = pp; 2673 2674 /* 2675 * Fail if can't lock first or only page. 2676 */ 2677 if (!page_trylock(pp, se)) { 2678 return (0); 2679 } 2680 2681 /* 2682 * PAGESIZE: common case. 2683 */ 2684 if (pp->p_szc == 0) { 2685 return (1); 2686 } 2687 2688 /* 2689 * Large page case. 2690 */ 2691 tpp = pp->p_next; 2692 while (tpp != pp) { 2693 if (!page_trylock(tpp, se)) { 2694 /* 2695 * On failure unlock what we have locked so far. 2696 * We want to avoid attempting to capture these 2697 * pages as the pcm mutex may be held which could 2698 * lead to a recursive mutex panic. 2699 */ 2700 while (first_pp != tpp) { 2701 page_unlock_nocapture(first_pp); 2702 first_pp = first_pp->p_next; 2703 } 2704 return (0); 2705 } 2706 tpp = tpp->p_next; 2707 } 2708 return (1); 2709 } 2710 2711 /* 2712 * init context for walking page lists 2713 * Called when a page of the given szc in unavailable. Sets markers 2714 * for the beginning of the search to detect when search has 2715 * completed a full cycle. Sets flags for splitting larger pages 2716 * and coalescing smaller pages. Page walking procedes until a page 2717 * of the desired equivalent color is found. 
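 * Illustration (assuming vac_colors == 1; values are examples only): with
 * 32 colors and a colorequivszc[] entry of 0x21, plw_ceq_dif is
 * 32 >> (2 + 1) = 4 and plw_ceq_mask[szc] is (4 - 1) << 1 = 0x6, i.e. colors
 * differing only in bit 0 or in the two high-order bits are equivalent.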
2718 */ 2719 void 2720 page_list_walk_init(uchar_t szc, uint_t flags, uint_t bin, int can_split, 2721 int use_ceq, page_list_walker_t *plw) 2722 { 2723 uint_t nszc, ceq_mask, colors; 2724 uchar_t ceq = use_ceq ? colorequivszc[szc] : 0; 2725 2726 ASSERT(szc < mmu_page_sizes); 2727 colors = PAGE_GET_PAGECOLORS(szc); 2728 2729 plw->plw_colors = colors; 2730 plw->plw_color_mask = colors - 1; 2731 plw->plw_bin_marker = plw->plw_bin0 = bin; 2732 plw->plw_bin_split_prev = bin; 2733 plw->plw_bin_step = (szc == 0) ? vac_colors : 1; 2734 2735 /* 2736 * if vac aliasing is possible make sure lower order color 2737 * bits are never ignored 2738 */ 2739 if (vac_colors > 1) 2740 ceq &= 0xf0; 2741 2742 /* 2743 * calculate the number of non-equivalent colors and 2744 * color equivalency mask 2745 */ 2746 plw->plw_ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf)); 2747 ASSERT(szc > 0 || plw->plw_ceq_dif >= vac_colors); 2748 ASSERT(plw->plw_ceq_dif > 0); 2749 plw->plw_ceq_mask[szc] = (plw->plw_ceq_dif - 1) << (ceq & 0xf); 2750 2751 if (flags & PG_MATCH_COLOR) { 2752 if (cpu_page_colors < 0) { 2753 /* 2754 * this is a heterogeneous machine with different CPUs 2755 * having different size e$ (not supported for ni2/rock 2756 */ 2757 uint_t cpucolors = CPUSETSIZE() >> PAGE_GET_SHIFT(szc); 2758 cpucolors = MAX(cpucolors, 1); 2759 ceq_mask = plw->plw_color_mask & (cpucolors - 1); 2760 plw->plw_ceq_mask[szc] = 2761 MIN(ceq_mask, plw->plw_ceq_mask[szc]); 2762 } 2763 plw->plw_ceq_dif = 1; 2764 } 2765 2766 /* we can split pages in the freelist, but not the cachelist */ 2767 if (can_split) { 2768 plw->plw_do_split = (szc + 1 < mmu_page_sizes) ? 1 : 0; 2769 2770 /* set next szc color masks and number of free list bins */ 2771 for (nszc = szc + 1; nszc < mmu_page_sizes; nszc++, szc++) { 2772 plw->plw_ceq_mask[nszc] = PAGE_GET_NSZ_MASK(szc, 2773 plw->plw_ceq_mask[szc]); 2774 plw->plw_bins[nszc] = PAGE_GET_PAGECOLORS(nszc); 2775 } 2776 plw->plw_ceq_mask[nszc] = INVALID_MASK; 2777 plw->plw_bins[nszc] = 0; 2778 2779 } else { 2780 ASSERT(szc == 0); 2781 plw->plw_do_split = 0; 2782 plw->plw_bins[1] = 0; 2783 plw->plw_ceq_mask[1] = INVALID_MASK; 2784 } 2785 } 2786 2787 /* 2788 * set mark to flag where next split should occur 2789 */ 2790 #define PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw) { \ 2791 uint_t bin_nsz = PAGE_GET_NSZ_COLOR(szc, bin); \ 2792 uint_t bin0_nsz = PAGE_GET_NSZ_COLOR(szc, plw->plw_bin0); \ 2793 uint_t neq_mask = ~plw->plw_ceq_mask[nszc] & plw->plw_color_mask; \ 2794 plw->plw_split_next = \ 2795 INC_MASKED(bin_nsz, neq_mask, plw->plw_color_mask); \ 2796 if (!((plw->plw_split_next ^ bin0_nsz) & plw->plw_ceq_mask[nszc])) { \ 2797 plw->plw_split_next = \ 2798 INC_MASKED(plw->plw_split_next, \ 2799 neq_mask, plw->plw_color_mask); \ 2800 } \ 2801 } 2802 2803 uint_t 2804 page_list_walk_next_bin(uchar_t szc, uint_t bin, page_list_walker_t *plw) 2805 { 2806 uint_t neq_mask = ~plw->plw_ceq_mask[szc] & plw->plw_color_mask; 2807 uint_t bin0_nsz, nbin_nsz, nbin0, nbin; 2808 uchar_t nszc = szc + 1; 2809 2810 nbin = ADD_MASKED(bin, 2811 plw->plw_bin_step, neq_mask, plw->plw_color_mask); 2812 2813 if (plw->plw_do_split) { 2814 plw->plw_bin_split_prev = bin; 2815 PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw); 2816 plw->plw_do_split = 0; 2817 } 2818 2819 if (szc == 0) { 2820 if (plw->plw_count != 0 || plw->plw_ceq_dif == vac_colors) { 2821 if (nbin == plw->plw_bin0 && 2822 (vac_colors == 1 || nbin != plw->plw_bin_marker)) { 2823 nbin = ADD_MASKED(nbin, plw->plw_bin_step, 2824 neq_mask, plw->plw_color_mask); 2825 
plw->plw_bin_split_prev = plw->plw_bin0; 2826 } 2827 2828 if (vac_colors > 1 && nbin == plw->plw_bin_marker) { 2829 plw->plw_bin_marker = 2830 nbin = INC_MASKED(nbin, neq_mask, 2831 plw->plw_color_mask); 2832 plw->plw_bin_split_prev = plw->plw_bin0; 2833 /* 2834 * large pages all have the same vac color 2835 * so by now we should be done with next 2836 * size page splitting process 2837 */ 2838 ASSERT(plw->plw_bins[1] == 0); 2839 plw->plw_do_split = 0; 2840 return (nbin); 2841 } 2842 2843 } else { 2844 uint_t bin_jump = (vac_colors == 1) ? 2845 (BIN_STEP & ~3) - (plw->plw_bin0 & 3) : BIN_STEP; 2846 2847 bin_jump &= ~(vac_colors - 1); 2848 2849 nbin0 = ADD_MASKED(plw->plw_bin0, bin_jump, neq_mask, 2850 plw->plw_color_mask); 2851 2852 if ((nbin0 ^ plw->plw_bin0) & plw->plw_ceq_mask[szc]) { 2853 2854 plw->plw_bin_marker = nbin = nbin0; 2855 2856 if (plw->plw_bins[nszc] != 0) { 2857 /* 2858 * check if next page size bin is the 2859 * same as the next page size bin for 2860 * bin0 2861 */ 2862 nbin_nsz = PAGE_GET_NSZ_COLOR(szc, 2863 nbin); 2864 bin0_nsz = PAGE_GET_NSZ_COLOR(szc, 2865 plw->plw_bin0); 2866 2867 if ((bin0_nsz ^ nbin_nsz) & 2868 plw->plw_ceq_mask[nszc]) 2869 plw->plw_do_split = 1; 2870 } 2871 return (nbin); 2872 } 2873 } 2874 } 2875 2876 if (plw->plw_bins[nszc] != 0) { 2877 nbin_nsz = PAGE_GET_NSZ_COLOR(szc, nbin); 2878 if (!((plw->plw_split_next ^ nbin_nsz) & 2879 plw->plw_ceq_mask[nszc])) 2880 plw->plw_do_split = 1; 2881 } 2882 2883 return (nbin); 2884 } 2885 2886 page_t * 2887 page_get_mnode_freelist(int mnode, uint_t bin, int mtype, uchar_t szc, 2888 uint_t flags) 2889 { 2890 kmutex_t *pcm; 2891 page_t *pp, *first_pp; 2892 uint_t sbin; 2893 int plw_initialized; 2894 page_list_walker_t plw; 2895 2896 ASSERT(szc < mmu_page_sizes); 2897 2898 VM_STAT_ADD(vmm_vmstats.pgmf_alloc[szc]); 2899 2900 MTYPE_START(mnode, mtype, flags); 2901 if (mtype < 0) { /* mnode does not have memory in mtype range */ 2902 VM_STAT_ADD(vmm_vmstats.pgmf_allocempty[szc]); 2903 return (NULL); 2904 } 2905 try_again: 2906 2907 plw_initialized = 0; 2908 plw.plw_ceq_dif = 1; 2909 2910 /* 2911 * Only hold one freelist lock at a time, that way we 2912 * can start anywhere and not have to worry about lock 2913 * ordering. 2914 */ 2915 for (plw.plw_count = 0; 2916 plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) { 2917 sbin = bin; 2918 do { 2919 if (!PAGE_FREELISTS(mnode, szc, bin, mtype)) 2920 goto bin_empty_1; 2921 2922 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST); 2923 mutex_enter(pcm); 2924 pp = PAGE_FREELISTS(mnode, szc, bin, mtype); 2925 if (pp == NULL) 2926 goto bin_empty_0; 2927 2928 /* 2929 * These were set before the page 2930 * was put on the free list, 2931 * they must still be set. 2932 */ 2933 ASSERT(PP_ISFREE(pp)); 2934 ASSERT(PP_ISAGED(pp)); 2935 ASSERT(pp->p_vnode == NULL); 2936 ASSERT(pp->p_hash == NULL); 2937 ASSERT(pp->p_offset == (u_offset_t)-1); 2938 ASSERT(pp->p_szc == szc); 2939 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode); 2940 2941 /* 2942 * Walk down the hash chain. 2943 * 8k pages are linked on p_next 2944 * and p_prev fields. Large pages 2945 * are a contiguous group of 2946 * constituent pages linked together 2947 * on their p_next and p_prev fields. 2948 * The large pages are linked together 2949 * on the hash chain using p_vpnext 2950 * p_vpprev of the base constituent 2951 * page of each large page. 
2952 */ 2953 first_pp = pp; 2954 while (!page_trylock_cons(pp, SE_EXCL)) { 2955 if (szc == 0) { 2956 pp = pp->p_next; 2957 } else { 2958 pp = pp->p_vpnext; 2959 } 2960 2961 ASSERT(PP_ISFREE(pp)); 2962 ASSERT(PP_ISAGED(pp)); 2963 ASSERT(pp->p_vnode == NULL); 2964 ASSERT(pp->p_hash == NULL); 2965 ASSERT(pp->p_offset == (u_offset_t)-1); 2966 ASSERT(pp->p_szc == szc); 2967 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode); 2968 2969 if (pp == first_pp) 2970 goto bin_empty_0; 2971 } 2972 2973 ASSERT(pp != NULL); 2974 ASSERT(mtype == PP_2_MTYPE(pp)); 2975 ASSERT(pp->p_szc == szc); 2976 if (szc == 0) { 2977 page_sub(&PAGE_FREELISTS(mnode, 2978 szc, bin, mtype), pp); 2979 } else { 2980 page_vpsub(&PAGE_FREELISTS(mnode, 2981 szc, bin, mtype), pp); 2982 CHK_LPG(pp, szc); 2983 } 2984 page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST); 2985 2986 if ((PP_ISFREE(pp) == 0) || (PP_ISAGED(pp) == 0)) 2987 panic("free page is not. pp %p", (void *)pp); 2988 mutex_exit(pcm); 2989 2990 #if defined(__sparc) 2991 ASSERT(!kcage_on || PP_ISNORELOC(pp) || 2992 (flags & PG_NORELOC) == 0); 2993 2994 if (PP_ISNORELOC(pp)) 2995 kcage_freemem_sub(page_get_pagecnt(szc)); 2996 #endif 2997 VM_STAT_ADD(vmm_vmstats.pgmf_allocok[szc]); 2998 return (pp); 2999 3000 bin_empty_0: 3001 mutex_exit(pcm); 3002 bin_empty_1: 3003 if (plw_initialized == 0) { 3004 page_list_walk_init(szc, flags, bin, 1, 1, 3005 &plw); 3006 plw_initialized = 1; 3007 ASSERT(plw.plw_colors <= 3008 PAGE_GET_PAGECOLORS(szc)); 3009 ASSERT(plw.plw_colors > 0); 3010 ASSERT((plw.plw_colors & 3011 (plw.plw_colors - 1)) == 0); 3012 ASSERT(bin < plw.plw_colors); 3013 ASSERT(plw.plw_ceq_mask[szc] < plw.plw_colors); 3014 } 3015 /* calculate the next bin with equivalent color */ 3016 bin = ADD_MASKED(bin, plw.plw_bin_step, 3017 plw.plw_ceq_mask[szc], plw.plw_color_mask); 3018 } while (sbin != bin); 3019 3020 /* 3021 * color bins are all empty if color match. Try and 3022 * satisfy the request by breaking up or coalescing 3023 * pages from a different size freelist of the correct 3024 * color that satisfies the ORIGINAL color requested. 3025 * If that fails then try pages of the same size but 3026 * different colors assuming we are not called with 3027 * PG_MATCH_COLOR. 3028 */ 3029 if (plw.plw_do_split && 3030 (pp = page_freelist_split(szc, bin, mnode, 3031 mtype, PFNNULL, PFNNULL, &plw)) != NULL) 3032 return (pp); 3033 3034 if (szc > 0 && (pp = page_freelist_coalesce(mnode, szc, 3035 bin, plw.plw_ceq_mask[szc], mtype, PFNNULL)) != NULL) 3036 return (pp); 3037 3038 if (plw.plw_ceq_dif > 1) 3039 bin = page_list_walk_next_bin(szc, bin, &plw); 3040 } 3041 3042 /* if allowed, cycle through additional mtypes */ 3043 MTYPE_NEXT(mnode, mtype, flags); 3044 if (mtype >= 0) 3045 goto try_again; 3046 3047 VM_STAT_ADD(vmm_vmstats.pgmf_allocfailed[szc]); 3048 3049 return (NULL); 3050 } 3051 3052 /* 3053 * Returns the count of free pages for 'pp' with size code 'szc'. 3054 * Note: This function does not return an exact value as the page freelist 3055 * locks are not held and thus the values in the page_counters may be 3056 * changing as we walk through the data. 
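 * Illustration (counts are examples only): if the counter for this szc
 * region reads 3 and the next smaller region size holds 8 PAGESIZE pages,
 * the three full sub-regions contribute 3 << PNUM_SHIFT(szc - 1) = 24 pages
 * up front; the loop below then adds the scaled counters of sub-regions
 * that are not themselves full.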
3057 */ 3058 static int 3059 page_freecnt(int mnode, page_t *pp, uchar_t szc) 3060 { 3061 pgcnt_t pgfree; 3062 pgcnt_t cnt; 3063 ssize_t r = szc; /* region size */ 3064 ssize_t idx; 3065 int i; 3066 int full, range; 3067 3068 /* Make sure pagenum passed in is aligned properly */ 3069 ASSERT((pp->p_pagenum & (PNUM_SIZE(szc) - 1)) == 0); 3070 ASSERT(szc > 0); 3071 3072 /* Prevent page_counters dynamic memory from being freed */ 3073 rw_enter(&page_ctrs_rwlock[mnode], RW_READER); 3074 idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum); 3075 cnt = PAGE_COUNTERS(mnode, r, idx); 3076 pgfree = cnt << PNUM_SHIFT(r - 1); 3077 range = FULL_REGION_CNT(szc); 3078 3079 /* Check for completely full region */ 3080 if (cnt == range) { 3081 rw_exit(&page_ctrs_rwlock[mnode]); 3082 return (pgfree); 3083 } 3084 3085 while (--r > 0) { 3086 idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum); 3087 full = FULL_REGION_CNT(r); 3088 for (i = 0; i < range; i++, idx++) { 3089 cnt = PAGE_COUNTERS(mnode, r, idx); 3090 /* 3091 * If cnt here is full, that means we have already 3092 * accounted for these pages earlier. 3093 */ 3094 if (cnt != full) { 3095 pgfree += (cnt << PNUM_SHIFT(r - 1)); 3096 } 3097 } 3098 range *= full; 3099 } 3100 rw_exit(&page_ctrs_rwlock[mnode]); 3101 return (pgfree); 3102 } 3103 3104 /* 3105 * Called from page_geti_contig_pages to exclusively lock constituent pages 3106 * starting from 'spp' for page size code 'szc'. 3107 * 3108 * If 'ptcpthreshold' is set, the number of free pages needed in the 'szc' 3109 * region needs to be greater than or equal to the threshold. 3110 */ 3111 static int 3112 page_trylock_contig_pages(int mnode, page_t *spp, uchar_t szc, int flags) 3113 { 3114 pgcnt_t pgcnt = PNUM_SIZE(szc); 3115 pgcnt_t pgfree, i; 3116 page_t *pp; 3117 3118 VM_STAT_ADD(vmm_vmstats.ptcp[szc]); 3119 3120 3121 if ((ptcpthreshold == 0) || (flags & PGI_PGCPHIPRI)) 3122 goto skipptcpcheck; 3123 /* 3124 * check if there are sufficient free pages available before attempting 3125 * to trylock. Count is approximate as page counters can change. 3126 */ 3127 pgfree = page_freecnt(mnode, spp, szc); 3128 3129 /* attempt to trylock if there are sufficient already free pages */ 3130 if (pgfree < pgcnt/ptcpthreshold) { 3131 VM_STAT_ADD(vmm_vmstats.ptcpfreethresh[szc]); 3132 return (0); 3133 } 3134 3135 skipptcpcheck: 3136 3137 for (i = 0; i < pgcnt; i++) { 3138 pp = &spp[i]; 3139 if (!page_trylock(pp, SE_EXCL)) { 3140 VM_STAT_ADD(vmm_vmstats.ptcpfailexcl[szc]); 3141 while (--i != (pgcnt_t)-1) { 3142 pp = &spp[i]; 3143 ASSERT(PAGE_EXCL(pp)); 3144 page_unlock_nocapture(pp); 3145 } 3146 return (0); 3147 } 3148 ASSERT(spp[i].p_pagenum == spp->p_pagenum + i); 3149 if ((pp->p_szc > szc || (szc && pp->p_szc == szc)) && 3150 !PP_ISFREE(pp)) { 3151 VM_STAT_ADD(vmm_vmstats.ptcpfailszc[szc]); 3152 ASSERT(i == 0); 3153 page_unlock_nocapture(pp); 3154 return (0); 3155 } 3156 if (PP_ISNORELOC(pp)) { 3157 VM_STAT_ADD(vmm_vmstats.ptcpfailcage[szc]); 3158 while (i != (pgcnt_t)-1) { 3159 pp = &spp[i]; 3160 ASSERT(PAGE_EXCL(pp)); 3161 page_unlock_nocapture(pp); 3162 i--; 3163 } 3164 return (0); 3165 } 3166 } 3167 VM_STAT_ADD(vmm_vmstats.ptcpok[szc]); 3168 return (1); 3169 } 3170 3171 /* 3172 * Claim large page pointed to by 'pp'. 'pp' is the starting set 3173 * of 'szc' constituent pages that had been locked exclusively previously. 3174 * Will attempt to relocate constituent pages in use. 
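 * Returns the list of claimed constituent pages (each with p_szc set to
 * 'szc') on success. On failure the constituent pages processed so far are
 * put back on the freelist, the remaining ones are unlocked, and NULL is
 * returned.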
3175 */ 3176 static page_t * 3177 page_claim_contig_pages(page_t *pp, uchar_t szc, int flags) 3178 { 3179 spgcnt_t pgcnt, npgs, i; 3180 page_t *targpp, *rpp, *hpp; 3181 page_t *replpp = NULL; 3182 page_t *pplist = NULL; 3183 3184 ASSERT(pp != NULL); 3185 3186 pgcnt = page_get_pagecnt(szc); 3187 while (pgcnt) { 3188 ASSERT(PAGE_EXCL(pp)); 3189 ASSERT(!PP_ISNORELOC(pp)); 3190 if (PP_ISFREE(pp)) { 3191 /* 3192 * If this is a PG_FREE_LIST page then its 3193 * size code can change underneath us due to 3194 * page promotion or demotion. As an optimzation 3195 * use page_list_sub_pages() instead of 3196 * page_list_sub(). 3197 */ 3198 if (PP_ISAGED(pp)) { 3199 page_list_sub_pages(pp, szc); 3200 if (pp->p_szc == szc) { 3201 return (pp); 3202 } 3203 ASSERT(pp->p_szc < szc); 3204 npgs = page_get_pagecnt(pp->p_szc); 3205 hpp = pp; 3206 for (i = 0; i < npgs; i++, pp++) { 3207 pp->p_szc = szc; 3208 } 3209 page_list_concat(&pplist, &hpp); 3210 pgcnt -= npgs; 3211 continue; 3212 } 3213 ASSERT(!PP_ISAGED(pp)); 3214 ASSERT(pp->p_szc == 0); 3215 page_list_sub(pp, PG_CACHE_LIST); 3216 page_hashout(pp, NULL); 3217 PP_SETAGED(pp); 3218 pp->p_szc = szc; 3219 page_list_concat(&pplist, &pp); 3220 pp++; 3221 pgcnt--; 3222 continue; 3223 } 3224 npgs = page_get_pagecnt(pp->p_szc); 3225 3226 /* 3227 * page_create_wait freemem accounting done by caller of 3228 * page_get_freelist and not necessary to call it prior to 3229 * calling page_get_replacement_page. 3230 * 3231 * page_get_replacement_page can call page_get_contig_pages 3232 * to acquire a large page (szc > 0); the replacement must be 3233 * smaller than the contig page size to avoid looping or 3234 * szc == 0 and PGI_PGCPSZC0 is set. 3235 */ 3236 if (pp->p_szc < szc || (szc == 0 && (flags & PGI_PGCPSZC0))) { 3237 replpp = page_get_replacement_page(pp, NULL, 0); 3238 if (replpp) { 3239 npgs = page_get_pagecnt(pp->p_szc); 3240 ASSERT(npgs <= pgcnt); 3241 targpp = pp; 3242 } 3243 } 3244 3245 /* 3246 * If replacement is NULL or do_page_relocate fails, fail 3247 * coalescing of pages. 3248 */ 3249 if (replpp == NULL || (do_page_relocate(&targpp, &replpp, 0, 3250 &npgs, NULL) != 0)) { 3251 /* 3252 * Unlock un-processed target list 3253 */ 3254 while (pgcnt--) { 3255 ASSERT(PAGE_EXCL(pp)); 3256 page_unlock_nocapture(pp); 3257 pp++; 3258 } 3259 /* 3260 * Free the processed target list. 
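 * (these pages already had p_szc set to the target size, so reset it to 0
 * as each one goes back on the freelist)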
3261 */ 3262 while (pplist) { 3263 pp = pplist; 3264 page_sub(&pplist, pp); 3265 ASSERT(PAGE_EXCL(pp)); 3266 ASSERT(pp->p_szc == szc); 3267 ASSERT(PP_ISFREE(pp)); 3268 ASSERT(PP_ISAGED(pp)); 3269 pp->p_szc = 0; 3270 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL); 3271 page_unlock_nocapture(pp); 3272 } 3273 3274 if (replpp != NULL) 3275 page_free_replacement_page(replpp); 3276 3277 return (NULL); 3278 } 3279 ASSERT(pp == targpp); 3280 3281 /* LINTED */ 3282 ASSERT(hpp = pp); /* That's right, it's an assignment */ 3283 3284 pp += npgs; 3285 pgcnt -= npgs; 3286 3287 while (npgs--) { 3288 ASSERT(PAGE_EXCL(targpp)); 3289 ASSERT(!PP_ISFREE(targpp)); 3290 ASSERT(!PP_ISNORELOC(targpp)); 3291 PP_SETFREE(targpp); 3292 ASSERT(PP_ISAGED(targpp)); 3293 ASSERT(targpp->p_szc < szc || (szc == 0 && 3294 (flags & PGI_PGCPSZC0))); 3295 targpp->p_szc = szc; 3296 targpp = targpp->p_next; 3297 3298 rpp = replpp; 3299 ASSERT(rpp != NULL); 3300 page_sub(&replpp, rpp); 3301 ASSERT(PAGE_EXCL(rpp)); 3302 ASSERT(!PP_ISFREE(rpp)); 3303 page_unlock_nocapture(rpp); 3304 } 3305 ASSERT(targpp == hpp); 3306 ASSERT(replpp == NULL); 3307 page_list_concat(&pplist, &targpp); 3308 } 3309 CHK_LPG(pplist, szc); 3310 return (pplist); 3311 } 3312 3313 /* 3314 * Trim kernel cage from pfnlo-pfnhi and store result in lo-hi. Return code 3315 * of 0 means nothing left after trim. 3316 */ 3317 int 3318 trimkcage(struct memseg *mseg, pfn_t *lo, pfn_t *hi, pfn_t pfnlo, pfn_t pfnhi) 3319 { 3320 pfn_t kcagepfn; 3321 int decr; 3322 int rc = 0; 3323 3324 if (PP_ISNORELOC(mseg->pages)) { 3325 if (PP_ISNORELOC(mseg->epages - 1) == 0) { 3326 3327 /* lower part of this mseg inside kernel cage */ 3328 decr = kcage_current_pfn(&kcagepfn); 3329 3330 /* kernel cage may have transitioned past mseg */ 3331 if (kcagepfn >= mseg->pages_base && 3332 kcagepfn < mseg->pages_end) { 3333 ASSERT(decr == 0); 3334 *lo = MAX(kcagepfn, pfnlo); 3335 *hi = MIN(pfnhi, (mseg->pages_end - 1)); 3336 rc = 1; 3337 } 3338 } 3339 /* else entire mseg in the cage */ 3340 } else { 3341 if (PP_ISNORELOC(mseg->epages - 1)) { 3342 3343 /* upper part of this mseg inside kernel cage */ 3344 decr = kcage_current_pfn(&kcagepfn); 3345 3346 /* kernel cage may have transitioned past mseg */ 3347 if (kcagepfn >= mseg->pages_base && 3348 kcagepfn < mseg->pages_end) { 3349 ASSERT(decr); 3350 *hi = MIN(kcagepfn, pfnhi); 3351 *lo = MAX(pfnlo, mseg->pages_base); 3352 rc = 1; 3353 } 3354 } else { 3355 /* entire mseg outside of kernel cage */ 3356 *lo = MAX(pfnlo, mseg->pages_base); 3357 *hi = MIN(pfnhi, (mseg->pages_end - 1)); 3358 rc = 1; 3359 } 3360 } 3361 return (rc); 3362 } 3363 3364 /* 3365 * called from page_get_contig_pages to search 'pfnlo' thru 'pfnhi' to claim a 3366 * page with size code 'szc'. Claiming such a page requires acquiring 3367 * exclusive locks on all constituent pages (page_trylock_contig_pages), 3368 * relocating pages in use and concatenating these constituent pages into a 3369 * large page. 3370 * 3371 * The page lists do not have such a large page and page_freelist_split has 3372 * already failed to demote larger pages and/or coalesce smaller free pages. 3373 * 3374 * 'flags' may specify PG_COLOR_MATCH which would limit the search of large 3375 * pages with the same color as 'bin'. 3376 * 3377 * 'pfnflag' specifies the subset of the pfn range to search. 
3378 */ 3379 3380 static page_t * 3381 page_geti_contig_pages(int mnode, uint_t bin, uchar_t szc, int flags, 3382 pfn_t pfnlo, pfn_t pfnhi, pgcnt_t pfnflag) 3383 { 3384 struct memseg *mseg; 3385 pgcnt_t szcpgcnt = page_get_pagecnt(szc); 3386 pgcnt_t szcpgmask = szcpgcnt - 1; 3387 pfn_t randpfn; 3388 page_t *pp, *randpp, *endpp; 3389 uint_t colors, ceq_mask; 3390 /* LINTED : set but not used in function */ 3391 uint_t color_mask; 3392 pfn_t hi, lo; 3393 uint_t skip; 3394 MEM_NODE_ITERATOR_DECL(it); 3395 3396 ASSERT(szc != 0 || (flags & PGI_PGCPSZC0)); 3397 3398 pfnlo = P2ROUNDUP(pfnlo, szcpgcnt); 3399 3400 if ((pfnhi - pfnlo) + 1 < szcpgcnt || pfnlo >= pfnhi) 3401 return (NULL); 3402 3403 ASSERT(szc < mmu_page_sizes); 3404 3405 colors = PAGE_GET_PAGECOLORS(szc); 3406 color_mask = colors - 1; 3407 if ((colors > 1) && (flags & PG_MATCH_COLOR)) { 3408 uchar_t ceq = colorequivszc[szc]; 3409 uint_t ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf)); 3410 3411 ASSERT(ceq_dif > 0); 3412 ceq_mask = (ceq_dif - 1) << (ceq & 0xf); 3413 } else { 3414 ceq_mask = 0; 3415 } 3416 3417 ASSERT(bin < colors); 3418 3419 /* clear "non-significant" color bits */ 3420 bin &= ceq_mask; 3421 3422 /* 3423 * trim the pfn range to search based on pfnflag. pfnflag is set 3424 * when there have been previous page_get_contig_page failures to 3425 * limit the search. 3426 * 3427 * The high bit in pfnflag specifies the number of 'slots' in the 3428 * pfn range and the remainder of pfnflag specifies which slot. 3429 * For example, a value of 1010b would mean the second slot of 3430 * the pfn range that has been divided into 8 slots. 3431 */ 3432 if (pfnflag > 1) { 3433 int slots = 1 << (highbit(pfnflag) - 1); 3434 int slotid = pfnflag & (slots - 1); 3435 pgcnt_t szcpages; 3436 int slotlen; 3437 3438 pfnhi = P2ALIGN((pfnhi + 1), szcpgcnt) - 1; 3439 szcpages = ((pfnhi - pfnlo) + 1) / szcpgcnt; 3440 slotlen = howmany(szcpages, slots); 3441 /* skip if 'slotid' slot is empty */ 3442 if (slotid * slotlen >= szcpages) 3443 return (NULL); 3444 pfnlo = pfnlo + (((slotid * slotlen) % szcpages) * szcpgcnt); 3445 ASSERT(pfnlo < pfnhi); 3446 if (pfnhi > pfnlo + (slotlen * szcpgcnt)) 3447 pfnhi = pfnlo + (slotlen * szcpgcnt) - 1; 3448 } 3449 3450 memsegs_lock(0); 3451 3452 /* 3453 * loop through memsegs to look for contig page candidates 3454 */ 3455 3456 for (mseg = memsegs; mseg != NULL; mseg = mseg->next) { 3457 if (pfnhi < mseg->pages_base || pfnlo >= mseg->pages_end) { 3458 /* no overlap */ 3459 continue; 3460 } 3461 3462 if (mseg->pages_end - mseg->pages_base < szcpgcnt) 3463 /* mseg too small */ 3464 continue; 3465 3466 /* 3467 * trim off kernel cage pages from pfn range and check for 3468 * a trimmed pfn range returned that does not span the 3469 * desired large page size. 3470 */ 3471 if (kcage_on) { 3472 if (trimkcage(mseg, &lo, &hi, pfnlo, pfnhi) == 0 || 3473 lo >= hi || ((hi - lo) + 1) < szcpgcnt) 3474 continue; 3475 } else { 3476 lo = MAX(pfnlo, mseg->pages_base); 3477 hi = MIN(pfnhi, (mseg->pages_end - 1)); 3478 } 3479 3480 /* round to szcpgcnt boundaries */ 3481 lo = P2ROUNDUP(lo, szcpgcnt); 3482 3483 MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it); 3484 hi = P2ALIGN((hi + 1), szcpgcnt) - 1; 3485 3486 if (hi <= lo) 3487 continue; 3488 3489 /* 3490 * set lo to point to the pfn for the desired bin. 
Large 3491 * page sizes may only have a single page color 3492 */ 3493 skip = szcpgcnt; 3494 if (ceq_mask > 0 || interleaved_mnodes) { 3495 /* set lo to point at appropriate color */ 3496 if (((PFN_2_COLOR(lo, szc, &it) ^ bin) & ceq_mask) || 3497 (interleaved_mnodes && 3498 PFN_2_MEM_NODE(lo) != mnode)) { 3499 PAGE_NEXT_PFN_FOR_COLOR(lo, szc, bin, ceq_mask, 3500 color_mask, &it); 3501 } 3502 if (hi <= lo) 3503 /* mseg cannot satisfy color request */ 3504 continue; 3505 } 3506 3507 /* randomly choose a point between lo and hi to begin search */ 3508 3509 randpfn = (pfn_t)GETTICK(); 3510 randpfn = ((randpfn % (hi - lo)) + lo) & ~(skip - 1); 3511 MEM_NODE_ITERATOR_INIT(randpfn, mnode, szc, &it); 3512 if (ceq_mask || interleaved_mnodes || randpfn == (pfn_t)-1) { 3513 if (randpfn != (pfn_t)-1) { 3514 PAGE_NEXT_PFN_FOR_COLOR(randpfn, szc, bin, 3515 ceq_mask, color_mask, &it); 3516 } 3517 if (randpfn >= hi) { 3518 randpfn = lo; 3519 MEM_NODE_ITERATOR_INIT(randpfn, mnode, szc, 3520 &it); 3521 } 3522 } 3523 randpp = mseg->pages + (randpfn - mseg->pages_base); 3524 3525 ASSERT(randpp->p_pagenum == randpfn); 3526 3527 pp = randpp; 3528 endpp = mseg->pages + (hi - mseg->pages_base) + 1; 3529 3530 ASSERT(randpp + szcpgcnt <= endpp); 3531 3532 do { 3533 ASSERT(!(pp->p_pagenum & szcpgmask)); 3534 ASSERT(((PP_2_BIN(pp) ^ bin) & ceq_mask) == 0); 3535 3536 if (page_trylock_contig_pages(mnode, pp, szc, flags)) { 3537 /* pages unlocked by page_claim on failure */ 3538 if (page_claim_contig_pages(pp, szc, flags)) { 3539 memsegs_unlock(0); 3540 return (pp); 3541 } 3542 } 3543 3544 if (ceq_mask == 0 && !interleaved_mnodes) { 3545 pp += skip; 3546 } else { 3547 pfn_t pfn = pp->p_pagenum; 3548 3549 PAGE_NEXT_PFN_FOR_COLOR(pfn, szc, bin, 3550 ceq_mask, color_mask, &it); 3551 if (pfn == (pfn_t)-1) { 3552 pp = endpp; 3553 } else { 3554 pp = mseg->pages + 3555 (pfn - mseg->pages_base); 3556 } 3557 } 3558 if (pp >= endpp) { 3559 /* start from the beginning */ 3560 MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it); 3561 pp = mseg->pages + (lo - mseg->pages_base); 3562 ASSERT(pp->p_pagenum == lo); 3563 ASSERT(pp + szcpgcnt <= endpp); 3564 } 3565 } while (pp != randpp); 3566 } 3567 memsegs_unlock(0); 3568 return (NULL); 3569 } 3570 3571 3572 /* 3573 * controlling routine that searches through physical memory in an attempt to 3574 * claim a large page based on the input parameters. 3575 * on the page free lists. 3576 * 3577 * calls page_geti_contig_pages with an initial pfn range from the mnode 3578 * and mtype. page_geti_contig_pages will trim off the parts of the pfn range 3579 * that overlaps with the kernel cage or does not match the requested page 3580 * color if PG_MATCH_COLOR is set. Since this search is very expensive, 3581 * page_geti_contig_pages may further limit the search range based on 3582 * previous failure counts (pgcpfailcnt[]). 3583 * 3584 * for PGI_PGCPSZC0 requests, page_get_contig_pages will relocate a base 3585 * pagesize page that satisfies mtype. 
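 * Illustration: with pgcplimitsearch set and pgcpfailcnt[szc] == 8 (an
 * example value), the pfn range is divided into 8 slots and only one slot is
 * searched; a success below halves the count, doubling the slot size for the
 * next attempt, while failures in page_get_freelist() grow it again via
 * SETPGCPFAILCNT().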
3586 */ 3587 page_t * 3588 page_get_contig_pages(int mnode, uint_t bin, int mtype, uchar_t szc, 3589 uint_t flags) 3590 { 3591 pfn_t pfnlo, pfnhi; /* contig pages pfn range */ 3592 page_t *pp; 3593 pgcnt_t pfnflag = 0; /* no limit on search if 0 */ 3594 3595 VM_STAT_ADD(vmm_vmstats.pgcp_alloc[szc]); 3596 3597 /* no allocations from cage */ 3598 flags |= PGI_NOCAGE; 3599 3600 /* LINTED */ 3601 MTYPE_START(mnode, mtype, flags); 3602 if (mtype < 0) { /* mnode does not have memory in mtype range */ 3603 VM_STAT_ADD(vmm_vmstats.pgcp_allocempty[szc]); 3604 return (NULL); 3605 } 3606 3607 ASSERT(szc > 0 || (flags & PGI_PGCPSZC0)); 3608 3609 /* do not limit search and ignore color if hi pri */ 3610 3611 if (pgcplimitsearch && ((flags & PGI_PGCPHIPRI) == 0)) 3612 pfnflag = pgcpfailcnt[szc]; 3613 3614 /* remove color match to improve chances */ 3615 3616 if (flags & PGI_PGCPHIPRI || pfnflag) 3617 flags &= ~PG_MATCH_COLOR; 3618 3619 do { 3620 /* get pfn range based on mnode and mtype */ 3621 MNODETYPE_2_PFN(mnode, mtype, pfnlo, pfnhi); 3622 3623 ASSERT(pfnhi >= pfnlo); 3624 3625 pp = page_geti_contig_pages(mnode, bin, szc, flags, 3626 pfnlo, pfnhi, pfnflag); 3627 3628 if (pp != NULL) { 3629 pfnflag = pgcpfailcnt[szc]; 3630 if (pfnflag) { 3631 /* double the search size */ 3632 pgcpfailcnt[szc] = pfnflag >> 1; 3633 } 3634 VM_STAT_ADD(vmm_vmstats.pgcp_allocok[szc]); 3635 return (pp); 3636 } 3637 MTYPE_NEXT(mnode, mtype, flags); 3638 } while (mtype >= 0); 3639 3640 VM_STAT_ADD(vmm_vmstats.pgcp_allocfailed[szc]); 3641 return (NULL); 3642 } 3643 3644 #if defined(__i386) || defined(__amd64) 3645 /* 3646 * Determine the likelihood of finding/coalescing a szc page. 3647 * Return 0 if the likelihood is small otherwise return 1. 3648 * 3649 * For now, be conservative and check only 1g pages and return 0 3650 * if there had been previous coalescing failures and the szc pages 3651 * needed to satisfy request would exhaust most of freemem. 3652 */ 3653 int 3654 page_chk_freelist(uint_t szc) 3655 { 3656 pgcnt_t pgcnt; 3657 3658 if (szc <= 1) 3659 return (1); 3660 3661 pgcnt = page_get_pagecnt(szc); 3662 if (pgcpfailcnt[szc] && pgcnt + throttlefree >= freemem) { 3663 VM_STAT_ADD(vmm_vmstats.pcf_deny[szc]); 3664 return (0); 3665 } 3666 VM_STAT_ADD(vmm_vmstats.pcf_allow[szc]); 3667 return (1); 3668 } 3669 #endif 3670 3671 /* 3672 * Find the `best' page on the freelist for this (vp,off) (as,vaddr) pair. 3673 * 3674 * Does its own locking and accounting. 3675 * If PG_MATCH_COLOR is set, then NULL will be returned if there are no 3676 * pages of the proper color even if there are pages of a different color. 3677 * 3678 * Finds a page, removes it, THEN locks it. 3679 */ 3680 3681 /*ARGSUSED*/ 3682 page_t * 3683 page_get_freelist(struct vnode *vp, u_offset_t off, struct seg *seg, 3684 caddr_t vaddr, size_t size, uint_t flags, struct lgrp *lgrp) 3685 { 3686 struct as *as = seg->s_as; 3687 page_t *pp = NULL; 3688 ulong_t bin; 3689 uchar_t szc; 3690 int mnode; 3691 int mtype; 3692 page_t *(*page_get_func)(int, uint_t, int, uchar_t, uint_t); 3693 lgrp_mnode_cookie_t lgrp_cookie; 3694 3695 page_get_func = page_get_mnode_freelist; 3696 3697 /* 3698 * If we aren't passed a specific lgroup, or passed a freed lgrp 3699 * assume we wish to allocate near to the current thread's home. 
3700 */ 3701 if (!LGRP_EXISTS(lgrp)) 3702 lgrp = lgrp_home_lgrp(); 3703 3704 if (kcage_on) { 3705 if ((flags & (PG_NORELOC | PG_PANIC)) == PG_NORELOC && 3706 kcage_freemem < kcage_throttlefree + btop(size) && 3707 curthread != kcage_cageout_thread) { 3708 /* 3709 * Set a "reserve" of kcage_throttlefree pages for 3710 * PG_PANIC and cageout thread allocations. 3711 * 3712 * Everybody else has to serialize in 3713 * page_create_get_something() to get a cage page, so 3714 * that we don't deadlock cageout! 3715 */ 3716 return (NULL); 3717 } 3718 } else { 3719 flags &= ~PG_NORELOC; 3720 flags |= PGI_NOCAGE; 3721 } 3722 3723 /* LINTED */ 3724 MTYPE_INIT(mtype, vp, vaddr, flags, size); 3725 3726 /* 3727 * Convert size to page size code. 3728 */ 3729 if ((szc = page_szc(size)) == (uchar_t)-1) 3730 panic("page_get_freelist: illegal page size request"); 3731 ASSERT(szc < mmu_page_sizes); 3732 3733 VM_STAT_ADD(vmm_vmstats.pgf_alloc[szc]); 3734 3735 /* LINTED */ 3736 AS_2_BIN(as, seg, vp, vaddr, bin, szc); 3737 3738 ASSERT(bin < PAGE_GET_PAGECOLORS(szc)); 3739 3740 /* 3741 * Try to get a local page first, but try remote if we can't 3742 * get a page of the right color. 3743 */ 3744 pgretry: 3745 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL); 3746 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { 3747 pp = page_get_func(mnode, bin, mtype, szc, flags); 3748 if (pp != NULL) { 3749 VM_STAT_ADD(vmm_vmstats.pgf_allocok[szc]); 3750 DTRACE_PROBE4(page__get, 3751 lgrp_t *, lgrp, 3752 int, mnode, 3753 ulong_t, bin, 3754 uint_t, flags); 3755 return (pp); 3756 } 3757 } 3758 ASSERT(pp == NULL); 3759 3760 /* 3761 * for non-SZC0 PAGESIZE requests, check cachelist before checking 3762 * remote free lists. Caller expected to call page_get_cachelist which 3763 * will check local cache lists and remote free lists. 3764 */ 3765 if (szc == 0 && ((flags & PGI_PGCPSZC0) == 0)) { 3766 VM_STAT_ADD(vmm_vmstats.pgf_allocdeferred); 3767 return (NULL); 3768 } 3769 3770 ASSERT(szc > 0 || (flags & PGI_PGCPSZC0)); 3771 3772 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1); 3773 3774 if (!(flags & PG_LOCAL)) { 3775 /* 3776 * Try to get a non-local freelist page. 3777 */ 3778 LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie); 3779 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { 3780 pp = page_get_func(mnode, bin, mtype, szc, flags); 3781 if (pp != NULL) { 3782 DTRACE_PROBE4(page__get, 3783 lgrp_t *, lgrp, 3784 int, mnode, 3785 ulong_t, bin, 3786 uint_t, flags); 3787 VM_STAT_ADD(vmm_vmstats.pgf_allocokrem[szc]); 3788 return (pp); 3789 } 3790 } 3791 ASSERT(pp == NULL); 3792 } 3793 3794 /* 3795 * when the cage is off chances are page_get_contig_pages() will fail 3796 * to lock a large page chunk therefore when the cage is off it's not 3797 * called by default. this can be changed via /etc/system. 3798 * 3799 * page_get_contig_pages() also called to acquire a base pagesize page 3800 * for page_create_get_something(). 3801 */ 3802 if (!(flags & PG_NORELOC) && (pg_contig_disable == 0) && 3803 (kcage_on || pg_lpgcreate_nocage || szc == 0) && 3804 (page_get_func != page_get_contig_pages)) { 3805 3806 VM_STAT_ADD(vmm_vmstats.pgf_allocretry[szc]); 3807 page_get_func = page_get_contig_pages; 3808 goto pgretry; 3809 } 3810 3811 if (!(flags & PG_LOCAL) && pgcplimitsearch && 3812 page_get_func == page_get_contig_pages) 3813 SETPGCPFAILCNT(szc); 3814 3815 VM_STAT_ADD(vmm_vmstats.pgf_allocfailed[szc]); 3816 return (NULL); 3817 } 3818 3819 /* 3820 * Find the `best' page on the cachelist for this (vp,off) (as,vaddr) pair. 
3821 * 3822 * Does its own locking. 3823 * If PG_MATCH_COLOR is set, then NULL will be returned if there are no 3824 * pages of the proper color even if there are pages of a different color. 3825 * Otherwise, scan the bins for ones with pages. For each bin with pages, 3826 * try to lock one of them. If no page can be locked, try the 3827 * next bin. Return NULL if a page cannot be found and locked. 3828 * 3829 * Finds a page, tries to lock it, then removes it. 3830 */ 3831 3832 /*ARGSUSED*/ 3833 page_t * 3834 page_get_cachelist(struct vnode *vp, u_offset_t off, struct seg *seg, 3835 caddr_t vaddr, uint_t flags, struct lgrp *lgrp) 3836 { 3837 page_t *pp; 3838 struct as *as = seg->s_as; 3839 ulong_t bin; 3840 /*LINTED*/ 3841 int mnode; 3842 int mtype; 3843 lgrp_mnode_cookie_t lgrp_cookie; 3844 3845 /* 3846 * If we aren't passed a specific lgroup, or passed a freed lgrp, 3847 * assume we wish to allocate near to the current thread's home. 3848 */ 3849 if (!LGRP_EXISTS(lgrp)) 3850 lgrp = lgrp_home_lgrp(); 3851 3852 if (!kcage_on) { 3853 flags &= ~PG_NORELOC; 3854 flags |= PGI_NOCAGE; 3855 } 3856 3857 if ((flags & (PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == PG_NORELOC && 3858 kcage_freemem <= kcage_throttlefree) { 3859 /* 3860 * Reserve kcage_throttlefree pages for critical kernel 3861 * threads. 3862 * 3863 * Everybody else has to go to page_create_get_something() 3864 * to get a cage page, so we don't deadlock cageout. 3865 */ 3866 return (NULL); 3867 } 3868 3869 /* LINTED */ 3870 AS_2_BIN(as, seg, vp, vaddr, bin, 0); 3871 3872 ASSERT(bin < PAGE_GET_PAGECOLORS(0)); 3873 3874 /* LINTED */ 3875 MTYPE_INIT(mtype, vp, vaddr, flags, MMU_PAGESIZE); 3876 3877 VM_STAT_ADD(vmm_vmstats.pgc_alloc); 3878 3879 /* 3880 * Try local cachelists first 3881 */ 3882 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL); 3883 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { 3884 pp = page_get_mnode_cachelist(bin, flags, mnode, mtype); 3885 if (pp != NULL) { 3886 VM_STAT_ADD(vmm_vmstats.pgc_allocok); 3887 DTRACE_PROBE4(page__get, 3888 lgrp_t *, lgrp, 3889 int, mnode, 3890 ulong_t, bin, 3891 uint_t, flags); 3892 return (pp); 3893 } 3894 } 3895 3896 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1); 3897 3898 /* 3899 * Try freelists/cachelists that are farther away. 3900 * This is our only chance to allocate remote pages for PAGESIZE 3901 * requests.
3902 */ 3903 LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie); 3904 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { 3905 pp = page_get_mnode_freelist(mnode, bin, mtype, 3906 0, flags); 3907 if (pp != NULL) { 3908 VM_STAT_ADD(vmm_vmstats.pgc_allocokdeferred); 3909 DTRACE_PROBE4(page__get, 3910 lgrp_t *, lgrp, 3911 int, mnode, 3912 ulong_t, bin, 3913 uint_t, flags); 3914 return (pp); 3915 } 3916 pp = page_get_mnode_cachelist(bin, flags, mnode, mtype); 3917 if (pp != NULL) { 3918 VM_STAT_ADD(vmm_vmstats.pgc_allocokrem); 3919 DTRACE_PROBE4(page__get, 3920 lgrp_t *, lgrp, 3921 int, mnode, 3922 ulong_t, bin, 3923 uint_t, flags); 3924 return (pp); 3925 } 3926 } 3927 3928 VM_STAT_ADD(vmm_vmstats.pgc_allocfailed); 3929 return (NULL); 3930 } 3931 3932 page_t * 3933 page_get_mnode_cachelist(uint_t bin, uint_t flags, int mnode, int mtype) 3934 { 3935 kmutex_t *pcm; 3936 page_t *pp, *first_pp; 3937 uint_t sbin; 3938 int plw_initialized; 3939 page_list_walker_t plw; 3940 3941 VM_STAT_ADD(vmm_vmstats.pgmc_alloc); 3942 3943 /* LINTED */ 3944 MTYPE_START(mnode, mtype, flags); 3945 if (mtype < 0) { /* mnode does not have memory in mtype range */ 3946 VM_STAT_ADD(vmm_vmstats.pgmc_allocempty); 3947 return (NULL); 3948 } 3949 3950 try_again: 3951 3952 plw_initialized = 0; 3953 plw.plw_ceq_dif = 1; 3954 3955 /* 3956 * Only hold one cachelist lock at a time, that way we 3957 * can start anywhere and not have to worry about lock 3958 * ordering. 3959 */ 3960 3961 for (plw.plw_count = 0; 3962 plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) { 3963 sbin = bin; 3964 do { 3965 3966 if (!PAGE_CACHELISTS(mnode, bin, mtype)) 3967 goto bin_empty_1; 3968 pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST); 3969 mutex_enter(pcm); 3970 pp = PAGE_CACHELISTS(mnode, bin, mtype); 3971 if (pp == NULL) 3972 goto bin_empty_0; 3973 3974 first_pp = pp; 3975 ASSERT(pp->p_vnode); 3976 ASSERT(PP_ISAGED(pp) == 0); 3977 ASSERT(pp->p_szc == 0); 3978 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode); 3979 while (!page_trylock(pp, SE_EXCL)) { 3980 pp = pp->p_next; 3981 ASSERT(pp->p_szc == 0); 3982 if (pp == first_pp) { 3983 /* 3984 * We have searched the complete list! 3985 * And all of them (might only be one) 3986 * are locked. This can happen since 3987 * these pages can also be found via 3988 * the hash list. When found via the 3989 * hash list, they are locked first, 3990 * then removed. We give up to let the 3991 * other thread run. 3992 */ 3993 pp = NULL; 3994 break; 3995 } 3996 ASSERT(pp->p_vnode); 3997 ASSERT(PP_ISFREE(pp)); 3998 ASSERT(PP_ISAGED(pp) == 0); 3999 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == 4000 mnode); 4001 } 4002 4003 if (pp) { 4004 page_t **ppp; 4005 /* 4006 * Found and locked a page. 4007 * Pull it off the list. 4008 */ 4009 ASSERT(mtype == PP_2_MTYPE(pp)); 4010 ppp = &PAGE_CACHELISTS(mnode, bin, mtype); 4011 page_sub(ppp, pp); 4012 /* 4013 * Subtract counters before releasing pcm mutex 4014 * to avoid a race with page_freelist_coalesce 4015 * and page_freelist_split. 4016 */ 4017 page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST); 4018 mutex_exit(pcm); 4019 ASSERT(pp->p_vnode); 4020 ASSERT(PP_ISAGED(pp) == 0); 4021 #if defined(__sparc) 4022 ASSERT(!kcage_on || 4023 (flags & PG_NORELOC) == 0 || 4024 PP_ISNORELOC(pp)); 4025 if (PP_ISNORELOC(pp)) { 4026 kcage_freemem_sub(1); 4027 } 4028 #endif 4029 VM_STAT_ADD(vmm_vmstats. 
	MTYPE_NEXT(mnode, mtype, flags);
	if (mtype >= 0)
		goto try_again;

	VM_STAT_ADD(vmm_vmstats.pgmc_allocfailed);
	return (NULL);
}

#ifdef DEBUG
#define	REPL_PAGE_STATS
#endif /* DEBUG */

#ifdef REPL_PAGE_STATS
struct repl_page_stats {
	uint_t	ngets;
	uint_t	ngets_noreloc;
	uint_t	npgr_noreloc;
	uint_t	nnopage_first;
	uint_t	nnopage;
	uint_t	nhashout;
	uint_t	nnofree;
	uint_t	nnext_pp;
} repl_page_stats;
#define	REPL_STAT_INCR(v)	atomic_add_32(&repl_page_stats.v, 1)
#else	/* REPL_PAGE_STATS */
#define	REPL_STAT_INCR(v)
#endif	/* REPL_PAGE_STATS */

int	pgrppgcp;

/*
 * The freemem accounting must be done by the caller.
 * First we try to get a replacement page of the same size as like_pp,
 * if that is not possible, then we just get a set of discontiguous
 * PAGESIZE pages.
 */
page_t *
page_get_replacement_page(page_t *orig_like_pp, struct lgrp *lgrp_target,
    uint_t pgrflags)
{
	page_t		*like_pp;
	page_t		*pp, *pplist;
	page_t		*pl = NULL;
	ulong_t		bin;
	int		mnode, page_mnode;
	int		szc;
	spgcnt_t	npgs, pg_cnt;
	pfn_t		pfnum;
	int		mtype;
	int		flags = 0;
	lgrp_mnode_cookie_t	lgrp_cookie;
	lgrp_t		*lgrp;

	REPL_STAT_INCR(ngets);
	like_pp = orig_like_pp;
	ASSERT(PAGE_EXCL(like_pp));

	szc = like_pp->p_szc;
	npgs = page_get_pagecnt(szc);
	/*
	 * Now we reset like_pp to the base page_t.
	 * That way, we won't walk past the end of this 'szc' page.
	 */
	pfnum = PFN_BASE(like_pp->p_pagenum, szc);
	like_pp = page_numtopp_nolock(pfnum);
	ASSERT(like_pp->p_szc == szc);

	if (PP_ISNORELOC(like_pp)) {
		ASSERT(kcage_on);
		REPL_STAT_INCR(ngets_noreloc);
		flags = PGI_RELOCONLY;
	} else if (pgrflags & PGR_NORELOC) {
		ASSERT(kcage_on);
		REPL_STAT_INCR(npgr_noreloc);
		flags = PG_NORELOC;
	}

	/*
	 * Kernel pages must always be replaced with the same size
	 * pages, since we cannot properly handle demotion of kernel
	 * pages.
	 */
	if (PP_ISKAS(like_pp))
		pgrflags |= PGR_SAMESZC;

	/* LINTED */
	MTYPE_PGR_INIT(mtype, flags, like_pp, page_mnode, npgs);
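
	/*
	 * Each pass through the loop below tries to allocate one chunk of
	 * pg_cnt pages (the whole large page at first, or single PAGESIZE
	 * pages once szc has been dropped to 0) and appends it to pl until
	 * all npgs constituent pages of the original page are covered.
	 */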
	while (npgs) {
		pplist = NULL;
		for (;;) {
			pg_cnt = page_get_pagecnt(szc);
			bin = PP_2_BIN(like_pp);
			ASSERT(like_pp->p_szc == orig_like_pp->p_szc);
			ASSERT(pg_cnt <= npgs);

			/*
			 * If an lgroup was specified, try to get the
			 * page from that lgroup.
			 * NOTE: Must be careful with code below because
			 * lgroup may disappear and reappear since there
			 * is no locking for lgroup here.
			 */
			if (LGRP_EXISTS(lgrp_target)) {
				/*
				 * Keep local variable for lgroup separate
				 * from lgroup argument since this code should
				 * only be exercised when lgroup argument
				 * exists....
				 */
				lgrp = lgrp_target;

				/* Try the lgroup's freelists first */
				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
				    LGRP_SRCH_LOCAL);
				while ((pplist == NULL) &&
				    (mnode = lgrp_memnode_choose(&lgrp_cookie))
				    != -1) {
					pplist =
					    page_get_mnode_freelist(mnode, bin,
					    mtype, szc, flags);
				}

				/*
				 * Now try its cachelists if this is a
				 * small page. Don't need to do it for
				 * larger ones since page_freelist_coalesce()
				 * already failed.
				 */
				if (pplist != NULL || szc != 0)
					break;

				/* Now try its cachelists */
				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
				    LGRP_SRCH_LOCAL);

				while ((pplist == NULL) &&
				    (mnode = lgrp_memnode_choose(&lgrp_cookie))
				    != -1) {
					pplist =
					    page_get_mnode_cachelist(bin, flags,
					    mnode, mtype);
				}
				if (pplist != NULL) {
					page_hashout(pplist, NULL);
					PP_SETAGED(pplist);
					REPL_STAT_INCR(nhashout);
					break;
				}
				/* Done looking in this lgroup. Bail out. */
				break;
			}

			/*
			 * No lgroup was specified (or lgroup was removed by
			 * DR), so just try to get the page as close to
			 * like_pp's mnode as possible.
			 * First try the local freelist...
			 */
			mnode = PP_2_MEM_NODE(like_pp);
			pplist = page_get_mnode_freelist(mnode, bin,
			    mtype, szc, flags);
			if (pplist != NULL)
				break;

			REPL_STAT_INCR(nnofree);

			/*
			 * ...then the local cachelist. Don't need to do it for
			 * larger pages because page_freelist_coalesce()
			 * already failed there anyway.
			 */
			if (szc == 0) {
				pplist = page_get_mnode_cachelist(bin, flags,
				    mnode, mtype);
				if (pplist != NULL) {
					page_hashout(pplist, NULL);
					PP_SETAGED(pplist);
					REPL_STAT_INCR(nhashout);
					break;
				}
			}

			/* Now try remote freelists */
			page_mnode = mnode;
			lgrp =
			    lgrp_hand_to_lgrp(MEM_NODE_2_LGRPHAND(page_mnode));
			LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
			    LGRP_SRCH_HIER);
			while (pplist == NULL &&
			    (mnode = lgrp_memnode_choose(&lgrp_cookie))
			    != -1) {
				/*
				 * Skip local mnode.
				 */
				if ((mnode == page_mnode) ||
				    (mem_node_config[mnode].exists == 0))
					continue;

				pplist = page_get_mnode_freelist(mnode,
				    bin, mtype, szc, flags);
			}

			if (pplist != NULL)
				break;

			/* Now try remote cachelists */
			LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
			    LGRP_SRCH_HIER);
			while (pplist == NULL && szc == 0) {
				mnode = lgrp_memnode_choose(&lgrp_cookie);
				if (mnode == -1)
					break;
				/*
				 * Skip local mnode.
				 */
				if ((mnode == page_mnode) ||
				    (mem_node_config[mnode].exists == 0))
					continue;

				pplist = page_get_mnode_cachelist(bin,
				    flags, mnode, mtype);

				if (pplist != NULL) {
					page_hashout(pplist, NULL);
					PP_SETAGED(pplist);
					REPL_STAT_INCR(nhashout);
					break;
				}
			}

			/*
			 * Break out of while loop under the following cases:
			 * - If we successfully got a page.
			 * - If pgrflags specified only returning a specific
			 *   page size and we could not find that page size.
			 * - If we could not satisfy the request with PAGESIZE
			 *   or larger pages.
			 */
			if (pplist != NULL || szc == 0)
				break;

			if ((pgrflags & PGR_SAMESZC) || pgrppgcp) {
				/* try to find contig page */

				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
				    LGRP_SRCH_HIER);

				while ((pplist == NULL) &&
				    (mnode =
				    lgrp_memnode_choose(&lgrp_cookie))
				    != -1) {
					pplist = page_get_contig_pages(
					    mnode, bin, mtype, szc,
					    flags | PGI_PGCPHIPRI);
				}
				break;
			}

			/*
			 * The correct thing to do here is try the next
			 * page size down using szc--. Due to a bug
			 * with the processing of HAT_RELOAD_SHARE
			 * where the sfmmu_ttecnt arrays of all
			 * hats sharing an ISM segment don't get updated,
			 * using intermediate size pages for relocation
			 * can lead to continuous page faults.
			 */
			szc = 0;
		}

		if (pplist != NULL) {
			DTRACE_PROBE4(page__get,
			    lgrp_t *, lgrp,
			    int, mnode,
			    ulong_t, bin,
			    uint_t, flags);

			while (pplist != NULL && pg_cnt--) {
				ASSERT(pplist != NULL);
				pp = pplist;
				page_sub(&pplist, pp);
				PP_CLRFREE(pp);
				PP_CLRAGED(pp);
				page_list_concat(&pl, &pp);
				npgs--;
				like_pp = like_pp + 1;
				REPL_STAT_INCR(nnext_pp);
			}
			ASSERT(pg_cnt == 0);
		} else {
			break;
		}
	}

	if (npgs) {
		/*
		 * We were unable to allocate the necessary number
		 * of pages.
		 * We need to free up any pl.
		 */
		REPL_STAT_INCR(nnopage);
		page_free_replacement_page(pl);
		return (NULL);
	} else {
		return (pl);
	}
}

/*
 * demote a free large page to its constituent pages
 */
void
page_demote_free_pages(page_t *pp)
{

	int mnode;

	ASSERT(pp != NULL);
	ASSERT(PAGE_LOCKED(pp));
	ASSERT(PP_ISFREE(pp));
	ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);

	mnode = PP_2_MEM_NODE(pp);
	page_freelist_lock(mnode);
	if (pp->p_szc != 0) {
		(void) page_demote(mnode, PFN_BASE(pp->p_pagenum,
		    pp->p_szc), 0, pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
	}
	page_freelist_unlock(mnode);
	ASSERT(pp->p_szc == 0);
}

/*
 * Factor in colorequiv to check additional 'equivalent' bins.
 * colorequiv may be set in /etc/system
 */
void
page_set_colorequiv_arr(void)
{
	if (colorequiv > 1) {
		int i;
		uint_t sv_a = lowbit(colorequiv) - 1;

		if (sv_a > 15)
			sv_a = 15;

		for (i = 0; i < MMU_PAGE_SIZES; i++) {
			uint_t colors;
			uint_t a = sv_a;

			if ((colors = hw_page_array[i].hp_colors) <= 1) {
				continue;
			}
			while ((colors >> a) == 0)
				a--;
			if ((a << 4) > colorequivszc[i]) {
				colorequivszc[i] = (a << 4);
			}
		}
	}
}
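
/*
 * Worked example (hypothetical values, for illustration only): if
 * colorequiv is set to 4 in /etc/system, lowbit(4) == 3, so sv_a == 2.
 * For a page size with hw_page_array[i].hp_colors == 32, (32 >> 2) is
 * non-zero, so a stays 2 and colorequivszc[i] becomes (2 << 4) == 0x20
 * (assuming it was previously smaller), i.e. the top two bits of the
 * requested color are treated as don't-care when the page_get routines
 * walk the color bins.
 */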