1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 27 /* All Rights Reserved */ 28 29 /* 30 * Portions of this source code were derived from Berkeley 4.3 BSD 31 * under license from the Regents of the University of California. 32 */ 33 34 35 /* 36 * This file contains common functions to access and manage the page lists. 37 * Many of these routines originated from platform dependent modules 38 * (sun4/vm/vm_dep.c, i86pc/vm/vm_machdep.c) and modified to function in 39 * a platform independent manner. 40 * 41 * vm/vm_dep.h provides for platform specific support. 
42 */ 43 44 #include <sys/types.h> 45 #include <sys/debug.h> 46 #include <sys/cmn_err.h> 47 #include <sys/systm.h> 48 #include <sys/atomic.h> 49 #include <sys/sysmacros.h> 50 #include <vm/as.h> 51 #include <vm/page.h> 52 #include <vm/seg_kmem.h> 53 #include <vm/seg_vn.h> 54 #include <sys/vmsystm.h> 55 #include <sys/memnode.h> 56 #include <vm/vm_dep.h> 57 #include <sys/lgrp.h> 58 #include <sys/mem_config.h> 59 #include <sys/callb.h> 60 #include <sys/mem_cage.h> 61 #include <sys/sdt.h> 62 #include <sys/dumphdr.h> 63 64 extern uint_t vac_colors; 65 66 #define MAX_PRAGMA_ALIGN 128 67 68 /* vm_cpu_data0 for the boot cpu before kmem is initialized */ 69 70 #if L2CACHE_ALIGN_MAX <= MAX_PRAGMA_ALIGN 71 #pragma align L2CACHE_ALIGN_MAX(vm_cpu_data0) 72 #else 73 #pragma align MAX_PRAGMA_ALIGN(vm_cpu_data0) 74 #endif 75 char vm_cpu_data0[VM_CPU_DATA_PADSIZE]; 76 77 /* 78 * number of page colors equivalent to reqested color in page_get routines. 79 * If set, keeps large pages intact longer and keeps MPO allocation 80 * from the local mnode in favor of acquiring the 'correct' page color from 81 * a demoted large page or from a remote mnode. 82 */ 83 uint_t colorequiv; 84 85 /* 86 * color equivalency mask for each page size. 87 * Mask is computed based on cpu L2$ way sizes and colorequiv global. 88 * High 4 bits determine the number of high order bits of the color to ignore. 89 * Low 4 bits determines number of low order bits of color to ignore (it's only 90 * relevant for hashed index based page coloring). 91 */ 92 uchar_t colorequivszc[MMU_PAGE_SIZES]; 93 94 /* 95 * if set, specifies the percentage of large pages that are free from within 96 * a large page region before attempting to lock those pages for 97 * page_get_contig_pages processing. 98 * 99 * Should be turned on when kpr is available when page_trylock_contig_pages 100 * can be more selective. 101 */ 102 103 int ptcpthreshold; 104 105 /* 106 * Limit page get contig page search based on failure cnts in pgcpfailcnt[]. 
107 * Enabled by default via pgcplimitsearch. 108 * 109 * pgcpfailcnt[] is bounded by PGCPFAILMAX (>= 1/2 of installed 110 * memory). When reached, pgcpfailcnt[] is reset to 1/2 of this upper 111 * bound. This upper bound range guarantees: 112 * - all large page 'slots' will be searched over time 113 * - the minimum (1) large page candidates considered on each pgcp call 114 * - count doesn't wrap around to 0 115 */ 116 pgcnt_t pgcpfailcnt[MMU_PAGE_SIZES]; 117 int pgcplimitsearch = 1; 118 119 #define PGCPFAILMAX (1 << (highbit(physinstalled) - 1)) 120 #define SETPGCPFAILCNT(szc) \ 121 if (++pgcpfailcnt[szc] >= PGCPFAILMAX) \ 122 pgcpfailcnt[szc] = PGCPFAILMAX / 2; 123 124 #ifdef VM_STATS 125 struct vmm_vmstats_str vmm_vmstats; 126 127 #endif /* VM_STATS */ 128 129 #if defined(__sparc) 130 #define LPGCREATE 0 131 #else 132 /* enable page_get_contig_pages */ 133 #define LPGCREATE 1 134 #endif 135 136 int pg_contig_disable; 137 int pg_lpgcreate_nocage = LPGCREATE; 138 139 /* 140 * page_freelist_split pfn flag to signify no lo or hi pfn requirement. 141 */ 142 #define PFNNULL 0 143 144 /* Flags involved in promotion and demotion routines */ 145 #define PC_FREE 0x1 /* put page on freelist */ 146 #define PC_ALLOC 0x2 /* return page for allocation */ 147 148 /* 149 * Flag for page_demote to be used with PC_FREE to denote that we don't care 150 * what the color is as the color parameter to the function is ignored. 151 */ 152 #define PC_NO_COLOR (-1) 153 154 /* mtype value for page_promote to use when mtype does not matter */ 155 #define PC_MTYPE_ANY (-1) 156 157 /* 158 * page counters candidates info 159 * See page_ctrs_cands comment below for more details. 
160 * fields are as follows: 161 * pcc_pages_free: # pages which freelist coalesce can create 162 * pcc_color_free: pointer to page free counts per color 163 */ 164 typedef struct pcc_info { 165 pgcnt_t pcc_pages_free; 166 pgcnt_t *pcc_color_free; 167 uint_t pad[12]; 168 } pcc_info_t; 169 170 /* 171 * On big machines it can take a long time to check page_counters 172 * arrays. page_ctrs_cands is a summary array whose elements are a dynamically 173 * updated sum of all elements of the corresponding page_counters arrays. 174 * page_freelist_coalesce() searches page_counters only if an appropriate 175 * element of page_ctrs_cands array is greater than 0. 176 * 177 * page_ctrs_cands is indexed by mutex (i), region (r), mnode (m), mrange (g) 178 */ 179 pcc_info_t **page_ctrs_cands[NPC_MUTEX][MMU_PAGE_SIZES]; 180 181 /* 182 * Return in val the total number of free pages which can be created 183 * for the given mnode (m), mrange (g), and region size (r) 184 */ 185 #define PGCTRS_CANDS_GETVALUE(m, g, r, val) { \ 186 int i; \ 187 val = 0; \ 188 for (i = 0; i < NPC_MUTEX; i++) { \ 189 val += page_ctrs_cands[i][(r)][(m)][(g)].pcc_pages_free; \ 190 } \ 191 } 192 193 /* 194 * Return in val the total number of free pages which can be created 195 * for the given mnode (m), mrange (g), region size (r), and color (c) 196 */ 197 #define PGCTRS_CANDS_GETVALUECOLOR(m, g, r, c, val) { \ 198 int i; \ 199 val = 0; \ 200 ASSERT((c) < PAGE_GET_PAGECOLORS(r)); \ 201 for (i = 0; i < NPC_MUTEX; i++) { \ 202 val += \ 203 page_ctrs_cands[i][(r)][(m)][(g)].pcc_color_free[(c)]; \ 204 } \ 205 } 206 207 /* 208 * We can only allow a single thread to update a counter within the physical 209 * range of the largest supported page size. That is the finest granularity 210 * possible since the counter values are dependent on each other 211 * as you move accross region sizes. PP_CTR_LOCK_INDX is used to determine the 212 * ctr_mutex lock index for a particular physical range. 
213 */ 214 static kmutex_t *ctr_mutex[NPC_MUTEX]; 215 216 #define PP_CTR_LOCK_INDX(pp) \ 217 (((pp)->p_pagenum >> \ 218 (PAGE_BSZS_SHIFT(mmu_page_sizes - 1))) & (NPC_MUTEX - 1)) 219 220 #define INVALID_COLOR 0xffffffff 221 #define INVALID_MASK 0xffffffff 222 223 /* 224 * Local functions prototypes. 225 */ 226 227 void page_ctr_add(int, int, page_t *, int); 228 void page_ctr_add_internal(int, int, page_t *, int); 229 void page_ctr_sub(int, int, page_t *, int); 230 void page_ctr_sub_internal(int, int, page_t *, int); 231 void page_freelist_lock(int); 232 void page_freelist_unlock(int); 233 page_t *page_promote(int, pfn_t, uchar_t, int, int); 234 page_t *page_demote(int, pfn_t, pfn_t, uchar_t, uchar_t, int, int); 235 page_t *page_freelist_split(uchar_t, 236 uint_t, int, int, pfn_t, pfn_t, page_list_walker_t *); 237 page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int); 238 static int page_trylock_cons(page_t *pp, se_t se); 239 240 /* 241 * The page_counters array below is used to keep track of free contiguous 242 * physical memory. A hw_page_map_t will be allocated per mnode per szc. 243 * This contains an array of counters, the size of the array, a shift value 244 * used to convert a pagenum into a counter array index or vice versa, as 245 * well as a cache of the last successful index to be promoted to a larger 246 * page size. As an optimization, we keep track of the last successful index 247 * to be promoted per page color for the given size region, and this is 248 * allocated dynamically based upon the number of colors for a given 249 * region size. 250 * 251 * Conceptually, the page counters are represented as: 252 * 253 * page_counters[region_size][mnode] 254 * 255 * region_size: size code of a candidate larger page made up 256 * of contiguous free smaller pages. 257 * 258 * page_counters[region_size][mnode].hpm_counters[index]: 259 * represents how many (region_size - 1) pages either 260 * exist or can be created within the given index range. 
261 * 262 * Let's look at a sparc example: 263 * If we want to create a free 512k page, we look at region_size 2 264 * for the mnode we want. We calculate the index and look at a specific 265 * hpm_counters location. If we see 8 (FULL_REGION_CNT on sparc) at 266 * this location, it means that 8 64k pages either exist or can be created 267 * from 8K pages in order to make a single free 512k page at the given 268 * index. Note that when a region is full, it will contribute to the 269 * counts in the region above it. Thus we will not know what page 270 * size the free pages will be which can be promoted to this new free 271 * page unless we look at all regions below the current region. 272 */ 273 274 /* 275 * Note: hpmctr_t is defined in platform vm_dep.h 276 * hw_page_map_t contains all the information needed for the page_counters 277 * logic. The fields are as follows: 278 * 279 * hpm_counters: dynamically allocated array to hold counter data 280 * hpm_entries: entries in hpm_counters 281 * hpm_shift: shift for pnum/array index conv 282 * hpm_base: PFN mapped to counter index 0 283 * hpm_color_current: last index in counter array for this color at 284 * which we successfully created a large page 285 */ 286 typedef struct hw_page_map { 287 hpmctr_t *hpm_counters; 288 size_t hpm_entries; 289 int hpm_shift; 290 pfn_t hpm_base; 291 size_t *hpm_color_current[MAX_MNODE_MRANGES]; 292 #if defined(__sparc) 293 uint_t pad[4]; 294 #endif 295 } hw_page_map_t; 296 297 /* 298 * Element zero is not used, but is allocated for convenience. 299 */ 300 static hw_page_map_t *page_counters[MMU_PAGE_SIZES]; 301 302 /* 303 * Cached value of MNODE_RANGE_CNT(mnode). 304 * This is a function call in x86. 305 */ 306 static int mnode_nranges[MAX_MEM_NODES]; 307 static int mnode_maxmrange[MAX_MEM_NODES]; 308 309 /* 310 * The following macros are convenient ways to get access to the individual 311 * elements of the page_counters arrays. 
They can be used on both 312 * the left side and right side of equations. 313 */ 314 #define PAGE_COUNTERS(mnode, rg_szc, idx) \ 315 (page_counters[(rg_szc)][(mnode)].hpm_counters[(idx)]) 316 317 #define PAGE_COUNTERS_COUNTERS(mnode, rg_szc) \ 318 (page_counters[(rg_szc)][(mnode)].hpm_counters) 319 320 #define PAGE_COUNTERS_SHIFT(mnode, rg_szc) \ 321 (page_counters[(rg_szc)][(mnode)].hpm_shift) 322 323 #define PAGE_COUNTERS_ENTRIES(mnode, rg_szc) \ 324 (page_counters[(rg_szc)][(mnode)].hpm_entries) 325 326 #define PAGE_COUNTERS_BASE(mnode, rg_szc) \ 327 (page_counters[(rg_szc)][(mnode)].hpm_base) 328 329 #define PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, rg_szc, g) \ 330 (page_counters[(rg_szc)][(mnode)].hpm_color_current[(g)]) 331 332 #define PAGE_COUNTERS_CURRENT_COLOR(mnode, rg_szc, color, mrange) \ 333 (page_counters[(rg_szc)][(mnode)]. \ 334 hpm_color_current[(mrange)][(color)]) 335 336 #define PNUM_TO_IDX(mnode, rg_szc, pnum) \ 337 (((pnum) - PAGE_COUNTERS_BASE((mnode), (rg_szc))) >> \ 338 PAGE_COUNTERS_SHIFT((mnode), (rg_szc))) 339 340 #define IDX_TO_PNUM(mnode, rg_szc, index) \ 341 (PAGE_COUNTERS_BASE((mnode), (rg_szc)) + \ 342 ((index) << PAGE_COUNTERS_SHIFT((mnode), (rg_szc)))) 343 344 /* 345 * Protects the hpm_counters and hpm_color_current memory from changing while 346 * looking at page counters information. 347 * Grab the write lock to modify what these fields point at. 348 * Grab the read lock to prevent any pointers from changing. 349 * The write lock can not be held during memory allocation due to a possible 350 * recursion deadlock with trying to grab the read lock while the 351 * write lock is already held. 352 */ 353 krwlock_t page_ctrs_rwlock[MAX_MEM_NODES]; 354 355 356 /* 357 * initialize cpu_vm_data to point at cache aligned vm_cpu_data_t. 
358 */ 359 void 360 cpu_vm_data_init(struct cpu *cp) 361 { 362 if (cp == CPU0) { 363 cp->cpu_vm_data = (void *)&vm_cpu_data0; 364 } else { 365 void *kmptr; 366 int align; 367 size_t sz; 368 369 align = (L2CACHE_ALIGN) ? L2CACHE_ALIGN : L2CACHE_ALIGN_MAX; 370 sz = P2ROUNDUP(sizeof (vm_cpu_data_t), align) + align; 371 kmptr = kmem_zalloc(sz, KM_SLEEP); 372 cp->cpu_vm_data = (void *) P2ROUNDUP((uintptr_t)kmptr, align); 373 ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr = kmptr; 374 ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize = sz; 375 } 376 } 377 378 /* 379 * free cpu_vm_data 380 */ 381 void 382 cpu_vm_data_destroy(struct cpu *cp) 383 { 384 if (cp->cpu_seqid && cp->cpu_vm_data) { 385 ASSERT(cp != CPU0); 386 kmem_free(((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr, 387 ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize); 388 } 389 cp->cpu_vm_data = NULL; 390 } 391 392 393 /* 394 * page size to page size code 395 */ 396 int 397 page_szc(size_t pagesize) 398 { 399 int i = 0; 400 401 while (hw_page_array[i].hp_size) { 402 if (pagesize == hw_page_array[i].hp_size) 403 return (i); 404 i++; 405 } 406 return (-1); 407 } 408 409 /* 410 * page size to page size code with the restriction that it be a supported 411 * user page size. If it's not a supported user page size, -1 will be returned. 412 */ 413 int 414 page_szc_user_filtered(size_t pagesize) 415 { 416 int szc = page_szc(pagesize); 417 if ((szc != -1) && (SZC_2_USERSZC(szc) != -1)) { 418 return (szc); 419 } 420 return (-1); 421 } 422 423 /* 424 * Return how many page sizes are available for the user to use. This is 425 * what the hardware supports and not based upon how the OS implements the 426 * support of different page sizes. 427 * 428 * If legacy is non-zero, return the number of pagesizes available to legacy 429 * applications. The number of legacy page sizes might be less than the 430 * exported user page sizes. 
This is to prevent legacy applications that 431 * use the largest page size returned from getpagesizes(3c) from inadvertantly 432 * using the 'new' large pagesizes. 433 */ 434 uint_t 435 page_num_user_pagesizes(int legacy) 436 { 437 if (legacy) 438 return (mmu_legacy_page_sizes); 439 return (mmu_exported_page_sizes); 440 } 441 442 uint_t 443 page_num_pagesizes(void) 444 { 445 return (mmu_page_sizes); 446 } 447 448 /* 449 * returns the count of the number of base pagesize pages associated with szc 450 */ 451 pgcnt_t 452 page_get_pagecnt(uint_t szc) 453 { 454 if (szc >= mmu_page_sizes) 455 panic("page_get_pagecnt: out of range %d", szc); 456 return (hw_page_array[szc].hp_pgcnt); 457 } 458 459 size_t 460 page_get_pagesize(uint_t szc) 461 { 462 if (szc >= mmu_page_sizes) 463 panic("page_get_pagesize: out of range %d", szc); 464 return (hw_page_array[szc].hp_size); 465 } 466 467 /* 468 * Return the size of a page based upon the index passed in. An index of 469 * zero refers to the smallest page size in the system, and as index increases 470 * it refers to the next larger supported page size in the system. 471 * Note that szc and userszc may not be the same due to unsupported szc's on 472 * some systems. 
473 */ 474 size_t 475 page_get_user_pagesize(uint_t userszc) 476 { 477 uint_t szc = USERSZC_2_SZC(userszc); 478 479 if (szc >= mmu_page_sizes) 480 panic("page_get_user_pagesize: out of range %d", szc); 481 return (hw_page_array[szc].hp_size); 482 } 483 484 uint_t 485 page_get_shift(uint_t szc) 486 { 487 if (szc >= mmu_page_sizes) 488 panic("page_get_shift: out of range %d", szc); 489 return (PAGE_GET_SHIFT(szc)); 490 } 491 492 uint_t 493 page_get_pagecolors(uint_t szc) 494 { 495 if (szc >= mmu_page_sizes) 496 panic("page_get_pagecolors: out of range %d", szc); 497 return (PAGE_GET_PAGECOLORS(szc)); 498 } 499 500 /* 501 * this assigns the desired equivalent color after a split 502 */ 503 uint_t 504 page_correct_color(uchar_t szc, uchar_t nszc, uint_t color, 505 uint_t ncolor, uint_t ceq_mask) 506 { 507 ASSERT(nszc > szc); 508 ASSERT(szc < mmu_page_sizes); 509 ASSERT(color < PAGE_GET_PAGECOLORS(szc)); 510 ASSERT(ncolor < PAGE_GET_PAGECOLORS(nszc)); 511 512 color &= ceq_mask; 513 ncolor = PAGE_CONVERT_COLOR(ncolor, szc, nszc); 514 return (color | (ncolor & ~ceq_mask)); 515 } 516 517 /* 518 * The interleaved_mnodes flag is set when mnodes overlap in 519 * the physbase..physmax range, but have disjoint slices. 520 * In this case hpm_counters is shared by all mnodes. 521 * This flag is set dynamically by the platform. 522 */ 523 int interleaved_mnodes = 0; 524 525 /* 526 * Called by startup(). 527 * Size up the per page size free list counters based on physmax 528 * of each node and max_mem_nodes. 529 * 530 * If interleaved_mnodes is set we need to find the first mnode that 531 * exists. hpm_counters for the first mnode will then be shared by 532 * all other mnodes. If interleaved_mnodes is not set, just set 533 * first=mnode each time. That means there will be no sharing. 
534 */ 535 size_t 536 page_ctrs_sz(void) 537 { 538 int r; /* region size */ 539 int mnode; 540 int firstmn; /* first mnode that exists */ 541 int nranges; 542 pfn_t physbase; 543 pfn_t physmax; 544 uint_t ctrs_sz = 0; 545 int i; 546 pgcnt_t colors_per_szc[MMU_PAGE_SIZES]; 547 548 /* 549 * We need to determine how many page colors there are for each 550 * page size in order to allocate memory for any color specific 551 * arrays. 552 */ 553 for (i = 0; i < mmu_page_sizes; i++) { 554 colors_per_szc[i] = PAGE_GET_PAGECOLORS(i); 555 } 556 557 for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) { 558 559 pgcnt_t r_pgcnt; 560 pfn_t r_base; 561 pgcnt_t r_align; 562 563 if (mem_node_config[mnode].exists == 0) 564 continue; 565 566 HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn); 567 nranges = MNODE_RANGE_CNT(mnode); 568 mnode_nranges[mnode] = nranges; 569 mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode); 570 571 /* 572 * determine size needed for page counter arrays with 573 * base aligned to large page size. 574 */ 575 for (r = 1; r < mmu_page_sizes; r++) { 576 /* add in space for hpm_color_current */ 577 ctrs_sz += sizeof (size_t) * 578 colors_per_szc[r] * nranges; 579 580 if (firstmn != mnode) 581 continue; 582 583 /* add in space for hpm_counters */ 584 r_align = page_get_pagecnt(r); 585 r_base = physbase; 586 r_base &= ~(r_align - 1); 587 r_pgcnt = howmany(physmax - r_base + 1, r_align); 588 589 /* 590 * Round up to always allocate on pointer sized 591 * boundaries. 
592 */ 593 ctrs_sz += P2ROUNDUP((r_pgcnt * sizeof (hpmctr_t)), 594 sizeof (hpmctr_t *)); 595 } 596 } 597 598 for (r = 1; r < mmu_page_sizes; r++) { 599 ctrs_sz += (max_mem_nodes * sizeof (hw_page_map_t)); 600 } 601 602 /* add in space for page_ctrs_cands and pcc_color_free */ 603 ctrs_sz += sizeof (pcc_info_t *) * max_mem_nodes * 604 mmu_page_sizes * NPC_MUTEX; 605 606 for (mnode = 0; mnode < max_mem_nodes; mnode++) { 607 608 if (mem_node_config[mnode].exists == 0) 609 continue; 610 611 nranges = mnode_nranges[mnode]; 612 ctrs_sz += sizeof (pcc_info_t) * nranges * 613 mmu_page_sizes * NPC_MUTEX; 614 for (r = 1; r < mmu_page_sizes; r++) { 615 ctrs_sz += sizeof (pgcnt_t) * nranges * 616 colors_per_szc[r] * NPC_MUTEX; 617 } 618 } 619 620 /* ctr_mutex */ 621 ctrs_sz += (max_mem_nodes * NPC_MUTEX * sizeof (kmutex_t)); 622 623 /* size for page list counts */ 624 PLCNT_SZ(ctrs_sz); 625 626 /* 627 * add some slop for roundups. page_ctrs_alloc will roundup the start 628 * address of the counters to ecache_alignsize boundary for every 629 * memory node. 630 */ 631 return (ctrs_sz + max_mem_nodes * L2CACHE_ALIGN); 632 } 633 634 caddr_t 635 page_ctrs_alloc(caddr_t alloc_base) 636 { 637 int mnode; 638 int mrange, nranges; 639 int r; /* region size */ 640 int i; 641 int firstmn; /* first mnode that exists */ 642 pfn_t physbase; 643 pfn_t physmax; 644 pgcnt_t colors_per_szc[MMU_PAGE_SIZES]; 645 646 /* 647 * We need to determine how many page colors there are for each 648 * page size in order to allocate memory for any color specific 649 * arrays. 
650 */ 651 for (i = 0; i < mmu_page_sizes; i++) { 652 colors_per_szc[i] = PAGE_GET_PAGECOLORS(i); 653 } 654 655 for (r = 1; r < mmu_page_sizes; r++) { 656 page_counters[r] = (hw_page_map_t *)alloc_base; 657 alloc_base += (max_mem_nodes * sizeof (hw_page_map_t)); 658 } 659 660 /* page_ctrs_cands and pcc_color_free array */ 661 for (i = 0; i < NPC_MUTEX; i++) { 662 for (r = 1; r < mmu_page_sizes; r++) { 663 664 page_ctrs_cands[i][r] = (pcc_info_t **)alloc_base; 665 alloc_base += sizeof (pcc_info_t *) * max_mem_nodes; 666 667 for (mnode = 0; mnode < max_mem_nodes; mnode++) { 668 pcc_info_t *pi; 669 670 if (mem_node_config[mnode].exists == 0) 671 continue; 672 673 nranges = mnode_nranges[mnode]; 674 675 pi = (pcc_info_t *)alloc_base; 676 alloc_base += sizeof (pcc_info_t) * nranges; 677 page_ctrs_cands[i][r][mnode] = pi; 678 679 for (mrange = 0; mrange < nranges; mrange++) { 680 pi->pcc_color_free = 681 (pgcnt_t *)alloc_base; 682 alloc_base += sizeof (pgcnt_t) * 683 colors_per_szc[r]; 684 pi++; 685 } 686 } 687 } 688 } 689 690 /* ctr_mutex */ 691 for (i = 0; i < NPC_MUTEX; i++) { 692 ctr_mutex[i] = (kmutex_t *)alloc_base; 693 alloc_base += (max_mem_nodes * sizeof (kmutex_t)); 694 } 695 696 /* initialize page list counts */ 697 PLCNT_INIT(alloc_base); 698 699 for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) { 700 701 pgcnt_t r_pgcnt; 702 pfn_t r_base; 703 pgcnt_t r_align; 704 int r_shift; 705 int nranges = mnode_nranges[mnode]; 706 707 if (mem_node_config[mnode].exists == 0) 708 continue; 709 710 HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn); 711 712 for (r = 1; r < mmu_page_sizes; r++) { 713 /* 714 * the page_counters base has to be aligned to the 715 * page count of page size code r otherwise the counts 716 * will cross large page boundaries. 
717 */ 718 r_align = page_get_pagecnt(r); 719 r_base = physbase; 720 /* base needs to be aligned - lower to aligned value */ 721 r_base &= ~(r_align - 1); 722 r_pgcnt = howmany(physmax - r_base + 1, r_align); 723 r_shift = PAGE_BSZS_SHIFT(r); 724 725 PAGE_COUNTERS_SHIFT(mnode, r) = r_shift; 726 PAGE_COUNTERS_ENTRIES(mnode, r) = r_pgcnt; 727 PAGE_COUNTERS_BASE(mnode, r) = r_base; 728 for (mrange = 0; mrange < nranges; mrange++) { 729 PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, 730 r, mrange) = (size_t *)alloc_base; 731 alloc_base += sizeof (size_t) * 732 colors_per_szc[r]; 733 } 734 for (i = 0; i < colors_per_szc[r]; i++) { 735 uint_t color_mask = colors_per_szc[r] - 1; 736 pfn_t pfnum = r_base; 737 size_t idx; 738 int mrange; 739 MEM_NODE_ITERATOR_DECL(it); 740 741 MEM_NODE_ITERATOR_INIT(pfnum, mnode, r, &it); 742 if (pfnum == (pfn_t)-1) { 743 idx = 0; 744 } else { 745 PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i, 746 color_mask, color_mask, &it); 747 idx = PNUM_TO_IDX(mnode, r, pfnum); 748 idx = (idx >= r_pgcnt) ? 0 : idx; 749 } 750 for (mrange = 0; mrange < nranges; mrange++) { 751 PAGE_COUNTERS_CURRENT_COLOR(mnode, 752 r, i, mrange) = idx; 753 } 754 } 755 756 /* hpm_counters may be shared by all mnodes */ 757 if (firstmn == mnode) { 758 PAGE_COUNTERS_COUNTERS(mnode, r) = 759 (hpmctr_t *)alloc_base; 760 alloc_base += 761 P2ROUNDUP((sizeof (hpmctr_t) * r_pgcnt), 762 sizeof (hpmctr_t *)); 763 } else { 764 PAGE_COUNTERS_COUNTERS(mnode, r) = 765 PAGE_COUNTERS_COUNTERS(firstmn, r); 766 } 767 768 /* 769 * Verify that PNUM_TO_IDX and IDX_TO_PNUM 770 * satisfy the identity requirement. 771 * We should be able to go from one to the other 772 * and get consistent values. 773 */ 774 ASSERT(PNUM_TO_IDX(mnode, r, 775 (IDX_TO_PNUM(mnode, r, 0))) == 0); 776 ASSERT(IDX_TO_PNUM(mnode, r, 777 (PNUM_TO_IDX(mnode, r, r_base))) == r_base); 778 } 779 /* 780 * Roundup the start address of the page_counters to 781 * cache aligned boundary for every memory node. 
782 * page_ctrs_sz() has added some slop for these roundups. 783 */ 784 alloc_base = (caddr_t)P2ROUNDUP((uintptr_t)alloc_base, 785 L2CACHE_ALIGN); 786 } 787 788 /* Initialize other page counter specific data structures. */ 789 for (mnode = 0; mnode < MAX_MEM_NODES; mnode++) { 790 rw_init(&page_ctrs_rwlock[mnode], NULL, RW_DEFAULT, NULL); 791 } 792 793 return (alloc_base); 794 } 795 796 /* 797 * Functions to adjust region counters for each size free list. 798 * Caller is responsible to acquire the ctr_mutex lock if necessary and 799 * thus can be called during startup without locks. 800 */ 801 /* ARGSUSED */ 802 void 803 page_ctr_add_internal(int mnode, int mtype, page_t *pp, int flags) 804 { 805 ssize_t r; /* region size */ 806 ssize_t idx; 807 pfn_t pfnum; 808 int lckidx; 809 810 ASSERT(mnode == PP_2_MEM_NODE(pp)); 811 ASSERT(mtype == PP_2_MTYPE(pp)); 812 813 ASSERT(pp->p_szc < mmu_page_sizes); 814 815 PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags); 816 817 /* no counter update needed for largest page size */ 818 if (pp->p_szc >= mmu_page_sizes - 1) { 819 return; 820 } 821 822 r = pp->p_szc + 1; 823 pfnum = pp->p_pagenum; 824 lckidx = PP_CTR_LOCK_INDX(pp); 825 826 /* 827 * Increment the count of free pages for the current 828 * region. Continue looping up in region size incrementing 829 * count if the preceeding region is full. 
830 */ 831 while (r < mmu_page_sizes) { 832 idx = PNUM_TO_IDX(mnode, r, pfnum); 833 834 ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r)); 835 ASSERT(PAGE_COUNTERS(mnode, r, idx) < FULL_REGION_CNT(r)); 836 837 if (++PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r)) { 838 break; 839 } else { 840 int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r)); 841 pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode] 842 [MTYPE_2_MRANGE(mnode, root_mtype)]; 843 844 cand->pcc_pages_free++; 845 cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]++; 846 } 847 r++; 848 } 849 } 850 851 void 852 page_ctr_add(int mnode, int mtype, page_t *pp, int flags) 853 { 854 int lckidx = PP_CTR_LOCK_INDX(pp); 855 kmutex_t *lock = &ctr_mutex[lckidx][mnode]; 856 857 mutex_enter(lock); 858 page_ctr_add_internal(mnode, mtype, pp, flags); 859 mutex_exit(lock); 860 } 861 862 void 863 page_ctr_sub_internal(int mnode, int mtype, page_t *pp, int flags) 864 { 865 int lckidx; 866 ssize_t r; /* region size */ 867 ssize_t idx; 868 pfn_t pfnum; 869 870 ASSERT(mnode == PP_2_MEM_NODE(pp)); 871 ASSERT(mtype == PP_2_MTYPE(pp)); 872 873 ASSERT(pp->p_szc < mmu_page_sizes); 874 875 PLCNT_DECR(pp, mnode, mtype, pp->p_szc, flags); 876 877 /* no counter update needed for largest page size */ 878 if (pp->p_szc >= mmu_page_sizes - 1) { 879 return; 880 } 881 882 r = pp->p_szc + 1; 883 pfnum = pp->p_pagenum; 884 lckidx = PP_CTR_LOCK_INDX(pp); 885 886 /* 887 * Decrement the count of free pages for the current 888 * region. Continue looping up in region size decrementing 889 * count if the preceeding region was full. 
890 */ 891 while (r < mmu_page_sizes) { 892 idx = PNUM_TO_IDX(mnode, r, pfnum); 893 894 ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r)); 895 ASSERT(PAGE_COUNTERS(mnode, r, idx) > 0); 896 897 if (--PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r) - 1) { 898 break; 899 } else { 900 int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r)); 901 pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode] 902 [MTYPE_2_MRANGE(mnode, root_mtype)]; 903 904 ASSERT(cand->pcc_pages_free != 0); 905 ASSERT(cand->pcc_color_free[PP_2_BIN_SZC(pp, r)] != 0); 906 907 cand->pcc_pages_free--; 908 cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]--; 909 } 910 r++; 911 } 912 } 913 914 void 915 page_ctr_sub(int mnode, int mtype, page_t *pp, int flags) 916 { 917 int lckidx = PP_CTR_LOCK_INDX(pp); 918 kmutex_t *lock = &ctr_mutex[lckidx][mnode]; 919 920 mutex_enter(lock); 921 page_ctr_sub_internal(mnode, mtype, pp, flags); 922 mutex_exit(lock); 923 } 924 925 /* 926 * Adjust page counters following a memory attach, since typically the 927 * size of the array needs to change, and the PFN to counter index 928 * mapping needs to change. 929 * 930 * It is possible this mnode did not exist at startup. In that case 931 * allocate pcc_info_t and pcc_color_free arrays. Also, allow for nranges 932 * to change (a theoretical possibility on x86), which means pcc_color_free 933 * arrays must be extended. 
934 */ 935 uint_t 936 page_ctrs_adjust(int mnode) 937 { 938 pgcnt_t npgs; 939 int r; /* region size */ 940 int i; 941 size_t pcsz, old_csz; 942 hpmctr_t *new_ctr, *old_ctr; 943 pfn_t oldbase, newbase; 944 pfn_t physbase, physmax; 945 size_t old_npgs; 946 hpmctr_t *ctr_cache[MMU_PAGE_SIZES]; 947 size_t size_cache[MMU_PAGE_SIZES]; 948 size_t *color_cache[MMU_PAGE_SIZES][MAX_MNODE_MRANGES]; 949 size_t *old_color_array[MAX_MNODE_MRANGES]; 950 pgcnt_t colors_per_szc[MMU_PAGE_SIZES]; 951 pcc_info_t **cands_cache; 952 pcc_info_t *old_pi, *pi; 953 pgcnt_t *pgcntp; 954 int nr, old_nranges, mrange, nranges = MNODE_RANGE_CNT(mnode); 955 int cands_cache_nranges; 956 int old_maxmrange, new_maxmrange; 957 int rc = 0; 958 int oldmnode; 959 960 cands_cache = kmem_zalloc(sizeof (pcc_info_t *) * NPC_MUTEX * 961 MMU_PAGE_SIZES, KM_NOSLEEP); 962 if (cands_cache == NULL) 963 return (ENOMEM); 964 965 i = -1; 966 HPM_COUNTERS_LIMITS(mnode, physbase, physmax, i); 967 968 newbase = physbase & ~PC_BASE_ALIGN_MASK; 969 npgs = roundup(physmax, PC_BASE_ALIGN) - newbase; 970 971 /* prepare to free non-null pointers on the way out */ 972 cands_cache_nranges = nranges; 973 bzero(ctr_cache, sizeof (ctr_cache)); 974 bzero(color_cache, sizeof (color_cache)); 975 976 /* 977 * We need to determine how many page colors there are for each 978 * page size in order to allocate memory for any color specific 979 * arrays. 980 */ 981 for (r = 0; r < mmu_page_sizes; r++) { 982 colors_per_szc[r] = PAGE_GET_PAGECOLORS(r); 983 } 984 985 /* 986 * Preallocate all of the new hpm_counters arrays as we can't 987 * hold the page_ctrs_rwlock as a writer and allocate memory. 988 * If we can't allocate all of the arrays, undo our work so far 989 * and return failure. 
990 */ 991 for (r = 1; r < mmu_page_sizes; r++) { 992 pcsz = npgs >> PAGE_BSZS_SHIFT(r); 993 size_cache[r] = pcsz; 994 ctr_cache[r] = kmem_zalloc(pcsz * 995 sizeof (hpmctr_t), KM_NOSLEEP); 996 if (ctr_cache[r] == NULL) { 997 rc = ENOMEM; 998 goto cleanup; 999 } 1000 } 1001 1002 /* 1003 * Preallocate all of the new color current arrays as we can't 1004 * hold the page_ctrs_rwlock as a writer and allocate memory. 1005 * If we can't allocate all of the arrays, undo our work so far 1006 * and return failure. 1007 */ 1008 for (r = 1; r < mmu_page_sizes; r++) { 1009 for (mrange = 0; mrange < nranges; mrange++) { 1010 color_cache[r][mrange] = kmem_zalloc(sizeof (size_t) * 1011 colors_per_szc[r], KM_NOSLEEP); 1012 if (color_cache[r][mrange] == NULL) { 1013 rc = ENOMEM; 1014 goto cleanup; 1015 } 1016 } 1017 } 1018 1019 /* 1020 * Preallocate all of the new pcc_info_t arrays as we can't 1021 * hold the page_ctrs_rwlock as a writer and allocate memory. 1022 * If we can't allocate all of the arrays, undo our work so far 1023 * and return failure. 1024 */ 1025 for (r = 1; r < mmu_page_sizes; r++) { 1026 for (i = 0; i < NPC_MUTEX; i++) { 1027 pi = kmem_zalloc(nranges * sizeof (pcc_info_t), 1028 KM_NOSLEEP); 1029 if (pi == NULL) { 1030 rc = ENOMEM; 1031 goto cleanup; 1032 } 1033 cands_cache[i * MMU_PAGE_SIZES + r] = pi; 1034 1035 for (mrange = 0; mrange < nranges; mrange++, pi++) { 1036 pgcntp = kmem_zalloc(colors_per_szc[r] * 1037 sizeof (pgcnt_t), KM_NOSLEEP); 1038 if (pgcntp == NULL) { 1039 rc = ENOMEM; 1040 goto cleanup; 1041 } 1042 pi->pcc_color_free = pgcntp; 1043 } 1044 } 1045 } 1046 1047 /* 1048 * Grab the write lock to prevent others from walking these arrays 1049 * while we are modifying them. 1050 */ 1051 PAGE_CTRS_WRITE_LOCK(mnode); 1052 1053 /* 1054 * For interleaved mnodes, find the first mnode 1055 * with valid page counters since the current 1056 * mnode may have just been added and not have 1057 * valid page counters. 
1058 */ 1059 if (interleaved_mnodes) { 1060 for (i = 0; i < max_mem_nodes; i++) 1061 if (PAGE_COUNTERS_COUNTERS(i, 1) != NULL) 1062 break; 1063 ASSERT(i < max_mem_nodes); 1064 oldmnode = i; 1065 } else 1066 oldmnode = mnode; 1067 1068 old_nranges = mnode_nranges[mnode]; 1069 cands_cache_nranges = old_nranges; 1070 mnode_nranges[mnode] = nranges; 1071 old_maxmrange = mnode_maxmrange[mnode]; 1072 mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode); 1073 new_maxmrange = mnode_maxmrange[mnode]; 1074 1075 for (r = 1; r < mmu_page_sizes; r++) { 1076 PAGE_COUNTERS_SHIFT(mnode, r) = PAGE_BSZS_SHIFT(r); 1077 old_ctr = PAGE_COUNTERS_COUNTERS(oldmnode, r); 1078 old_csz = PAGE_COUNTERS_ENTRIES(oldmnode, r); 1079 oldbase = PAGE_COUNTERS_BASE(oldmnode, r); 1080 old_npgs = old_csz << PAGE_COUNTERS_SHIFT(oldmnode, r); 1081 for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) { 1082 old_color_array[mrange] = 1083 PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, 1084 r, mrange); 1085 } 1086 1087 pcsz = npgs >> PAGE_COUNTERS_SHIFT(mnode, r); 1088 new_ctr = ctr_cache[r]; 1089 ctr_cache[r] = NULL; 1090 if (old_ctr != NULL && 1091 (oldbase + old_npgs > newbase) && 1092 (newbase + npgs > oldbase)) { 1093 /* 1094 * Map the intersection of the old and new 1095 * counters into the new array. 
1096 */ 1097 size_t offset; 1098 if (newbase > oldbase) { 1099 offset = (newbase - oldbase) >> 1100 PAGE_COUNTERS_SHIFT(mnode, r); 1101 bcopy(old_ctr + offset, new_ctr, 1102 MIN(pcsz, (old_csz - offset)) * 1103 sizeof (hpmctr_t)); 1104 } else { 1105 offset = (oldbase - newbase) >> 1106 PAGE_COUNTERS_SHIFT(mnode, r); 1107 bcopy(old_ctr, new_ctr + offset, 1108 MIN(pcsz - offset, old_csz) * 1109 sizeof (hpmctr_t)); 1110 } 1111 } 1112 1113 PAGE_COUNTERS_COUNTERS(mnode, r) = new_ctr; 1114 PAGE_COUNTERS_ENTRIES(mnode, r) = pcsz; 1115 PAGE_COUNTERS_BASE(mnode, r) = newbase; 1116 1117 /* update shared hpm_counters in other mnodes */ 1118 if (interleaved_mnodes) { 1119 for (i = 0; i < max_mem_nodes; i++) { 1120 if ((i == mnode) || 1121 (mem_node_config[i].exists == 0)) 1122 continue; 1123 ASSERT( 1124 PAGE_COUNTERS_COUNTERS(i, r) == old_ctr || 1125 PAGE_COUNTERS_COUNTERS(i, r) == NULL); 1126 PAGE_COUNTERS_COUNTERS(i, r) = new_ctr; 1127 PAGE_COUNTERS_ENTRIES(i, r) = pcsz; 1128 PAGE_COUNTERS_BASE(i, r) = newbase; 1129 } 1130 } 1131 1132 for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) { 1133 PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r, mrange) = 1134 color_cache[r][mrange]; 1135 color_cache[r][mrange] = NULL; 1136 } 1137 /* 1138 * for now, just reset on these events as it's probably 1139 * not worthwhile to try and optimize this. 1140 */ 1141 for (i = 0; i < colors_per_szc[r]; i++) { 1142 uint_t color_mask = colors_per_szc[r] - 1; 1143 int mlo = interleaved_mnodes ? 0 : mnode; 1144 int mhi = interleaved_mnodes ? 
max_mem_nodes : 1145 (mnode + 1); 1146 int m; 1147 pfn_t pfnum; 1148 size_t idx; 1149 MEM_NODE_ITERATOR_DECL(it); 1150 1151 for (m = mlo; m < mhi; m++) { 1152 if (mem_node_config[m].exists == 0) 1153 continue; 1154 pfnum = newbase; 1155 MEM_NODE_ITERATOR_INIT(pfnum, m, r, &it); 1156 if (pfnum == (pfn_t)-1) { 1157 idx = 0; 1158 } else { 1159 PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i, 1160 color_mask, color_mask, &it); 1161 idx = PNUM_TO_IDX(m, r, pfnum); 1162 idx = (idx < pcsz) ? idx : 0; 1163 } 1164 for (mrange = 0; mrange < nranges; mrange++) { 1165 if (PAGE_COUNTERS_CURRENT_COLOR_ARRAY(m, 1166 r, mrange) != NULL) 1167 PAGE_COUNTERS_CURRENT_COLOR(m, 1168 r, i, mrange) = idx; 1169 } 1170 } 1171 } 1172 1173 /* cache info for freeing out of the critical path */ 1174 if ((caddr_t)old_ctr >= kernelheap && 1175 (caddr_t)old_ctr < ekernelheap) { 1176 ctr_cache[r] = old_ctr; 1177 size_cache[r] = old_csz; 1178 } 1179 for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) { 1180 size_t *tmp = old_color_array[mrange]; 1181 if ((caddr_t)tmp >= kernelheap && 1182 (caddr_t)tmp < ekernelheap) { 1183 color_cache[r][mrange] = tmp; 1184 } 1185 } 1186 /* 1187 * Verify that PNUM_TO_IDX and IDX_TO_PNUM 1188 * satisfy the identity requirement. 1189 * We should be able to go from one to the other 1190 * and get consistent values. 
1191 */ 1192 ASSERT(PNUM_TO_IDX(mnode, r, 1193 (IDX_TO_PNUM(mnode, r, 0))) == 0); 1194 ASSERT(IDX_TO_PNUM(mnode, r, 1195 (PNUM_TO_IDX(mnode, r, newbase))) == newbase); 1196 1197 /* pcc_info_t and pcc_color_free */ 1198 for (i = 0; i < NPC_MUTEX; i++) { 1199 pcc_info_t *epi; 1200 pcc_info_t *eold_pi; 1201 1202 pi = cands_cache[i * MMU_PAGE_SIZES + r]; 1203 old_pi = page_ctrs_cands[i][r][mnode]; 1204 page_ctrs_cands[i][r][mnode] = pi; 1205 cands_cache[i * MMU_PAGE_SIZES + r] = old_pi; 1206 1207 /* preserve old pcc_color_free values, if any */ 1208 if (old_pi == NULL) 1209 continue; 1210 1211 /* 1212 * when/if x86 does DR, must account for 1213 * possible change in range index when 1214 * preserving pcc_info 1215 */ 1216 epi = &pi[nranges]; 1217 eold_pi = &old_pi[old_nranges]; 1218 if (new_maxmrange > old_maxmrange) { 1219 pi += new_maxmrange - old_maxmrange; 1220 } else if (new_maxmrange < old_maxmrange) { 1221 old_pi += old_maxmrange - new_maxmrange; 1222 } 1223 for (; pi < epi && old_pi < eold_pi; pi++, old_pi++) { 1224 pcc_info_t tmp = *pi; 1225 *pi = *old_pi; 1226 *old_pi = tmp; 1227 } 1228 } 1229 } 1230 PAGE_CTRS_WRITE_UNLOCK(mnode); 1231 1232 /* 1233 * Now that we have dropped the write lock, it is safe to free all 1234 * of the memory we have cached above. 1235 * We come thru here to free memory when pre-alloc fails, and also to 1236 * free old pointers which were recorded while locked. 
1237 */ 1238 cleanup: 1239 for (r = 1; r < mmu_page_sizes; r++) { 1240 if (ctr_cache[r] != NULL) { 1241 kmem_free(ctr_cache[r], 1242 size_cache[r] * sizeof (hpmctr_t)); 1243 } 1244 for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) { 1245 if (color_cache[r][mrange] != NULL) { 1246 kmem_free(color_cache[r][mrange], 1247 colors_per_szc[r] * sizeof (size_t)); 1248 } 1249 } 1250 for (i = 0; i < NPC_MUTEX; i++) { 1251 pi = cands_cache[i * MMU_PAGE_SIZES + r]; 1252 if (pi == NULL) 1253 continue; 1254 nr = cands_cache_nranges; 1255 for (mrange = 0; mrange < nr; mrange++, pi++) { 1256 pgcntp = pi->pcc_color_free; 1257 if (pgcntp == NULL) 1258 continue; 1259 if ((caddr_t)pgcntp >= kernelheap && 1260 (caddr_t)pgcntp < ekernelheap) { 1261 kmem_free(pgcntp, 1262 colors_per_szc[r] * 1263 sizeof (pgcnt_t)); 1264 } 1265 } 1266 pi = cands_cache[i * MMU_PAGE_SIZES + r]; 1267 if ((caddr_t)pi >= kernelheap && 1268 (caddr_t)pi < ekernelheap) { 1269 kmem_free(pi, nr * sizeof (pcc_info_t)); 1270 } 1271 } 1272 } 1273 1274 kmem_free(cands_cache, 1275 sizeof (pcc_info_t *) * NPC_MUTEX * MMU_PAGE_SIZES); 1276 return (rc); 1277 } 1278 1279 /* 1280 * Cleanup the hpm_counters field in the page counters 1281 * array. 1282 */ 1283 void 1284 page_ctrs_cleanup(void) 1285 { 1286 int r; /* region size */ 1287 int i; /* mnode index */ 1288 1289 /* 1290 * Get the page counters write lock while we are 1291 * setting the page hpm_counters field to NULL 1292 * for non-existent mnodes. 
1293 */ 1294 for (i = 0; i < max_mem_nodes; i++) { 1295 PAGE_CTRS_WRITE_LOCK(i); 1296 if (mem_node_config[i].exists) { 1297 PAGE_CTRS_WRITE_UNLOCK(i); 1298 continue; 1299 } 1300 for (r = 1; r < mmu_page_sizes; r++) { 1301 PAGE_COUNTERS_COUNTERS(i, r) = NULL; 1302 } 1303 PAGE_CTRS_WRITE_UNLOCK(i); 1304 } 1305 } 1306 1307 #ifdef DEBUG 1308 1309 /* 1310 * confirm pp is a large page corresponding to szc 1311 */ 1312 void 1313 chk_lpg(page_t *pp, uchar_t szc) 1314 { 1315 spgcnt_t npgs = page_get_pagecnt(pp->p_szc); 1316 uint_t noreloc; 1317 1318 if (npgs == 1) { 1319 ASSERT(pp->p_szc == 0); 1320 ASSERT(pp->p_next == pp); 1321 ASSERT(pp->p_prev == pp); 1322 return; 1323 } 1324 1325 ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL); 1326 ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL); 1327 1328 ASSERT(IS_P2ALIGNED(pp->p_pagenum, npgs)); 1329 ASSERT(pp->p_pagenum == (pp->p_next->p_pagenum - 1)); 1330 ASSERT(pp->p_prev->p_pagenum == (pp->p_pagenum + (npgs - 1))); 1331 ASSERT(pp->p_prev == (pp + (npgs - 1))); 1332 1333 /* 1334 * Check list of pages. 
1335 */ 1336 noreloc = PP_ISNORELOC(pp); 1337 while (npgs--) { 1338 if (npgs != 0) { 1339 ASSERT(pp->p_pagenum == pp->p_next->p_pagenum - 1); 1340 ASSERT(pp->p_next == (pp + 1)); 1341 } 1342 ASSERT(pp->p_szc == szc); 1343 ASSERT(PP_ISFREE(pp)); 1344 ASSERT(PP_ISAGED(pp)); 1345 ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL); 1346 ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL); 1347 ASSERT(pp->p_vnode == NULL); 1348 ASSERT(PP_ISNORELOC(pp) == noreloc); 1349 1350 pp = pp->p_next; 1351 } 1352 } 1353 #endif /* DEBUG */ 1354 1355 void 1356 page_freelist_lock(int mnode) 1357 { 1358 int i; 1359 for (i = 0; i < NPC_MUTEX; i++) { 1360 mutex_enter(FPC_MUTEX(mnode, i)); 1361 mutex_enter(CPC_MUTEX(mnode, i)); 1362 } 1363 } 1364 1365 void 1366 page_freelist_unlock(int mnode) 1367 { 1368 int i; 1369 for (i = 0; i < NPC_MUTEX; i++) { 1370 mutex_exit(FPC_MUTEX(mnode, i)); 1371 mutex_exit(CPC_MUTEX(mnode, i)); 1372 } 1373 } 1374 1375 /* 1376 * add pp to the specified page list. Defaults to head of the page list 1377 * unless PG_LIST_TAIL is specified. 1378 */ 1379 void 1380 page_list_add(page_t *pp, int flags) 1381 { 1382 page_t **ppp; 1383 kmutex_t *pcm; 1384 uint_t bin, mtype; 1385 int mnode; 1386 1387 ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT)); 1388 ASSERT(PP_ISFREE(pp)); 1389 ASSERT(!hat_page_is_mapped(pp)); 1390 ASSERT(hat_page_getshare(pp) == 0); 1391 1392 /* 1393 * Large pages should be freed via page_list_add_pages(). 1394 */ 1395 ASSERT(pp->p_szc == 0); 1396 1397 /* 1398 * Don't need to lock the freelist first here 1399 * because the page isn't on the freelist yet. 1400 * This means p_szc can't change on us. 1401 */ 1402 1403 bin = PP_2_BIN(pp); 1404 mnode = PP_2_MEM_NODE(pp); 1405 mtype = PP_2_MTYPE(pp); 1406 1407 if (flags & PG_LIST_ISINIT) { 1408 /* 1409 * PG_LIST_ISINIT is set during system startup (ie. 
single 1410 * threaded), add a page to the free list and add to the 1411 * the free region counters w/o any locking 1412 */ 1413 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype); 1414 1415 /* inline version of page_add() */ 1416 if (*ppp != NULL) { 1417 pp->p_next = *ppp; 1418 pp->p_prev = (*ppp)->p_prev; 1419 (*ppp)->p_prev = pp; 1420 pp->p_prev->p_next = pp; 1421 } else 1422 *ppp = pp; 1423 1424 page_ctr_add_internal(mnode, mtype, pp, flags); 1425 VM_STAT_ADD(vmm_vmstats.pladd_free[0]); 1426 } else { 1427 pcm = PC_BIN_MUTEX(mnode, bin, flags); 1428 1429 if (flags & PG_FREE_LIST) { 1430 VM_STAT_ADD(vmm_vmstats.pladd_free[0]); 1431 ASSERT(PP_ISAGED(pp)); 1432 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype); 1433 1434 } else { 1435 VM_STAT_ADD(vmm_vmstats.pladd_cache); 1436 ASSERT(pp->p_vnode); 1437 ASSERT((pp->p_offset & PAGEOFFSET) == 0); 1438 ppp = &PAGE_CACHELISTS(mnode, bin, mtype); 1439 } 1440 mutex_enter(pcm); 1441 page_add(ppp, pp); 1442 1443 if (flags & PG_LIST_TAIL) 1444 *ppp = (*ppp)->p_next; 1445 /* 1446 * Add counters before releasing pcm mutex to avoid a race with 1447 * page_freelist_coalesce and page_freelist_split. 1448 */ 1449 page_ctr_add(mnode, mtype, pp, flags); 1450 mutex_exit(pcm); 1451 } 1452 1453 1454 #if defined(__sparc) 1455 if (PP_ISNORELOC(pp)) { 1456 kcage_freemem_add(1); 1457 } 1458 #endif 1459 /* 1460 * It is up to the caller to unlock the page! 1461 */ 1462 ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT)); 1463 } 1464 1465 1466 #ifdef __sparc 1467 /* 1468 * This routine is only used by kcage_init during system startup. 1469 * It performs the function of page_list_sub/PP_SETNORELOC/page_list_add 1470 * without the overhead of taking locks and updating counters. 1471 */ 1472 void 1473 page_list_noreloc_startup(page_t *pp) 1474 { 1475 page_t **ppp; 1476 uint_t bin; 1477 int mnode; 1478 int mtype; 1479 int flags = 0; 1480 1481 /* 1482 * If this is a large page on the freelist then 1483 * break it up into smaller pages. 
1484 */ 1485 if (pp->p_szc != 0) 1486 page_boot_demote(pp); 1487 1488 /* 1489 * Get list page is currently on. 1490 */ 1491 bin = PP_2_BIN(pp); 1492 mnode = PP_2_MEM_NODE(pp); 1493 mtype = PP_2_MTYPE(pp); 1494 ASSERT(mtype == MTYPE_RELOC); 1495 ASSERT(pp->p_szc == 0); 1496 1497 if (PP_ISAGED(pp)) { 1498 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype); 1499 flags |= PG_FREE_LIST; 1500 } else { 1501 ppp = &PAGE_CACHELISTS(mnode, bin, mtype); 1502 flags |= PG_CACHE_LIST; 1503 } 1504 1505 ASSERT(*ppp != NULL); 1506 1507 /* 1508 * Delete page from current list. 1509 */ 1510 if (*ppp == pp) 1511 *ppp = pp->p_next; /* go to next page */ 1512 if (*ppp == pp) { 1513 *ppp = NULL; /* page list is gone */ 1514 } else { 1515 pp->p_prev->p_next = pp->p_next; 1516 pp->p_next->p_prev = pp->p_prev; 1517 } 1518 1519 /* 1520 * Decrement page counters 1521 */ 1522 page_ctr_sub_internal(mnode, mtype, pp, flags); 1523 1524 /* 1525 * Set no reloc for cage initted pages. 1526 */ 1527 PP_SETNORELOC(pp); 1528 1529 mtype = PP_2_MTYPE(pp); 1530 ASSERT(mtype == MTYPE_NORELOC); 1531 1532 /* 1533 * Get new list for page. 1534 */ 1535 if (PP_ISAGED(pp)) { 1536 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype); 1537 } else { 1538 ppp = &PAGE_CACHELISTS(mnode, bin, mtype); 1539 } 1540 1541 /* 1542 * Insert page on new list. 
1543 */ 1544 if (*ppp == NULL) { 1545 *ppp = pp; 1546 pp->p_next = pp->p_prev = pp; 1547 } else { 1548 pp->p_next = *ppp; 1549 pp->p_prev = (*ppp)->p_prev; 1550 (*ppp)->p_prev = pp; 1551 pp->p_prev->p_next = pp; 1552 } 1553 1554 /* 1555 * Increment page counters 1556 */ 1557 page_ctr_add_internal(mnode, mtype, pp, flags); 1558 1559 /* 1560 * Update cage freemem counter 1561 */ 1562 atomic_add_long(&kcage_freemem, 1); 1563 } 1564 #else /* __sparc */ 1565 1566 /* ARGSUSED */ 1567 void 1568 page_list_noreloc_startup(page_t *pp) 1569 { 1570 panic("page_list_noreloc_startup: should be here only for sparc"); 1571 } 1572 #endif 1573 1574 void 1575 page_list_add_pages(page_t *pp, int flags) 1576 { 1577 kmutex_t *pcm; 1578 pgcnt_t pgcnt; 1579 uint_t bin, mtype, i; 1580 int mnode; 1581 1582 /* default to freelist/head */ 1583 ASSERT((flags & (PG_CACHE_LIST | PG_LIST_TAIL)) == 0); 1584 1585 CHK_LPG(pp, pp->p_szc); 1586 VM_STAT_ADD(vmm_vmstats.pladd_free[pp->p_szc]); 1587 1588 bin = PP_2_BIN(pp); 1589 mnode = PP_2_MEM_NODE(pp); 1590 mtype = PP_2_MTYPE(pp); 1591 1592 if (flags & PG_LIST_ISINIT) { 1593 ASSERT(pp->p_szc == mmu_page_sizes - 1); 1594 page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp); 1595 ASSERT(!PP_ISNORELOC(pp)); 1596 PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags); 1597 } else { 1598 1599 ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes); 1600 1601 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST); 1602 1603 mutex_enter(pcm); 1604 page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp); 1605 page_ctr_add(mnode, mtype, pp, PG_FREE_LIST); 1606 mutex_exit(pcm); 1607 1608 pgcnt = page_get_pagecnt(pp->p_szc); 1609 #if defined(__sparc) 1610 if (PP_ISNORELOC(pp)) 1611 kcage_freemem_add(pgcnt); 1612 #endif 1613 for (i = 0; i < pgcnt; i++, pp++) 1614 page_unlock_nocapture(pp); 1615 } 1616 } 1617 1618 /* 1619 * During boot, need to demote a large page to base 1620 * pagesize pages for seg_kmem for use in boot_alloc() 1621 */ 1622 void 1623 
page_boot_demote(page_t *pp) 1624 { 1625 ASSERT(pp->p_szc != 0); 1626 ASSERT(PP_ISFREE(pp)); 1627 ASSERT(PP_ISAGED(pp)); 1628 1629 (void) page_demote(PP_2_MEM_NODE(pp), 1630 PFN_BASE(pp->p_pagenum, pp->p_szc), 0, pp->p_szc, 0, PC_NO_COLOR, 1631 PC_FREE); 1632 1633 ASSERT(PP_ISFREE(pp)); 1634 ASSERT(PP_ISAGED(pp)); 1635 ASSERT(pp->p_szc == 0); 1636 } 1637 1638 /* 1639 * Take a particular page off of whatever freelist the page 1640 * is claimed to be on. 1641 * 1642 * NOTE: Only used for PAGESIZE pages. 1643 */ 1644 void 1645 page_list_sub(page_t *pp, int flags) 1646 { 1647 int bin; 1648 uint_t mtype; 1649 int mnode; 1650 kmutex_t *pcm; 1651 page_t **ppp; 1652 1653 ASSERT(PAGE_EXCL(pp)); 1654 ASSERT(PP_ISFREE(pp)); 1655 1656 /* 1657 * The p_szc field can only be changed by page_promote() 1658 * and page_demote(). Only free pages can be promoted and 1659 * demoted and the free list MUST be locked during these 1660 * operations. So to prevent a race in page_list_sub() 1661 * between computing which bin of the freelist lock to 1662 * grab and actually grabing the lock we check again that 1663 * the bin we locked is still the correct one. Notice that 1664 * the p_szc field could have actually changed on us but 1665 * if the bin happens to still be the same we are safe. 1666 */ 1667 try_again: 1668 bin = PP_2_BIN(pp); 1669 mnode = PP_2_MEM_NODE(pp); 1670 pcm = PC_BIN_MUTEX(mnode, bin, flags); 1671 mutex_enter(pcm); 1672 if (PP_2_BIN(pp) != bin) { 1673 mutex_exit(pcm); 1674 goto try_again; 1675 } 1676 mtype = PP_2_MTYPE(pp); 1677 1678 if (flags & PG_FREE_LIST) { 1679 VM_STAT_ADD(vmm_vmstats.plsub_free[0]); 1680 ASSERT(PP_ISAGED(pp)); 1681 ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype); 1682 } else { 1683 VM_STAT_ADD(vmm_vmstats.plsub_cache); 1684 ASSERT(!PP_ISAGED(pp)); 1685 ppp = &PAGE_CACHELISTS(mnode, bin, mtype); 1686 } 1687 1688 /* 1689 * Common PAGESIZE case. 1690 * 1691 * Note that we locked the freelist. 
This prevents 1692 * any page promotion/demotion operations. Therefore 1693 * the p_szc will not change until we drop pcm mutex. 1694 */ 1695 if (pp->p_szc == 0) { 1696 page_sub(ppp, pp); 1697 /* 1698 * Subtract counters before releasing pcm mutex 1699 * to avoid race with page_freelist_coalesce. 1700 */ 1701 page_ctr_sub(mnode, mtype, pp, flags); 1702 mutex_exit(pcm); 1703 1704 #if defined(__sparc) 1705 if (PP_ISNORELOC(pp)) { 1706 kcage_freemem_sub(1); 1707 } 1708 #endif 1709 return; 1710 } 1711 1712 /* 1713 * Large pages on the cache list are not supported. 1714 */ 1715 if (flags & PG_CACHE_LIST) 1716 panic("page_list_sub: large page on cachelist"); 1717 1718 /* 1719 * Slow but rare. 1720 * 1721 * Somebody wants this particular page which is part 1722 * of a large page. In this case we just demote the page 1723 * if it's on the freelist. 1724 * 1725 * We have to drop pcm before locking the entire freelist. 1726 * Once we have re-locked the freelist check to make sure 1727 * the page hasn't already been demoted or completely 1728 * freed. 1729 */ 1730 mutex_exit(pcm); 1731 page_freelist_lock(mnode); 1732 if (pp->p_szc != 0) { 1733 /* 1734 * Large page is on freelist. 1735 */ 1736 (void) page_demote(mnode, PFN_BASE(pp->p_pagenum, pp->p_szc), 1737 0, pp->p_szc, 0, PC_NO_COLOR, PC_FREE); 1738 } 1739 ASSERT(PP_ISFREE(pp)); 1740 ASSERT(PP_ISAGED(pp)); 1741 ASSERT(pp->p_szc == 0); 1742 1743 /* 1744 * Subtract counters before releasing pcm mutex 1745 * to avoid race with page_freelist_coalesce. 
1746 */ 1747 bin = PP_2_BIN(pp); 1748 mtype = PP_2_MTYPE(pp); 1749 ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype); 1750 1751 page_sub(ppp, pp); 1752 page_ctr_sub(mnode, mtype, pp, flags); 1753 page_freelist_unlock(mnode); 1754 1755 #if defined(__sparc) 1756 if (PP_ISNORELOC(pp)) { 1757 kcage_freemem_sub(1); 1758 } 1759 #endif 1760 } 1761 1762 void 1763 page_list_sub_pages(page_t *pp, uint_t szc) 1764 { 1765 kmutex_t *pcm; 1766 uint_t bin, mtype; 1767 int mnode; 1768 1769 ASSERT(PAGE_EXCL(pp)); 1770 ASSERT(PP_ISFREE(pp)); 1771 ASSERT(PP_ISAGED(pp)); 1772 1773 /* 1774 * See comment in page_list_sub(). 1775 */ 1776 try_again: 1777 bin = PP_2_BIN(pp); 1778 mnode = PP_2_MEM_NODE(pp); 1779 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST); 1780 mutex_enter(pcm); 1781 if (PP_2_BIN(pp) != bin) { 1782 mutex_exit(pcm); 1783 goto try_again; 1784 } 1785 1786 /* 1787 * If we're called with a page larger than szc or it got 1788 * promoted above szc before we locked the freelist then 1789 * drop pcm and re-lock entire freelist. If page still larger 1790 * than szc then demote it. 
1791 */ 1792 if (pp->p_szc > szc) { 1793 mutex_exit(pcm); 1794 pcm = NULL; 1795 page_freelist_lock(mnode); 1796 if (pp->p_szc > szc) { 1797 VM_STAT_ADD(vmm_vmstats.plsubpages_szcbig); 1798 (void) page_demote(mnode, 1799 PFN_BASE(pp->p_pagenum, pp->p_szc), 0, 1800 pp->p_szc, szc, PC_NO_COLOR, PC_FREE); 1801 } 1802 bin = PP_2_BIN(pp); 1803 } 1804 ASSERT(PP_ISFREE(pp)); 1805 ASSERT(PP_ISAGED(pp)); 1806 ASSERT(pp->p_szc <= szc); 1807 ASSERT(pp == PP_PAGEROOT(pp)); 1808 1809 VM_STAT_ADD(vmm_vmstats.plsub_free[pp->p_szc]); 1810 1811 mtype = PP_2_MTYPE(pp); 1812 if (pp->p_szc != 0) { 1813 page_vpsub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp); 1814 CHK_LPG(pp, pp->p_szc); 1815 } else { 1816 VM_STAT_ADD(vmm_vmstats.plsubpages_szc0); 1817 page_sub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp); 1818 } 1819 page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST); 1820 1821 if (pcm != NULL) { 1822 mutex_exit(pcm); 1823 } else { 1824 page_freelist_unlock(mnode); 1825 } 1826 1827 #if defined(__sparc) 1828 if (PP_ISNORELOC(pp)) { 1829 pgcnt_t pgcnt; 1830 1831 pgcnt = page_get_pagecnt(pp->p_szc); 1832 kcage_freemem_sub(pgcnt); 1833 } 1834 #endif 1835 } 1836 1837 /* 1838 * Add the page to the front of a linked list of pages 1839 * using the p_next & p_prev pointers for the list. 1840 * The caller is responsible for protecting the list pointers. 1841 */ 1842 void 1843 mach_page_add(page_t **ppp, page_t *pp) 1844 { 1845 if (*ppp == NULL) { 1846 pp->p_next = pp->p_prev = pp; 1847 } else { 1848 pp->p_next = *ppp; 1849 pp->p_prev = (*ppp)->p_prev; 1850 (*ppp)->p_prev = pp; 1851 pp->p_prev->p_next = pp; 1852 } 1853 *ppp = pp; 1854 } 1855 1856 /* 1857 * Remove this page from a linked list of pages 1858 * using the p_next & p_prev pointers for the list. 1859 * 1860 * The caller is responsible for protecting the list pointers. 
1861 */ 1862 void 1863 mach_page_sub(page_t **ppp, page_t *pp) 1864 { 1865 ASSERT(PP_ISFREE(pp)); 1866 1867 if (*ppp == NULL || pp == NULL) 1868 panic("mach_page_sub"); 1869 1870 if (*ppp == pp) 1871 *ppp = pp->p_next; /* go to next page */ 1872 1873 if (*ppp == pp) 1874 *ppp = NULL; /* page list is gone */ 1875 else { 1876 pp->p_prev->p_next = pp->p_next; 1877 pp->p_next->p_prev = pp->p_prev; 1878 } 1879 pp->p_prev = pp->p_next = pp; /* make pp a list of one */ 1880 } 1881 1882 /* 1883 * Routine fsflush uses to gradually coalesce the free list into larger pages. 1884 */ 1885 void 1886 page_promote_size(page_t *pp, uint_t cur_szc) 1887 { 1888 pfn_t pfn; 1889 int mnode; 1890 int idx; 1891 int new_szc = cur_szc + 1; 1892 int full = FULL_REGION_CNT(new_szc); 1893 1894 pfn = page_pptonum(pp); 1895 mnode = PFN_2_MEM_NODE(pfn); 1896 1897 page_freelist_lock(mnode); 1898 1899 idx = PNUM_TO_IDX(mnode, new_szc, pfn); 1900 if (PAGE_COUNTERS(mnode, new_szc, idx) == full) 1901 (void) page_promote(mnode, pfn, new_szc, PC_FREE, PC_MTYPE_ANY); 1902 1903 page_freelist_unlock(mnode); 1904 } 1905 1906 static uint_t page_promote_err; 1907 static uint_t page_promote_noreloc_err; 1908 1909 /* 1910 * Create a single larger page (of szc new_szc) from smaller contiguous pages 1911 * for the given mnode starting at pfnum. Pages involved are on the freelist 1912 * before the call and may be returned to the caller if requested, otherwise 1913 * they will be placed back on the freelist. 1914 * If flags is PC_ALLOC, then the large page will be returned to the user in 1915 * a state which is consistent with a page being taken off the freelist. If 1916 * we failed to lock the new large page, then we will return NULL to the 1917 * caller and put the large page on the freelist instead. 1918 * If flags is PC_FREE, then the large page will be placed on the freelist, 1919 * and NULL will be returned. 
1920 * The caller is responsible for locking the freelist as well as any other 1921 * accounting which needs to be done for a returned page. 1922 * 1923 * RFE: For performance pass in pp instead of pfnum so 1924 * we can avoid excessive calls to page_numtopp_nolock(). 1925 * This would depend on an assumption that all contiguous 1926 * pages are in the same memseg so we can just add/dec 1927 * our pp. 1928 * 1929 * Lock ordering: 1930 * 1931 * There is a potential but rare deadlock situation 1932 * for page promotion and demotion operations. The problem 1933 * is there are two paths into the freelist manager and 1934 * they have different lock orders: 1935 * 1936 * page_create() 1937 * lock freelist 1938 * page_lock(EXCL) 1939 * unlock freelist 1940 * return 1941 * caller drops page_lock 1942 * 1943 * page_free() and page_reclaim() 1944 * caller grabs page_lock(EXCL) 1945 * 1946 * lock freelist 1947 * unlock freelist 1948 * drop page_lock 1949 * 1950 * What prevents a thread in page_create() from deadlocking 1951 * with a thread freeing or reclaiming the same page is the 1952 * page_trylock() in page_get_freelist(). If the trylock fails 1953 * it skips the page. 1954 * 1955 * The lock ordering for promotion and demotion is the same as 1956 * for page_create(). Since the same deadlock could occur during 1957 * page promotion and freeing or reclaiming of a page on the 1958 * cache list we might have to fail the operation and undo what 1959 * have done so far. Again this is rare. 1960 */ 1961 page_t * 1962 page_promote(int mnode, pfn_t pfnum, uchar_t new_szc, int flags, int mtype) 1963 { 1964 page_t *pp, *pplist, *tpp, *start_pp; 1965 pgcnt_t new_npgs, npgs; 1966 uint_t bin; 1967 pgcnt_t tmpnpgs, pages_left; 1968 uint_t noreloc; 1969 int which_list; 1970 ulong_t index; 1971 kmutex_t *phm; 1972 1973 /* 1974 * General algorithm: 1975 * Find the starting page 1976 * Walk each page struct removing it from the freelist, 1977 * and linking it to all the other pages removed. 
1978 * Once all pages are off the freelist, 1979 * walk the list, modifying p_szc to new_szc and what 1980 * ever other info needs to be done to create a large free page. 1981 * According to the flags, either return the page or put it 1982 * on the freelist. 1983 */ 1984 1985 start_pp = page_numtopp_nolock(pfnum); 1986 ASSERT(start_pp && (start_pp->p_pagenum == pfnum)); 1987 new_npgs = page_get_pagecnt(new_szc); 1988 ASSERT(IS_P2ALIGNED(pfnum, new_npgs)); 1989 1990 /* don't return page of the wrong mtype */ 1991 if (mtype != PC_MTYPE_ANY && mtype != PP_2_MTYPE(start_pp)) 1992 return (NULL); 1993 1994 /* 1995 * Loop through smaller pages to confirm that all pages 1996 * give the same result for PP_ISNORELOC(). 1997 * We can check this reliably here as the protocol for setting 1998 * P_NORELOC requires pages to be taken off the free list first. 1999 */ 2000 noreloc = PP_ISNORELOC(start_pp); 2001 for (pp = start_pp + new_npgs; --pp > start_pp; ) { 2002 if (noreloc != PP_ISNORELOC(pp)) { 2003 page_promote_noreloc_err++; 2004 page_promote_err++; 2005 return (NULL); 2006 } 2007 } 2008 2009 pages_left = new_npgs; 2010 pplist = NULL; 2011 pp = start_pp; 2012 2013 /* Loop around coalescing the smaller pages into a big page. */ 2014 while (pages_left) { 2015 /* 2016 * Remove from the freelist. 2017 */ 2018 ASSERT(PP_ISFREE(pp)); 2019 bin = PP_2_BIN(pp); 2020 ASSERT(mnode == PP_2_MEM_NODE(pp)); 2021 mtype = PP_2_MTYPE(pp); 2022 if (PP_ISAGED(pp)) { 2023 2024 /* 2025 * PG_FREE_LIST 2026 */ 2027 if (pp->p_szc) { 2028 page_vpsub(&PAGE_FREELISTS(mnode, 2029 pp->p_szc, bin, mtype), pp); 2030 } else { 2031 mach_page_sub(&PAGE_FREELISTS(mnode, 0, 2032 bin, mtype), pp); 2033 } 2034 which_list = PG_FREE_LIST; 2035 } else { 2036 ASSERT(pp->p_szc == 0); 2037 2038 /* 2039 * PG_CACHE_LIST 2040 * 2041 * Since this page comes from the 2042 * cachelist, we must destroy the 2043 * vnode association. 
2044 */ 2045 if (!page_trylock(pp, SE_EXCL)) { 2046 goto fail_promote; 2047 } 2048 2049 /* 2050 * We need to be careful not to deadlock 2051 * with another thread in page_lookup(). 2052 * The page_lookup() thread could be holding 2053 * the same phm that we need if the two 2054 * pages happen to hash to the same phm lock. 2055 * At this point we have locked the entire 2056 * freelist and page_lookup() could be trying 2057 * to grab a freelist lock. 2058 */ 2059 index = PAGE_HASH_FUNC(pp->p_vnode, pp->p_offset); 2060 phm = PAGE_HASH_MUTEX(index); 2061 if (!mutex_tryenter(phm)) { 2062 page_unlock_nocapture(pp); 2063 goto fail_promote; 2064 } 2065 2066 mach_page_sub(&PAGE_CACHELISTS(mnode, bin, mtype), pp); 2067 page_hashout(pp, phm); 2068 mutex_exit(phm); 2069 PP_SETAGED(pp); 2070 page_unlock_nocapture(pp); 2071 which_list = PG_CACHE_LIST; 2072 } 2073 page_ctr_sub(mnode, mtype, pp, which_list); 2074 2075 /* 2076 * Concatenate the smaller page(s) onto 2077 * the large page list. 2078 */ 2079 tmpnpgs = npgs = page_get_pagecnt(pp->p_szc); 2080 pages_left -= npgs; 2081 tpp = pp; 2082 while (npgs--) { 2083 tpp->p_szc = new_szc; 2084 tpp = tpp->p_next; 2085 } 2086 page_list_concat(&pplist, &pp); 2087 pp += tmpnpgs; 2088 } 2089 CHK_LPG(pplist, new_szc); 2090 2091 /* 2092 * return the page to the user if requested 2093 * in the properly locked state. 2094 */ 2095 if (flags == PC_ALLOC && (page_trylock_cons(pplist, SE_EXCL))) { 2096 return (pplist); 2097 } 2098 2099 /* 2100 * Otherwise place the new large page on the freelist 2101 */ 2102 bin = PP_2_BIN(pplist); 2103 mnode = PP_2_MEM_NODE(pplist); 2104 mtype = PP_2_MTYPE(pplist); 2105 page_vpadd(&PAGE_FREELISTS(mnode, new_szc, bin, mtype), pplist); 2106 2107 page_ctr_add(mnode, mtype, pplist, PG_FREE_LIST); 2108 return (NULL); 2109 2110 fail_promote: 2111 /* 2112 * A thread must have still been freeing or 2113 * reclaiming the page on the cachelist. 
	 * To prevent a deadlock undo what we have
	 * done sofar and return failure. This
	 * situation can only happen while promoting
	 * PAGESIZE pages.
	 */
	page_promote_err++;
	while (pplist) {
		pp = pplist;
		mach_page_sub(&pplist, pp);
		pp->p_szc = 0;
		bin = PP_2_BIN(pp);
		mtype = PP_2_MTYPE(pp);
		mach_page_add(&PAGE_FREELISTS(mnode, 0, bin, mtype), pp);
		page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
	}
	return (NULL);

}

/*
 * Break up a large page into smaller size pages.
 * Pages involved are on the freelist before the call and may
 * be returned to the caller if requested, otherwise they will
 * be placed back on the freelist.
 * The caller is responsible for locking the freelist as well as any other
 * accounting which needs to be done for a returned page.
 * If flags is not PC_ALLOC, the color argument is ignored, and thus
 * technically, any value may be passed in but PC_NO_COLOR is the standard
 * which should be followed for clarity's sake.
 * Returns a page whose pfn is < pfnmax
 *
 * Arguments:
 *	mnode	- memory node of the large page (asserted to match
 *		  PP_2_MEM_NODE() of the page).
 *	pfnum	- pfn used to look up the large page to demote.
 *	pfnmax	- exclusive upper pfn bound for a returned page; 0 means
 *		  no bound is applied.
 *	cur_szc	- current size code of the page; must not be 0.
 *	new_szc	- size code to demote to; must be smaller than cur_szc.
 *	color	- bin a returned page has to be in (PC_ALLOC only).
 *	flags	- PC_ALLOC to ask for one of the resulting pages back.
 *
 * At most one resulting new_szc page is handed back; it has been locked
 * SE_EXCL through page_trylock_cons().  Every other piece is added to the
 * new_szc freelist.  NULL is returned when no piece was claimed.
 */
page_t *
page_demote(int mnode, pfn_t pfnum, pfn_t pfnmax, uchar_t cur_szc,
    uchar_t new_szc, int color, int flags)
{
	page_t	*pp, *pplist, *npplist;
	pgcnt_t	npgs, n;
	uint_t	bin;
	uint_t	mtype;
	page_t	*ret_pp = NULL;

	ASSERT(cur_szc != 0);
	ASSERT(new_szc < cur_szc);

	pplist = page_numtopp_nolock(pfnum);
	ASSERT(pplist != NULL);

	ASSERT(pplist->p_szc == cur_szc);

	/*
	 * Take the large page off its cur_szc freelist and out of the
	 * page counters before any constituent size code is changed.
	 */
	bin = PP_2_BIN(pplist);
	ASSERT(mnode == PP_2_MEM_NODE(pplist));
	mtype = PP_2_MTYPE(pplist);
	page_vpsub(&PAGE_FREELISTS(mnode, cur_szc, bin, mtype), pplist);

	CHK_LPG(pplist, cur_szc);
	page_ctr_sub(mnode, mtype, pplist, PG_FREE_LIST);

	/*
	 * Number of PAGESIZE pages for smaller new_szc
	 * page.
	 */
	npgs = page_get_pagecnt(new_szc);

	/* Peel new_szc sized pieces off pplist until nothing is left. */
	while (pplist) {
		pp = pplist;

		ASSERT(pp->p_szc == cur_szc);

		/*
		 * We either break it up into PAGESIZE pages or larger.
		 */
		if (npgs == 1) {	/* PAGESIZE case */
			mach_page_sub(&pplist, pp);
			ASSERT(pp->p_szc == cur_szc);
			ASSERT(new_szc == 0);
			ASSERT(mnode == PP_2_MEM_NODE(pp));
			pp->p_szc = new_szc;
			bin = PP_2_BIN(pp);
			/*
			 * Keep the first page that has the requested color,
			 * is below pfnmax and can be locked; everything else
			 * goes back on the PAGESIZE freelist.
			 */
			if ((bin == color) && (flags == PC_ALLOC) &&
			    (ret_pp == NULL) && (pfnmax == 0 ||
			    pp->p_pagenum < pfnmax) &&
			    page_trylock_cons(pp, SE_EXCL)) {
				ret_pp = pp;
			} else {
				mtype = PP_2_MTYPE(pp);
				mach_page_add(&PAGE_FREELISTS(mnode, 0, bin,
				    mtype), pp);
				page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
			}
		} else {
			page_t *try_to_return_this_page = NULL;
			int count = 0;

			/*
			 * Break down into smaller lists of pages.
			 */
			page_list_break(&pplist, &npplist, npgs);

			pp = pplist;
			n = npgs;
			while (n--) {
				ASSERT(pp->p_szc == cur_szc);
				/*
				 * Check whether all the pages in this list
				 * fit the request criteria.
				 */
				if (pfnmax == 0 || pp->p_pagenum < pfnmax) {
					count++;
				}
				pp->p_szc = new_szc;
				pp = pp->p_next;
			}

			/*
			 * NOTE(review): pp has been advanced npgs times from
			 * pplist; presumably the broken-off list is circular
			 * so pp is the head again here -- confirm against
			 * page_list_break().
			 */
			if (count == npgs &&
			    (pfnmax == 0 || pp->p_pagenum < pfnmax)) {
				try_to_return_this_page = pp;
			}

			CHK_LPG(pplist, new_szc);

			bin = PP_2_BIN(pplist);
			if (try_to_return_this_page)
				ASSERT(mnode ==
				    PP_2_MEM_NODE(try_to_return_this_page));
			if ((bin == color) && (flags == PC_ALLOC) &&
			    (ret_pp == NULL) && try_to_return_this_page &&
			    page_trylock_cons(try_to_return_this_page,
			    SE_EXCL)) {
				ret_pp = try_to_return_this_page;
			} else {
				mtype = PP_2_MTYPE(pp);
				page_vpadd(&PAGE_FREELISTS(mnode, new_szc,
				    bin, mtype), pplist);

				page_ctr_add(mnode, mtype, pplist,
				    PG_FREE_LIST);
			}
			/* continue with the remainder of the large page */
			pplist = npplist;
		}
	}
	return (ret_pp);
}

/*
 * When non-zero, page_freelist_coalesce() and page_freelist_coalesce_all()
 * return without attempting any coalescing.
 */
int mpss_coalesce_disable = 0;

/*
 * Coalesce free pages into a page of the given szc and color if possible.
 * Return the pointer to the page created, otherwise, return NULL.
 *
 * If pfnhi is non-zero, search for large page with pfn range less than pfnhi.
 */
/*
 * Arguments:
 *	mnode	 - memory node whose page counters are searched.
 *	szc	 - size code of the page to create; also used as the region
 *		   size when indexing the page counters.
 *	color	 - requested color; only the bits in ceq_mask are kept.
 *	ceq_mask - color equivalency mask applied to color.
 *	mtype	 - memory type; selects the pfn range and the counter range.
 *	pfnhi	 - if not PFNNULL, exclusive upper bound on the pfns searched.
 *
 * On success the page obtained from page_promote(..., PC_ALLOC, ...) is
 * returned and the region index where it was found is stored as the
 * starting point for the next search of that color.
 */
page_t *
page_freelist_coalesce(int mnode, uchar_t szc, uint_t color, uint_t ceq_mask,
    int mtype, pfn_t pfnhi)
{
	int	r = szc;		/* region size */
	int	mrange;
	uint_t	full, bin, color_mask, wrap = 0;
	pfn_t	pfnum, lo, hi;
	size_t	len, idx, idx0;
	pgcnt_t	cands = 0, szcpgcnt = page_get_pagecnt(szc);
	page_t	*ret_pp;
	MEM_NODE_ITERATOR_DECL(it);
#if defined(__sparc)
	pfn_t pfnum0, nlo, nhi;
#endif

	if (mpss_coalesce_disable) {
		ASSERT(szc < MMU_PAGE_SIZES);
		VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[szc][0]);
		return (NULL);
	}

	ASSERT(szc < mmu_page_sizes);
	color_mask = PAGE_GET_PAGECOLORS(szc) - 1;
	ASSERT(ceq_mask <= color_mask);
	ASSERT(color <= color_mask);
	color &= ceq_mask;

	/* Prevent page_counters dynamic memory from being freed */
	rw_enter(&page_ctrs_rwlock[mnode], RW_READER);

	mrange = MTYPE_2_MRANGE(mnode, mtype);
	ASSERT(mrange < mnode_nranges[mnode]);
	VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[r][mrange]);

	/* get pfn range for mtype */
	len = PAGE_COUNTERS_ENTRIES(mnode, r);
	MNODETYPE_2_PFN(mnode, mtype, lo, hi);
	hi++;		/* from here on hi is an exclusive bound */

	/* use lower limit if given */
	if (pfnhi != PFNNULL && pfnhi < hi)
		hi = pfnhi;

	/* round to szcpgcnt boundaries */
	lo = P2ROUNDUP(lo, szcpgcnt);
	MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it);
	if (lo == (pfn_t)-1) {
		rw_exit(&page_ctrs_rwlock[mnode]);
		return (NULL);
	}
	hi = hi & ~(szcpgcnt - 1);

	/* set lo to the closest pfn of the right color */
	if (((PFN_2_COLOR(lo, szc, &it) ^ color) & ceq_mask) ||
	    (interleaved_mnodes && PFN_2_MEM_NODE(lo) != mnode)) {
		PAGE_NEXT_PFN_FOR_COLOR(lo, szc, color, ceq_mask, color_mask,
		    &it);
	}

	if (hi <= lo) {
		rw_exit(&page_ctrs_rwlock[mnode]);
		return (NULL);
	}

	full = FULL_REGION_CNT(r);

	/* calculate the number of page candidates and initial search index */
	bin = color;
	idx0 = (size_t)(-1);
	do {
		pgcnt_t acand;

		PGCTRS_CANDS_GETVALUECOLOR(mnode, mrange, r, bin, acand);
		if (acand) {
			/* start at the lowest index saved for any color */
			idx = PAGE_COUNTERS_CURRENT_COLOR(mnode,
			    r, bin, mrange);
			idx0 = MIN(idx0, idx);
			cands += acand;
		}
		bin = ADD_MASKED(bin, 1, ceq_mask, color_mask);
	} while (bin != color);

	if (cands == 0) {
		VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip[r][mrange]);
		rw_exit(&page_ctrs_rwlock[mnode]);
		return (NULL);
	}

	/*
	 * Turn the saved index back into a pfn; fall back to lo when it
	 * lies outside [lo, hi) or cannot be used for this mnode/color.
	 */
	pfnum = IDX_TO_PNUM(mnode, r, idx0);
	if (pfnum < lo || pfnum >= hi) {
		pfnum = lo;
	} else {
		MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
		if (pfnum == (pfn_t)-1) {
			pfnum = lo;
			MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
			ASSERT(pfnum != (pfn_t)-1);
		} else if ((PFN_2_COLOR(pfnum, szc, &it) ^ color) & ceq_mask ||
		    (interleaved_mnodes && PFN_2_MEM_NODE(pfnum) != mnode)) {
			/* invalid color, get the closest correct pfn */
			PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask,
			    color_mask, &it);
			if (pfnum >= hi) {
				pfnum = lo;
				MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
			}
		}
	}

	/* set starting index */
	idx0 = PNUM_TO_IDX(mnode, r, pfnum);
	ASSERT(idx0 < len);

#if defined(__sparc)
	pfnum0 = pfnum;		/* page corresponding to idx0 */
	nhi = 0;		/* search kcage ranges */
#endif

	/*
	 * Walk forward from idx0.  'wrap' counts how often the walk has been
	 * restarted at lo: before the first wrap every index is visited,
	 * after it only indexes below idx0, and a second wrap ends the loop.
	 */
	for (idx = idx0; wrap == 0 || (idx < idx0 && wrap < 2); ) {

#if defined(__sparc)
		/*
		 * Find lowest intersection of kcage ranges and mnode.
		 * MTYPE_NORELOC means look in the cage, otherwise outside.
		 */
		if (nhi <= pfnum) {
			if (kcage_next_range(mtype == MTYPE_NORELOC, pfnum,
			    (wrap == 0 ? hi : pfnum0), &nlo, &nhi))
				goto wrapit;

			/* jump to the next page in the range */
			if (pfnum < nlo) {
				pfnum = P2ROUNDUP(nlo, szcpgcnt);
				MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
				idx = PNUM_TO_IDX(mnode, r, pfnum);
				if (idx >= len || pfnum >= hi)
					goto wrapit;
				if ((PFN_2_COLOR(pfnum, szc, &it) ^ color) &
				    ceq_mask)
					goto next;
				if (interleaved_mnodes &&
				    PFN_2_MEM_NODE(pfnum) != mnode)
					goto next;
			}
		}
#endif

		/* cheap unlocked check; repeated below under the lock */
		if (PAGE_COUNTERS(mnode, r, idx) != full)
			goto next;

		/*
		 * RFE: For performance maybe we can do something less
		 *	brutal than locking the entire freelist. So far
		 *	this doesn't seem to be a performance problem?
		 */
		page_freelist_lock(mnode);
		if (PAGE_COUNTERS(mnode, r, idx) == full) {
			ret_pp =
			    page_promote(mnode, pfnum, r, PC_ALLOC, mtype);
			if (ret_pp != NULL) {
				VM_STAT_ADD(vmm_vmstats.pfc_coalok[r][mrange]);
				/* remember where to resume for this color */
				PAGE_COUNTERS_CURRENT_COLOR(mnode, r,
				    PFN_2_COLOR(pfnum, szc, &it), mrange) = idx;
				page_freelist_unlock(mnode);
				rw_exit(&page_ctrs_rwlock[mnode]);
#if defined(__sparc)
				if (PP_ISNORELOC(ret_pp)) {
					pgcnt_t npgs;

					npgs = page_get_pagecnt(ret_pp->p_szc);
					kcage_freemem_sub(npgs);
				}
#endif
				return (ret_pp);
			}
		} else {
			VM_STAT_ADD(vmm_vmstats.page_ctrs_changed[r][mrange]);
		}

		page_freelist_unlock(mnode);
		/*
		 * No point looking for another page if we've
		 * already tried all of the ones that
		 * page_ctr_cands indicated.  Stash off where we left
		 * off.
		 * Note: this is not exact since we don't hold the
		 * page_freelist_locks before we initially get the
		 * value of cands for performance reasons, but should
		 * be a decent approximation.
		 */
		if (--cands == 0) {
			PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color, mrange) =
			    idx;
			break;
		}
next:
		PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask,
		    color_mask, &it);
		idx = PNUM_TO_IDX(mnode, r, pfnum);
		if (idx >= len || pfnum >= hi) {
wrapit:
			/* ran off the end of the range: restart at lo */
			pfnum = lo;
			MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
			idx = PNUM_TO_IDX(mnode, r, pfnum);
			wrap++;
#if defined(__sparc)
			nhi = 0;	/* search kcage ranges */
#endif
		}
	}

	rw_exit(&page_ctrs_rwlock[mnode]);
	VM_STAT_ADD(vmm_vmstats.page_ctrs_failed[r][mrange]);
	return (NULL);
}

/*
 * For the given mnode, promote as many small pages to large pages as possible.
 * mnode can be -1, which means do them all
 *
 * All memory nodes are also processed when interleaved_mnodes is set.
 * Nothing is done when mpss_coalesce_disable is set.
 */
void
page_freelist_coalesce_all(int mnode)
{
	int	r;		/* region size */
	int	idx, full;
	size_t	len;
	int doall = interleaved_mnodes || mnode < 0;
	int mlo = doall ? 0 : mnode;
	int mhi = doall ? max_mem_nodes : (mnode + 1);

	VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce_all);

	if (mpss_coalesce_disable) {
		return;
	}

	/*
	 * Lock the entire freelist and coalesce what we can.
	 *
	 * Always promote to the largest page possible
	 * first to reduce the number of page promotions.
	 */
	for (mnode = mlo; mnode < mhi; mnode++) {
		rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
		page_freelist_lock(mnode);
	}
	for (r = mmu_page_sizes - 1; r > 0; r--) {
		for (mnode = mlo; mnode < mhi; mnode++) {
			pgcnt_t cands = 0;
			int mrange, nranges = mnode_nranges[mnode];

			/* skip this mnode if no range has a candidate */
			for (mrange = 0; mrange < nranges; mrange++) {
				PGCTRS_CANDS_GETVALUE(mnode, mrange, r, cands);
				if (cands != 0)
					break;
			}
			if (cands == 0) {
				VM_STAT_ADD(vmm_vmstats.
				    page_ctrs_cands_skip_all);
				continue;
			}

			full = FULL_REGION_CNT(r);
			len  = PAGE_COUNTERS_ENTRIES(mnode, r);

			/* promote every region whose counter reads full */
			for (idx = 0; idx < len; idx++) {
				if (PAGE_COUNTERS(mnode, r, idx) == full) {
					pfn_t pfnum =
					    IDX_TO_PNUM(mnode, r, idx);
					int tmnode = interleaved_mnodes ?
					    PFN_2_MEM_NODE(pfnum) : mnode;

					ASSERT(pfnum >=
					    mem_node_config[tmnode].physbase &&
					    pfnum <
					    mem_node_config[tmnode].physmax);

					(void) page_promote(tmnode,
					    pfnum, r, PC_FREE, PC_MTYPE_ANY);
				}
			}
			/* shared hpm_counters covers all mnodes, so we quit */
			if (interleaved_mnodes)
				break;
		}
	}
	/* drop the locks taken in the first loop */
	for (mnode = mlo; mnode < mhi; mnode++) {
		page_freelist_unlock(mnode);
		rw_exit(&page_ctrs_rwlock[mnode]);
	}
}

/*
 * This is where all policies for moving pages around
 * to different page size free lists are implemented.
 * Returns the page obtained on success, NULL on failure.
 *
 * So far these are the priorities for this algorithm in descending
 * order:
 *
 *	1) When servicing a request try to do so with a free page
 *	   from next size up. Helps defer fragmentation as long
 *	   as possible.
 *
 *	2) Page coalesce on demand. Only when a freelist
 *	   larger than PAGESIZE is empty and step 1
 *	   will not work since all larger size lists are
 *	   also empty.
 *
 * If pfnhi is non-zero, search for large page with pfn range less than pfnhi.
 */

/*
 * Arguments:
 *	szc	- size code of the page wanted.
 *	color	- color of the page wanted, expressed for size szc.
 *	mnode	- memory node to take the larger page from.
 *	mtype	- memory type of the freelists searched.
 *	pfnlo	- if not PFNNULL, the large page chosen must start at or
 *		  above this pfn.
 *	pfnhi	- if not PFNNULL, the large page chosen must start below this
 *		  pfn; also passed to page_demote() as its pfnmax.
 *	plw	- page list walker state; plw_bins[] is decremented for every
 *		  bin visited here.
 *
 * Returns the page handed back by page_demote(), or NULL if no larger page
 * could be demoted.
 */
page_t *
page_freelist_split(uchar_t szc, uint_t color, int mnode, int mtype,
    pfn_t pfnlo, pfn_t pfnhi, page_list_walker_t *plw)
{
	uchar_t nszc = szc + 1;
	uint_t	bin, sbin, bin_prev;
	page_t	*pp, *firstpp;
	page_t	*ret_pp = NULL;
	uint_t	color_mask;

	/* szc is already the largest size: nothing bigger to split */
	if (nszc == mmu_page_sizes)
		return (NULL);

	ASSERT(nszc < mmu_page_sizes);
	color_mask = PAGE_GET_PAGECOLORS(nszc) - 1;
	bin = sbin = PAGE_GET_NSZ_COLOR(szc, color);
	bin_prev = (plw->plw_bin_split_prev == color) ? INVALID_COLOR :
	    PAGE_GET_NSZ_COLOR(szc, plw->plw_bin_split_prev);

	VM_STAT_ADD(vmm_vmstats.pfs_req[szc]);
	/*
	 * First try to break up a larger page to fill current size freelist.
	 */
	while (plw->plw_bins[nszc] != 0) {

		ASSERT(nszc < mmu_page_sizes);

		/*
		 * If page found then demote it.
		 */
		if (PAGE_FREELISTS(mnode, nszc, bin, mtype)) {
			/* the list head is re-read now that the lock is held */
			page_freelist_lock(mnode);
			firstpp = pp = PAGE_FREELISTS(mnode, nszc, bin, mtype);

			/*
			 * If pfnhi is not PFNNULL, look for large page below
			 * pfnhi. PFNNULL signifies no pfn requirement.
			 */
			if (pp &&
			    ((pfnhi != PFNNULL && pp->p_pagenum >= pfnhi) ||
			    (pfnlo != PFNNULL && pp->p_pagenum < pfnlo))) {
				/*
				 * Walk the p_vpnext chain until a page within
				 * the pfn limits is found or we are back at
				 * the first page.
				 */
				do {
					pp = pp->p_vpnext;
					if (pp == firstpp) {
						pp = NULL;
						break;
					}
				} while ((pfnhi != PFNNULL &&
				    pp->p_pagenum >= pfnhi) ||
				    (pfnlo != PFNNULL &&
				    pp->p_pagenum < pfnlo));

				if (pfnhi != PFNNULL && pp != NULL)
					ASSERT(pp->p_pagenum < pfnhi);

				if (pfnlo != PFNNULL && pp != NULL)
					ASSERT(pp->p_pagenum >= pfnlo);
			}
			if (pp) {
				uint_t ccolor = page_correct_color(szc, nszc,
				    color, bin, plw->plw_ceq_mask[szc]);

				ASSERT(pp->p_szc == nszc);
				VM_STAT_ADD(vmm_vmstats.pfs_demote[nszc]);
				ret_pp = page_demote(mnode, pp->p_pagenum,
				    pfnhi, pp->p_szc, szc, ccolor, PC_ALLOC);
				if (ret_pp) {
					page_freelist_unlock(mnode);
#if defined(__sparc)
					if (PP_ISNORELOC(ret_pp)) {
						pgcnt_t npgs;

						npgs = page_get_pagecnt(
						    ret_pp->p_szc);
						kcage_freemem_sub(npgs);
					}
#endif
					return (ret_pp);
				}
			}
			page_freelist_unlock(mnode);
		}

		/* loop through next size bins */
		bin = ADD_MASKED(bin, 1, plw->plw_ceq_mask[nszc], color_mask);
		plw->plw_bins[nszc]--;

		if (bin == sbin) {
			uchar_t nnszc = nszc + 1;

			/* we are done with this page size - check next */
			if (plw->plw_bins[nnszc] == 0)
				/* we have already checked next size bins */
				break;

			bin = sbin = PAGE_GET_NSZ_COLOR(nszc, bin);
			if (bin_prev != INVALID_COLOR) {
				bin_prev = PAGE_GET_NSZ_COLOR(nszc, bin_prev);
				/*
				 * stop if the previous split bin maps to an
				 * equivalent color at the next size
				 */
				if (!((bin ^ bin_prev) &
				    plw->plw_ceq_mask[nnszc]))
					break;
			}
			ASSERT(nnszc < mmu_page_sizes);
			color_mask = PAGE_GET_PAGECOLORS(nnszc) - 1;
			nszc = nnszc;
			ASSERT(nszc < mmu_page_sizes);
		}
	}

	return (ret_pp);
}

/*
 * Helper routine used only by the freelist code to lock
 * a page.
If the page is a large page then it succeeds in 2693 * locking all the constituent pages or none at all. 2694 * Returns 1 on sucess, 0 on failure. 2695 */ 2696 static int 2697 page_trylock_cons(page_t *pp, se_t se) 2698 { 2699 page_t *tpp, *first_pp = pp; 2700 2701 /* 2702 * Fail if can't lock first or only page. 2703 */ 2704 if (!page_trylock(pp, se)) { 2705 return (0); 2706 } 2707 2708 /* 2709 * PAGESIZE: common case. 2710 */ 2711 if (pp->p_szc == 0) { 2712 return (1); 2713 } 2714 2715 /* 2716 * Large page case. 2717 */ 2718 tpp = pp->p_next; 2719 while (tpp != pp) { 2720 if (!page_trylock(tpp, se)) { 2721 /* 2722 * On failure unlock what we have locked so far. 2723 * We want to avoid attempting to capture these 2724 * pages as the pcm mutex may be held which could 2725 * lead to a recursive mutex panic. 2726 */ 2727 while (first_pp != tpp) { 2728 page_unlock_nocapture(first_pp); 2729 first_pp = first_pp->p_next; 2730 } 2731 return (0); 2732 } 2733 tpp = tpp->p_next; 2734 } 2735 return (1); 2736 } 2737 2738 /* 2739 * init context for walking page lists 2740 * Called when a page of the given szc in unavailable. Sets markers 2741 * for the beginning of the search to detect when search has 2742 * completed a full cycle. Sets flags for splitting larger pages 2743 * and coalescing smaller pages. Page walking procedes until a page 2744 * of the desired equivalent color is found. 2745 */ 2746 void 2747 page_list_walk_init(uchar_t szc, uint_t flags, uint_t bin, int can_split, 2748 int use_ceq, page_list_walker_t *plw) 2749 { 2750 uint_t nszc, ceq_mask, colors; 2751 uchar_t ceq = use_ceq ? colorequivszc[szc] : 0; 2752 2753 ASSERT(szc < mmu_page_sizes); 2754 colors = PAGE_GET_PAGECOLORS(szc); 2755 2756 plw->plw_colors = colors; 2757 plw->plw_color_mask = colors - 1; 2758 plw->plw_bin_marker = plw->plw_bin0 = bin; 2759 plw->plw_bin_split_prev = bin; 2760 plw->plw_bin_step = (szc == 0) ? 
vac_colors : 1; 2761 2762 /* 2763 * if vac aliasing is possible make sure lower order color 2764 * bits are never ignored 2765 */ 2766 if (vac_colors > 1) 2767 ceq &= 0xf0; 2768 2769 /* 2770 * calculate the number of non-equivalent colors and 2771 * color equivalency mask 2772 */ 2773 plw->plw_ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf)); 2774 ASSERT(szc > 0 || plw->plw_ceq_dif >= vac_colors); 2775 ASSERT(plw->plw_ceq_dif > 0); 2776 plw->plw_ceq_mask[szc] = (plw->plw_ceq_dif - 1) << (ceq & 0xf); 2777 2778 if (flags & PG_MATCH_COLOR) { 2779 if (cpu_page_colors < 0) { 2780 /* 2781 * this is a heterogeneous machine with different CPUs 2782 * having different size e$ (not supported for ni2/rock 2783 */ 2784 uint_t cpucolors = CPUSETSIZE() >> PAGE_GET_SHIFT(szc); 2785 cpucolors = MAX(cpucolors, 1); 2786 ceq_mask = plw->plw_color_mask & (cpucolors - 1); 2787 plw->plw_ceq_mask[szc] = 2788 MIN(ceq_mask, plw->plw_ceq_mask[szc]); 2789 } 2790 plw->plw_ceq_dif = 1; 2791 } 2792 2793 /* we can split pages in the freelist, but not the cachelist */ 2794 if (can_split) { 2795 plw->plw_do_split = (szc + 1 < mmu_page_sizes) ? 
1 : 0; 2796 2797 /* set next szc color masks and number of free list bins */ 2798 for (nszc = szc + 1; nszc < mmu_page_sizes; nszc++, szc++) { 2799 plw->plw_ceq_mask[nszc] = PAGE_GET_NSZ_MASK(szc, 2800 plw->plw_ceq_mask[szc]); 2801 plw->plw_bins[nszc] = PAGE_GET_PAGECOLORS(nszc); 2802 } 2803 plw->plw_ceq_mask[nszc] = INVALID_MASK; 2804 plw->plw_bins[nszc] = 0; 2805 2806 } else { 2807 ASSERT(szc == 0); 2808 plw->plw_do_split = 0; 2809 plw->plw_bins[1] = 0; 2810 plw->plw_ceq_mask[1] = INVALID_MASK; 2811 } 2812 } 2813 2814 /* 2815 * set mark to flag where next split should occur 2816 */ 2817 #define PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw) { \ 2818 uint_t bin_nsz = PAGE_GET_NSZ_COLOR(szc, bin); \ 2819 uint_t bin0_nsz = PAGE_GET_NSZ_COLOR(szc, plw->plw_bin0); \ 2820 uint_t neq_mask = ~plw->plw_ceq_mask[nszc] & plw->plw_color_mask; \ 2821 plw->plw_split_next = \ 2822 INC_MASKED(bin_nsz, neq_mask, plw->plw_color_mask); \ 2823 if (!((plw->plw_split_next ^ bin0_nsz) & plw->plw_ceq_mask[nszc])) { \ 2824 plw->plw_split_next = \ 2825 INC_MASKED(plw->plw_split_next, \ 2826 neq_mask, plw->plw_color_mask); \ 2827 } \ 2828 } 2829 2830 uint_t 2831 page_list_walk_next_bin(uchar_t szc, uint_t bin, page_list_walker_t *plw) 2832 { 2833 uint_t neq_mask = ~plw->plw_ceq_mask[szc] & plw->plw_color_mask; 2834 uint_t bin0_nsz, nbin_nsz, nbin0, nbin; 2835 uchar_t nszc = szc + 1; 2836 2837 nbin = ADD_MASKED(bin, 2838 plw->plw_bin_step, neq_mask, plw->plw_color_mask); 2839 2840 if (plw->plw_do_split) { 2841 plw->plw_bin_split_prev = bin; 2842 PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw); 2843 plw->plw_do_split = 0; 2844 } 2845 2846 if (szc == 0) { 2847 if (plw->plw_count != 0 || plw->plw_ceq_dif == vac_colors) { 2848 if (nbin == plw->plw_bin0 && 2849 (vac_colors == 1 || nbin != plw->plw_bin_marker)) { 2850 nbin = ADD_MASKED(nbin, plw->plw_bin_step, 2851 neq_mask, plw->plw_color_mask); 2852 plw->plw_bin_split_prev = plw->plw_bin0; 2853 } 2854 2855 if (vac_colors > 1 && nbin == 
plw->plw_bin_marker) { 2856 plw->plw_bin_marker = 2857 nbin = INC_MASKED(nbin, neq_mask, 2858 plw->plw_color_mask); 2859 plw->plw_bin_split_prev = plw->plw_bin0; 2860 /* 2861 * large pages all have the same vac color 2862 * so by now we should be done with next 2863 * size page splitting process 2864 */ 2865 ASSERT(plw->plw_bins[1] == 0); 2866 plw->plw_do_split = 0; 2867 return (nbin); 2868 } 2869 2870 } else { 2871 uint_t bin_jump = (vac_colors == 1) ? 2872 (BIN_STEP & ~3) - (plw->plw_bin0 & 3) : BIN_STEP; 2873 2874 bin_jump &= ~(vac_colors - 1); 2875 2876 nbin0 = ADD_MASKED(plw->plw_bin0, bin_jump, neq_mask, 2877 plw->plw_color_mask); 2878 2879 if ((nbin0 ^ plw->plw_bin0) & plw->plw_ceq_mask[szc]) { 2880 2881 plw->plw_bin_marker = nbin = nbin0; 2882 2883 if (plw->plw_bins[nszc] != 0) { 2884 /* 2885 * check if next page size bin is the 2886 * same as the next page size bin for 2887 * bin0 2888 */ 2889 nbin_nsz = PAGE_GET_NSZ_COLOR(szc, 2890 nbin); 2891 bin0_nsz = PAGE_GET_NSZ_COLOR(szc, 2892 plw->plw_bin0); 2893 2894 if ((bin0_nsz ^ nbin_nsz) & 2895 plw->plw_ceq_mask[nszc]) 2896 plw->plw_do_split = 1; 2897 } 2898 return (nbin); 2899 } 2900 } 2901 } 2902 2903 if (plw->plw_bins[nszc] != 0) { 2904 nbin_nsz = PAGE_GET_NSZ_COLOR(szc, nbin); 2905 if (!((plw->plw_split_next ^ nbin_nsz) & 2906 plw->plw_ceq_mask[nszc])) 2907 plw->plw_do_split = 1; 2908 } 2909 2910 return (nbin); 2911 } 2912 2913 page_t * 2914 page_get_mnode_freelist(int mnode, uint_t bin, int mtype, uchar_t szc, 2915 uint_t flags) 2916 { 2917 kmutex_t *pcm; 2918 page_t *pp, *first_pp; 2919 uint_t sbin; 2920 int plw_initialized; 2921 page_list_walker_t plw; 2922 2923 ASSERT(szc < mmu_page_sizes); 2924 2925 VM_STAT_ADD(vmm_vmstats.pgmf_alloc[szc]); 2926 2927 MTYPE_START(mnode, mtype, flags); 2928 if (mtype < 0) { /* mnode does not have memory in mtype range */ 2929 VM_STAT_ADD(vmm_vmstats.pgmf_allocempty[szc]); 2930 return (NULL); 2931 } 2932 try_again: 2933 2934 plw_initialized = 0; 2935 plw.plw_ceq_dif = 
1; 2936 2937 /* 2938 * Only hold one freelist lock at a time, that way we 2939 * can start anywhere and not have to worry about lock 2940 * ordering. 2941 */ 2942 for (plw.plw_count = 0; 2943 plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) { 2944 sbin = bin; 2945 do { 2946 if (!PAGE_FREELISTS(mnode, szc, bin, mtype)) 2947 goto bin_empty_1; 2948 2949 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST); 2950 mutex_enter(pcm); 2951 pp = PAGE_FREELISTS(mnode, szc, bin, mtype); 2952 if (pp == NULL) 2953 goto bin_empty_0; 2954 2955 /* 2956 * These were set before the page 2957 * was put on the free list, 2958 * they must still be set. 2959 */ 2960 ASSERT(PP_ISFREE(pp)); 2961 ASSERT(PP_ISAGED(pp)); 2962 ASSERT(pp->p_vnode == NULL); 2963 ASSERT(pp->p_hash == NULL); 2964 ASSERT(pp->p_offset == (u_offset_t)-1); 2965 ASSERT(pp->p_szc == szc); 2966 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode); 2967 2968 /* 2969 * Walk down the hash chain. 2970 * 8k pages are linked on p_next 2971 * and p_prev fields. Large pages 2972 * are a contiguous group of 2973 * constituent pages linked together 2974 * on their p_next and p_prev fields. 2975 * The large pages are linked together 2976 * on the hash chain using p_vpnext 2977 * p_vpprev of the base constituent 2978 * page of each large page. 
2979 */ 2980 first_pp = pp; 2981 while (!page_trylock_cons(pp, SE_EXCL) || 2982 IS_DUMP_PAGE(pp)) { 2983 if (szc == 0) { 2984 pp = pp->p_next; 2985 } else { 2986 pp = pp->p_vpnext; 2987 } 2988 2989 ASSERT(PP_ISFREE(pp)); 2990 ASSERT(PP_ISAGED(pp)); 2991 ASSERT(pp->p_vnode == NULL); 2992 ASSERT(pp->p_hash == NULL); 2993 ASSERT(pp->p_offset == (u_offset_t)-1); 2994 ASSERT(pp->p_szc == szc); 2995 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode); 2996 2997 if (pp == first_pp) 2998 goto bin_empty_0; 2999 } 3000 3001 ASSERT(pp != NULL); 3002 ASSERT(mtype == PP_2_MTYPE(pp)); 3003 ASSERT(pp->p_szc == szc); 3004 if (szc == 0) { 3005 page_sub(&PAGE_FREELISTS(mnode, 3006 szc, bin, mtype), pp); 3007 } else { 3008 page_vpsub(&PAGE_FREELISTS(mnode, 3009 szc, bin, mtype), pp); 3010 CHK_LPG(pp, szc); 3011 } 3012 page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST); 3013 3014 if ((PP_ISFREE(pp) == 0) || (PP_ISAGED(pp) == 0)) 3015 panic("free page is not. pp %p", (void *)pp); 3016 mutex_exit(pcm); 3017 3018 #if defined(__sparc) 3019 ASSERT(!kcage_on || PP_ISNORELOC(pp) || 3020 (flags & PG_NORELOC) == 0); 3021 3022 if (PP_ISNORELOC(pp)) 3023 kcage_freemem_sub(page_get_pagecnt(szc)); 3024 #endif 3025 VM_STAT_ADD(vmm_vmstats.pgmf_allocok[szc]); 3026 return (pp); 3027 3028 bin_empty_0: 3029 mutex_exit(pcm); 3030 bin_empty_1: 3031 if (plw_initialized == 0) { 3032 page_list_walk_init(szc, flags, bin, 1, 1, 3033 &plw); 3034 plw_initialized = 1; 3035 ASSERT(plw.plw_colors <= 3036 PAGE_GET_PAGECOLORS(szc)); 3037 ASSERT(plw.plw_colors > 0); 3038 ASSERT((plw.plw_colors & 3039 (plw.plw_colors - 1)) == 0); 3040 ASSERT(bin < plw.plw_colors); 3041 ASSERT(plw.plw_ceq_mask[szc] < plw.plw_colors); 3042 } 3043 /* calculate the next bin with equivalent color */ 3044 bin = ADD_MASKED(bin, plw.plw_bin_step, 3045 plw.plw_ceq_mask[szc], plw.plw_color_mask); 3046 } while (sbin != bin); 3047 3048 /* 3049 * color bins are all empty if color match. 
Try and 3050 * satisfy the request by breaking up or coalescing 3051 * pages from a different size freelist of the correct 3052 * color that satisfies the ORIGINAL color requested. 3053 * If that fails then try pages of the same size but 3054 * different colors assuming we are not called with 3055 * PG_MATCH_COLOR. 3056 */ 3057 if (plw.plw_do_split && 3058 (pp = page_freelist_split(szc, bin, mnode, 3059 mtype, PFNNULL, PFNNULL, &plw)) != NULL) 3060 return (pp); 3061 3062 if (szc > 0 && (pp = page_freelist_coalesce(mnode, szc, 3063 bin, plw.plw_ceq_mask[szc], mtype, PFNNULL)) != NULL) 3064 return (pp); 3065 3066 if (plw.plw_ceq_dif > 1) 3067 bin = page_list_walk_next_bin(szc, bin, &plw); 3068 } 3069 3070 /* if allowed, cycle through additional mtypes */ 3071 MTYPE_NEXT(mnode, mtype, flags); 3072 if (mtype >= 0) 3073 goto try_again; 3074 3075 VM_STAT_ADD(vmm_vmstats.pgmf_allocfailed[szc]); 3076 3077 return (NULL); 3078 } 3079 3080 /* 3081 * Returns the count of free pages for 'pp' with size code 'szc'. 3082 * Note: This function does not return an exact value as the page freelist 3083 * locks are not held and thus the values in the page_counters may be 3084 * changing as we walk through the data. 
 */
/*
 * 'pp' must be aligned to the 'szc' page size and 'szc' must be non-zero
 * (both asserted).  The region counter for size szc is read first and, if
 * the region is not completely full, the counters of every smaller region
 * size underneath it are added in.  The page_ctrs_rwlock of 'mnode' is held
 * as reader while the counters are read.
 */
static int
page_freecnt(int mnode, page_t *pp, uchar_t szc)
{
	pgcnt_t	pgfree;
	pgcnt_t cnt;
	ssize_t	r = szc;	/* region size */
	ssize_t	idx;
	int	i;
	int	full, range;

	/* Make sure pagenum passed in is aligned properly */
	ASSERT((pp->p_pagenum & (PNUM_SIZE(szc) - 1)) == 0);
	ASSERT(szc > 0);

	/* Prevent page_counters dynamic memory from being freed */
	rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
	idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum);
	cnt = PAGE_COUNTERS(mnode, r, idx);
	pgfree = cnt << PNUM_SHIFT(r - 1);
	range = FULL_REGION_CNT(szc);

	/* Check for completely full region */
	if (cnt == range) {
		rw_exit(&page_ctrs_rwlock[mnode]);
		return (pgfree);
	}

	/*
	 * Descend one region size at a time; 'range' is the number of
	 * size-r regions that make up the original szc region.
	 */
	while (--r > 0) {
		idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum);
		full = FULL_REGION_CNT(r);
		for (i = 0; i < range; i++, idx++) {
			cnt = PAGE_COUNTERS(mnode, r, idx);
			/*
			 * If cnt here is full, that means we have already
			 * accounted for these pages earlier.
			 */
			if (cnt != full) {
				pgfree += (cnt << PNUM_SHIFT(r - 1));
			}
		}
		range *= full;
	}
	rw_exit(&page_ctrs_rwlock[mnode]);
	return (pgfree);
}

/*
 * Called from page_geti_contig_pages to exclusively lock constituent pages
 * starting from 'spp' for page size code 'szc'.
 *
 * If 'ptcpthreshold' is set, the number of free pages needed in the 'szc'
 * region needs to be greater than or equal to the threshold.
 * Concretely, at least (pages in the region) / ptcpthreshold pages must
 * already be free according to page_freecnt(); the check is skipped when
 * PGI_PGCPHIPRI is set in 'flags'.
 *
 * Returns 1 with every constituent page locked SE_EXCL, or 0 on failure.
 * The failure paths below unlock the pages they had locked; the size-code
 * path unlocks only the current page and relies on ASSERT(i == 0).
 */
static int
page_trylock_contig_pages(int mnode, page_t *spp, uchar_t szc, int flags)
{
	pgcnt_t	pgcnt = PNUM_SIZE(szc);
	pgcnt_t pgfree, i;
	page_t *pp;

	VM_STAT_ADD(vmm_vmstats.ptcp[szc]);


	if ((ptcpthreshold == 0) || (flags & PGI_PGCPHIPRI))
		goto skipptcpcheck;
	/*
	 * check if there are sufficient free pages available before attempting
	 * to trylock. Count is approximate as page counters can change.
	 */
	pgfree = page_freecnt(mnode, spp, szc);

	/* attempt to trylock if there are sufficient already free pages */
	if (pgfree < pgcnt/ptcpthreshold) {
		VM_STAT_ADD(vmm_vmstats.ptcpfreethresh[szc]);
		return (0);
	}

skipptcpcheck:

	for (i = 0; i < pgcnt; i++) {
		pp = &spp[i];
		if (!page_trylock(pp, SE_EXCL)) {
			VM_STAT_ADD(vmm_vmstats.ptcpfailexcl[szc]);
			/* unlock pages [0, i); i is unsigned, hence -1 test */
			while (--i != (pgcnt_t)-1) {
				pp = &spp[i];
				ASSERT(PAGE_EXCL(pp));
				page_unlock_nocapture(pp);
			}
			return (0);
		}
		ASSERT(spp[i].p_pagenum == spp->p_pagenum + i);
		/*
		 * An in-use page that is already part of a page of this size
		 * or larger cannot be claimed.
		 */
		if ((pp->p_szc > szc || (szc && pp->p_szc == szc)) &&
		    !PP_ISFREE(pp)) {
			VM_STAT_ADD(vmm_vmstats.ptcpfailszc[szc]);
			ASSERT(i == 0);
			page_unlock_nocapture(pp);
			return (0);
		}
		if (PP_ISNORELOC(pp)) {
			VM_STAT_ADD(vmm_vmstats.ptcpfailcage[szc]);
			/* unlock pages [0, i], including the current one */
			while (i != (pgcnt_t)-1) {
				pp = &spp[i];
				ASSERT(PAGE_EXCL(pp));
				page_unlock_nocapture(pp);
				i--;
			}
			return (0);
		}
	}
	VM_STAT_ADD(vmm_vmstats.ptcpok[szc]);
	return (1);
}

/*
 * Claim large page pointed to by 'pp'. 'pp' is the starting set
 * of 'szc' constituent pages that had been locked exclusively previously.
 * Will attempt to relocate constituent pages in use.
 *
 * Returns the list of claimed constituent pages, each with p_szc set to
 * 'szc', or NULL on failure.  On failure the pages not yet processed are
 * unlocked and the pages already collected are put back on the freelist
 * with p_szc 0 and unlocked.
 */
static page_t *
page_claim_contig_pages(page_t *pp, uchar_t szc, int flags)
{
	spgcnt_t pgcnt, npgs, i;
	page_t *targpp, *rpp, *hpp;
	page_t *replpp = NULL;
	page_t *pplist = NULL;

	ASSERT(pp != NULL);

	/* pgcnt counts the constituent pages still to be processed */
	pgcnt = page_get_pagecnt(szc);
	while (pgcnt) {
		ASSERT(PAGE_EXCL(pp));
		ASSERT(!PP_ISNORELOC(pp));
		if (PP_ISFREE(pp)) {
			/*
			 * If this is a PG_FREE_LIST page then its
			 * size code can change underneath us due to
			 * page promotion or demotion. As an optimization
			 * use page_list_sub_pages() instead of
			 * page_list_sub().
			 */
			if (PP_ISAGED(pp)) {
				page_list_sub_pages(pp, szc);
				/* already a free page of the wanted size */
				if (pp->p_szc == szc) {
					return (pp);
				}
				ASSERT(pp->p_szc < szc);
				npgs = page_get_pagecnt(pp->p_szc);
				hpp = pp;
				for (i = 0; i < npgs; i++, pp++) {
					pp->p_szc = szc;
				}
				page_list_concat(&pplist, &hpp);
				pgcnt -= npgs;
				continue;
			}
			/* free but not aged: a cachelist page */
			ASSERT(!PP_ISAGED(pp));
			ASSERT(pp->p_szc == 0);
			page_list_sub(pp, PG_CACHE_LIST);
			page_hashout(pp, NULL);
			PP_SETAGED(pp);
			pp->p_szc = szc;
			page_list_concat(&pplist, &pp);
			pp++;
			pgcnt--;
			continue;
		}
		npgs = page_get_pagecnt(pp->p_szc);

		/*
		 * page_create_wait freemem accounting done by caller of
		 * page_get_freelist and not necessary to call it prior to
		 * calling page_get_replacement_page.
		 *
		 * page_get_replacement_page can call page_get_contig_pages
		 * to acquire a large page (szc > 0); the replacement must be
		 * smaller than the contig page size to avoid looping or
		 * szc == 0 and PGI_PGCPSZC0 is set.
		 */
		if (pp->p_szc < szc || (szc == 0 && (flags & PGI_PGCPSZC0))) {
			replpp = page_get_replacement_page(pp, NULL, 0);
			if (replpp) {
				npgs = page_get_pagecnt(pp->p_szc);
				ASSERT(npgs <= pgcnt);
				targpp = pp;
			}
		}

		/*
		 * If replacement is NULL or do_page_relocate fails, fail
		 * coalescing of pages.
		 */
		if (replpp == NULL || (do_page_relocate(&targpp, &replpp, 0,
		    &npgs, NULL) != 0)) {
			/*
			 * Unlock un-processed target list
			 */
			while (pgcnt--) {
				ASSERT(PAGE_EXCL(pp));
				page_unlock_nocapture(pp);
				pp++;
			}
			/*
			 * Free the processed target list.
			 */
			while (pplist) {
				pp = pplist;
				page_sub(&pplist, pp);
				ASSERT(PAGE_EXCL(pp));
				ASSERT(pp->p_szc == szc);
				ASSERT(PP_ISFREE(pp));
				ASSERT(PP_ISAGED(pp));
				pp->p_szc = 0;
				page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
				page_unlock_nocapture(pp);
			}

			if (replpp != NULL)
				page_free_replacement_page(replpp);

			return (NULL);
		}
		ASSERT(pp == targpp);

		/* LINTED */
		ASSERT(hpp = pp); /* That's right, it's an assignment */

		pp += npgs;
		pgcnt -= npgs;

		/*
		 * Mark each relocated target page free with the new size
		 * code and unlock the replacement page that took its place.
		 */
		while (npgs--) {
			ASSERT(PAGE_EXCL(targpp));
			ASSERT(!PP_ISFREE(targpp));
			ASSERT(!PP_ISNORELOC(targpp));
			PP_SETFREE(targpp);
			ASSERT(PP_ISAGED(targpp));
			ASSERT(targpp->p_szc < szc || (szc == 0 &&
			    (flags & PGI_PGCPSZC0)));
			targpp->p_szc = szc;
			targpp = targpp->p_next;

			rpp = replpp;
			ASSERT(rpp != NULL);
			page_sub(&replpp, rpp);
			ASSERT(PAGE_EXCL(rpp));
			ASSERT(!PP_ISFREE(rpp));
			page_unlock_nocapture(rpp);
		}
		ASSERT(targpp == hpp);
		ASSERT(replpp == NULL);
		page_list_concat(&pplist, &targpp);
	}
	CHK_LPG(pplist, szc);
	return (pplist);
}

/*
 * Trim kernel cage from pfnlo-pfnhi and store result in lo-hi. Return code
 * of 0 means nothing left after trim.
3344 */ 3345 int 3346 trimkcage(struct memseg *mseg, pfn_t *lo, pfn_t *hi, pfn_t pfnlo, pfn_t pfnhi) 3347 { 3348 pfn_t kcagepfn; 3349 int decr; 3350 int rc = 0; 3351 3352 if (PP_ISNORELOC(mseg->pages)) { 3353 if (PP_ISNORELOC(mseg->epages - 1) == 0) { 3354 3355 /* lower part of this mseg inside kernel cage */ 3356 decr = kcage_current_pfn(&kcagepfn); 3357 3358 /* kernel cage may have transitioned past mseg */ 3359 if (kcagepfn >= mseg->pages_base && 3360 kcagepfn < mseg->pages_end) { 3361 ASSERT(decr == 0); 3362 *lo = MAX(kcagepfn, pfnlo); 3363 *hi = MIN(pfnhi, (mseg->pages_end - 1)); 3364 rc = 1; 3365 } 3366 } 3367 /* else entire mseg in the cage */ 3368 } else { 3369 if (PP_ISNORELOC(mseg->epages - 1)) { 3370 3371 /* upper part of this mseg inside kernel cage */ 3372 decr = kcage_current_pfn(&kcagepfn); 3373 3374 /* kernel cage may have transitioned past mseg */ 3375 if (kcagepfn >= mseg->pages_base && 3376 kcagepfn < mseg->pages_end) { 3377 ASSERT(decr); 3378 *hi = MIN(kcagepfn, pfnhi); 3379 *lo = MAX(pfnlo, mseg->pages_base); 3380 rc = 1; 3381 } 3382 } else { 3383 /* entire mseg outside of kernel cage */ 3384 *lo = MAX(pfnlo, mseg->pages_base); 3385 *hi = MIN(pfnhi, (mseg->pages_end - 1)); 3386 rc = 1; 3387 } 3388 } 3389 return (rc); 3390 } 3391 3392 /* 3393 * called from page_get_contig_pages to search 'pfnlo' thru 'pfnhi' to claim a 3394 * page with size code 'szc'. Claiming such a page requires acquiring 3395 * exclusive locks on all constituent pages (page_trylock_contig_pages), 3396 * relocating pages in use and concatenating these constituent pages into a 3397 * large page. 3398 * 3399 * The page lists do not have such a large page and page_freelist_split has 3400 * already failed to demote larger pages and/or coalesce smaller free pages. 3401 * 3402 * 'flags' may specify PG_COLOR_MATCH which would limit the search of large 3403 * pages with the same color as 'bin'. 3404 * 3405 * 'pfnflag' specifies the subset of the pfn range to search. 
 *
 * Returns the first constituent page of the claimed large page, or NULL if
 * the range is too small, memsegs_trylock() fails, or no candidate could be
 * locked and claimed.
 */

static page_t *
page_geti_contig_pages(int mnode, uint_t bin, uchar_t szc, int flags,
    pfn_t pfnlo, pfn_t pfnhi, pgcnt_t pfnflag)
{
	struct memseg *mseg;
	pgcnt_t	szcpgcnt = page_get_pagecnt(szc);
	pgcnt_t szcpgmask = szcpgcnt - 1;
	pfn_t	randpfn;
	page_t *pp, *randpp, *endpp;
	uint_t colors, ceq_mask;
	/* LINTED : set but not used in function */
	uint_t color_mask;
	pfn_t hi, lo;
	uint_t skip;
	MEM_NODE_ITERATOR_DECL(it);

	ASSERT(szc != 0 || (flags & PGI_PGCPSZC0));

	/* candidates must start on a 'szc' page boundary */
	pfnlo = P2ROUNDUP(pfnlo, szcpgcnt);

	if ((pfnhi - pfnlo) + 1 < szcpgcnt || pfnlo >= pfnhi)
		return (NULL);

	ASSERT(szc < mmu_page_sizes);

	/*
	 * Build the mask of color bits that must match 'bin'.  With a
	 * single color, or without PG_MATCH_COLOR, every color is accepted.
	 */
	colors = PAGE_GET_PAGECOLORS(szc);
	color_mask = colors - 1;
	if ((colors > 1) && (flags & PG_MATCH_COLOR)) {
		uchar_t ceq = colorequivszc[szc];
		uint_t  ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf));

		ASSERT(ceq_dif > 0);
		ceq_mask = (ceq_dif - 1) << (ceq & 0xf);
	} else {
		ceq_mask = 0;
	}

	ASSERT(bin < colors);

	/* clear "non-significant" color bits */
	bin &= ceq_mask;

	/*
	 * trim the pfn range to search based on pfnflag. pfnflag is set
	 * when there have been previous page_get_contig_page failures to
	 * limit the search.
	 *
	 * The high bit in pfnflag specifies the number of 'slots' in the
	 * pfn range and the remainder of pfnflag specifies which slot.
	 * For example, a value of 1010b would mean slot 2 (slots are
	 * numbered from 0) of the pfn range that has been divided into
	 * 8 slots.
	 */
	if (pfnflag > 1) {
		int	slots = 1 << (highbit(pfnflag) - 1);
		int	slotid = pfnflag & (slots - 1);
		pgcnt_t	szcpages;
		int	slotlen;

		pfnhi = P2ALIGN((pfnhi + 1), szcpgcnt) - 1;
		szcpages = ((pfnhi - pfnlo) + 1) / szcpgcnt;
		slotlen = howmany(szcpages, slots);
		/* skip if 'slotid' slot is empty */
		if (slotid * slotlen >= szcpages)
			return (NULL);
		pfnlo = pfnlo + (((slotid * slotlen) % szcpages) * szcpgcnt);
		ASSERT(pfnlo < pfnhi);
		if (pfnhi > pfnlo + (slotlen * szcpgcnt))
			pfnhi = pfnlo + (slotlen * szcpgcnt) - 1;
	}

	/*
	 * This routine can be called recursively so we shouldn't
	 * acquire a reader lock if a write request is pending. This
	 * could lead to a deadlock with the DR thread.
	 *
	 * Returning NULL informs the caller that we could not get
	 * a contig page with the required characteristics.
	 */

	if (!memsegs_trylock(0))
		return (NULL);

	/*
	 * loop through memsegs to look for contig page candidates
	 */

	for (mseg = memsegs; mseg != NULL; mseg = mseg->next) {
		if (pfnhi < mseg->pages_base || pfnlo >= mseg->pages_end) {
			/* no overlap */
			continue;
		}

		if (mseg->pages_end - mseg->pages_base < szcpgcnt)
			/* mseg too small */
			continue;

		/*
		 * trim off kernel cage pages from pfn range and check for
		 * a trimmed pfn range returned that does not span the
		 * desired large page size.
		 */
		if (kcage_on) {
			if (trimkcage(mseg, &lo, &hi, pfnlo, pfnhi) == 0 ||
			    lo >= hi || ((hi - lo) + 1) < szcpgcnt)
				continue;
		} else {
			lo = MAX(pfnlo, mseg->pages_base);
			hi = MIN(pfnhi, (mseg->pages_end - 1));
		}

		/* round to szcpgcnt boundaries */
		lo = P2ROUNDUP(lo, szcpgcnt);

		MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it);
		hi = P2ALIGN((hi + 1), szcpgcnt) - 1;

		if (hi <= lo)
			continue;

		/*
		 * set lo to point to the pfn for the desired bin. Large
		 * page sizes may only have a single page color
		 */
		skip = szcpgcnt;
		if (ceq_mask > 0 || interleaved_mnodes) {
			/* set lo to point at appropriate color */
			if (((PFN_2_COLOR(lo, szc, &it) ^ bin) & ceq_mask) ||
			    (interleaved_mnodes &&
			    PFN_2_MEM_NODE(lo) != mnode)) {
				PAGE_NEXT_PFN_FOR_COLOR(lo, szc, bin, ceq_mask,
				    color_mask, &it);
			}
			if (hi <= lo)
				/* mseg cannot satisfy color request */
				continue;
		}

		/* randomly choose a point between lo and hi to begin search */

		randpfn = (pfn_t)GETTICK();
		randpfn = ((randpfn % (hi - lo)) + lo) & ~(skip - 1);
		MEM_NODE_ITERATOR_INIT(randpfn, mnode, szc, &it);
		if (ceq_mask || interleaved_mnodes || randpfn == (pfn_t)-1) {
			if (randpfn != (pfn_t)-1) {
				PAGE_NEXT_PFN_FOR_COLOR(randpfn, szc, bin,
				    ceq_mask, color_mask, &it);
			}
			/* also taken when randpfn is (pfn_t)-1 */
			if (randpfn >= hi) {
				randpfn = lo;
				MEM_NODE_ITERATOR_INIT(randpfn, mnode, szc,
				    &it);
			}
		}
		randpp = mseg->pages + (randpfn - mseg->pages_base);

		ASSERT(randpp->p_pagenum == randpfn);

		pp = randpp;
		endpp =  mseg->pages + (hi - mseg->pages_base) + 1;

		ASSERT(randpp + szcpgcnt <= endpp);

		/*
		 * Walk the candidates from the random start, wrapping to
		 * 'lo' at the end, until we are back where we started.
		 */
		do {
			ASSERT(!(pp->p_pagenum & szcpgmask));
			ASSERT(((PP_2_BIN(pp) ^ bin) & ceq_mask) == 0);

			if (page_trylock_contig_pages(mnode, pp, szc, flags)) {
				/* pages unlocked by page_claim on failure */
				if (page_claim_contig_pages(pp, szc, flags)) {
					memsegs_unlock(0);
					return (pp);
				}
			}

			if (ceq_mask == 0 && !interleaved_mnodes) {
				pp += skip;
			} else {
				pfn_t pfn = pp->p_pagenum;

				PAGE_NEXT_PFN_FOR_COLOR(pfn, szc, bin,
				    ceq_mask, color_mask, &it);
				if (pfn == (pfn_t)-1) {
					/* no further candidate; force wrap */
					pp = endpp;
				} else {
					pp = mseg->pages +
					    (pfn - mseg->pages_base);
				}
			}
			if (pp >= endpp) {
				/* start from the beginning */
				MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it);
				pp = mseg->pages + (lo - mseg->pages_base);
				ASSERT(pp->p_pagenum == lo);
				ASSERT(pp + szcpgcnt <= endpp);
			}
		} while (pp != randpp);
	}
	memsegs_unlock(0);
	return (NULL);
}


/*
 * controlling routine that searches through physical memory in an attempt to
 * claim a large page based on the input parameters, rather than taking one
 * from the page free lists.
 *
 * calls page_geti_contig_pages with an initial pfn range from the mnode
 * and mtype. page_geti_contig_pages will trim off the parts of the pfn range
 * that overlaps with the kernel cage or does not match the requested page
 * color if PG_MATCH_COLOR is set.  Since this search is very expensive,
 * page_geti_contig_pages may further limit the search range based on
 * previous failure counts (pgcpfailcnt[]).
 *
 * for PGI_PGCPSZC0 requests, page_get_contig_pages will relocate a base
 * pagesize page that satisfies mtype.
3624 */ 3625 page_t * 3626 page_get_contig_pages(int mnode, uint_t bin, int mtype, uchar_t szc, 3627 uint_t flags) 3628 { 3629 pfn_t pfnlo, pfnhi; /* contig pages pfn range */ 3630 page_t *pp; 3631 pgcnt_t pfnflag = 0; /* no limit on search if 0 */ 3632 3633 VM_STAT_ADD(vmm_vmstats.pgcp_alloc[szc]); 3634 3635 /* no allocations from cage */ 3636 flags |= PGI_NOCAGE; 3637 3638 /* LINTED */ 3639 MTYPE_START(mnode, mtype, flags); 3640 if (mtype < 0) { /* mnode does not have memory in mtype range */ 3641 VM_STAT_ADD(vmm_vmstats.pgcp_allocempty[szc]); 3642 return (NULL); 3643 } 3644 3645 ASSERT(szc > 0 || (flags & PGI_PGCPSZC0)); 3646 3647 /* do not limit search and ignore color if hi pri */ 3648 3649 if (pgcplimitsearch && ((flags & PGI_PGCPHIPRI) == 0)) 3650 pfnflag = pgcpfailcnt[szc]; 3651 3652 /* remove color match to improve chances */ 3653 3654 if (flags & PGI_PGCPHIPRI || pfnflag) 3655 flags &= ~PG_MATCH_COLOR; 3656 3657 do { 3658 /* get pfn range based on mnode and mtype */ 3659 MNODETYPE_2_PFN(mnode, mtype, pfnlo, pfnhi); 3660 3661 ASSERT(pfnhi >= pfnlo); 3662 3663 pp = page_geti_contig_pages(mnode, bin, szc, flags, 3664 pfnlo, pfnhi, pfnflag); 3665 3666 if (pp != NULL) { 3667 pfnflag = pgcpfailcnt[szc]; 3668 if (pfnflag) { 3669 /* double the search size */ 3670 pgcpfailcnt[szc] = pfnflag >> 1; 3671 } 3672 VM_STAT_ADD(vmm_vmstats.pgcp_allocok[szc]); 3673 return (pp); 3674 } 3675 MTYPE_NEXT(mnode, mtype, flags); 3676 } while (mtype >= 0); 3677 3678 VM_STAT_ADD(vmm_vmstats.pgcp_allocfailed[szc]); 3679 return (NULL); 3680 } 3681 3682 #if defined(__i386) || defined(__amd64) 3683 /* 3684 * Determine the likelihood of finding/coalescing a szc page. 3685 * Return 0 if the likelihood is small otherwise return 1. 3686 * 3687 * For now, be conservative and check only 1g pages and return 0 3688 * if there had been previous coalescing failures and the szc pages 3689 * needed to satisfy request would exhaust most of freemem. 
 */
int
page_chk_freelist(uint_t szc)
{
	pgcnt_t		pgcnt;

	/* size codes 0 and 1 are never denied */
	if (szc <= 1)
		return (1);

	pgcnt = page_get_pagecnt(szc);
	/*
	 * Deny only when contig allocation of this size has failed before
	 * and the request would push freemem down to throttlefree.
	 */
	if (pgcpfailcnt[szc] && pgcnt + throttlefree >= freemem) {
		VM_STAT_ADD(vmm_vmstats.pcf_deny[szc]);
		return (0);
	}
	VM_STAT_ADD(vmm_vmstats.pcf_allow[szc]);
	return (1);
}
#endif

/*
 * Find the `best' page on the freelist for this (vp,off) (as,vaddr) pair.
 *
 * Does its own locking and accounting.
 * If PG_MATCH_COLOR is set, then NULL will be returned if there are no
 * pages of the proper color even if there are pages of a different color.
 *
 * Finds a page, removes it, THEN locks it.
 *
 * 'size' is converted to a page size code with page_szc(); an illegal size
 * panics.  'lgrp' selects where the search starts; if it does not exist the
 * current thread's home lgroup is used.  Returns the page, or NULL if none
 * could be allocated.
 */

/*ARGSUSED*/
page_t *
page_get_freelist(struct vnode *vp, u_offset_t off, struct seg *seg,
	caddr_t vaddr, size_t size, uint_t flags, struct lgrp *lgrp)
{
	struct as	*as = seg->s_as;
	page_t		*pp = NULL;
	ulong_t		bin;
	uchar_t		szc;
	int		mnode;
	int		mtype;
	/* allocator used per mnode; switched to contig pages on retry */
	page_t		*(*page_get_func)(int, uint_t, int, uchar_t, uint_t);
	lgrp_mnode_cookie_t	lgrp_cookie;

	page_get_func = page_get_mnode_freelist;

	/*
	 * If we aren't passed a specific lgroup, or passed a freed lgrp
	 * assume we wish to allocate near to the current thread's home.
	 */
	if (!LGRP_EXISTS(lgrp))
		lgrp = lgrp_home_lgrp();

	if (kcage_on) {
		if ((flags & (PG_NORELOC | PG_PANIC)) == PG_NORELOC &&
		    kcage_freemem < kcage_throttlefree + btop(size) &&
		    curthread != kcage_cageout_thread) {
			/*
			 * Set a "reserve" of kcage_throttlefree pages for
			 * PG_PANIC and cageout thread allocations.
			 *
			 * Everybody else has to serialize in
			 * page_create_get_something() to get a cage page, so
			 * that we don't deadlock cageout!
			 */
			return (NULL);
		}
	} else {
		/* no cage: PG_NORELOC has no meaning, avoid cage lookups */
		flags &= ~PG_NORELOC;
		flags |= PGI_NOCAGE;
	}

	/* LINTED */
	MTYPE_INIT(mtype, vp, vaddr, flags, size);

	/*
	 * Convert size to page size code.
	 */
	if ((szc = page_szc(size)) == (uchar_t)-1)
		panic("page_get_freelist: illegal page size request");
	ASSERT(szc < mmu_page_sizes);

	VM_STAT_ADD(vmm_vmstats.pgf_alloc[szc]);

	/* LINTED */
	AS_2_BIN(as, seg, vp, vaddr, bin, szc);

	ASSERT(bin < PAGE_GET_PAGECOLORS(szc));

	/*
	 * Try to get a local page first, but try remote if we can't
	 * get a page of the right color.
	 */
pgretry:
	LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL);
	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
		pp = page_get_func(mnode, bin, mtype, szc, flags);
		if (pp != NULL) {
			VM_STAT_ADD(vmm_vmstats.pgf_allocok[szc]);
			DTRACE_PROBE4(page__get,
			    lgrp_t *, lgrp,
			    int, mnode,
			    ulong_t, bin,
			    uint_t, flags);
			return (pp);
		}
	}
	ASSERT(pp == NULL);

	/*
	 * for non-SZC0 PAGESIZE requests, check cachelist before checking
	 * remote free lists.  Caller expected to call page_get_cachelist which
	 * will check local cache lists and remote free lists.
	 */
	if (szc == 0 && ((flags & PGI_PGCPSZC0) == 0)) {
		VM_STAT_ADD(vmm_vmstats.pgf_allocdeferred);
		return (NULL);
	}

	ASSERT(szc > 0 || (flags & PGI_PGCPSZC0));

	lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);

	if (!(flags & PG_LOCAL)) {
		/*
		 * Try to get a non-local freelist page.
		 */
		LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie);
		while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
			pp = page_get_func(mnode, bin, mtype, szc, flags);
			if (pp != NULL) {
				DTRACE_PROBE4(page__get,
				    lgrp_t *, lgrp,
				    int, mnode,
				    ulong_t, bin,
				    uint_t, flags);
				VM_STAT_ADD(vmm_vmstats.pgf_allocokrem[szc]);
				return (pp);
			}
		}
		ASSERT(pp == NULL);
	}

	/*
	 * when the cage is off chances are page_get_contig_pages() will fail
	 * to lock a large page chunk therefore when the cage is off it's not
	 * called by default.  this can be changed via /etc/system.
	 *
	 * page_get_contig_pages() also called to acquire a base pagesize page
	 * for page_create_get_something().
	 */
	if (!(flags & PG_NORELOC) && (pg_contig_disable == 0) &&
	    (kcage_on || pg_lpgcreate_nocage || szc == 0) &&
	    (page_get_func != page_get_contig_pages)) {

		/* second and final pass: same search, contig allocator */
		VM_STAT_ADD(vmm_vmstats.pgf_allocretry[szc]);
		page_get_func = page_get_contig_pages;
		goto pgretry;
	}

	/* record the contig failure so later searches can be limited */
	if (!(flags & PG_LOCAL) && pgcplimitsearch &&
	    page_get_func == page_get_contig_pages)
		SETPGCPFAILCNT(szc);

	VM_STAT_ADD(vmm_vmstats.pgf_allocfailed[szc]);
	return (NULL);
}

/*
 * Find the `best' page on the cachelist for this (vp,off) (as,vaddr) pair.
 *
 * Does its own locking.
 * If PG_MATCH_COLOR is set, then NULL will be returned if there are no
 * pages of the proper color even if there are pages of a different color.
 * Otherwise, scan the bins for ones with pages.  For each bin with pages,
 * try to lock one of them.  If no page can be locked, try the
 * next bin.  Return NULL if a page can not be found and locked.
 *
 * Finds a page, tries to lock it, then removes it.
 */

/*ARGSUSED*/
page_t *
page_get_cachelist(struct vnode *vp, u_offset_t off, struct seg *seg,
    caddr_t vaddr, uint_t flags, struct lgrp *lgrp)
{
	page_t		*pp;
	struct as	*as = seg->s_as;
	ulong_t		bin;
	/*LINTED*/
	int		mnode;
	int		mtype;
	lgrp_mnode_cookie_t	lgrp_cookie;

	/*
	 * If we aren't passed a specific lgroup, or passed a freed lgrp
	 * assume we wish to allocate near to the current thread's home.
	 */
	if (!LGRP_EXISTS(lgrp))
		lgrp = lgrp_home_lgrp();

	if (!kcage_on) {
		flags &= ~PG_NORELOC;
		flags |= PGI_NOCAGE;
	}

	if ((flags & (PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == PG_NORELOC &&
	    kcage_freemem <= kcage_throttlefree) {
		/*
		 * Reserve kcage_throttlefree pages for critical kernel
		 * threads.
		 *
		 * Everybody else has to go to page_create_get_something()
		 * to get a cage page, so we don't deadlock cageout.
		 */
		return (NULL);
	}

	/* cachelist pages are always base pagesize (szc 0) */
	/* LINTED */
	AS_2_BIN(as, seg, vp, vaddr, bin, 0);

	ASSERT(bin < PAGE_GET_PAGECOLORS(0));

	/* LINTED */
	MTYPE_INIT(mtype, vp, vaddr, flags, MMU_PAGESIZE);

	VM_STAT_ADD(vmm_vmstats.pgc_alloc);

	/*
	 * Try local cachelists first
	 */
	LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL);
	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
		pp = page_get_mnode_cachelist(bin, flags, mnode, mtype);
		if (pp != NULL) {
			VM_STAT_ADD(vmm_vmstats.pgc_allocok);
			DTRACE_PROBE4(page__get,
			    lgrp_t *, lgrp,
			    int, mnode,
			    ulong_t, bin,
			    uint_t, flags);
			return (pp);
		}
	}

	lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);

	/*
	 * Try freelists/cachelists that are farther away
	 * This is our only chance to allocate remote pages for PAGESIZE
	 * requests.
	 */
	LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie);
	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
		/* on each remote mnode prefer a free page to a cached one */
		pp = page_get_mnode_freelist(mnode, bin, mtype,
		    0, flags);
		if (pp != NULL) {
			VM_STAT_ADD(vmm_vmstats.pgc_allocokdeferred);
			DTRACE_PROBE4(page__get,
			    lgrp_t *, lgrp,
			    int, mnode,
			    ulong_t, bin,
			    uint_t, flags);
			return (pp);
		}
		pp = page_get_mnode_cachelist(bin, flags, mnode, mtype);
		if (pp != NULL) {
			VM_STAT_ADD(vmm_vmstats.pgc_allocokrem);
			DTRACE_PROBE4(page__get,
			    lgrp_t *, lgrp,
			    int, mnode,
			    ulong_t, bin,
			    uint_t, flags);
			return (pp);
		}
	}

	VM_STAT_ADD(vmm_vmstats.pgc_allocfailed);
	return (NULL);
}

/*
 * Search the cachelists of memory node 'mnode' for a page, starting at
 * color 'bin' and walking the equivalent colors set up by
 * page_list_walk_init(), and moving on to further mtypes via MTYPE_NEXT.
 *
 * On success the page has been locked SE_EXCL with page_trylock(), removed
 * from its cachelist and had its counters adjusted; NULL is returned if no
 * page could be locked.
 */
page_t *
page_get_mnode_cachelist(uint_t bin, uint_t flags, int mnode, int mtype)
{
	kmutex_t		*pcm;
	page_t			*pp, *first_pp;
	uint_t			sbin;
	int			plw_initialized;
	page_list_walker_t	plw;

	VM_STAT_ADD(vmm_vmstats.pgmc_alloc);

	/* LINTED */
	MTYPE_START(mnode, mtype, flags);
	if (mtype < 0) {	/* mnode does not have memory in mtype range */
		VM_STAT_ADD(vmm_vmstats.pgmc_allocempty);
		return (NULL);
	}

try_again:

	/*
	 * The walker is initialized lazily, only once the first bin has
	 * come up empty; until then make the outer loop run exactly once.
	 */
	plw_initialized = 0;
	plw.plw_ceq_dif = 1;

	/*
	 * Only hold one cachelist lock at a time, that way we
	 * can start anywhere and not have to worry about lock
	 * ordering.
	 */

	for (plw.plw_count = 0;
	    plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) {
		sbin = bin;
		do {

			/* unlocked peek to skip empty bins cheaply */
			if (!PAGE_CACHELISTS(mnode, bin, mtype))
				goto bin_empty_1;
			pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST);
			mutex_enter(pcm);
			/* re-read under the lock; the bin may have drained */
			pp = PAGE_CACHELISTS(mnode, bin, mtype);
			if (pp == NULL)
				goto bin_empty_0;

			first_pp = pp;
			ASSERT(pp->p_vnode);
			ASSERT(PP_ISAGED(pp) == 0);
			ASSERT(pp->p_szc == 0);
			ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
			while (!page_trylock(pp, SE_EXCL)) {
				pp = pp->p_next;
				ASSERT(pp->p_szc == 0);
				if (pp == first_pp) {
					/*
					 * We have searched the complete list!
					 * And all of them (might only be one)
					 * are locked. This can happen since
					 * these pages can also be found via
					 * the hash list. When found via the
					 * hash list, they are locked first,
					 * then removed. We give up to let the
					 * other thread run.
					 */
					pp = NULL;
					break;
				}
				ASSERT(pp->p_vnode);
				ASSERT(PP_ISFREE(pp));
				ASSERT(PP_ISAGED(pp) == 0);
				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) ==
				    mnode);
			}

			if (pp) {
				page_t	**ppp;
				/*
				 * Found and locked a page.
				 * Pull it off the list.
				 */
				ASSERT(mtype == PP_2_MTYPE(pp));
				ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
				page_sub(ppp, pp);
				/*
				 * Subtract counters before releasing pcm mutex
				 * to avoid a race with page_freelist_coalesce
				 * and page_freelist_split.
				 */
				page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST);
				mutex_exit(pcm);
				ASSERT(pp->p_vnode);
				ASSERT(PP_ISAGED(pp) == 0);
#if defined(__sparc)
				ASSERT(!kcage_on ||
				    (flags & PG_NORELOC) == 0 ||
				    PP_ISNORELOC(pp));
				if (PP_ISNORELOC(pp)) {
					kcage_freemem_sub(1);
				}
#endif
				VM_STAT_ADD(vmm_vmstats.
				    pgmc_allocok);
				return (pp);
			}
bin_empty_0:
			mutex_exit(pcm);
bin_empty_1:
			if (plw_initialized == 0) {
				page_list_walk_init(0, flags, bin, 0, 1, &plw);
				plw_initialized = 1;
			}
			/* calculate the next bin with equivalent color */
			bin = ADD_MASKED(bin, plw.plw_bin_step,
			    plw.plw_ceq_mask[0], plw.plw_color_mask);
		} while (sbin != bin);

		if (plw.plw_ceq_dif > 1)
			bin = page_list_walk_next_bin(0, bin, &plw);
	}

	/* nothing in this mtype; move on to the next one, if any */
	MTYPE_NEXT(mnode, mtype, flags);
	if (mtype >= 0)
		goto try_again;

	VM_STAT_ADD(vmm_vmstats.pgmc_allocfailed);
	return (NULL);
}

/* replacement page statistics are only collected in DEBUG kernels */
#ifdef DEBUG
#define	REPL_PAGE_STATS
#endif /* DEBUG */

#ifdef REPL_PAGE_STATS
/* event counters bumped by page_get_replacement_page via REPL_STAT_INCR */
struct repl_page_stats {
	uint_t	ngets;
	uint_t	ngets_noreloc;
	uint_t	npgr_noreloc;
	uint_t	nnopage_first;
	uint_t	nnopage;
	uint_t	nhashout;
	uint_t	nnofree;
	uint_t	nnext_pp;
} repl_page_stats;
#define	REPL_STAT_INCR(v)	atomic_add_32(&repl_page_stats.v, 1)
#else /* REPL_PAGE_STATS */
#define	REPL_STAT_INCR(v)
#endif /* REPL_PAGE_STATS */

/*
 * if set, page_get_replacement_page tries page_get_contig_pages even when
 * PGR_SAMESZC was not requested.
 */
int	pgrppgcp;

/*
 * The freemem accounting must be done by the caller.
 * First we try to get a replacement page of the same size as like_pp,
 * if that is not possible, then we just get a set of discontiguous
 * PAGESIZE pages.
4121 */ 4122 page_t * 4123 page_get_replacement_page(page_t *orig_like_pp, struct lgrp *lgrp_target, 4124 uint_t pgrflags) 4125 { 4126 page_t *like_pp; 4127 page_t *pp, *pplist; 4128 page_t *pl = NULL; 4129 ulong_t bin; 4130 int mnode, page_mnode; 4131 int szc; 4132 spgcnt_t npgs, pg_cnt; 4133 pfn_t pfnum; 4134 int mtype; 4135 int flags = 0; 4136 lgrp_mnode_cookie_t lgrp_cookie; 4137 lgrp_t *lgrp; 4138 4139 REPL_STAT_INCR(ngets); 4140 like_pp = orig_like_pp; 4141 ASSERT(PAGE_EXCL(like_pp)); 4142 4143 szc = like_pp->p_szc; 4144 npgs = page_get_pagecnt(szc); 4145 /* 4146 * Now we reset like_pp to the base page_t. 4147 * That way, we won't walk past the end of this 'szc' page. 4148 */ 4149 pfnum = PFN_BASE(like_pp->p_pagenum, szc); 4150 like_pp = page_numtopp_nolock(pfnum); 4151 ASSERT(like_pp->p_szc == szc); 4152 4153 if (PP_ISNORELOC(like_pp)) { 4154 ASSERT(kcage_on); 4155 REPL_STAT_INCR(ngets_noreloc); 4156 flags = PGI_RELOCONLY; 4157 } else if (pgrflags & PGR_NORELOC) { 4158 ASSERT(kcage_on); 4159 REPL_STAT_INCR(npgr_noreloc); 4160 flags = PG_NORELOC; 4161 } 4162 4163 /* 4164 * Kernel pages must always be replaced with the same size 4165 * pages, since we cannot properly handle demotion of kernel 4166 * pages. 4167 */ 4168 if (PP_ISKAS(like_pp)) 4169 pgrflags |= PGR_SAMESZC; 4170 4171 /* LINTED */ 4172 MTYPE_PGR_INIT(mtype, flags, like_pp, page_mnode, npgs); 4173 4174 while (npgs) { 4175 pplist = NULL; 4176 for (;;) { 4177 pg_cnt = page_get_pagecnt(szc); 4178 bin = PP_2_BIN(like_pp); 4179 ASSERT(like_pp->p_szc == orig_like_pp->p_szc); 4180 ASSERT(pg_cnt <= npgs); 4181 4182 /* 4183 * If an lgroup was specified, try to get the 4184 * page from that lgroup. 4185 * NOTE: Must be careful with code below because 4186 * lgroup may disappear and reappear since there 4187 * is no locking for lgroup here. 
4188 */ 4189 if (LGRP_EXISTS(lgrp_target)) { 4190 /* 4191 * Keep local variable for lgroup separate 4192 * from lgroup argument since this code should 4193 * only be exercised when lgroup argument 4194 * exists.... 4195 */ 4196 lgrp = lgrp_target; 4197 4198 /* Try the lgroup's freelists first */ 4199 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, 4200 LGRP_SRCH_LOCAL); 4201 while ((pplist == NULL) && 4202 (mnode = lgrp_memnode_choose(&lgrp_cookie)) 4203 != -1) { 4204 pplist = 4205 page_get_mnode_freelist(mnode, bin, 4206 mtype, szc, flags); 4207 } 4208 4209 /* 4210 * Now try it's cachelists if this is a 4211 * small page. Don't need to do it for 4212 * larger ones since page_freelist_coalesce() 4213 * already failed. 4214 */ 4215 if (pplist != NULL || szc != 0) 4216 break; 4217 4218 /* Now try it's cachelists */ 4219 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, 4220 LGRP_SRCH_LOCAL); 4221 4222 while ((pplist == NULL) && 4223 (mnode = lgrp_memnode_choose(&lgrp_cookie)) 4224 != -1) { 4225 pplist = 4226 page_get_mnode_cachelist(bin, flags, 4227 mnode, mtype); 4228 } 4229 if (pplist != NULL) { 4230 page_hashout(pplist, NULL); 4231 PP_SETAGED(pplist); 4232 REPL_STAT_INCR(nhashout); 4233 break; 4234 } 4235 /* Done looking in this lgroup. Bail out. */ 4236 break; 4237 } 4238 4239 /* 4240 * No lgroup was specified (or lgroup was removed by 4241 * DR, so just try to get the page as close to 4242 * like_pp's mnode as possible. 4243 * First try the local freelist... 4244 */ 4245 mnode = PP_2_MEM_NODE(like_pp); 4246 pplist = page_get_mnode_freelist(mnode, bin, 4247 mtype, szc, flags); 4248 if (pplist != NULL) 4249 break; 4250 4251 REPL_STAT_INCR(nnofree); 4252 4253 /* 4254 * ...then the local cachelist. Don't need to do it for 4255 * larger pages cause page_freelist_coalesce() already 4256 * failed there anyway. 
4257 */ 4258 if (szc == 0) { 4259 pplist = page_get_mnode_cachelist(bin, flags, 4260 mnode, mtype); 4261 if (pplist != NULL) { 4262 page_hashout(pplist, NULL); 4263 PP_SETAGED(pplist); 4264 REPL_STAT_INCR(nhashout); 4265 break; 4266 } 4267 } 4268 4269 /* Now try remote freelists */ 4270 page_mnode = mnode; 4271 lgrp = 4272 lgrp_hand_to_lgrp(MEM_NODE_2_LGRPHAND(page_mnode)); 4273 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, 4274 LGRP_SRCH_HIER); 4275 while (pplist == NULL && 4276 (mnode = lgrp_memnode_choose(&lgrp_cookie)) 4277 != -1) { 4278 /* 4279 * Skip local mnode. 4280 */ 4281 if ((mnode == page_mnode) || 4282 (mem_node_config[mnode].exists == 0)) 4283 continue; 4284 4285 pplist = page_get_mnode_freelist(mnode, 4286 bin, mtype, szc, flags); 4287 } 4288 4289 if (pplist != NULL) 4290 break; 4291 4292 4293 /* Now try remote cachelists */ 4294 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, 4295 LGRP_SRCH_HIER); 4296 while (pplist == NULL && szc == 0) { 4297 mnode = lgrp_memnode_choose(&lgrp_cookie); 4298 if (mnode == -1) 4299 break; 4300 /* 4301 * Skip local mnode. 4302 */ 4303 if ((mnode == page_mnode) || 4304 (mem_node_config[mnode].exists == 0)) 4305 continue; 4306 4307 pplist = page_get_mnode_cachelist(bin, 4308 flags, mnode, mtype); 4309 4310 if (pplist != NULL) { 4311 page_hashout(pplist, NULL); 4312 PP_SETAGED(pplist); 4313 REPL_STAT_INCR(nhashout); 4314 break; 4315 } 4316 } 4317 4318 /* 4319 * Break out of while loop under the following cases: 4320 * - If we successfully got a page. 4321 * - If pgrflags specified only returning a specific 4322 * page size and we could not find that page size. 4323 * - If we could not satisfy the request with PAGESIZE 4324 * or larger pages. 
4325 */ 4326 if (pplist != NULL || szc == 0) 4327 break; 4328 4329 if ((pgrflags & PGR_SAMESZC) || pgrppgcp) { 4330 /* try to find contig page */ 4331 4332 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, 4333 LGRP_SRCH_HIER); 4334 4335 while ((pplist == NULL) && 4336 (mnode = 4337 lgrp_memnode_choose(&lgrp_cookie)) 4338 != -1) { 4339 pplist = page_get_contig_pages( 4340 mnode, bin, mtype, szc, 4341 flags | PGI_PGCPHIPRI); 4342 } 4343 break; 4344 } 4345 4346 /* 4347 * The correct thing to do here is try the next 4348 * page size down using szc--. Due to a bug 4349 * with the processing of HAT_RELOAD_SHARE 4350 * where the sfmmu_ttecnt arrays of all 4351 * hats sharing an ISM segment don't get updated, 4352 * using intermediate size pages for relocation 4353 * can lead to continuous page faults. 4354 */ 4355 szc = 0; 4356 } 4357 4358 if (pplist != NULL) { 4359 DTRACE_PROBE4(page__get, 4360 lgrp_t *, lgrp, 4361 int, mnode, 4362 ulong_t, bin, 4363 uint_t, flags); 4364 4365 while (pplist != NULL && pg_cnt--) { 4366 ASSERT(pplist != NULL); 4367 pp = pplist; 4368 page_sub(&pplist, pp); 4369 PP_CLRFREE(pp); 4370 PP_CLRAGED(pp); 4371 page_list_concat(&pl, &pp); 4372 npgs--; 4373 like_pp = like_pp + 1; 4374 REPL_STAT_INCR(nnext_pp); 4375 } 4376 ASSERT(pg_cnt == 0); 4377 } else { 4378 break; 4379 } 4380 } 4381 4382 if (npgs) { 4383 /* 4384 * We were unable to allocate the necessary number 4385 * of pages. 4386 * We need to free up any pl. 
4387 */ 4388 REPL_STAT_INCR(nnopage); 4389 page_free_replacement_page(pl); 4390 return (NULL); 4391 } else { 4392 return (pl); 4393 } 4394 } 4395 4396 /* 4397 * demote a free large page to it's constituent pages 4398 */ 4399 void 4400 page_demote_free_pages(page_t *pp) 4401 { 4402 4403 int mnode; 4404 4405 ASSERT(pp != NULL); 4406 ASSERT(PAGE_LOCKED(pp)); 4407 ASSERT(PP_ISFREE(pp)); 4408 ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes); 4409 4410 mnode = PP_2_MEM_NODE(pp); 4411 page_freelist_lock(mnode); 4412 if (pp->p_szc != 0) { 4413 (void) page_demote(mnode, PFN_BASE(pp->p_pagenum, 4414 pp->p_szc), 0, pp->p_szc, 0, PC_NO_COLOR, PC_FREE); 4415 } 4416 page_freelist_unlock(mnode); 4417 ASSERT(pp->p_szc == 0); 4418 } 4419 4420 /* 4421 * Factor in colorequiv to check additional 'equivalent' bins. 4422 * colorequiv may be set in /etc/system 4423 */ 4424 void 4425 page_set_colorequiv_arr(void) 4426 { 4427 if (colorequiv > 1) { 4428 int i; 4429 uint_t sv_a = lowbit(colorequiv) - 1; 4430 4431 if (sv_a > 15) 4432 sv_a = 15; 4433 4434 for (i = 0; i < MMU_PAGE_SIZES; i++) { 4435 uint_t colors; 4436 uint_t a = sv_a; 4437 4438 if ((colors = hw_page_array[i].hp_colors) <= 1) { 4439 continue; 4440 } 4441 while ((colors >> a) == 0) 4442 a--; 4443 if ((a << 4) > colorequivszc[i]) { 4444 colorequivszc[i] = (a << 4); 4445 } 4446 } 4447 } 4448 } 4449