/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989  AT&T	*/
/*	  All Rights Reserved	*/

/*
 * Portions of this source code were derived from Berkeley 4.3 BSD
 * under license from the Regents of the University of California.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * This file contains common functions to access and manage the page lists.
 * Many of these routines originated from platform dependent modules
 * (sun4/vm/vm_dep.c, i86pc/vm/vm_machdep.c) and were modified to function
 * in a platform independent manner.
 *
 * vm/vm_dep.h provides for platform specific support.
 */

#include <sys/types.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/systm.h>
#include <sys/atomic.h>
#include <sys/sysmacros.h>
#include <vm/as.h>
#include <vm/page.h>
#include <vm/seg_kmem.h>
#include <vm/seg_vn.h>
#include <sys/memnode.h>
#include <vm/vm_dep.h>
#include <sys/lgrp.h>
#include <sys/mem_config.h>
#include <sys/callb.h>
#include <sys/mem_cage.h>
#include <sys/sdt.h>

extern uint_t vac_colors;

#define MAX_PRAGMA_ALIGN 128

/* vm_cpu_data0 for the boot cpu before kmem is initialized */

#if L2CACHE_ALIGN_MAX <= MAX_PRAGMA_ALIGN
#pragma align L2CACHE_ALIGN_MAX(vm_cpu_data0)
#else
#pragma align MAX_PRAGMA_ALIGN(vm_cpu_data0)
#endif
char vm_cpu_data0[VM_CPU_DATA_PADSIZE];

/*
 * Number of page colors equivalent to the requested color in the page_get
 * routines.  If set, keeps large pages intact longer and keeps MPO
 * allocation from the local mnode in favor of acquiring the 'correct'
 * page color from a demoted large page or from a remote mnode.
 */
uint_t colorequiv;

/*
 * Color equivalency mask for each page size.
 * The mask is computed based on cpu L2$ way sizes and the colorequiv global.
 * High 4 bits determine the number of high order bits of the color to ignore.
 * Low 4 bits determine the number of low order bits of the color to ignore
 * (this is only relevant for hashed index based page coloring).
 */
uchar_t colorequivszc[MMU_PAGE_SIZES];

/*
 * If set, specifies the percentage of large pages that are free from within
 * a large page region before attempting to lock those pages for
 * page_get_contig_pages processing.
 *
 * Should be turned on when kpr is available, since page_trylock_contig_pages
 * can then be more selective.
 */

int ptcpthreshold;
/*
 * Limit the page_get_contig_pages search based on the failure counts in
 * pgcpfailcnt[].  Enabled by default via pgcplimitsearch.
 *
 * pgcpfailcnt[] is bounded by PGCPFAILMAX (>= 1/2 of installed
 * memory).  When reached, pgcpfailcnt[] is reset to 1/2 of this upper
 * bound.  This upper bound range guarantees:
 *   - all large page 'slots' will be searched over time
 *   - a minimum of one large page candidate is considered on each pgcp call
 *   - the count doesn't wrap around to 0
 */
pgcnt_t pgcpfailcnt[MMU_PAGE_SIZES];
int pgcplimitsearch = 1;

#define PGCPFAILMAX		(1 << (highbit(physinstalled) - 1))
#define SETPGCPFAILCNT(szc)					\
	if (++pgcpfailcnt[szc] >= PGCPFAILMAX)			\
		pgcpfailcnt[szc] = PGCPFAILMAX / 2;

#ifdef VM_STATS
struct vmm_vmstats_str vmm_vmstats;

#endif /* VM_STATS */

#if defined(__sparc)
#define LPGCREATE	0
#else
/* enable page_get_contig_pages */
#define LPGCREATE	1
#endif

int pg_contig_disable;
int pg_lpgcreate_nocage = LPGCREATE;

/*
 * page_freelist_split pfn flag to signify no hi pfn requirement.
 */
#define PFNNULL		0

/* Flags involved in promotion and demotion routines */
#define PC_FREE		0x1	/* put page on freelist */
#define PC_ALLOC	0x2	/* return page for allocation */

/*
 * Flag for page_demote to be used with PC_FREE to denote that we don't care
 * what the color is as the color parameter to the function is ignored.
 */
#define PC_NO_COLOR	(-1)

/* mtype value for page_promote to use when mtype does not matter */
#define PC_MTYPE_ANY	(-1)

/*
 * Page counter candidates info.
 * See the page_ctrs_cands comment below for more details.
 * Fields are as follows:
 *	pcc_pages_free:		# pages which freelist coalesce can create
 *	pcc_color_free:		pointer to page free counts per color
 */
typedef struct pcc_info {
	pgcnt_t	pcc_pages_free;
	pgcnt_t	*pcc_color_free;
} pcc_info_t;

/*
 * On big machines it can take a long time to check page_counters
 * arrays.  page_ctrs_cands is a summary array whose elements are a
 * dynamically updated sum of all elements of the corresponding
 * page_counters arrays.  page_freelist_coalesce() searches page_counters
 * only if an appropriate element of the page_ctrs_cands array is greater
 * than 0.
 *
 * page_ctrs_cands is indexed by mutex (i), region (r), mnode (m), mrange (g)
 */
pcc_info_t **page_ctrs_cands[NPC_MUTEX][MMU_PAGE_SIZES];

/*
 * Return in val the total number of free pages which can be created
 * for the given mnode (m), mrange (g), and region size (r)
 */
#define PGCTRS_CANDS_GETVALUE(m, g, r, val) {				\
	int i;								\
	val = 0;							\
	for (i = 0; i < NPC_MUTEX; i++) {				\
	    val += page_ctrs_cands[i][(r)][(m)][(g)].pcc_pages_free;	\
	}								\
}

/*
 * Return in val the total number of free pages which can be created
 * for the given mnode (m), mrange (g), region size (r), and color (c)
 */
#define PGCTRS_CANDS_GETVALUECOLOR(m, g, r, c, val) {			\
	int i;								\
	val = 0;							\
	ASSERT((c) < PAGE_GET_PAGECOLORS(r));				\
	for (i = 0; i < NPC_MUTEX; i++) {				\
	    val +=							\
		page_ctrs_cands[i][(r)][(m)][(g)].pcc_color_free[(c)];	\
	}								\
}
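/*
 * Illustrative sketch (not part of the build): a caller such as
 * page_freelist_coalesce() can consult this summary before paying for a
 * full page_counters walk.  The local variable below is assumed for the
 * example, not existing code:
 *
 *	pgcnt_t cands;
 *
 *	PGCTRS_CANDS_GETVALUE(mnode, mrange, szc, cands);
 *	if (cands == 0)
 *		return (NULL);	[ no promotable region, skip the scan ]
 */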
/*
 * We can only allow a single thread to update a counter within the physical
 * range of the largest supported page size.  That is the finest granularity
 * possible since the counter values are dependent on each other
 * as you move across region sizes.  PP_CTR_LOCK_INDX is used to determine the
 * ctr_mutex lock index for a particular physical range.
 */
static kmutex_t *ctr_mutex[NPC_MUTEX];

#define PP_CTR_LOCK_INDX(pp)						\
	(((pp)->p_pagenum >>						\
	    (PAGE_BSZS_SHIFT(mmu_page_sizes - 1))) & (NPC_MUTEX - 1))

#define INVALID_COLOR	0xffffffff
#define INVALID_MASK	0xffffffff
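/*
 * Worked example of the lock hashing above (assumed figures, for
 * illustration only): with 8K base pages, a 4M maximum page size and
 * NPC_MUTEX == 4, PAGE_BSZS_SHIFT(mmu_page_sizes - 1) strips the
 * 512-page offset within a maximal region, so pfns 0x000-0x1ff hash to
 * lock index 0, pfns 0x200-0x3ff to index 1, and so on.  Every page
 * inside one maximal region therefore shares a ctr_mutex entry, which
 * is what makes the single-writer-per-range rule cheap to enforce.
 */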
/*
 * Local function prototypes.
 */

void page_ctr_add(int, int, page_t *, int);
void page_ctr_add_internal(int, int, page_t *, int);
void page_ctr_sub(int, int, page_t *, int);
void page_ctr_sub_internal(int, int, page_t *, int);
void page_freelist_lock(int);
void page_freelist_unlock(int);
page_t *page_promote(int, pfn_t, uchar_t, int, int);
page_t *page_demote(int, pfn_t, uchar_t, uchar_t, int, int);
page_t *page_freelist_split(uchar_t,
    uint_t, int, int, pfn_t, page_list_walker_t *);
page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int);
static int page_trylock_cons(page_t *pp, se_t se);

/*
 * The page_counters array below is used to keep track of free contiguous
 * physical memory.  A hw_page_map_t will be allocated per mnode per szc.
 * This contains an array of counters, the size of the array, a shift value
 * used to convert a pagenum into a counter array index or vice versa, as
 * well as a cache of the last successful index to be promoted to a larger
 * page size.  As an optimization, we keep track of the last successful index
 * to be promoted per page color for the given size region, and this is
 * allocated dynamically based upon the number of colors for a given
 * region size.
 *
 * Conceptually, the page counters are represented as:
 *
 *	page_counters[region_size][mnode]
 *
 *	region_size:	size code of a candidate larger page made up
 *			of contiguous free smaller pages.
 *
 *	page_counters[region_size][mnode].hpm_counters[index]:
 *		represents how many (region_size - 1) pages either
 *		exist or can be created within the given index range.
 *
 * Let's look at a sparc example:
 *	If we want to create a free 512k page, we look at region_size 2
 *	for the mnode we want.  We calculate the index and look at a
 *	specific hpm_counters location.  If we see 8 (FULL_REGION_CNT on
 *	sparc) at this location, it means that 8 64k pages either exist or
 *	can be created from 8K pages in order to make a single free 512k
 *	page at the given index.  Note that when a region is full, it will
 *	contribute to the counts in the region above it.  Thus we will not
 *	know what page size the free pages will be which can be promoted
 *	to this new free page unless we look at all regions below the
 *	current region.
 */

/*
 * Note: hpmctr_t is defined in platform vm_dep.h
 * hw_page_map_t contains all the information needed for the page_counters
 * logic.  The fields are as follows:
 *
 *	hpm_counters:	dynamically allocated array to hold counter data
 *	hpm_entries:	entries in hpm_counters
 *	hpm_shift:	shift for pfn / array index conversion
 *	hpm_base:	PFN mapped to counter index 0
 *	hpm_color_current:	last index in counter array for this color at
 *				which we successfully created a large page
 */
typedef struct hw_page_map {
	hpmctr_t	*hpm_counters;
	size_t		hpm_entries;
	int		hpm_shift;
	pfn_t		hpm_base;
	size_t		*hpm_color_current[MAX_MNODE_MRANGES];
} hw_page_map_t;

/*
 * Element zero is not used, but is allocated for convenience.
 */
static hw_page_map_t *page_counters[MMU_PAGE_SIZES];

/*
 * Cached value of MNODE_RANGE_CNT(mnode).
 * This is a function call on x86.
 */
static int mnode_nranges[MAX_MEM_NODES];
static int mnode_maxmrange[MAX_MEM_NODES];

/*
 * The following macros are convenient ways to get access to the individual
 * elements of the page_counters arrays.  They can be used on both
 * the left side and right side of equations.
 */
#define PAGE_COUNTERS(mnode, rg_szc, idx)			\
	(page_counters[(rg_szc)][(mnode)].hpm_counters[(idx)])

#define PAGE_COUNTERS_COUNTERS(mnode, rg_szc)			\
	(page_counters[(rg_szc)][(mnode)].hpm_counters)

#define PAGE_COUNTERS_SHIFT(mnode, rg_szc)			\
	(page_counters[(rg_szc)][(mnode)].hpm_shift)

#define PAGE_COUNTERS_ENTRIES(mnode, rg_szc)			\
	(page_counters[(rg_szc)][(mnode)].hpm_entries)

#define PAGE_COUNTERS_BASE(mnode, rg_szc)			\
	(page_counters[(rg_szc)][(mnode)].hpm_base)

#define PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, rg_szc, g)	\
	(page_counters[(rg_szc)][(mnode)].hpm_color_current[(g)])

#define PAGE_COUNTERS_CURRENT_COLOR(mnode, rg_szc, color, mrange)	\
	(page_counters[(rg_szc)][(mnode)].				\
	hpm_color_current[(mrange)][(color)])

#define PNUM_TO_IDX(mnode, rg_szc, pnum)			\
	(((pnum) - PAGE_COUNTERS_BASE((mnode), (rg_szc))) >>	\
	    PAGE_COUNTERS_SHIFT((mnode), (rg_szc)))

#define IDX_TO_PNUM(mnode, rg_szc, index)			\
	(PAGE_COUNTERS_BASE((mnode), (rg_szc)) +		\
	    ((index) << PAGE_COUNTERS_SHIFT((mnode), (rg_szc))))
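/*
 * Worked example (assumed values, illustration only): if
 * PAGE_COUNTERS_BASE(mnode, r) == 0x80000 and
 * PAGE_COUNTERS_SHIFT(mnode, r) == 3, then
 *
 *	PNUM_TO_IDX(mnode, r, 0x80040) == (0x80040 - 0x80000) >> 3 == 8
 *	IDX_TO_PNUM(mnode, r, 8)       == 0x80000 + (8 << 3)     == 0x80040
 *
 * i.e. the two macros are inverses for any region aligned pfn, which is
 * exactly the identity that page_ctrs_alloc() ASSERTs below.
 */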
/*
 * Protects the hpm_counters and hpm_color_current memory from changing while
 * looking at page counters information.
 * Grab the write lock to modify what these fields point at.
 * Grab the read lock to prevent any pointers from changing.
 * The write lock cannot be held during memory allocation due to a possible
 * recursion deadlock with trying to grab the read lock while the
 * write lock is already held.
 */
krwlock_t page_ctrs_rwlock[MAX_MEM_NODES];


/*
 * initialize cpu_vm_data to point at cache aligned vm_cpu_data_t.
 */
void
cpu_vm_data_init(struct cpu *cp)
{
	if (cp == CPU0) {
		cp->cpu_vm_data = (void *)&vm_cpu_data0;
	} else {
		void	*kmptr;
		int	align;
		size_t	sz;

		align = (L2CACHE_ALIGN) ? L2CACHE_ALIGN : L2CACHE_ALIGN_MAX;
		sz = P2ROUNDUP(sizeof (vm_cpu_data_t), align) + align;
		kmptr = kmem_zalloc(sz, KM_SLEEP);
		cp->cpu_vm_data = (void *) P2ROUNDUP((uintptr_t)kmptr, align);
		((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr = kmptr;
		((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize = sz;
	}
}

/*
 * free cpu_vm_data
 */
void
cpu_vm_data_destroy(struct cpu *cp)
{
	if (cp->cpu_seqid && cp->cpu_vm_data) {
		ASSERT(cp != CPU0);
		kmem_free(((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr,
		    ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize);
	}
	cp->cpu_vm_data = NULL;
}
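/*
 * Sketch of the alignment arithmetic in cpu_vm_data_init() above, with
 * assumed figures for illustration: if align == 64 and
 * sizeof (vm_cpu_data_t) == 200, then sz == P2ROUNDUP(200, 64) + 64 ==
 * 320 bytes.  If kmem_zalloc() happens to return kmptr == 0x1010, then
 * cpu_vm_data == P2ROUNDUP(0x1010, 64) == 0x1040: a cache aligned
 * pointer with the whole padded structure still inside the allocation.
 * vc_kmptr keeps the raw pointer so cpu_vm_data_destroy() can free it.
 */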
/*
 * page size to page size code
 */
int
page_szc(size_t pagesize)
{
	int	i = 0;

	while (hw_page_array[i].hp_size) {
		if (pagesize == hw_page_array[i].hp_size)
			return (i);
		i++;
	}
	return (-1);
}

/*
 * page size to page size code with the restriction that it be a supported
 * user page size.  If it's not a supported user page size, -1 will be
 * returned.
 */
int
page_szc_user_filtered(size_t pagesize)
{
	int szc = page_szc(pagesize);
	if ((szc != -1) && (SZC_2_USERSZC(szc) != -1)) {
		return (szc);
	}
	return (-1);
}

/*
 * Return how many page sizes are available for the user to use.  This is
 * what the hardware supports and not based upon how the OS implements the
 * support of different page sizes.
 *
 * If legacy is non-zero, return the number of pagesizes available to legacy
 * applications.  The number of legacy page sizes might be less than the
 * exported user page sizes.  This is to prevent legacy applications that
 * use the largest page size returned from getpagesizes(3c) from inadvertently
 * using the 'new' large pagesizes.
 */
uint_t
page_num_user_pagesizes(int legacy)
{
	if (legacy)
		return (mmu_legacy_page_sizes);
	return (mmu_exported_page_sizes);
}

uint_t
page_num_pagesizes(void)
{
	return (mmu_page_sizes);
}

/*
 * returns the count of the number of base pagesize pages associated with szc
 */
pgcnt_t
page_get_pagecnt(uint_t szc)
{
	if (szc >= mmu_page_sizes)
		panic("page_get_pagecnt: out of range %d", szc);
	return (hw_page_array[szc].hp_pgcnt);
}

size_t
page_get_pagesize(uint_t szc)
{
	if (szc >= mmu_page_sizes)
		panic("page_get_pagesize: out of range %d", szc);
	return (hw_page_array[szc].hp_size);
}

/*
 * Return the size of a page based upon the index passed in.  An index of
 * zero refers to the smallest page size in the system, and as the index
 * increases it refers to the next larger supported page size in the system.
 * Note that szc and userszc may not be the same due to unsupported szc's on
 * some systems.
 */
size_t
page_get_user_pagesize(uint_t userszc)
{
	uint_t szc = USERSZC_2_SZC(userszc);

	if (szc >= mmu_page_sizes)
		panic("page_get_user_pagesize: out of range %d", szc);
	return (hw_page_array[szc].hp_size);
}

uint_t
page_get_shift(uint_t szc)
{
	if (szc >= mmu_page_sizes)
		panic("page_get_shift: out of range %d", szc);
	return (PAGE_GET_SHIFT(szc));
}

uint_t
page_get_pagecolors(uint_t szc)
{
	if (szc >= mmu_page_sizes)
		panic("page_get_pagecolors: out of range %d", szc);
	return (PAGE_GET_PAGECOLORS(szc));
}

/*
 * this assigns the desired equivalent color after a split
 */
uint_t
page_correct_color(uchar_t szc, uchar_t nszc, uint_t color,
    uint_t ncolor, uint_t ceq_mask)
{
	ASSERT(nszc > szc);
	ASSERT(szc < mmu_page_sizes);
	ASSERT(color < PAGE_GET_PAGECOLORS(szc));
	ASSERT(ncolor < PAGE_GET_PAGECOLORS(nszc));

	color &= ceq_mask;
	ncolor = PAGE_CONVERT_COLOR(ncolor, szc, nszc);
	return (color | (ncolor & ~ceq_mask));
}
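/*
 * Worked example for page_correct_color() (assumed 6-bit colors,
 * illustration only): with color == 0x2a, ceq_mask == 0x30, and the
 * parent's color converting to ncolor == 0x15 at szc:
 *
 *	color & ceq_mask	== 0x20	(caller's equivalent bits kept)
 *	ncolor & ~ceq_mask	== 0x05	(remaining bits from the parent)
 *	result			== 0x25
 */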
/*
 * The interleaved_mnodes flag is set when mnodes overlap in
 * the physbase..physmax range, but have disjoint slices.
 * In this case hpm_counters is shared by all mnodes.
 * This flag is set dynamically by the platform.
 */
int interleaved_mnodes = 0;

/*
 * Called by startup().
 * Size up the per page size free list counters based on physmax
 * of each node and max_mem_nodes.
 *
 * If interleaved_mnodes is set we need to find the first mnode that
 * exists.  hpm_counters for the first mnode will then be shared by
 * all other mnodes.  If interleaved_mnodes is not set, just set
 * first = mnode each time.  That means there will be no sharing.
 */
size_t
page_ctrs_sz(void)
{
	int	r;		/* region size */
	int	mnode;
	int	firstmn;	/* first mnode that exists */
	int	nranges;
	pfn_t	physbase;
	pfn_t	physmax;
	uint_t	ctrs_sz = 0;
	int	i;
	pgcnt_t colors_per_szc[MMU_PAGE_SIZES];

	/*
	 * We need to determine how many page colors there are for each
	 * page size in order to allocate memory for any color specific
	 * arrays.
	 */
	for (i = 0; i < mmu_page_sizes; i++) {
		colors_per_szc[i] = PAGE_GET_PAGECOLORS(i);
	}

	for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) {

		pgcnt_t r_pgcnt;
		pfn_t	r_base;
		pgcnt_t r_align;

		if (mem_node_config[mnode].exists == 0)
			continue;

		HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn);
		nranges = MNODE_RANGE_CNT(mnode);
		mnode_nranges[mnode] = nranges;
		mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode);

		/*
		 * determine size needed for page counter arrays with
		 * base aligned to large page size.
		 */
		for (r = 1; r < mmu_page_sizes; r++) {
			/* add in space for hpm_color_current */
			ctrs_sz += sizeof (size_t) *
			    colors_per_szc[r] * nranges;

			if (firstmn != mnode)
				continue;

			/* add in space for hpm_counters */
			r_align = page_get_pagecnt(r);
			r_base = physbase;
			r_base &= ~(r_align - 1);
			r_pgcnt = howmany(physmax - r_base + 1, r_align);

			/*
			 * Round up to always allocate on pointer sized
			 * boundaries.
			 */
			ctrs_sz += P2ROUNDUP((r_pgcnt * sizeof (hpmctr_t)),
			    sizeof (hpmctr_t *));
		}
	}

	for (r = 1; r < mmu_page_sizes; r++) {
		ctrs_sz += (max_mem_nodes * sizeof (hw_page_map_t));
	}

	/* add in space for page_ctrs_cands and pcc_color_free */
	ctrs_sz += sizeof (pcc_info_t *) * max_mem_nodes *
	    mmu_page_sizes * NPC_MUTEX;

	for (mnode = 0; mnode < max_mem_nodes; mnode++) {

		if (mem_node_config[mnode].exists == 0)
			continue;

		nranges = mnode_nranges[mnode];
		ctrs_sz += sizeof (pcc_info_t) * nranges *
		    mmu_page_sizes * NPC_MUTEX;
		for (r = 1; r < mmu_page_sizes; r++) {
			ctrs_sz += sizeof (pgcnt_t) * nranges *
			    colors_per_szc[r] * NPC_MUTEX;
		}
	}

	/* ctr_mutex */
	ctrs_sz += (max_mem_nodes * NPC_MUTEX * sizeof (kmutex_t));

	/* size for page list counts */
	PLCNT_SZ(ctrs_sz);

	/*
	 * add some slop for roundups.  page_ctrs_alloc will roundup the start
	 * address of the counters to ecache_alignsize boundary for every
	 * memory node.
	 */
	return (ctrs_sz + max_mem_nodes * L2CACHE_ALIGN);
}
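/*
 * Worked example for the hpm_counters sizing above (assumed numbers):
 * for a region size r with page_get_pagecnt(r) == 512, physbase ==
 * 0x12345 and physmax == 0x1ffff,
 *
 *	r_base  == 0x12345 & ~511 == 0x12200
 *	r_pgcnt == howmany(0x1ffff - 0x12200 + 1, 512) == 111
 *
 * so one hpmctr_t is reserved for each 512-page region that overlaps
 * the node, and page_ctrs_alloc() below carves the arrays out of a
 * single allocation of exactly this computed size.
 */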
caddr_t
page_ctrs_alloc(caddr_t alloc_base)
{
	int	mnode;
	int	mrange, nranges;
	int	r;		/* region size */
	int	i;
	int	firstmn;	/* first mnode that exists */
	pfn_t	physbase;
	pfn_t	physmax;
	pgcnt_t colors_per_szc[MMU_PAGE_SIZES];

	/*
	 * We need to determine how many page colors there are for each
	 * page size in order to allocate memory for any color specific
	 * arrays.
	 */
	for (i = 0; i < mmu_page_sizes; i++) {
		colors_per_szc[i] = PAGE_GET_PAGECOLORS(i);
	}

	for (r = 1; r < mmu_page_sizes; r++) {
		page_counters[r] = (hw_page_map_t *)alloc_base;
		alloc_base += (max_mem_nodes * sizeof (hw_page_map_t));
	}

	/* page_ctrs_cands and pcc_color_free array */
	for (i = 0; i < NPC_MUTEX; i++) {
		for (r = 1; r < mmu_page_sizes; r++) {

			page_ctrs_cands[i][r] = (pcc_info_t **)alloc_base;
			alloc_base += sizeof (pcc_info_t *) * max_mem_nodes;

			for (mnode = 0; mnode < max_mem_nodes; mnode++) {
				pcc_info_t *pi;

				if (mem_node_config[mnode].exists == 0)
					continue;

				nranges = mnode_nranges[mnode];

				pi = (pcc_info_t *)alloc_base;
				alloc_base += sizeof (pcc_info_t) * nranges;
				page_ctrs_cands[i][r][mnode] = pi;

				for (mrange = 0; mrange < nranges; mrange++) {
					pi->pcc_color_free =
					    (pgcnt_t *)alloc_base;
					alloc_base += sizeof (pgcnt_t) *
					    colors_per_szc[r];
					pi++;
				}
			}
		}
	}

	/* ctr_mutex */
	for (i = 0; i < NPC_MUTEX; i++) {
		ctr_mutex[i] = (kmutex_t *)alloc_base;
		alloc_base += (max_mem_nodes * sizeof (kmutex_t));
	}

	/* initialize page list counts */
	PLCNT_INIT(alloc_base);

	for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) {

		pgcnt_t r_pgcnt;
		pfn_t	r_base;
		pgcnt_t r_align;
		int	r_shift;
		int	nranges = mnode_nranges[mnode];

		if (mem_node_config[mnode].exists == 0)
			continue;

		HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn);

		for (r = 1; r < mmu_page_sizes; r++) {
			/*
			 * the page_counters base has to be aligned to the
			 * page count of page size code r, otherwise the
			 * counts will cross large page boundaries.
			 */
			r_align = page_get_pagecnt(r);
			r_base = physbase;
			/* base needs to be aligned - lower to aligned value */
			r_base &= ~(r_align - 1);
			r_pgcnt = howmany(physmax - r_base + 1, r_align);
			r_shift = PAGE_BSZS_SHIFT(r);

			PAGE_COUNTERS_SHIFT(mnode, r) = r_shift;
			PAGE_COUNTERS_ENTRIES(mnode, r) = r_pgcnt;
			PAGE_COUNTERS_BASE(mnode, r) = r_base;
			for (mrange = 0; mrange < nranges; mrange++) {
				PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode,
				    r, mrange) = (size_t *)alloc_base;
				alloc_base += sizeof (size_t) *
				    colors_per_szc[r];
			}
			for (i = 0; i < colors_per_szc[r]; i++) {
				uint_t color_mask = colors_per_szc[r] - 1;
				pfn_t  pfnum = r_base;
				size_t idx;
				int mrange;
				MEM_NODE_ITERATOR_DECL(it);

				MEM_NODE_ITERATOR_INIT(pfnum, mnode, &it);
				ASSERT(pfnum != (pfn_t)-1);
				PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i,
				    color_mask, color_mask, &it);
				idx = PNUM_TO_IDX(mnode, r, pfnum);
				idx = (idx >= r_pgcnt) ? 0 : idx;
				for (mrange = 0; mrange < nranges; mrange++) {
					PAGE_COUNTERS_CURRENT_COLOR(mnode,
					    r, i, mrange) = idx;
				}
			}

			/* hpm_counters may be shared by all mnodes */
			if (firstmn == mnode) {
				PAGE_COUNTERS_COUNTERS(mnode, r) =
				    (hpmctr_t *)alloc_base;
				alloc_base +=
				    P2ROUNDUP((sizeof (hpmctr_t) * r_pgcnt),
				    sizeof (hpmctr_t *));
			} else {
				PAGE_COUNTERS_COUNTERS(mnode, r) =
				    PAGE_COUNTERS_COUNTERS(firstmn, r);
			}

			/*
			 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
			 * satisfy the identity requirement.
			 * We should be able to go from one to the other
			 * and get consistent values.
			 */
			ASSERT(PNUM_TO_IDX(mnode, r,
			    (IDX_TO_PNUM(mnode, r, 0))) == 0);
			ASSERT(IDX_TO_PNUM(mnode, r,
			    (PNUM_TO_IDX(mnode, r, r_base))) == r_base);
		}
		/*
		 * Roundup the start address of the page_counters to
		 * cache aligned boundary for every memory node.
		 * page_ctrs_sz() has added some slop for these roundups.
		 */
		alloc_base = (caddr_t)P2ROUNDUP((uintptr_t)alloc_base,
		    L2CACHE_ALIGN);
	}

	/* Initialize other page counter specific data structures. */
	for (mnode = 0; mnode < MAX_MEM_NODES; mnode++) {
		rw_init(&page_ctrs_rwlock[mnode], NULL, RW_DEFAULT, NULL);
	}

	return (alloc_base);
}
/*
 * Functions to adjust region counters for each size free list.
 * The caller is responsible for acquiring the ctr_mutex lock if necessary,
 * and thus these can be called during startup without locks.
 */
/* ARGSUSED */
void
page_ctr_add_internal(int mnode, int mtype, page_t *pp, int flags)
{
	ssize_t		r;	/* region size */
	ssize_t		idx;
	pfn_t		pfnum;
	int		lckidx;

	ASSERT(mnode == PP_2_MEM_NODE(pp));
	ASSERT(mtype == PP_2_MTYPE(pp));

	ASSERT(pp->p_szc < mmu_page_sizes);

	PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);

	/* no counter update needed for largest page size */
	if (pp->p_szc >= mmu_page_sizes - 1) {
		return;
	}

	r = pp->p_szc + 1;
	pfnum = pp->p_pagenum;
	lckidx = PP_CTR_LOCK_INDX(pp);

	/*
	 * Increment the count of free pages for the current
	 * region.  Continue looping up in region size, incrementing
	 * the count, if the preceding region is full.
	 */
	while (r < mmu_page_sizes) {
		idx = PNUM_TO_IDX(mnode, r, pfnum);

		ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
		ASSERT(PAGE_COUNTERS(mnode, r, idx) < FULL_REGION_CNT(r));

		if (++PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r)) {
			break;
		} else {
			int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r));
			pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode]
			    [MTYPE_2_MRANGE(mnode, root_mtype)];

			cand->pcc_pages_free++;
			cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]++;
		}
		r++;
	}
}

void
page_ctr_add(int mnode, int mtype, page_t *pp, int flags)
{
	int		lckidx = PP_CTR_LOCK_INDX(pp);
	kmutex_t	*lock = &ctr_mutex[lckidx][mnode];

	mutex_enter(lock);
	page_ctr_add_internal(mnode, mtype, pp, flags);
	mutex_exit(lock);
}

void
page_ctr_sub_internal(int mnode, int mtype, page_t *pp, int flags)
{
	int		lckidx;
	ssize_t		r;	/* region size */
	ssize_t		idx;
	pfn_t		pfnum;

	ASSERT(mnode == PP_2_MEM_NODE(pp));
	ASSERT(mtype == PP_2_MTYPE(pp));

	ASSERT(pp->p_szc < mmu_page_sizes);

	PLCNT_DECR(pp, mnode, mtype, pp->p_szc, flags);

	/* no counter update needed for largest page size */
	if (pp->p_szc >= mmu_page_sizes - 1) {
		return;
	}

	r = pp->p_szc + 1;
	pfnum = pp->p_pagenum;
	lckidx = PP_CTR_LOCK_INDX(pp);

	/*
	 * Decrement the count of free pages for the current
	 * region.  Continue looping up in region size, decrementing
	 * the count, if the preceding region was full.
	 */
	while (r < mmu_page_sizes) {
		idx = PNUM_TO_IDX(mnode, r, pfnum);

		ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
		ASSERT(PAGE_COUNTERS(mnode, r, idx) > 0);

		if (--PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r) - 1) {
			break;
		} else {
			int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r));
			pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode]
			    [MTYPE_2_MRANGE(mnode, root_mtype)];

			ASSERT(cand->pcc_pages_free != 0);
			ASSERT(cand->pcc_color_free[PP_2_BIN_SZC(pp, r)] != 0);

			cand->pcc_pages_free--;
			cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]--;
		}
		r++;
	}
}

void
page_ctr_sub(int mnode, int mtype, page_t *pp, int flags)
{
	int		lckidx = PP_CTR_LOCK_INDX(pp);
	kmutex_t	*lock = &ctr_mutex[lckidx][mnode];

	mutex_enter(lock);
	page_ctr_sub_internal(mnode, mtype, pp, flags);
	mutex_exit(lock);
}
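/*
 * Example of the carry logic above (illustrative; FULL_REGION_CNT(1)
 * assumed to be 8): freeing the eighth and final 8K page of a 64K
 * region drives PAGE_COUNTERS(mnode, 1, idx) from 7 to 8, so the loop
 * continues and increments the region 2 counter covering the same pfn,
 * and the matching pcc_pages_free candidate count is bumped because a
 * full region is now promotable.  page_ctr_sub_internal() walks the
 * same path in reverse when a count drops below full.
 */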
/*
 * Adjust page counters following a memory attach, since typically the
 * size of the array needs to change, and the PFN to counter index
 * mapping needs to change.
 *
 * It is possible this mnode did not exist at startup.  In that case
 * allocate pcc_info_t and pcc_color_free arrays.  Also, allow for nranges
 * to change (a theoretical possibility on x86), which means pcc_color_free
 * arrays must be extended.
 */
uint_t
page_ctrs_adjust(int mnode)
{
	pgcnt_t npgs;
	int	r;		/* region size */
	int	i;
	size_t	pcsz, old_csz;
	hpmctr_t *new_ctr, *old_ctr;
	pfn_t	oldbase, newbase;
	pfn_t	physbase, physmax;
	size_t	old_npgs;
	hpmctr_t *ctr_cache[MMU_PAGE_SIZES];
	size_t	size_cache[MMU_PAGE_SIZES];
	size_t	*color_cache[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
	size_t	*old_color_array[MAX_MNODE_MRANGES];
	pgcnt_t	colors_per_szc[MMU_PAGE_SIZES];
	pcc_info_t **cands_cache;
	pcc_info_t *old_pi, *pi;
	pgcnt_t *pgcntp;
	int nr, old_nranges, mrange, nranges = MNODE_RANGE_CNT(mnode);
	int cands_cache_nranges;
	int old_maxmrange, new_maxmrange;
	int rc = 0;

	cands_cache = kmem_zalloc(sizeof (pcc_info_t *) * NPC_MUTEX *
	    MMU_PAGE_SIZES, KM_NOSLEEP);
	if (cands_cache == NULL)
		return (ENOMEM);

	i = -1;
	HPM_COUNTERS_LIMITS(mnode, physbase, physmax, i);

	newbase = physbase & ~PC_BASE_ALIGN_MASK;
	npgs = roundup(physmax, PC_BASE_ALIGN) - newbase;

	/* prepare to free non-null pointers on the way out */
	cands_cache_nranges = nranges;
	bzero(ctr_cache, sizeof (ctr_cache));
	bzero(color_cache, sizeof (color_cache));

	/*
	 * We need to determine how many page colors there are for each
	 * page size in order to allocate memory for any color specific
	 * arrays.
	 */
	for (r = 0; r < mmu_page_sizes; r++) {
		colors_per_szc[r] = PAGE_GET_PAGECOLORS(r);
	}

	/*
	 * Preallocate all of the new hpm_counters arrays as we can't
	 * hold the page_ctrs_rwlock as a writer and allocate memory.
	 * If we can't allocate all of the arrays, undo our work so far
	 * and return failure.
	 */
	for (r = 1; r < mmu_page_sizes; r++) {
		pcsz = npgs >> PAGE_BSZS_SHIFT(r);
		size_cache[r] = pcsz;
		ctr_cache[r] = kmem_zalloc(pcsz *
		    sizeof (hpmctr_t), KM_NOSLEEP);
		if (ctr_cache[r] == NULL) {
			rc = ENOMEM;
			goto cleanup;
		}
	}

	/*
	 * Preallocate all of the new color current arrays as we can't
	 * hold the page_ctrs_rwlock as a writer and allocate memory.
	 * If we can't allocate all of the arrays, undo our work so far
	 * and return failure.
	 */
	for (r = 1; r < mmu_page_sizes; r++) {
		for (mrange = 0; mrange < nranges; mrange++) {
			color_cache[r][mrange] = kmem_zalloc(sizeof (size_t) *
			    colors_per_szc[r], KM_NOSLEEP);
			if (color_cache[r][mrange] == NULL) {
				rc = ENOMEM;
				goto cleanup;
			}
		}
	}

	/*
	 * Preallocate all of the new pcc_info_t arrays as we can't
	 * hold the page_ctrs_rwlock as a writer and allocate memory.
	 * If we can't allocate all of the arrays, undo our work so far
	 * and return failure.
	 */
	for (r = 1; r < mmu_page_sizes; r++) {
		for (i = 0; i < NPC_MUTEX; i++) {
			pi = kmem_zalloc(nranges * sizeof (pcc_info_t),
			    KM_NOSLEEP);
			if (pi == NULL) {
				rc = ENOMEM;
				goto cleanup;
			}
			cands_cache[i * MMU_PAGE_SIZES + r] = pi;

			for (mrange = 0; mrange < nranges; mrange++, pi++) {
				pgcntp = kmem_zalloc(colors_per_szc[r] *
				    sizeof (pgcnt_t), KM_NOSLEEP);
				if (pgcntp == NULL) {
					rc = ENOMEM;
					goto cleanup;
				}
				pi->pcc_color_free = pgcntp;
			}
		}
	}
	/*
	 * Grab the write lock to prevent others from walking these arrays
	 * while we are modifying them.
	 */
	PAGE_CTRS_WRITE_LOCK(mnode);

	old_nranges = mnode_nranges[mnode];
	cands_cache_nranges = old_nranges;
	mnode_nranges[mnode] = nranges;
	old_maxmrange = mnode_maxmrange[mnode];
	mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode);
	new_maxmrange = mnode_maxmrange[mnode];

	for (r = 1; r < mmu_page_sizes; r++) {
		PAGE_COUNTERS_SHIFT(mnode, r) = PAGE_BSZS_SHIFT(r);
		old_ctr = PAGE_COUNTERS_COUNTERS(mnode, r);
		old_csz = PAGE_COUNTERS_ENTRIES(mnode, r);
		oldbase = PAGE_COUNTERS_BASE(mnode, r);
		old_npgs = old_csz << PAGE_COUNTERS_SHIFT(mnode, r);
		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
			old_color_array[mrange] =
			    PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode,
			    r, mrange);
		}

		pcsz = npgs >> PAGE_COUNTERS_SHIFT(mnode, r);
		new_ctr = ctr_cache[r];
		ctr_cache[r] = NULL;
		if (old_ctr != NULL &&
		    (oldbase + old_npgs > newbase) &&
		    (newbase + npgs > oldbase)) {
			/*
			 * Map the intersection of the old and new
			 * counters into the new array.
			 */
			size_t offset;
			if (newbase > oldbase) {
				offset = (newbase - oldbase) >>
				    PAGE_COUNTERS_SHIFT(mnode, r);
				bcopy(old_ctr + offset, new_ctr,
				    MIN(pcsz, (old_csz - offset)) *
				    sizeof (hpmctr_t));
			} else {
				offset = (oldbase - newbase) >>
				    PAGE_COUNTERS_SHIFT(mnode, r);
				bcopy(old_ctr, new_ctr + offset,
				    MIN(pcsz - offset, old_csz) *
				    sizeof (hpmctr_t));
			}
		}

		PAGE_COUNTERS_COUNTERS(mnode, r) = new_ctr;
		PAGE_COUNTERS_ENTRIES(mnode, r) = pcsz;
		PAGE_COUNTERS_BASE(mnode, r) = newbase;

		/* update shared hpm_counters in other mnodes */
		if (interleaved_mnodes) {
			for (i = 0; i < max_mem_nodes; i++) {
				if (i == mnode)
					continue;
				if (mem_node_config[i].exists == 0)
					continue;
				ASSERT(PAGE_COUNTERS_COUNTERS(i, r) == old_ctr);
				PAGE_COUNTERS_COUNTERS(i, r) = new_ctr;
				PAGE_COUNTERS_ENTRIES(i, r) = pcsz;
				PAGE_COUNTERS_BASE(i, r) = newbase;
			}
		}

		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
			PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r, mrange) =
			    color_cache[r][mrange];
			color_cache[r][mrange] = NULL;
		}
		/*
		 * for now, just reset on these events as it's probably
		 * not worthwhile to try and optimize this.
		 */
		for (i = 0; i < colors_per_szc[r]; i++) {
			uint_t color_mask = colors_per_szc[r] - 1;
			int mlo = interleaved_mnodes ? 0 : mnode;
			int mhi = interleaved_mnodes ? max_mem_nodes :
			    (mnode + 1);
			int m;
			pfn_t  pfnum = newbase;
			size_t idx;
			MEM_NODE_ITERATOR_DECL(it);

			for (m = mlo; m < mhi; m++) {
				if (mem_node_config[m].exists == 0)
					continue;
				MEM_NODE_ITERATOR_INIT(pfnum, m, &it);
				ASSERT(pfnum != (pfn_t)-1);
				PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i, color_mask,
				    color_mask, &it);
				idx = PNUM_TO_IDX(m, r, pfnum);
				idx = (idx < pcsz) ? idx : 0;
				for (mrange = 0; mrange < nranges; mrange++) {
					PAGE_COUNTERS_CURRENT_COLOR(m,
					    r, i, mrange) = idx;
				}
			}
		}

		/* cache info for freeing out of the critical path */
		if ((caddr_t)old_ctr >= kernelheap &&
		    (caddr_t)old_ctr < ekernelheap) {
			ctr_cache[r] = old_ctr;
			size_cache[r] = old_csz;
		}
		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
			size_t *tmp = old_color_array[mrange];
			if ((caddr_t)tmp >= kernelheap &&
			    (caddr_t)tmp < ekernelheap) {
				color_cache[r][mrange] = tmp;
			}
		}
		/*
		 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
		 * satisfy the identity requirement.
		 * We should be able to go from one to the other
		 * and get consistent values.
		 */
		ASSERT(PNUM_TO_IDX(mnode, r,
		    (IDX_TO_PNUM(mnode, r, 0))) == 0);
		ASSERT(IDX_TO_PNUM(mnode, r,
		    (PNUM_TO_IDX(mnode, r, newbase))) == newbase);

		/* pcc_info_t and pcc_color_free */
		for (i = 0; i < NPC_MUTEX; i++) {
			pcc_info_t *epi;
			pcc_info_t *eold_pi;

			pi = cands_cache[i * MMU_PAGE_SIZES + r];
			old_pi = page_ctrs_cands[i][r][mnode];
			page_ctrs_cands[i][r][mnode] = pi;
			cands_cache[i * MMU_PAGE_SIZES + r] = old_pi;

			/* preserve old pcc_color_free values, if any */
			if (old_pi == NULL)
				continue;

			/*
			 * when/if x86 does DR, must account for
			 * possible change in range index when
			 * preserving pcc_info
			 */
			epi = &pi[nranges];
			eold_pi = &old_pi[old_nranges];
			if (new_maxmrange > old_maxmrange) {
				pi += new_maxmrange - old_maxmrange;
			} else if (new_maxmrange < old_maxmrange) {
				old_pi += old_maxmrange - new_maxmrange;
			}
			for (; pi < epi && old_pi < eold_pi; pi++, old_pi++) {
				pcc_info_t tmp = *pi;
				*pi = *old_pi;
				*old_pi = tmp;
			}
		}
	}
	PAGE_CTRS_WRITE_UNLOCK(mnode);

	/*
	 * Now that we have dropped the write lock, it is safe to free all
	 * of the memory we have cached above.
	 * We come through here to free memory when pre-alloc fails, and also
	 * to free old pointers which were recorded while locked.
	 */
cleanup:
	for (r = 1; r < mmu_page_sizes; r++) {
		if (ctr_cache[r] != NULL) {
			kmem_free(ctr_cache[r],
			    size_cache[r] * sizeof (hpmctr_t));
		}
		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
			if (color_cache[r][mrange] != NULL) {
				kmem_free(color_cache[r][mrange],
				    colors_per_szc[r] * sizeof (size_t));
			}
		}
		for (i = 0; i < NPC_MUTEX; i++) {
			pi = cands_cache[i * MMU_PAGE_SIZES + r];
			if (pi == NULL)
				continue;
			nr = cands_cache_nranges;
			for (mrange = 0; mrange < nr; mrange++, pi++) {
				pgcntp = pi->pcc_color_free;
				if (pgcntp == NULL)
					continue;
				if ((caddr_t)pgcntp >= kernelheap &&
				    (caddr_t)pgcntp < ekernelheap) {
					kmem_free(pgcntp,
					    colors_per_szc[r] *
					    sizeof (pgcnt_t));
				}
			}
			pi = cands_cache[i * MMU_PAGE_SIZES + r];
			if ((caddr_t)pi >= kernelheap &&
			    (caddr_t)pi < ekernelheap) {
				kmem_free(pi, nr * sizeof (pcc_info_t));
			}
		}
	}

	kmem_free(cands_cache,
	    sizeof (pcc_info_t *) * NPC_MUTEX * MMU_PAGE_SIZES);
	return (rc);
}


#ifdef DEBUG

/*
 * confirm pp is a large page corresponding to szc
 */
void
chk_lpg(page_t *pp, uchar_t szc)
{
	spgcnt_t npgs = page_get_pagecnt(pp->p_szc);
	uint_t noreloc;

	if (npgs == 1) {
		ASSERT(pp->p_szc == 0);
		ASSERT(pp->p_next == pp);
		ASSERT(pp->p_prev == pp);
		return;
	}

	ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL);
	ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL);

	ASSERT(IS_P2ALIGNED(pp->p_pagenum, npgs));
	ASSERT(pp->p_pagenum == (pp->p_next->p_pagenum - 1));
	ASSERT(pp->p_prev->p_pagenum == (pp->p_pagenum + (npgs - 1)));
	ASSERT(pp->p_prev == (pp + (npgs - 1)));

	/*
	 * Check list of pages.
	 */
	noreloc = PP_ISNORELOC(pp);
	while (npgs--) {
		if (npgs != 0) {
			ASSERT(pp->p_pagenum == pp->p_next->p_pagenum - 1);
			ASSERT(pp->p_next == (pp + 1));
		}
		ASSERT(pp->p_szc == szc);
		ASSERT(PP_ISFREE(pp));
		ASSERT(PP_ISAGED(pp));
		ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL);
		ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL);
		ASSERT(pp->p_vnode == NULL);
		ASSERT(PP_ISNORELOC(pp) == noreloc);

		pp = pp->p_next;
	}
}
#endif /* DEBUG */

void
page_freelist_lock(int mnode)
{
	int i;
	for (i = 0; i < NPC_MUTEX; i++) {
		mutex_enter(FPC_MUTEX(mnode, i));
		mutex_enter(CPC_MUTEX(mnode, i));
	}
}

void
page_freelist_unlock(int mnode)
{
	int i;
	for (i = 0; i < NPC_MUTEX; i++) {
		mutex_exit(FPC_MUTEX(mnode, i));
		mutex_exit(CPC_MUTEX(mnode, i));
	}
}
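/*
 * Usage sketch (mirrors page_promote_size() later in this file):
 * callers that must keep p_szc stable across every bin take all
 * NPC_MUTEX freelist and cachelist locks for the mnode:
 *
 *	page_freelist_lock(mnode);
 *	[ examine PAGE_COUNTERS, promote or demote as needed ]
 *	page_freelist_unlock(mnode);
 *
 * Both routines iterate in the same index order, so any two threads
 * acquire the mutexes in a consistent order and cannot deadlock here.
 */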
/*
 * Add pp to the specified page list.  Defaults to the head of the page list
 * unless PG_LIST_TAIL is specified.
 */
void
page_list_add(page_t *pp, int flags)
{
	page_t		**ppp;
	kmutex_t	*pcm;
	uint_t		bin, mtype;
	int		mnode;

	ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
	ASSERT(PP_ISFREE(pp));
	ASSERT(!hat_page_is_mapped(pp));
	ASSERT(hat_page_getshare(pp) == 0);

	/*
	 * Large pages should be freed via page_list_add_pages().
	 */
	ASSERT(pp->p_szc == 0);

	/*
	 * Don't need to lock the freelist first here
	 * because the page isn't on the freelist yet.
	 * This means p_szc can't change on us.
	 */

	bin = PP_2_BIN(pp);
	mnode = PP_2_MEM_NODE(pp);
	mtype = PP_2_MTYPE(pp);

	if (flags & PG_LIST_ISINIT) {
		/*
		 * PG_LIST_ISINIT is set during system startup (i.e.
		 * single-threaded); add a page to the free list and to the
		 * free region counters without any locking.
		 */
		ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);

		/* inline version of page_add() */
		if (*ppp != NULL) {
			pp->p_next = *ppp;
			pp->p_prev = (*ppp)->p_prev;
			(*ppp)->p_prev = pp;
			pp->p_prev->p_next = pp;
		} else
			*ppp = pp;

		page_ctr_add_internal(mnode, mtype, pp, flags);
		VM_STAT_ADD(vmm_vmstats.pladd_free[0]);
	} else {
		pcm = PC_BIN_MUTEX(mnode, bin, flags);

		if (flags & PG_FREE_LIST) {
			VM_STAT_ADD(vmm_vmstats.pladd_free[0]);
			ASSERT(PP_ISAGED(pp));
			ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);

		} else {
			VM_STAT_ADD(vmm_vmstats.pladd_cache);
			ASSERT(pp->p_vnode);
			ASSERT((pp->p_offset & PAGEOFFSET) == 0);
			ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
		}
		mutex_enter(pcm);
		page_add(ppp, pp);

		if (flags & PG_LIST_TAIL)
			*ppp = (*ppp)->p_next;
		/*
		 * Add counters before releasing pcm mutex to avoid a race
		 * with page_freelist_coalesce and page_freelist_split.
		 */
		page_ctr_add(mnode, mtype, pp, flags);
		mutex_exit(pcm);
	}


#if defined(__sparc)
	if (PP_ISNORELOC(pp)) {
		kcage_freemem_add(1);
	}
#endif
	/*
	 * It is up to the caller to unlock the page!
	 */
	ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
}
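/*
 * Note on the PG_LIST_TAIL handling above: page_add() always inserts at
 * the head of the circular list, so advancing the head pointer by one
 * entry afterwards leaves the new page as the logical tail.  A minimal
 * sketch with an existing list a <-> b (head a), after adding pp:
 *
 *	head -> pp <-> a <-> b		(pp is at the head)
 *	head = head->p_next		(head is a again, pp is the tail)
 */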
#ifdef __sparc
/*
 * This routine is only used by kcage_init during system startup.
 * It performs the function of page_list_sub/PP_SETNORELOC/page_list_add
 * without the overhead of taking locks and updating counters.
 */
void
page_list_noreloc_startup(page_t *pp)
{
	page_t		**ppp;
	uint_t		bin;
	int		mnode;
	int		mtype;
	int		flags = 0;

	/*
	 * If this is a large page on the freelist then
	 * break it up into smaller pages.
	 */
	if (pp->p_szc != 0)
		page_boot_demote(pp);

	/*
	 * Get the list the page is currently on.
	 */
	bin = PP_2_BIN(pp);
	mnode = PP_2_MEM_NODE(pp);
	mtype = PP_2_MTYPE(pp);
	ASSERT(mtype == MTYPE_RELOC);
	ASSERT(pp->p_szc == 0);

	if (PP_ISAGED(pp)) {
		ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
		flags |= PG_FREE_LIST;
	} else {
		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
		flags |= PG_CACHE_LIST;
	}

	ASSERT(*ppp != NULL);

	/*
	 * Delete page from current list.
	 */
	if (*ppp == pp)
		*ppp = pp->p_next;		/* go to next page */
	if (*ppp == pp) {
		*ppp = NULL;			/* page list is gone */
	} else {
		pp->p_prev->p_next = pp->p_next;
		pp->p_next->p_prev = pp->p_prev;
	}

	/*
	 * Decrement page counters
	 */
	page_ctr_sub_internal(mnode, mtype, pp, flags);

	/*
	 * Set noreloc for pages being initialized into the cage.
	 */
	PP_SETNORELOC(pp);

	mtype = PP_2_MTYPE(pp);
	ASSERT(mtype == MTYPE_NORELOC);

	/*
	 * Get new list for page.
	 */
	if (PP_ISAGED(pp)) {
		ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
	} else {
		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
	}

	/*
	 * Insert page on new list.
	 */
	if (*ppp == NULL) {
		*ppp = pp;
		pp->p_next = pp->p_prev = pp;
	} else {
		pp->p_next = *ppp;
		pp->p_prev = (*ppp)->p_prev;
		(*ppp)->p_prev = pp;
		pp->p_prev->p_next = pp;
	}

	/*
	 * Increment page counters
	 */
	page_ctr_add_internal(mnode, mtype, pp, flags);

	/*
	 * Update cage freemem counter
	 */
	atomic_add_long(&kcage_freemem, 1);
}
#else /* __sparc */

/* ARGSUSED */
void
page_list_noreloc_startup(page_t *pp)
{
	panic("page_list_noreloc_startup: should be here only for sparc");
}
#endif

void
page_list_add_pages(page_t *pp, int flags)
{
	kmutex_t *pcm;
	pgcnt_t	pgcnt;
	uint_t	bin, mtype, i;
	int	mnode;

	/* default to freelist/head */
	ASSERT((flags & (PG_CACHE_LIST | PG_LIST_TAIL)) == 0);

	CHK_LPG(pp, pp->p_szc);
	VM_STAT_ADD(vmm_vmstats.pladd_free[pp->p_szc]);

	bin = PP_2_BIN(pp);
	mnode = PP_2_MEM_NODE(pp);
	mtype = PP_2_MTYPE(pp);

	if (flags & PG_LIST_ISINIT) {
		ASSERT(pp->p_szc == mmu_page_sizes - 1);
		page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
		ASSERT(!PP_ISNORELOC(pp));
		PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);
	} else {

		ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);

		pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);

		mutex_enter(pcm);
		page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
		page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
		mutex_exit(pcm);

		pgcnt = page_get_pagecnt(pp->p_szc);
#if defined(__sparc)
		if (PP_ISNORELOC(pp))
			kcage_freemem_add(pgcnt);
#endif
		for (i = 0; i < pgcnt; i++, pp++)
			page_unlock_nocapture(pp);
	}
}

/*
 * During boot, need to demote a large page to base
 * pagesize pages for seg_kmem for use in boot_alloc()
 */
void
page_boot_demote(page_t *pp)
{
	ASSERT(pp->p_szc != 0);
	ASSERT(PP_ISFREE(pp));
	ASSERT(PP_ISAGED(pp));

	(void) page_demote(PP_2_MEM_NODE(pp),
	    PFN_BASE(pp->p_pagenum, pp->p_szc), pp->p_szc, 0, PC_NO_COLOR,
	    PC_FREE);

	ASSERT(PP_ISFREE(pp));
	ASSERT(PP_ISAGED(pp));
	ASSERT(pp->p_szc == 0);
}
/*
 * Take a particular page off of whatever freelist the page
 * is claimed to be on.
 *
 * NOTE: Only used for PAGESIZE pages.
 */
void
page_list_sub(page_t *pp, int flags)
{
	int		bin;
	uint_t		mtype;
	int		mnode;
	kmutex_t	*pcm;
	page_t		**ppp;

	ASSERT(PAGE_EXCL(pp));
	ASSERT(PP_ISFREE(pp));

	/*
	 * The p_szc field can only be changed by page_promote()
	 * and page_demote().  Only free pages can be promoted and
	 * demoted and the free list MUST be locked during these
	 * operations.  So to prevent a race in page_list_sub()
	 * between computing which bin of the freelist lock to
	 * grab and actually grabbing the lock we check again that
	 * the bin we locked is still the correct one.  Notice that
	 * the p_szc field could have actually changed on us, but
	 * if the bin happens to still be the same we are safe.
	 */
try_again:
	bin = PP_2_BIN(pp);
	mnode = PP_2_MEM_NODE(pp);
	pcm = PC_BIN_MUTEX(mnode, bin, flags);
	mutex_enter(pcm);
	if (PP_2_BIN(pp) != bin) {
		mutex_exit(pcm);
		goto try_again;
	}
	mtype = PP_2_MTYPE(pp);

	if (flags & PG_FREE_LIST) {
		VM_STAT_ADD(vmm_vmstats.plsub_free[0]);
		ASSERT(PP_ISAGED(pp));
		ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);
	} else {
		VM_STAT_ADD(vmm_vmstats.plsub_cache);
		ASSERT(!PP_ISAGED(pp));
		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
	}

	/*
	 * Common PAGESIZE case.
	 *
	 * Note that we locked the freelist.  This prevents
	 * any page promotion/demotion operations.  Therefore
	 * the p_szc will not change until we drop pcm mutex.
	 */
	if (pp->p_szc == 0) {
		page_sub(ppp, pp);
		/*
		 * Subtract counters before releasing pcm mutex
		 * to avoid race with page_freelist_coalesce.
		 */
		page_ctr_sub(mnode, mtype, pp, flags);
		mutex_exit(pcm);

#if defined(__sparc)
		if (PP_ISNORELOC(pp)) {
			kcage_freemem_sub(1);
		}
#endif
		return;
	}

	/*
	 * Large pages on the cache list are not supported.
	 */
	if (flags & PG_CACHE_LIST)
		panic("page_list_sub: large page on cachelist");

	/*
	 * Slow but rare.
	 *
	 * Somebody wants this particular page which is part
	 * of a large page.  In this case we just demote the page
	 * if it's on the freelist.
	 *
	 * We have to drop pcm before locking the entire freelist.
	 * Once we have re-locked the freelist, check to make sure
	 * the page hasn't already been demoted or completely
	 * freed.
	 */
	mutex_exit(pcm);
	page_freelist_lock(mnode);
	if (pp->p_szc != 0) {
		/*
		 * Large page is on freelist.
		 */
		(void) page_demote(mnode, PFN_BASE(pp->p_pagenum, pp->p_szc),
		    pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
	}
	ASSERT(PP_ISFREE(pp));
	ASSERT(PP_ISAGED(pp));
	ASSERT(pp->p_szc == 0);

	/*
	 * Subtract counters before releasing pcm mutex
	 * to avoid race with page_freelist_coalesce.
	 */
	bin = PP_2_BIN(pp);
	mtype = PP_2_MTYPE(pp);
	ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);

	page_sub(ppp, pp);
	page_ctr_sub(mnode, mtype, pp, flags);
	page_freelist_unlock(mnode);

#if defined(__sparc)
	if (PP_ISNORELOC(pp)) {
		kcage_freemem_sub(1);
	}
#endif
}
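/*
 * Typical use of page_list_sub() (an illustrative sketch, not a quote
 * of an existing caller): a thread that wants this specific free page
 * takes it EXCL, pulls it off its list, and then clears P_FREE:
 *
 *	if (!page_trylock(pp, SE_EXCL))
 *		return;			[ lost the race, skip the page ]
 *	if (PP_ISFREE(pp)) {
 *		page_list_sub(pp, PG_FREE_LIST);
 *		PP_CLRFREE(pp);
 *	}
 *
 * The demotion fallback above guarantees the page comes back as a
 * PAGESIZE page no matter which large page it started inside.
 */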
void
page_list_sub_pages(page_t *pp, uint_t szc)
{
	kmutex_t *pcm;
	uint_t	bin, mtype;
	int	mnode;

	ASSERT(PAGE_EXCL(pp));
	ASSERT(PP_ISFREE(pp));
	ASSERT(PP_ISAGED(pp));

	/*
	 * See comment in page_list_sub().
	 */
try_again:
	bin = PP_2_BIN(pp);
	mnode = PP_2_MEM_NODE(pp);
	pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
	mutex_enter(pcm);
	if (PP_2_BIN(pp) != bin) {
		mutex_exit(pcm);
		goto try_again;
	}

	/*
	 * If we're called with a page larger than szc, or it was promoted
	 * above szc before we locked the freelist, drop pcm and re-lock
	 * the entire freelist.  If the page is still larger than szc,
	 * demote it.
	 */
	if (pp->p_szc > szc) {
		mutex_exit(pcm);
		pcm = NULL;
		page_freelist_lock(mnode);
		if (pp->p_szc > szc) {
			VM_STAT_ADD(vmm_vmstats.plsubpages_szcbig);
			(void) page_demote(mnode,
			    PFN_BASE(pp->p_pagenum, pp->p_szc),
			    pp->p_szc, szc, PC_NO_COLOR, PC_FREE);
		}
		bin = PP_2_BIN(pp);
	}
	ASSERT(PP_ISFREE(pp));
	ASSERT(PP_ISAGED(pp));
	ASSERT(pp->p_szc <= szc);
	ASSERT(pp == PP_PAGEROOT(pp));

	VM_STAT_ADD(vmm_vmstats.plsub_free[pp->p_szc]);

	mtype = PP_2_MTYPE(pp);
	if (pp->p_szc != 0) {
		page_vpsub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
		CHK_LPG(pp, pp->p_szc);
	} else {
		VM_STAT_ADD(vmm_vmstats.plsubpages_szc0);
		page_sub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
	}
	page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);

	if (pcm != NULL) {
		mutex_exit(pcm);
	} else {
		page_freelist_unlock(mnode);
	}

#if defined(__sparc)
	if (PP_ISNORELOC(pp)) {
		pgcnt_t	pgcnt;

		pgcnt = page_get_pagecnt(pp->p_szc);
		kcage_freemem_sub(pgcnt);
	}
#endif
}

/*
 * Add the page to the front of a linked list of pages
 * using the p_next & p_prev pointers for the list.
 * The caller is responsible for protecting the list pointers.
 */
void
mach_page_add(page_t **ppp, page_t *pp)
{
	if (*ppp == NULL) {
		pp->p_next = pp->p_prev = pp;
	} else {
		pp->p_next = *ppp;
		pp->p_prev = (*ppp)->p_prev;
		(*ppp)->p_prev = pp;
		pp->p_prev->p_next = pp;
	}
	*ppp = pp;
}

/*
 * Remove this page from a linked list of pages
 * using the p_next & p_prev pointers for the list.
 *
 * The caller is responsible for protecting the list pointers.
 */
void
mach_page_sub(page_t **ppp, page_t *pp)
{
	ASSERT(PP_ISFREE(pp));

	if (*ppp == NULL || pp == NULL)
		panic("mach_page_sub");

	if (*ppp == pp)
		*ppp = pp->p_next;		/* go to next page */

	if (*ppp == pp)
		*ppp = NULL;			/* page list is gone */
	else {
		pp->p_prev->p_next = pp->p_next;
		pp->p_next->p_prev = pp->p_prev;
	}
	pp->p_prev = pp->p_next = pp;		/* make pp a list of one */
}
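/*
 * Worked example of the circular list maintained by mach_page_add() and
 * mach_page_sub() (illustrative): starting from an empty list,
 *
 *	mach_page_add(&list, a);	list: a <-> a, *ppp == a
 *	mach_page_add(&list, b);	list: b <-> a <-> b, *ppp == b
 *	mach_page_sub(&list, b);	list: a <-> a, *ppp == a, and b is
 *					reset to a list of one
 *
 * The "list of one" reset is what lets a removed page be handed to
 * page_list_concat() or freed independently of its old neighbors.
 */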
/*
 * Routine fsflush uses to gradually coalesce the free list into larger pages.
 */
void
page_promote_size(page_t *pp, uint_t cur_szc)
{
	pfn_t pfn;
	int mnode;
	int idx;
	int new_szc = cur_szc + 1;
	int full = FULL_REGION_CNT(new_szc);

	pfn = page_pptonum(pp);
	mnode = PFN_2_MEM_NODE(pfn);

	page_freelist_lock(mnode);

	idx = PNUM_TO_IDX(mnode, new_szc, pfn);
	if (PAGE_COUNTERS(mnode, new_szc, idx) == full)
		(void) page_promote(mnode, pfn, new_szc, PC_FREE, PC_MTYPE_ANY);

	page_freelist_unlock(mnode);
}

static uint_t page_promote_err;
static uint_t page_promote_noreloc_err;

/*
 * Create a single larger page (of szc new_szc) from smaller contiguous pages
 * for the given mnode starting at pfnum.  Pages involved are on the freelist
 * before the call and may be returned to the caller if requested, otherwise
 * they will be placed back on the freelist.
 * If flags is PC_ALLOC, then the large page will be returned to the user in
 * a state which is consistent with a page being taken off the freelist.  If
 * we fail to lock the new large page, then we will return NULL to the
 * caller and put the large page on the freelist instead.
 * If flags is PC_FREE, then the large page will be placed on the freelist,
 * and NULL will be returned.
 * The caller is responsible for locking the freelist as well as any other
 * accounting which needs to be done for a returned page.
 *
 * RFE: For performance pass in pp instead of pfnum so
 *	we can avoid excessive calls to page_numtopp_nolock().
 *	This would depend on an assumption that all contiguous
 *	pages are in the same memseg so we can just add/dec
 *	our pp.
 *
 * Lock ordering:
 *
 *	There is a potential but rare deadlock situation
 *	for page promotion and demotion operations.  The problem
 *	is there are two paths into the freelist manager and
 *	they have different lock orders:
 *
 *	page_create()
 *		lock freelist
 *		page_lock(EXCL)
 *		unlock freelist
 *		return
 *		caller drops page_lock
 *
 *	page_free() and page_reclaim()
 *		caller grabs page_lock(EXCL)
 *
 *		lock freelist
 *		unlock freelist
 *		drop page_lock
 *
 *	What prevents a thread in page_create() from deadlocking
 *	with a thread freeing or reclaiming the same page is the
 *	page_trylock() in page_get_freelist().  If the trylock fails
 *	it skips the page.
 *
 *	The lock ordering for promotion and demotion is the same as
 *	for page_create().  Since the same deadlock could occur during
 *	page promotion and freeing or reclaiming of a page on the
 *	cache list, we might have to fail the operation and undo what
 *	we have done so far.  Again this is rare.
 */
page_t *
page_promote(int mnode, pfn_t pfnum, uchar_t new_szc, int flags, int mtype)
{
	page_t		*pp, *pplist, *tpp, *start_pp;
	pgcnt_t		new_npgs, npgs;
	uint_t		bin;
	pgcnt_t		tmpnpgs, pages_left;
	uint_t		noreloc;
	int		which_list;
	ulong_t		index;
	kmutex_t	*phm;

	/*
	 * General algorithm:
	 * Find the starting page.
	 * Walk each page struct removing it from the freelist,
	 * and linking it to all the other pages removed.
	 * Once all pages are off the freelist,
	 * walk the list, modifying p_szc to new_szc and
	 * whatever other info needs to be set to create a large free page.
	 * According to the flags, either return the page or put it
	 * on the freelist.
	 */

	start_pp = page_numtopp_nolock(pfnum);
	ASSERT(start_pp && (start_pp->p_pagenum == pfnum));
	new_npgs = page_get_pagecnt(new_szc);
	ASSERT(IS_P2ALIGNED(pfnum, new_npgs));

	/* don't return page of the wrong mtype */
	if (mtype != PC_MTYPE_ANY && mtype != PP_2_MTYPE(start_pp))
		return (NULL);

	/*
	 * Loop through smaller pages to confirm that all pages
	 * give the same result for PP_ISNORELOC().
	 * We can check this reliably here as the protocol for setting
	 * P_NORELOC requires pages to be taken off the free list first.
	 */
	noreloc = PP_ISNORELOC(start_pp);
	for (pp = start_pp + new_npgs; --pp > start_pp; ) {
		if (noreloc != PP_ISNORELOC(pp)) {
			page_promote_noreloc_err++;
			page_promote_err++;
			return (NULL);
		}
	}

	pages_left = new_npgs;
	pplist = NULL;
	pp = start_pp;

	/* Loop around coalescing the smaller pages into a big page. */
*/ 1956 while (pages_left) { 1957 /* 1958 * Remove from the freelist. 1959 */ 1960 ASSERT(PP_ISFREE(pp)); 1961 bin = PP_2_BIN(pp); 1962 ASSERT(mnode == PP_2_MEM_NODE(pp)); 1963 mtype = PP_2_MTYPE(pp); 1964 if (PP_ISAGED(pp)) { 1965 1966 /* 1967 * PG_FREE_LIST 1968 */ 1969 if (pp->p_szc) { 1970 page_vpsub(&PAGE_FREELISTS(mnode, 1971 pp->p_szc, bin, mtype), pp); 1972 } else { 1973 mach_page_sub(&PAGE_FREELISTS(mnode, 0, 1974 bin, mtype), pp); 1975 } 1976 which_list = PG_FREE_LIST; 1977 } else { 1978 ASSERT(pp->p_szc == 0); 1979 1980 /* 1981 * PG_CACHE_LIST 1982 * 1983 * Since this page comes from the 1984 * cachelist, we must destroy the 1985 * vnode association. 1986 */ 1987 if (!page_trylock(pp, SE_EXCL)) { 1988 goto fail_promote; 1989 } 1990 1991 /* 1992 * We need to be careful not to deadlock 1993 * with another thread in page_lookup(). 1994 * The page_lookup() thread could be holding 1995 * the same phm that we need if the two 1996 * pages happen to hash to the same phm lock. 1997 * At this point we have locked the entire 1998 * freelist and page_lookup() could be trying 1999 * to grab a freelist lock. 2000 */ 2001 index = PAGE_HASH_FUNC(pp->p_vnode, pp->p_offset); 2002 phm = PAGE_HASH_MUTEX(index); 2003 if (!mutex_tryenter(phm)) { 2004 page_unlock_nocapture(pp); 2005 goto fail_promote; 2006 } 2007 2008 mach_page_sub(&PAGE_CACHELISTS(mnode, bin, mtype), pp); 2009 page_hashout(pp, phm); 2010 mutex_exit(phm); 2011 PP_SETAGED(pp); 2012 page_unlock_nocapture(pp); 2013 which_list = PG_CACHE_LIST; 2014 } 2015 page_ctr_sub(mnode, mtype, pp, which_list); 2016 2017 /* 2018 * Concatenate the smaller page(s) onto 2019 * the large page list. 2020 */ 2021 tmpnpgs = npgs = page_get_pagecnt(pp->p_szc); 2022 pages_left -= npgs; 2023 tpp = pp; 2024 while (npgs--) { 2025 tpp->p_szc = new_szc; 2026 tpp = tpp->p_next; 2027 } 2028 page_list_concat(&pplist, &pp); 2029 pp += tmpnpgs; 2030 } 2031 CHK_LPG(pplist, new_szc); 2032 2033 /* 2034 * return the page to the user if requested 2035 * in the properly locked state. 2036 */ 2037 if (flags == PC_ALLOC && (page_trylock_cons(pplist, SE_EXCL))) { 2038 return (pplist); 2039 } 2040 2041 /* 2042 * Otherwise place the new large page on the freelist 2043 */ 2044 bin = PP_2_BIN(pplist); 2045 mnode = PP_2_MEM_NODE(pplist); 2046 mtype = PP_2_MTYPE(pplist); 2047 page_vpadd(&PAGE_FREELISTS(mnode, new_szc, bin, mtype), pplist); 2048 2049 page_ctr_add(mnode, mtype, pplist, PG_FREE_LIST); 2050 return (NULL); 2051 2052 fail_promote: 2053 /* 2054 * A thread must have still been freeing or 2055 * reclaiming the page on the cachelist. 2056 * To prevent a deadlock undo what we have 2057 * done so far and return failure. This 2058 * situation can only happen while promoting 2059 * PAGESIZE pages. 2060 */ 2061 page_promote_err++; 2062 while (pplist) { 2063 pp = pplist; 2064 mach_page_sub(&pplist, pp); 2065 pp->p_szc = 0; 2066 bin = PP_2_BIN(pp); 2067 mtype = PP_2_MTYPE(pp); 2068 mach_page_add(&PAGE_FREELISTS(mnode, 0, bin, mtype), pp); 2069 page_ctr_add(mnode, mtype, pp, PG_FREE_LIST); 2070 } 2071 return (NULL); 2072 2073 } 2074 2075 /* 2076 * Break up a large page into smaller size pages. 2077 * Pages involved are on the freelist before the call and may 2078 * be returned to the caller if requested, otherwise they will 2079 * be placed back on the freelist. 2080 * The caller is responsible for locking the freelist as well as any other 2081 * accounting which needs to be done for a returned page.
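 * For example (sizes hypothetical): demoting a cur_szc page of 512
 * constituents to new_szc 0 with PC_FREE returns NULL and spreads 512
 * PAGESIZE pages across the szc 0 freelist bins, while the same call with
 * PC_ALLOC and a matching color tries to hand back one SE_EXCL-locked
 * PAGESIZE page of that color and frees the remainder.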
2082 * If flags is not PC_ALLOC, the color argument is ignored, and thus 2083 * technically, any value may be passed in but PC_NO_COLOR is the standard 2084 * which should be followed for clarity's sake. 2085 */ 2086 page_t * 2087 page_demote(int mnode, pfn_t pfnum, uchar_t cur_szc, uchar_t new_szc, 2088 int color, int flags) 2089 { 2090 page_t *pp, *pplist, *npplist; 2091 pgcnt_t npgs, n; 2092 uint_t bin; 2093 uint_t mtype; 2094 page_t *ret_pp = NULL; 2095 2096 ASSERT(cur_szc != 0); 2097 ASSERT(new_szc < cur_szc); 2098 2099 pplist = page_numtopp_nolock(pfnum); 2100 ASSERT(pplist != NULL); 2101 2102 ASSERT(pplist->p_szc == cur_szc); 2103 2104 bin = PP_2_BIN(pplist); 2105 ASSERT(mnode == PP_2_MEM_NODE(pplist)); 2106 mtype = PP_2_MTYPE(pplist); 2107 page_vpsub(&PAGE_FREELISTS(mnode, cur_szc, bin, mtype), pplist); 2108 2109 CHK_LPG(pplist, cur_szc); 2110 page_ctr_sub(mnode, mtype, pplist, PG_FREE_LIST); 2111 2112 /* 2113 * Number of PAGESIZE pages for smaller new_szc 2114 * page. 2115 */ 2116 npgs = page_get_pagecnt(new_szc); 2117 2118 while (pplist) { 2119 pp = pplist; 2120 2121 ASSERT(pp->p_szc == cur_szc); 2122 2123 /* 2124 * We either break it up into PAGESIZE pages or larger. 2125 */ 2126 if (npgs == 1) { /* PAGESIZE case */ 2127 mach_page_sub(&pplist, pp); 2128 ASSERT(pp->p_szc == cur_szc); 2129 ASSERT(new_szc == 0); 2130 ASSERT(mnode == PP_2_MEM_NODE(pp)); 2131 pp->p_szc = new_szc; 2132 bin = PP_2_BIN(pp); 2133 if ((bin == color) && (flags == PC_ALLOC) && 2134 (ret_pp == NULL) && 2135 page_trylock_cons(pp, SE_EXCL)) { 2136 ret_pp = pp; 2137 } else { 2138 mtype = PP_2_MTYPE(pp); 2139 mach_page_add(&PAGE_FREELISTS(mnode, 0, bin, 2140 mtype), pp); 2141 page_ctr_add(mnode, mtype, pp, PG_FREE_LIST); 2142 } 2143 } else { 2144 2145 /* 2146 * Break down into smaller lists of pages. 2147 */ 2148 page_list_break(&pplist, &npplist, npgs); 2149 2150 pp = pplist; 2151 n = npgs; 2152 while (n--) { 2153 ASSERT(pp->p_szc == cur_szc); 2154 pp->p_szc = new_szc; 2155 pp = pp->p_next; 2156 } 2157 2158 CHK_LPG(pplist, new_szc); 2159 2160 bin = PP_2_BIN(pplist); 2161 ASSERT(mnode == PP_2_MEM_NODE(pp)); 2162 if ((bin == color) && (flags == PC_ALLOC) && 2163 (ret_pp == NULL) && 2164 page_trylock_cons(pp, SE_EXCL)) { 2165 ret_pp = pp; 2166 } else { 2167 mtype = PP_2_MTYPE(pp); 2168 page_vpadd(&PAGE_FREELISTS(mnode, new_szc, 2169 bin, mtype), pplist); 2170 2171 page_ctr_add(mnode, mtype, pplist, 2172 PG_FREE_LIST); 2173 } 2174 pplist = npplist; 2175 } 2176 } 2177 return (ret_pp); 2178 } 2179 2180 int mpss_coalesce_disable = 0; 2181 2182 /* 2183 * Coalesce free pages into a page of the given szc and color if possible. 2184 * Return the pointer to the page created, otherwise, return NULL. 2185 * 2186 * If pfnhi is non-zero, search for large page with pfn range less than pfnhi. 
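 * Worked color example (values hypothetical): with 16 colors (color_mask
 * 0xf) and a ceq_mask of 0xc, two colors are equivalent when bits 2-3
 * agree; a request for color 5 (0101b) is first masked down to 4 and may
 * then be satisfied by coalescing a candidate of color 4, 5, 6 or 7.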
2187 */ 2188 page_t * 2189 page_freelist_coalesce(int mnode, uchar_t szc, uint_t color, uint_t ceq_mask, 2190 int mtype, pfn_t pfnhi) 2191 { 2192 int r = szc; /* region size */ 2193 int mrange; 2194 uint_t full, bin, color_mask, wrap = 0; 2195 pfn_t pfnum, lo, hi; 2196 size_t len, idx, idx0; 2197 pgcnt_t cands = 0, szcpgcnt = page_get_pagecnt(szc); 2198 page_t *ret_pp; 2199 MEM_NODE_ITERATOR_DECL(it); 2200 #if defined(__sparc) 2201 pfn_t pfnum0, nlo, nhi; 2202 #endif 2203 2204 if (mpss_coalesce_disable) { 2205 ASSERT(szc < MMU_PAGE_SIZES); 2206 VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[szc][0]); 2207 return (NULL); 2208 } 2209 2210 ASSERT(szc < mmu_page_sizes); 2211 color_mask = PAGE_GET_PAGECOLORS(szc) - 1; 2212 ASSERT(ceq_mask <= color_mask); 2213 ASSERT(color <= color_mask); 2214 color &= ceq_mask; 2215 2216 /* Prevent page_counters dynamic memory from being freed */ 2217 rw_enter(&page_ctrs_rwlock[mnode], RW_READER); 2218 2219 mrange = MTYPE_2_MRANGE(mnode, mtype); 2220 ASSERT(mrange < mnode_nranges[mnode]); 2221 VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[r][mrange]); 2222 2223 /* get pfn range for mtype */ 2224 len = PAGE_COUNTERS_ENTRIES(mnode, r); 2225 #if defined(__sparc) 2226 lo = PAGE_COUNTERS_BASE(mnode, r); 2227 hi = IDX_TO_PNUM(mnode, r, len); 2228 #else 2229 MNODETYPE_2_PFN(mnode, mtype, lo, hi); 2230 hi++; 2231 #endif 2232 2233 /* use lower limit if given */ 2234 if (pfnhi != PFNNULL && pfnhi < hi) 2235 hi = pfnhi; 2236 2237 /* round to szcpgcnt boundaries */ 2238 lo = P2ROUNDUP(lo, szcpgcnt); 2239 MEM_NODE_ITERATOR_INIT(lo, mnode, &it); 2240 ASSERT(lo != (pfn_t)-1); 2241 hi = hi & ~(szcpgcnt - 1); 2242 2243 /* set lo to the closest pfn of the right color */ 2244 if (((PFN_2_COLOR(lo, szc, &it) ^ color) & ceq_mask) || 2245 (interleaved_mnodes && PFN_2_MEM_NODE(lo) != mnode)) { 2246 PAGE_NEXT_PFN_FOR_COLOR(lo, szc, color, ceq_mask, color_mask, 2247 &it); 2248 } 2249 2250 if (hi <= lo) { 2251 rw_exit(&page_ctrs_rwlock[mnode]); 2252 return (NULL); 2253 } 2254 2255 full = FULL_REGION_CNT(r); 2256 2257 /* calculate the number of page candidates and initial search index */ 2258 bin = color; 2259 idx0 = (size_t)(-1); 2260 do { 2261 pgcnt_t acand; 2262 2263 PGCTRS_CANDS_GETVALUECOLOR(mnode, mrange, r, bin, acand); 2264 if (acand) { 2265 idx = PAGE_COUNTERS_CURRENT_COLOR(mnode, 2266 r, bin, mrange); 2267 idx0 = MIN(idx0, idx); 2268 cands += acand; 2269 } 2270 bin = ADD_MASKED(bin, 1, ceq_mask, color_mask); 2271 } while (bin != color); 2272 2273 if (cands == 0) { 2274 VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip[r][mrange]); 2275 rw_exit(&page_ctrs_rwlock[mnode]); 2276 return (NULL); 2277 } 2278 2279 pfnum = IDX_TO_PNUM(mnode, r, idx0); 2280 if (pfnum < lo || pfnum >= hi) { 2281 pfnum = lo; 2282 } else { 2283 MEM_NODE_ITERATOR_INIT(pfnum, mnode, &it); 2284 if (pfnum == (pfn_t)-1) { 2285 pfnum = lo; 2286 MEM_NODE_ITERATOR_INIT(pfnum, mnode, &it); 2287 ASSERT(pfnum != (pfn_t)-1); 2288 } else if ((PFN_2_COLOR(pfnum, szc, &it) ^ color) & ceq_mask || 2289 (interleaved_mnodes && PFN_2_MEM_NODE(pfnum) != mnode)) { 2290 /* invalid color, get the closest correct pfn */ 2291 PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask, 2292 color_mask, &it); 2293 if (pfnum >= hi) { 2294 pfnum = lo; 2295 MEM_NODE_ITERATOR_INIT(pfnum, mnode, &it); 2296 } 2297 } 2298 } 2299 2300 /* set starting index */ 2301 idx0 = PNUM_TO_IDX(mnode, r, pfnum); 2302 ASSERT(idx0 < len); 2303 2304 #if defined(__sparc) 2305 pfnum0 = pfnum; /* page corresponding to idx0 */ 2306 nhi = 0; /* search kcage ranges */ 2307 #endif 
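	/*
	 * Illustrative walk of the circular scan below (indices
	 * hypothetical): starting from idx0 == 5 in a counter array of
	 * len 8, the loop visits idx 5, 6, 7, wraps to the low pfn and
	 * continues through 0..4; a second wrap, or idx climbing back up
	 * to idx0, ends the search. Any idx whose counter equals
	 * FULL_REGION_CNT(r) names a region that is entirely free and is
	 * therefore a page_promote() candidate.
	 */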
2308 2309 for (idx = idx0; wrap == 0 || (idx < idx0 && wrap < 2); ) { 2310 2311 #if defined(__sparc) 2312 /* 2313 * Find lowest intersection of kcage ranges and mnode. 2314 * MTYPE_NORELOC means look in the cage, otherwise outside. 2315 */ 2316 if (nhi <= pfnum) { 2317 if (kcage_next_range(mtype == MTYPE_NORELOC, pfnum, 2318 (wrap == 0 ? hi : pfnum0), &nlo, &nhi)) 2319 goto wrapit; 2320 2321 /* jump to the next page in the range */ 2322 if (pfnum < nlo) { 2323 pfnum = P2ROUNDUP(nlo, szcpgcnt); 2324 MEM_NODE_ITERATOR_INIT(pfnum, mnode, &it); 2325 idx = PNUM_TO_IDX(mnode, r, pfnum); 2326 if (idx >= len || pfnum >= hi) 2327 goto wrapit; 2328 if ((PFN_2_COLOR(pfnum, szc, &it) ^ color) & 2329 ceq_mask) 2330 goto next; 2331 if (interleaved_mnodes && 2332 PFN_2_MEM_NODE(pfnum) != mnode) 2333 goto next; 2334 } 2335 } 2336 #endif 2337 2338 if (PAGE_COUNTERS(mnode, r, idx) != full) 2339 goto next; 2340 2341 /* 2342 * RFE: For performance maybe we can do something less 2343 * brutal than locking the entire freelist. So far 2344 * this doesn't seem to be a performance problem? 2345 */ 2346 page_freelist_lock(mnode); 2347 if (PAGE_COUNTERS(mnode, r, idx) == full) { 2348 ret_pp = 2349 page_promote(mnode, pfnum, r, PC_ALLOC, mtype); 2350 if (ret_pp != NULL) { 2351 VM_STAT_ADD(vmm_vmstats.pfc_coalok[r][mrange]); 2352 PAGE_COUNTERS_CURRENT_COLOR(mnode, r, 2353 PFN_2_COLOR(pfnum, szc, &it), mrange) = idx; 2354 page_freelist_unlock(mnode); 2355 rw_exit(&page_ctrs_rwlock[mnode]); 2356 #if defined(__sparc) 2357 if (PP_ISNORELOC(ret_pp)) { 2358 pgcnt_t npgs; 2359 2360 npgs = page_get_pagecnt(ret_pp->p_szc); 2361 kcage_freemem_sub(npgs); 2362 } 2363 #endif 2364 return (ret_pp); 2365 } 2366 } else { 2367 VM_STAT_ADD(vmm_vmstats.page_ctrs_changed[r][mrange]); 2368 } 2369 2370 page_freelist_unlock(mnode); 2371 /* 2372 * No point looking for another page if we've 2373 * already tried all of the ones that 2374 * page_ctr_cands indicated. Stash off where we left 2375 * off. 2376 * Note: this is not exact since we don't hold the 2377 * page_freelist_locks before we initially get the 2378 * value of cands for performance reasons, but should 2379 * be a decent approximation. 2380 */ 2381 if (--cands == 0) { 2382 PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color, mrange) = 2383 idx; 2384 break; 2385 } 2386 next: 2387 PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask, 2388 color_mask, &it); 2389 idx = PNUM_TO_IDX(mnode, r, pfnum); 2390 if (idx >= len || pfnum >= hi) { 2391 wrapit: 2392 pfnum = lo; 2393 MEM_NODE_ITERATOR_INIT(pfnum, mnode, &it); 2394 idx = PNUM_TO_IDX(mnode, r, pfnum); 2395 wrap++; 2396 #if defined(__sparc) 2397 nhi = 0; /* search kcage ranges */ 2398 #endif 2399 } 2400 } 2401 2402 rw_exit(&page_ctrs_rwlock[mnode]); 2403 VM_STAT_ADD(vmm_vmstats.page_ctrs_failed[r][mrange]); 2404 return (NULL); 2405 } 2406 2407 /* 2408 * For the given mnode, promote as many small pages to large pages as possible. 2409 * mnode can be -1, which means do them all 2410 */ 2411 void 2412 page_freelist_coalesce_all(int mnode) 2413 { 2414 int r; /* region size */ 2415 int idx, full; 2416 size_t len; 2417 int doall = interleaved_mnodes || mnode < 0; 2418 int mlo = doall ? 0 : mnode; 2419 int mhi = doall ? max_mem_nodes : (mnode + 1); 2420 2421 VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce_all); 2422 2423 if (mpss_coalesce_disable) { 2424 return; 2425 } 2426 2427 /* 2428 * Lock the entire freelist and coalesce what we can. 2429 * 2430 * Always promote to the largest page possible 2431 * first to reduce the number of page promotions. 
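 *
 * e.g. (page sizes hypothetical): with three supported sizes, the r loop
 * below tries to assemble full 4M regions before it considers 64K ones,
 * so memory that can reach the top size is promoted in one step rather
 * than being built up at 64K and promoted again.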
*/ 2433 for (mnode = mlo; mnode < mhi; mnode++) { 2434 rw_enter(&page_ctrs_rwlock[mnode], RW_READER); 2435 page_freelist_lock(mnode); 2436 } 2437 for (r = mmu_page_sizes - 1; r > 0; r--) { 2438 for (mnode = mlo; mnode < mhi; mnode++) { 2439 pgcnt_t cands = 0; 2440 int mrange, nranges = mnode_nranges[mnode]; 2441 2442 for (mrange = 0; mrange < nranges; mrange++) { 2443 PGCTRS_CANDS_GETVALUE(mnode, mrange, r, cands); 2444 if (cands != 0) 2445 break; 2446 } 2447 if (cands == 0) { 2448 VM_STAT_ADD(vmm_vmstats. 2449 page_ctrs_cands_skip_all); 2450 continue; 2451 } 2452 2453 full = FULL_REGION_CNT(r); 2454 len = PAGE_COUNTERS_ENTRIES(mnode, r); 2455 2456 for (idx = 0; idx < len; idx++) { 2457 if (PAGE_COUNTERS(mnode, r, idx) == full) { 2458 pfn_t pfnum = 2459 IDX_TO_PNUM(mnode, r, idx); 2460 int tmnode = interleaved_mnodes ? 2461 PFN_2_MEM_NODE(pfnum) : mnode; 2462 2463 ASSERT(pfnum >= 2464 mem_node_config[tmnode].physbase && 2465 pfnum < 2466 mem_node_config[tmnode].physmax); 2467 2468 (void) page_promote(tmnode, 2469 pfnum, r, PC_FREE, PC_MTYPE_ANY); 2470 } 2471 } 2472 /* shared hpm_counters covers all mnodes, so we quit */ 2473 if (interleaved_mnodes) 2474 break; 2475 } 2476 } 2477 for (mnode = mlo; mnode < mhi; mnode++) { 2478 page_freelist_unlock(mnode); 2479 rw_exit(&page_ctrs_rwlock[mnode]); 2480 } 2481 } 2482 2483 /* 2484 * This is where all policies for moving pages around 2485 * to different page size free lists are implemented. 2486 * Returns 1 on success, 0 on failure. 2487 * 2488 * So far these are the priorities for this algorithm in descending 2489 * order: 2490 * 2491 * 1) When servicing a request try to do so with a free page 2492 * from next size up. Helps defer fragmentation as long 2493 * as possible. 2494 * 2495 * 2) Page coalesce on demand. Only when a freelist 2496 * larger than PAGESIZE is empty and step 1 2497 * will not work since all larger size lists are 2498 * also empty. 2499 * 2500 * If pfnhi is non-zero, search for large page with pfn range less than pfnhi. 2501 */ 2502 2503 page_t * 2504 page_freelist_split(uchar_t szc, uint_t color, int mnode, int mtype, 2505 pfn_t pfnhi, page_list_walker_t *plw) 2506 { 2507 uchar_t nszc = szc + 1; 2508 uint_t bin, sbin, bin_prev; 2509 page_t *pp, *firstpp; 2510 page_t *ret_pp = NULL; 2511 uint_t color_mask; 2512 2513 if (nszc == mmu_page_sizes) 2514 return (NULL); 2515 2516 ASSERT(nszc < mmu_page_sizes); 2517 color_mask = PAGE_GET_PAGECOLORS(nszc) - 1; 2518 bin = sbin = PAGE_GET_NSZ_COLOR(szc, color); 2519 bin_prev = (plw->plw_bin_split_prev == color) ? INVALID_COLOR : 2520 PAGE_GET_NSZ_COLOR(szc, plw->plw_bin_split_prev); 2521 2522 VM_STAT_ADD(vmm_vmstats.pfs_req[szc]); 2523 /* 2524 * First try to break up a larger page to fill current size freelist. 2525 */ 2526 while (plw->plw_bins[nszc] != 0) { 2527 2528 ASSERT(nszc < mmu_page_sizes); 2529 2530 /* 2531 * If page found then demote it. 2532 */ 2533 if (PAGE_FREELISTS(mnode, nszc, bin, mtype)) { 2534 page_freelist_lock(mnode); 2535 firstpp = pp = PAGE_FREELISTS(mnode, nszc, bin, mtype); 2536 2537 /* 2538 * If pfnhi is not PFNNULL, look for large page below 2539 * pfnhi. PFNNULL signifies no pfn requirement.
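 * e.g. (pfns hypothetical): if the bin's list, linked through p_vpnext,
 * holds large pages at pfns 0x9000, 0x3000 and 0x5000 and pfnhi is
 * 0x4000, the walk below steps past 0x9000 and settles on 0x3000; pp is
 * set to NULL only if the walk wraps all the way back to the head.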
2540 */ 2541 if (pfnhi != PFNNULL && pp->p_pagenum >= pfnhi) { 2542 do { 2543 pp = pp->p_vpnext; 2544 if (pp == firstpp) { 2545 pp = NULL; 2546 break; 2547 } 2548 } while (pp->p_pagenum >= pfnhi); 2549 } 2550 if (pp) { 2551 uint_t ccolor = page_correct_color(szc, nszc, 2552 color, bin, plw->plw_ceq_mask[szc]); 2553 2554 ASSERT(pp->p_szc == nszc); 2555 VM_STAT_ADD(vmm_vmstats.pfs_demote[nszc]); 2556 ret_pp = page_demote(mnode, pp->p_pagenum, 2557 pp->p_szc, szc, ccolor, PC_ALLOC); 2558 if (ret_pp) { 2559 page_freelist_unlock(mnode); 2560 #if defined(__sparc) 2561 if (PP_ISNORELOC(ret_pp)) { 2562 pgcnt_t npgs; 2563 2564 npgs = page_get_pagecnt( 2565 ret_pp->p_szc); 2566 kcage_freemem_sub(npgs); 2567 } 2568 #endif 2569 return (ret_pp); 2570 } 2571 } 2572 page_freelist_unlock(mnode); 2573 } 2574 2575 /* loop through next size bins */ 2576 bin = ADD_MASKED(bin, 1, plw->plw_ceq_mask[nszc], color_mask); 2577 plw->plw_bins[nszc]--; 2578 2579 if (bin == sbin) { 2580 uchar_t nnszc = nszc + 1; 2581 2582 /* we are done with this page size - check next */ 2583 if (plw->plw_bins[nnszc] == 0) 2584 /* we have already checked next size bins */ 2585 break; 2586 2587 bin = sbin = PAGE_GET_NSZ_COLOR(nszc, bin); 2588 if (bin_prev != INVALID_COLOR) { 2589 bin_prev = PAGE_GET_NSZ_COLOR(nszc, bin_prev); 2590 if (!((bin ^ bin_prev) & 2591 plw->plw_ceq_mask[nnszc])) 2592 break; 2593 } 2594 ASSERT(nnszc < mmu_page_sizes); 2595 color_mask = PAGE_GET_PAGECOLORS(nnszc) - 1; 2596 nszc = nnszc; 2597 ASSERT(nszc < mmu_page_sizes); 2598 } 2599 } 2600 2601 return (ret_pp); 2602 } 2603 2604 /* 2605 * Helper routine used only by the freelist code to lock 2606 * a page. If the page is a large page then it succeeds in 2607 * locking all the constituent pages or none at all. 2608 * Returns 1 on success, 0 on failure. 2609 */ 2610 static int 2611 page_trylock_cons(page_t *pp, se_t se) 2612 { 2613 page_t *tpp, *first_pp = pp; 2614 2615 /* 2616 * Fail if can't lock first or only page. 2617 */ 2618 if (!page_trylock(pp, se)) { 2619 return (0); 2620 } 2621 2622 /* 2623 * PAGESIZE: common case. 2624 */ 2625 if (pp->p_szc == 0) { 2626 return (1); 2627 } 2628 2629 /* 2630 * Large page case. 2631 */ 2632 tpp = pp->p_next; 2633 while (tpp != pp) { 2634 if (!page_trylock(tpp, se)) { 2635 /* 2636 * On failure unlock what we have locked so far. 2637 * We want to avoid attempting to capture these 2638 * pages as the pcm mutex may be held which could 2639 * lead to a recursive mutex panic. 2640 */ 2641 while (first_pp != tpp) { 2642 page_unlock_nocapture(first_pp); 2643 first_pp = first_pp->p_next; 2644 } 2645 return (0); 2646 } 2647 tpp = tpp->p_next; 2648 } 2649 return (1); 2650 } 2651 2652 /* 2653 * init context for walking page lists 2654 * Called when a page of the given szc is unavailable. Sets markers 2655 * for the beginning of the search to detect when search has 2656 * completed a full cycle. Sets flags for splitting larger pages 2657 * and coalescing smaller pages. Page walking proceeds until a page 2658 * of the desired equivalent color is found. 2659 */ 2660 void 2661 page_list_walk_init(uchar_t szc, uint_t flags, uint_t bin, int can_split, 2662 int use_ceq, page_list_walker_t *plw) 2663 { 2664 uint_t nszc, ceq_mask, colors; 2665 uchar_t ceq = use_ceq ?
colorequivszc[szc] : 0; 2666 2667 ASSERT(szc < mmu_page_sizes); 2668 colors = PAGE_GET_PAGECOLORS(szc); 2669 2670 plw->plw_colors = colors; 2671 plw->plw_color_mask = colors - 1; 2672 plw->plw_bin_marker = plw->plw_bin0 = bin; 2673 plw->plw_bin_split_prev = bin; 2674 plw->plw_bin_step = (szc == 0) ? vac_colors : 1; 2675 2676 /* 2677 * if vac aliasing is possible make sure lower order color 2678 * bits are never ignored 2679 */ 2680 if (vac_colors > 1) 2681 ceq &= 0xf0; 2682 2683 /* 2684 * calculate the number of non-equivalent colors and 2685 * color equivalency mask 2686 */ 2687 plw->plw_ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf)); 2688 ASSERT(szc > 0 || plw->plw_ceq_dif >= vac_colors); 2689 ASSERT(plw->plw_ceq_dif > 0); 2690 plw->plw_ceq_mask[szc] = (plw->plw_ceq_dif - 1) << (ceq & 0xf); 2691 2692 if (flags & PG_MATCH_COLOR) { 2693 if (cpu_page_colors < 0) { 2694 /* 2695 * this is a heterogeneous machine with different CPUs 2696 * having different size e$ (not supported for ni2/rock) 2697 */ 2698 uint_t cpucolors = CPUSETSIZE() >> PAGE_GET_SHIFT(szc); 2699 cpucolors = MAX(cpucolors, 1); 2700 ceq_mask = plw->plw_color_mask & (cpucolors - 1); 2701 plw->plw_ceq_mask[szc] = 2702 MIN(ceq_mask, plw->plw_ceq_mask[szc]); 2703 } 2704 plw->plw_ceq_dif = 1; 2705 } 2706 2707 /* we can split pages in the freelist, but not the cachelist */ 2708 if (can_split) { 2709 plw->plw_do_split = (szc + 1 < mmu_page_sizes) ? 1 : 0; 2710 2711 /* set next szc color masks and number of free list bins */ 2712 for (nszc = szc + 1; nszc < mmu_page_sizes; nszc++, szc++) { 2713 plw->plw_ceq_mask[nszc] = PAGE_GET_NSZ_MASK(szc, 2714 plw->plw_ceq_mask[szc]); 2715 plw->plw_bins[nszc] = PAGE_GET_PAGECOLORS(nszc); 2716 } 2717 plw->plw_ceq_mask[nszc] = INVALID_MASK; 2718 plw->plw_bins[nszc] = 0; 2719 2720 } else { 2721 ASSERT(szc == 0); 2722 plw->plw_do_split = 0; 2723 plw->plw_bins[1] = 0; 2724 plw->plw_ceq_mask[1] = INVALID_MASK; 2725 } 2726 } 2727 2728 /* 2729 * set mark to flag where next split should occur 2730 */ 2731 #define PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw) { \ 2732 uint_t bin_nsz = PAGE_GET_NSZ_COLOR(szc, bin); \ 2733 uint_t bin0_nsz = PAGE_GET_NSZ_COLOR(szc, plw->plw_bin0); \ 2734 uint_t neq_mask = ~plw->plw_ceq_mask[nszc] & plw->plw_color_mask; \ 2735 plw->plw_split_next = \ 2736 INC_MASKED(bin_nsz, neq_mask, plw->plw_color_mask); \ 2737 if (!((plw->plw_split_next ^ bin0_nsz) & plw->plw_ceq_mask[nszc])) { \ 2738 plw->plw_split_next = \ 2739 INC_MASKED(plw->plw_split_next, \ 2740 neq_mask, plw->plw_color_mask); \ 2741 } \ 2742 } 2743 2744 uint_t 2745 page_list_walk_next_bin(uchar_t szc, uint_t bin, page_list_walker_t *plw) 2746 { 2747 uint_t neq_mask = ~plw->plw_ceq_mask[szc] & plw->plw_color_mask; 2748 uint_t bin0_nsz, nbin_nsz, nbin0, nbin; 2749 uchar_t nszc = szc + 1; 2750 2751 nbin = ADD_MASKED(bin, 2752 plw->plw_bin_step, neq_mask, plw->plw_color_mask); 2753 2754 if (plw->plw_do_split) { 2755 plw->plw_bin_split_prev = bin; 2756 PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw); 2757 plw->plw_do_split = 0; 2758 } 2759 2760 if (szc == 0) { 2761 if (plw->plw_count != 0 || plw->plw_ceq_dif == vac_colors) { 2762 if (nbin == plw->plw_bin0 && 2763 (vac_colors == 1 || nbin != plw->plw_bin_marker)) { 2764 nbin = ADD_MASKED(nbin, plw->plw_bin_step, 2765 neq_mask, plw->plw_color_mask); 2766 plw->plw_bin_split_prev = plw->plw_bin0; 2767 } 2768 2769 if (vac_colors > 1 && nbin == plw->plw_bin_marker) { 2770 plw->plw_bin_marker = 2771 nbin = INC_MASKED(nbin, neq_mask, 2772 plw->plw_color_mask); 2773
plw->plw_bin_split_prev = plw->plw_bin0; 2774 /* 2775 * large pages all have the same vac color 2776 * so by now we should be done with next 2777 * size page splitting process 2778 */ 2779 ASSERT(plw->plw_bins[1] == 0); 2780 plw->plw_do_split = 0; 2781 return (nbin); 2782 } 2783 2784 } else { 2785 uint_t bin_jump = (vac_colors == 1) ? 2786 (BIN_STEP & ~3) - (plw->plw_bin0 & 3) : BIN_STEP; 2787 2788 bin_jump &= ~(vac_colors - 1); 2789 2790 nbin0 = ADD_MASKED(plw->plw_bin0, bin_jump, neq_mask, 2791 plw->plw_color_mask); 2792 2793 if ((nbin0 ^ plw->plw_bin0) & plw->plw_ceq_mask[szc]) { 2794 2795 plw->plw_bin_marker = nbin = nbin0; 2796 2797 if (plw->plw_bins[nszc] != 0) { 2798 /* 2799 * check if next page size bin is the 2800 * same as the next page size bin for 2801 * bin0 2802 */ 2803 nbin_nsz = PAGE_GET_NSZ_COLOR(szc, 2804 nbin); 2805 bin0_nsz = PAGE_GET_NSZ_COLOR(szc, 2806 plw->plw_bin0); 2807 2808 if ((bin0_nsz ^ nbin_nsz) & 2809 plw->plw_ceq_mask[nszc]) 2810 plw->plw_do_split = 1; 2811 } 2812 return (nbin); 2813 } 2814 } 2815 } 2816 2817 if (plw->plw_bins[nszc] != 0) { 2818 nbin_nsz = PAGE_GET_NSZ_COLOR(szc, nbin); 2819 if (!((plw->plw_split_next ^ nbin_nsz) & 2820 plw->plw_ceq_mask[nszc])) 2821 plw->plw_do_split = 1; 2822 } 2823 2824 return (nbin); 2825 } 2826 2827 page_t * 2828 page_get_mnode_freelist(int mnode, uint_t bin, int mtype, uchar_t szc, 2829 uint_t flags) 2830 { 2831 kmutex_t *pcm; 2832 page_t *pp, *first_pp; 2833 uint_t sbin; 2834 int plw_initialized; 2835 page_list_walker_t plw; 2836 2837 ASSERT(szc < mmu_page_sizes); 2838 2839 VM_STAT_ADD(vmm_vmstats.pgmf_alloc[szc]); 2840 2841 MTYPE_START(mnode, mtype, flags); 2842 if (mtype < 0) { /* mnode does not have memory in mtype range */ 2843 VM_STAT_ADD(vmm_vmstats.pgmf_allocempty[szc]); 2844 return (NULL); 2845 } 2846 try_again: 2847 2848 plw_initialized = 0; 2849 plw.plw_ceq_dif = 1; 2850 2851 /* 2852 * Only hold one freelist lock at a time, that way we 2853 * can start anywhere and not have to worry about lock 2854 * ordering. 2855 */ 2856 for (plw.plw_count = 0; 2857 plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) { 2858 sbin = bin; 2859 do { 2860 if (!PAGE_FREELISTS(mnode, szc, bin, mtype)) 2861 goto bin_empty_1; 2862 2863 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST); 2864 mutex_enter(pcm); 2865 pp = PAGE_FREELISTS(mnode, szc, bin, mtype); 2866 if (pp == NULL) 2867 goto bin_empty_0; 2868 2869 /* 2870 * These were set before the page 2871 * was put on the free list, 2872 * they must still be set. 2873 */ 2874 ASSERT(PP_ISFREE(pp)); 2875 ASSERT(PP_ISAGED(pp)); 2876 ASSERT(pp->p_vnode == NULL); 2877 ASSERT(pp->p_hash == NULL); 2878 ASSERT(pp->p_offset == (u_offset_t)-1); 2879 ASSERT(pp->p_szc == szc); 2880 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode); 2881 2882 /* 2883 * Walk down the hash chain. 2884 * 8k pages are linked on p_next 2885 * and p_prev fields. Large pages 2886 * are a contiguous group of 2887 * constituent pages linked together 2888 * on their p_next and p_prev fields. 2889 * The large pages are linked together 2890 * on the hash chain using p_vpnext 2891 * p_vpprev of the base constituent 2892 * page of each large page. 
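 *
 * A sketch with two large pages A and B (constituent counts hypothetical):
 *
 *	bin list: A0 <--p_vpnext/p_vpprev--> B0 <--> ... --> back to A0
 *	constituents: A0,A1,...,A7 and B0,B1,...,B7 each circle on
 *	p_next/p_prev, A0 and B0 being the base constituent pages.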
2893 */ 2894 first_pp = pp; 2895 while (!page_trylock_cons(pp, SE_EXCL)) { 2896 if (szc == 0) { 2897 pp = pp->p_next; 2898 } else { 2899 pp = pp->p_vpnext; 2900 } 2901 2902 ASSERT(PP_ISFREE(pp)); 2903 ASSERT(PP_ISAGED(pp)); 2904 ASSERT(pp->p_vnode == NULL); 2905 ASSERT(pp->p_hash == NULL); 2906 ASSERT(pp->p_offset == (u_offset_t)-1); 2907 ASSERT(pp->p_szc == szc); 2908 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode); 2909 2910 if (pp == first_pp) 2911 goto bin_empty_0; 2912 } 2913 2914 ASSERT(pp != NULL); 2915 ASSERT(mtype == PP_2_MTYPE(pp)); 2916 ASSERT(pp->p_szc == szc); 2917 if (szc == 0) { 2918 page_sub(&PAGE_FREELISTS(mnode, 2919 szc, bin, mtype), pp); 2920 } else { 2921 page_vpsub(&PAGE_FREELISTS(mnode, 2922 szc, bin, mtype), pp); 2923 CHK_LPG(pp, szc); 2924 } 2925 page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST); 2926 2927 if ((PP_ISFREE(pp) == 0) || (PP_ISAGED(pp) == 0)) 2928 panic("free page is not. pp %p", (void *)pp); 2929 mutex_exit(pcm); 2930 2931 #if defined(__sparc) 2932 ASSERT(!kcage_on || PP_ISNORELOC(pp) || 2933 (flags & PG_NORELOC) == 0); 2934 2935 if (PP_ISNORELOC(pp)) 2936 kcage_freemem_sub(page_get_pagecnt(szc)); 2937 #endif 2938 VM_STAT_ADD(vmm_vmstats.pgmf_allocok[szc]); 2939 return (pp); 2940 2941 bin_empty_0: 2942 mutex_exit(pcm); 2943 bin_empty_1: 2944 if (plw_initialized == 0) { 2945 page_list_walk_init(szc, flags, bin, 1, 1, 2946 &plw); 2947 plw_initialized = 1; 2948 ASSERT(plw.plw_colors <= 2949 PAGE_GET_PAGECOLORS(szc)); 2950 ASSERT(plw.plw_colors > 0); 2951 ASSERT((plw.plw_colors & 2952 (plw.plw_colors - 1)) == 0); 2953 ASSERT(bin < plw.plw_colors); 2954 ASSERT(plw.plw_ceq_mask[szc] < plw.plw_colors); 2955 } 2956 /* calculate the next bin with equivalent color */ 2957 bin = ADD_MASKED(bin, plw.plw_bin_step, 2958 plw.plw_ceq_mask[szc], plw.plw_color_mask); 2959 } while (sbin != bin); 2960 2961 /* 2962 * color bins are all empty if color match. Try and 2963 * satisfy the request by breaking up or coalescing 2964 * pages from a different size freelist of the correct 2965 * color that satisfies the ORIGINAL color requested. 2966 * If that fails then try pages of the same size but 2967 * different colors assuming we are not called with 2968 * PG_MATCH_COLOR. 2969 */ 2970 if (plw.plw_do_split && 2971 (pp = page_freelist_split(szc, bin, mnode, 2972 mtype, PFNNULL, &plw)) != NULL) 2973 return (pp); 2974 2975 if (szc > 0 && (pp = page_freelist_coalesce(mnode, szc, 2976 bin, plw.plw_ceq_mask[szc], mtype, PFNNULL)) != NULL) 2977 return (pp); 2978 2979 if (plw.plw_ceq_dif > 1) 2980 bin = page_list_walk_next_bin(szc, bin, &plw); 2981 } 2982 2983 /* if allowed, cycle through additional mtypes */ 2984 MTYPE_NEXT(mnode, mtype, flags); 2985 if (mtype >= 0) 2986 goto try_again; 2987 2988 VM_STAT_ADD(vmm_vmstats.pgmf_allocfailed[szc]); 2989 2990 return (NULL); 2991 } 2992 2993 /* 2994 * Returns the count of free pages for 'pp' with size code 'szc'. 2995 * Note: This function does not return an exact value as the page freelist 2996 * locks are not held and thus the values in the page_counters may be 2997 * changing as we walk through the data. 
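 * Worked example (all numbers hypothetical): for szc 2, suppose
 * FULL_REGION_CNT(2) == 4 and each szc 1 region holds 8 pages
 * (PNUM_SHIFT(1) == 3). A level-2 counter reading 2 contributes
 * 2 << 3 == 16 free pages up front; the scan then drops to level 1,
 * where each of the four counters not already at FULL_REGION_CNT(1)
 * adds its count of free PAGESIZE pages, the full ones having been
 * accounted for a level above.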
2998 */ 2999 static int 3000 page_freecnt(int mnode, page_t *pp, uchar_t szc) 3001 { 3002 pgcnt_t pgfree; 3003 pgcnt_t cnt; 3004 ssize_t r = szc; /* region size */ 3005 ssize_t idx; 3006 int i; 3007 int full, range; 3008 3009 /* Make sure pagenum passed in is aligned properly */ 3010 ASSERT((pp->p_pagenum & (PNUM_SIZE(szc) - 1)) == 0); 3011 ASSERT(szc > 0); 3012 3013 /* Prevent page_counters dynamic memory from being freed */ 3014 rw_enter(&page_ctrs_rwlock[mnode], RW_READER); 3015 idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum); 3016 cnt = PAGE_COUNTERS(mnode, r, idx); 3017 pgfree = cnt << PNUM_SHIFT(r - 1); 3018 range = FULL_REGION_CNT(szc); 3019 3020 /* Check for completely full region */ 3021 if (cnt == range) { 3022 rw_exit(&page_ctrs_rwlock[mnode]); 3023 return (pgfree); 3024 } 3025 3026 while (--r > 0) { 3027 idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum); 3028 full = FULL_REGION_CNT(r); 3029 for (i = 0; i < range; i++, idx++) { 3030 cnt = PAGE_COUNTERS(mnode, r, idx); 3031 /* 3032 * If cnt here is full, that means we have already 3033 * accounted for these pages earlier. 3034 */ 3035 if (cnt != full) { 3036 pgfree += (cnt << PNUM_SHIFT(r - 1)); 3037 } 3038 } 3039 range *= full; 3040 } 3041 rw_exit(&page_ctrs_rwlock[mnode]); 3042 return (pgfree); 3043 } 3044 3045 /* 3046 * Called from page_geti_contig_pages to exclusively lock constituent pages 3047 * starting from 'spp' for page size code 'szc'. 3048 * 3049 * If 'ptcpthreshold' is set, the number of free pages needed in the 'szc' 3050 * region needs to be greater than or equal to the threshold. 3051 */ 3052 static int 3053 page_trylock_contig_pages(int mnode, page_t *spp, uchar_t szc, int flags) 3054 { 3055 pgcnt_t pgcnt = PNUM_SIZE(szc); 3056 pgcnt_t pgfree, i; 3057 page_t *pp; 3058 3059 VM_STAT_ADD(vmm_vmstats.ptcp[szc]); 3060 3061 3062 if ((ptcpthreshold == 0) || (flags & PGI_PGCPHIPRI)) 3063 goto skipptcpcheck; 3064 /* 3065 * check if there are sufficient free pages available before attempting 3066 * to trylock. Count is approximate as page counters can change. 3067 */ 3068 pgfree = page_freecnt(mnode, spp, szc); 3069 3070 /* attempt to trylock if there are sufficient already free pages */ 3071 if (pgfree < pgcnt/ptcpthreshold) { 3072 VM_STAT_ADD(vmm_vmstats.ptcpfreethresh[szc]); 3073 return (0); 3074 } 3075 3076 skipptcpcheck: 3077 3078 for (i = 0; i < pgcnt; i++) { 3079 pp = &spp[i]; 3080 if (!page_trylock(pp, SE_EXCL)) { 3081 VM_STAT_ADD(vmm_vmstats.ptcpfailexcl[szc]); 3082 while (--i != (pgcnt_t)-1) { 3083 pp = &spp[i]; 3084 ASSERT(PAGE_EXCL(pp)); 3085 page_unlock_nocapture(pp); 3086 } 3087 return (0); 3088 } 3089 ASSERT(spp[i].p_pagenum == spp->p_pagenum + i); 3090 if ((pp->p_szc > szc || (szc && pp->p_szc == szc)) && 3091 !PP_ISFREE(pp)) { 3092 VM_STAT_ADD(vmm_vmstats.ptcpfailszc[szc]); 3093 ASSERT(i == 0); 3094 page_unlock_nocapture(pp); 3095 return (0); 3096 } 3097 if (PP_ISNORELOC(pp)) { 3098 VM_STAT_ADD(vmm_vmstats.ptcpfailcage[szc]); 3099 while (i != (pgcnt_t)-1) { 3100 pp = &spp[i]; 3101 ASSERT(PAGE_EXCL(pp)); 3102 page_unlock_nocapture(pp); 3103 i--; 3104 } 3105 return (0); 3106 } 3107 } 3108 VM_STAT_ADD(vmm_vmstats.ptcpok[szc]); 3109 return (1); 3110 } 3111 3112 /* 3113 * Claim large page pointed to by 'pp'. 'pp' is the starting set 3114 * of 'szc' constituent pages that had been locked exclusively previously. 3115 * Will attempt to relocate constituent pages in use. 
3116 */ 3117 static page_t * 3118 page_claim_contig_pages(page_t *pp, uchar_t szc, int flags) 3119 { 3120 spgcnt_t pgcnt, npgs, i; 3121 page_t *targpp, *rpp, *hpp; 3122 page_t *replpp = NULL; 3123 page_t *pplist = NULL; 3124 3125 ASSERT(pp != NULL); 3126 3127 pgcnt = page_get_pagecnt(szc); 3128 while (pgcnt) { 3129 ASSERT(PAGE_EXCL(pp)); 3130 ASSERT(!PP_ISNORELOC(pp)); 3131 if (PP_ISFREE(pp)) { 3132 /* 3133 * If this is a PG_FREE_LIST page then its 3134 * size code can change underneath us due to 3135 * page promotion or demotion. As an optimization 3136 * use page_list_sub_pages() instead of 3137 * page_list_sub(). 3138 */ 3139 if (PP_ISAGED(pp)) { 3140 page_list_sub_pages(pp, szc); 3141 if (pp->p_szc == szc) { 3142 return (pp); 3143 } 3144 ASSERT(pp->p_szc < szc); 3145 npgs = page_get_pagecnt(pp->p_szc); 3146 hpp = pp; 3147 for (i = 0; i < npgs; i++, pp++) { 3148 pp->p_szc = szc; 3149 } 3150 page_list_concat(&pplist, &hpp); 3151 pgcnt -= npgs; 3152 continue; 3153 } 3154 ASSERT(!PP_ISAGED(pp)); 3155 ASSERT(pp->p_szc == 0); 3156 page_list_sub(pp, PG_CACHE_LIST); 3157 page_hashout(pp, NULL); 3158 PP_SETAGED(pp); 3159 pp->p_szc = szc; 3160 page_list_concat(&pplist, &pp); 3161 pp++; 3162 pgcnt--; 3163 continue; 3164 } 3165 npgs = page_get_pagecnt(pp->p_szc); 3166 3167 /* 3168 * page_create_wait freemem accounting is done by the caller of 3169 * page_get_freelist, so it is not necessary to call it prior to 3170 * calling page_get_replacement_page. 3171 * 3172 * page_get_replacement_page can call page_get_contig_pages 3173 * to acquire a large page (szc > 0); the replacement must be 3174 * smaller than the contig page size to avoid looping or 3175 * szc == 0 and PGI_PGCPSZC0 is set. 3176 */ 3177 if (pp->p_szc < szc || (szc == 0 && (flags & PGI_PGCPSZC0))) { 3178 replpp = page_get_replacement_page(pp, NULL, 0); 3179 if (replpp) { 3180 npgs = page_get_pagecnt(pp->p_szc); 3181 ASSERT(npgs <= pgcnt); 3182 targpp = pp; 3183 } 3184 } 3185 3186 /* 3187 * If replacement is NULL or do_page_relocate fails, fail 3188 * coalescing of pages. 3189 */ 3190 if (replpp == NULL || (do_page_relocate(&targpp, &replpp, 0, 3191 &npgs, NULL) != 0)) { 3192 /* 3193 * Unlock un-processed target list 3194 */ 3195 while (pgcnt--) { 3196 ASSERT(PAGE_EXCL(pp)); 3197 page_unlock_nocapture(pp); 3198 pp++; 3199 } 3200 /* 3201 * Free the processed target list.
3202 */ 3203 while (pplist) { 3204 pp = pplist; 3205 page_sub(&pplist, pp); 3206 ASSERT(PAGE_EXCL(pp)); 3207 ASSERT(pp->p_szc == szc); 3208 ASSERT(PP_ISFREE(pp)); 3209 ASSERT(PP_ISAGED(pp)); 3210 pp->p_szc = 0; 3211 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL); 3212 page_unlock_nocapture(pp); 3213 } 3214 3215 if (replpp != NULL) 3216 page_free_replacement_page(replpp); 3217 3218 return (NULL); 3219 } 3220 ASSERT(pp == targpp); 3221 3222 /* LINTED */ 3223 ASSERT(hpp = pp); /* That's right, it's an assignment */ 3224 3225 pp += npgs; 3226 pgcnt -= npgs; 3227 3228 while (npgs--) { 3229 ASSERT(PAGE_EXCL(targpp)); 3230 ASSERT(!PP_ISFREE(targpp)); 3231 ASSERT(!PP_ISNORELOC(targpp)); 3232 PP_SETFREE(targpp); 3233 ASSERT(PP_ISAGED(targpp)); 3234 ASSERT(targpp->p_szc < szc || (szc == 0 && 3235 (flags & PGI_PGCPSZC0))); 3236 targpp->p_szc = szc; 3237 targpp = targpp->p_next; 3238 3239 rpp = replpp; 3240 ASSERT(rpp != NULL); 3241 page_sub(&replpp, rpp); 3242 ASSERT(PAGE_EXCL(rpp)); 3243 ASSERT(!PP_ISFREE(rpp)); 3244 page_unlock_nocapture(rpp); 3245 } 3246 ASSERT(targpp == hpp); 3247 ASSERT(replpp == NULL); 3248 page_list_concat(&pplist, &targpp); 3249 } 3250 CHK_LPG(pplist, szc); 3251 return (pplist); 3252 } 3253 3254 /* 3255 * Trim kernel cage from pfnlo-pfnhi and store result in lo-hi. Return code 3256 * of 0 means nothing left after trim. 3257 */ 3258 int 3259 trimkcage(struct memseg *mseg, pfn_t *lo, pfn_t *hi, pfn_t pfnlo, pfn_t pfnhi) 3260 { 3261 pfn_t kcagepfn; 3262 int decr; 3263 int rc = 0; 3264 3265 if (PP_ISNORELOC(mseg->pages)) { 3266 if (PP_ISNORELOC(mseg->epages - 1) == 0) { 3267 3268 /* lower part of this mseg inside kernel cage */ 3269 decr = kcage_current_pfn(&kcagepfn); 3270 3271 /* kernel cage may have transitioned past mseg */ 3272 if (kcagepfn >= mseg->pages_base && 3273 kcagepfn < mseg->pages_end) { 3274 ASSERT(decr == 0); 3275 *lo = kcagepfn; 3276 *hi = MIN(pfnhi, 3277 (mseg->pages_end - 1)); 3278 rc = 1; 3279 } 3280 } 3281 /* else entire mseg in the cage */ 3282 } else { 3283 if (PP_ISNORELOC(mseg->epages - 1)) { 3284 3285 /* upper part of this mseg inside kernel cage */ 3286 decr = kcage_current_pfn(&kcagepfn); 3287 3288 /* kernel cage may have transitioned past mseg */ 3289 if (kcagepfn >= mseg->pages_base && 3290 kcagepfn < mseg->pages_end) { 3291 ASSERT(decr); 3292 *hi = kcagepfn; 3293 *lo = MAX(pfnlo, mseg->pages_base); 3294 rc = 1; 3295 } 3296 } else { 3297 /* entire mseg outside of kernel cage */ 3298 *lo = MAX(pfnlo, mseg->pages_base); 3299 *hi = MIN(pfnhi, (mseg->pages_end - 1)); 3300 rc = 1; 3301 } 3302 } 3303 return (rc); 3304 } 3305 3306 /* 3307 * called from page_get_contig_pages to search 'pfnlo' thru 'pfnhi' to claim a 3308 * page with size code 'szc'. Claiming such a page requires acquiring 3309 * exclusive locks on all constituent pages (page_trylock_contig_pages), 3310 * relocating pages in use and concatenating these constituent pages into a 3311 * large page. 3312 * 3313 * The page lists do not have such a large page and page_freelist_split has 3314 * already failed to demote larger pages and/or coalesce smaller free pages. 3315 * 3316 * 'flags' may specify PG_MATCH_COLOR which would limit the search to large 3317 * pages with the same color as 'bin'. 3318 * 3319 * 'pfnflag' specifies the subset of the pfn range to search.
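 * Worked example (pfns hypothetical): for pfnlo 0x1000, pfnhi 0x1fff and
 * szcpgcnt 0x200 there are 8 aligned candidates; a pfnflag of 1010b
 * encodes 8 slots in its high bit and slot 2 in the remaining bits, so
 * the search below would be confined to pfns 0x1400-0x15ff.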
3320 */ 3321 3322 static page_t * 3323 page_geti_contig_pages(int mnode, uint_t bin, uchar_t szc, int flags, 3324 pfn_t pfnlo, pfn_t pfnhi, pgcnt_t pfnflag) 3325 { 3326 struct memseg *mseg; 3327 pgcnt_t szcpgcnt = page_get_pagecnt(szc); 3328 pgcnt_t szcpgmask = szcpgcnt - 1; 3329 pfn_t randpfn; 3330 page_t *pp, *randpp, *endpp; 3331 uint_t colors, ceq_mask; 3332 /* LINTED : set but not used in function */ 3333 uint_t color_mask; 3334 pfn_t hi, lo; 3335 uint_t skip; 3336 MEM_NODE_ITERATOR_DECL(it); 3337 3338 ASSERT(szc != 0 || (flags & PGI_PGCPSZC0)); 3339 3340 pfnlo = P2ROUNDUP(pfnlo, szcpgcnt); 3341 3342 if ((pfnhi - pfnlo) + 1 < szcpgcnt || pfnlo >= pfnhi) 3343 return (NULL); 3344 3345 ASSERT(szc < mmu_page_sizes); 3346 3347 colors = PAGE_GET_PAGECOLORS(szc); 3348 color_mask = colors - 1; 3349 if ((colors > 1) && (flags & PG_MATCH_COLOR)) { 3350 uchar_t ceq = colorequivszc[szc]; 3351 uint_t ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf)); 3352 3353 ASSERT(ceq_dif > 0); 3354 ceq_mask = (ceq_dif - 1) << (ceq & 0xf); 3355 } else { 3356 ceq_mask = 0; 3357 } 3358 3359 ASSERT(bin < colors); 3360 3361 /* clear "non-significant" color bits */ 3362 bin &= ceq_mask; 3363 3364 /* 3365 * trim the pfn range to search based on pfnflag. pfnflag is set 3366 * when there have been previous page_get_contig_pages failures to 3367 * limit the search. 3368 * 3369 * The high bit in pfnflag specifies the number of 'slots' in the 3370 * pfn range and the remainder of pfnflag specifies which slot. 3371 * For example, a value of 1010b would mean slot 2 (zero-based) of 3372 * the pfn range that has been divided into 8 slots. 3373 */ 3374 if (pfnflag > 1) { 3375 int slots = 1 << (highbit(pfnflag) - 1); 3376 int slotid = pfnflag & (slots - 1); 3377 pgcnt_t szcpages; 3378 int slotlen; 3379 3380 pfnhi = P2ALIGN((pfnhi + 1), szcpgcnt) - 1; 3381 szcpages = ((pfnhi - pfnlo) + 1) / szcpgcnt; 3382 slotlen = howmany(szcpages, slots); 3383 /* skip if 'slotid' slot is empty */ 3384 if (slotid * slotlen >= szcpages) 3385 return (NULL); 3386 pfnlo = pfnlo + (((slotid * slotlen) % szcpages) * szcpgcnt); 3387 ASSERT(pfnlo < pfnhi); 3388 if (pfnhi > pfnlo + (slotlen * szcpgcnt)) 3389 pfnhi = pfnlo + (slotlen * szcpgcnt) - 1; 3390 } 3391 3392 memsegs_lock(0); 3393 3394 /* 3395 * loop through memsegs to look for contig page candidates 3396 */ 3397 3398 for (mseg = memsegs; mseg != NULL; mseg = mseg->next) { 3399 if (pfnhi < mseg->pages_base || pfnlo >= mseg->pages_end) { 3400 /* no overlap */ 3401 continue; 3402 } 3403 3404 if (mseg->pages_end - mseg->pages_base < szcpgcnt) 3405 /* mseg too small */ 3406 continue; 3407 3408 /* trim off kernel cage pages from pfn range */ 3409 if (kcage_on) { 3410 if (trimkcage(mseg, &lo, &hi, pfnlo, pfnhi) == 0) 3411 continue; 3412 } else { 3413 lo = MAX(pfnlo, mseg->pages_base); 3414 hi = MIN(pfnhi, (mseg->pages_end - 1)); 3415 } 3416 3417 /* round to szcpgcnt boundaries */ 3418 lo = P2ROUNDUP(lo, szcpgcnt); 3419 3420 MEM_NODE_ITERATOR_INIT(lo, mnode, &it); 3421 hi = P2ALIGN((hi + 1), szcpgcnt) - 1; 3422 3423 if (hi <= lo) 3424 continue; 3425 3426 /* 3427 * set lo to point to the pfn for the desired bin.
Large 3428 * page sizes may only have a single page color 3429 */ 3430 skip = szcpgcnt; 3431 if (ceq_mask > 0 || interleaved_mnodes) { 3432 /* set lo to point at appropriate color */ 3433 if (((PFN_2_COLOR(lo, szc, &it) ^ bin) & ceq_mask) || 3434 (interleaved_mnodes && 3435 PFN_2_MEM_NODE(lo) != mnode)) { 3436 PAGE_NEXT_PFN_FOR_COLOR(lo, szc, bin, ceq_mask, 3437 color_mask, &it); 3438 } 3439 if (hi <= lo) 3440 /* mseg cannot satisfy color request */ 3441 continue; 3442 } 3443 3444 /* randomly choose a point between lo and hi to begin search */ 3445 3446 randpfn = (pfn_t)GETTICK(); 3447 randpfn = ((randpfn % (hi - lo)) + lo) & ~(skip - 1); 3448 MEM_NODE_ITERATOR_INIT(randpfn, mnode, &it); 3449 if (ceq_mask || interleaved_mnodes) { 3450 if (randpfn != (pfn_t)-1) 3451 PAGE_NEXT_PFN_FOR_COLOR(randpfn, szc, bin, 3452 ceq_mask, color_mask, &it); 3453 if (randpfn >= hi) { 3454 randpfn = lo; 3455 MEM_NODE_ITERATOR_INIT(randpfn, mnode, &it); 3456 } 3457 } 3458 randpp = mseg->pages + (randpfn - mseg->pages_base); 3459 3460 ASSERT(randpp->p_pagenum == randpfn); 3461 3462 pp = randpp; 3463 endpp = mseg->pages + (hi - mseg->pages_base) + 1; 3464 3465 ASSERT(randpp + szcpgcnt <= endpp); 3466 3467 do { 3468 ASSERT(!(pp->p_pagenum & szcpgmask)); 3469 ASSERT(((PP_2_BIN(pp) ^ bin) & ceq_mask) == 0); 3470 3471 if (page_trylock_contig_pages(mnode, pp, szc, flags)) { 3472 /* pages unlocked by page_claim on failure */ 3473 if (page_claim_contig_pages(pp, szc, flags)) { 3474 memsegs_unlock(0); 3475 return (pp); 3476 } 3477 } 3478 3479 if (ceq_mask == 0 && !interleaved_mnodes) { 3480 pp += skip; 3481 } else { 3482 pfn_t pfn = pp->p_pagenum; 3483 3484 PAGE_NEXT_PFN_FOR_COLOR(pfn, szc, bin, 3485 ceq_mask, color_mask, &it); 3486 if (pfn == (pfn_t)-1) { 3487 pp = endpp; 3488 } else { 3489 pp = mseg->pages + 3490 (pfn - mseg->pages_base); 3491 } 3492 } 3493 if (pp >= endpp) { 3494 /* start from the beginning */ 3495 MEM_NODE_ITERATOR_INIT(lo, mnode, &it); 3496 pp = mseg->pages + (lo - mseg->pages_base); 3497 ASSERT(pp->p_pagenum == lo); 3498 ASSERT(pp + szcpgcnt <= endpp); 3499 } 3500 } while (pp != randpp); 3501 } 3502 memsegs_unlock(0); 3503 return (NULL); 3504 } 3505 3506 3507 /* 3508 * controlling routine that searches through physical memory in an attempt to 3509 * claim a large page based on the input parameters when one is not available 3510 * on the page free lists. 3511 * 3512 * calls page_geti_contig_pages with an initial pfn range from the mnode 3513 * and mtype. page_geti_contig_pages will trim off the parts of the pfn range 3514 * that overlap with the kernel cage or do not match the requested page 3515 * color if PG_MATCH_COLOR is set. Since this search is very expensive, 3516 * page_geti_contig_pages may further limit the search range based on 3517 * previous failure counts (pgcpfailcnt[]). 3518 * 3519 * for PGI_PGCPSZC0 requests, page_get_contig_pages will relocate a base 3520 * pagesize page that satisfies mtype.
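 * e.g. (counter values hypothetical): with pgcpfailcnt[szc] at 12 (1100b)
 * the pfn range is cut into 8 slots and only slot 4 is scanned; each
 * success halves the counter, doubling the slice of the range the next
 * call will search, while each failure pushes it back up via
 * SETPGCPFAILCNT.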
3521 */ 3522 page_t * 3523 page_get_contig_pages(int mnode, uint_t bin, int mtype, uchar_t szc, 3524 uint_t flags) 3525 { 3526 pfn_t pfnlo, pfnhi; /* contig pages pfn range */ 3527 page_t *pp; 3528 pgcnt_t pfnflag = 0; /* no limit on search if 0 */ 3529 3530 VM_STAT_ADD(vmm_vmstats.pgcp_alloc[szc]); 3531 3532 /* no allocations from cage */ 3533 flags |= PGI_NOCAGE; 3534 3535 /* LINTED */ 3536 MTYPE_START(mnode, mtype, flags); 3537 if (mtype < 0) { /* mnode does not have memory in mtype range */ 3538 VM_STAT_ADD(vmm_vmstats.pgcp_allocempty[szc]); 3539 return (NULL); 3540 } 3541 3542 ASSERT(szc > 0 || (flags & PGI_PGCPSZC0)); 3543 3544 /* do not limit search and ignore color if hi pri */ 3545 3546 if (pgcplimitsearch && ((flags & PGI_PGCPHIPRI) == 0)) 3547 pfnflag = pgcpfailcnt[szc]; 3548 3549 /* remove color match to improve chances */ 3550 3551 if (flags & PGI_PGCPHIPRI || pfnflag) 3552 flags &= ~PG_MATCH_COLOR; 3553 3554 do { 3555 /* get pfn range based on mnode and mtype */ 3556 MNODETYPE_2_PFN(mnode, mtype, pfnlo, pfnhi); 3557 3558 ASSERT(pfnhi >= pfnlo); 3559 3560 pp = page_geti_contig_pages(mnode, bin, szc, flags, 3561 pfnlo, pfnhi, pfnflag); 3562 3563 if (pp != NULL) { 3564 pfnflag = pgcpfailcnt[szc]; 3565 if (pfnflag) { 3566 /* double the search size */ 3567 pgcpfailcnt[szc] = pfnflag >> 1; 3568 } 3569 VM_STAT_ADD(vmm_vmstats.pgcp_allocok[szc]); 3570 return (pp); 3571 } 3572 MTYPE_NEXT(mnode, mtype, flags); 3573 } while (mtype >= 0); 3574 3575 VM_STAT_ADD(vmm_vmstats.pgcp_allocfailed[szc]); 3576 return (NULL); 3577 } 3578 3579 3580 /* 3581 * Find the `best' page on the freelist for this (vp,off) (as,vaddr) pair. 3582 * 3583 * Does its own locking and accounting. 3584 * If PG_MATCH_COLOR is set, then NULL will be returned if there are no 3585 * pages of the proper color even if there are pages of a different color. 3586 * 3587 * Finds a page, removes it, THEN locks it. 3588 */ 3589 3590 /*ARGSUSED*/ 3591 page_t * 3592 page_get_freelist(struct vnode *vp, u_offset_t off, struct seg *seg, 3593 caddr_t vaddr, size_t size, uint_t flags, struct lgrp *lgrp) 3594 { 3595 struct as *as = seg->s_as; 3596 page_t *pp = NULL; 3597 ulong_t bin; 3598 uchar_t szc; 3599 int mnode; 3600 int mtype; 3601 page_t *(*page_get_func)(int, uint_t, int, uchar_t, uint_t); 3602 lgrp_mnode_cookie_t lgrp_cookie; 3603 3604 page_get_func = page_get_mnode_freelist; 3605 3606 /* 3607 * If we aren't passed a specific lgroup, or passed a freed lgrp 3608 * assume we wish to allocate near to the current thread's home. 3609 */ 3610 if (!LGRP_EXISTS(lgrp)) 3611 lgrp = lgrp_home_lgrp(); 3612 3613 if (kcage_on) { 3614 if ((flags & (PG_NORELOC | PG_PANIC)) == PG_NORELOC && 3615 kcage_freemem < kcage_throttlefree + btop(size) && 3616 curthread != kcage_cageout_thread) { 3617 /* 3618 * Set a "reserve" of kcage_throttlefree pages for 3619 * PG_PANIC and cageout thread allocations. 3620 * 3621 * Everybody else has to serialize in 3622 * page_create_get_something() to get a cage page, so 3623 * that we don't deadlock cageout! 3624 */ 3625 return (NULL); 3626 } 3627 } else { 3628 flags &= ~PG_NORELOC; 3629 flags |= PGI_NOCAGE; 3630 } 3631 3632 /* LINTED */ 3633 MTYPE_INIT(mtype, vp, vaddr, flags, size); 3634 3635 /* 3636 * Convert size to page size code. 
3637 */ 3638 if ((szc = page_szc(size)) == (uchar_t)-1) 3639 panic("page_get_freelist: illegal page size request"); 3640 ASSERT(szc < mmu_page_sizes); 3641 3642 VM_STAT_ADD(vmm_vmstats.pgf_alloc[szc]); 3643 3644 /* LINTED */ 3645 AS_2_BIN(as, seg, vp, vaddr, bin, szc); 3646 3647 ASSERT(bin < PAGE_GET_PAGECOLORS(szc)); 3648 3649 /* 3650 * Try to get a local page first, but try remote if we can't 3651 * get a page of the right color. 3652 */ 3653 pgretry: 3654 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL); 3655 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { 3656 pp = page_get_func(mnode, bin, mtype, szc, flags); 3657 if (pp != NULL) { 3658 VM_STAT_ADD(vmm_vmstats.pgf_allocok[szc]); 3659 DTRACE_PROBE4(page__get, 3660 lgrp_t *, lgrp, 3661 int, mnode, 3662 ulong_t, bin, 3663 uint_t, flags); 3664 return (pp); 3665 } 3666 } 3667 ASSERT(pp == NULL); 3668 3669 /* 3670 * for non-SZC0 PAGESIZE requests, check cachelist before checking 3671 * remote free lists. Caller is expected to call page_get_cachelist which 3672 * will check local cache lists and remote free lists. 3673 */ 3674 if (szc == 0 && ((flags & PGI_PGCPSZC0) == 0)) { 3675 VM_STAT_ADD(vmm_vmstats.pgf_allocdeferred); 3676 return (NULL); 3677 } 3678 3679 ASSERT(szc > 0 || (flags & PGI_PGCPSZC0)); 3680 3681 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1); 3682 3683 if (!(flags & PG_LOCAL)) { 3684 /* 3685 * Try to get a non-local freelist page. 3686 */ 3687 LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie); 3688 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { 3689 pp = page_get_func(mnode, bin, mtype, szc, flags); 3690 if (pp != NULL) { 3691 DTRACE_PROBE4(page__get, 3692 lgrp_t *, lgrp, 3693 int, mnode, 3694 ulong_t, bin, 3695 uint_t, flags); 3696 VM_STAT_ADD(vmm_vmstats.pgf_allocokrem[szc]); 3697 return (pp); 3698 } 3699 } 3700 ASSERT(pp == NULL); 3701 } 3702 3703 /* 3704 * when the cage is off, chances are page_get_contig_pages() will fail 3705 * to lock a large page chunk, and therefore it's not 3706 * called by default. This can be changed via /etc/system. 3707 * 3708 * page_get_contig_pages() is also called to acquire a base pagesize page 3709 * for page_create_get_something(). 3710 */ 3711 if (!(flags & PG_NORELOC) && (pg_contig_disable == 0) && 3712 (kcage_on || pg_lpgcreate_nocage || szc == 0) && 3713 (page_get_func != page_get_contig_pages)) { 3714 3715 VM_STAT_ADD(vmm_vmstats.pgf_allocretry[szc]); 3716 page_get_func = page_get_contig_pages; 3717 goto pgretry; 3718 } 3719 3720 if (!(flags & PG_LOCAL) && pgcplimitsearch && 3721 page_get_func == page_get_contig_pages) 3722 SETPGCPFAILCNT(szc); 3723 3724 VM_STAT_ADD(vmm_vmstats.pgf_allocfailed[szc]); 3725 return (NULL); 3726 } 3727 3728 /* 3729 * Find the `best' page on the cachelist for this (vp,off) (as,vaddr) pair. 3730 * 3731 * Does its own locking. 3732 * If PG_MATCH_COLOR is set, then NULL will be returned if there are no 3733 * pages of the proper color even if there are pages of a different color. 3734 * Otherwise, scan the bins for ones with pages. For each bin with pages, 3735 * try to lock one of them. If no page can be locked, try the 3736 * next bin. Return NULL if a page cannot be found and locked. 3737 * 3738 * Finds a page, tries to lock it, then removes it.
3739 */ 3740 3741 /*ARGSUSED*/ 3742 page_t * 3743 page_get_cachelist(struct vnode *vp, u_offset_t off, struct seg *seg, 3744 caddr_t vaddr, uint_t flags, struct lgrp *lgrp) 3745 { 3746 page_t *pp; 3747 struct as *as = seg->s_as; 3748 ulong_t bin; 3749 /*LINTED*/ 3750 int mnode; 3751 int mtype; 3752 lgrp_mnode_cookie_t lgrp_cookie; 3753 3754 /* 3755 * If we aren't passed a specific lgroup, or passed a freed lgrp, 3756 * assume we wish to allocate near to the current thread's home. 3757 */ 3758 if (!LGRP_EXISTS(lgrp)) 3759 lgrp = lgrp_home_lgrp(); 3760 3761 if (!kcage_on) { 3762 flags &= ~PG_NORELOC; 3763 flags |= PGI_NOCAGE; 3764 } 3765 3766 if ((flags & (PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == PG_NORELOC && 3767 kcage_freemem <= kcage_throttlefree) { 3768 /* 3769 * Reserve kcage_throttlefree pages for critical kernel 3770 * threads. 3771 * 3772 * Everybody else has to go to page_create_get_something() 3773 * to get a cage page, so we don't deadlock cageout. 3774 */ 3775 return (NULL); 3776 } 3777 3778 /* LINTED */ 3779 AS_2_BIN(as, seg, vp, vaddr, bin, 0); 3780 3781 ASSERT(bin < PAGE_GET_PAGECOLORS(0)); 3782 3783 /* LINTED */ 3784 MTYPE_INIT(mtype, vp, vaddr, flags, MMU_PAGESIZE); 3785 3786 VM_STAT_ADD(vmm_vmstats.pgc_alloc); 3787 3788 /* 3789 * Try local cachelists first 3790 */ 3791 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL); 3792 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { 3793 pp = page_get_mnode_cachelist(bin, flags, mnode, mtype); 3794 if (pp != NULL) { 3795 VM_STAT_ADD(vmm_vmstats.pgc_allocok); 3796 DTRACE_PROBE4(page__get, 3797 lgrp_t *, lgrp, 3798 int, mnode, 3799 ulong_t, bin, 3800 uint_t, flags); 3801 return (pp); 3802 } 3803 } 3804 3805 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1); 3806 3807 /* 3808 * Try freelists/cachelists that are farther away 3809 * This is our only chance to allocate remote pages for PAGESIZE 3810 * requests. 3811 */ 3812 LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie); 3813 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { 3814 pp = page_get_mnode_freelist(mnode, bin, mtype, 3815 0, flags); 3816 if (pp != NULL) { 3817 VM_STAT_ADD(vmm_vmstats.pgc_allocokdeferred); 3818 DTRACE_PROBE4(page__get, 3819 lgrp_t *, lgrp, 3820 int, mnode, 3821 ulong_t, bin, 3822 uint_t, flags); 3823 return (pp); 3824 } 3825 pp = page_get_mnode_cachelist(bin, flags, mnode, mtype); 3826 if (pp != NULL) { 3827 VM_STAT_ADD(vmm_vmstats.pgc_allocokrem); 3828 DTRACE_PROBE4(page__get, 3829 lgrp_t *, lgrp, 3830 int, mnode, 3831 ulong_t, bin, 3832 uint_t, flags); 3833 return (pp); 3834 } 3835 } 3836 3837 VM_STAT_ADD(vmm_vmstats.pgc_allocfailed); 3838 return (NULL); 3839 } 3840 3841 page_t * 3842 page_get_mnode_cachelist(uint_t bin, uint_t flags, int mnode, int mtype) 3843 { 3844 kmutex_t *pcm; 3845 page_t *pp, *first_pp; 3846 uint_t sbin; 3847 int plw_initialized; 3848 page_list_walker_t plw; 3849 3850 VM_STAT_ADD(vmm_vmstats.pgmc_alloc); 3851 3852 /* LINTED */ 3853 MTYPE_START(mnode, mtype, flags); 3854 if (mtype < 0) { /* mnode does not have memory in mtype range */ 3855 VM_STAT_ADD(vmm_vmstats.pgmc_allocempty); 3856 return (NULL); 3857 } 3858 3859 try_again: 3860 3861 plw_initialized = 0; 3862 plw.plw_ceq_dif = 1; 3863 3864 /* 3865 * Only hold one cachelist lock at a time, that way we 3866 * can start anywhere and not have to worry about lock 3867 * ordering.
3868 */ 3869 3870 for (plw.plw_count = 0; 3871 plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) { 3872 sbin = bin; 3873 do { 3874 3875 if (!PAGE_CACHELISTS(mnode, bin, mtype)) 3876 goto bin_empty_1; 3877 pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST); 3878 mutex_enter(pcm); 3879 pp = PAGE_CACHELISTS(mnode, bin, mtype); 3880 if (pp == NULL) 3881 goto bin_empty_0; 3882 3883 first_pp = pp; 3884 ASSERT(pp->p_vnode); 3885 ASSERT(PP_ISAGED(pp) == 0); 3886 ASSERT(pp->p_szc == 0); 3887 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode); 3888 while (!page_trylock(pp, SE_EXCL)) { 3889 pp = pp->p_next; 3890 ASSERT(pp->p_szc == 0); 3891 if (pp == first_pp) { 3892 /* 3893 * We have searched the complete list! 3894 * And all of them (might only be one) 3895 * are locked. This can happen since 3896 * these pages can also be found via 3897 * the hash list. When found via the 3898 * hash list, they are locked first, 3899 * then removed. We give up to let the 3900 * other thread run. 3901 */ 3902 pp = NULL; 3903 break; 3904 } 3905 ASSERT(pp->p_vnode); 3906 ASSERT(PP_ISFREE(pp)); 3907 ASSERT(PP_ISAGED(pp) == 0); 3908 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == 3909 mnode); 3910 } 3911 3912 if (pp) { 3913 page_t **ppp; 3914 /* 3915 * Found and locked a page. 3916 * Pull it off the list. 3917 */ 3918 ASSERT(mtype == PP_2_MTYPE(pp)); 3919 ppp = &PAGE_CACHELISTS(mnode, bin, mtype); 3920 page_sub(ppp, pp); 3921 /* 3922 * Subtract counters before releasing pcm mutex 3923 * to avoid a race with page_freelist_coalesce 3924 * and page_freelist_split. 3925 */ 3926 page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST); 3927 mutex_exit(pcm); 3928 ASSERT(pp->p_vnode); 3929 ASSERT(PP_ISAGED(pp) == 0); 3930 #if defined(__sparc) 3931 ASSERT(!kcage_on || 3932 (flags & PG_NORELOC) == 0 || 3933 PP_ISNORELOC(pp)); 3934 if (PP_ISNORELOC(pp)) { 3935 kcage_freemem_sub(1); 3936 } 3937 #endif 3938 VM_STAT_ADD(vmm_vmstats.pgmc_allocok); 3939 return (pp); 3940 } 3941 bin_empty_0: 3942 mutex_exit(pcm); 3943 bin_empty_1: 3944 if (plw_initialized == 0) { 3945 page_list_walk_init(0, flags, bin, 0, 1, &plw); 3946 plw_initialized = 1; 3947 } 3948 /* calculate the next bin with equivalent color */ 3949 bin = ADD_MASKED(bin, plw.plw_bin_step, 3950 plw.plw_ceq_mask[0], plw.plw_color_mask); 3951 } while (sbin != bin); 3952 3953 if (plw.plw_ceq_dif > 1) 3954 bin = page_list_walk_next_bin(0, bin, &plw); 3955 } 3956 3957 MTYPE_NEXT(mnode, mtype, flags); 3958 if (mtype >= 0) 3959 goto try_again; 3960 3961 VM_STAT_ADD(vmm_vmstats.pgmc_allocfailed); 3962 return (NULL); 3963 } 3964 3965 #ifdef DEBUG 3966 #define REPL_PAGE_STATS 3967 #endif /* DEBUG */ 3968 3969 #ifdef REPL_PAGE_STATS 3970 struct repl_page_stats { 3971 uint_t ngets; 3972 uint_t ngets_noreloc; 3973 uint_t npgr_noreloc; 3974 uint_t nnopage_first; 3975 uint_t nnopage; 3976 uint_t nhashout; 3977 uint_t nnofree; 3978 uint_t nnext_pp; 3979 } repl_page_stats; 3980 #define REPL_STAT_INCR(v) atomic_add_32(&repl_page_stats.v, 1) 3981 #else /* REPL_PAGE_STATS */ 3982 #define REPL_STAT_INCR(v) 3983 #endif /* REPL_PAGE_STATS */ 3984 3985 int pgrppgcp; 3986 3987 /* 3988 * The freemem accounting must be done by the caller. 3989 * First we try to get a replacement page of the same size as like_pp, 3990 * if that is not possible, then we just get a set of discontiguous 3991 * PAGESIZE pages.
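 * A sketch of typical caller-side use (illustrative only; npgs here is a
 * spgcnt_t out-parameter and like_pp must be held SE_EXCL on entry):
 *
 *	new_pp = page_get_replacement_page(like_pp, NULL, 0);
 *	if (new_pp != NULL)
 *		rc = do_page_relocate(&like_pp, &new_pp, 0, &npgs, NULL);
 *
 * A NULL lgrp_target lets the routine search outward from like_pp's own
 * mnode, as the code below does.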
/*
 * The freemem accounting must be done by the caller.
 * First we try to get a replacement page of the same size as like_pp;
 * if that is not possible, then we just get a set of discontiguous
 * PAGESIZE pages.
 * (A user-space sketch of the resulting search order follows the function.)
 */
page_t *
page_get_replacement_page(page_t *orig_like_pp, struct lgrp *lgrp_target,
    uint_t pgrflags)
{
        page_t          *like_pp;
        page_t          *pp, *pplist;
        page_t          *pl = NULL;
        ulong_t         bin;
        int             mnode, page_mnode;
        int             szc;
        spgcnt_t        npgs, pg_cnt;
        pfn_t           pfnum;
        int             mtype;
        int             flags = 0;
        lgrp_mnode_cookie_t     lgrp_cookie;
        lgrp_t          *lgrp;

        REPL_STAT_INCR(ngets);
        like_pp = orig_like_pp;
        ASSERT(PAGE_EXCL(like_pp));

        szc = like_pp->p_szc;
        npgs = page_get_pagecnt(szc);
        /*
         * Now we reset like_pp to the base page_t.
         * That way, we won't walk past the end of this 'szc' page.
         */
        pfnum = PFN_BASE(like_pp->p_pagenum, szc);
        like_pp = page_numtopp_nolock(pfnum);
        ASSERT(like_pp->p_szc == szc);

        if (PP_ISNORELOC(like_pp)) {
                ASSERT(kcage_on);
                REPL_STAT_INCR(ngets_noreloc);
                flags = PGI_RELOCONLY;
        } else if (pgrflags & PGR_NORELOC) {
                ASSERT(kcage_on);
                REPL_STAT_INCR(npgr_noreloc);
                flags = PG_NORELOC;
        }

        /*
         * Kernel pages must always be replaced with the same size
         * pages, since we cannot properly handle demotion of kernel
         * pages.
         */
        if (PP_ISKAS(like_pp))
                pgrflags |= PGR_SAMESZC;

        /* LINTED */
        MTYPE_PGR_INIT(mtype, flags, like_pp, page_mnode, npgs);

        while (npgs) {
                pplist = NULL;
                for (;;) {
                        pg_cnt = page_get_pagecnt(szc);
                        bin = PP_2_BIN(like_pp);
                        ASSERT(like_pp->p_szc == orig_like_pp->p_szc);
                        ASSERT(pg_cnt <= npgs);

                        /*
                         * If an lgroup was specified, try to get the
                         * page from that lgroup.
                         * NOTE: Must be careful with code below because
                         * lgroup may disappear and reappear since there
                         * is no locking for lgroup here.
                         */
                        if (LGRP_EXISTS(lgrp_target)) {
                                /*
                                 * Keep local variable for lgroup separate
                                 * from lgroup argument since this code should
                                 * only be exercised when lgroup argument
                                 * exists.
                                 */
                                lgrp = lgrp_target;

                                /* Try the lgroup's freelists first */
                                LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
                                    LGRP_SRCH_LOCAL);
                                while ((pplist == NULL) &&
                                    (mnode = lgrp_memnode_choose(&lgrp_cookie))
                                    != -1) {
                                        pplist =
                                            page_get_mnode_freelist(mnode, bin,
                                            mtype, szc, flags);
                                }

                                /*
                                 * Now try its cachelists if this is a
                                 * small page. Don't need to do it for
                                 * larger ones since page_freelist_coalesce()
                                 * already failed.
                                 */
                                if (pplist != NULL || szc != 0)
                                        break;

                                /* Now try its cachelists */
                                LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
                                    LGRP_SRCH_LOCAL);

                                while ((pplist == NULL) &&
                                    (mnode = lgrp_memnode_choose(&lgrp_cookie))
                                    != -1) {
                                        pplist =
                                            page_get_mnode_cachelist(bin, flags,
                                            mnode, mtype);
                                }
                                if (pplist != NULL) {
                                        page_hashout(pplist, NULL);
                                        PP_SETAGED(pplist);
                                        REPL_STAT_INCR(nhashout);
                                        break;
                                }
                                /* Done looking in this lgroup. Bail out. */
                                break;
                        }

                        /*
                         * No lgroup was specified (or the lgroup was removed
                         * by DR), so just try to get the page as close to
                         * like_pp's mnode as possible.
                         * First try the local freelist...
                         */
                        mnode = PP_2_MEM_NODE(like_pp);
                        pplist = page_get_mnode_freelist(mnode, bin,
                            mtype, szc, flags);
                        if (pplist != NULL)
                                break;

                        REPL_STAT_INCR(nnofree);

                        /*
                         * ...then the local cachelist. Don't need to do it
                         * for larger pages because page_freelist_coalesce()
                         * already failed there anyway.
                         */
                        if (szc == 0) {
                                pplist = page_get_mnode_cachelist(bin, flags,
                                    mnode, mtype);
                                if (pplist != NULL) {
                                        page_hashout(pplist, NULL);
                                        PP_SETAGED(pplist);
                                        REPL_STAT_INCR(nhashout);
                                        break;
                                }
                        }

                        /* Now try remote freelists */
                        page_mnode = mnode;
                        lgrp =
                            lgrp_hand_to_lgrp(MEM_NODE_2_LGRPHAND(page_mnode));
                        LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
                            LGRP_SRCH_HIER);
                        while (pplist == NULL &&
                            (mnode = lgrp_memnode_choose(&lgrp_cookie))
                            != -1) {
                                /*
                                 * Skip local mnode.
                                 */
                                if ((mnode == page_mnode) ||
                                    (mem_node_config[mnode].exists == 0))
                                        continue;

                                pplist = page_get_mnode_freelist(mnode,
                                    bin, mtype, szc, flags);
                        }

                        if (pplist != NULL)
                                break;

                        /* Now try remote cachelists */
                        LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
                            LGRP_SRCH_HIER);
                        while (pplist == NULL && szc == 0) {
                                mnode = lgrp_memnode_choose(&lgrp_cookie);
                                if (mnode == -1)
                                        break;
                                /*
                                 * Skip local mnode.
                                 */
                                if ((mnode == page_mnode) ||
                                    (mem_node_config[mnode].exists == 0))
                                        continue;

                                pplist = page_get_mnode_cachelist(bin,
                                    flags, mnode, mtype);

                                if (pplist != NULL) {
                                        page_hashout(pplist, NULL);
                                        PP_SETAGED(pplist);
                                        REPL_STAT_INCR(nhashout);
                                        break;
                                }
                        }

                        /*
                         * Break out of the for loop under the following cases:
                         * - If we successfully got a page.
                         * - If pgrflags specified only returning a specific
                         *   page size and we could not find that page size.
                         * - If we could not satisfy the request with PAGESIZE
                         *   or larger pages.
                         */
                        if (pplist != NULL || szc == 0)
                                break;

                        if ((pgrflags & PGR_SAMESZC) || pgrppgcp) {
                                /* try to find contig page */
                                LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
                                    LGRP_SRCH_HIER);

                                while ((pplist == NULL) &&
                                    (mnode =
                                    lgrp_memnode_choose(&lgrp_cookie))
                                    != -1) {
                                        pplist = page_get_contig_pages(
                                            mnode, bin, mtype, szc,
                                            flags | PGI_PGCPHIPRI);
                                }
                                break;
                        }

                        /*
                         * The correct thing to do here is try the next
                         * page size down using szc--. Due to a bug
                         * with the processing of HAT_RELOAD_SHARE
                         * where the sfmmu_ttecnt arrays of all
                         * hats sharing an ISM segment don't get updated,
                         * using intermediate size pages for relocation
                         * can lead to continuous page faults.
                         */
                        szc = 0;
                }

                if (pplist != NULL) {
                        DTRACE_PROBE4(page__get,
                            lgrp_t *, lgrp,
                            int, mnode,
                            ulong_t, bin,
                            uint_t, flags);

                        while (pplist != NULL && pg_cnt--) {
                                ASSERT(pplist != NULL);
                                pp = pplist;
                                page_sub(&pplist, pp);
                                PP_CLRFREE(pp);
                                PP_CLRAGED(pp);
                                page_list_concat(&pl, &pp);
                                npgs--;
                                like_pp = like_pp + 1;
                                REPL_STAT_INCR(nnext_pp);
                        }
                        ASSERT(pg_cnt == 0);
                } else {
                        break;
                }
        }

        if (npgs) {
                /*
                 * We were unable to allocate the necessary number
                 * of pages, so we need to free up any pl.
                 */
                REPL_STAT_INCR(nnopage);
                page_free_replacement_page(pl);
                return (NULL);
        } else {
                return (pl);
        }
}
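/*
 * A minimal user-space sketch (not compiled into this file) of the widening
 * search order page_get_replacement_page() applies above: a list of sources
 * (local freelist, local cachelist, remote freelists, remote cachelists) is
 * tried in order, and only when all of them fail for a large size does the
 * request drop straight to PAGESIZE, skipping the intermediate sizes for
 * the HAT_RELOAD_SHARE reason noted in the function.  source_t and
 * get_replacement() are hypothetical; the kernel calls the
 * page_get_mnode_* routines directly and applies extra per-source rules.
 */
#ifdef PAGELIST_SKETCH_REPL_ORDER
typedef int (*source_t)(int);   /* takes szc, returns nonzero on success */

static int
get_replacement(source_t *sources, int nsrc, int szc)
{
        int i;

        for (;;) {
                for (i = 0; i < nsrc; i++) {
                        if (sources[i](szc))
                                return (1);     /* got a chunk of size szc */
                }
                if (szc == 0)
                        return (0);     /* even PAGESIZE failed; give up */
                szc = 0;                /* skip intermediate page sizes */
        }
}
#endif  /* PAGELIST_SKETCH_REPL_ORDER */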
/*
 * Demote a free large page to its constituent pages.
 */
void
page_demote_free_pages(page_t *pp)
{
        int mnode;

        ASSERT(pp != NULL);
        ASSERT(PAGE_LOCKED(pp));
        ASSERT(PP_ISFREE(pp));
        ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);

        mnode = PP_2_MEM_NODE(pp);
        page_freelist_lock(mnode);
        if (pp->p_szc != 0) {
                (void) page_demote(mnode, PFN_BASE(pp->p_pagenum,
                    pp->p_szc), pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
        }
        page_freelist_unlock(mnode);
        ASSERT(pp->p_szc == 0);
}

/*
 * Factor in colorequiv to check additional 'equivalent' bins.
 * colorequiv may be set in /etc/system.
 */
void
page_set_colorequiv_arr(void)
{
        if (colorequiv > 1) {
                int i;
                uint_t sv_a = lowbit(colorequiv) - 1;

                if (sv_a > 15)
                        sv_a = 15;

                for (i = 0; i < MMU_PAGE_SIZES; i++) {
                        uint_t colors;
                        uint_t a = sv_a;

                        if ((colors = hw_page_array[i].hp_colors) <= 1) {
                                continue;
                        }
                        while ((colors >> a) == 0)
                                a--;
                        if ((a << 4) > colorequivszc[i]) {
                                colorequivszc[i] = (a << 4);
                        }
                }
        }
}
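/*
 * A minimal user-space sketch (not compiled into this file) of the nibble
 * encoding computed by page_set_colorequiv_arr() above.  For a page size
 * with 'colors' colors and a power-of-two colorequiv, the number of
 * high-order color bits to ignore is log2(colorequiv) clamped to 15, then
 * reduced until at least one color bit remains, and finally stored in the
 * high 4 bits of the colorequivszc[] entry.  compute_equiv_nibble() is a
 * hypothetical restatement, not a kernel interface.
 */
#ifdef PAGELIST_SKETCH_COLOREQUIV
#include <stdio.h>

static unsigned char
compute_equiv_nibble(unsigned equiv, unsigned colors)
{
        unsigned a, bit;

        if (equiv <= 1 || colors <= 1)
                return (0);
        /* lowbit(equiv) - 1: log2 of the equivalency factor */
        for (bit = 0; ((equiv >> bit) & 1) == 0; bit++)
                ;
        a = (bit > 15) ? 15 : bit;
        while ((colors >> a) == 0)      /* keep at least one color */
                a--;
        return ((unsigned char)(a << 4));       /* high nibble: bits ignored */
}

int
main(void)
{
        /* e.g. colorequiv=4 collapses 64 colors into 16 equivalence classes */
        (void) printf("0x%x\n", compute_equiv_nibble(4, 64));   /* 0x20 */
        return (0);
}
#endif  /* PAGELIST_SKETCH_COLOREQUIV */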