1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 27 /* All Rights Reserved */ 28 29 /* 30 * Portions of this source code were derived from Berkeley 4.3 BSD 31 * under license from the Regents of the University of California. 32 */ 33 34 #pragma ident "%Z%%M% %I% %E% SMI" 35 36 /* 37 * This file contains common functions to access and manage the page lists. 38 * Many of these routines originated from platform dependent modules 39 * (sun4/vm/vm_dep.c, i86pc/vm/vm_machdep.c) and modified to function in 40 * a platform independent manner. 41 * 42 * vm/vm_dep.h provides for platform specific support. 43 */ 44 45 #include <sys/types.h> 46 #include <sys/debug.h> 47 #include <sys/cmn_err.h> 48 #include <sys/systm.h> 49 #include <sys/atomic.h> 50 #include <sys/sysmacros.h> 51 #include <vm/as.h> 52 #include <vm/page.h> 53 #include <vm/seg_kmem.h> 54 #include <vm/seg_vn.h> 55 #include <sys/vmsystm.h> 56 #include <sys/memnode.h> 57 #include <vm/vm_dep.h> 58 #include <sys/lgrp.h> 59 #include <sys/mem_config.h> 60 #include <sys/callb.h> 61 #include <sys/mem_cage.h> 62 #include <sys/sdt.h> 63 64 extern uint_t vac_colors; 65 66 #define MAX_PRAGMA_ALIGN 128 67 68 /* vm_cpu_data0 for the boot cpu before kmem is initialized */ 69 70 #if L2CACHE_ALIGN_MAX <= MAX_PRAGMA_ALIGN 71 #pragma align L2CACHE_ALIGN_MAX(vm_cpu_data0) 72 #else 73 #pragma align MAX_PRAGMA_ALIGN(vm_cpu_data0) 74 #endif 75 char vm_cpu_data0[VM_CPU_DATA_PADSIZE]; 76 77 /* 78 * number of page colors equivalent to reqested color in page_get routines. 79 * If set, keeps large pages intact longer and keeps MPO allocation 80 * from the local mnode in favor of acquiring the 'correct' page color from 81 * a demoted large page or from a remote mnode. 82 */ 83 uint_t colorequiv; 84 85 /* 86 * color equivalency mask for each page size. 87 * Mask is computed based on cpu L2$ way sizes and colorequiv global. 88 * High 4 bits determine the number of high order bits of the color to ignore. 89 * Low 4 bits determines number of low order bits of color to ignore (it's only 90 * relevant for hashed index based page coloring). 91 */ 92 uchar_t colorequivszc[MMU_PAGE_SIZES]; 93 94 /* 95 * if set, specifies the percentage of large pages that are free from within 96 * a large page region before attempting to lock those pages for 97 * page_get_contig_pages processing. 98 * 99 * Should be turned on when kpr is available when page_trylock_contig_pages 100 * can be more selective. 101 */ 102 103 int ptcpthreshold; 104 105 /* 106 * Limit page get contig page search based on failure cnts in pgcpfailcnt[]. 
107 * Enabled by default via pgcplimitsearch. 108 * 109 * pgcpfailcnt[] is bounded by PGCPFAILMAX (>= 1/2 of installed 110 * memory). When reached, pgcpfailcnt[] is reset to 1/2 of this upper 111 * bound. This upper bound range guarantees: 112 * - all large page 'slots' will be searched over time 113 * - the minimum (1) large page candidates considered on each pgcp call 114 * - count doesn't wrap around to 0 115 */ 116 pgcnt_t pgcpfailcnt[MMU_PAGE_SIZES]; 117 int pgcplimitsearch = 1; 118 119 #define PGCPFAILMAX (1 << (highbit(physinstalled) - 1)) 120 #define SETPGCPFAILCNT(szc) \ 121 if (++pgcpfailcnt[szc] >= PGCPFAILMAX) \ 122 pgcpfailcnt[szc] = PGCPFAILMAX / 2; 123 124 #ifdef VM_STATS 125 struct vmm_vmstats_str vmm_vmstats; 126 127 #endif /* VM_STATS */ 128 129 #if defined(__sparc) 130 #define LPGCREATE 0 131 #else 132 /* enable page_get_contig_pages */ 133 #define LPGCREATE 1 134 #endif 135 136 int pg_contig_disable; 137 int pg_lpgcreate_nocage = LPGCREATE; 138 139 /* 140 * page_freelist_split pfn flag to signify no hi pfn requirement. 141 */ 142 #define PFNNULL 0 143 144 /* Flags involved in promotion and demotion routines */ 145 #define PC_FREE 0x1 /* put page on freelist */ 146 #define PC_ALLOC 0x2 /* return page for allocation */ 147 148 /* 149 * Flag for page_demote to be used with PC_FREE to denote that we don't care 150 * what the color is as the color parameter to the function is ignored. 151 */ 152 #define PC_NO_COLOR (-1) 153 154 /* mtype value for page_promote to use when mtype does not matter */ 155 #define PC_MTYPE_ANY (-1) 156 157 /* 158 * page counters candidates info 159 * See page_ctrs_cands comment below for more details. 160 * fields are as follows: 161 * pcc_pages_free: # pages which freelist coalesce can create 162 * pcc_color_free: pointer to page free counts per color 163 */ 164 typedef struct pcc_info { 165 pgcnt_t pcc_pages_free; 166 pgcnt_t *pcc_color_free; 167 uint_t pad[12]; 168 } pcc_info_t; 169 170 /* 171 * On big machines it can take a long time to check page_counters 172 * arrays. page_ctrs_cands is a summary array whose elements are a dynamically 173 * updated sum of all elements of the corresponding page_counters arrays. 174 * page_freelist_coalesce() searches page_counters only if an appropriate 175 * element of page_ctrs_cands array is greater than 0. 176 * 177 * page_ctrs_cands is indexed by mutex (i), region (r), mnode (m), mrange (g) 178 */ 179 pcc_info_t **page_ctrs_cands[NPC_MUTEX][MMU_PAGE_SIZES]; 180 181 /* 182 * Return in val the total number of free pages which can be created 183 * for the given mnode (m), mrange (g), and region size (r) 184 */ 185 #define PGCTRS_CANDS_GETVALUE(m, g, r, val) { \ 186 int i; \ 187 val = 0; \ 188 for (i = 0; i < NPC_MUTEX; i++) { \ 189 val += page_ctrs_cands[i][(r)][(m)][(g)].pcc_pages_free; \ 190 } \ 191 } 192 193 /* 194 * Return in val the total number of free pages which can be created 195 * for the given mnode (m), mrange (g), region size (r), and color (c) 196 */ 197 #define PGCTRS_CANDS_GETVALUECOLOR(m, g, r, c, val) { \ 198 int i; \ 199 val = 0; \ 200 ASSERT((c) < PAGE_GET_PAGECOLORS(r)); \ 201 for (i = 0; i < NPC_MUTEX; i++) { \ 202 val += \ 203 page_ctrs_cands[i][(r)][(m)][(g)].pcc_color_free[(c)]; \ 204 } \ 205 } 206 207 /* 208 * We can only allow a single thread to update a counter within the physical 209 * range of the largest supported page size. That is the finest granularity 210 * possible since the counter values are dependent on each other 211 * as you move accross region sizes. 
PP_CTR_LOCK_INDX is used to determine the 212 * ctr_mutex lock index for a particular physical range. 213 */ 214 static kmutex_t *ctr_mutex[NPC_MUTEX]; 215 216 #define PP_CTR_LOCK_INDX(pp) \ 217 (((pp)->p_pagenum >> \ 218 (PAGE_BSZS_SHIFT(mmu_page_sizes - 1))) & (NPC_MUTEX - 1)) 219 220 #define INVALID_COLOR 0xffffffff 221 #define INVALID_MASK 0xffffffff 222 223 /* 224 * Local functions prototypes. 225 */ 226 227 void page_ctr_add(int, int, page_t *, int); 228 void page_ctr_add_internal(int, int, page_t *, int); 229 void page_ctr_sub(int, int, page_t *, int); 230 void page_ctr_sub_internal(int, int, page_t *, int); 231 void page_freelist_lock(int); 232 void page_freelist_unlock(int); 233 page_t *page_promote(int, pfn_t, uchar_t, int, int); 234 page_t *page_demote(int, pfn_t, uchar_t, uchar_t, int, int); 235 page_t *page_freelist_split(uchar_t, 236 uint_t, int, int, pfn_t, page_list_walker_t *); 237 page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int); 238 static int page_trylock_cons(page_t *pp, se_t se); 239 240 /* 241 * The page_counters array below is used to keep track of free contiguous 242 * physical memory. A hw_page_map_t will be allocated per mnode per szc. 243 * This contains an array of counters, the size of the array, a shift value 244 * used to convert a pagenum into a counter array index or vice versa, as 245 * well as a cache of the last successful index to be promoted to a larger 246 * page size. As an optimization, we keep track of the last successful index 247 * to be promoted per page color for the given size region, and this is 248 * allocated dynamically based upon the number of colors for a given 249 * region size. 250 * 251 * Conceptually, the page counters are represented as: 252 * 253 * page_counters[region_size][mnode] 254 * 255 * region_size: size code of a candidate larger page made up 256 * of contiguous free smaller pages. 257 * 258 * page_counters[region_size][mnode].hpm_counters[index]: 259 * represents how many (region_size - 1) pages either 260 * exist or can be created within the given index range. 261 * 262 * Let's look at a sparc example: 263 * If we want to create a free 512k page, we look at region_size 2 264 * for the mnode we want. We calculate the index and look at a specific 265 * hpm_counters location. If we see 8 (FULL_REGION_CNT on sparc) at 266 * this location, it means that 8 64k pages either exist or can be created 267 * from 8K pages in order to make a single free 512k page at the given 268 * index. Note that when a region is full, it will contribute to the 269 * counts in the region above it. Thus we will not know what page 270 * size the free pages will be which can be promoted to this new free 271 * page unless we look at all regions below the current region. 272 */ 273 274 /* 275 * Note: hpmctr_t is defined in platform vm_dep.h 276 * hw_page_map_t contains all the information needed for the page_counters 277 * logic. 
The fields are as follows: 278 * 279 * hpm_counters: dynamically allocated array to hold counter data 280 * hpm_entries: entries in hpm_counters 281 * hpm_shift: shift for pnum/array index conv 282 * hpm_base: PFN mapped to counter index 0 283 * hpm_color_current: last index in counter array for this color at 284 * which we successfully created a large page 285 */ 286 typedef struct hw_page_map { 287 hpmctr_t *hpm_counters; 288 size_t hpm_entries; 289 int hpm_shift; 290 pfn_t hpm_base; 291 size_t *hpm_color_current[MAX_MNODE_MRANGES]; 292 #if defined(__sparc) 293 uint_t pad[4]; 294 #endif 295 } hw_page_map_t; 296 297 /* 298 * Element zero is not used, but is allocated for convenience. 299 */ 300 static hw_page_map_t *page_counters[MMU_PAGE_SIZES]; 301 302 /* 303 * Cached value of MNODE_RANGE_CNT(mnode). 304 * This is a function call in x86. 305 */ 306 static int mnode_nranges[MAX_MEM_NODES]; 307 static int mnode_maxmrange[MAX_MEM_NODES]; 308 309 /* 310 * The following macros are convenient ways to get access to the individual 311 * elements of the page_counters arrays. They can be used on both 312 * the left side and right side of equations. 313 */ 314 #define PAGE_COUNTERS(mnode, rg_szc, idx) \ 315 (page_counters[(rg_szc)][(mnode)].hpm_counters[(idx)]) 316 317 #define PAGE_COUNTERS_COUNTERS(mnode, rg_szc) \ 318 (page_counters[(rg_szc)][(mnode)].hpm_counters) 319 320 #define PAGE_COUNTERS_SHIFT(mnode, rg_szc) \ 321 (page_counters[(rg_szc)][(mnode)].hpm_shift) 322 323 #define PAGE_COUNTERS_ENTRIES(mnode, rg_szc) \ 324 (page_counters[(rg_szc)][(mnode)].hpm_entries) 325 326 #define PAGE_COUNTERS_BASE(mnode, rg_szc) \ 327 (page_counters[(rg_szc)][(mnode)].hpm_base) 328 329 #define PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, rg_szc, g) \ 330 (page_counters[(rg_szc)][(mnode)].hpm_color_current[(g)]) 331 332 #define PAGE_COUNTERS_CURRENT_COLOR(mnode, rg_szc, color, mrange) \ 333 (page_counters[(rg_szc)][(mnode)]. \ 334 hpm_color_current[(mrange)][(color)]) 335 336 #define PNUM_TO_IDX(mnode, rg_szc, pnum) \ 337 (((pnum) - PAGE_COUNTERS_BASE((mnode), (rg_szc))) >> \ 338 PAGE_COUNTERS_SHIFT((mnode), (rg_szc))) 339 340 #define IDX_TO_PNUM(mnode, rg_szc, index) \ 341 (PAGE_COUNTERS_BASE((mnode), (rg_szc)) + \ 342 ((index) << PAGE_COUNTERS_SHIFT((mnode), (rg_szc)))) 343 344 /* 345 * Protects the hpm_counters and hpm_color_current memory from changing while 346 * looking at page counters information. 347 * Grab the write lock to modify what these fields point at. 348 * Grab the read lock to prevent any pointers from changing. 349 * The write lock can not be held during memory allocation due to a possible 350 * recursion deadlock with trying to grab the read lock while the 351 * write lock is already held. 352 */ 353 krwlock_t page_ctrs_rwlock[MAX_MEM_NODES]; 354 355 356 /* 357 * initialize cpu_vm_data to point at cache aligned vm_cpu_data_t. 358 */ 359 void 360 cpu_vm_data_init(struct cpu *cp) 361 { 362 if (cp == CPU0) { 363 cp->cpu_vm_data = (void *)&vm_cpu_data0; 364 } else { 365 void *kmptr; 366 int align; 367 size_t sz; 368 369 align = (L2CACHE_ALIGN) ? 
L2CACHE_ALIGN : L2CACHE_ALIGN_MAX; 370 sz = P2ROUNDUP(sizeof (vm_cpu_data_t), align) + align; 371 kmptr = kmem_zalloc(sz, KM_SLEEP); 372 cp->cpu_vm_data = (void *) P2ROUNDUP((uintptr_t)kmptr, align); 373 ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr = kmptr; 374 ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize = sz; 375 } 376 } 377 378 /* 379 * free cpu_vm_data 380 */ 381 void 382 cpu_vm_data_destroy(struct cpu *cp) 383 { 384 if (cp->cpu_seqid && cp->cpu_vm_data) { 385 ASSERT(cp != CPU0); 386 kmem_free(((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr, 387 ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize); 388 } 389 cp->cpu_vm_data = NULL; 390 } 391 392 393 /* 394 * page size to page size code 395 */ 396 int 397 page_szc(size_t pagesize) 398 { 399 int i = 0; 400 401 while (hw_page_array[i].hp_size) { 402 if (pagesize == hw_page_array[i].hp_size) 403 return (i); 404 i++; 405 } 406 return (-1); 407 } 408 409 /* 410 * page size to page size code with the restriction that it be a supported 411 * user page size. If it's not a supported user page size, -1 will be returned. 412 */ 413 int 414 page_szc_user_filtered(size_t pagesize) 415 { 416 int szc = page_szc(pagesize); 417 if ((szc != -1) && (SZC_2_USERSZC(szc) != -1)) { 418 return (szc); 419 } 420 return (-1); 421 } 422 423 /* 424 * Return how many page sizes are available for the user to use. This is 425 * what the hardware supports and not based upon how the OS implements the 426 * support of different page sizes. 427 * 428 * If legacy is non-zero, return the number of pagesizes available to legacy 429 * applications. The number of legacy page sizes might be less than the 430 * exported user page sizes. This is to prevent legacy applications that 431 * use the largest page size returned from getpagesizes(3c) from inadvertantly 432 * using the 'new' large pagesizes. 433 */ 434 uint_t 435 page_num_user_pagesizes(int legacy) 436 { 437 if (legacy) 438 return (mmu_legacy_page_sizes); 439 return (mmu_exported_page_sizes); 440 } 441 442 uint_t 443 page_num_pagesizes(void) 444 { 445 return (mmu_page_sizes); 446 } 447 448 /* 449 * returns the count of the number of base pagesize pages associated with szc 450 */ 451 pgcnt_t 452 page_get_pagecnt(uint_t szc) 453 { 454 if (szc >= mmu_page_sizes) 455 panic("page_get_pagecnt: out of range %d", szc); 456 return (hw_page_array[szc].hp_pgcnt); 457 } 458 459 size_t 460 page_get_pagesize(uint_t szc) 461 { 462 if (szc >= mmu_page_sizes) 463 panic("page_get_pagesize: out of range %d", szc); 464 return (hw_page_array[szc].hp_size); 465 } 466 467 /* 468 * Return the size of a page based upon the index passed in. An index of 469 * zero refers to the smallest page size in the system, and as index increases 470 * it refers to the next larger supported page size in the system. 471 * Note that szc and userszc may not be the same due to unsupported szc's on 472 * some systems. 
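 *
 * Illustrative mapping only (the real tables are platform specific):
 * if the hardware supports 8K, 64K, 512K and 4M pages but only 8K, 64K
 * and 4M are exported to applications, USERSZC_2_SZC() maps userszc 2
 * to internal szc 3.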
473 */ 474 size_t 475 page_get_user_pagesize(uint_t userszc) 476 { 477 uint_t szc = USERSZC_2_SZC(userszc); 478 479 if (szc >= mmu_page_sizes) 480 panic("page_get_user_pagesize: out of range %d", szc); 481 return (hw_page_array[szc].hp_size); 482 } 483 484 uint_t 485 page_get_shift(uint_t szc) 486 { 487 if (szc >= mmu_page_sizes) 488 panic("page_get_shift: out of range %d", szc); 489 return (PAGE_GET_SHIFT(szc)); 490 } 491 492 uint_t 493 page_get_pagecolors(uint_t szc) 494 { 495 if (szc >= mmu_page_sizes) 496 panic("page_get_pagecolors: out of range %d", szc); 497 return (PAGE_GET_PAGECOLORS(szc)); 498 } 499 500 /* 501 * this assigns the desired equivalent color after a split 502 */ 503 uint_t 504 page_correct_color(uchar_t szc, uchar_t nszc, uint_t color, 505 uint_t ncolor, uint_t ceq_mask) 506 { 507 ASSERT(nszc > szc); 508 ASSERT(szc < mmu_page_sizes); 509 ASSERT(color < PAGE_GET_PAGECOLORS(szc)); 510 ASSERT(ncolor < PAGE_GET_PAGECOLORS(nszc)); 511 512 color &= ceq_mask; 513 ncolor = PAGE_CONVERT_COLOR(ncolor, szc, nszc); 514 return (color | (ncolor & ~ceq_mask)); 515 } 516 517 /* 518 * The interleaved_mnodes flag is set when mnodes overlap in 519 * the physbase..physmax range, but have disjoint slices. 520 * In this case hpm_counters is shared by all mnodes. 521 * This flag is set dynamically by the platform. 522 */ 523 int interleaved_mnodes = 0; 524 525 /* 526 * Called by startup(). 527 * Size up the per page size free list counters based on physmax 528 * of each node and max_mem_nodes. 529 * 530 * If interleaved_mnodes is set we need to find the first mnode that 531 * exists. hpm_counters for the first mnode will then be shared by 532 * all other mnodes. If interleaved_mnodes is not set, just set 533 * first=mnode each time. That means there will be no sharing. 534 */ 535 size_t 536 page_ctrs_sz(void) 537 { 538 int r; /* region size */ 539 int mnode; 540 int firstmn; /* first mnode that exists */ 541 int nranges; 542 pfn_t physbase; 543 pfn_t physmax; 544 uint_t ctrs_sz = 0; 545 int i; 546 pgcnt_t colors_per_szc[MMU_PAGE_SIZES]; 547 548 /* 549 * We need to determine how many page colors there are for each 550 * page size in order to allocate memory for any color specific 551 * arrays. 552 */ 553 for (i = 0; i < mmu_page_sizes; i++) { 554 colors_per_szc[i] = PAGE_GET_PAGECOLORS(i); 555 } 556 557 for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) { 558 559 pgcnt_t r_pgcnt; 560 pfn_t r_base; 561 pgcnt_t r_align; 562 563 if (mem_node_config[mnode].exists == 0) 564 continue; 565 566 HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn); 567 nranges = MNODE_RANGE_CNT(mnode); 568 mnode_nranges[mnode] = nranges; 569 mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode); 570 571 /* 572 * determine size needed for page counter arrays with 573 * base aligned to large page size. 574 */ 575 for (r = 1; r < mmu_page_sizes; r++) { 576 /* add in space for hpm_color_current */ 577 ctrs_sz += sizeof (size_t) * 578 colors_per_szc[r] * nranges; 579 580 if (firstmn != mnode) 581 continue; 582 583 /* add in space for hpm_counters */ 584 r_align = page_get_pagecnt(r); 585 r_base = physbase; 586 r_base &= ~(r_align - 1); 587 r_pgcnt = howmany(physmax - r_base + 1, r_align); 588 589 /* 590 * Round up to always allocate on pointer sized 591 * boundaries. 
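 *
 * (Illustrative: if hpmctr_t is a single byte and r_pgcnt is 1001, the
 * 1001 bytes are padded up to the next multiple of sizeof (hpmctr_t *),
 * i.e. 1008 bytes on a 64-bit kernel.)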
592 */ 593 ctrs_sz += P2ROUNDUP((r_pgcnt * sizeof (hpmctr_t)), 594 sizeof (hpmctr_t *)); 595 } 596 } 597 598 for (r = 1; r < mmu_page_sizes; r++) { 599 ctrs_sz += (max_mem_nodes * sizeof (hw_page_map_t)); 600 } 601 602 /* add in space for page_ctrs_cands and pcc_color_free */ 603 ctrs_sz += sizeof (pcc_info_t *) * max_mem_nodes * 604 mmu_page_sizes * NPC_MUTEX; 605 606 for (mnode = 0; mnode < max_mem_nodes; mnode++) { 607 608 if (mem_node_config[mnode].exists == 0) 609 continue; 610 611 nranges = mnode_nranges[mnode]; 612 ctrs_sz += sizeof (pcc_info_t) * nranges * 613 mmu_page_sizes * NPC_MUTEX; 614 for (r = 1; r < mmu_page_sizes; r++) { 615 ctrs_sz += sizeof (pgcnt_t) * nranges * 616 colors_per_szc[r] * NPC_MUTEX; 617 } 618 } 619 620 /* ctr_mutex */ 621 ctrs_sz += (max_mem_nodes * NPC_MUTEX * sizeof (kmutex_t)); 622 623 /* size for page list counts */ 624 PLCNT_SZ(ctrs_sz); 625 626 /* 627 * add some slop for roundups. page_ctrs_alloc will roundup the start 628 * address of the counters to ecache_alignsize boundary for every 629 * memory node. 630 */ 631 return (ctrs_sz + max_mem_nodes * L2CACHE_ALIGN); 632 } 633 634 caddr_t 635 page_ctrs_alloc(caddr_t alloc_base) 636 { 637 int mnode; 638 int mrange, nranges; 639 int r; /* region size */ 640 int i; 641 int firstmn; /* first mnode that exists */ 642 pfn_t physbase; 643 pfn_t physmax; 644 pgcnt_t colors_per_szc[MMU_PAGE_SIZES]; 645 646 /* 647 * We need to determine how many page colors there are for each 648 * page size in order to allocate memory for any color specific 649 * arrays. 650 */ 651 for (i = 0; i < mmu_page_sizes; i++) { 652 colors_per_szc[i] = PAGE_GET_PAGECOLORS(i); 653 } 654 655 for (r = 1; r < mmu_page_sizes; r++) { 656 page_counters[r] = (hw_page_map_t *)alloc_base; 657 alloc_base += (max_mem_nodes * sizeof (hw_page_map_t)); 658 } 659 660 /* page_ctrs_cands and pcc_color_free array */ 661 for (i = 0; i < NPC_MUTEX; i++) { 662 for (r = 1; r < mmu_page_sizes; r++) { 663 664 page_ctrs_cands[i][r] = (pcc_info_t **)alloc_base; 665 alloc_base += sizeof (pcc_info_t *) * max_mem_nodes; 666 667 for (mnode = 0; mnode < max_mem_nodes; mnode++) { 668 pcc_info_t *pi; 669 670 if (mem_node_config[mnode].exists == 0) 671 continue; 672 673 nranges = mnode_nranges[mnode]; 674 675 pi = (pcc_info_t *)alloc_base; 676 alloc_base += sizeof (pcc_info_t) * nranges; 677 page_ctrs_cands[i][r][mnode] = pi; 678 679 for (mrange = 0; mrange < nranges; mrange++) { 680 pi->pcc_color_free = 681 (pgcnt_t *)alloc_base; 682 alloc_base += sizeof (pgcnt_t) * 683 colors_per_szc[r]; 684 pi++; 685 } 686 } 687 } 688 } 689 690 /* ctr_mutex */ 691 for (i = 0; i < NPC_MUTEX; i++) { 692 ctr_mutex[i] = (kmutex_t *)alloc_base; 693 alloc_base += (max_mem_nodes * sizeof (kmutex_t)); 694 } 695 696 /* initialize page list counts */ 697 PLCNT_INIT(alloc_base); 698 699 for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) { 700 701 pgcnt_t r_pgcnt; 702 pfn_t r_base; 703 pgcnt_t r_align; 704 int r_shift; 705 int nranges = mnode_nranges[mnode]; 706 707 if (mem_node_config[mnode].exists == 0) 708 continue; 709 710 HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn); 711 712 for (r = 1; r < mmu_page_sizes; r++) { 713 /* 714 * the page_counters base has to be aligned to the 715 * page count of page size code r otherwise the counts 716 * will cross large page boundaries. 
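 *
 * Illustrative example, assuming 8K base pages and a 4M size code r:
 * page_get_pagecnt(r) is 512, so r_base is physbase rounded down to a
 * 512-page boundary and r_pgcnt is the number of full or partial
 * 512-page regions between r_base and physmax.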
717 */ 718 r_align = page_get_pagecnt(r); 719 r_base = physbase; 720 /* base needs to be aligned - lower to aligned value */ 721 r_base &= ~(r_align - 1); 722 r_pgcnt = howmany(physmax - r_base + 1, r_align); 723 r_shift = PAGE_BSZS_SHIFT(r); 724 725 PAGE_COUNTERS_SHIFT(mnode, r) = r_shift; 726 PAGE_COUNTERS_ENTRIES(mnode, r) = r_pgcnt; 727 PAGE_COUNTERS_BASE(mnode, r) = r_base; 728 for (mrange = 0; mrange < nranges; mrange++) { 729 PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, 730 r, mrange) = (size_t *)alloc_base; 731 alloc_base += sizeof (size_t) * 732 colors_per_szc[r]; 733 } 734 for (i = 0; i < colors_per_szc[r]; i++) { 735 uint_t color_mask = colors_per_szc[r] - 1; 736 pfn_t pfnum = r_base; 737 size_t idx; 738 int mrange; 739 MEM_NODE_ITERATOR_DECL(it); 740 741 MEM_NODE_ITERATOR_INIT(pfnum, mnode, r, &it); 742 if (pfnum == (pfn_t)-1) { 743 idx = 0; 744 } else { 745 PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i, 746 color_mask, color_mask, &it); 747 idx = PNUM_TO_IDX(mnode, r, pfnum); 748 idx = (idx >= r_pgcnt) ? 0 : idx; 749 } 750 for (mrange = 0; mrange < nranges; mrange++) { 751 PAGE_COUNTERS_CURRENT_COLOR(mnode, 752 r, i, mrange) = idx; 753 } 754 } 755 756 /* hpm_counters may be shared by all mnodes */ 757 if (firstmn == mnode) { 758 PAGE_COUNTERS_COUNTERS(mnode, r) = 759 (hpmctr_t *)alloc_base; 760 alloc_base += 761 P2ROUNDUP((sizeof (hpmctr_t) * r_pgcnt), 762 sizeof (hpmctr_t *)); 763 } else { 764 PAGE_COUNTERS_COUNTERS(mnode, r) = 765 PAGE_COUNTERS_COUNTERS(firstmn, r); 766 } 767 768 /* 769 * Verify that PNUM_TO_IDX and IDX_TO_PNUM 770 * satisfy the identity requirement. 771 * We should be able to go from one to the other 772 * and get consistent values. 773 */ 774 ASSERT(PNUM_TO_IDX(mnode, r, 775 (IDX_TO_PNUM(mnode, r, 0))) == 0); 776 ASSERT(IDX_TO_PNUM(mnode, r, 777 (PNUM_TO_IDX(mnode, r, r_base))) == r_base); 778 } 779 /* 780 * Roundup the start address of the page_counters to 781 * cache aligned boundary for every memory node. 782 * page_ctrs_sz() has added some slop for these roundups. 783 */ 784 alloc_base = (caddr_t)P2ROUNDUP((uintptr_t)alloc_base, 785 L2CACHE_ALIGN); 786 } 787 788 /* Initialize other page counter specific data structures. */ 789 for (mnode = 0; mnode < MAX_MEM_NODES; mnode++) { 790 rw_init(&page_ctrs_rwlock[mnode], NULL, RW_DEFAULT, NULL); 791 } 792 793 return (alloc_base); 794 } 795 796 /* 797 * Functions to adjust region counters for each size free list. 798 * Caller is responsible to acquire the ctr_mutex lock if necessary and 799 * thus can be called during startup without locks. 800 */ 801 /* ARGSUSED */ 802 void 803 page_ctr_add_internal(int mnode, int mtype, page_t *pp, int flags) 804 { 805 ssize_t r; /* region size */ 806 ssize_t idx; 807 pfn_t pfnum; 808 int lckidx; 809 810 ASSERT(mnode == PP_2_MEM_NODE(pp)); 811 ASSERT(mtype == PP_2_MTYPE(pp)); 812 813 ASSERT(pp->p_szc < mmu_page_sizes); 814 815 PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags); 816 817 /* no counter update needed for largest page size */ 818 if (pp->p_szc >= mmu_page_sizes - 1) { 819 return; 820 } 821 822 r = pp->p_szc + 1; 823 pfnum = pp->p_pagenum; 824 lckidx = PP_CTR_LOCK_INDX(pp); 825 826 /* 827 * Increment the count of free pages for the current 828 * region. Continue looping up in region size incrementing 829 * count if the preceeding region is full. 
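 *
 * Illustrative walk-through, assuming FULL_REGION_CNT(r) == 8 as in
 * the sparc example near the top of this file: freeing the page that
 * completes a size-r region takes that region's counter to 8, so the
 * loop continues and also increments the counter of the enclosing
 * size-(r + 1) region, stopping at the first region that is still
 * short of being full.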
830 */ 831 while (r < mmu_page_sizes) { 832 idx = PNUM_TO_IDX(mnode, r, pfnum); 833 834 ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r)); 835 ASSERT(PAGE_COUNTERS(mnode, r, idx) < FULL_REGION_CNT(r)); 836 837 if (++PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r)) { 838 break; 839 } else { 840 int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r)); 841 pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode] 842 [MTYPE_2_MRANGE(mnode, root_mtype)]; 843 844 cand->pcc_pages_free++; 845 cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]++; 846 } 847 r++; 848 } 849 } 850 851 void 852 page_ctr_add(int mnode, int mtype, page_t *pp, int flags) 853 { 854 int lckidx = PP_CTR_LOCK_INDX(pp); 855 kmutex_t *lock = &ctr_mutex[lckidx][mnode]; 856 857 mutex_enter(lock); 858 page_ctr_add_internal(mnode, mtype, pp, flags); 859 mutex_exit(lock); 860 } 861 862 void 863 page_ctr_sub_internal(int mnode, int mtype, page_t *pp, int flags) 864 { 865 int lckidx; 866 ssize_t r; /* region size */ 867 ssize_t idx; 868 pfn_t pfnum; 869 870 ASSERT(mnode == PP_2_MEM_NODE(pp)); 871 ASSERT(mtype == PP_2_MTYPE(pp)); 872 873 ASSERT(pp->p_szc < mmu_page_sizes); 874 875 PLCNT_DECR(pp, mnode, mtype, pp->p_szc, flags); 876 877 /* no counter update needed for largest page size */ 878 if (pp->p_szc >= mmu_page_sizes - 1) { 879 return; 880 } 881 882 r = pp->p_szc + 1; 883 pfnum = pp->p_pagenum; 884 lckidx = PP_CTR_LOCK_INDX(pp); 885 886 /* 887 * Decrement the count of free pages for the current 888 * region. Continue looping up in region size decrementing 889 * count if the preceeding region was full. 890 */ 891 while (r < mmu_page_sizes) { 892 idx = PNUM_TO_IDX(mnode, r, pfnum); 893 894 ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r)); 895 ASSERT(PAGE_COUNTERS(mnode, r, idx) > 0); 896 897 if (--PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r) - 1) { 898 break; 899 } else { 900 int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r)); 901 pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode] 902 [MTYPE_2_MRANGE(mnode, root_mtype)]; 903 904 ASSERT(cand->pcc_pages_free != 0); 905 ASSERT(cand->pcc_color_free[PP_2_BIN_SZC(pp, r)] != 0); 906 907 cand->pcc_pages_free--; 908 cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]--; 909 } 910 r++; 911 } 912 } 913 914 void 915 page_ctr_sub(int mnode, int mtype, page_t *pp, int flags) 916 { 917 int lckidx = PP_CTR_LOCK_INDX(pp); 918 kmutex_t *lock = &ctr_mutex[lckidx][mnode]; 919 920 mutex_enter(lock); 921 page_ctr_sub_internal(mnode, mtype, pp, flags); 922 mutex_exit(lock); 923 } 924 925 /* 926 * Adjust page counters following a memory attach, since typically the 927 * size of the array needs to change, and the PFN to counter index 928 * mapping needs to change. 929 * 930 * It is possible this mnode did not exist at startup. In that case 931 * allocate pcc_info_t and pcc_color_free arrays. Also, allow for nranges 932 * to change (a theoretical possibility on x86), which means pcc_color_free 933 * arrays must be extended. 
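 *
 * In outline (see the code below): the new hpm_counters,
 * hpm_color_current and pcc_info_t/pcc_color_free arrays are
 * preallocated with KM_NOSLEEP, the write lock is then taken while the
 * new arrays are installed and the overlapping range of old counter
 * values is copied across, and the superseded allocations are freed
 * only after the lock is dropped. The same cleanup path also unwinds a
 * failed preallocation.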
934 */ 935 uint_t 936 page_ctrs_adjust(int mnode) 937 { 938 pgcnt_t npgs; 939 int r; /* region size */ 940 int i; 941 size_t pcsz, old_csz; 942 hpmctr_t *new_ctr, *old_ctr; 943 pfn_t oldbase, newbase; 944 pfn_t physbase, physmax; 945 size_t old_npgs; 946 hpmctr_t *ctr_cache[MMU_PAGE_SIZES]; 947 size_t size_cache[MMU_PAGE_SIZES]; 948 size_t *color_cache[MMU_PAGE_SIZES][MAX_MNODE_MRANGES]; 949 size_t *old_color_array[MAX_MNODE_MRANGES]; 950 pgcnt_t colors_per_szc[MMU_PAGE_SIZES]; 951 pcc_info_t **cands_cache; 952 pcc_info_t *old_pi, *pi; 953 pgcnt_t *pgcntp; 954 int nr, old_nranges, mrange, nranges = MNODE_RANGE_CNT(mnode); 955 int cands_cache_nranges; 956 int old_maxmrange, new_maxmrange; 957 int rc = 0; 958 959 cands_cache = kmem_zalloc(sizeof (pcc_info_t *) * NPC_MUTEX * 960 MMU_PAGE_SIZES, KM_NOSLEEP); 961 if (cands_cache == NULL) 962 return (ENOMEM); 963 964 i = -1; 965 HPM_COUNTERS_LIMITS(mnode, physbase, physmax, i); 966 967 newbase = physbase & ~PC_BASE_ALIGN_MASK; 968 npgs = roundup(physmax, PC_BASE_ALIGN) - newbase; 969 970 /* prepare to free non-null pointers on the way out */ 971 cands_cache_nranges = nranges; 972 bzero(ctr_cache, sizeof (ctr_cache)); 973 bzero(color_cache, sizeof (color_cache)); 974 975 /* 976 * We need to determine how many page colors there are for each 977 * page size in order to allocate memory for any color specific 978 * arrays. 979 */ 980 for (r = 0; r < mmu_page_sizes; r++) { 981 colors_per_szc[r] = PAGE_GET_PAGECOLORS(r); 982 } 983 984 /* 985 * Preallocate all of the new hpm_counters arrays as we can't 986 * hold the page_ctrs_rwlock as a writer and allocate memory. 987 * If we can't allocate all of the arrays, undo our work so far 988 * and return failure. 989 */ 990 for (r = 1; r < mmu_page_sizes; r++) { 991 pcsz = npgs >> PAGE_BSZS_SHIFT(r); 992 size_cache[r] = pcsz; 993 ctr_cache[r] = kmem_zalloc(pcsz * 994 sizeof (hpmctr_t), KM_NOSLEEP); 995 if (ctr_cache[r] == NULL) { 996 rc = ENOMEM; 997 goto cleanup; 998 } 999 } 1000 1001 /* 1002 * Preallocate all of the new color current arrays as we can't 1003 * hold the page_ctrs_rwlock as a writer and allocate memory. 1004 * If we can't allocate all of the arrays, undo our work so far 1005 * and return failure. 1006 */ 1007 for (r = 1; r < mmu_page_sizes; r++) { 1008 for (mrange = 0; mrange < nranges; mrange++) { 1009 color_cache[r][mrange] = kmem_zalloc(sizeof (size_t) * 1010 colors_per_szc[r], KM_NOSLEEP); 1011 if (color_cache[r][mrange] == NULL) { 1012 rc = ENOMEM; 1013 goto cleanup; 1014 } 1015 } 1016 } 1017 1018 /* 1019 * Preallocate all of the new pcc_info_t arrays as we can't 1020 * hold the page_ctrs_rwlock as a writer and allocate memory. 1021 * If we can't allocate all of the arrays, undo our work so far 1022 * and return failure. 1023 */ 1024 for (r = 1; r < mmu_page_sizes; r++) { 1025 for (i = 0; i < NPC_MUTEX; i++) { 1026 pi = kmem_zalloc(nranges * sizeof (pcc_info_t), 1027 KM_NOSLEEP); 1028 if (pi == NULL) { 1029 rc = ENOMEM; 1030 goto cleanup; 1031 } 1032 cands_cache[i * MMU_PAGE_SIZES + r] = pi; 1033 1034 for (mrange = 0; mrange < nranges; mrange++, pi++) { 1035 pgcntp = kmem_zalloc(colors_per_szc[r] * 1036 sizeof (pgcnt_t), KM_NOSLEEP); 1037 if (pgcntp == NULL) { 1038 rc = ENOMEM; 1039 goto cleanup; 1040 } 1041 pi->pcc_color_free = pgcntp; 1042 } 1043 } 1044 } 1045 1046 /* 1047 * Grab the write lock to prevent others from walking these arrays 1048 * while we are modifying them. 
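 *
 * (All memory was preallocated above because this lock cannot be held
 * across kmem allocations; see the page_ctrs_rwlock comment.)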
1049 */ 1050 PAGE_CTRS_WRITE_LOCK(mnode); 1051 1052 old_nranges = mnode_nranges[mnode]; 1053 cands_cache_nranges = old_nranges; 1054 mnode_nranges[mnode] = nranges; 1055 old_maxmrange = mnode_maxmrange[mnode]; 1056 mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode); 1057 new_maxmrange = mnode_maxmrange[mnode]; 1058 1059 for (r = 1; r < mmu_page_sizes; r++) { 1060 PAGE_COUNTERS_SHIFT(mnode, r) = PAGE_BSZS_SHIFT(r); 1061 old_ctr = PAGE_COUNTERS_COUNTERS(mnode, r); 1062 old_csz = PAGE_COUNTERS_ENTRIES(mnode, r); 1063 oldbase = PAGE_COUNTERS_BASE(mnode, r); 1064 old_npgs = old_csz << PAGE_COUNTERS_SHIFT(mnode, r); 1065 for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) { 1066 old_color_array[mrange] = 1067 PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, 1068 r, mrange); 1069 } 1070 1071 pcsz = npgs >> PAGE_COUNTERS_SHIFT(mnode, r); 1072 new_ctr = ctr_cache[r]; 1073 ctr_cache[r] = NULL; 1074 if (old_ctr != NULL && 1075 (oldbase + old_npgs > newbase) && 1076 (newbase + npgs > oldbase)) { 1077 /* 1078 * Map the intersection of the old and new 1079 * counters into the new array. 1080 */ 1081 size_t offset; 1082 if (newbase > oldbase) { 1083 offset = (newbase - oldbase) >> 1084 PAGE_COUNTERS_SHIFT(mnode, r); 1085 bcopy(old_ctr + offset, new_ctr, 1086 MIN(pcsz, (old_csz - offset)) * 1087 sizeof (hpmctr_t)); 1088 } else { 1089 offset = (oldbase - newbase) >> 1090 PAGE_COUNTERS_SHIFT(mnode, r); 1091 bcopy(old_ctr, new_ctr + offset, 1092 MIN(pcsz - offset, old_csz) * 1093 sizeof (hpmctr_t)); 1094 } 1095 } 1096 1097 PAGE_COUNTERS_COUNTERS(mnode, r) = new_ctr; 1098 PAGE_COUNTERS_ENTRIES(mnode, r) = pcsz; 1099 PAGE_COUNTERS_BASE(mnode, r) = newbase; 1100 1101 /* update shared hpm_counters in other mnodes */ 1102 if (interleaved_mnodes) { 1103 for (i = 0; i < max_mem_nodes; i++) { 1104 if (i == mnode) 1105 continue; 1106 if (mem_node_config[i].exists == 0) 1107 continue; 1108 ASSERT(PAGE_COUNTERS_COUNTERS(i, r) == old_ctr); 1109 PAGE_COUNTERS_COUNTERS(i, r) = new_ctr; 1110 PAGE_COUNTERS_ENTRIES(i, r) = pcsz; 1111 PAGE_COUNTERS_BASE(i, r) = newbase; 1112 } 1113 } 1114 1115 for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) { 1116 PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r, mrange) = 1117 color_cache[r][mrange]; 1118 color_cache[r][mrange] = NULL; 1119 } 1120 /* 1121 * for now, just reset on these events as it's probably 1122 * not worthwhile to try and optimize this. 1123 */ 1124 for (i = 0; i < colors_per_szc[r]; i++) { 1125 uint_t color_mask = colors_per_szc[r] - 1; 1126 int mlo = interleaved_mnodes ? 0 : mnode; 1127 int mhi = interleaved_mnodes ? max_mem_nodes : 1128 (mnode + 1); 1129 int m; 1130 pfn_t pfnum = newbase; 1131 size_t idx; 1132 MEM_NODE_ITERATOR_DECL(it); 1133 1134 for (m = mlo; m < mhi; m++) { 1135 if (mem_node_config[m].exists == 0) 1136 continue; 1137 MEM_NODE_ITERATOR_INIT(pfnum, m, r, &it); 1138 if (pfnum == (pfn_t)-1) { 1139 idx = 0; 1140 } else { 1141 PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i, 1142 color_mask, color_mask, &it); 1143 idx = PNUM_TO_IDX(m, r, pfnum); 1144 idx = (idx < pcsz) ? 
idx : 0; 1145 } 1146 for (mrange = 0; mrange < nranges; mrange++) { 1147 PAGE_COUNTERS_CURRENT_COLOR(m, 1148 r, i, mrange) = idx; 1149 } 1150 } 1151 } 1152 1153 /* cache info for freeing out of the critical path */ 1154 if ((caddr_t)old_ctr >= kernelheap && 1155 (caddr_t)old_ctr < ekernelheap) { 1156 ctr_cache[r] = old_ctr; 1157 size_cache[r] = old_csz; 1158 } 1159 for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) { 1160 size_t *tmp = old_color_array[mrange]; 1161 if ((caddr_t)tmp >= kernelheap && 1162 (caddr_t)tmp < ekernelheap) { 1163 color_cache[r][mrange] = tmp; 1164 } 1165 } 1166 /* 1167 * Verify that PNUM_TO_IDX and IDX_TO_PNUM 1168 * satisfy the identity requirement. 1169 * We should be able to go from one to the other 1170 * and get consistent values. 1171 */ 1172 ASSERT(PNUM_TO_IDX(mnode, r, 1173 (IDX_TO_PNUM(mnode, r, 0))) == 0); 1174 ASSERT(IDX_TO_PNUM(mnode, r, 1175 (PNUM_TO_IDX(mnode, r, newbase))) == newbase); 1176 1177 /* pcc_info_t and pcc_color_free */ 1178 for (i = 0; i < NPC_MUTEX; i++) { 1179 pcc_info_t *epi; 1180 pcc_info_t *eold_pi; 1181 1182 pi = cands_cache[i * MMU_PAGE_SIZES + r]; 1183 old_pi = page_ctrs_cands[i][r][mnode]; 1184 page_ctrs_cands[i][r][mnode] = pi; 1185 cands_cache[i * MMU_PAGE_SIZES + r] = old_pi; 1186 1187 /* preserve old pcc_color_free values, if any */ 1188 if (old_pi == NULL) 1189 continue; 1190 1191 /* 1192 * when/if x86 does DR, must account for 1193 * possible change in range index when 1194 * preserving pcc_info 1195 */ 1196 epi = &pi[nranges]; 1197 eold_pi = &old_pi[old_nranges]; 1198 if (new_maxmrange > old_maxmrange) { 1199 pi += new_maxmrange - old_maxmrange; 1200 } else if (new_maxmrange < old_maxmrange) { 1201 old_pi += old_maxmrange - new_maxmrange; 1202 } 1203 for (; pi < epi && old_pi < eold_pi; pi++, old_pi++) { 1204 pcc_info_t tmp = *pi; 1205 *pi = *old_pi; 1206 *old_pi = tmp; 1207 } 1208 } 1209 } 1210 PAGE_CTRS_WRITE_UNLOCK(mnode); 1211 1212 /* 1213 * Now that we have dropped the write lock, it is safe to free all 1214 * of the memory we have cached above. 1215 * We come thru here to free memory when pre-alloc fails, and also to 1216 * free old pointers which were recorded while locked. 
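 *
 * The kernelheap/ekernelheap range checks (in the caching code above
 * and in the loops below) skip pointers that were not kmem allocated,
 * e.g. the boot-time arrays carved out by page_ctrs_alloc(), since
 * those must not be passed to kmem_free().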
1217 */ 1218 cleanup: 1219 for (r = 1; r < mmu_page_sizes; r++) { 1220 if (ctr_cache[r] != NULL) { 1221 kmem_free(ctr_cache[r], 1222 size_cache[r] * sizeof (hpmctr_t)); 1223 } 1224 for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) { 1225 if (color_cache[r][mrange] != NULL) { 1226 kmem_free(color_cache[r][mrange], 1227 colors_per_szc[r] * sizeof (size_t)); 1228 } 1229 } 1230 for (i = 0; i < NPC_MUTEX; i++) { 1231 pi = cands_cache[i * MMU_PAGE_SIZES + r]; 1232 if (pi == NULL) 1233 continue; 1234 nr = cands_cache_nranges; 1235 for (mrange = 0; mrange < nr; mrange++, pi++) { 1236 pgcntp = pi->pcc_color_free; 1237 if (pgcntp == NULL) 1238 continue; 1239 if ((caddr_t)pgcntp >= kernelheap && 1240 (caddr_t)pgcntp < ekernelheap) { 1241 kmem_free(pgcntp, 1242 colors_per_szc[r] * 1243 sizeof (pgcnt_t)); 1244 } 1245 } 1246 pi = cands_cache[i * MMU_PAGE_SIZES + r]; 1247 if ((caddr_t)pi >= kernelheap && 1248 (caddr_t)pi < ekernelheap) { 1249 kmem_free(pi, nr * sizeof (pcc_info_t)); 1250 } 1251 } 1252 } 1253 1254 kmem_free(cands_cache, 1255 sizeof (pcc_info_t *) * NPC_MUTEX * MMU_PAGE_SIZES); 1256 return (rc); 1257 } 1258 1259 1260 #ifdef DEBUG 1261 1262 /* 1263 * confirm pp is a large page corresponding to szc 1264 */ 1265 void 1266 chk_lpg(page_t *pp, uchar_t szc) 1267 { 1268 spgcnt_t npgs = page_get_pagecnt(pp->p_szc); 1269 uint_t noreloc; 1270 1271 if (npgs == 1) { 1272 ASSERT(pp->p_szc == 0); 1273 ASSERT(pp->p_next == pp); 1274 ASSERT(pp->p_prev == pp); 1275 return; 1276 } 1277 1278 ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL); 1279 ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL); 1280 1281 ASSERT(IS_P2ALIGNED(pp->p_pagenum, npgs)); 1282 ASSERT(pp->p_pagenum == (pp->p_next->p_pagenum - 1)); 1283 ASSERT(pp->p_prev->p_pagenum == (pp->p_pagenum + (npgs - 1))); 1284 ASSERT(pp->p_prev == (pp + (npgs - 1))); 1285 1286 /* 1287 * Check list of pages. 1288 */ 1289 noreloc = PP_ISNORELOC(pp); 1290 while (npgs--) { 1291 if (npgs != 0) { 1292 ASSERT(pp->p_pagenum == pp->p_next->p_pagenum - 1); 1293 ASSERT(pp->p_next == (pp + 1)); 1294 } 1295 ASSERT(pp->p_szc == szc); 1296 ASSERT(PP_ISFREE(pp)); 1297 ASSERT(PP_ISAGED(pp)); 1298 ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL); 1299 ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL); 1300 ASSERT(pp->p_vnode == NULL); 1301 ASSERT(PP_ISNORELOC(pp) == noreloc); 1302 1303 pp = pp->p_next; 1304 } 1305 } 1306 #endif /* DEBUG */ 1307 1308 void 1309 page_freelist_lock(int mnode) 1310 { 1311 int i; 1312 for (i = 0; i < NPC_MUTEX; i++) { 1313 mutex_enter(FPC_MUTEX(mnode, i)); 1314 mutex_enter(CPC_MUTEX(mnode, i)); 1315 } 1316 } 1317 1318 void 1319 page_freelist_unlock(int mnode) 1320 { 1321 int i; 1322 for (i = 0; i < NPC_MUTEX; i++) { 1323 mutex_exit(FPC_MUTEX(mnode, i)); 1324 mutex_exit(CPC_MUTEX(mnode, i)); 1325 } 1326 } 1327 1328 /* 1329 * add pp to the specified page list. Defaults to head of the page list 1330 * unless PG_LIST_TAIL is specified. 1331 */ 1332 void 1333 page_list_add(page_t *pp, int flags) 1334 { 1335 page_t **ppp; 1336 kmutex_t *pcm; 1337 uint_t bin, mtype; 1338 int mnode; 1339 1340 ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT)); 1341 ASSERT(PP_ISFREE(pp)); 1342 ASSERT(!hat_page_is_mapped(pp)); 1343 ASSERT(hat_page_getshare(pp) == 0); 1344 1345 /* 1346 * Large pages should be freed via page_list_add_pages(). 1347 */ 1348 ASSERT(pp->p_szc == 0); 1349 1350 /* 1351 * Don't need to lock the freelist first here 1352 * because the page isn't on the freelist yet. 1353 * This means p_szc can't change on us. 
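 *
 * (p_szc is only changed by page_promote()/page_demote(), which
 * operate on pages that are already on the freelist; see the comment
 * in page_list_sub().)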
1354 */ 1355 1356 bin = PP_2_BIN(pp); 1357 mnode = PP_2_MEM_NODE(pp); 1358 mtype = PP_2_MTYPE(pp); 1359 1360 if (flags & PG_LIST_ISINIT) { 1361 /* 1362 * PG_LIST_ISINIT is set during system startup (ie. single 1363 * threaded), add a page to the free list and add to the 1364 * the free region counters w/o any locking 1365 */ 1366 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype); 1367 1368 /* inline version of page_add() */ 1369 if (*ppp != NULL) { 1370 pp->p_next = *ppp; 1371 pp->p_prev = (*ppp)->p_prev; 1372 (*ppp)->p_prev = pp; 1373 pp->p_prev->p_next = pp; 1374 } else 1375 *ppp = pp; 1376 1377 page_ctr_add_internal(mnode, mtype, pp, flags); 1378 VM_STAT_ADD(vmm_vmstats.pladd_free[0]); 1379 } else { 1380 pcm = PC_BIN_MUTEX(mnode, bin, flags); 1381 1382 if (flags & PG_FREE_LIST) { 1383 VM_STAT_ADD(vmm_vmstats.pladd_free[0]); 1384 ASSERT(PP_ISAGED(pp)); 1385 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype); 1386 1387 } else { 1388 VM_STAT_ADD(vmm_vmstats.pladd_cache); 1389 ASSERT(pp->p_vnode); 1390 ASSERT((pp->p_offset & PAGEOFFSET) == 0); 1391 ppp = &PAGE_CACHELISTS(mnode, bin, mtype); 1392 } 1393 mutex_enter(pcm); 1394 page_add(ppp, pp); 1395 1396 if (flags & PG_LIST_TAIL) 1397 *ppp = (*ppp)->p_next; 1398 /* 1399 * Add counters before releasing pcm mutex to avoid a race with 1400 * page_freelist_coalesce and page_freelist_split. 1401 */ 1402 page_ctr_add(mnode, mtype, pp, flags); 1403 mutex_exit(pcm); 1404 } 1405 1406 1407 #if defined(__sparc) 1408 if (PP_ISNORELOC(pp)) { 1409 kcage_freemem_add(1); 1410 } 1411 #endif 1412 /* 1413 * It is up to the caller to unlock the page! 1414 */ 1415 ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT)); 1416 } 1417 1418 1419 #ifdef __sparc 1420 /* 1421 * This routine is only used by kcage_init during system startup. 1422 * It performs the function of page_list_sub/PP_SETNORELOC/page_list_add 1423 * without the overhead of taking locks and updating counters. 1424 */ 1425 void 1426 page_list_noreloc_startup(page_t *pp) 1427 { 1428 page_t **ppp; 1429 uint_t bin; 1430 int mnode; 1431 int mtype; 1432 int flags = 0; 1433 1434 /* 1435 * If this is a large page on the freelist then 1436 * break it up into smaller pages. 1437 */ 1438 if (pp->p_szc != 0) 1439 page_boot_demote(pp); 1440 1441 /* 1442 * Get list page is currently on. 1443 */ 1444 bin = PP_2_BIN(pp); 1445 mnode = PP_2_MEM_NODE(pp); 1446 mtype = PP_2_MTYPE(pp); 1447 ASSERT(mtype == MTYPE_RELOC); 1448 ASSERT(pp->p_szc == 0); 1449 1450 if (PP_ISAGED(pp)) { 1451 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype); 1452 flags |= PG_FREE_LIST; 1453 } else { 1454 ppp = &PAGE_CACHELISTS(mnode, bin, mtype); 1455 flags |= PG_CACHE_LIST; 1456 } 1457 1458 ASSERT(*ppp != NULL); 1459 1460 /* 1461 * Delete page from current list. 1462 */ 1463 if (*ppp == pp) 1464 *ppp = pp->p_next; /* go to next page */ 1465 if (*ppp == pp) { 1466 *ppp = NULL; /* page list is gone */ 1467 } else { 1468 pp->p_prev->p_next = pp->p_next; 1469 pp->p_next->p_prev = pp->p_prev; 1470 } 1471 1472 /* 1473 * Decrement page counters 1474 */ 1475 page_ctr_sub_internal(mnode, mtype, pp, flags); 1476 1477 /* 1478 * Set no reloc for cage initted pages. 1479 */ 1480 PP_SETNORELOC(pp); 1481 1482 mtype = PP_2_MTYPE(pp); 1483 ASSERT(mtype == MTYPE_NORELOC); 1484 1485 /* 1486 * Get new list for page. 1487 */ 1488 if (PP_ISAGED(pp)) { 1489 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype); 1490 } else { 1491 ppp = &PAGE_CACHELISTS(mnode, bin, mtype); 1492 } 1493 1494 /* 1495 * Insert page on new list. 
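 *
 * (The lists are circular and doubly linked: an empty list is
 * *ppp == NULL; otherwise the page is linked in just before the head
 * *ppp, becoming its new p_prev, and *ppp itself is left unchanged.)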
1496 */ 1497 if (*ppp == NULL) { 1498 *ppp = pp; 1499 pp->p_next = pp->p_prev = pp; 1500 } else { 1501 pp->p_next = *ppp; 1502 pp->p_prev = (*ppp)->p_prev; 1503 (*ppp)->p_prev = pp; 1504 pp->p_prev->p_next = pp; 1505 } 1506 1507 /* 1508 * Increment page counters 1509 */ 1510 page_ctr_add_internal(mnode, mtype, pp, flags); 1511 1512 /* 1513 * Update cage freemem counter 1514 */ 1515 atomic_add_long(&kcage_freemem, 1); 1516 } 1517 #else /* __sparc */ 1518 1519 /* ARGSUSED */ 1520 void 1521 page_list_noreloc_startup(page_t *pp) 1522 { 1523 panic("page_list_noreloc_startup: should be here only for sparc"); 1524 } 1525 #endif 1526 1527 void 1528 page_list_add_pages(page_t *pp, int flags) 1529 { 1530 kmutex_t *pcm; 1531 pgcnt_t pgcnt; 1532 uint_t bin, mtype, i; 1533 int mnode; 1534 1535 /* default to freelist/head */ 1536 ASSERT((flags & (PG_CACHE_LIST | PG_LIST_TAIL)) == 0); 1537 1538 CHK_LPG(pp, pp->p_szc); 1539 VM_STAT_ADD(vmm_vmstats.pladd_free[pp->p_szc]); 1540 1541 bin = PP_2_BIN(pp); 1542 mnode = PP_2_MEM_NODE(pp); 1543 mtype = PP_2_MTYPE(pp); 1544 1545 if (flags & PG_LIST_ISINIT) { 1546 ASSERT(pp->p_szc == mmu_page_sizes - 1); 1547 page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp); 1548 ASSERT(!PP_ISNORELOC(pp)); 1549 PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags); 1550 } else { 1551 1552 ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes); 1553 1554 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST); 1555 1556 mutex_enter(pcm); 1557 page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp); 1558 page_ctr_add(mnode, mtype, pp, PG_FREE_LIST); 1559 mutex_exit(pcm); 1560 1561 pgcnt = page_get_pagecnt(pp->p_szc); 1562 #if defined(__sparc) 1563 if (PP_ISNORELOC(pp)) 1564 kcage_freemem_add(pgcnt); 1565 #endif 1566 for (i = 0; i < pgcnt; i++, pp++) 1567 page_unlock_nocapture(pp); 1568 } 1569 } 1570 1571 /* 1572 * During boot, need to demote a large page to base 1573 * pagesize pages for seg_kmem for use in boot_alloc() 1574 */ 1575 void 1576 page_boot_demote(page_t *pp) 1577 { 1578 ASSERT(pp->p_szc != 0); 1579 ASSERT(PP_ISFREE(pp)); 1580 ASSERT(PP_ISAGED(pp)); 1581 1582 (void) page_demote(PP_2_MEM_NODE(pp), 1583 PFN_BASE(pp->p_pagenum, pp->p_szc), pp->p_szc, 0, PC_NO_COLOR, 1584 PC_FREE); 1585 1586 ASSERT(PP_ISFREE(pp)); 1587 ASSERT(PP_ISAGED(pp)); 1588 ASSERT(pp->p_szc == 0); 1589 } 1590 1591 /* 1592 * Take a particular page off of whatever freelist the page 1593 * is claimed to be on. 1594 * 1595 * NOTE: Only used for PAGESIZE pages. 1596 */ 1597 void 1598 page_list_sub(page_t *pp, int flags) 1599 { 1600 int bin; 1601 uint_t mtype; 1602 int mnode; 1603 kmutex_t *pcm; 1604 page_t **ppp; 1605 1606 ASSERT(PAGE_EXCL(pp)); 1607 ASSERT(PP_ISFREE(pp)); 1608 1609 /* 1610 * The p_szc field can only be changed by page_promote() 1611 * and page_demote(). Only free pages can be promoted and 1612 * demoted and the free list MUST be locked during these 1613 * operations. So to prevent a race in page_list_sub() 1614 * between computing which bin of the freelist lock to 1615 * grab and actually grabing the lock we check again that 1616 * the bin we locked is still the correct one. Notice that 1617 * the p_szc field could have actually changed on us but 1618 * if the bin happens to still be the same we are safe. 
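 *
 * Concretely (an illustrative interleaving): after this thread
 * evaluates PP_2_BIN(pp) but before it acquires pcm, another thread
 * holding the freelist locks can promote or demote the page, changing
 * p_szc and hence the bin; the re-check of PP_2_BIN(pp) under pcm
 * below catches that case and retries.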
1619 */ 1620 try_again: 1621 bin = PP_2_BIN(pp); 1622 mnode = PP_2_MEM_NODE(pp); 1623 pcm = PC_BIN_MUTEX(mnode, bin, flags); 1624 mutex_enter(pcm); 1625 if (PP_2_BIN(pp) != bin) { 1626 mutex_exit(pcm); 1627 goto try_again; 1628 } 1629 mtype = PP_2_MTYPE(pp); 1630 1631 if (flags & PG_FREE_LIST) { 1632 VM_STAT_ADD(vmm_vmstats.plsub_free[0]); 1633 ASSERT(PP_ISAGED(pp)); 1634 ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype); 1635 } else { 1636 VM_STAT_ADD(vmm_vmstats.plsub_cache); 1637 ASSERT(!PP_ISAGED(pp)); 1638 ppp = &PAGE_CACHELISTS(mnode, bin, mtype); 1639 } 1640 1641 /* 1642 * Common PAGESIZE case. 1643 * 1644 * Note that we locked the freelist. This prevents 1645 * any page promotion/demotion operations. Therefore 1646 * the p_szc will not change until we drop pcm mutex. 1647 */ 1648 if (pp->p_szc == 0) { 1649 page_sub(ppp, pp); 1650 /* 1651 * Subtract counters before releasing pcm mutex 1652 * to avoid race with page_freelist_coalesce. 1653 */ 1654 page_ctr_sub(mnode, mtype, pp, flags); 1655 mutex_exit(pcm); 1656 1657 #if defined(__sparc) 1658 if (PP_ISNORELOC(pp)) { 1659 kcage_freemem_sub(1); 1660 } 1661 #endif 1662 return; 1663 } 1664 1665 /* 1666 * Large pages on the cache list are not supported. 1667 */ 1668 if (flags & PG_CACHE_LIST) 1669 panic("page_list_sub: large page on cachelist"); 1670 1671 /* 1672 * Slow but rare. 1673 * 1674 * Somebody wants this particular page which is part 1675 * of a large page. In this case we just demote the page 1676 * if it's on the freelist. 1677 * 1678 * We have to drop pcm before locking the entire freelist. 1679 * Once we have re-locked the freelist check to make sure 1680 * the page hasn't already been demoted or completely 1681 * freed. 1682 */ 1683 mutex_exit(pcm); 1684 page_freelist_lock(mnode); 1685 if (pp->p_szc != 0) { 1686 /* 1687 * Large page is on freelist. 1688 */ 1689 (void) page_demote(mnode, PFN_BASE(pp->p_pagenum, pp->p_szc), 1690 pp->p_szc, 0, PC_NO_COLOR, PC_FREE); 1691 } 1692 ASSERT(PP_ISFREE(pp)); 1693 ASSERT(PP_ISAGED(pp)); 1694 ASSERT(pp->p_szc == 0); 1695 1696 /* 1697 * Subtract counters before releasing pcm mutex 1698 * to avoid race with page_freelist_coalesce. 1699 */ 1700 bin = PP_2_BIN(pp); 1701 mtype = PP_2_MTYPE(pp); 1702 ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype); 1703 1704 page_sub(ppp, pp); 1705 page_ctr_sub(mnode, mtype, pp, flags); 1706 page_freelist_unlock(mnode); 1707 1708 #if defined(__sparc) 1709 if (PP_ISNORELOC(pp)) { 1710 kcage_freemem_sub(1); 1711 } 1712 #endif 1713 } 1714 1715 void 1716 page_list_sub_pages(page_t *pp, uint_t szc) 1717 { 1718 kmutex_t *pcm; 1719 uint_t bin, mtype; 1720 int mnode; 1721 1722 ASSERT(PAGE_EXCL(pp)); 1723 ASSERT(PP_ISFREE(pp)); 1724 ASSERT(PP_ISAGED(pp)); 1725 1726 /* 1727 * See comment in page_list_sub(). 1728 */ 1729 try_again: 1730 bin = PP_2_BIN(pp); 1731 mnode = PP_2_MEM_NODE(pp); 1732 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST); 1733 mutex_enter(pcm); 1734 if (PP_2_BIN(pp) != bin) { 1735 mutex_exit(pcm); 1736 goto try_again; 1737 } 1738 1739 /* 1740 * If we're called with a page larger than szc or it got 1741 * promoted above szc before we locked the freelist then 1742 * drop pcm and re-lock entire freelist. If page still larger 1743 * than szc then demote it. 
1744 */ 1745 if (pp->p_szc > szc) { 1746 mutex_exit(pcm); 1747 pcm = NULL; 1748 page_freelist_lock(mnode); 1749 if (pp->p_szc > szc) { 1750 VM_STAT_ADD(vmm_vmstats.plsubpages_szcbig); 1751 (void) page_demote(mnode, 1752 PFN_BASE(pp->p_pagenum, pp->p_szc), 1753 pp->p_szc, szc, PC_NO_COLOR, PC_FREE); 1754 } 1755 bin = PP_2_BIN(pp); 1756 } 1757 ASSERT(PP_ISFREE(pp)); 1758 ASSERT(PP_ISAGED(pp)); 1759 ASSERT(pp->p_szc <= szc); 1760 ASSERT(pp == PP_PAGEROOT(pp)); 1761 1762 VM_STAT_ADD(vmm_vmstats.plsub_free[pp->p_szc]); 1763 1764 mtype = PP_2_MTYPE(pp); 1765 if (pp->p_szc != 0) { 1766 page_vpsub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp); 1767 CHK_LPG(pp, pp->p_szc); 1768 } else { 1769 VM_STAT_ADD(vmm_vmstats.plsubpages_szc0); 1770 page_sub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp); 1771 } 1772 page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST); 1773 1774 if (pcm != NULL) { 1775 mutex_exit(pcm); 1776 } else { 1777 page_freelist_unlock(mnode); 1778 } 1779 1780 #if defined(__sparc) 1781 if (PP_ISNORELOC(pp)) { 1782 pgcnt_t pgcnt; 1783 1784 pgcnt = page_get_pagecnt(pp->p_szc); 1785 kcage_freemem_sub(pgcnt); 1786 } 1787 #endif 1788 } 1789 1790 /* 1791 * Add the page to the front of a linked list of pages 1792 * using the p_next & p_prev pointers for the list. 1793 * The caller is responsible for protecting the list pointers. 1794 */ 1795 void 1796 mach_page_add(page_t **ppp, page_t *pp) 1797 { 1798 if (*ppp == NULL) { 1799 pp->p_next = pp->p_prev = pp; 1800 } else { 1801 pp->p_next = *ppp; 1802 pp->p_prev = (*ppp)->p_prev; 1803 (*ppp)->p_prev = pp; 1804 pp->p_prev->p_next = pp; 1805 } 1806 *ppp = pp; 1807 } 1808 1809 /* 1810 * Remove this page from a linked list of pages 1811 * using the p_next & p_prev pointers for the list. 1812 * 1813 * The caller is responsible for protecting the list pointers. 1814 */ 1815 void 1816 mach_page_sub(page_t **ppp, page_t *pp) 1817 { 1818 ASSERT(PP_ISFREE(pp)); 1819 1820 if (*ppp == NULL || pp == NULL) 1821 panic("mach_page_sub"); 1822 1823 if (*ppp == pp) 1824 *ppp = pp->p_next; /* go to next page */ 1825 1826 if (*ppp == pp) 1827 *ppp = NULL; /* page list is gone */ 1828 else { 1829 pp->p_prev->p_next = pp->p_next; 1830 pp->p_next->p_prev = pp->p_prev; 1831 } 1832 pp->p_prev = pp->p_next = pp; /* make pp a list of one */ 1833 } 1834 1835 /* 1836 * Routine fsflush uses to gradually coalesce the free list into larger pages. 1837 */ 1838 void 1839 page_promote_size(page_t *pp, uint_t cur_szc) 1840 { 1841 pfn_t pfn; 1842 int mnode; 1843 int idx; 1844 int new_szc = cur_szc + 1; 1845 int full = FULL_REGION_CNT(new_szc); 1846 1847 pfn = page_pptonum(pp); 1848 mnode = PFN_2_MEM_NODE(pfn); 1849 1850 page_freelist_lock(mnode); 1851 1852 idx = PNUM_TO_IDX(mnode, new_szc, pfn); 1853 if (PAGE_COUNTERS(mnode, new_szc, idx) == full) 1854 (void) page_promote(mnode, pfn, new_szc, PC_FREE, PC_MTYPE_ANY); 1855 1856 page_freelist_unlock(mnode); 1857 } 1858 1859 static uint_t page_promote_err; 1860 static uint_t page_promote_noreloc_err; 1861 1862 /* 1863 * Create a single larger page (of szc new_szc) from smaller contiguous pages 1864 * for the given mnode starting at pfnum. Pages involved are on the freelist 1865 * before the call and may be returned to the caller if requested, otherwise 1866 * they will be placed back on the freelist. 1867 * If flags is PC_ALLOC, then the large page will be returned to the user in 1868 * a state which is consistent with a page being taken off the freelist. 
If 1869 * we failed to lock the new large page, then we will return NULL to the 1870 * caller and put the large page on the freelist instead. 1871 * If flags is PC_FREE, then the large page will be placed on the freelist, 1872 * and NULL will be returned. 1873 * The caller is responsible for locking the freelist as well as any other 1874 * accounting which needs to be done for a returned page. 1875 * 1876 * RFE: For performance pass in pp instead of pfnum so 1877 * we can avoid excessive calls to page_numtopp_nolock(). 1878 * This would depend on an assumption that all contiguous 1879 * pages are in the same memseg so we can just add/dec 1880 * our pp. 1881 * 1882 * Lock ordering: 1883 * 1884 * There is a potential but rare deadlock situation 1885 * for page promotion and demotion operations. The problem 1886 * is there are two paths into the freelist manager and 1887 * they have different lock orders: 1888 * 1889 * page_create() 1890 * lock freelist 1891 * page_lock(EXCL) 1892 * unlock freelist 1893 * return 1894 * caller drops page_lock 1895 * 1896 * page_free() and page_reclaim() 1897 * caller grabs page_lock(EXCL) 1898 * 1899 * lock freelist 1900 * unlock freelist 1901 * drop page_lock 1902 * 1903 * What prevents a thread in page_create() from deadlocking 1904 * with a thread freeing or reclaiming the same page is the 1905 * page_trylock() in page_get_freelist(). If the trylock fails 1906 * it skips the page. 1907 * 1908 * The lock ordering for promotion and demotion is the same as 1909 * for page_create(). Since the same deadlock could occur during 1910 * page promotion and freeing or reclaiming of a page on the 1911 * cache list we might have to fail the operation and undo what 1912 * have done so far. Again this is rare. 1913 */ 1914 page_t * 1915 page_promote(int mnode, pfn_t pfnum, uchar_t new_szc, int flags, int mtype) 1916 { 1917 page_t *pp, *pplist, *tpp, *start_pp; 1918 pgcnt_t new_npgs, npgs; 1919 uint_t bin; 1920 pgcnt_t tmpnpgs, pages_left; 1921 uint_t noreloc; 1922 int which_list; 1923 ulong_t index; 1924 kmutex_t *phm; 1925 1926 /* 1927 * General algorithm: 1928 * Find the starting page 1929 * Walk each page struct removing it from the freelist, 1930 * and linking it to all the other pages removed. 1931 * Once all pages are off the freelist, 1932 * walk the list, modifying p_szc to new_szc and what 1933 * ever other info needs to be done to create a large free page. 1934 * According to the flags, either return the page or put it 1935 * on the freelist. 1936 */ 1937 1938 start_pp = page_numtopp_nolock(pfnum); 1939 ASSERT(start_pp && (start_pp->p_pagenum == pfnum)); 1940 new_npgs = page_get_pagecnt(new_szc); 1941 ASSERT(IS_P2ALIGNED(pfnum, new_npgs)); 1942 1943 /* don't return page of the wrong mtype */ 1944 if (mtype != PC_MTYPE_ANY && mtype != PP_2_MTYPE(start_pp)) 1945 return (NULL); 1946 1947 /* 1948 * Loop through smaller pages to confirm that all pages 1949 * give the same result for PP_ISNORELOC(). 1950 * We can check this reliably here as the protocol for setting 1951 * P_NORELOC requires pages to be taken off the free list first. 1952 */ 1953 noreloc = PP_ISNORELOC(start_pp); 1954 for (pp = start_pp + new_npgs; --pp > start_pp; ) { 1955 if (noreloc != PP_ISNORELOC(pp)) { 1956 page_promote_noreloc_err++; 1957 page_promote_err++; 1958 return (NULL); 1959 } 1960 } 1961 1962 pages_left = new_npgs; 1963 pplist = NULL; 1964 pp = start_pp; 1965 1966 /* Loop around coalescing the smaller pages into a big page. 
*/ 1967 while (pages_left) { 1968 /* 1969 * Remove from the freelist. 1970 */ 1971 ASSERT(PP_ISFREE(pp)); 1972 bin = PP_2_BIN(pp); 1973 ASSERT(mnode == PP_2_MEM_NODE(pp)); 1974 mtype = PP_2_MTYPE(pp); 1975 if (PP_ISAGED(pp)) { 1976 1977 /* 1978 * PG_FREE_LIST 1979 */ 1980 if (pp->p_szc) { 1981 page_vpsub(&PAGE_FREELISTS(mnode, 1982 pp->p_szc, bin, mtype), pp); 1983 } else { 1984 mach_page_sub(&PAGE_FREELISTS(mnode, 0, 1985 bin, mtype), pp); 1986 } 1987 which_list = PG_FREE_LIST; 1988 } else { 1989 ASSERT(pp->p_szc == 0); 1990 1991 /* 1992 * PG_CACHE_LIST 1993 * 1994 * Since this page comes from the 1995 * cachelist, we must destroy the 1996 * vnode association. 1997 */ 1998 if (!page_trylock(pp, SE_EXCL)) { 1999 goto fail_promote; 2000 } 2001 2002 /* 2003 * We need to be careful not to deadlock 2004 * with another thread in page_lookup(). 2005 * The page_lookup() thread could be holding 2006 * the same phm that we need if the two 2007 * pages happen to hash to the same phm lock. 2008 * At this point we have locked the entire 2009 * freelist and page_lookup() could be trying 2010 * to grab a freelist lock. 2011 */ 2012 index = PAGE_HASH_FUNC(pp->p_vnode, pp->p_offset); 2013 phm = PAGE_HASH_MUTEX(index); 2014 if (!mutex_tryenter(phm)) { 2015 page_unlock_nocapture(pp); 2016 goto fail_promote; 2017 } 2018 2019 mach_page_sub(&PAGE_CACHELISTS(mnode, bin, mtype), pp); 2020 page_hashout(pp, phm); 2021 mutex_exit(phm); 2022 PP_SETAGED(pp); 2023 page_unlock_nocapture(pp); 2024 which_list = PG_CACHE_LIST; 2025 } 2026 page_ctr_sub(mnode, mtype, pp, which_list); 2027 2028 /* 2029 * Concatenate the smaller page(s) onto 2030 * the large page list. 2031 */ 2032 tmpnpgs = npgs = page_get_pagecnt(pp->p_szc); 2033 pages_left -= npgs; 2034 tpp = pp; 2035 while (npgs--) { 2036 tpp->p_szc = new_szc; 2037 tpp = tpp->p_next; 2038 } 2039 page_list_concat(&pplist, &pp); 2040 pp += tmpnpgs; 2041 } 2042 CHK_LPG(pplist, new_szc); 2043 2044 /* 2045 * return the page to the user if requested 2046 * in the properly locked state. 2047 */ 2048 if (flags == PC_ALLOC && (page_trylock_cons(pplist, SE_EXCL))) { 2049 return (pplist); 2050 } 2051 2052 /* 2053 * Otherwise place the new large page on the freelist 2054 */ 2055 bin = PP_2_BIN(pplist); 2056 mnode = PP_2_MEM_NODE(pplist); 2057 mtype = PP_2_MTYPE(pplist); 2058 page_vpadd(&PAGE_FREELISTS(mnode, new_szc, bin, mtype), pplist); 2059 2060 page_ctr_add(mnode, mtype, pplist, PG_FREE_LIST); 2061 return (NULL); 2062 2063 fail_promote: 2064 /* 2065 * A thread must have still been freeing or 2066 * reclaiming the page on the cachelist. 2067 * To prevent a deadlock undo what we have 2068 * done sofar and return failure. This 2069 * situation can only happen while promoting 2070 * PAGESIZE pages. 2071 */ 2072 page_promote_err++; 2073 while (pplist) { 2074 pp = pplist; 2075 mach_page_sub(&pplist, pp); 2076 pp->p_szc = 0; 2077 bin = PP_2_BIN(pp); 2078 mtype = PP_2_MTYPE(pp); 2079 mach_page_add(&PAGE_FREELISTS(mnode, 0, bin, mtype), pp); 2080 page_ctr_add(mnode, mtype, pp, PG_FREE_LIST); 2081 } 2082 return (NULL); 2083 2084 } 2085 2086 /* 2087 * Break up a large page into smaller size pages. 2088 * Pages involved are on the freelist before the call and may 2089 * be returned to the caller if requested, otherwise they will 2090 * be placed back on the freelist. 2091 * The caller is responsible for locking the freelist as well as any other 2092 * accounting which needs to be done for a returned page. 
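 *
 * As an illustration (the sizes here are hypothetical, not tied to any
 * particular platform): demoting a large page whose size corresponds to
 * 512 constituent PAGESIZE pages down to a size of 64 pages peels off
 * eight 64-page sublists, re-tags p_szc on every constituent page, and
 * returns each sublist to the new_szc freelist, except that at most one
 * sublist of the requested color may be locked and handed back to the
 * caller when flags is PC_ALLOC.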
2093 * If flags is not PC_ALLOC, the color argument is ignored, and thus 2094 * technically, any value may be passed in but PC_NO_COLOR is the standard 2095 * which should be followed for clarity's sake. 2096 */ 2097 page_t * 2098 page_demote(int mnode, pfn_t pfnum, uchar_t cur_szc, uchar_t new_szc, 2099 int color, int flags) 2100 { 2101 page_t *pp, *pplist, *npplist; 2102 pgcnt_t npgs, n; 2103 uint_t bin; 2104 uint_t mtype; 2105 page_t *ret_pp = NULL; 2106 2107 ASSERT(cur_szc != 0); 2108 ASSERT(new_szc < cur_szc); 2109 2110 pplist = page_numtopp_nolock(pfnum); 2111 ASSERT(pplist != NULL); 2112 2113 ASSERT(pplist->p_szc == cur_szc); 2114 2115 bin = PP_2_BIN(pplist); 2116 ASSERT(mnode == PP_2_MEM_NODE(pplist)); 2117 mtype = PP_2_MTYPE(pplist); 2118 page_vpsub(&PAGE_FREELISTS(mnode, cur_szc, bin, mtype), pplist); 2119 2120 CHK_LPG(pplist, cur_szc); 2121 page_ctr_sub(mnode, mtype, pplist, PG_FREE_LIST); 2122 2123 /* 2124 * Number of PAGESIZE pages for smaller new_szc 2125 * page. 2126 */ 2127 npgs = page_get_pagecnt(new_szc); 2128 2129 while (pplist) { 2130 pp = pplist; 2131 2132 ASSERT(pp->p_szc == cur_szc); 2133 2134 /* 2135 * We either break it up into PAGESIZE pages or larger. 2136 */ 2137 if (npgs == 1) { /* PAGESIZE case */ 2138 mach_page_sub(&pplist, pp); 2139 ASSERT(pp->p_szc == cur_szc); 2140 ASSERT(new_szc == 0); 2141 ASSERT(mnode == PP_2_MEM_NODE(pp)); 2142 pp->p_szc = new_szc; 2143 bin = PP_2_BIN(pp); 2144 if ((bin == color) && (flags == PC_ALLOC) && 2145 (ret_pp == NULL) && 2146 page_trylock_cons(pp, SE_EXCL)) { 2147 ret_pp = pp; 2148 } else { 2149 mtype = PP_2_MTYPE(pp); 2150 mach_page_add(&PAGE_FREELISTS(mnode, 0, bin, 2151 mtype), pp); 2152 page_ctr_add(mnode, mtype, pp, PG_FREE_LIST); 2153 } 2154 } else { 2155 2156 /* 2157 * Break down into smaller lists of pages. 2158 */ 2159 page_list_break(&pplist, &npplist, npgs); 2160 2161 pp = pplist; 2162 n = npgs; 2163 while (n--) { 2164 ASSERT(pp->p_szc == cur_szc); 2165 pp->p_szc = new_szc; 2166 pp = pp->p_next; 2167 } 2168 2169 CHK_LPG(pplist, new_szc); 2170 2171 bin = PP_2_BIN(pplist); 2172 ASSERT(mnode == PP_2_MEM_NODE(pp)); 2173 if ((bin == color) && (flags == PC_ALLOC) && 2174 (ret_pp == NULL) && 2175 page_trylock_cons(pp, SE_EXCL)) { 2176 ret_pp = pp; 2177 } else { 2178 mtype = PP_2_MTYPE(pp); 2179 page_vpadd(&PAGE_FREELISTS(mnode, new_szc, 2180 bin, mtype), pplist); 2181 2182 page_ctr_add(mnode, mtype, pplist, 2183 PG_FREE_LIST); 2184 } 2185 pplist = npplist; 2186 } 2187 } 2188 return (ret_pp); 2189 } 2190 2191 int mpss_coalesce_disable = 0; 2192 2193 /* 2194 * Coalesce free pages into a page of the given szc and color if possible. 2195 * Return the pointer to the page created, otherwise, return NULL. 2196 * 2197 * If pfnhi is non-zero, search for large page with pfn range less than pfnhi. 
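 *
 * A small worked example of the color arguments (all numbers here are
 * hypothetical): with 16 page colors for this szc, color_mask is 0xf.
 * If ceq_mask is 0xc, two colors are equivalent whenever they agree in
 * the bits of 0xc, so a request for color 0x5 is first reduced to 0x4
 * (color &= ceq_mask) and any of bins 0x4 through 0x7 can satisfy it;
 * the candidate scan below cycles through that equivalence class of
 * bins.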
2198 */ 2199 page_t * 2200 page_freelist_coalesce(int mnode, uchar_t szc, uint_t color, uint_t ceq_mask, 2201 int mtype, pfn_t pfnhi) 2202 { 2203 int r = szc; /* region size */ 2204 int mrange; 2205 uint_t full, bin, color_mask, wrap = 0; 2206 pfn_t pfnum, lo, hi; 2207 size_t len, idx, idx0; 2208 pgcnt_t cands = 0, szcpgcnt = page_get_pagecnt(szc); 2209 page_t *ret_pp; 2210 MEM_NODE_ITERATOR_DECL(it); 2211 #if defined(__sparc) 2212 pfn_t pfnum0, nlo, nhi; 2213 #endif 2214 2215 if (mpss_coalesce_disable) { 2216 ASSERT(szc < MMU_PAGE_SIZES); 2217 VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[szc][0]); 2218 return (NULL); 2219 } 2220 2221 ASSERT(szc < mmu_page_sizes); 2222 color_mask = PAGE_GET_PAGECOLORS(szc) - 1; 2223 ASSERT(ceq_mask <= color_mask); 2224 ASSERT(color <= color_mask); 2225 color &= ceq_mask; 2226 2227 /* Prevent page_counters dynamic memory from being freed */ 2228 rw_enter(&page_ctrs_rwlock[mnode], RW_READER); 2229 2230 mrange = MTYPE_2_MRANGE(mnode, mtype); 2231 ASSERT(mrange < mnode_nranges[mnode]); 2232 VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[r][mrange]); 2233 2234 /* get pfn range for mtype */ 2235 len = PAGE_COUNTERS_ENTRIES(mnode, r); 2236 #if defined(__sparc) 2237 lo = PAGE_COUNTERS_BASE(mnode, r); 2238 hi = IDX_TO_PNUM(mnode, r, len); 2239 #else 2240 MNODETYPE_2_PFN(mnode, mtype, lo, hi); 2241 hi++; 2242 #endif 2243 2244 /* use lower limit if given */ 2245 if (pfnhi != PFNNULL && pfnhi < hi) 2246 hi = pfnhi; 2247 2248 /* round to szcpgcnt boundaries */ 2249 lo = P2ROUNDUP(lo, szcpgcnt); 2250 MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it); 2251 if (lo == (pfn_t)-1) { 2252 rw_exit(&page_ctrs_rwlock[mnode]); 2253 return (NULL); 2254 } 2255 hi = hi & ~(szcpgcnt - 1); 2256 2257 /* set lo to the closest pfn of the right color */ 2258 if (((PFN_2_COLOR(lo, szc, &it) ^ color) & ceq_mask) || 2259 (interleaved_mnodes && PFN_2_MEM_NODE(lo) != mnode)) { 2260 PAGE_NEXT_PFN_FOR_COLOR(lo, szc, color, ceq_mask, color_mask, 2261 &it); 2262 } 2263 2264 if (hi <= lo) { 2265 rw_exit(&page_ctrs_rwlock[mnode]); 2266 return (NULL); 2267 } 2268 2269 full = FULL_REGION_CNT(r); 2270 2271 /* calculate the number of page candidates and initial search index */ 2272 bin = color; 2273 idx0 = (size_t)(-1); 2274 do { 2275 pgcnt_t acand; 2276 2277 PGCTRS_CANDS_GETVALUECOLOR(mnode, mrange, r, bin, acand); 2278 if (acand) { 2279 idx = PAGE_COUNTERS_CURRENT_COLOR(mnode, 2280 r, bin, mrange); 2281 idx0 = MIN(idx0, idx); 2282 cands += acand; 2283 } 2284 bin = ADD_MASKED(bin, 1, ceq_mask, color_mask); 2285 } while (bin != color); 2286 2287 if (cands == 0) { 2288 VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip[r][mrange]); 2289 rw_exit(&page_ctrs_rwlock[mnode]); 2290 return (NULL); 2291 } 2292 2293 pfnum = IDX_TO_PNUM(mnode, r, idx0); 2294 if (pfnum < lo || pfnum >= hi) { 2295 pfnum = lo; 2296 } else { 2297 MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it); 2298 if (pfnum == (pfn_t)-1) { 2299 pfnum = lo; 2300 MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it); 2301 ASSERT(pfnum != (pfn_t)-1); 2302 } else if ((PFN_2_COLOR(pfnum, szc, &it) ^ color) & ceq_mask || 2303 (interleaved_mnodes && PFN_2_MEM_NODE(pfnum) != mnode)) { 2304 /* invalid color, get the closest correct pfn */ 2305 PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask, 2306 color_mask, &it); 2307 if (pfnum >= hi) { 2308 pfnum = lo; 2309 MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it); 2310 } 2311 } 2312 } 2313 2314 /* set starting index */ 2315 idx0 = PNUM_TO_IDX(mnode, r, pfnum); 2316 ASSERT(idx0 < len); 2317 2318 #if defined(__sparc) 2319 pfnum0 = pfnum; 
/* page corresponding to idx0 */ 2320 nhi = 0; /* search kcage ranges */ 2321 #endif 2322 2323 for (idx = idx0; wrap == 0 || (idx < idx0 && wrap < 2); ) { 2324 2325 #if defined(__sparc) 2326 /* 2327 * Find lowest intersection of kcage ranges and mnode. 2328 * MTYPE_NORELOC means look in the cage, otherwise outside. 2329 */ 2330 if (nhi <= pfnum) { 2331 if (kcage_next_range(mtype == MTYPE_NORELOC, pfnum, 2332 (wrap == 0 ? hi : pfnum0), &nlo, &nhi)) 2333 goto wrapit; 2334 2335 /* jump to the next page in the range */ 2336 if (pfnum < nlo) { 2337 pfnum = P2ROUNDUP(nlo, szcpgcnt); 2338 MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it); 2339 idx = PNUM_TO_IDX(mnode, r, pfnum); 2340 if (idx >= len || pfnum >= hi) 2341 goto wrapit; 2342 if ((PFN_2_COLOR(pfnum, szc, &it) ^ color) & 2343 ceq_mask) 2344 goto next; 2345 if (interleaved_mnodes && 2346 PFN_2_MEM_NODE(pfnum) != mnode) 2347 goto next; 2348 } 2349 } 2350 #endif 2351 2352 if (PAGE_COUNTERS(mnode, r, idx) != full) 2353 goto next; 2354 2355 /* 2356 * RFE: For performance maybe we can do something less 2357 * brutal than locking the entire freelist. So far 2358 * this doesn't seem to be a performance problem? 2359 */ 2360 page_freelist_lock(mnode); 2361 if (PAGE_COUNTERS(mnode, r, idx) == full) { 2362 ret_pp = 2363 page_promote(mnode, pfnum, r, PC_ALLOC, mtype); 2364 if (ret_pp != NULL) { 2365 VM_STAT_ADD(vmm_vmstats.pfc_coalok[r][mrange]); 2366 PAGE_COUNTERS_CURRENT_COLOR(mnode, r, 2367 PFN_2_COLOR(pfnum, szc, &it), mrange) = idx; 2368 page_freelist_unlock(mnode); 2369 rw_exit(&page_ctrs_rwlock[mnode]); 2370 #if defined(__sparc) 2371 if (PP_ISNORELOC(ret_pp)) { 2372 pgcnt_t npgs; 2373 2374 npgs = page_get_pagecnt(ret_pp->p_szc); 2375 kcage_freemem_sub(npgs); 2376 } 2377 #endif 2378 return (ret_pp); 2379 } 2380 } else { 2381 VM_STAT_ADD(vmm_vmstats.page_ctrs_changed[r][mrange]); 2382 } 2383 2384 page_freelist_unlock(mnode); 2385 /* 2386 * No point looking for another page if we've 2387 * already tried all of the ones that 2388 * page_ctr_cands indicated. Stash off where we left 2389 * off. 2390 * Note: this is not exact since we don't hold the 2391 * page_freelist_locks before we initially get the 2392 * value of cands for performance reasons, but should 2393 * be a decent approximation. 2394 */ 2395 if (--cands == 0) { 2396 PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color, mrange) = 2397 idx; 2398 break; 2399 } 2400 next: 2401 PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask, 2402 color_mask, &it); 2403 idx = PNUM_TO_IDX(mnode, r, pfnum); 2404 if (idx >= len || pfnum >= hi) { 2405 wrapit: 2406 pfnum = lo; 2407 MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it); 2408 idx = PNUM_TO_IDX(mnode, r, pfnum); 2409 wrap++; 2410 #if defined(__sparc) 2411 nhi = 0; /* search kcage ranges */ 2412 #endif 2413 } 2414 } 2415 2416 rw_exit(&page_ctrs_rwlock[mnode]); 2417 VM_STAT_ADD(vmm_vmstats.page_ctrs_failed[r][mrange]); 2418 return (NULL); 2419 } 2420 2421 /* 2422 * For the given mnode, promote as many small pages to large pages as possible. 2423 * mnode can be -1, which means do them all 2424 */ 2425 void 2426 page_freelist_coalesce_all(int mnode) 2427 { 2428 int r; /* region size */ 2429 int idx, full; 2430 size_t len; 2431 int doall = interleaved_mnodes || mnode < 0; 2432 int mlo = doall ? 0 : mnode; 2433 int mhi = doall ? max_mem_nodes : (mnode + 1); 2434 2435 VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce_all); 2436 2437 if (mpss_coalesce_disable) { 2438 return; 2439 } 2440 2441 /* 2442 * Lock the entire freelist and coalesce what we can. 
2443 * 2444 * Always promote to the largest page possible 2445 * first to reduce the number of page promotions. 2446 */ 2447 for (mnode = mlo; mnode < mhi; mnode++) { 2448 rw_enter(&page_ctrs_rwlock[mnode], RW_READER); 2449 page_freelist_lock(mnode); 2450 } 2451 for (r = mmu_page_sizes - 1; r > 0; r--) { 2452 for (mnode = mlo; mnode < mhi; mnode++) { 2453 pgcnt_t cands = 0; 2454 int mrange, nranges = mnode_nranges[mnode]; 2455 2456 for (mrange = 0; mrange < nranges; mrange++) { 2457 PGCTRS_CANDS_GETVALUE(mnode, mrange, r, cands); 2458 if (cands != 0) 2459 break; 2460 } 2461 if (cands == 0) { 2462 VM_STAT_ADD(vmm_vmstats. 2463 page_ctrs_cands_skip_all); 2464 continue; 2465 } 2466 2467 full = FULL_REGION_CNT(r); 2468 len = PAGE_COUNTERS_ENTRIES(mnode, r); 2469 2470 for (idx = 0; idx < len; idx++) { 2471 if (PAGE_COUNTERS(mnode, r, idx) == full) { 2472 pfn_t pfnum = 2473 IDX_TO_PNUM(mnode, r, idx); 2474 int tmnode = interleaved_mnodes ? 2475 PFN_2_MEM_NODE(pfnum) : mnode; 2476 2477 ASSERT(pfnum >= 2478 mem_node_config[tmnode].physbase && 2479 pfnum < 2480 mem_node_config[tmnode].physmax); 2481 2482 (void) page_promote(tmnode, 2483 pfnum, r, PC_FREE, PC_MTYPE_ANY); 2484 } 2485 } 2486 /* shared hpm_counters covers all mnodes, so we quit */ 2487 if (interleaved_mnodes) 2488 break; 2489 } 2490 } 2491 for (mnode = mlo; mnode < mhi; mnode++) { 2492 page_freelist_unlock(mnode); 2493 rw_exit(&page_ctrs_rwlock[mnode]); 2494 } 2495 } 2496 2497 /* 2498 * This is where all polices for moving pages around 2499 * to different page size free lists is implemented. 2500 * Returns 1 on success, 0 on failure. 2501 * 2502 * So far these are the priorities for this algorithm in descending 2503 * order: 2504 * 2505 * 1) When servicing a request try to do so with a free page 2506 * from next size up. Helps defer fragmentation as long 2507 * as possible. 2508 * 2509 * 2) Page coalesce on demand. Only when a freelist 2510 * larger than PAGESIZE is empty and step 1 2511 * will not work since all larger size lists are 2512 * also empty. 2513 * 2514 * If pfnhi is non-zero, search for large page with pfn range less than pfnhi. 2515 */ 2516 2517 page_t * 2518 page_freelist_split(uchar_t szc, uint_t color, int mnode, int mtype, 2519 pfn_t pfnhi, page_list_walker_t *plw) 2520 { 2521 uchar_t nszc = szc + 1; 2522 uint_t bin, sbin, bin_prev; 2523 page_t *pp, *firstpp; 2524 page_t *ret_pp = NULL; 2525 uint_t color_mask; 2526 2527 if (nszc == mmu_page_sizes) 2528 return (NULL); 2529 2530 ASSERT(nszc < mmu_page_sizes); 2531 color_mask = PAGE_GET_PAGECOLORS(nszc) - 1; 2532 bin = sbin = PAGE_GET_NSZ_COLOR(szc, color); 2533 bin_prev = (plw->plw_bin_split_prev == color) ? INVALID_COLOR : 2534 PAGE_GET_NSZ_COLOR(szc, plw->plw_bin_split_prev); 2535 2536 VM_STAT_ADD(vmm_vmstats.pfs_req[szc]); 2537 /* 2538 * First try to break up a larger page to fill current size freelist. 2539 */ 2540 while (plw->plw_bins[nszc] != 0) { 2541 2542 ASSERT(nszc < mmu_page_sizes); 2543 2544 /* 2545 * If page found then demote it. 2546 */ 2547 if (PAGE_FREELISTS(mnode, nszc, bin, mtype)) { 2548 page_freelist_lock(mnode); 2549 firstpp = pp = PAGE_FREELISTS(mnode, nszc, bin, mtype); 2550 2551 /* 2552 * If pfnhi is not PFNNULL, look for large page below 2553 * pfnhi. PFNNULL signifies no pfn requirement. 
2554 */ 2555 if (pfnhi != PFNNULL && pp->p_pagenum >= pfnhi) { 2556 do { 2557 pp = pp->p_vpnext; 2558 if (pp == firstpp) { 2559 pp = NULL; 2560 break; 2561 } 2562 } while (pp->p_pagenum >= pfnhi); 2563 } 2564 if (pp) { 2565 uint_t ccolor = page_correct_color(szc, nszc, 2566 color, bin, plw->plw_ceq_mask[szc]); 2567 2568 ASSERT(pp->p_szc == nszc); 2569 VM_STAT_ADD(vmm_vmstats.pfs_demote[nszc]); 2570 ret_pp = page_demote(mnode, pp->p_pagenum, 2571 pp->p_szc, szc, ccolor, PC_ALLOC); 2572 if (ret_pp) { 2573 page_freelist_unlock(mnode); 2574 #if defined(__sparc) 2575 if (PP_ISNORELOC(ret_pp)) { 2576 pgcnt_t npgs; 2577 2578 npgs = page_get_pagecnt( 2579 ret_pp->p_szc); 2580 kcage_freemem_sub(npgs); 2581 } 2582 #endif 2583 return (ret_pp); 2584 } 2585 } 2586 page_freelist_unlock(mnode); 2587 } 2588 2589 /* loop through next size bins */ 2590 bin = ADD_MASKED(bin, 1, plw->plw_ceq_mask[nszc], color_mask); 2591 plw->plw_bins[nszc]--; 2592 2593 if (bin == sbin) { 2594 uchar_t nnszc = nszc + 1; 2595 2596 /* we are done with this page size - check next */ 2597 if (plw->plw_bins[nnszc] == 0) 2598 /* we have already checked next size bins */ 2599 break; 2600 2601 bin = sbin = PAGE_GET_NSZ_COLOR(nszc, bin); 2602 if (bin_prev != INVALID_COLOR) { 2603 bin_prev = PAGE_GET_NSZ_COLOR(nszc, bin_prev); 2604 if (!((bin ^ bin_prev) & 2605 plw->plw_ceq_mask[nnszc])) 2606 break; 2607 } 2608 ASSERT(nnszc < mmu_page_sizes); 2609 color_mask = PAGE_GET_PAGECOLORS(nnszc) - 1; 2610 nszc = nnszc; 2611 ASSERT(nszc < mmu_page_sizes); 2612 } 2613 } 2614 2615 return (ret_pp); 2616 } 2617 2618 /* 2619 * Helper routine used only by the freelist code to lock 2620 * a page. If the page is a large page then it succeeds in 2621 * locking all the constituent pages or none at all. 2622 * Returns 1 on sucess, 0 on failure. 2623 */ 2624 static int 2625 page_trylock_cons(page_t *pp, se_t se) 2626 { 2627 page_t *tpp, *first_pp = pp; 2628 2629 /* 2630 * Fail if can't lock first or only page. 2631 */ 2632 if (!page_trylock(pp, se)) { 2633 return (0); 2634 } 2635 2636 /* 2637 * PAGESIZE: common case. 2638 */ 2639 if (pp->p_szc == 0) { 2640 return (1); 2641 } 2642 2643 /* 2644 * Large page case. 2645 */ 2646 tpp = pp->p_next; 2647 while (tpp != pp) { 2648 if (!page_trylock(tpp, se)) { 2649 /* 2650 * On failure unlock what we have locked so far. 2651 * We want to avoid attempting to capture these 2652 * pages as the pcm mutex may be held which could 2653 * lead to a recursive mutex panic. 2654 */ 2655 while (first_pp != tpp) { 2656 page_unlock_nocapture(first_pp); 2657 first_pp = first_pp->p_next; 2658 } 2659 return (0); 2660 } 2661 tpp = tpp->p_next; 2662 } 2663 return (1); 2664 } 2665 2666 /* 2667 * init context for walking page lists 2668 * Called when a page of the given szc in unavailable. Sets markers 2669 * for the beginning of the search to detect when search has 2670 * completed a full cycle. Sets flags for splitting larger pages 2671 * and coalescing smaller pages. Page walking procedes until a page 2672 * of the desired equivalent color is found. 2673 */ 2674 void 2675 page_list_walk_init(uchar_t szc, uint_t flags, uint_t bin, int can_split, 2676 int use_ceq, page_list_walker_t *plw) 2677 { 2678 uint_t nszc, ceq_mask, colors; 2679 uchar_t ceq = use_ceq ? 
colorequivszc[szc] : 0; 2680 2681 ASSERT(szc < mmu_page_sizes); 2682 colors = PAGE_GET_PAGECOLORS(szc); 2683 2684 plw->plw_colors = colors; 2685 plw->plw_color_mask = colors - 1; 2686 plw->plw_bin_marker = plw->plw_bin0 = bin; 2687 plw->plw_bin_split_prev = bin; 2688 plw->plw_bin_step = (szc == 0) ? vac_colors : 1; 2689 2690 /* 2691 * if vac aliasing is possible make sure lower order color 2692 * bits are never ignored 2693 */ 2694 if (vac_colors > 1) 2695 ceq &= 0xf0; 2696 2697 /* 2698 * calculate the number of non-equivalent colors and 2699 * color equivalency mask 2700 */ 2701 plw->plw_ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf)); 2702 ASSERT(szc > 0 || plw->plw_ceq_dif >= vac_colors); 2703 ASSERT(plw->plw_ceq_dif > 0); 2704 plw->plw_ceq_mask[szc] = (plw->plw_ceq_dif - 1) << (ceq & 0xf); 2705 2706 if (flags & PG_MATCH_COLOR) { 2707 if (cpu_page_colors < 0) { 2708 /* 2709 * this is a heterogeneous machine with different CPUs 2710 * having different size e$ (not supported for ni2/rock 2711 */ 2712 uint_t cpucolors = CPUSETSIZE() >> PAGE_GET_SHIFT(szc); 2713 cpucolors = MAX(cpucolors, 1); 2714 ceq_mask = plw->plw_color_mask & (cpucolors - 1); 2715 plw->plw_ceq_mask[szc] = 2716 MIN(ceq_mask, plw->plw_ceq_mask[szc]); 2717 } 2718 plw->plw_ceq_dif = 1; 2719 } 2720 2721 /* we can split pages in the freelist, but not the cachelist */ 2722 if (can_split) { 2723 plw->plw_do_split = (szc + 1 < mmu_page_sizes) ? 1 : 0; 2724 2725 /* set next szc color masks and number of free list bins */ 2726 for (nszc = szc + 1; nszc < mmu_page_sizes; nszc++, szc++) { 2727 plw->plw_ceq_mask[nszc] = PAGE_GET_NSZ_MASK(szc, 2728 plw->plw_ceq_mask[szc]); 2729 plw->plw_bins[nszc] = PAGE_GET_PAGECOLORS(nszc); 2730 } 2731 plw->plw_ceq_mask[nszc] = INVALID_MASK; 2732 plw->plw_bins[nszc] = 0; 2733 2734 } else { 2735 ASSERT(szc == 0); 2736 plw->plw_do_split = 0; 2737 plw->plw_bins[1] = 0; 2738 plw->plw_ceq_mask[1] = INVALID_MASK; 2739 } 2740 } 2741 2742 /* 2743 * set mark to flag where next split should occur 2744 */ 2745 #define PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw) { \ 2746 uint_t bin_nsz = PAGE_GET_NSZ_COLOR(szc, bin); \ 2747 uint_t bin0_nsz = PAGE_GET_NSZ_COLOR(szc, plw->plw_bin0); \ 2748 uint_t neq_mask = ~plw->plw_ceq_mask[nszc] & plw->plw_color_mask; \ 2749 plw->plw_split_next = \ 2750 INC_MASKED(bin_nsz, neq_mask, plw->plw_color_mask); \ 2751 if (!((plw->plw_split_next ^ bin0_nsz) & plw->plw_ceq_mask[nszc])) { \ 2752 plw->plw_split_next = \ 2753 INC_MASKED(plw->plw_split_next, \ 2754 neq_mask, plw->plw_color_mask); \ 2755 } \ 2756 } 2757 2758 uint_t 2759 page_list_walk_next_bin(uchar_t szc, uint_t bin, page_list_walker_t *plw) 2760 { 2761 uint_t neq_mask = ~plw->plw_ceq_mask[szc] & plw->plw_color_mask; 2762 uint_t bin0_nsz, nbin_nsz, nbin0, nbin; 2763 uchar_t nszc = szc + 1; 2764 2765 nbin = ADD_MASKED(bin, 2766 plw->plw_bin_step, neq_mask, plw->plw_color_mask); 2767 2768 if (plw->plw_do_split) { 2769 plw->plw_bin_split_prev = bin; 2770 PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw); 2771 plw->plw_do_split = 0; 2772 } 2773 2774 if (szc == 0) { 2775 if (plw->plw_count != 0 || plw->plw_ceq_dif == vac_colors) { 2776 if (nbin == plw->plw_bin0 && 2777 (vac_colors == 1 || nbin != plw->plw_bin_marker)) { 2778 nbin = ADD_MASKED(nbin, plw->plw_bin_step, 2779 neq_mask, plw->plw_color_mask); 2780 plw->plw_bin_split_prev = plw->plw_bin0; 2781 } 2782 2783 if (vac_colors > 1 && nbin == plw->plw_bin_marker) { 2784 plw->plw_bin_marker = 2785 nbin = INC_MASKED(nbin, neq_mask, 2786 plw->plw_color_mask); 2787 
plw->plw_bin_split_prev = plw->plw_bin0; 2788 /* 2789 * large pages all have the same vac color 2790 * so by now we should be done with next 2791 * size page splitting process 2792 */ 2793 ASSERT(plw->plw_bins[1] == 0); 2794 plw->plw_do_split = 0; 2795 return (nbin); 2796 } 2797 2798 } else { 2799 uint_t bin_jump = (vac_colors == 1) ? 2800 (BIN_STEP & ~3) - (plw->plw_bin0 & 3) : BIN_STEP; 2801 2802 bin_jump &= ~(vac_colors - 1); 2803 2804 nbin0 = ADD_MASKED(plw->plw_bin0, bin_jump, neq_mask, 2805 plw->plw_color_mask); 2806 2807 if ((nbin0 ^ plw->plw_bin0) & plw->plw_ceq_mask[szc]) { 2808 2809 plw->plw_bin_marker = nbin = nbin0; 2810 2811 if (plw->plw_bins[nszc] != 0) { 2812 /* 2813 * check if next page size bin is the 2814 * same as the next page size bin for 2815 * bin0 2816 */ 2817 nbin_nsz = PAGE_GET_NSZ_COLOR(szc, 2818 nbin); 2819 bin0_nsz = PAGE_GET_NSZ_COLOR(szc, 2820 plw->plw_bin0); 2821 2822 if ((bin0_nsz ^ nbin_nsz) & 2823 plw->plw_ceq_mask[nszc]) 2824 plw->plw_do_split = 1; 2825 } 2826 return (nbin); 2827 } 2828 } 2829 } 2830 2831 if (plw->plw_bins[nszc] != 0) { 2832 nbin_nsz = PAGE_GET_NSZ_COLOR(szc, nbin); 2833 if (!((plw->plw_split_next ^ nbin_nsz) & 2834 plw->plw_ceq_mask[nszc])) 2835 plw->plw_do_split = 1; 2836 } 2837 2838 return (nbin); 2839 } 2840 2841 page_t * 2842 page_get_mnode_freelist(int mnode, uint_t bin, int mtype, uchar_t szc, 2843 uint_t flags) 2844 { 2845 kmutex_t *pcm; 2846 page_t *pp, *first_pp; 2847 uint_t sbin; 2848 int plw_initialized; 2849 page_list_walker_t plw; 2850 2851 ASSERT(szc < mmu_page_sizes); 2852 2853 VM_STAT_ADD(vmm_vmstats.pgmf_alloc[szc]); 2854 2855 MTYPE_START(mnode, mtype, flags); 2856 if (mtype < 0) { /* mnode does not have memory in mtype range */ 2857 VM_STAT_ADD(vmm_vmstats.pgmf_allocempty[szc]); 2858 return (NULL); 2859 } 2860 try_again: 2861 2862 plw_initialized = 0; 2863 plw.plw_ceq_dif = 1; 2864 2865 /* 2866 * Only hold one freelist lock at a time, that way we 2867 * can start anywhere and not have to worry about lock 2868 * ordering. 2869 */ 2870 for (plw.plw_count = 0; 2871 plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) { 2872 sbin = bin; 2873 do { 2874 if (!PAGE_FREELISTS(mnode, szc, bin, mtype)) 2875 goto bin_empty_1; 2876 2877 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST); 2878 mutex_enter(pcm); 2879 pp = PAGE_FREELISTS(mnode, szc, bin, mtype); 2880 if (pp == NULL) 2881 goto bin_empty_0; 2882 2883 /* 2884 * These were set before the page 2885 * was put on the free list, 2886 * they must still be set. 2887 */ 2888 ASSERT(PP_ISFREE(pp)); 2889 ASSERT(PP_ISAGED(pp)); 2890 ASSERT(pp->p_vnode == NULL); 2891 ASSERT(pp->p_hash == NULL); 2892 ASSERT(pp->p_offset == (u_offset_t)-1); 2893 ASSERT(pp->p_szc == szc); 2894 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode); 2895 2896 /* 2897 * Walk down the hash chain. 2898 * 8k pages are linked on p_next 2899 * and p_prev fields. Large pages 2900 * are a contiguous group of 2901 * constituent pages linked together 2902 * on their p_next and p_prev fields. 2903 * The large pages are linked together 2904 * on the hash chain using p_vpnext 2905 * p_vpprev of the base constituent 2906 * page of each large page. 
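 *
 * As an illustrative sketch (the four-page size is only an
 * example), a bin holding two free large pages A and B is
 * linked roughly like this:
 *
 *   A0 <-> A1 <-> A2 <-> A3   (p_next/p_prev of constituents)
 *   B0 <-> B1 <-> B2 <-> B3
 *   A0 <-> B0                 (p_vpnext/p_vpprev of the roots)
 *
 * so the retry loop below advances via p_vpnext from one root
 * page to the next when szc != 0, and via p_next when szc == 0.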
2907 */ 2908 first_pp = pp; 2909 while (!page_trylock_cons(pp, SE_EXCL)) { 2910 if (szc == 0) { 2911 pp = pp->p_next; 2912 } else { 2913 pp = pp->p_vpnext; 2914 } 2915 2916 ASSERT(PP_ISFREE(pp)); 2917 ASSERT(PP_ISAGED(pp)); 2918 ASSERT(pp->p_vnode == NULL); 2919 ASSERT(pp->p_hash == NULL); 2920 ASSERT(pp->p_offset == (u_offset_t)-1); 2921 ASSERT(pp->p_szc == szc); 2922 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode); 2923 2924 if (pp == first_pp) 2925 goto bin_empty_0; 2926 } 2927 2928 ASSERT(pp != NULL); 2929 ASSERT(mtype == PP_2_MTYPE(pp)); 2930 ASSERT(pp->p_szc == szc); 2931 if (szc == 0) { 2932 page_sub(&PAGE_FREELISTS(mnode, 2933 szc, bin, mtype), pp); 2934 } else { 2935 page_vpsub(&PAGE_FREELISTS(mnode, 2936 szc, bin, mtype), pp); 2937 CHK_LPG(pp, szc); 2938 } 2939 page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST); 2940 2941 if ((PP_ISFREE(pp) == 0) || (PP_ISAGED(pp) == 0)) 2942 panic("free page is not. pp %p", (void *)pp); 2943 mutex_exit(pcm); 2944 2945 #if defined(__sparc) 2946 ASSERT(!kcage_on || PP_ISNORELOC(pp) || 2947 (flags & PG_NORELOC) == 0); 2948 2949 if (PP_ISNORELOC(pp)) 2950 kcage_freemem_sub(page_get_pagecnt(szc)); 2951 #endif 2952 VM_STAT_ADD(vmm_vmstats.pgmf_allocok[szc]); 2953 return (pp); 2954 2955 bin_empty_0: 2956 mutex_exit(pcm); 2957 bin_empty_1: 2958 if (plw_initialized == 0) { 2959 page_list_walk_init(szc, flags, bin, 1, 1, 2960 &plw); 2961 plw_initialized = 1; 2962 ASSERT(plw.plw_colors <= 2963 PAGE_GET_PAGECOLORS(szc)); 2964 ASSERT(plw.plw_colors > 0); 2965 ASSERT((plw.plw_colors & 2966 (plw.plw_colors - 1)) == 0); 2967 ASSERT(bin < plw.plw_colors); 2968 ASSERT(plw.plw_ceq_mask[szc] < plw.plw_colors); 2969 } 2970 /* calculate the next bin with equivalent color */ 2971 bin = ADD_MASKED(bin, plw.plw_bin_step, 2972 plw.plw_ceq_mask[szc], plw.plw_color_mask); 2973 } while (sbin != bin); 2974 2975 /* 2976 * color bins are all empty if color match. Try and 2977 * satisfy the request by breaking up or coalescing 2978 * pages from a different size freelist of the correct 2979 * color that satisfies the ORIGINAL color requested. 2980 * If that fails then try pages of the same size but 2981 * different colors assuming we are not called with 2982 * PG_MATCH_COLOR. 2983 */ 2984 if (plw.plw_do_split && 2985 (pp = page_freelist_split(szc, bin, mnode, 2986 mtype, PFNNULL, &plw)) != NULL) 2987 return (pp); 2988 2989 if (szc > 0 && (pp = page_freelist_coalesce(mnode, szc, 2990 bin, plw.plw_ceq_mask[szc], mtype, PFNNULL)) != NULL) 2991 return (pp); 2992 2993 if (plw.plw_ceq_dif > 1) 2994 bin = page_list_walk_next_bin(szc, bin, &plw); 2995 } 2996 2997 /* if allowed, cycle through additional mtypes */ 2998 MTYPE_NEXT(mnode, mtype, flags); 2999 if (mtype >= 0) 3000 goto try_again; 3001 3002 VM_STAT_ADD(vmm_vmstats.pgmf_allocfailed[szc]); 3003 3004 return (NULL); 3005 } 3006 3007 /* 3008 * Returns the count of free pages for 'pp' with size code 'szc'. 3009 * Note: This function does not return an exact value as the page freelist 3010 * locks are not held and thus the values in the page_counters may be 3011 * changing as we walk through the data. 
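 *
 * In rough terms the accumulation works top-down: at region size r a
 * counter value c stands for c fully free regions of the next size
 * down, contributing c << PNUM_SHIFT(r - 1) pages.  Descending a
 * level, sub-regions whose counters read FULL_REGION_CNT() are
 * skipped, since the level above has already accounted for them, and
 * only the partially free sub-regions add to the total.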
3012 */ 3013 static int 3014 page_freecnt(int mnode, page_t *pp, uchar_t szc) 3015 { 3016 pgcnt_t pgfree; 3017 pgcnt_t cnt; 3018 ssize_t r = szc; /* region size */ 3019 ssize_t idx; 3020 int i; 3021 int full, range; 3022 3023 /* Make sure pagenum passed in is aligned properly */ 3024 ASSERT((pp->p_pagenum & (PNUM_SIZE(szc) - 1)) == 0); 3025 ASSERT(szc > 0); 3026 3027 /* Prevent page_counters dynamic memory from being freed */ 3028 rw_enter(&page_ctrs_rwlock[mnode], RW_READER); 3029 idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum); 3030 cnt = PAGE_COUNTERS(mnode, r, idx); 3031 pgfree = cnt << PNUM_SHIFT(r - 1); 3032 range = FULL_REGION_CNT(szc); 3033 3034 /* Check for completely full region */ 3035 if (cnt == range) { 3036 rw_exit(&page_ctrs_rwlock[mnode]); 3037 return (pgfree); 3038 } 3039 3040 while (--r > 0) { 3041 idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum); 3042 full = FULL_REGION_CNT(r); 3043 for (i = 0; i < range; i++, idx++) { 3044 cnt = PAGE_COUNTERS(mnode, r, idx); 3045 /* 3046 * If cnt here is full, that means we have already 3047 * accounted for these pages earlier. 3048 */ 3049 if (cnt != full) { 3050 pgfree += (cnt << PNUM_SHIFT(r - 1)); 3051 } 3052 } 3053 range *= full; 3054 } 3055 rw_exit(&page_ctrs_rwlock[mnode]); 3056 return (pgfree); 3057 } 3058 3059 /* 3060 * Called from page_geti_contig_pages to exclusively lock constituent pages 3061 * starting from 'spp' for page size code 'szc'. 3062 * 3063 * If 'ptcpthreshold' is set, the number of free pages needed in the 'szc' 3064 * region needs to be greater than or equal to the threshold. 3065 */ 3066 static int 3067 page_trylock_contig_pages(int mnode, page_t *spp, uchar_t szc, int flags) 3068 { 3069 pgcnt_t pgcnt = PNUM_SIZE(szc); 3070 pgcnt_t pgfree, i; 3071 page_t *pp; 3072 3073 VM_STAT_ADD(vmm_vmstats.ptcp[szc]); 3074 3075 3076 if ((ptcpthreshold == 0) || (flags & PGI_PGCPHIPRI)) 3077 goto skipptcpcheck; 3078 /* 3079 * check if there are sufficient free pages available before attempting 3080 * to trylock. Count is approximate as page counters can change. 3081 */ 3082 pgfree = page_freecnt(mnode, spp, szc); 3083 3084 /* attempt to trylock if there are sufficient already free pages */ 3085 if (pgfree < pgcnt/ptcpthreshold) { 3086 VM_STAT_ADD(vmm_vmstats.ptcpfreethresh[szc]); 3087 return (0); 3088 } 3089 3090 skipptcpcheck: 3091 3092 for (i = 0; i < pgcnt; i++) { 3093 pp = &spp[i]; 3094 if (!page_trylock(pp, SE_EXCL)) { 3095 VM_STAT_ADD(vmm_vmstats.ptcpfailexcl[szc]); 3096 while (--i != (pgcnt_t)-1) { 3097 pp = &spp[i]; 3098 ASSERT(PAGE_EXCL(pp)); 3099 page_unlock_nocapture(pp); 3100 } 3101 return (0); 3102 } 3103 ASSERT(spp[i].p_pagenum == spp->p_pagenum + i); 3104 if ((pp->p_szc > szc || (szc && pp->p_szc == szc)) && 3105 !PP_ISFREE(pp)) { 3106 VM_STAT_ADD(vmm_vmstats.ptcpfailszc[szc]); 3107 ASSERT(i == 0); 3108 page_unlock_nocapture(pp); 3109 return (0); 3110 } 3111 if (PP_ISNORELOC(pp)) { 3112 VM_STAT_ADD(vmm_vmstats.ptcpfailcage[szc]); 3113 while (i != (pgcnt_t)-1) { 3114 pp = &spp[i]; 3115 ASSERT(PAGE_EXCL(pp)); 3116 page_unlock_nocapture(pp); 3117 i--; 3118 } 3119 return (0); 3120 } 3121 } 3122 VM_STAT_ADD(vmm_vmstats.ptcpok[szc]); 3123 return (1); 3124 } 3125 3126 /* 3127 * Claim large page pointed to by 'pp'. 'pp' is the starting set 3128 * of 'szc' constituent pages that had been locked exclusively previously. 3129 * Will attempt to relocate constituent pages in use. 
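 *
 * In outline (a summary of the loop below, not additional behavior):
 * free PG_FREE_LIST constituents are pulled off with
 * page_list_sub_pages(), free cachelist constituents are hashed out
 * and marked aged, and constituents still in use are given a
 * replacement page and relocated; any failure unwinds the work done
 * so far and returns NULL.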
3130 */ 3131 static page_t * 3132 page_claim_contig_pages(page_t *pp, uchar_t szc, int flags) 3133 { 3134 spgcnt_t pgcnt, npgs, i; 3135 page_t *targpp, *rpp, *hpp; 3136 page_t *replpp = NULL; 3137 page_t *pplist = NULL; 3138 3139 ASSERT(pp != NULL); 3140 3141 pgcnt = page_get_pagecnt(szc); 3142 while (pgcnt) { 3143 ASSERT(PAGE_EXCL(pp)); 3144 ASSERT(!PP_ISNORELOC(pp)); 3145 if (PP_ISFREE(pp)) { 3146 /* 3147 * If this is a PG_FREE_LIST page then its 3148 * size code can change underneath us due to 3149 * page promotion or demotion. As an optimzation 3150 * use page_list_sub_pages() instead of 3151 * page_list_sub(). 3152 */ 3153 if (PP_ISAGED(pp)) { 3154 page_list_sub_pages(pp, szc); 3155 if (pp->p_szc == szc) { 3156 return (pp); 3157 } 3158 ASSERT(pp->p_szc < szc); 3159 npgs = page_get_pagecnt(pp->p_szc); 3160 hpp = pp; 3161 for (i = 0; i < npgs; i++, pp++) { 3162 pp->p_szc = szc; 3163 } 3164 page_list_concat(&pplist, &hpp); 3165 pgcnt -= npgs; 3166 continue; 3167 } 3168 ASSERT(!PP_ISAGED(pp)); 3169 ASSERT(pp->p_szc == 0); 3170 page_list_sub(pp, PG_CACHE_LIST); 3171 page_hashout(pp, NULL); 3172 PP_SETAGED(pp); 3173 pp->p_szc = szc; 3174 page_list_concat(&pplist, &pp); 3175 pp++; 3176 pgcnt--; 3177 continue; 3178 } 3179 npgs = page_get_pagecnt(pp->p_szc); 3180 3181 /* 3182 * page_create_wait freemem accounting done by caller of 3183 * page_get_freelist and not necessary to call it prior to 3184 * calling page_get_replacement_page. 3185 * 3186 * page_get_replacement_page can call page_get_contig_pages 3187 * to acquire a large page (szc > 0); the replacement must be 3188 * smaller than the contig page size to avoid looping or 3189 * szc == 0 and PGI_PGCPSZC0 is set. 3190 */ 3191 if (pp->p_szc < szc || (szc == 0 && (flags & PGI_PGCPSZC0))) { 3192 replpp = page_get_replacement_page(pp, NULL, 0); 3193 if (replpp) { 3194 npgs = page_get_pagecnt(pp->p_szc); 3195 ASSERT(npgs <= pgcnt); 3196 targpp = pp; 3197 } 3198 } 3199 3200 /* 3201 * If replacement is NULL or do_page_relocate fails, fail 3202 * coalescing of pages. 3203 */ 3204 if (replpp == NULL || (do_page_relocate(&targpp, &replpp, 0, 3205 &npgs, NULL) != 0)) { 3206 /* 3207 * Unlock un-processed target list 3208 */ 3209 while (pgcnt--) { 3210 ASSERT(PAGE_EXCL(pp)); 3211 page_unlock_nocapture(pp); 3212 pp++; 3213 } 3214 /* 3215 * Free the processed target list. 
3216 */ 3217 while (pplist) { 3218 pp = pplist; 3219 page_sub(&pplist, pp); 3220 ASSERT(PAGE_EXCL(pp)); 3221 ASSERT(pp->p_szc == szc); 3222 ASSERT(PP_ISFREE(pp)); 3223 ASSERT(PP_ISAGED(pp)); 3224 pp->p_szc = 0; 3225 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL); 3226 page_unlock_nocapture(pp); 3227 } 3228 3229 if (replpp != NULL) 3230 page_free_replacement_page(replpp); 3231 3232 return (NULL); 3233 } 3234 ASSERT(pp == targpp); 3235 3236 /* LINTED */ 3237 ASSERT(hpp = pp); /* That's right, it's an assignment */ 3238 3239 pp += npgs; 3240 pgcnt -= npgs; 3241 3242 while (npgs--) { 3243 ASSERT(PAGE_EXCL(targpp)); 3244 ASSERT(!PP_ISFREE(targpp)); 3245 ASSERT(!PP_ISNORELOC(targpp)); 3246 PP_SETFREE(targpp); 3247 ASSERT(PP_ISAGED(targpp)); 3248 ASSERT(targpp->p_szc < szc || (szc == 0 && 3249 (flags & PGI_PGCPSZC0))); 3250 targpp->p_szc = szc; 3251 targpp = targpp->p_next; 3252 3253 rpp = replpp; 3254 ASSERT(rpp != NULL); 3255 page_sub(&replpp, rpp); 3256 ASSERT(PAGE_EXCL(rpp)); 3257 ASSERT(!PP_ISFREE(rpp)); 3258 page_unlock_nocapture(rpp); 3259 } 3260 ASSERT(targpp == hpp); 3261 ASSERT(replpp == NULL); 3262 page_list_concat(&pplist, &targpp); 3263 } 3264 CHK_LPG(pplist, szc); 3265 return (pplist); 3266 } 3267 3268 /* 3269 * Trim kernel cage from pfnlo-pfnhi and store result in lo-hi. Return code 3270 * of 0 means nothing left after trim. 3271 */ 3272 int 3273 trimkcage(struct memseg *mseg, pfn_t *lo, pfn_t *hi, pfn_t pfnlo, pfn_t pfnhi) 3274 { 3275 pfn_t kcagepfn; 3276 int decr; 3277 int rc = 0; 3278 3279 if (PP_ISNORELOC(mseg->pages)) { 3280 if (PP_ISNORELOC(mseg->epages - 1) == 0) { 3281 3282 /* lower part of this mseg inside kernel cage */ 3283 decr = kcage_current_pfn(&kcagepfn); 3284 3285 /* kernel cage may have transitioned past mseg */ 3286 if (kcagepfn >= mseg->pages_base && 3287 kcagepfn < mseg->pages_end) { 3288 ASSERT(decr == 0); 3289 *lo = MAX(kcagepfn, pfnlo); 3290 *hi = MIN(pfnhi, (mseg->pages_end - 1)); 3291 rc = 1; 3292 } 3293 } 3294 /* else entire mseg in the cage */ 3295 } else { 3296 if (PP_ISNORELOC(mseg->epages - 1)) { 3297 3298 /* upper part of this mseg inside kernel cage */ 3299 decr = kcage_current_pfn(&kcagepfn); 3300 3301 /* kernel cage may have transitioned past mseg */ 3302 if (kcagepfn >= mseg->pages_base && 3303 kcagepfn < mseg->pages_end) { 3304 ASSERT(decr); 3305 *hi = MIN(kcagepfn, pfnhi); 3306 *lo = MAX(pfnlo, mseg->pages_base); 3307 rc = 1; 3308 } 3309 } else { 3310 /* entire mseg outside of kernel cage */ 3311 *lo = MAX(pfnlo, mseg->pages_base); 3312 *hi = MIN(pfnhi, (mseg->pages_end - 1)); 3313 rc = 1; 3314 } 3315 } 3316 return (rc); 3317 } 3318 3319 /* 3320 * called from page_get_contig_pages to search 'pfnlo' thru 'pfnhi' to claim a 3321 * page with size code 'szc'. Claiming such a page requires acquiring 3322 * exclusive locks on all constituent pages (page_trylock_contig_pages), 3323 * relocating pages in use and concatenating these constituent pages into a 3324 * large page. 3325 * 3326 * The page lists do not have such a large page and page_freelist_split has 3327 * already failed to demote larger pages and/or coalesce smaller free pages. 3328 * 3329 * 'flags' may specify PG_COLOR_MATCH which would limit the search of large 3330 * pages with the same color as 'bin'. 3331 * 3332 * 'pfnflag' specifies the subset of the pfn range to search. 
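 *
 * The scan starts at a randomly chosen, properly aligned pfn inside
 * each overlapping memseg and wraps around to the low end, so
 * successive searches are spread across the range rather than always
 * starting at the same place.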
3333 */ 3334 3335 static page_t * 3336 page_geti_contig_pages(int mnode, uint_t bin, uchar_t szc, int flags, 3337 pfn_t pfnlo, pfn_t pfnhi, pgcnt_t pfnflag) 3338 { 3339 struct memseg *mseg; 3340 pgcnt_t szcpgcnt = page_get_pagecnt(szc); 3341 pgcnt_t szcpgmask = szcpgcnt - 1; 3342 pfn_t randpfn; 3343 page_t *pp, *randpp, *endpp; 3344 uint_t colors, ceq_mask; 3345 /* LINTED : set but not used in function */ 3346 uint_t color_mask; 3347 pfn_t hi, lo; 3348 uint_t skip; 3349 MEM_NODE_ITERATOR_DECL(it); 3350 3351 ASSERT(szc != 0 || (flags & PGI_PGCPSZC0)); 3352 3353 pfnlo = P2ROUNDUP(pfnlo, szcpgcnt); 3354 3355 if ((pfnhi - pfnlo) + 1 < szcpgcnt || pfnlo >= pfnhi) 3356 return (NULL); 3357 3358 ASSERT(szc < mmu_page_sizes); 3359 3360 colors = PAGE_GET_PAGECOLORS(szc); 3361 color_mask = colors - 1; 3362 if ((colors > 1) && (flags & PG_MATCH_COLOR)) { 3363 uchar_t ceq = colorequivszc[szc]; 3364 uint_t ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf)); 3365 3366 ASSERT(ceq_dif > 0); 3367 ceq_mask = (ceq_dif - 1) << (ceq & 0xf); 3368 } else { 3369 ceq_mask = 0; 3370 } 3371 3372 ASSERT(bin < colors); 3373 3374 /* clear "non-significant" color bits */ 3375 bin &= ceq_mask; 3376 3377 /* 3378 * trim the pfn range to search based on pfnflag. pfnflag is set 3379 * when there have been previous page_get_contig_page failures to 3380 * limit the search. 3381 * 3382 * The high bit in pfnflag specifies the number of 'slots' in the 3383 * pfn range and the remainder of pfnflag specifies which slot. 3384 * For example, a value of 1010b would mean the second slot of 3385 * the pfn range that has been divided into 8 slots. 3386 */ 3387 if (pfnflag > 1) { 3388 int slots = 1 << (highbit(pfnflag) - 1); 3389 int slotid = pfnflag & (slots - 1); 3390 pgcnt_t szcpages; 3391 int slotlen; 3392 3393 pfnhi = P2ALIGN((pfnhi + 1), szcpgcnt) - 1; 3394 szcpages = ((pfnhi - pfnlo) + 1) / szcpgcnt; 3395 slotlen = howmany(szcpages, slots); 3396 /* skip if 'slotid' slot is empty */ 3397 if (slotid * slotlen >= szcpages) 3398 return (NULL); 3399 pfnlo = pfnlo + (((slotid * slotlen) % szcpages) * szcpgcnt); 3400 ASSERT(pfnlo < pfnhi); 3401 if (pfnhi > pfnlo + (slotlen * szcpgcnt)) 3402 pfnhi = pfnlo + (slotlen * szcpgcnt) - 1; 3403 } 3404 3405 memsegs_lock(0); 3406 3407 /* 3408 * loop through memsegs to look for contig page candidates 3409 */ 3410 3411 for (mseg = memsegs; mseg != NULL; mseg = mseg->next) { 3412 if (pfnhi < mseg->pages_base || pfnlo >= mseg->pages_end) { 3413 /* no overlap */ 3414 continue; 3415 } 3416 3417 if (mseg->pages_end - mseg->pages_base < szcpgcnt) 3418 /* mseg too small */ 3419 continue; 3420 3421 /* 3422 * trim off kernel cage pages from pfn range and check for 3423 * a trimmed pfn range returned that does not span the 3424 * desired large page size. 3425 */ 3426 if (kcage_on) { 3427 if (trimkcage(mseg, &lo, &hi, pfnlo, pfnhi) == 0 || 3428 lo >= hi || ((hi - lo) + 1) < szcpgcnt) 3429 continue; 3430 } else { 3431 lo = MAX(pfnlo, mseg->pages_base); 3432 hi = MIN(pfnhi, (mseg->pages_end - 1)); 3433 } 3434 3435 /* round to szcpgcnt boundaries */ 3436 lo = P2ROUNDUP(lo, szcpgcnt); 3437 3438 MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it); 3439 hi = P2ALIGN((hi + 1), szcpgcnt) - 1; 3440 3441 if (hi <= lo) 3442 continue; 3443 3444 /* 3445 * set lo to point to the pfn for the desired bin. 
Large 3446 * page sizes may only have a single page color 3447 */ 3448 skip = szcpgcnt; 3449 if (ceq_mask > 0 || interleaved_mnodes) { 3450 /* set lo to point at appropriate color */ 3451 if (((PFN_2_COLOR(lo, szc, &it) ^ bin) & ceq_mask) || 3452 (interleaved_mnodes && 3453 PFN_2_MEM_NODE(lo) != mnode)) { 3454 PAGE_NEXT_PFN_FOR_COLOR(lo, szc, bin, ceq_mask, 3455 color_mask, &it); 3456 } 3457 if (hi <= lo) 3458 /* mseg cannot satisfy color request */ 3459 continue; 3460 } 3461 3462 /* randomly choose a point between lo and hi to begin search */ 3463 3464 randpfn = (pfn_t)GETTICK(); 3465 randpfn = ((randpfn % (hi - lo)) + lo) & ~(skip - 1); 3466 MEM_NODE_ITERATOR_INIT(randpfn, mnode, szc, &it); 3467 if (ceq_mask || interleaved_mnodes || randpfn == (pfn_t)-1) { 3468 if (randpfn != (pfn_t)-1) { 3469 PAGE_NEXT_PFN_FOR_COLOR(randpfn, szc, bin, 3470 ceq_mask, color_mask, &it); 3471 } 3472 if (randpfn >= hi) { 3473 randpfn = lo; 3474 MEM_NODE_ITERATOR_INIT(randpfn, mnode, szc, 3475 &it); 3476 } 3477 } 3478 randpp = mseg->pages + (randpfn - mseg->pages_base); 3479 3480 ASSERT(randpp->p_pagenum == randpfn); 3481 3482 pp = randpp; 3483 endpp = mseg->pages + (hi - mseg->pages_base) + 1; 3484 3485 ASSERT(randpp + szcpgcnt <= endpp); 3486 3487 do { 3488 ASSERT(!(pp->p_pagenum & szcpgmask)); 3489 ASSERT(((PP_2_BIN(pp) ^ bin) & ceq_mask) == 0); 3490 3491 if (page_trylock_contig_pages(mnode, pp, szc, flags)) { 3492 /* pages unlocked by page_claim on failure */ 3493 if (page_claim_contig_pages(pp, szc, flags)) { 3494 memsegs_unlock(0); 3495 return (pp); 3496 } 3497 } 3498 3499 if (ceq_mask == 0 && !interleaved_mnodes) { 3500 pp += skip; 3501 } else { 3502 pfn_t pfn = pp->p_pagenum; 3503 3504 PAGE_NEXT_PFN_FOR_COLOR(pfn, szc, bin, 3505 ceq_mask, color_mask, &it); 3506 if (pfn == (pfn_t)-1) { 3507 pp = endpp; 3508 } else { 3509 pp = mseg->pages + 3510 (pfn - mseg->pages_base); 3511 } 3512 } 3513 if (pp >= endpp) { 3514 /* start from the beginning */ 3515 MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it); 3516 pp = mseg->pages + (lo - mseg->pages_base); 3517 ASSERT(pp->p_pagenum == lo); 3518 ASSERT(pp + szcpgcnt <= endpp); 3519 } 3520 } while (pp != randpp); 3521 } 3522 memsegs_unlock(0); 3523 return (NULL); 3524 } 3525 3526 3527 /* 3528 * controlling routine that searches through physical memory in an attempt to 3529 * claim a large page based on the input parameters. 3530 * on the page free lists. 3531 * 3532 * calls page_geti_contig_pages with an initial pfn range from the mnode 3533 * and mtype. page_geti_contig_pages will trim off the parts of the pfn range 3534 * that overlaps with the kernel cage or does not match the requested page 3535 * color if PG_MATCH_COLOR is set. Since this search is very expensive, 3536 * page_geti_contig_pages may further limit the search range based on 3537 * previous failure counts (pgcpfailcnt[]). 3538 * 3539 * for PGI_PGCPSZC0 requests, page_get_contig_pages will relocate a base 3540 * pagesize page that satisfies mtype. 
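 *
 * As a worked example of the search limiting (the numbers are
 * hypothetical): if pgcpfailcnt[szc] is 8, the pfnflag passed down
 * confines page_geti_contig_pages to one of 8 roughly equal slices of
 * the pfn range (slots = 1 << (highbit(8) - 1) = 8, slotid = 8 & 7 = 0);
 * on success the count is halved to 4, so the next call searches a
 * slice twice as large.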
3541 */ 3542 page_t * 3543 page_get_contig_pages(int mnode, uint_t bin, int mtype, uchar_t szc, 3544 uint_t flags) 3545 { 3546 pfn_t pfnlo, pfnhi; /* contig pages pfn range */ 3547 page_t *pp; 3548 pgcnt_t pfnflag = 0; /* no limit on search if 0 */ 3549 3550 VM_STAT_ADD(vmm_vmstats.pgcp_alloc[szc]); 3551 3552 /* no allocations from cage */ 3553 flags |= PGI_NOCAGE; 3554 3555 /* LINTED */ 3556 MTYPE_START(mnode, mtype, flags); 3557 if (mtype < 0) { /* mnode does not have memory in mtype range */ 3558 VM_STAT_ADD(vmm_vmstats.pgcp_allocempty[szc]); 3559 return (NULL); 3560 } 3561 3562 ASSERT(szc > 0 || (flags & PGI_PGCPSZC0)); 3563 3564 /* do not limit search and ignore color if hi pri */ 3565 3566 if (pgcplimitsearch && ((flags & PGI_PGCPHIPRI) == 0)) 3567 pfnflag = pgcpfailcnt[szc]; 3568 3569 /* remove color match to improve chances */ 3570 3571 if (flags & PGI_PGCPHIPRI || pfnflag) 3572 flags &= ~PG_MATCH_COLOR; 3573 3574 do { 3575 /* get pfn range based on mnode and mtype */ 3576 MNODETYPE_2_PFN(mnode, mtype, pfnlo, pfnhi); 3577 3578 ASSERT(pfnhi >= pfnlo); 3579 3580 pp = page_geti_contig_pages(mnode, bin, szc, flags, 3581 pfnlo, pfnhi, pfnflag); 3582 3583 if (pp != NULL) { 3584 pfnflag = pgcpfailcnt[szc]; 3585 if (pfnflag) { 3586 /* double the search size */ 3587 pgcpfailcnt[szc] = pfnflag >> 1; 3588 } 3589 VM_STAT_ADD(vmm_vmstats.pgcp_allocok[szc]); 3590 return (pp); 3591 } 3592 MTYPE_NEXT(mnode, mtype, flags); 3593 } while (mtype >= 0); 3594 3595 VM_STAT_ADD(vmm_vmstats.pgcp_allocfailed[szc]); 3596 return (NULL); 3597 } 3598 3599 #if defined(__i386) || defined(__amd64) 3600 /* 3601 * Determine the likelihood of finding/coalescing a szc page. 3602 * Return 0 if the likelihood is small otherwise return 1. 3603 * 3604 * For now, be conservative and check only 1g pages and return 0 3605 * if there had been previous coalescing failures and the szc pages 3606 * needed to satisfy request would exhaust most of freemem. 3607 */ 3608 int 3609 page_chk_freelist(uint_t szc) 3610 { 3611 pgcnt_t pgcnt; 3612 3613 if (szc <= 1) 3614 return (1); 3615 3616 pgcnt = page_get_pagecnt(szc); 3617 if (pgcpfailcnt[szc] && pgcnt + throttlefree >= freemem) { 3618 VM_STAT_ADD(vmm_vmstats.pcf_deny[szc]); 3619 return (0); 3620 } 3621 VM_STAT_ADD(vmm_vmstats.pcf_allow[szc]); 3622 return (1); 3623 } 3624 #endif 3625 3626 /* 3627 * Find the `best' page on the freelist for this (vp,off) (as,vaddr) pair. 3628 * 3629 * Does its own locking and accounting. 3630 * If PG_MATCH_COLOR is set, then NULL will be returned if there are no 3631 * pages of the proper color even if there are pages of a different color. 3632 * 3633 * Finds a page, removes it, THEN locks it. 3634 */ 3635 3636 /*ARGSUSED*/ 3637 page_t * 3638 page_get_freelist(struct vnode *vp, u_offset_t off, struct seg *seg, 3639 caddr_t vaddr, size_t size, uint_t flags, struct lgrp *lgrp) 3640 { 3641 struct as *as = seg->s_as; 3642 page_t *pp = NULL; 3643 ulong_t bin; 3644 uchar_t szc; 3645 int mnode; 3646 int mtype; 3647 page_t *(*page_get_func)(int, uint_t, int, uchar_t, uint_t); 3648 lgrp_mnode_cookie_t lgrp_cookie; 3649 3650 page_get_func = page_get_mnode_freelist; 3651 3652 /* 3653 * If we aren't passed a specific lgroup, or passed a freed lgrp 3654 * assume we wish to allocate near to the current thread's home. 
3655 */ 3656 if (!LGRP_EXISTS(lgrp)) 3657 lgrp = lgrp_home_lgrp(); 3658 3659 if (kcage_on) { 3660 if ((flags & (PG_NORELOC | PG_PANIC)) == PG_NORELOC && 3661 kcage_freemem < kcage_throttlefree + btop(size) && 3662 curthread != kcage_cageout_thread) { 3663 /* 3664 * Set a "reserve" of kcage_throttlefree pages for 3665 * PG_PANIC and cageout thread allocations. 3666 * 3667 * Everybody else has to serialize in 3668 * page_create_get_something() to get a cage page, so 3669 * that we don't deadlock cageout! 3670 */ 3671 return (NULL); 3672 } 3673 } else { 3674 flags &= ~PG_NORELOC; 3675 flags |= PGI_NOCAGE; 3676 } 3677 3678 /* LINTED */ 3679 MTYPE_INIT(mtype, vp, vaddr, flags, size); 3680 3681 /* 3682 * Convert size to page size code. 3683 */ 3684 if ((szc = page_szc(size)) == (uchar_t)-1) 3685 panic("page_get_freelist: illegal page size request"); 3686 ASSERT(szc < mmu_page_sizes); 3687 3688 VM_STAT_ADD(vmm_vmstats.pgf_alloc[szc]); 3689 3690 /* LINTED */ 3691 AS_2_BIN(as, seg, vp, vaddr, bin, szc); 3692 3693 ASSERT(bin < PAGE_GET_PAGECOLORS(szc)); 3694 3695 /* 3696 * Try to get a local page first, but try remote if we can't 3697 * get a page of the right color. 3698 */ 3699 pgretry: 3700 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL); 3701 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { 3702 pp = page_get_func(mnode, bin, mtype, szc, flags); 3703 if (pp != NULL) { 3704 VM_STAT_ADD(vmm_vmstats.pgf_allocok[szc]); 3705 DTRACE_PROBE4(page__get, 3706 lgrp_t *, lgrp, 3707 int, mnode, 3708 ulong_t, bin, 3709 uint_t, flags); 3710 return (pp); 3711 } 3712 } 3713 ASSERT(pp == NULL); 3714 3715 /* 3716 * for non-SZC0 PAGESIZE requests, check cachelist before checking 3717 * remote free lists. Caller expected to call page_get_cachelist which 3718 * will check local cache lists and remote free lists. 3719 */ 3720 if (szc == 0 && ((flags & PGI_PGCPSZC0) == 0)) { 3721 VM_STAT_ADD(vmm_vmstats.pgf_allocdeferred); 3722 return (NULL); 3723 } 3724 3725 ASSERT(szc > 0 || (flags & PGI_PGCPSZC0)); 3726 3727 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1); 3728 3729 if (!(flags & PG_LOCAL)) { 3730 /* 3731 * Try to get a non-local freelist page. 3732 */ 3733 LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie); 3734 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { 3735 pp = page_get_func(mnode, bin, mtype, szc, flags); 3736 if (pp != NULL) { 3737 DTRACE_PROBE4(page__get, 3738 lgrp_t *, lgrp, 3739 int, mnode, 3740 ulong_t, bin, 3741 uint_t, flags); 3742 VM_STAT_ADD(vmm_vmstats.pgf_allocokrem[szc]); 3743 return (pp); 3744 } 3745 } 3746 ASSERT(pp == NULL); 3747 } 3748 3749 /* 3750 * when the cage is off chances are page_get_contig_pages() will fail 3751 * to lock a large page chunk therefore when the cage is off it's not 3752 * called by default. this can be changed via /etc/system. 3753 * 3754 * page_get_contig_pages() also called to acquire a base pagesize page 3755 * for page_create_get_something(). 3756 */ 3757 if (!(flags & PG_NORELOC) && (pg_contig_disable == 0) && 3758 (kcage_on || pg_lpgcreate_nocage || szc == 0) && 3759 (page_get_func != page_get_contig_pages)) { 3760 3761 VM_STAT_ADD(vmm_vmstats.pgf_allocretry[szc]); 3762 page_get_func = page_get_contig_pages; 3763 goto pgretry; 3764 } 3765 3766 if (!(flags & PG_LOCAL) && pgcplimitsearch && 3767 page_get_func == page_get_contig_pages) 3768 SETPGCPFAILCNT(szc); 3769 3770 VM_STAT_ADD(vmm_vmstats.pgf_allocfailed[szc]); 3771 return (NULL); 3772 } 3773 3774 /* 3775 * Find the `best' page on the cachelist for this (vp,off) (as,vaddr) pair. 
3776 * 3777 * Does its own locking. 3778 * If PG_MATCH_COLOR is set, then NULL will be returned if there are no 3779 * pages of the proper color even if there are pages of a different color. 3780 * Otherwise, scan the bins for ones with pages. For each bin with pages, 3781 * try to lock one of them. If no page can be locked, try the 3782 * next bin. Return NULL if a page can not be found and locked. 3783 * 3784 * Finds a pages, trys to lock it, then removes it. 3785 */ 3786 3787 /*ARGSUSED*/ 3788 page_t * 3789 page_get_cachelist(struct vnode *vp, u_offset_t off, struct seg *seg, 3790 caddr_t vaddr, uint_t flags, struct lgrp *lgrp) 3791 { 3792 page_t *pp; 3793 struct as *as = seg->s_as; 3794 ulong_t bin; 3795 /*LINTED*/ 3796 int mnode; 3797 int mtype; 3798 lgrp_mnode_cookie_t lgrp_cookie; 3799 3800 /* 3801 * If we aren't passed a specific lgroup, or pasased a freed lgrp 3802 * assume we wish to allocate near to the current thread's home. 3803 */ 3804 if (!LGRP_EXISTS(lgrp)) 3805 lgrp = lgrp_home_lgrp(); 3806 3807 if (!kcage_on) { 3808 flags &= ~PG_NORELOC; 3809 flags |= PGI_NOCAGE; 3810 } 3811 3812 if ((flags & (PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == PG_NORELOC && 3813 kcage_freemem <= kcage_throttlefree) { 3814 /* 3815 * Reserve kcage_throttlefree pages for critical kernel 3816 * threads. 3817 * 3818 * Everybody else has to go to page_create_get_something() 3819 * to get a cage page, so we don't deadlock cageout. 3820 */ 3821 return (NULL); 3822 } 3823 3824 /* LINTED */ 3825 AS_2_BIN(as, seg, vp, vaddr, bin, 0); 3826 3827 ASSERT(bin < PAGE_GET_PAGECOLORS(0)); 3828 3829 /* LINTED */ 3830 MTYPE_INIT(mtype, vp, vaddr, flags, MMU_PAGESIZE); 3831 3832 VM_STAT_ADD(vmm_vmstats.pgc_alloc); 3833 3834 /* 3835 * Try local cachelists first 3836 */ 3837 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL); 3838 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { 3839 pp = page_get_mnode_cachelist(bin, flags, mnode, mtype); 3840 if (pp != NULL) { 3841 VM_STAT_ADD(vmm_vmstats.pgc_allocok); 3842 DTRACE_PROBE4(page__get, 3843 lgrp_t *, lgrp, 3844 int, mnode, 3845 ulong_t, bin, 3846 uint_t, flags); 3847 return (pp); 3848 } 3849 } 3850 3851 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1); 3852 3853 /* 3854 * Try freelists/cachelists that are farther away 3855 * This is our only chance to allocate remote pages for PAGESIZE 3856 * requests. 
3857 */ 3858 LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie); 3859 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { 3860 pp = page_get_mnode_freelist(mnode, bin, mtype, 3861 0, flags); 3862 if (pp != NULL) { 3863 VM_STAT_ADD(vmm_vmstats.pgc_allocokdeferred); 3864 DTRACE_PROBE4(page__get, 3865 lgrp_t *, lgrp, 3866 int, mnode, 3867 ulong_t, bin, 3868 uint_t, flags); 3869 return (pp); 3870 } 3871 pp = page_get_mnode_cachelist(bin, flags, mnode, mtype); 3872 if (pp != NULL) { 3873 VM_STAT_ADD(vmm_vmstats.pgc_allocokrem); 3874 DTRACE_PROBE4(page__get, 3875 lgrp_t *, lgrp, 3876 int, mnode, 3877 ulong_t, bin, 3878 uint_t, flags); 3879 return (pp); 3880 } 3881 } 3882 3883 VM_STAT_ADD(vmm_vmstats.pgc_allocfailed); 3884 return (NULL); 3885 } 3886 3887 page_t * 3888 page_get_mnode_cachelist(uint_t bin, uint_t flags, int mnode, int mtype) 3889 { 3890 kmutex_t *pcm; 3891 page_t *pp, *first_pp; 3892 uint_t sbin; 3893 int plw_initialized; 3894 page_list_walker_t plw; 3895 3896 VM_STAT_ADD(vmm_vmstats.pgmc_alloc); 3897 3898 /* LINTED */ 3899 MTYPE_START(mnode, mtype, flags); 3900 if (mtype < 0) { /* mnode does not have memory in mtype range */ 3901 VM_STAT_ADD(vmm_vmstats.pgmc_allocempty); 3902 return (NULL); 3903 } 3904 3905 try_again: 3906 3907 plw_initialized = 0; 3908 plw.plw_ceq_dif = 1; 3909 3910 /* 3911 * Only hold one cachelist lock at a time, that way we 3912 * can start anywhere and not have to worry about lock 3913 * ordering. 3914 */ 3915 3916 for (plw.plw_count = 0; 3917 plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) { 3918 sbin = bin; 3919 do { 3920 3921 if (!PAGE_CACHELISTS(mnode, bin, mtype)) 3922 goto bin_empty_1; 3923 pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST); 3924 mutex_enter(pcm); 3925 pp = PAGE_CACHELISTS(mnode, bin, mtype); 3926 if (pp == NULL) 3927 goto bin_empty_0; 3928 3929 first_pp = pp; 3930 ASSERT(pp->p_vnode); 3931 ASSERT(PP_ISAGED(pp) == 0); 3932 ASSERT(pp->p_szc == 0); 3933 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode); 3934 while (!page_trylock(pp, SE_EXCL)) { 3935 pp = pp->p_next; 3936 ASSERT(pp->p_szc == 0); 3937 if (pp == first_pp) { 3938 /* 3939 * We have searched the complete list! 3940 * And all of them (might only be one) 3941 * are locked. This can happen since 3942 * these pages can also be found via 3943 * the hash list. When found via the 3944 * hash list, they are locked first, 3945 * then removed. We give up to let the 3946 * other thread run. 3947 */ 3948 pp = NULL; 3949 break; 3950 } 3951 ASSERT(pp->p_vnode); 3952 ASSERT(PP_ISFREE(pp)); 3953 ASSERT(PP_ISAGED(pp) == 0); 3954 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == 3955 mnode); 3956 } 3957 3958 if (pp) { 3959 page_t **ppp; 3960 /* 3961 * Found and locked a page. 3962 * Pull it off the list. 3963 */ 3964 ASSERT(mtype == PP_2_MTYPE(pp)); 3965 ppp = &PAGE_CACHELISTS(mnode, bin, mtype); 3966 page_sub(ppp, pp); 3967 /* 3968 * Subtract counters before releasing pcm mutex 3969 * to avoid a race with page_freelist_coalesce 3970 * and page_freelist_split. 3971 */ 3972 page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST); 3973 mutex_exit(pcm); 3974 ASSERT(pp->p_vnode); 3975 ASSERT(PP_ISAGED(pp) == 0); 3976 #if defined(__sparc) 3977 ASSERT(!kcage_on || 3978 (flags & PG_NORELOC) == 0 || 3979 PP_ISNORELOC(pp)); 3980 if (PP_ISNORELOC(pp)) { 3981 kcage_freemem_sub(1); 3982 } 3983 #endif 3984 VM_STAT_ADD(vmm_vmstats. 

#ifdef DEBUG
#define	REPL_PAGE_STATS
#endif /* DEBUG */

#ifdef REPL_PAGE_STATS
struct repl_page_stats {
	uint_t	ngets;
	uint_t	ngets_noreloc;
	uint_t	npgr_noreloc;
	uint_t	nnopage_first;
	uint_t	nnopage;
	uint_t	nhashout;
	uint_t	nnofree;
	uint_t	nnext_pp;
} repl_page_stats;
#define	REPL_STAT_INCR(v)	atomic_add_32(&repl_page_stats.v, 1)
#else /* REPL_PAGE_STATS */
#define	REPL_STAT_INCR(v)
#endif /* REPL_PAGE_STATS */

int	pgrppgcp;

/*
 * The freemem accounting must be done by the caller.
 * First we try to get a replacement page of the same size as like_pp;
 * if that is not possible, then we just get a set of discontiguous
 * PAGESIZE pages.
 */
page_t *
page_get_replacement_page(page_t *orig_like_pp, struct lgrp *lgrp_target,
    uint_t pgrflags)
{
	page_t		*like_pp;
	page_t		*pp, *pplist;
	page_t		*pl = NULL;
	ulong_t		bin;
	int		mnode, page_mnode;
	int		szc;
	spgcnt_t	npgs, pg_cnt;
	pfn_t		pfnum;
	int		mtype;
	int		flags = 0;
	lgrp_mnode_cookie_t	lgrp_cookie;
	lgrp_t		*lgrp;

	REPL_STAT_INCR(ngets);
	like_pp = orig_like_pp;
	ASSERT(PAGE_EXCL(like_pp));

	szc = like_pp->p_szc;
	npgs = page_get_pagecnt(szc);
	/*
	 * Now we reset like_pp to the base page_t.
	 * That way, we won't walk past the end of this 'szc' page.
	 */
	pfnum = PFN_BASE(like_pp->p_pagenum, szc);
	like_pp = page_numtopp_nolock(pfnum);
	ASSERT(like_pp->p_szc == szc);

	if (PP_ISNORELOC(like_pp)) {
		ASSERT(kcage_on);
		REPL_STAT_INCR(ngets_noreloc);
		flags = PGI_RELOCONLY;
	} else if (pgrflags & PGR_NORELOC) {
		ASSERT(kcage_on);
		REPL_STAT_INCR(npgr_noreloc);
		flags = PG_NORELOC;
	}

	/*
	 * Kernel pages must always be replaced with the same size
	 * pages, since we cannot properly handle demotion of kernel
	 * pages.
	 */
	if (PP_ISKAS(like_pp))
		pgrflags |= PGR_SAMESZC;

	/* LINTED */
	MTYPE_PGR_INIT(mtype, flags, like_pp, page_mnode, npgs);

	while (npgs) {
		pplist = NULL;
		for (;;) {
			pg_cnt = page_get_pagecnt(szc);
			bin = PP_2_BIN(like_pp);
			ASSERT(like_pp->p_szc == orig_like_pp->p_szc);
			ASSERT(pg_cnt <= npgs);

			/*
			 * If an lgroup was specified, try to get the
			 * page from that lgroup.
			 * NOTE: Must be careful with code below because
			 * lgroup may disappear and reappear since there
			 * is no locking for lgroup here.
			 */
			if (LGRP_EXISTS(lgrp_target)) {
				/*
				 * Keep local variable for lgroup separate
				 * from lgroup argument since this code should
				 * only be exercised when lgroup argument
				 * exists....
				 */
				lgrp = lgrp_target;

				/* Try the lgroup's freelists first */
				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
				    LGRP_SRCH_LOCAL);
				while ((pplist == NULL) &&
				    (mnode = lgrp_memnode_choose(&lgrp_cookie))
				    != -1) {
					pplist =
					    page_get_mnode_freelist(mnode, bin,
					    mtype, szc, flags);
				}

				/*
				 * Now try its cachelists if this is a
				 * small page. Don't need to do it for
				 * larger ones since page_freelist_coalesce()
				 * already failed.
				 */
				if (pplist != NULL || szc != 0)
					break;

				/* Now try its cachelists */
				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
				    LGRP_SRCH_LOCAL);

				while ((pplist == NULL) &&
				    (mnode = lgrp_memnode_choose(&lgrp_cookie))
				    != -1) {
					pplist =
					    page_get_mnode_cachelist(bin, flags,
					    mnode, mtype);
				}
				if (pplist != NULL) {
					page_hashout(pplist, NULL);
					PP_SETAGED(pplist);
					REPL_STAT_INCR(nhashout);
					break;
				}
				/* Done looking in this lgroup. Bail out. */
				break;
			}

			/*
			 * No lgroup was specified (or lgroup was removed by
			 * DR), so just try to get the page as close to
			 * like_pp's mnode as possible.
			 * First try the local freelist...
			 */
			mnode = PP_2_MEM_NODE(like_pp);
			pplist = page_get_mnode_freelist(mnode, bin,
			    mtype, szc, flags);
			if (pplist != NULL)
				break;

			REPL_STAT_INCR(nnofree);

			/*
			 * ...then the local cachelist. Don't need to do it for
			 * larger pages because page_freelist_coalesce()
			 * already failed there anyway.
			 */
			if (szc == 0) {
				pplist = page_get_mnode_cachelist(bin, flags,
				    mnode, mtype);
				if (pplist != NULL) {
					page_hashout(pplist, NULL);
					PP_SETAGED(pplist);
					REPL_STAT_INCR(nhashout);
					break;
				}
			}

			/* Now try remote freelists */
			page_mnode = mnode;
			lgrp =
			    lgrp_hand_to_lgrp(MEM_NODE_2_LGRPHAND(page_mnode));
			LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
			    LGRP_SRCH_HIER);
			while (pplist == NULL &&
			    (mnode = lgrp_memnode_choose(&lgrp_cookie))
			    != -1) {
				/*
				 * Skip local mnode.
				 */
				if ((mnode == page_mnode) ||
				    (mem_node_config[mnode].exists == 0))
					continue;

				pplist = page_get_mnode_freelist(mnode,
				    bin, mtype, szc, flags);
			}

			if (pplist != NULL)
				break;

			/* Now try remote cachelists */
			LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
			    LGRP_SRCH_HIER);
			while (pplist == NULL && szc == 0) {
				mnode = lgrp_memnode_choose(&lgrp_cookie);
				if (mnode == -1)
					break;
				/*
				 * Skip local mnode.
				 */
				if ((mnode == page_mnode) ||
				    (mem_node_config[mnode].exists == 0))
					continue;

				pplist = page_get_mnode_cachelist(bin,
				    flags, mnode, mtype);

				if (pplist != NULL) {
					page_hashout(pplist, NULL);
					PP_SETAGED(pplist);
					REPL_STAT_INCR(nhashout);
					break;
				}
			}

			/*
			 * Break out of while loop under the following cases:
			 * - If we successfully got a page.
			 * - If pgrflags specified only returning a specific
			 *   page size and we could not find that page size.
			 * - If we could not satisfy the request with PAGESIZE
			 *   or larger pages.
			 */
			if (pplist != NULL || szc == 0)
				break;

			if ((pgrflags & PGR_SAMESZC) || pgrppgcp) {
				/* try to find contig page */

				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
				    LGRP_SRCH_HIER);

				while ((pplist == NULL) &&
				    (mnode =
				    lgrp_memnode_choose(&lgrp_cookie))
				    != -1) {
					pplist = page_get_contig_pages(
					    mnode, bin, mtype, szc,
					    flags | PGI_PGCPHIPRI);
				}
				break;
			}

			/*
			 * The correct thing to do here is try the next
			 * page size down using szc--. Due to a bug
			 * with the processing of HAT_RELOAD_SHARE
			 * where the sfmmu_ttecnt arrays of all
			 * hats sharing an ISM segment don't get updated,
			 * using intermediate size pages for relocation
			 * can lead to continuous page faults.
			 */
			szc = 0;
		}

		if (pplist != NULL) {
			DTRACE_PROBE4(page__get,
			    lgrp_t *, lgrp,
			    int, mnode,
			    ulong_t, bin,
			    uint_t, flags);

			while (pplist != NULL && pg_cnt--) {
				ASSERT(pplist != NULL);
				pp = pplist;
				page_sub(&pplist, pp);
				PP_CLRFREE(pp);
				PP_CLRAGED(pp);
				page_list_concat(&pl, &pp);
				npgs--;
				like_pp = like_pp + 1;
				REPL_STAT_INCR(nnext_pp);
			}
			ASSERT(pg_cnt == 0);
		} else {
			break;
		}
	}

	if (npgs) {
		/*
		 * We were unable to allocate the necessary number
		 * of pages.
		 * We need to free up any pl.
		 */
		REPL_STAT_INCR(nnopage);
		page_free_replacement_page(pl);
		return (NULL);
	} else {
		return (pl);
	}
}
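
/*
 * Illustrative sketch (not part of the original code, never compiled): the
 * minimal calling pattern for page_get_replacement_page().  example_replace()
 * is hypothetical; real consumers (page relocation, DR, cage expansion) do
 * considerably more bookkeeping with the returned list.
 */
#if 0
static int
example_replace(page_t *target, lgrp_t *preferred_lgrp)
{
	page_t *repl;

	/* target must be held SE_EXCL before asking for replacements */
	ASSERT(PAGE_EXCL(target));

	/*
	 * Ask for replacement pages near preferred_lgrp.  On success the
	 * returned list covers the same number of PAGESIZE pages as the
	 * 'szc' page containing target, either as one same-size page or as
	 * discontiguous PAGESIZE pages.
	 */
	repl = page_get_replacement_page(target, preferred_lgrp, 0);
	if (repl == NULL)
		return (-1);	/* nothing to free; the routine cleaned up */

	/* ... hand 'target' and 'repl' to the relocation code here ... */

	/*
	 * If the relocation is abandoned, the replacement list must be
	 * given back explicitly.
	 */
	page_free_replacement_page(repl);
	return (0);
}
#endif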

/*
 * Demote a free large page to its constituent pages.
 */
void
page_demote_free_pages(page_t *pp)
{
	int mnode;

	ASSERT(pp != NULL);
	ASSERT(PAGE_LOCKED(pp));
	ASSERT(PP_ISFREE(pp));
	ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);

	mnode = PP_2_MEM_NODE(pp);
	page_freelist_lock(mnode);
	if (pp->p_szc != 0) {
		(void) page_demote(mnode, PFN_BASE(pp->p_pagenum,
		    pp->p_szc), pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
	}
	page_freelist_unlock(mnode);
	ASSERT(pp->p_szc == 0);
}

/*
 * Factor in colorequiv to check additional 'equivalent' bins.
 * colorequiv may be set in /etc/system.
 */
void
page_set_colorequiv_arr(void)
{
	if (colorequiv > 1) {
		int i;
		uint_t sv_a = lowbit(colorequiv) - 1;

		if (sv_a > 15)
			sv_a = 15;

		for (i = 0; i < MMU_PAGE_SIZES; i++) {
			uint_t colors;
			uint_t a = sv_a;

			if ((colors = hw_page_array[i].hp_colors) <= 1) {
				continue;
			}
			while ((colors >> a) == 0)
				a--;
			if ((a << 4) > colorequivszc[i]) {
				colorequivszc[i] = (a << 4);
			}
		}
	}
}
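
/*
 * Worked example (illustrative, never compiled): with colorequiv set to 4 in
 * /etc/system and a page size whose hp_colors is 2, the computation above
 * stores 0x10 in colorequivszc[] for that page size.  The tunable value and
 * hp_colors value below are hypothetical; the arithmetic mirrors the loop in
 * page_set_colorequiv_arr().
 */
#if 0
static void
example_colorequiv_calc(void)
{
	uint_t colorequiv_example = 4;			/* hypothetical tunable */
	uint_t sv_a = lowbit(colorequiv_example) - 1;	/* lowbit(4) == 3, so sv_a == 2 */
	uint_t colors = 2;				/* hypothetical hp_colors */
	uint_t a = sv_a;

	while ((colors >> a) == 0)
		a--;					/* 2 >> 2 == 0, so a drops to 1 */
	ASSERT((a << 4) == 0x10);			/* value that would be stored */
}
#endif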