1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 27 /* All Rights Reserved */ 28 29 /* 30 * Portions of this source code were derived from Berkeley 4.3 BSD 31 * under license from the Regents of the University of California. 32 */ 33 34 #pragma ident "%Z%%M% %I% %E% SMI" 35 36 /* 37 * This file contains common functions to access and manage the page lists. 38 * Many of these routines originated from platform dependent modules 39 * (sun4/vm/vm_dep.c, i86pc/vm/vm_machdep.c) and modified to function in 40 * a platform independent manner. 41 * 42 * vm/vm_dep.h provides for platform specific support. 43 */ 44 45 #include <sys/types.h> 46 #include <sys/debug.h> 47 #include <sys/cmn_err.h> 48 #include <sys/systm.h> 49 #include <sys/atomic.h> 50 #include <sys/sysmacros.h> 51 #include <vm/as.h> 52 #include <vm/page.h> 53 #include <vm/seg_kmem.h> 54 #include <vm/seg_vn.h> 55 #include <sys/vmsystm.h> 56 #include <sys/memnode.h> 57 #include <vm/vm_dep.h> 58 #include <sys/lgrp.h> 59 #include <sys/mem_config.h> 60 #include <sys/callb.h> 61 #include <sys/mem_cage.h> 62 #include <sys/sdt.h> 63 64 extern uint_t vac_colors; 65 66 #define MAX_PRAGMA_ALIGN 128 67 68 /* vm_cpu_data0 for the boot cpu before kmem is initialized */ 69 70 #if L2CACHE_ALIGN_MAX <= MAX_PRAGMA_ALIGN 71 #pragma align L2CACHE_ALIGN_MAX(vm_cpu_data0) 72 #else 73 #pragma align MAX_PRAGMA_ALIGN(vm_cpu_data0) 74 #endif 75 char vm_cpu_data0[VM_CPU_DATA_PADSIZE]; 76 77 /* 78 * number of page colors equivalent to reqested color in page_get routines. 79 * If set, keeps large pages intact longer and keeps MPO allocation 80 * from the local mnode in favor of acquiring the 'correct' page color from 81 * a demoted large page or from a remote mnode. 82 */ 83 uint_t colorequiv; 84 85 /* 86 * color equivalency mask for each page size. 87 * Mask is computed based on cpu L2$ way sizes and colorequiv global. 88 * High 4 bits determine the number of high order bits of the color to ignore. 89 * Low 4 bits determines number of low order bits of color to ignore (it's only 90 * relevant for hashed index based page coloring). 91 */ 92 uchar_t colorequivszc[MMU_PAGE_SIZES]; 93 94 /* 95 * if set, specifies the percentage of large pages that are free from within 96 * a large page region before attempting to lock those pages for 97 * page_get_contig_pages processing. 98 * 99 * Should be turned on when kpr is available when page_trylock_contig_pages 100 * can be more selective. 101 */ 102 103 int ptcpthreshold; 104 105 /* 106 * Limit page get contig page search based on failure cnts in pgcpfailcnt[]. 
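 * (Illustrative example: with physinstalled equal to 8GB worth of 8K pages,
 * the PGCPFAILMAX bound defined below works out to 1 << 20, so pgcpfailcnt[]
 * for a given page size can reach roughly one million failures before being
 * reset to half of that value.)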
107 * Enabled by default via pgcplimitsearch. 108 * 109 * pgcpfailcnt[] is bounded by PGCPFAILMAX (>= 1/2 of installed 110 * memory). When reached, pgcpfailcnt[] is reset to 1/2 of this upper 111 * bound. This upper bound range guarantees: 112 * - all large page 'slots' will be searched over time 113 * - the minimum (1) large page candidates considered on each pgcp call 114 * - count doesn't wrap around to 0 115 */ 116 pgcnt_t pgcpfailcnt[MMU_PAGE_SIZES]; 117 int pgcplimitsearch = 1; 118 119 #define PGCPFAILMAX (1 << (highbit(physinstalled) - 1)) 120 #define SETPGCPFAILCNT(szc) \ 121 if (++pgcpfailcnt[szc] >= PGCPFAILMAX) \ 122 pgcpfailcnt[szc] = PGCPFAILMAX / 2; 123 124 #ifdef VM_STATS 125 struct vmm_vmstats_str vmm_vmstats; 126 127 #endif /* VM_STATS */ 128 129 #if defined(__sparc) 130 #define LPGCREATE 0 131 #else 132 /* enable page_get_contig_pages */ 133 #define LPGCREATE 1 134 #endif 135 136 int pg_contig_disable; 137 int pg_lpgcreate_nocage = LPGCREATE; 138 139 /* 140 * page_freelist_split pfn flag to signify no hi pfn requirement. 141 */ 142 #define PFNNULL 0 143 144 /* Flags involved in promotion and demotion routines */ 145 #define PC_FREE 0x1 /* put page on freelist */ 146 #define PC_ALLOC 0x2 /* return page for allocation */ 147 148 /* 149 * Flag for page_demote to be used with PC_FREE to denote that we don't care 150 * what the color is as the color parameter to the function is ignored. 151 */ 152 #define PC_NO_COLOR (-1) 153 154 /* mtype value for page_promote to use when mtype does not matter */ 155 #define PC_MTYPE_ANY (-1) 156 157 /* 158 * page counters candidates info 159 * See page_ctrs_cands comment below for more details. 160 * fields are as follows: 161 * pcc_pages_free: # pages which freelist coalesce can create 162 * pcc_color_free: pointer to page free counts per color 163 */ 164 typedef struct pcc_info { 165 pgcnt_t pcc_pages_free; 166 pgcnt_t *pcc_color_free; 167 } pcc_info_t; 168 169 /* 170 * On big machines it can take a long time to check page_counters 171 * arrays. page_ctrs_cands is a summary array whose elements are a dynamically 172 * updated sum of all elements of the corresponding page_counters arrays. 173 * page_freelist_coalesce() searches page_counters only if an appropriate 174 * element of page_ctrs_cands array is greater than 0. 175 * 176 * page_ctrs_cands is indexed by mutex (i), region (r), mnode (m), mrange (g) 177 */ 178 pcc_info_t **page_ctrs_cands[NPC_MUTEX][MMU_PAGE_SIZES]; 179 180 /* 181 * Return in val the total number of free pages which can be created 182 * for the given mnode (m), mrange (g), and region size (r) 183 */ 184 #define PGCTRS_CANDS_GETVALUE(m, g, r, val) { \ 185 int i; \ 186 val = 0; \ 187 for (i = 0; i < NPC_MUTEX; i++) { \ 188 val += page_ctrs_cands[i][(r)][(m)][(g)].pcc_pages_free; \ 189 } \ 190 } 191 192 /* 193 * Return in val the total number of free pages which can be created 194 * for the given mnode (m), mrange (g), region size (r), and color (c) 195 */ 196 #define PGCTRS_CANDS_GETVALUECOLOR(m, g, r, c, val) { \ 197 int i; \ 198 val = 0; \ 199 ASSERT((c) < PAGE_GET_PAGECOLORS(r)); \ 200 for (i = 0; i < NPC_MUTEX; i++) { \ 201 val += \ 202 page_ctrs_cands[i][(r)][(m)][(g)].pcc_color_free[(c)]; \ 203 } \ 204 } 205 206 /* 207 * We can only allow a single thread to update a counter within the physical 208 * range of the largest supported page size. That is the finest granularity 209 * possible since the counter values are dependent on each other 210 * as you move accross region sizes. 
PP_CTR_LOCK_INDX is used to determine the 211 * ctr_mutex lock index for a particular physical range. 212 */ 213 static kmutex_t *ctr_mutex[NPC_MUTEX]; 214 215 #define PP_CTR_LOCK_INDX(pp) \ 216 (((pp)->p_pagenum >> \ 217 (PAGE_BSZS_SHIFT(mmu_page_sizes - 1))) & (NPC_MUTEX - 1)) 218 219 #define INVALID_COLOR 0xffffffff 220 #define INVALID_MASK 0xffffffff 221 222 /* 223 * Local functions prototypes. 224 */ 225 226 void page_ctr_add(int, int, page_t *, int); 227 void page_ctr_add_internal(int, int, page_t *, int); 228 void page_ctr_sub(int, int, page_t *, int); 229 void page_ctr_sub_internal(int, int, page_t *, int); 230 void page_freelist_lock(int); 231 void page_freelist_unlock(int); 232 page_t *page_promote(int, pfn_t, uchar_t, int, int); 233 page_t *page_demote(int, pfn_t, uchar_t, uchar_t, int, int); 234 page_t *page_freelist_split(uchar_t, 235 uint_t, int, int, pfn_t, page_list_walker_t *); 236 page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int); 237 static int page_trylock_cons(page_t *pp, se_t se); 238 239 /* 240 * The page_counters array below is used to keep track of free contiguous 241 * physical memory. A hw_page_map_t will be allocated per mnode per szc. 242 * This contains an array of counters, the size of the array, a shift value 243 * used to convert a pagenum into a counter array index or vice versa, as 244 * well as a cache of the last successful index to be promoted to a larger 245 * page size. As an optimization, we keep track of the last successful index 246 * to be promoted per page color for the given size region, and this is 247 * allocated dynamically based upon the number of colors for a given 248 * region size. 249 * 250 * Conceptually, the page counters are represented as: 251 * 252 * page_counters[region_size][mnode] 253 * 254 * region_size: size code of a candidate larger page made up 255 * of contiguous free smaller pages. 256 * 257 * page_counters[region_size][mnode].hpm_counters[index]: 258 * represents how many (region_size - 1) pages either 259 * exist or can be created within the given index range. 260 * 261 * Let's look at a sparc example: 262 * If we want to create a free 512k page, we look at region_size 2 263 * for the mnode we want. We calculate the index and look at a specific 264 * hpm_counters location. If we see 8 (FULL_REGION_CNT on sparc) at 265 * this location, it means that 8 64k pages either exist or can be created 266 * from 8K pages in order to make a single free 512k page at the given 267 * index. Note that when a region is full, it will contribute to the 268 * counts in the region above it. Thus we will not know what page 269 * size the free pages will be which can be promoted to this new free 270 * page unless we look at all regions below the current region. 271 */ 272 273 /* 274 * Note: hpmctr_t is defined in platform vm_dep.h 275 * hw_page_map_t contains all the information needed for the page_counters 276 * logic. 
The fields are as follows: 277 * 278 * hpm_counters: dynamically allocated array to hold counter data 279 * hpm_entries: entries in hpm_counters 280 * hpm_shift: shift for pnum/array index conv 281 * hpm_base: PFN mapped to counter index 0 282 * hpm_color_current: last index in counter array for this color at 283 * which we successfully created a large page 284 */ 285 typedef struct hw_page_map { 286 hpmctr_t *hpm_counters; 287 size_t hpm_entries; 288 int hpm_shift; 289 pfn_t hpm_base; 290 size_t *hpm_color_current[MAX_MNODE_MRANGES]; 291 } hw_page_map_t; 292 293 /* 294 * Element zero is not used, but is allocated for convenience. 295 */ 296 static hw_page_map_t *page_counters[MMU_PAGE_SIZES]; 297 298 /* 299 * Cached value of MNODE_RANGE_CNT(mnode). 300 * This is a function call in x86. 301 */ 302 static int mnode_nranges[MAX_MEM_NODES]; 303 static int mnode_maxmrange[MAX_MEM_NODES]; 304 305 /* 306 * The following macros are convenient ways to get access to the individual 307 * elements of the page_counters arrays. They can be used on both 308 * the left side and right side of equations. 309 */ 310 #define PAGE_COUNTERS(mnode, rg_szc, idx) \ 311 (page_counters[(rg_szc)][(mnode)].hpm_counters[(idx)]) 312 313 #define PAGE_COUNTERS_COUNTERS(mnode, rg_szc) \ 314 (page_counters[(rg_szc)][(mnode)].hpm_counters) 315 316 #define PAGE_COUNTERS_SHIFT(mnode, rg_szc) \ 317 (page_counters[(rg_szc)][(mnode)].hpm_shift) 318 319 #define PAGE_COUNTERS_ENTRIES(mnode, rg_szc) \ 320 (page_counters[(rg_szc)][(mnode)].hpm_entries) 321 322 #define PAGE_COUNTERS_BASE(mnode, rg_szc) \ 323 (page_counters[(rg_szc)][(mnode)].hpm_base) 324 325 #define PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, rg_szc, g) \ 326 (page_counters[(rg_szc)][(mnode)].hpm_color_current[(g)]) 327 328 #define PAGE_COUNTERS_CURRENT_COLOR(mnode, rg_szc, color, mrange) \ 329 (page_counters[(rg_szc)][(mnode)]. \ 330 hpm_color_current[(mrange)][(color)]) 331 332 #define PNUM_TO_IDX(mnode, rg_szc, pnum) \ 333 (((pnum) - PAGE_COUNTERS_BASE((mnode), (rg_szc))) >> \ 334 PAGE_COUNTERS_SHIFT((mnode), (rg_szc))) 335 336 #define IDX_TO_PNUM(mnode, rg_szc, index) \ 337 (PAGE_COUNTERS_BASE((mnode), (rg_szc)) + \ 338 ((index) << PAGE_COUNTERS_SHIFT((mnode), (rg_szc)))) 339 340 /* 341 * Protects the hpm_counters and hpm_color_current memory from changing while 342 * looking at page counters information. 343 * Grab the write lock to modify what these fields point at. 344 * Grab the read lock to prevent any pointers from changing. 345 * The write lock can not be held during memory allocation due to a possible 346 * recursion deadlock with trying to grab the read lock while the 347 * write lock is already held. 348 */ 349 krwlock_t page_ctrs_rwlock[MAX_MEM_NODES]; 350 351 352 /* 353 * initialize cpu_vm_data to point at cache aligned vm_cpu_data_t. 354 */ 355 void 356 cpu_vm_data_init(struct cpu *cp) 357 { 358 if (cp == CPU0) { 359 cp->cpu_vm_data = (void *)&vm_cpu_data0; 360 } else { 361 void *kmptr; 362 int align; 363 size_t sz; 364 365 align = (L2CACHE_ALIGN) ? 
L2CACHE_ALIGN : L2CACHE_ALIGN_MAX; 366 sz = P2ROUNDUP(sizeof (vm_cpu_data_t), align) + align; 367 kmptr = kmem_zalloc(sz, KM_SLEEP); 368 cp->cpu_vm_data = (void *) P2ROUNDUP((uintptr_t)kmptr, align); 369 ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr = kmptr; 370 ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize = sz; 371 } 372 } 373 374 /* 375 * free cpu_vm_data 376 */ 377 void 378 cpu_vm_data_destroy(struct cpu *cp) 379 { 380 if (cp->cpu_seqid && cp->cpu_vm_data) { 381 ASSERT(cp != CPU0); 382 kmem_free(((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr, 383 ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize); 384 } 385 cp->cpu_vm_data = NULL; 386 } 387 388 389 /* 390 * page size to page size code 391 */ 392 int 393 page_szc(size_t pagesize) 394 { 395 int i = 0; 396 397 while (hw_page_array[i].hp_size) { 398 if (pagesize == hw_page_array[i].hp_size) 399 return (i); 400 i++; 401 } 402 return (-1); 403 } 404 405 /* 406 * page size to page size code with the restriction that it be a supported 407 * user page size. If it's not a supported user page size, -1 will be returned. 408 */ 409 int 410 page_szc_user_filtered(size_t pagesize) 411 { 412 int szc = page_szc(pagesize); 413 if ((szc != -1) && (SZC_2_USERSZC(szc) != -1)) { 414 return (szc); 415 } 416 return (-1); 417 } 418 419 /* 420 * Return how many page sizes are available for the user to use. This is 421 * what the hardware supports and not based upon how the OS implements the 422 * support of different page sizes. 423 * 424 * If legacy is non-zero, return the number of pagesizes available to legacy 425 * applications. The number of legacy page sizes might be less than the 426 * exported user page sizes. This is to prevent legacy applications that 427 * use the largest page size returned from getpagesizes(3c) from inadvertantly 428 * using the 'new' large pagesizes. 429 */ 430 uint_t 431 page_num_user_pagesizes(int legacy) 432 { 433 if (legacy) 434 return (mmu_legacy_page_sizes); 435 return (mmu_exported_page_sizes); 436 } 437 438 uint_t 439 page_num_pagesizes(void) 440 { 441 return (mmu_page_sizes); 442 } 443 444 /* 445 * returns the count of the number of base pagesize pages associated with szc 446 */ 447 pgcnt_t 448 page_get_pagecnt(uint_t szc) 449 { 450 if (szc >= mmu_page_sizes) 451 panic("page_get_pagecnt: out of range %d", szc); 452 return (hw_page_array[szc].hp_pgcnt); 453 } 454 455 size_t 456 page_get_pagesize(uint_t szc) 457 { 458 if (szc >= mmu_page_sizes) 459 panic("page_get_pagesize: out of range %d", szc); 460 return (hw_page_array[szc].hp_size); 461 } 462 463 /* 464 * Return the size of a page based upon the index passed in. An index of 465 * zero refers to the smallest page size in the system, and as index increases 466 * it refers to the next larger supported page size in the system. 467 * Note that szc and userszc may not be the same due to unsupported szc's on 468 * some systems. 
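 * For example (hypothetical mapping): on a platform that implements four
 * internal page sizes but exports only three of them to applications,
 * USERSZC_2_SZC() would map userszc 2 past the unexported size to
 * internal szc 3.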
469 */ 470 size_t 471 page_get_user_pagesize(uint_t userszc) 472 { 473 uint_t szc = USERSZC_2_SZC(userszc); 474 475 if (szc >= mmu_page_sizes) 476 panic("page_get_user_pagesize: out of range %d", szc); 477 return (hw_page_array[szc].hp_size); 478 } 479 480 uint_t 481 page_get_shift(uint_t szc) 482 { 483 if (szc >= mmu_page_sizes) 484 panic("page_get_shift: out of range %d", szc); 485 return (PAGE_GET_SHIFT(szc)); 486 } 487 488 uint_t 489 page_get_pagecolors(uint_t szc) 490 { 491 if (szc >= mmu_page_sizes) 492 panic("page_get_pagecolors: out of range %d", szc); 493 return (PAGE_GET_PAGECOLORS(szc)); 494 } 495 496 /* 497 * this assigns the desired equivalent color after a split 498 */ 499 uint_t 500 page_correct_color(uchar_t szc, uchar_t nszc, uint_t color, 501 uint_t ncolor, uint_t ceq_mask) 502 { 503 ASSERT(nszc > szc); 504 ASSERT(szc < mmu_page_sizes); 505 ASSERT(color < PAGE_GET_PAGECOLORS(szc)); 506 ASSERT(ncolor < PAGE_GET_PAGECOLORS(nszc)); 507 508 color &= ceq_mask; 509 ncolor = PAGE_CONVERT_COLOR(ncolor, szc, nszc); 510 return (color | (ncolor & ~ceq_mask)); 511 } 512 513 /* 514 * The interleaved_mnodes flag is set when mnodes overlap in 515 * the physbase..physmax range, but have disjoint slices. 516 * In this case hpm_counters is shared by all mnodes. 517 * This flag is set dynamically by the platform. 518 */ 519 int interleaved_mnodes = 0; 520 521 /* 522 * Called by startup(). 523 * Size up the per page size free list counters based on physmax 524 * of each node and max_mem_nodes. 525 * 526 * If interleaved_mnodes is set we need to find the first mnode that 527 * exists. hpm_counters for the first mnode will then be shared by 528 * all other mnodes. If interleaved_mnodes is not set, just set 529 * first=mnode each time. That means there will be no sharing. 530 */ 531 size_t 532 page_ctrs_sz(void) 533 { 534 int r; /* region size */ 535 int mnode; 536 int firstmn; /* first mnode that exists */ 537 int nranges; 538 pfn_t physbase; 539 pfn_t physmax; 540 uint_t ctrs_sz = 0; 541 int i; 542 pgcnt_t colors_per_szc[MMU_PAGE_SIZES]; 543 544 /* 545 * We need to determine how many page colors there are for each 546 * page size in order to allocate memory for any color specific 547 * arrays. 548 */ 549 for (i = 0; i < mmu_page_sizes; i++) { 550 colors_per_szc[i] = PAGE_GET_PAGECOLORS(i); 551 } 552 553 for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) { 554 555 pgcnt_t r_pgcnt; 556 pfn_t r_base; 557 pgcnt_t r_align; 558 559 if (mem_node_config[mnode].exists == 0) 560 continue; 561 562 HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn); 563 nranges = MNODE_RANGE_CNT(mnode); 564 mnode_nranges[mnode] = nranges; 565 mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode); 566 567 /* 568 * determine size needed for page counter arrays with 569 * base aligned to large page size. 570 */ 571 for (r = 1; r < mmu_page_sizes; r++) { 572 /* add in space for hpm_color_current */ 573 ctrs_sz += sizeof (size_t) * 574 colors_per_szc[r] * nranges; 575 576 if (firstmn != mnode) 577 continue; 578 579 /* add in space for hpm_counters */ 580 r_align = page_get_pagecnt(r); 581 r_base = physbase; 582 r_base &= ~(r_align - 1); 583 r_pgcnt = howmany(physmax - r_base + 1, r_align); 584 585 /* 586 * Round up to always allocate on pointer sized 587 * boundaries. 
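 * Illustrative sizing example (values assumed, not taken from this file):
 * a 4GB mnode of 8K base pages spans 524288 pfns; for a region size covering
 * 8 base pages that is 65536 hpmctr_t entries, whose byte count is then
 * rounded up to a multiple of sizeof (hpmctr_t *).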
588 */ 589 ctrs_sz += P2ROUNDUP((r_pgcnt * sizeof (hpmctr_t)), 590 sizeof (hpmctr_t *)); 591 } 592 } 593 594 for (r = 1; r < mmu_page_sizes; r++) { 595 ctrs_sz += (max_mem_nodes * sizeof (hw_page_map_t)); 596 } 597 598 /* add in space for page_ctrs_cands and pcc_color_free */ 599 ctrs_sz += sizeof (pcc_info_t *) * max_mem_nodes * 600 mmu_page_sizes * NPC_MUTEX; 601 602 for (mnode = 0; mnode < max_mem_nodes; mnode++) { 603 604 if (mem_node_config[mnode].exists == 0) 605 continue; 606 607 nranges = mnode_nranges[mnode]; 608 ctrs_sz += sizeof (pcc_info_t) * nranges * 609 mmu_page_sizes * NPC_MUTEX; 610 for (r = 1; r < mmu_page_sizes; r++) { 611 ctrs_sz += sizeof (pgcnt_t) * nranges * 612 colors_per_szc[r] * NPC_MUTEX; 613 } 614 } 615 616 /* ctr_mutex */ 617 ctrs_sz += (max_mem_nodes * NPC_MUTEX * sizeof (kmutex_t)); 618 619 /* size for page list counts */ 620 PLCNT_SZ(ctrs_sz); 621 622 /* 623 * add some slop for roundups. page_ctrs_alloc will roundup the start 624 * address of the counters to ecache_alignsize boundary for every 625 * memory node. 626 */ 627 return (ctrs_sz + max_mem_nodes * L2CACHE_ALIGN); 628 } 629 630 caddr_t 631 page_ctrs_alloc(caddr_t alloc_base) 632 { 633 int mnode; 634 int mrange, nranges; 635 int r; /* region size */ 636 int i; 637 int firstmn; /* first mnode that exists */ 638 pfn_t physbase; 639 pfn_t physmax; 640 pgcnt_t colors_per_szc[MMU_PAGE_SIZES]; 641 642 /* 643 * We need to determine how many page colors there are for each 644 * page size in order to allocate memory for any color specific 645 * arrays. 646 */ 647 for (i = 0; i < mmu_page_sizes; i++) { 648 colors_per_szc[i] = PAGE_GET_PAGECOLORS(i); 649 } 650 651 for (r = 1; r < mmu_page_sizes; r++) { 652 page_counters[r] = (hw_page_map_t *)alloc_base; 653 alloc_base += (max_mem_nodes * sizeof (hw_page_map_t)); 654 } 655 656 /* page_ctrs_cands and pcc_color_free array */ 657 for (i = 0; i < NPC_MUTEX; i++) { 658 for (r = 1; r < mmu_page_sizes; r++) { 659 660 page_ctrs_cands[i][r] = (pcc_info_t **)alloc_base; 661 alloc_base += sizeof (pcc_info_t *) * max_mem_nodes; 662 663 for (mnode = 0; mnode < max_mem_nodes; mnode++) { 664 pcc_info_t *pi; 665 666 if (mem_node_config[mnode].exists == 0) 667 continue; 668 669 nranges = mnode_nranges[mnode]; 670 671 pi = (pcc_info_t *)alloc_base; 672 alloc_base += sizeof (pcc_info_t) * nranges; 673 page_ctrs_cands[i][r][mnode] = pi; 674 675 for (mrange = 0; mrange < nranges; mrange++) { 676 pi->pcc_color_free = 677 (pgcnt_t *)alloc_base; 678 alloc_base += sizeof (pgcnt_t) * 679 colors_per_szc[r]; 680 pi++; 681 } 682 } 683 } 684 } 685 686 /* ctr_mutex */ 687 for (i = 0; i < NPC_MUTEX; i++) { 688 ctr_mutex[i] = (kmutex_t *)alloc_base; 689 alloc_base += (max_mem_nodes * sizeof (kmutex_t)); 690 } 691 692 /* initialize page list counts */ 693 PLCNT_INIT(alloc_base); 694 695 for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) { 696 697 pgcnt_t r_pgcnt; 698 pfn_t r_base; 699 pgcnt_t r_align; 700 int r_shift; 701 int nranges = mnode_nranges[mnode]; 702 703 if (mem_node_config[mnode].exists == 0) 704 continue; 705 706 HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn); 707 708 for (r = 1; r < mmu_page_sizes; r++) { 709 /* 710 * the page_counters base has to be aligned to the 711 * page count of page size code r otherwise the counts 712 * will cross large page boundaries. 
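 * Illustrative example (values assumed): if region size r covers 512 base
 * pages and physbase is pfn 0x1234, r_base is lowered to pfn 0x1200 so that
 * counter index 0 starts on a large page boundary.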
713 */ 714 r_align = page_get_pagecnt(r); 715 r_base = physbase; 716 /* base needs to be aligned - lower to aligned value */ 717 r_base &= ~(r_align - 1); 718 r_pgcnt = howmany(physmax - r_base + 1, r_align); 719 r_shift = PAGE_BSZS_SHIFT(r); 720 721 PAGE_COUNTERS_SHIFT(mnode, r) = r_shift; 722 PAGE_COUNTERS_ENTRIES(mnode, r) = r_pgcnt; 723 PAGE_COUNTERS_BASE(mnode, r) = r_base; 724 for (mrange = 0; mrange < nranges; mrange++) { 725 PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, 726 r, mrange) = (size_t *)alloc_base; 727 alloc_base += sizeof (size_t) * 728 colors_per_szc[r]; 729 } 730 for (i = 0; i < colors_per_szc[r]; i++) { 731 uint_t color_mask = colors_per_szc[r] - 1; 732 pfn_t pfnum = r_base; 733 size_t idx; 734 int mrange; 735 MEM_NODE_ITERATOR_DECL(it); 736 737 MEM_NODE_ITERATOR_INIT(pfnum, mnode, &it); 738 ASSERT(pfnum != (pfn_t)-1); 739 PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i, 740 color_mask, color_mask, &it); 741 idx = PNUM_TO_IDX(mnode, r, pfnum); 742 idx = (idx >= r_pgcnt) ? 0 : idx; 743 for (mrange = 0; mrange < nranges; mrange++) { 744 PAGE_COUNTERS_CURRENT_COLOR(mnode, 745 r, i, mrange) = idx; 746 } 747 } 748 749 /* hpm_counters may be shared by all mnodes */ 750 if (firstmn == mnode) { 751 PAGE_COUNTERS_COUNTERS(mnode, r) = 752 (hpmctr_t *)alloc_base; 753 alloc_base += 754 P2ROUNDUP((sizeof (hpmctr_t) * r_pgcnt), 755 sizeof (hpmctr_t *)); 756 } else { 757 PAGE_COUNTERS_COUNTERS(mnode, r) = 758 PAGE_COUNTERS_COUNTERS(firstmn, r); 759 } 760 761 /* 762 * Verify that PNUM_TO_IDX and IDX_TO_PNUM 763 * satisfy the identity requirement. 764 * We should be able to go from one to the other 765 * and get consistent values. 766 */ 767 ASSERT(PNUM_TO_IDX(mnode, r, 768 (IDX_TO_PNUM(mnode, r, 0))) == 0); 769 ASSERT(IDX_TO_PNUM(mnode, r, 770 (PNUM_TO_IDX(mnode, r, r_base))) == r_base); 771 } 772 /* 773 * Roundup the start address of the page_counters to 774 * cache aligned boundary for every memory node. 775 * page_ctrs_sz() has added some slop for these roundups. 776 */ 777 alloc_base = (caddr_t)P2ROUNDUP((uintptr_t)alloc_base, 778 L2CACHE_ALIGN); 779 } 780 781 /* Initialize other page counter specific data structures. */ 782 for (mnode = 0; mnode < MAX_MEM_NODES; mnode++) { 783 rw_init(&page_ctrs_rwlock[mnode], NULL, RW_DEFAULT, NULL); 784 } 785 786 return (alloc_base); 787 } 788 789 /* 790 * Functions to adjust region counters for each size free list. 791 * Caller is responsible to acquire the ctr_mutex lock if necessary and 792 * thus can be called during startup without locks. 793 */ 794 /* ARGSUSED */ 795 void 796 page_ctr_add_internal(int mnode, int mtype, page_t *pp, int flags) 797 { 798 ssize_t r; /* region size */ 799 ssize_t idx; 800 pfn_t pfnum; 801 int lckidx; 802 803 ASSERT(mnode == PP_2_MEM_NODE(pp)); 804 ASSERT(mtype == PP_2_MTYPE(pp)); 805 806 ASSERT(pp->p_szc < mmu_page_sizes); 807 808 PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags); 809 810 /* no counter update needed for largest page size */ 811 if (pp->p_szc >= mmu_page_sizes - 1) { 812 return; 813 } 814 815 r = pp->p_szc + 1; 816 pfnum = pp->p_pagenum; 817 lckidx = PP_CTR_LOCK_INDX(pp); 818 819 /* 820 * Increment the count of free pages for the current 821 * region. Continue looping up in region size incrementing 822 * count if the preceeding region is full. 
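 * Illustrative example, using the sparc sizes from the page_counters
 * comment above: freeing the last 8K page of an otherwise free 64K region
 * brings that region's counter up to FULL_REGION_CNT(1), so the loop below
 * continues and also increments the counter of the enclosing 512K region.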
823 */ 824 while (r < mmu_page_sizes) { 825 idx = PNUM_TO_IDX(mnode, r, pfnum); 826 827 ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r)); 828 ASSERT(PAGE_COUNTERS(mnode, r, idx) < FULL_REGION_CNT(r)); 829 830 if (++PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r)) { 831 break; 832 } else { 833 int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r)); 834 pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode] 835 [MTYPE_2_MRANGE(mnode, root_mtype)]; 836 837 cand->pcc_pages_free++; 838 cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]++; 839 } 840 r++; 841 } 842 } 843 844 void 845 page_ctr_add(int mnode, int mtype, page_t *pp, int flags) 846 { 847 int lckidx = PP_CTR_LOCK_INDX(pp); 848 kmutex_t *lock = &ctr_mutex[lckidx][mnode]; 849 850 mutex_enter(lock); 851 page_ctr_add_internal(mnode, mtype, pp, flags); 852 mutex_exit(lock); 853 } 854 855 void 856 page_ctr_sub_internal(int mnode, int mtype, page_t *pp, int flags) 857 { 858 int lckidx; 859 ssize_t r; /* region size */ 860 ssize_t idx; 861 pfn_t pfnum; 862 863 ASSERT(mnode == PP_2_MEM_NODE(pp)); 864 ASSERT(mtype == PP_2_MTYPE(pp)); 865 866 ASSERT(pp->p_szc < mmu_page_sizes); 867 868 PLCNT_DECR(pp, mnode, mtype, pp->p_szc, flags); 869 870 /* no counter update needed for largest page size */ 871 if (pp->p_szc >= mmu_page_sizes - 1) { 872 return; 873 } 874 875 r = pp->p_szc + 1; 876 pfnum = pp->p_pagenum; 877 lckidx = PP_CTR_LOCK_INDX(pp); 878 879 /* 880 * Decrement the count of free pages for the current 881 * region. Continue looping up in region size decrementing 882 * count if the preceeding region was full. 883 */ 884 while (r < mmu_page_sizes) { 885 idx = PNUM_TO_IDX(mnode, r, pfnum); 886 887 ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r)); 888 ASSERT(PAGE_COUNTERS(mnode, r, idx) > 0); 889 890 if (--PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r) - 1) { 891 break; 892 } else { 893 int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r)); 894 pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode] 895 [MTYPE_2_MRANGE(mnode, root_mtype)]; 896 897 ASSERT(cand->pcc_pages_free != 0); 898 ASSERT(cand->pcc_color_free[PP_2_BIN_SZC(pp, r)] != 0); 899 900 cand->pcc_pages_free--; 901 cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]--; 902 } 903 r++; 904 } 905 } 906 907 void 908 page_ctr_sub(int mnode, int mtype, page_t *pp, int flags) 909 { 910 int lckidx = PP_CTR_LOCK_INDX(pp); 911 kmutex_t *lock = &ctr_mutex[lckidx][mnode]; 912 913 mutex_enter(lock); 914 page_ctr_sub_internal(mnode, mtype, pp, flags); 915 mutex_exit(lock); 916 } 917 918 /* 919 * Adjust page counters following a memory attach, since typically the 920 * size of the array needs to change, and the PFN to counter index 921 * mapping needs to change. 922 * 923 * It is possible this mnode did not exist at startup. In that case 924 * allocate pcc_info_t and pcc_color_free arrays. Also, allow for nranges 925 * to change (a theoretical possibility on x86), which means pcc_color_free 926 * arrays must be extended. 
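 * All replacement arrays are preallocated with KM_NOSLEEP before
 * PAGE_CTRS_WRITE_LOCK is taken, since memory cannot be allocated while the
 * write lock is held; the routine returns 0 on success and ENOMEM if any
 * preallocation fails.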
927 */ 928 uint_t 929 page_ctrs_adjust(int mnode) 930 { 931 pgcnt_t npgs; 932 int r; /* region size */ 933 int i; 934 size_t pcsz, old_csz; 935 hpmctr_t *new_ctr, *old_ctr; 936 pfn_t oldbase, newbase; 937 pfn_t physbase, physmax; 938 size_t old_npgs; 939 hpmctr_t *ctr_cache[MMU_PAGE_SIZES]; 940 size_t size_cache[MMU_PAGE_SIZES]; 941 size_t *color_cache[MMU_PAGE_SIZES][MAX_MNODE_MRANGES]; 942 size_t *old_color_array[MAX_MNODE_MRANGES]; 943 pgcnt_t colors_per_szc[MMU_PAGE_SIZES]; 944 pcc_info_t **cands_cache; 945 pcc_info_t *old_pi, *pi; 946 pgcnt_t *pgcntp; 947 int nr, old_nranges, mrange, nranges = MNODE_RANGE_CNT(mnode); 948 int cands_cache_nranges; 949 int old_maxmrange, new_maxmrange; 950 int rc = 0; 951 952 cands_cache = kmem_zalloc(sizeof (pcc_info_t *) * NPC_MUTEX * 953 MMU_PAGE_SIZES, KM_NOSLEEP); 954 if (cands_cache == NULL) 955 return (ENOMEM); 956 957 i = -1; 958 HPM_COUNTERS_LIMITS(mnode, physbase, physmax, i); 959 960 newbase = physbase & ~PC_BASE_ALIGN_MASK; 961 npgs = roundup(physmax, PC_BASE_ALIGN) - newbase; 962 963 /* prepare to free non-null pointers on the way out */ 964 cands_cache_nranges = nranges; 965 bzero(ctr_cache, sizeof (ctr_cache)); 966 bzero(color_cache, sizeof (color_cache)); 967 968 /* 969 * We need to determine how many page colors there are for each 970 * page size in order to allocate memory for any color specific 971 * arrays. 972 */ 973 for (r = 0; r < mmu_page_sizes; r++) { 974 colors_per_szc[r] = PAGE_GET_PAGECOLORS(r); 975 } 976 977 /* 978 * Preallocate all of the new hpm_counters arrays as we can't 979 * hold the page_ctrs_rwlock as a writer and allocate memory. 980 * If we can't allocate all of the arrays, undo our work so far 981 * and return failure. 982 */ 983 for (r = 1; r < mmu_page_sizes; r++) { 984 pcsz = npgs >> PAGE_BSZS_SHIFT(r); 985 size_cache[r] = pcsz; 986 ctr_cache[r] = kmem_zalloc(pcsz * 987 sizeof (hpmctr_t), KM_NOSLEEP); 988 if (ctr_cache[r] == NULL) { 989 rc = ENOMEM; 990 goto cleanup; 991 } 992 } 993 994 /* 995 * Preallocate all of the new color current arrays as we can't 996 * hold the page_ctrs_rwlock as a writer and allocate memory. 997 * If we can't allocate all of the arrays, undo our work so far 998 * and return failure. 999 */ 1000 for (r = 1; r < mmu_page_sizes; r++) { 1001 for (mrange = 0; mrange < nranges; mrange++) { 1002 color_cache[r][mrange] = kmem_zalloc(sizeof (size_t) * 1003 colors_per_szc[r], KM_NOSLEEP); 1004 if (color_cache[r][mrange] == NULL) { 1005 rc = ENOMEM; 1006 goto cleanup; 1007 } 1008 } 1009 } 1010 1011 /* 1012 * Preallocate all of the new pcc_info_t arrays as we can't 1013 * hold the page_ctrs_rwlock as a writer and allocate memory. 1014 * If we can't allocate all of the arrays, undo our work so far 1015 * and return failure. 1016 */ 1017 for (r = 1; r < mmu_page_sizes; r++) { 1018 for (i = 0; i < NPC_MUTEX; i++) { 1019 pi = kmem_zalloc(nranges * sizeof (pcc_info_t), 1020 KM_NOSLEEP); 1021 if (pi == NULL) { 1022 rc = ENOMEM; 1023 goto cleanup; 1024 } 1025 cands_cache[i * MMU_PAGE_SIZES + r] = pi; 1026 1027 for (mrange = 0; mrange < nranges; mrange++, pi++) { 1028 pgcntp = kmem_zalloc(colors_per_szc[r] * 1029 sizeof (pgcnt_t), KM_NOSLEEP); 1030 if (pgcntp == NULL) { 1031 rc = ENOMEM; 1032 goto cleanup; 1033 } 1034 pi->pcc_color_free = pgcntp; 1035 } 1036 } 1037 } 1038 1039 /* 1040 * Grab the write lock to prevent others from walking these arrays 1041 * while we are modifying them. 
1042 */ 1043 PAGE_CTRS_WRITE_LOCK(mnode); 1044 1045 old_nranges = mnode_nranges[mnode]; 1046 cands_cache_nranges = old_nranges; 1047 mnode_nranges[mnode] = nranges; 1048 old_maxmrange = mnode_maxmrange[mnode]; 1049 mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode); 1050 new_maxmrange = mnode_maxmrange[mnode]; 1051 1052 for (r = 1; r < mmu_page_sizes; r++) { 1053 PAGE_COUNTERS_SHIFT(mnode, r) = PAGE_BSZS_SHIFT(r); 1054 old_ctr = PAGE_COUNTERS_COUNTERS(mnode, r); 1055 old_csz = PAGE_COUNTERS_ENTRIES(mnode, r); 1056 oldbase = PAGE_COUNTERS_BASE(mnode, r); 1057 old_npgs = old_csz << PAGE_COUNTERS_SHIFT(mnode, r); 1058 for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) { 1059 old_color_array[mrange] = 1060 PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, 1061 r, mrange); 1062 } 1063 1064 pcsz = npgs >> PAGE_COUNTERS_SHIFT(mnode, r); 1065 new_ctr = ctr_cache[r]; 1066 ctr_cache[r] = NULL; 1067 if (old_ctr != NULL && 1068 (oldbase + old_npgs > newbase) && 1069 (newbase + npgs > oldbase)) { 1070 /* 1071 * Map the intersection of the old and new 1072 * counters into the new array. 1073 */ 1074 size_t offset; 1075 if (newbase > oldbase) { 1076 offset = (newbase - oldbase) >> 1077 PAGE_COUNTERS_SHIFT(mnode, r); 1078 bcopy(old_ctr + offset, new_ctr, 1079 MIN(pcsz, (old_csz - offset)) * 1080 sizeof (hpmctr_t)); 1081 } else { 1082 offset = (oldbase - newbase) >> 1083 PAGE_COUNTERS_SHIFT(mnode, r); 1084 bcopy(old_ctr, new_ctr + offset, 1085 MIN(pcsz - offset, old_csz) * 1086 sizeof (hpmctr_t)); 1087 } 1088 } 1089 1090 PAGE_COUNTERS_COUNTERS(mnode, r) = new_ctr; 1091 PAGE_COUNTERS_ENTRIES(mnode, r) = pcsz; 1092 PAGE_COUNTERS_BASE(mnode, r) = newbase; 1093 1094 /* update shared hpm_counters in other mnodes */ 1095 if (interleaved_mnodes) { 1096 for (i = 0; i < max_mem_nodes; i++) { 1097 if (i == mnode) 1098 continue; 1099 if (mem_node_config[i].exists == 0) 1100 continue; 1101 ASSERT(PAGE_COUNTERS_COUNTERS(i, r) == old_ctr); 1102 PAGE_COUNTERS_COUNTERS(i, r) = new_ctr; 1103 PAGE_COUNTERS_ENTRIES(i, r) = pcsz; 1104 PAGE_COUNTERS_BASE(i, r) = newbase; 1105 } 1106 } 1107 1108 for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) { 1109 PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r, mrange) = 1110 color_cache[r][mrange]; 1111 color_cache[r][mrange] = NULL; 1112 } 1113 /* 1114 * for now, just reset on these events as it's probably 1115 * not worthwhile to try and optimize this. 1116 */ 1117 for (i = 0; i < colors_per_szc[r]; i++) { 1118 uint_t color_mask = colors_per_szc[r] - 1; 1119 int mlo = interleaved_mnodes ? 0 : mnode; 1120 int mhi = interleaved_mnodes ? max_mem_nodes : 1121 (mnode + 1); 1122 int m; 1123 pfn_t pfnum = newbase; 1124 size_t idx; 1125 MEM_NODE_ITERATOR_DECL(it); 1126 1127 for (m = mlo; m < mhi; m++) { 1128 if (mem_node_config[m].exists == 0) 1129 continue; 1130 MEM_NODE_ITERATOR_INIT(pfnum, m, &it); 1131 ASSERT(pfnum != (pfn_t)-1); 1132 PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i, color_mask, 1133 color_mask, &it); 1134 idx = PNUM_TO_IDX(m, r, pfnum); 1135 idx = (idx < pcsz) ? 
idx : 0; 1136 for (mrange = 0; mrange < nranges; mrange++) { 1137 PAGE_COUNTERS_CURRENT_COLOR(m, 1138 r, i, mrange) = idx; 1139 } 1140 } 1141 } 1142 1143 /* cache info for freeing out of the critical path */ 1144 if ((caddr_t)old_ctr >= kernelheap && 1145 (caddr_t)old_ctr < ekernelheap) { 1146 ctr_cache[r] = old_ctr; 1147 size_cache[r] = old_csz; 1148 } 1149 for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) { 1150 size_t *tmp = old_color_array[mrange]; 1151 if ((caddr_t)tmp >= kernelheap && 1152 (caddr_t)tmp < ekernelheap) { 1153 color_cache[r][mrange] = tmp; 1154 } 1155 } 1156 /* 1157 * Verify that PNUM_TO_IDX and IDX_TO_PNUM 1158 * satisfy the identity requirement. 1159 * We should be able to go from one to the other 1160 * and get consistent values. 1161 */ 1162 ASSERT(PNUM_TO_IDX(mnode, r, 1163 (IDX_TO_PNUM(mnode, r, 0))) == 0); 1164 ASSERT(IDX_TO_PNUM(mnode, r, 1165 (PNUM_TO_IDX(mnode, r, newbase))) == newbase); 1166 1167 /* pcc_info_t and pcc_color_free */ 1168 for (i = 0; i < NPC_MUTEX; i++) { 1169 pcc_info_t *epi; 1170 pcc_info_t *eold_pi; 1171 1172 pi = cands_cache[i * MMU_PAGE_SIZES + r]; 1173 old_pi = page_ctrs_cands[i][r][mnode]; 1174 page_ctrs_cands[i][r][mnode] = pi; 1175 cands_cache[i * MMU_PAGE_SIZES + r] = old_pi; 1176 1177 /* preserve old pcc_color_free values, if any */ 1178 if (old_pi == NULL) 1179 continue; 1180 1181 /* 1182 * when/if x86 does DR, must account for 1183 * possible change in range index when 1184 * preserving pcc_info 1185 */ 1186 epi = &pi[nranges]; 1187 eold_pi = &old_pi[old_nranges]; 1188 if (new_maxmrange > old_maxmrange) { 1189 pi += new_maxmrange - old_maxmrange; 1190 } else if (new_maxmrange < old_maxmrange) { 1191 old_pi += old_maxmrange - new_maxmrange; 1192 } 1193 for (; pi < epi && old_pi < eold_pi; pi++, old_pi++) { 1194 pcc_info_t tmp = *pi; 1195 *pi = *old_pi; 1196 *old_pi = tmp; 1197 } 1198 } 1199 } 1200 PAGE_CTRS_WRITE_UNLOCK(mnode); 1201 1202 /* 1203 * Now that we have dropped the write lock, it is safe to free all 1204 * of the memory we have cached above. 1205 * We come thru here to free memory when pre-alloc fails, and also to 1206 * free old pointers which were recorded while locked. 
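 * (When a preallocation failure jumps here before the write lock was ever
 * taken, the cache arrays still hold the new, unused allocations, so the
 * same kmem_free() calls below cover both cases.)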
1207 */ 1208 cleanup: 1209 for (r = 1; r < mmu_page_sizes; r++) { 1210 if (ctr_cache[r] != NULL) { 1211 kmem_free(ctr_cache[r], 1212 size_cache[r] * sizeof (hpmctr_t)); 1213 } 1214 for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) { 1215 if (color_cache[r][mrange] != NULL) { 1216 kmem_free(color_cache[r][mrange], 1217 colors_per_szc[r] * sizeof (size_t)); 1218 } 1219 } 1220 for (i = 0; i < NPC_MUTEX; i++) { 1221 pi = cands_cache[i * MMU_PAGE_SIZES + r]; 1222 if (pi == NULL) 1223 continue; 1224 nr = cands_cache_nranges; 1225 for (mrange = 0; mrange < nr; mrange++, pi++) { 1226 pgcntp = pi->pcc_color_free; 1227 if (pgcntp == NULL) 1228 continue; 1229 if ((caddr_t)pgcntp >= kernelheap && 1230 (caddr_t)pgcntp < ekernelheap) { 1231 kmem_free(pgcntp, 1232 colors_per_szc[r] * 1233 sizeof (pgcnt_t)); 1234 } 1235 } 1236 pi = cands_cache[i * MMU_PAGE_SIZES + r]; 1237 if ((caddr_t)pi >= kernelheap && 1238 (caddr_t)pi < ekernelheap) { 1239 kmem_free(pi, nr * sizeof (pcc_info_t)); 1240 } 1241 } 1242 } 1243 1244 kmem_free(cands_cache, 1245 sizeof (pcc_info_t *) * NPC_MUTEX * MMU_PAGE_SIZES); 1246 return (rc); 1247 } 1248 1249 1250 #ifdef DEBUG 1251 1252 /* 1253 * confirm pp is a large page corresponding to szc 1254 */ 1255 void 1256 chk_lpg(page_t *pp, uchar_t szc) 1257 { 1258 spgcnt_t npgs = page_get_pagecnt(pp->p_szc); 1259 uint_t noreloc; 1260 1261 if (npgs == 1) { 1262 ASSERT(pp->p_szc == 0); 1263 ASSERT(pp->p_next == pp); 1264 ASSERT(pp->p_prev == pp); 1265 return; 1266 } 1267 1268 ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL); 1269 ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL); 1270 1271 ASSERT(IS_P2ALIGNED(pp->p_pagenum, npgs)); 1272 ASSERT(pp->p_pagenum == (pp->p_next->p_pagenum - 1)); 1273 ASSERT(pp->p_prev->p_pagenum == (pp->p_pagenum + (npgs - 1))); 1274 ASSERT(pp->p_prev == (pp + (npgs - 1))); 1275 1276 /* 1277 * Check list of pages. 1278 */ 1279 noreloc = PP_ISNORELOC(pp); 1280 while (npgs--) { 1281 if (npgs != 0) { 1282 ASSERT(pp->p_pagenum == pp->p_next->p_pagenum - 1); 1283 ASSERT(pp->p_next == (pp + 1)); 1284 } 1285 ASSERT(pp->p_szc == szc); 1286 ASSERT(PP_ISFREE(pp)); 1287 ASSERT(PP_ISAGED(pp)); 1288 ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL); 1289 ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL); 1290 ASSERT(pp->p_vnode == NULL); 1291 ASSERT(PP_ISNORELOC(pp) == noreloc); 1292 1293 pp = pp->p_next; 1294 } 1295 } 1296 #endif /* DEBUG */ 1297 1298 void 1299 page_freelist_lock(int mnode) 1300 { 1301 int i; 1302 for (i = 0; i < NPC_MUTEX; i++) { 1303 mutex_enter(FPC_MUTEX(mnode, i)); 1304 mutex_enter(CPC_MUTEX(mnode, i)); 1305 } 1306 } 1307 1308 void 1309 page_freelist_unlock(int mnode) 1310 { 1311 int i; 1312 for (i = 0; i < NPC_MUTEX; i++) { 1313 mutex_exit(FPC_MUTEX(mnode, i)); 1314 mutex_exit(CPC_MUTEX(mnode, i)); 1315 } 1316 } 1317 1318 /* 1319 * add pp to the specified page list. Defaults to head of the page list 1320 * unless PG_LIST_TAIL is specified. 1321 */ 1322 void 1323 page_list_add(page_t *pp, int flags) 1324 { 1325 page_t **ppp; 1326 kmutex_t *pcm; 1327 uint_t bin, mtype; 1328 int mnode; 1329 1330 ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT)); 1331 ASSERT(PP_ISFREE(pp)); 1332 ASSERT(!hat_page_is_mapped(pp)); 1333 ASSERT(hat_page_getshare(pp) == 0); 1334 1335 /* 1336 * Large pages should be freed via page_list_add_pages(). 1337 */ 1338 ASSERT(pp->p_szc == 0); 1339 1340 /* 1341 * Don't need to lock the freelist first here 1342 * because the page isn't on the freelist yet. 1343 * This means p_szc can't change on us. 
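 * (As noted in page_list_sub(), p_szc is only changed by page_promote()
 * and page_demote(), and those operate only on pages that are already on
 * the freelist.)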
1344 */ 1345 1346 bin = PP_2_BIN(pp); 1347 mnode = PP_2_MEM_NODE(pp); 1348 mtype = PP_2_MTYPE(pp); 1349 1350 if (flags & PG_LIST_ISINIT) { 1351 /* 1352 * PG_LIST_ISINIT is set during system startup (ie. single 1353 * threaded), add a page to the free list and add to the 1354 * the free region counters w/o any locking 1355 */ 1356 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype); 1357 1358 /* inline version of page_add() */ 1359 if (*ppp != NULL) { 1360 pp->p_next = *ppp; 1361 pp->p_prev = (*ppp)->p_prev; 1362 (*ppp)->p_prev = pp; 1363 pp->p_prev->p_next = pp; 1364 } else 1365 *ppp = pp; 1366 1367 page_ctr_add_internal(mnode, mtype, pp, flags); 1368 VM_STAT_ADD(vmm_vmstats.pladd_free[0]); 1369 } else { 1370 pcm = PC_BIN_MUTEX(mnode, bin, flags); 1371 1372 if (flags & PG_FREE_LIST) { 1373 VM_STAT_ADD(vmm_vmstats.pladd_free[0]); 1374 ASSERT(PP_ISAGED(pp)); 1375 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype); 1376 1377 } else { 1378 VM_STAT_ADD(vmm_vmstats.pladd_cache); 1379 ASSERT(pp->p_vnode); 1380 ASSERT((pp->p_offset & PAGEOFFSET) == 0); 1381 ppp = &PAGE_CACHELISTS(mnode, bin, mtype); 1382 } 1383 mutex_enter(pcm); 1384 page_add(ppp, pp); 1385 1386 if (flags & PG_LIST_TAIL) 1387 *ppp = (*ppp)->p_next; 1388 /* 1389 * Add counters before releasing pcm mutex to avoid a race with 1390 * page_freelist_coalesce and page_freelist_split. 1391 */ 1392 page_ctr_add(mnode, mtype, pp, flags); 1393 mutex_exit(pcm); 1394 } 1395 1396 1397 #if defined(__sparc) 1398 if (PP_ISNORELOC(pp)) { 1399 kcage_freemem_add(1); 1400 } 1401 #endif 1402 /* 1403 * It is up to the caller to unlock the page! 1404 */ 1405 ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT)); 1406 } 1407 1408 1409 #ifdef __sparc 1410 /* 1411 * This routine is only used by kcage_init during system startup. 1412 * It performs the function of page_list_sub/PP_SETNORELOC/page_list_add 1413 * without the overhead of taking locks and updating counters. 1414 */ 1415 void 1416 page_list_noreloc_startup(page_t *pp) 1417 { 1418 page_t **ppp; 1419 uint_t bin; 1420 int mnode; 1421 int mtype; 1422 int flags = 0; 1423 1424 /* 1425 * If this is a large page on the freelist then 1426 * break it up into smaller pages. 1427 */ 1428 if (pp->p_szc != 0) 1429 page_boot_demote(pp); 1430 1431 /* 1432 * Get list page is currently on. 1433 */ 1434 bin = PP_2_BIN(pp); 1435 mnode = PP_2_MEM_NODE(pp); 1436 mtype = PP_2_MTYPE(pp); 1437 ASSERT(mtype == MTYPE_RELOC); 1438 ASSERT(pp->p_szc == 0); 1439 1440 if (PP_ISAGED(pp)) { 1441 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype); 1442 flags |= PG_FREE_LIST; 1443 } else { 1444 ppp = &PAGE_CACHELISTS(mnode, bin, mtype); 1445 flags |= PG_CACHE_LIST; 1446 } 1447 1448 ASSERT(*ppp != NULL); 1449 1450 /* 1451 * Delete page from current list. 1452 */ 1453 if (*ppp == pp) 1454 *ppp = pp->p_next; /* go to next page */ 1455 if (*ppp == pp) { 1456 *ppp = NULL; /* page list is gone */ 1457 } else { 1458 pp->p_prev->p_next = pp->p_next; 1459 pp->p_next->p_prev = pp->p_prev; 1460 } 1461 1462 /* 1463 * Decrement page counters 1464 */ 1465 page_ctr_sub_internal(mnode, mtype, pp, flags); 1466 1467 /* 1468 * Set no reloc for cage initted pages. 1469 */ 1470 PP_SETNORELOC(pp); 1471 1472 mtype = PP_2_MTYPE(pp); 1473 ASSERT(mtype == MTYPE_NORELOC); 1474 1475 /* 1476 * Get new list for page. 1477 */ 1478 if (PP_ISAGED(pp)) { 1479 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype); 1480 } else { 1481 ppp = &PAGE_CACHELISTS(mnode, bin, mtype); 1482 } 1483 1484 /* 1485 * Insert page on new list. 
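 * (The page is linked in ahead of the current head without updating *ppp,
 * so it ends up at the tail of the circular list.)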
1486 */ 1487 if (*ppp == NULL) { 1488 *ppp = pp; 1489 pp->p_next = pp->p_prev = pp; 1490 } else { 1491 pp->p_next = *ppp; 1492 pp->p_prev = (*ppp)->p_prev; 1493 (*ppp)->p_prev = pp; 1494 pp->p_prev->p_next = pp; 1495 } 1496 1497 /* 1498 * Increment page counters 1499 */ 1500 page_ctr_add_internal(mnode, mtype, pp, flags); 1501 1502 /* 1503 * Update cage freemem counter 1504 */ 1505 atomic_add_long(&kcage_freemem, 1); 1506 } 1507 #else /* __sparc */ 1508 1509 /* ARGSUSED */ 1510 void 1511 page_list_noreloc_startup(page_t *pp) 1512 { 1513 panic("page_list_noreloc_startup: should be here only for sparc"); 1514 } 1515 #endif 1516 1517 void 1518 page_list_add_pages(page_t *pp, int flags) 1519 { 1520 kmutex_t *pcm; 1521 pgcnt_t pgcnt; 1522 uint_t bin, mtype, i; 1523 int mnode; 1524 1525 /* default to freelist/head */ 1526 ASSERT((flags & (PG_CACHE_LIST | PG_LIST_TAIL)) == 0); 1527 1528 CHK_LPG(pp, pp->p_szc); 1529 VM_STAT_ADD(vmm_vmstats.pladd_free[pp->p_szc]); 1530 1531 bin = PP_2_BIN(pp); 1532 mnode = PP_2_MEM_NODE(pp); 1533 mtype = PP_2_MTYPE(pp); 1534 1535 if (flags & PG_LIST_ISINIT) { 1536 ASSERT(pp->p_szc == mmu_page_sizes - 1); 1537 page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp); 1538 ASSERT(!PP_ISNORELOC(pp)); 1539 PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags); 1540 } else { 1541 1542 ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes); 1543 1544 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST); 1545 1546 mutex_enter(pcm); 1547 page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp); 1548 page_ctr_add(mnode, mtype, pp, PG_FREE_LIST); 1549 mutex_exit(pcm); 1550 1551 pgcnt = page_get_pagecnt(pp->p_szc); 1552 #if defined(__sparc) 1553 if (PP_ISNORELOC(pp)) 1554 kcage_freemem_add(pgcnt); 1555 #endif 1556 for (i = 0; i < pgcnt; i++, pp++) 1557 page_unlock_nocapture(pp); 1558 } 1559 } 1560 1561 /* 1562 * During boot, need to demote a large page to base 1563 * pagesize pages for seg_kmem for use in boot_alloc() 1564 */ 1565 void 1566 page_boot_demote(page_t *pp) 1567 { 1568 ASSERT(pp->p_szc != 0); 1569 ASSERT(PP_ISFREE(pp)); 1570 ASSERT(PP_ISAGED(pp)); 1571 1572 (void) page_demote(PP_2_MEM_NODE(pp), 1573 PFN_BASE(pp->p_pagenum, pp->p_szc), pp->p_szc, 0, PC_NO_COLOR, 1574 PC_FREE); 1575 1576 ASSERT(PP_ISFREE(pp)); 1577 ASSERT(PP_ISAGED(pp)); 1578 ASSERT(pp->p_szc == 0); 1579 } 1580 1581 /* 1582 * Take a particular page off of whatever freelist the page 1583 * is claimed to be on. 1584 * 1585 * NOTE: Only used for PAGESIZE pages. 1586 */ 1587 void 1588 page_list_sub(page_t *pp, int flags) 1589 { 1590 int bin; 1591 uint_t mtype; 1592 int mnode; 1593 kmutex_t *pcm; 1594 page_t **ppp; 1595 1596 ASSERT(PAGE_EXCL(pp)); 1597 ASSERT(PP_ISFREE(pp)); 1598 1599 /* 1600 * The p_szc field can only be changed by page_promote() 1601 * and page_demote(). Only free pages can be promoted and 1602 * demoted and the free list MUST be locked during these 1603 * operations. So to prevent a race in page_list_sub() 1604 * between computing which bin of the freelist lock to 1605 * grab and actually grabing the lock we check again that 1606 * the bin we locked is still the correct one. Notice that 1607 * the p_szc field could have actually changed on us but 1608 * if the bin happens to still be the same we are safe. 
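 * Illustrative sequence: this thread computes the bin from p_szc, another
 * thread holding the freelist locks promotes or demotes the page, and the
 * bin recomputed after mutex_enter() no longer matches, so the mutex is
 * dropped and the lookup retried.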
1609 */ 1610 try_again: 1611 bin = PP_2_BIN(pp); 1612 mnode = PP_2_MEM_NODE(pp); 1613 pcm = PC_BIN_MUTEX(mnode, bin, flags); 1614 mutex_enter(pcm); 1615 if (PP_2_BIN(pp) != bin) { 1616 mutex_exit(pcm); 1617 goto try_again; 1618 } 1619 mtype = PP_2_MTYPE(pp); 1620 1621 if (flags & PG_FREE_LIST) { 1622 VM_STAT_ADD(vmm_vmstats.plsub_free[0]); 1623 ASSERT(PP_ISAGED(pp)); 1624 ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype); 1625 } else { 1626 VM_STAT_ADD(vmm_vmstats.plsub_cache); 1627 ASSERT(!PP_ISAGED(pp)); 1628 ppp = &PAGE_CACHELISTS(mnode, bin, mtype); 1629 } 1630 1631 /* 1632 * Common PAGESIZE case. 1633 * 1634 * Note that we locked the freelist. This prevents 1635 * any page promotion/demotion operations. Therefore 1636 * the p_szc will not change until we drop pcm mutex. 1637 */ 1638 if (pp->p_szc == 0) { 1639 page_sub(ppp, pp); 1640 /* 1641 * Subtract counters before releasing pcm mutex 1642 * to avoid race with page_freelist_coalesce. 1643 */ 1644 page_ctr_sub(mnode, mtype, pp, flags); 1645 mutex_exit(pcm); 1646 1647 #if defined(__sparc) 1648 if (PP_ISNORELOC(pp)) { 1649 kcage_freemem_sub(1); 1650 } 1651 #endif 1652 return; 1653 } 1654 1655 /* 1656 * Large pages on the cache list are not supported. 1657 */ 1658 if (flags & PG_CACHE_LIST) 1659 panic("page_list_sub: large page on cachelist"); 1660 1661 /* 1662 * Slow but rare. 1663 * 1664 * Somebody wants this particular page which is part 1665 * of a large page. In this case we just demote the page 1666 * if it's on the freelist. 1667 * 1668 * We have to drop pcm before locking the entire freelist. 1669 * Once we have re-locked the freelist check to make sure 1670 * the page hasn't already been demoted or completely 1671 * freed. 1672 */ 1673 mutex_exit(pcm); 1674 page_freelist_lock(mnode); 1675 if (pp->p_szc != 0) { 1676 /* 1677 * Large page is on freelist. 1678 */ 1679 (void) page_demote(mnode, PFN_BASE(pp->p_pagenum, pp->p_szc), 1680 pp->p_szc, 0, PC_NO_COLOR, PC_FREE); 1681 } 1682 ASSERT(PP_ISFREE(pp)); 1683 ASSERT(PP_ISAGED(pp)); 1684 ASSERT(pp->p_szc == 0); 1685 1686 /* 1687 * Subtract counters before releasing pcm mutex 1688 * to avoid race with page_freelist_coalesce. 1689 */ 1690 bin = PP_2_BIN(pp); 1691 mtype = PP_2_MTYPE(pp); 1692 ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype); 1693 1694 page_sub(ppp, pp); 1695 page_ctr_sub(mnode, mtype, pp, flags); 1696 page_freelist_unlock(mnode); 1697 1698 #if defined(__sparc) 1699 if (PP_ISNORELOC(pp)) { 1700 kcage_freemem_sub(1); 1701 } 1702 #endif 1703 } 1704 1705 void 1706 page_list_sub_pages(page_t *pp, uint_t szc) 1707 { 1708 kmutex_t *pcm; 1709 uint_t bin, mtype; 1710 int mnode; 1711 1712 ASSERT(PAGE_EXCL(pp)); 1713 ASSERT(PP_ISFREE(pp)); 1714 ASSERT(PP_ISAGED(pp)); 1715 1716 /* 1717 * See comment in page_list_sub(). 1718 */ 1719 try_again: 1720 bin = PP_2_BIN(pp); 1721 mnode = PP_2_MEM_NODE(pp); 1722 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST); 1723 mutex_enter(pcm); 1724 if (PP_2_BIN(pp) != bin) { 1725 mutex_exit(pcm); 1726 goto try_again; 1727 } 1728 1729 /* 1730 * If we're called with a page larger than szc or it got 1731 * promoted above szc before we locked the freelist then 1732 * drop pcm and re-lock entire freelist. If page still larger 1733 * than szc then demote it. 
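 * (pcm is cleared in that path so the unlock code at the bottom of this
 * routine knows whether to drop the single bin mutex or the whole freelist
 * lock.)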
1734 */ 1735 if (pp->p_szc > szc) { 1736 mutex_exit(pcm); 1737 pcm = NULL; 1738 page_freelist_lock(mnode); 1739 if (pp->p_szc > szc) { 1740 VM_STAT_ADD(vmm_vmstats.plsubpages_szcbig); 1741 (void) page_demote(mnode, 1742 PFN_BASE(pp->p_pagenum, pp->p_szc), 1743 pp->p_szc, szc, PC_NO_COLOR, PC_FREE); 1744 } 1745 bin = PP_2_BIN(pp); 1746 } 1747 ASSERT(PP_ISFREE(pp)); 1748 ASSERT(PP_ISAGED(pp)); 1749 ASSERT(pp->p_szc <= szc); 1750 ASSERT(pp == PP_PAGEROOT(pp)); 1751 1752 VM_STAT_ADD(vmm_vmstats.plsub_free[pp->p_szc]); 1753 1754 mtype = PP_2_MTYPE(pp); 1755 if (pp->p_szc != 0) { 1756 page_vpsub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp); 1757 CHK_LPG(pp, pp->p_szc); 1758 } else { 1759 VM_STAT_ADD(vmm_vmstats.plsubpages_szc0); 1760 page_sub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp); 1761 } 1762 page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST); 1763 1764 if (pcm != NULL) { 1765 mutex_exit(pcm); 1766 } else { 1767 page_freelist_unlock(mnode); 1768 } 1769 1770 #if defined(__sparc) 1771 if (PP_ISNORELOC(pp)) { 1772 pgcnt_t pgcnt; 1773 1774 pgcnt = page_get_pagecnt(pp->p_szc); 1775 kcage_freemem_sub(pgcnt); 1776 } 1777 #endif 1778 } 1779 1780 /* 1781 * Add the page to the front of a linked list of pages 1782 * using the p_next & p_prev pointers for the list. 1783 * The caller is responsible for protecting the list pointers. 1784 */ 1785 void 1786 mach_page_add(page_t **ppp, page_t *pp) 1787 { 1788 if (*ppp == NULL) { 1789 pp->p_next = pp->p_prev = pp; 1790 } else { 1791 pp->p_next = *ppp; 1792 pp->p_prev = (*ppp)->p_prev; 1793 (*ppp)->p_prev = pp; 1794 pp->p_prev->p_next = pp; 1795 } 1796 *ppp = pp; 1797 } 1798 1799 /* 1800 * Remove this page from a linked list of pages 1801 * using the p_next & p_prev pointers for the list. 1802 * 1803 * The caller is responsible for protecting the list pointers. 1804 */ 1805 void 1806 mach_page_sub(page_t **ppp, page_t *pp) 1807 { 1808 ASSERT(PP_ISFREE(pp)); 1809 1810 if (*ppp == NULL || pp == NULL) 1811 panic("mach_page_sub"); 1812 1813 if (*ppp == pp) 1814 *ppp = pp->p_next; /* go to next page */ 1815 1816 if (*ppp == pp) 1817 *ppp = NULL; /* page list is gone */ 1818 else { 1819 pp->p_prev->p_next = pp->p_next; 1820 pp->p_next->p_prev = pp->p_prev; 1821 } 1822 pp->p_prev = pp->p_next = pp; /* make pp a list of one */ 1823 } 1824 1825 /* 1826 * Routine fsflush uses to gradually coalesce the free list into larger pages. 1827 */ 1828 void 1829 page_promote_size(page_t *pp, uint_t cur_szc) 1830 { 1831 pfn_t pfn; 1832 int mnode; 1833 int idx; 1834 int new_szc = cur_szc + 1; 1835 int full = FULL_REGION_CNT(new_szc); 1836 1837 pfn = page_pptonum(pp); 1838 mnode = PFN_2_MEM_NODE(pfn); 1839 1840 page_freelist_lock(mnode); 1841 1842 idx = PNUM_TO_IDX(mnode, new_szc, pfn); 1843 if (PAGE_COUNTERS(mnode, new_szc, idx) == full) 1844 (void) page_promote(mnode, pfn, new_szc, PC_FREE, PC_MTYPE_ANY); 1845 1846 page_freelist_unlock(mnode); 1847 } 1848 1849 static uint_t page_promote_err; 1850 static uint_t page_promote_noreloc_err; 1851 1852 /* 1853 * Create a single larger page (of szc new_szc) from smaller contiguous pages 1854 * for the given mnode starting at pfnum. Pages involved are on the freelist 1855 * before the call and may be returned to the caller if requested, otherwise 1856 * they will be placed back on the freelist. 1857 * If flags is PC_ALLOC, then the large page will be returned to the user in 1858 * a state which is consistent with a page being taken off the freelist. 
If 1859 * we failed to lock the new large page, then we will return NULL to the 1860 * caller and put the large page on the freelist instead. 1861 * If flags is PC_FREE, then the large page will be placed on the freelist, 1862 * and NULL will be returned. 1863 * The caller is responsible for locking the freelist as well as any other 1864 * accounting which needs to be done for a returned page. 1865 * 1866 * RFE: For performance pass in pp instead of pfnum so 1867 * we can avoid excessive calls to page_numtopp_nolock(). 1868 * This would depend on an assumption that all contiguous 1869 * pages are in the same memseg so we can just add/dec 1870 * our pp. 1871 * 1872 * Lock ordering: 1873 * 1874 * There is a potential but rare deadlock situation 1875 * for page promotion and demotion operations. The problem 1876 * is there are two paths into the freelist manager and 1877 * they have different lock orders: 1878 * 1879 * page_create() 1880 * lock freelist 1881 * page_lock(EXCL) 1882 * unlock freelist 1883 * return 1884 * caller drops page_lock 1885 * 1886 * page_free() and page_reclaim() 1887 * caller grabs page_lock(EXCL) 1888 * 1889 * lock freelist 1890 * unlock freelist 1891 * drop page_lock 1892 * 1893 * What prevents a thread in page_create() from deadlocking 1894 * with a thread freeing or reclaiming the same page is the 1895 * page_trylock() in page_get_freelist(). If the trylock fails 1896 * it skips the page. 1897 * 1898 * The lock ordering for promotion and demotion is the same as 1899 * for page_create(). Since the same deadlock could occur during 1900 * page promotion and freeing or reclaiming of a page on the 1901 * cache list we might have to fail the operation and undo what 1902 * have done so far. Again this is rare. 1903 */ 1904 page_t * 1905 page_promote(int mnode, pfn_t pfnum, uchar_t new_szc, int flags, int mtype) 1906 { 1907 page_t *pp, *pplist, *tpp, *start_pp; 1908 pgcnt_t new_npgs, npgs; 1909 uint_t bin; 1910 pgcnt_t tmpnpgs, pages_left; 1911 uint_t noreloc; 1912 int which_list; 1913 ulong_t index; 1914 kmutex_t *phm; 1915 1916 /* 1917 * General algorithm: 1918 * Find the starting page 1919 * Walk each page struct removing it from the freelist, 1920 * and linking it to all the other pages removed. 1921 * Once all pages are off the freelist, 1922 * walk the list, modifying p_szc to new_szc and what 1923 * ever other info needs to be done to create a large free page. 1924 * According to the flags, either return the page or put it 1925 * on the freelist. 1926 */ 1927 1928 start_pp = page_numtopp_nolock(pfnum); 1929 ASSERT(start_pp && (start_pp->p_pagenum == pfnum)); 1930 new_npgs = page_get_pagecnt(new_szc); 1931 ASSERT(IS_P2ALIGNED(pfnum, new_npgs)); 1932 1933 /* don't return page of the wrong mtype */ 1934 if (mtype != PC_MTYPE_ANY && mtype != PP_2_MTYPE(start_pp)) 1935 return (NULL); 1936 1937 /* 1938 * Loop through smaller pages to confirm that all pages 1939 * give the same result for PP_ISNORELOC(). 1940 * We can check this reliably here as the protocol for setting 1941 * P_NORELOC requires pages to be taken off the free list first. 1942 */ 1943 noreloc = PP_ISNORELOC(start_pp); 1944 for (pp = start_pp + new_npgs; --pp > start_pp; ) { 1945 if (noreloc != PP_ISNORELOC(pp)) { 1946 page_promote_noreloc_err++; 1947 page_promote_err++; 1948 return (NULL); 1949 } 1950 } 1951 1952 pages_left = new_npgs; 1953 pplist = NULL; 1954 pp = start_pp; 1955 1956 /* Loop around coalescing the smaller pages into a big page. 
*/ 1957 while (pages_left) { 1958 /* 1959 * Remove from the freelist. 1960 */ 1961 ASSERT(PP_ISFREE(pp)); 1962 bin = PP_2_BIN(pp); 1963 ASSERT(mnode == PP_2_MEM_NODE(pp)); 1964 mtype = PP_2_MTYPE(pp); 1965 if (PP_ISAGED(pp)) { 1966 1967 /* 1968 * PG_FREE_LIST 1969 */ 1970 if (pp->p_szc) { 1971 page_vpsub(&PAGE_FREELISTS(mnode, 1972 pp->p_szc, bin, mtype), pp); 1973 } else { 1974 mach_page_sub(&PAGE_FREELISTS(mnode, 0, 1975 bin, mtype), pp); 1976 } 1977 which_list = PG_FREE_LIST; 1978 } else { 1979 ASSERT(pp->p_szc == 0); 1980 1981 /* 1982 * PG_CACHE_LIST 1983 * 1984 * Since this page comes from the 1985 * cachelist, we must destroy the 1986 * vnode association. 1987 */ 1988 if (!page_trylock(pp, SE_EXCL)) { 1989 goto fail_promote; 1990 } 1991 1992 /* 1993 * We need to be careful not to deadlock 1994 * with another thread in page_lookup(). 1995 * The page_lookup() thread could be holding 1996 * the same phm that we need if the two 1997 * pages happen to hash to the same phm lock. 1998 * At this point we have locked the entire 1999 * freelist and page_lookup() could be trying 2000 * to grab a freelist lock. 2001 */ 2002 index = PAGE_HASH_FUNC(pp->p_vnode, pp->p_offset); 2003 phm = PAGE_HASH_MUTEX(index); 2004 if (!mutex_tryenter(phm)) { 2005 page_unlock_nocapture(pp); 2006 goto fail_promote; 2007 } 2008 2009 mach_page_sub(&PAGE_CACHELISTS(mnode, bin, mtype), pp); 2010 page_hashout(pp, phm); 2011 mutex_exit(phm); 2012 PP_SETAGED(pp); 2013 page_unlock_nocapture(pp); 2014 which_list = PG_CACHE_LIST; 2015 } 2016 page_ctr_sub(mnode, mtype, pp, which_list); 2017 2018 /* 2019 * Concatenate the smaller page(s) onto 2020 * the large page list. 2021 */ 2022 tmpnpgs = npgs = page_get_pagecnt(pp->p_szc); 2023 pages_left -= npgs; 2024 tpp = pp; 2025 while (npgs--) { 2026 tpp->p_szc = new_szc; 2027 tpp = tpp->p_next; 2028 } 2029 page_list_concat(&pplist, &pp); 2030 pp += tmpnpgs; 2031 } 2032 CHK_LPG(pplist, new_szc); 2033 2034 /* 2035 * return the page to the user if requested 2036 * in the properly locked state. 2037 */ 2038 if (flags == PC_ALLOC && (page_trylock_cons(pplist, SE_EXCL))) { 2039 return (pplist); 2040 } 2041 2042 /* 2043 * Otherwise place the new large page on the freelist 2044 */ 2045 bin = PP_2_BIN(pplist); 2046 mnode = PP_2_MEM_NODE(pplist); 2047 mtype = PP_2_MTYPE(pplist); 2048 page_vpadd(&PAGE_FREELISTS(mnode, new_szc, bin, mtype), pplist); 2049 2050 page_ctr_add(mnode, mtype, pplist, PG_FREE_LIST); 2051 return (NULL); 2052 2053 fail_promote: 2054 /* 2055 * A thread must have still been freeing or 2056 * reclaiming the page on the cachelist. 2057 * To prevent a deadlock undo what we have 2058 * done sofar and return failure. This 2059 * situation can only happen while promoting 2060 * PAGESIZE pages. 2061 */ 2062 page_promote_err++; 2063 while (pplist) { 2064 pp = pplist; 2065 mach_page_sub(&pplist, pp); 2066 pp->p_szc = 0; 2067 bin = PP_2_BIN(pp); 2068 mtype = PP_2_MTYPE(pp); 2069 mach_page_add(&PAGE_FREELISTS(mnode, 0, bin, mtype), pp); 2070 page_ctr_add(mnode, mtype, pp, PG_FREE_LIST); 2071 } 2072 return (NULL); 2073 2074 } 2075 2076 /* 2077 * Break up a large page into smaller size pages. 2078 * Pages involved are on the freelist before the call and may 2079 * be returned to the caller if requested, otherwise they will 2080 * be placed back on the freelist. 2081 * The caller is responsible for locking the freelist as well as any other 2082 * accounting which needs to be done for a returned page. 
2083 * If flags is not PC_ALLOC, the color argument is ignored, and thus 2084 * technically, any value may be passed in but PC_NO_COLOR is the standard 2085 * which should be followed for clarity's sake. 2086 */ 2087 page_t * 2088 page_demote(int mnode, pfn_t pfnum, uchar_t cur_szc, uchar_t new_szc, 2089 int color, int flags) 2090 { 2091 page_t *pp, *pplist, *npplist; 2092 pgcnt_t npgs, n; 2093 uint_t bin; 2094 uint_t mtype; 2095 page_t *ret_pp = NULL; 2096 2097 ASSERT(cur_szc != 0); 2098 ASSERT(new_szc < cur_szc); 2099 2100 pplist = page_numtopp_nolock(pfnum); 2101 ASSERT(pplist != NULL); 2102 2103 ASSERT(pplist->p_szc == cur_szc); 2104 2105 bin = PP_2_BIN(pplist); 2106 ASSERT(mnode == PP_2_MEM_NODE(pplist)); 2107 mtype = PP_2_MTYPE(pplist); 2108 page_vpsub(&PAGE_FREELISTS(mnode, cur_szc, bin, mtype), pplist); 2109 2110 CHK_LPG(pplist, cur_szc); 2111 page_ctr_sub(mnode, mtype, pplist, PG_FREE_LIST); 2112 2113 /* 2114 * Number of PAGESIZE pages for smaller new_szc 2115 * page. 2116 */ 2117 npgs = page_get_pagecnt(new_szc); 2118 2119 while (pplist) { 2120 pp = pplist; 2121 2122 ASSERT(pp->p_szc == cur_szc); 2123 2124 /* 2125 * We either break it up into PAGESIZE pages or larger. 2126 */ 2127 if (npgs == 1) { /* PAGESIZE case */ 2128 mach_page_sub(&pplist, pp); 2129 ASSERT(pp->p_szc == cur_szc); 2130 ASSERT(new_szc == 0); 2131 ASSERT(mnode == PP_2_MEM_NODE(pp)); 2132 pp->p_szc = new_szc; 2133 bin = PP_2_BIN(pp); 2134 if ((bin == color) && (flags == PC_ALLOC) && 2135 (ret_pp == NULL) && 2136 page_trylock_cons(pp, SE_EXCL)) { 2137 ret_pp = pp; 2138 } else { 2139 mtype = PP_2_MTYPE(pp); 2140 mach_page_add(&PAGE_FREELISTS(mnode, 0, bin, 2141 mtype), pp); 2142 page_ctr_add(mnode, mtype, pp, PG_FREE_LIST); 2143 } 2144 } else { 2145 2146 /* 2147 * Break down into smaller lists of pages. 2148 */ 2149 page_list_break(&pplist, &npplist, npgs); 2150 2151 pp = pplist; 2152 n = npgs; 2153 while (n--) { 2154 ASSERT(pp->p_szc == cur_szc); 2155 pp->p_szc = new_szc; 2156 pp = pp->p_next; 2157 } 2158 2159 CHK_LPG(pplist, new_szc); 2160 2161 bin = PP_2_BIN(pplist); 2162 ASSERT(mnode == PP_2_MEM_NODE(pp)); 2163 if ((bin == color) && (flags == PC_ALLOC) && 2164 (ret_pp == NULL) && 2165 page_trylock_cons(pp, SE_EXCL)) { 2166 ret_pp = pp; 2167 } else { 2168 mtype = PP_2_MTYPE(pp); 2169 page_vpadd(&PAGE_FREELISTS(mnode, new_szc, 2170 bin, mtype), pplist); 2171 2172 page_ctr_add(mnode, mtype, pplist, 2173 PG_FREE_LIST); 2174 } 2175 pplist = npplist; 2176 } 2177 } 2178 return (ret_pp); 2179 } 2180 2181 int mpss_coalesce_disable = 0; 2182 2183 /* 2184 * Coalesce free pages into a page of the given szc and color if possible. 2185 * Return the pointer to the page created, otherwise, return NULL. 2186 * 2187 * If pfnhi is non-zero, search for large page with pfn range less than pfnhi. 
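 * For example, to assemble a 4M page (szc 3 on sun4u) of an acceptable color,
 * the search below scans the szc 3 page_counters entries; an entry equal to
 * FULL_REGION_CNT(3) means every constituent page of that region is free, so
 * page_promote() can build and return the large page.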
2188 */ 2189 page_t * 2190 page_freelist_coalesce(int mnode, uchar_t szc, uint_t color, uint_t ceq_mask, 2191 int mtype, pfn_t pfnhi) 2192 { 2193 int r = szc; /* region size */ 2194 int mrange; 2195 uint_t full, bin, color_mask, wrap = 0; 2196 pfn_t pfnum, lo, hi; 2197 size_t len, idx, idx0; 2198 pgcnt_t cands = 0, szcpgcnt = page_get_pagecnt(szc); 2199 page_t *ret_pp; 2200 MEM_NODE_ITERATOR_DECL(it); 2201 #if defined(__sparc) 2202 pfn_t pfnum0, nlo, nhi; 2203 #endif 2204 2205 if (mpss_coalesce_disable) { 2206 ASSERT(szc < MMU_PAGE_SIZES); 2207 VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[szc][0]); 2208 return (NULL); 2209 } 2210 2211 ASSERT(szc < mmu_page_sizes); 2212 color_mask = PAGE_GET_PAGECOLORS(szc) - 1; 2213 ASSERT(ceq_mask <= color_mask); 2214 ASSERT(color <= color_mask); 2215 color &= ceq_mask; 2216 2217 /* Prevent page_counters dynamic memory from being freed */ 2218 rw_enter(&page_ctrs_rwlock[mnode], RW_READER); 2219 2220 mrange = MTYPE_2_MRANGE(mnode, mtype); 2221 ASSERT(mrange < mnode_nranges[mnode]); 2222 VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[r][mrange]); 2223 2224 /* get pfn range for mtype */ 2225 len = PAGE_COUNTERS_ENTRIES(mnode, r); 2226 #if defined(__sparc) 2227 lo = PAGE_COUNTERS_BASE(mnode, r); 2228 hi = IDX_TO_PNUM(mnode, r, len); 2229 #else 2230 MNODETYPE_2_PFN(mnode, mtype, lo, hi); 2231 hi++; 2232 #endif 2233 2234 /* use lower limit if given */ 2235 if (pfnhi != PFNNULL && pfnhi < hi) 2236 hi = pfnhi; 2237 2238 /* round to szcpgcnt boundaries */ 2239 lo = P2ROUNDUP(lo, szcpgcnt); 2240 MEM_NODE_ITERATOR_INIT(lo, mnode, &it); 2241 ASSERT(lo != (pfn_t)-1); 2242 hi = hi & ~(szcpgcnt - 1); 2243 2244 /* set lo to the closest pfn of the right color */ 2245 if (((PFN_2_COLOR(lo, szc, &it) ^ color) & ceq_mask) || 2246 (interleaved_mnodes && PFN_2_MEM_NODE(lo) != mnode)) { 2247 PAGE_NEXT_PFN_FOR_COLOR(lo, szc, color, ceq_mask, color_mask, 2248 &it); 2249 } 2250 2251 if (hi <= lo) { 2252 rw_exit(&page_ctrs_rwlock[mnode]); 2253 return (NULL); 2254 } 2255 2256 full = FULL_REGION_CNT(r); 2257 2258 /* calculate the number of page candidates and initial search index */ 2259 bin = color; 2260 idx0 = (size_t)(-1); 2261 do { 2262 pgcnt_t acand; 2263 2264 PGCTRS_CANDS_GETVALUECOLOR(mnode, mrange, r, bin, acand); 2265 if (acand) { 2266 idx = PAGE_COUNTERS_CURRENT_COLOR(mnode, 2267 r, bin, mrange); 2268 idx0 = MIN(idx0, idx); 2269 cands += acand; 2270 } 2271 bin = ADD_MASKED(bin, 1, ceq_mask, color_mask); 2272 } while (bin != color); 2273 2274 if (cands == 0) { 2275 VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip[r][mrange]); 2276 rw_exit(&page_ctrs_rwlock[mnode]); 2277 return (NULL); 2278 } 2279 2280 pfnum = IDX_TO_PNUM(mnode, r, idx0); 2281 if (pfnum < lo || pfnum >= hi) { 2282 pfnum = lo; 2283 } else { 2284 MEM_NODE_ITERATOR_INIT(pfnum, mnode, &it); 2285 if (pfnum == (pfn_t)-1) { 2286 pfnum = lo; 2287 MEM_NODE_ITERATOR_INIT(pfnum, mnode, &it); 2288 ASSERT(pfnum != (pfn_t)-1); 2289 } else if ((PFN_2_COLOR(pfnum, szc, &it) ^ color) & ceq_mask || 2290 (interleaved_mnodes && PFN_2_MEM_NODE(pfnum) != mnode)) { 2291 /* invalid color, get the closest correct pfn */ 2292 PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask, 2293 color_mask, &it); 2294 if (pfnum >= hi) { 2295 pfnum = lo; 2296 MEM_NODE_ITERATOR_INIT(pfnum, mnode, &it); 2297 } 2298 } 2299 } 2300 2301 /* set starting index */ 2302 idx0 = PNUM_TO_IDX(mnode, r, pfnum); 2303 ASSERT(idx0 < len); 2304 2305 #if defined(__sparc) 2306 pfnum0 = pfnum; /* page corresponding to idx0 */ 2307 nhi = 0; /* search kcage ranges */ 2308 #endif 
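	/*
	 * Scan the candidate regions starting at idx0 and walk toward hi.
	 * When the end of the range is reached, wrap once back to lo and
	 * continue up toward the starting index.
	 */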
2309 2310 for (idx = idx0; wrap == 0 || (idx < idx0 && wrap < 2); ) { 2311 2312 #if defined(__sparc) 2313 /* 2314 * Find lowest intersection of kcage ranges and mnode. 2315 * MTYPE_NORELOC means look in the cage, otherwise outside. 2316 */ 2317 if (nhi <= pfnum) { 2318 if (kcage_next_range(mtype == MTYPE_NORELOC, pfnum, 2319 (wrap == 0 ? hi : pfnum0), &nlo, &nhi)) 2320 goto wrapit; 2321 2322 /* jump to the next page in the range */ 2323 if (pfnum < nlo) { 2324 pfnum = P2ROUNDUP(nlo, szcpgcnt); 2325 MEM_NODE_ITERATOR_INIT(pfnum, mnode, &it); 2326 idx = PNUM_TO_IDX(mnode, r, pfnum); 2327 if (idx >= len || pfnum >= hi) 2328 goto wrapit; 2329 if ((PFN_2_COLOR(pfnum, szc, &it) ^ color) & 2330 ceq_mask) 2331 goto next; 2332 if (interleaved_mnodes && 2333 PFN_2_MEM_NODE(pfnum) != mnode) 2334 goto next; 2335 } 2336 } 2337 #endif 2338 2339 if (PAGE_COUNTERS(mnode, r, idx) != full) 2340 goto next; 2341 2342 /* 2343 * RFE: For performance maybe we can do something less 2344 * brutal than locking the entire freelist. So far 2345 * this doesn't seem to be a performance problem? 2346 */ 2347 page_freelist_lock(mnode); 2348 if (PAGE_COUNTERS(mnode, r, idx) == full) { 2349 ret_pp = 2350 page_promote(mnode, pfnum, r, PC_ALLOC, mtype); 2351 if (ret_pp != NULL) { 2352 VM_STAT_ADD(vmm_vmstats.pfc_coalok[r][mrange]); 2353 PAGE_COUNTERS_CURRENT_COLOR(mnode, r, 2354 PFN_2_COLOR(pfnum, szc, &it), mrange) = idx; 2355 page_freelist_unlock(mnode); 2356 rw_exit(&page_ctrs_rwlock[mnode]); 2357 #if defined(__sparc) 2358 if (PP_ISNORELOC(ret_pp)) { 2359 pgcnt_t npgs; 2360 2361 npgs = page_get_pagecnt(ret_pp->p_szc); 2362 kcage_freemem_sub(npgs); 2363 } 2364 #endif 2365 return (ret_pp); 2366 } 2367 } else { 2368 VM_STAT_ADD(vmm_vmstats.page_ctrs_changed[r][mrange]); 2369 } 2370 2371 page_freelist_unlock(mnode); 2372 /* 2373 * No point looking for another page if we've 2374 * already tried all of the ones that 2375 * page_ctr_cands indicated. Stash off where we left 2376 * off. 2377 * Note: this is not exact since we don't hold the 2378 * page_freelist_locks before we initially get the 2379 * value of cands for performance reasons, but should 2380 * be a decent approximation. 2381 */ 2382 if (--cands == 0) { 2383 PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color, mrange) = 2384 idx; 2385 break; 2386 } 2387 next: 2388 PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask, 2389 color_mask, &it); 2390 idx = PNUM_TO_IDX(mnode, r, pfnum); 2391 if (idx >= len || pfnum >= hi) { 2392 wrapit: 2393 pfnum = lo; 2394 MEM_NODE_ITERATOR_INIT(pfnum, mnode, &it); 2395 idx = PNUM_TO_IDX(mnode, r, pfnum); 2396 wrap++; 2397 #if defined(__sparc) 2398 nhi = 0; /* search kcage ranges */ 2399 #endif 2400 } 2401 } 2402 2403 rw_exit(&page_ctrs_rwlock[mnode]); 2404 VM_STAT_ADD(vmm_vmstats.page_ctrs_failed[r][mrange]); 2405 return (NULL); 2406 } 2407 2408 /* 2409 * For the given mnode, promote as many small pages to large pages as possible. 2410 * mnode can be -1, which means do them all 2411 */ 2412 void 2413 page_freelist_coalesce_all(int mnode) 2414 { 2415 int r; /* region size */ 2416 int idx, full; 2417 size_t len; 2418 int doall = interleaved_mnodes || mnode < 0; 2419 int mlo = doall ? 0 : mnode; 2420 int mhi = doall ? max_mem_nodes : (mnode + 1); 2421 2422 VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce_all); 2423 2424 if (mpss_coalesce_disable) { 2425 return; 2426 } 2427 2428 /* 2429 * Lock the entire freelist and coalesce what we can. 2430 * 2431 * Always promote to the largest page possible 2432 * first to reduce the number of page promotions. 
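 * For example, on sun4u it is cheaper to assemble a free 4M page directly
 * from its 8K constituents than to first form the 64K and 512K pages inside
 * it and promote those again, so the region loop below starts at
 * mmu_page_sizes - 1 and works down.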
2433 */ 2434 for (mnode = mlo; mnode < mhi; mnode++) { 2435 rw_enter(&page_ctrs_rwlock[mnode], RW_READER); 2436 page_freelist_lock(mnode); 2437 } 2438 for (r = mmu_page_sizes - 1; r > 0; r--) { 2439 for (mnode = mlo; mnode < mhi; mnode++) { 2440 pgcnt_t cands = 0; 2441 int mrange, nranges = mnode_nranges[mnode]; 2442 2443 for (mrange = 0; mrange < nranges; mrange++) { 2444 PGCTRS_CANDS_GETVALUE(mnode, mrange, r, cands); 2445 if (cands != 0) 2446 break; 2447 } 2448 if (cands == 0) { 2449 VM_STAT_ADD(vmm_vmstats. 2450 page_ctrs_cands_skip_all); 2451 continue; 2452 } 2453 2454 full = FULL_REGION_CNT(r); 2455 len = PAGE_COUNTERS_ENTRIES(mnode, r); 2456 2457 for (idx = 0; idx < len; idx++) { 2458 if (PAGE_COUNTERS(mnode, r, idx) == full) { 2459 pfn_t pfnum = 2460 IDX_TO_PNUM(mnode, r, idx); 2461 int tmnode = interleaved_mnodes ? 2462 PFN_2_MEM_NODE(pfnum) : mnode; 2463 2464 ASSERT(pfnum >= 2465 mem_node_config[tmnode].physbase && 2466 pfnum < 2467 mem_node_config[tmnode].physmax); 2468 2469 (void) page_promote(tmnode, 2470 pfnum, r, PC_FREE, PC_MTYPE_ANY); 2471 } 2472 } 2473 /* shared hpm_counters covers all mnodes, so we quit */ 2474 if (interleaved_mnodes) 2475 break; 2476 } 2477 } 2478 for (mnode = mlo; mnode < mhi; mnode++) { 2479 page_freelist_unlock(mnode); 2480 rw_exit(&page_ctrs_rwlock[mnode]); 2481 } 2482 } 2483 2484 /* 2485 * This is where all policies for moving pages around 2486 * to different page size free lists are implemented. 2487 * Returns a pointer to the page obtained on success, NULL on failure. 2488 * 2489 * So far these are the priorities for this algorithm in descending 2490 * order: 2491 * 2492 * 1) When servicing a request try to do so with a free page 2493 * from next size up. Helps defer fragmentation as long 2494 * as possible. 2495 * 2496 * 2) Page coalesce on demand. Only when a freelist 2497 * larger than PAGESIZE is empty and step 1 2498 * will not work since all larger size lists are 2499 * also empty. 2500 * 2501 * If pfnhi is non-zero, search for large page with pfn range less than pfnhi. 2502 */ 2503 2504 page_t * 2505 page_freelist_split(uchar_t szc, uint_t color, int mnode, int mtype, 2506 pfn_t pfnhi, page_list_walker_t *plw) 2507 { 2508 uchar_t nszc = szc + 1; 2509 uint_t bin, sbin, bin_prev; 2510 page_t *pp, *firstpp; 2511 page_t *ret_pp = NULL; 2512 uint_t color_mask; 2513 2514 if (nszc == mmu_page_sizes) 2515 return (NULL); 2516 2517 ASSERT(nszc < mmu_page_sizes); 2518 color_mask = PAGE_GET_PAGECOLORS(nszc) - 1; 2519 bin = sbin = PAGE_GET_NSZ_COLOR(szc, color); 2520 bin_prev = (plw->plw_bin_split_prev == color) ? INVALID_COLOR : 2521 PAGE_GET_NSZ_COLOR(szc, plw->plw_bin_split_prev); 2522 2523 VM_STAT_ADD(vmm_vmstats.pfs_req[szc]); 2524 /* 2525 * First try to break up a larger page to fill current size freelist. 2526 */ 2527 while (plw->plw_bins[nszc] != 0) { 2528 2529 ASSERT(nszc < mmu_page_sizes); 2530 2531 /* 2532 * If page found then demote it. 2533 */ 2534 if (PAGE_FREELISTS(mnode, nszc, bin, mtype)) { 2535 page_freelist_lock(mnode); 2536 firstpp = pp = PAGE_FREELISTS(mnode, nszc, bin, mtype); 2537 2538 /* 2539 * If pfnhi is not PFNNULL, look for large page below 2540 * pfnhi. PFNNULL signifies no pfn requirement.
2541 */ 2542 if (pfnhi != PFNNULL && pp->p_pagenum >= pfnhi) { 2543 do { 2544 pp = pp->p_vpnext; 2545 if (pp == firstpp) { 2546 pp = NULL; 2547 break; 2548 } 2549 } while (pp->p_pagenum >= pfnhi); 2550 } 2551 if (pp) { 2552 uint_t ccolor = page_correct_color(szc, nszc, 2553 color, bin, plw->plw_ceq_mask[szc]); 2554 2555 ASSERT(pp->p_szc == nszc); 2556 VM_STAT_ADD(vmm_vmstats.pfs_demote[nszc]); 2557 ret_pp = page_demote(mnode, pp->p_pagenum, 2558 pp->p_szc, szc, ccolor, PC_ALLOC); 2559 if (ret_pp) { 2560 page_freelist_unlock(mnode); 2561 #if defined(__sparc) 2562 if (PP_ISNORELOC(ret_pp)) { 2563 pgcnt_t npgs; 2564 2565 npgs = page_get_pagecnt( 2566 ret_pp->p_szc); 2567 kcage_freemem_sub(npgs); 2568 } 2569 #endif 2570 return (ret_pp); 2571 } 2572 } 2573 page_freelist_unlock(mnode); 2574 } 2575 2576 /* loop through next size bins */ 2577 bin = ADD_MASKED(bin, 1, plw->plw_ceq_mask[nszc], color_mask); 2578 plw->plw_bins[nszc]--; 2579 2580 if (bin == sbin) { 2581 uchar_t nnszc = nszc + 1; 2582 2583 /* we are done with this page size - check next */ 2584 if (plw->plw_bins[nnszc] == 0) 2585 /* we have already checked next size bins */ 2586 break; 2587 2588 bin = sbin = PAGE_GET_NSZ_COLOR(nszc, bin); 2589 if (bin_prev != INVALID_COLOR) { 2590 bin_prev = PAGE_GET_NSZ_COLOR(nszc, bin_prev); 2591 if (!((bin ^ bin_prev) & 2592 plw->plw_ceq_mask[nnszc])) 2593 break; 2594 } 2595 ASSERT(nnszc < mmu_page_sizes); 2596 color_mask = PAGE_GET_PAGECOLORS(nnszc) - 1; 2597 nszc = nnszc; 2598 ASSERT(nszc < mmu_page_sizes); 2599 } 2600 } 2601 2602 return (ret_pp); 2603 } 2604 2605 /* 2606 * Helper routine used only by the freelist code to lock 2607 * a page. If the page is a large page then it succeeds in 2608 * locking all the constituent pages or none at all. 2609 * Returns 1 on success, 0 on failure. 2610 */ 2611 static int 2612 page_trylock_cons(page_t *pp, se_t se) 2613 { 2614 page_t *tpp, *first_pp = pp; 2615 2616 /* 2617 * Fail if can't lock first or only page. 2618 */ 2619 if (!page_trylock(pp, se)) { 2620 return (0); 2621 } 2622 2623 /* 2624 * PAGESIZE: common case. 2625 */ 2626 if (pp->p_szc == 0) { 2627 return (1); 2628 } 2629 2630 /* 2631 * Large page case. 2632 */ 2633 tpp = pp->p_next; 2634 while (tpp != pp) { 2635 if (!page_trylock(tpp, se)) { 2636 /* 2637 * On failure unlock what we have locked so far. 2638 * We want to avoid attempting to capture these 2639 * pages as the pcm mutex may be held which could 2640 * lead to a recursive mutex panic. 2641 */ 2642 while (first_pp != tpp) { 2643 page_unlock_nocapture(first_pp); 2644 first_pp = first_pp->p_next; 2645 } 2646 return (0); 2647 } 2648 tpp = tpp->p_next; 2649 } 2650 return (1); 2651 } 2652 2653 /* 2654 * init context for walking page lists 2655 * Called when a page of the given szc is unavailable. Sets markers 2656 * for the beginning of the search to detect when search has 2657 * completed a full cycle. Sets flags for splitting larger pages 2658 * and coalescing smaller pages. Page walking proceeds until a page 2659 * of the desired equivalent color is found. 2660 */ 2661 void 2662 page_list_walk_init(uchar_t szc, uint_t flags, uint_t bin, int can_split, 2663 int use_ceq, page_list_walker_t *plw) 2664 { 2665 uint_t nszc, ceq_mask, colors; 2666 uchar_t ceq = use_ceq ?
colorequivszc[szc] : 0; 2667 2668 ASSERT(szc < mmu_page_sizes); 2669 colors = PAGE_GET_PAGECOLORS(szc); 2670 2671 plw->plw_colors = colors; 2672 plw->plw_color_mask = colors - 1; 2673 plw->plw_bin_marker = plw->plw_bin0 = bin; 2674 plw->plw_bin_split_prev = bin; 2675 plw->plw_bin_step = (szc == 0) ? vac_colors : 1; 2676 2677 /* 2678 * if vac aliasing is possible make sure lower order color 2679 * bits are never ignored 2680 */ 2681 if (vac_colors > 1) 2682 ceq &= 0xf0; 2683 2684 /* 2685 * calculate the number of non-equivalent colors and 2686 * color equivalency mask 2687 */ 2688 plw->plw_ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf)); 2689 ASSERT(szc > 0 || plw->plw_ceq_dif >= vac_colors); 2690 ASSERT(plw->plw_ceq_dif > 0); 2691 plw->plw_ceq_mask[szc] = (plw->plw_ceq_dif - 1) << (ceq & 0xf); 2692 2693 if (flags & PG_MATCH_COLOR) { 2694 if (cpu_page_colors < 0) { 2695 /* 2696 * this is a heterogeneous machine with different CPUs 2697 * having different size e$ (not supported for ni2/rock 2698 */ 2699 uint_t cpucolors = CPUSETSIZE() >> PAGE_GET_SHIFT(szc); 2700 cpucolors = MAX(cpucolors, 1); 2701 ceq_mask = plw->plw_color_mask & (cpucolors - 1); 2702 plw->plw_ceq_mask[szc] = 2703 MIN(ceq_mask, plw->plw_ceq_mask[szc]); 2704 } 2705 plw->plw_ceq_dif = 1; 2706 } 2707 2708 /* we can split pages in the freelist, but not the cachelist */ 2709 if (can_split) { 2710 plw->plw_do_split = (szc + 1 < mmu_page_sizes) ? 1 : 0; 2711 2712 /* set next szc color masks and number of free list bins */ 2713 for (nszc = szc + 1; nszc < mmu_page_sizes; nszc++, szc++) { 2714 plw->plw_ceq_mask[nszc] = PAGE_GET_NSZ_MASK(szc, 2715 plw->plw_ceq_mask[szc]); 2716 plw->plw_bins[nszc] = PAGE_GET_PAGECOLORS(nszc); 2717 } 2718 plw->plw_ceq_mask[nszc] = INVALID_MASK; 2719 plw->plw_bins[nszc] = 0; 2720 2721 } else { 2722 ASSERT(szc == 0); 2723 plw->plw_do_split = 0; 2724 plw->plw_bins[1] = 0; 2725 plw->plw_ceq_mask[1] = INVALID_MASK; 2726 } 2727 } 2728 2729 /* 2730 * set mark to flag where next split should occur 2731 */ 2732 #define PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw) { \ 2733 uint_t bin_nsz = PAGE_GET_NSZ_COLOR(szc, bin); \ 2734 uint_t bin0_nsz = PAGE_GET_NSZ_COLOR(szc, plw->plw_bin0); \ 2735 uint_t neq_mask = ~plw->plw_ceq_mask[nszc] & plw->plw_color_mask; \ 2736 plw->plw_split_next = \ 2737 INC_MASKED(bin_nsz, neq_mask, plw->plw_color_mask); \ 2738 if (!((plw->plw_split_next ^ bin0_nsz) & plw->plw_ceq_mask[nszc])) { \ 2739 plw->plw_split_next = \ 2740 INC_MASKED(plw->plw_split_next, \ 2741 neq_mask, plw->plw_color_mask); \ 2742 } \ 2743 } 2744 2745 uint_t 2746 page_list_walk_next_bin(uchar_t szc, uint_t bin, page_list_walker_t *plw) 2747 { 2748 uint_t neq_mask = ~plw->plw_ceq_mask[szc] & plw->plw_color_mask; 2749 uint_t bin0_nsz, nbin_nsz, nbin0, nbin; 2750 uchar_t nszc = szc + 1; 2751 2752 nbin = ADD_MASKED(bin, 2753 plw->plw_bin_step, neq_mask, plw->plw_color_mask); 2754 2755 if (plw->plw_do_split) { 2756 plw->plw_bin_split_prev = bin; 2757 PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw); 2758 plw->plw_do_split = 0; 2759 } 2760 2761 if (szc == 0) { 2762 if (plw->plw_count != 0 || plw->plw_ceq_dif == vac_colors) { 2763 if (nbin == plw->plw_bin0 && 2764 (vac_colors == 1 || nbin != plw->plw_bin_marker)) { 2765 nbin = ADD_MASKED(nbin, plw->plw_bin_step, 2766 neq_mask, plw->plw_color_mask); 2767 plw->plw_bin_split_prev = plw->plw_bin0; 2768 } 2769 2770 if (vac_colors > 1 && nbin == plw->plw_bin_marker) { 2771 plw->plw_bin_marker = 2772 nbin = INC_MASKED(nbin, neq_mask, 2773 plw->plw_color_mask); 2774 
plw->plw_bin_split_prev = plw->plw_bin0; 2775 /* 2776 * large pages all have the same vac color 2777 * so by now we should be done with next 2778 * size page splitting process 2779 */ 2780 ASSERT(plw->plw_bins[1] == 0); 2781 plw->plw_do_split = 0; 2782 return (nbin); 2783 } 2784 2785 } else { 2786 uint_t bin_jump = (vac_colors == 1) ? 2787 (BIN_STEP & ~3) - (plw->plw_bin0 & 3) : BIN_STEP; 2788 2789 bin_jump &= ~(vac_colors - 1); 2790 2791 nbin0 = ADD_MASKED(plw->plw_bin0, bin_jump, neq_mask, 2792 plw->plw_color_mask); 2793 2794 if ((nbin0 ^ plw->plw_bin0) & plw->plw_ceq_mask[szc]) { 2795 2796 plw->plw_bin_marker = nbin = nbin0; 2797 2798 if (plw->plw_bins[nszc] != 0) { 2799 /* 2800 * check if next page size bin is the 2801 * same as the next page size bin for 2802 * bin0 2803 */ 2804 nbin_nsz = PAGE_GET_NSZ_COLOR(szc, 2805 nbin); 2806 bin0_nsz = PAGE_GET_NSZ_COLOR(szc, 2807 plw->plw_bin0); 2808 2809 if ((bin0_nsz ^ nbin_nsz) & 2810 plw->plw_ceq_mask[nszc]) 2811 plw->plw_do_split = 1; 2812 } 2813 return (nbin); 2814 } 2815 } 2816 } 2817 2818 if (plw->plw_bins[nszc] != 0) { 2819 nbin_nsz = PAGE_GET_NSZ_COLOR(szc, nbin); 2820 if (!((plw->plw_split_next ^ nbin_nsz) & 2821 plw->plw_ceq_mask[nszc])) 2822 plw->plw_do_split = 1; 2823 } 2824 2825 return (nbin); 2826 } 2827 2828 page_t * 2829 page_get_mnode_freelist(int mnode, uint_t bin, int mtype, uchar_t szc, 2830 uint_t flags) 2831 { 2832 kmutex_t *pcm; 2833 page_t *pp, *first_pp; 2834 uint_t sbin; 2835 int plw_initialized; 2836 page_list_walker_t plw; 2837 2838 ASSERT(szc < mmu_page_sizes); 2839 2840 VM_STAT_ADD(vmm_vmstats.pgmf_alloc[szc]); 2841 2842 MTYPE_START(mnode, mtype, flags); 2843 if (mtype < 0) { /* mnode does not have memory in mtype range */ 2844 VM_STAT_ADD(vmm_vmstats.pgmf_allocempty[szc]); 2845 return (NULL); 2846 } 2847 try_again: 2848 2849 plw_initialized = 0; 2850 plw.plw_ceq_dif = 1; 2851 2852 /* 2853 * Only hold one freelist lock at a time, that way we 2854 * can start anywhere and not have to worry about lock 2855 * ordering. 2856 */ 2857 for (plw.plw_count = 0; 2858 plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) { 2859 sbin = bin; 2860 do { 2861 if (!PAGE_FREELISTS(mnode, szc, bin, mtype)) 2862 goto bin_empty_1; 2863 2864 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST); 2865 mutex_enter(pcm); 2866 pp = PAGE_FREELISTS(mnode, szc, bin, mtype); 2867 if (pp == NULL) 2868 goto bin_empty_0; 2869 2870 /* 2871 * These were set before the page 2872 * was put on the free list, 2873 * they must still be set. 2874 */ 2875 ASSERT(PP_ISFREE(pp)); 2876 ASSERT(PP_ISAGED(pp)); 2877 ASSERT(pp->p_vnode == NULL); 2878 ASSERT(pp->p_hash == NULL); 2879 ASSERT(pp->p_offset == (u_offset_t)-1); 2880 ASSERT(pp->p_szc == szc); 2881 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode); 2882 2883 /* 2884 * Walk down the hash chain. 2885 * 8k pages are linked on p_next 2886 * and p_prev fields. Large pages 2887 * are a contiguous group of 2888 * constituent pages linked together 2889 * on their p_next and p_prev fields. 2890 * The large pages are linked together 2891 * on the hash chain using p_vpnext 2892 * p_vpprev of the base constituent 2893 * page of each large page. 
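 * For example, a free 64K page on sun4u is eight 8K page_t structures
 * joined in their own p_next/p_prev ring, and only its base page_t is
 * threaded onto the bin's p_vpnext/p_vpprev chain.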
2894 */ 2895 first_pp = pp; 2896 while (!page_trylock_cons(pp, SE_EXCL)) { 2897 if (szc == 0) { 2898 pp = pp->p_next; 2899 } else { 2900 pp = pp->p_vpnext; 2901 } 2902 2903 ASSERT(PP_ISFREE(pp)); 2904 ASSERT(PP_ISAGED(pp)); 2905 ASSERT(pp->p_vnode == NULL); 2906 ASSERT(pp->p_hash == NULL); 2907 ASSERT(pp->p_offset == (u_offset_t)-1); 2908 ASSERT(pp->p_szc == szc); 2909 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode); 2910 2911 if (pp == first_pp) 2912 goto bin_empty_0; 2913 } 2914 2915 ASSERT(pp != NULL); 2916 ASSERT(mtype == PP_2_MTYPE(pp)); 2917 ASSERT(pp->p_szc == szc); 2918 if (szc == 0) { 2919 page_sub(&PAGE_FREELISTS(mnode, 2920 szc, bin, mtype), pp); 2921 } else { 2922 page_vpsub(&PAGE_FREELISTS(mnode, 2923 szc, bin, mtype), pp); 2924 CHK_LPG(pp, szc); 2925 } 2926 page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST); 2927 2928 if ((PP_ISFREE(pp) == 0) || (PP_ISAGED(pp) == 0)) 2929 panic("free page is not. pp %p", (void *)pp); 2930 mutex_exit(pcm); 2931 2932 #if defined(__sparc) 2933 ASSERT(!kcage_on || PP_ISNORELOC(pp) || 2934 (flags & PG_NORELOC) == 0); 2935 2936 if (PP_ISNORELOC(pp)) 2937 kcage_freemem_sub(page_get_pagecnt(szc)); 2938 #endif 2939 VM_STAT_ADD(vmm_vmstats.pgmf_allocok[szc]); 2940 return (pp); 2941 2942 bin_empty_0: 2943 mutex_exit(pcm); 2944 bin_empty_1: 2945 if (plw_initialized == 0) { 2946 page_list_walk_init(szc, flags, bin, 1, 1, 2947 &plw); 2948 plw_initialized = 1; 2949 ASSERT(plw.plw_colors <= 2950 PAGE_GET_PAGECOLORS(szc)); 2951 ASSERT(plw.plw_colors > 0); 2952 ASSERT((plw.plw_colors & 2953 (plw.plw_colors - 1)) == 0); 2954 ASSERT(bin < plw.plw_colors); 2955 ASSERT(plw.plw_ceq_mask[szc] < plw.plw_colors); 2956 } 2957 /* calculate the next bin with equivalent color */ 2958 bin = ADD_MASKED(bin, plw.plw_bin_step, 2959 plw.plw_ceq_mask[szc], plw.plw_color_mask); 2960 } while (sbin != bin); 2961 2962 /* 2963 * color bins are all empty if color match. Try and 2964 * satisfy the request by breaking up or coalescing 2965 * pages from a different size freelist of the correct 2966 * color that satisfies the ORIGINAL color requested. 2967 * If that fails then try pages of the same size but 2968 * different colors assuming we are not called with 2969 * PG_MATCH_COLOR. 2970 */ 2971 if (plw.plw_do_split && 2972 (pp = page_freelist_split(szc, bin, mnode, 2973 mtype, PFNNULL, &plw)) != NULL) 2974 return (pp); 2975 2976 if (szc > 0 && (pp = page_freelist_coalesce(mnode, szc, 2977 bin, plw.plw_ceq_mask[szc], mtype, PFNNULL)) != NULL) 2978 return (pp); 2979 2980 if (plw.plw_ceq_dif > 1) 2981 bin = page_list_walk_next_bin(szc, bin, &plw); 2982 } 2983 2984 /* if allowed, cycle through additional mtypes */ 2985 MTYPE_NEXT(mnode, mtype, flags); 2986 if (mtype >= 0) 2987 goto try_again; 2988 2989 VM_STAT_ADD(vmm_vmstats.pgmf_allocfailed[szc]); 2990 2991 return (NULL); 2992 } 2993 2994 /* 2995 * Returns the count of free pages for 'pp' with size code 'szc'. 2996 * Note: This function does not return an exact value as the page freelist 2997 * locks are not held and thus the values in the page_counters may be 2998 * changing as we walk through the data. 
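 * For example, if the level-szc counter for the region equals
 * FULL_REGION_CNT(szc), the region is entirely free and contributes
 * FULL_REGION_CNT(szc) << PNUM_SHIFT(szc - 1) PAGESIZE pages; otherwise the
 * loop below descends a level at a time, adding in sub-regions that were not
 * already counted as full at a higher level.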
2999 */ 3000 static int 3001 page_freecnt(int mnode, page_t *pp, uchar_t szc) 3002 { 3003 pgcnt_t pgfree; 3004 pgcnt_t cnt; 3005 ssize_t r = szc; /* region size */ 3006 ssize_t idx; 3007 int i; 3008 int full, range; 3009 3010 /* Make sure pagenum passed in is aligned properly */ 3011 ASSERT((pp->p_pagenum & (PNUM_SIZE(szc) - 1)) == 0); 3012 ASSERT(szc > 0); 3013 3014 /* Prevent page_counters dynamic memory from being freed */ 3015 rw_enter(&page_ctrs_rwlock[mnode], RW_READER); 3016 idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum); 3017 cnt = PAGE_COUNTERS(mnode, r, idx); 3018 pgfree = cnt << PNUM_SHIFT(r - 1); 3019 range = FULL_REGION_CNT(szc); 3020 3021 /* Check for completely full region */ 3022 if (cnt == range) { 3023 rw_exit(&page_ctrs_rwlock[mnode]); 3024 return (pgfree); 3025 } 3026 3027 while (--r > 0) { 3028 idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum); 3029 full = FULL_REGION_CNT(r); 3030 for (i = 0; i < range; i++, idx++) { 3031 cnt = PAGE_COUNTERS(mnode, r, idx); 3032 /* 3033 * If cnt here is full, that means we have already 3034 * accounted for these pages earlier. 3035 */ 3036 if (cnt != full) { 3037 pgfree += (cnt << PNUM_SHIFT(r - 1)); 3038 } 3039 } 3040 range *= full; 3041 } 3042 rw_exit(&page_ctrs_rwlock[mnode]); 3043 return (pgfree); 3044 } 3045 3046 /* 3047 * Called from page_geti_contig_pages to exclusively lock constituent pages 3048 * starting from 'spp' for page size code 'szc'. 3049 * 3050 * If 'ptcpthreshold' is set, the number of free pages needed in the 'szc' 3051 * region needs to be greater than or equal to the threshold. 3052 */ 3053 static int 3054 page_trylock_contig_pages(int mnode, page_t *spp, uchar_t szc, int flags) 3055 { 3056 pgcnt_t pgcnt = PNUM_SIZE(szc); 3057 pgcnt_t pgfree, i; 3058 page_t *pp; 3059 3060 VM_STAT_ADD(vmm_vmstats.ptcp[szc]); 3061 3062 3063 if ((ptcpthreshold == 0) || (flags & PGI_PGCPHIPRI)) 3064 goto skipptcpcheck; 3065 /* 3066 * check if there are sufficient free pages available before attempting 3067 * to trylock. Count is approximate as page counters can change. 3068 */ 3069 pgfree = page_freecnt(mnode, spp, szc); 3070 3071 /* attempt to trylock if there are sufficient already free pages */ 3072 if (pgfree < pgcnt/ptcpthreshold) { 3073 VM_STAT_ADD(vmm_vmstats.ptcpfreethresh[szc]); 3074 return (0); 3075 } 3076 3077 skipptcpcheck: 3078 3079 for (i = 0; i < pgcnt; i++) { 3080 pp = &spp[i]; 3081 if (!page_trylock(pp, SE_EXCL)) { 3082 VM_STAT_ADD(vmm_vmstats.ptcpfailexcl[szc]); 3083 while (--i != (pgcnt_t)-1) { 3084 pp = &spp[i]; 3085 ASSERT(PAGE_EXCL(pp)); 3086 page_unlock_nocapture(pp); 3087 } 3088 return (0); 3089 } 3090 ASSERT(spp[i].p_pagenum == spp->p_pagenum + i); 3091 if ((pp->p_szc > szc || (szc && pp->p_szc == szc)) && 3092 !PP_ISFREE(pp)) { 3093 VM_STAT_ADD(vmm_vmstats.ptcpfailszc[szc]); 3094 ASSERT(i == 0); 3095 page_unlock_nocapture(pp); 3096 return (0); 3097 } 3098 if (PP_ISNORELOC(pp)) { 3099 VM_STAT_ADD(vmm_vmstats.ptcpfailcage[szc]); 3100 while (i != (pgcnt_t)-1) { 3101 pp = &spp[i]; 3102 ASSERT(PAGE_EXCL(pp)); 3103 page_unlock_nocapture(pp); 3104 i--; 3105 } 3106 return (0); 3107 } 3108 } 3109 VM_STAT_ADD(vmm_vmstats.ptcpok[szc]); 3110 return (1); 3111 } 3112 3113 /* 3114 * Claim large page pointed to by 'pp'. 'pp' is the starting set 3115 * of 'szc' constituent pages that had been locked exclusively previously. 3116 * Will attempt to relocate constituent pages in use. 
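 * Constituent pages already free are pulled straight off the free or cache
 * lists; constituent pages still in use are relocated via
 * page_get_replacement_page() and do_page_relocate() before being retagged
 * with the new szc.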
3117 */ 3118 static page_t * 3119 page_claim_contig_pages(page_t *pp, uchar_t szc, int flags) 3120 { 3121 spgcnt_t pgcnt, npgs, i; 3122 page_t *targpp, *rpp, *hpp; 3123 page_t *replpp = NULL; 3124 page_t *pplist = NULL; 3125 3126 ASSERT(pp != NULL); 3127 3128 pgcnt = page_get_pagecnt(szc); 3129 while (pgcnt) { 3130 ASSERT(PAGE_EXCL(pp)); 3131 ASSERT(!PP_ISNORELOC(pp)); 3132 if (PP_ISFREE(pp)) { 3133 /* 3134 * If this is a PG_FREE_LIST page then its 3135 * size code can change underneath us due to 3136 * page promotion or demotion. As an optimzation 3137 * use page_list_sub_pages() instead of 3138 * page_list_sub(). 3139 */ 3140 if (PP_ISAGED(pp)) { 3141 page_list_sub_pages(pp, szc); 3142 if (pp->p_szc == szc) { 3143 return (pp); 3144 } 3145 ASSERT(pp->p_szc < szc); 3146 npgs = page_get_pagecnt(pp->p_szc); 3147 hpp = pp; 3148 for (i = 0; i < npgs; i++, pp++) { 3149 pp->p_szc = szc; 3150 } 3151 page_list_concat(&pplist, &hpp); 3152 pgcnt -= npgs; 3153 continue; 3154 } 3155 ASSERT(!PP_ISAGED(pp)); 3156 ASSERT(pp->p_szc == 0); 3157 page_list_sub(pp, PG_CACHE_LIST); 3158 page_hashout(pp, NULL); 3159 PP_SETAGED(pp); 3160 pp->p_szc = szc; 3161 page_list_concat(&pplist, &pp); 3162 pp++; 3163 pgcnt--; 3164 continue; 3165 } 3166 npgs = page_get_pagecnt(pp->p_szc); 3167 3168 /* 3169 * page_create_wait freemem accounting done by caller of 3170 * page_get_freelist and not necessary to call it prior to 3171 * calling page_get_replacement_page. 3172 * 3173 * page_get_replacement_page can call page_get_contig_pages 3174 * to acquire a large page (szc > 0); the replacement must be 3175 * smaller than the contig page size to avoid looping or 3176 * szc == 0 and PGI_PGCPSZC0 is set. 3177 */ 3178 if (pp->p_szc < szc || (szc == 0 && (flags & PGI_PGCPSZC0))) { 3179 replpp = page_get_replacement_page(pp, NULL, 0); 3180 if (replpp) { 3181 npgs = page_get_pagecnt(pp->p_szc); 3182 ASSERT(npgs <= pgcnt); 3183 targpp = pp; 3184 } 3185 } 3186 3187 /* 3188 * If replacement is NULL or do_page_relocate fails, fail 3189 * coalescing of pages. 3190 */ 3191 if (replpp == NULL || (do_page_relocate(&targpp, &replpp, 0, 3192 &npgs, NULL) != 0)) { 3193 /* 3194 * Unlock un-processed target list 3195 */ 3196 while (pgcnt--) { 3197 ASSERT(PAGE_EXCL(pp)); 3198 page_unlock_nocapture(pp); 3199 pp++; 3200 } 3201 /* 3202 * Free the processed target list. 
3203 */ 3204 while (pplist) { 3205 pp = pplist; 3206 page_sub(&pplist, pp); 3207 ASSERT(PAGE_EXCL(pp)); 3208 ASSERT(pp->p_szc == szc); 3209 ASSERT(PP_ISFREE(pp)); 3210 ASSERT(PP_ISAGED(pp)); 3211 pp->p_szc = 0; 3212 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL); 3213 page_unlock_nocapture(pp); 3214 } 3215 3216 if (replpp != NULL) 3217 page_free_replacement_page(replpp); 3218 3219 return (NULL); 3220 } 3221 ASSERT(pp == targpp); 3222 3223 /* LINTED */ 3224 ASSERT(hpp = pp); /* That's right, it's an assignment */ 3225 3226 pp += npgs; 3227 pgcnt -= npgs; 3228 3229 while (npgs--) { 3230 ASSERT(PAGE_EXCL(targpp)); 3231 ASSERT(!PP_ISFREE(targpp)); 3232 ASSERT(!PP_ISNORELOC(targpp)); 3233 PP_SETFREE(targpp); 3234 ASSERT(PP_ISAGED(targpp)); 3235 ASSERT(targpp->p_szc < szc || (szc == 0 && 3236 (flags & PGI_PGCPSZC0))); 3237 targpp->p_szc = szc; 3238 targpp = targpp->p_next; 3239 3240 rpp = replpp; 3241 ASSERT(rpp != NULL); 3242 page_sub(&replpp, rpp); 3243 ASSERT(PAGE_EXCL(rpp)); 3244 ASSERT(!PP_ISFREE(rpp)); 3245 page_unlock_nocapture(rpp); 3246 } 3247 ASSERT(targpp == hpp); 3248 ASSERT(replpp == NULL); 3249 page_list_concat(&pplist, &targpp); 3250 } 3251 CHK_LPG(pplist, szc); 3252 return (pplist); 3253 } 3254 3255 /* 3256 * Trim kernel cage from pfnlo-pfnhi and store result in lo-hi. Return code 3257 * of 0 means nothing left after trim. 3258 */ 3259 int 3260 trimkcage(struct memseg *mseg, pfn_t *lo, pfn_t *hi, pfn_t pfnlo, pfn_t pfnhi) 3261 { 3262 pfn_t kcagepfn; 3263 int decr; 3264 int rc = 0; 3265 3266 if (PP_ISNORELOC(mseg->pages)) { 3267 if (PP_ISNORELOC(mseg->epages - 1) == 0) { 3268 3269 /* lower part of this mseg inside kernel cage */ 3270 decr = kcage_current_pfn(&kcagepfn); 3271 3272 /* kernel cage may have transitioned past mseg */ 3273 if (kcagepfn >= mseg->pages_base && 3274 kcagepfn < mseg->pages_end) { 3275 ASSERT(decr == 0); 3276 *lo = MAX(kcagepfn, pfnlo); 3277 *hi = MIN(pfnhi, (mseg->pages_end - 1)); 3278 rc = 1; 3279 } 3280 } 3281 /* else entire mseg in the cage */ 3282 } else { 3283 if (PP_ISNORELOC(mseg->epages - 1)) { 3284 3285 /* upper part of this mseg inside kernel cage */ 3286 decr = kcage_current_pfn(&kcagepfn); 3287 3288 /* kernel cage may have transitioned past mseg */ 3289 if (kcagepfn >= mseg->pages_base && 3290 kcagepfn < mseg->pages_end) { 3291 ASSERT(decr); 3292 *hi = MIN(kcagepfn, pfnhi); 3293 *lo = MAX(pfnlo, mseg->pages_base); 3294 rc = 1; 3295 } 3296 } else { 3297 /* entire mseg outside of kernel cage */ 3298 *lo = MAX(pfnlo, mseg->pages_base); 3299 *hi = MIN(pfnhi, (mseg->pages_end - 1)); 3300 rc = 1; 3301 } 3302 } 3303 return (rc); 3304 } 3305 3306 /* 3307 * called from page_get_contig_pages to search 'pfnlo' thru 'pfnhi' to claim a 3308 * page with size code 'szc'. Claiming such a page requires acquiring 3309 * exclusive locks on all constituent pages (page_trylock_contig_pages), 3310 * relocating pages in use and concatenating these constituent pages into a 3311 * large page. 3312 * 3313 * The page lists do not have such a large page and page_freelist_split has 3314 * already failed to demote larger pages and/or coalesce smaller free pages. 3315 * 3316 * 'flags' may specify PG_COLOR_MATCH which would limit the search of large 3317 * pages with the same color as 'bin'. 3318 * 3319 * 'pfnflag' specifies the subset of the pfn range to search. 
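 * For example, a pfnflag of 0 (or 1) leaves the whole [pfnlo, pfnhi] range to
 * be searched, while a larger value (derived from pgcpfailcnt[]) limits the
 * search to one slot of that range, as described at the pfnflag check below.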
3320 */ 3321 3322 static page_t * 3323 page_geti_contig_pages(int mnode, uint_t bin, uchar_t szc, int flags, 3324 pfn_t pfnlo, pfn_t pfnhi, pgcnt_t pfnflag) 3325 { 3326 struct memseg *mseg; 3327 pgcnt_t szcpgcnt = page_get_pagecnt(szc); 3328 pgcnt_t szcpgmask = szcpgcnt - 1; 3329 pfn_t randpfn; 3330 page_t *pp, *randpp, *endpp; 3331 uint_t colors, ceq_mask; 3332 /* LINTED : set but not used in function */ 3333 uint_t color_mask; 3334 pfn_t hi, lo; 3335 uint_t skip; 3336 MEM_NODE_ITERATOR_DECL(it); 3337 3338 ASSERT(szc != 0 || (flags & PGI_PGCPSZC0)); 3339 3340 pfnlo = P2ROUNDUP(pfnlo, szcpgcnt); 3341 3342 if ((pfnhi - pfnlo) + 1 < szcpgcnt || pfnlo >= pfnhi) 3343 return (NULL); 3344 3345 ASSERT(szc < mmu_page_sizes); 3346 3347 colors = PAGE_GET_PAGECOLORS(szc); 3348 color_mask = colors - 1; 3349 if ((colors > 1) && (flags & PG_MATCH_COLOR)) { 3350 uchar_t ceq = colorequivszc[szc]; 3351 uint_t ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf)); 3352 3353 ASSERT(ceq_dif > 0); 3354 ceq_mask = (ceq_dif - 1) << (ceq & 0xf); 3355 } else { 3356 ceq_mask = 0; 3357 } 3358 3359 ASSERT(bin < colors); 3360 3361 /* clear "non-significant" color bits */ 3362 bin &= ceq_mask; 3363 3364 /* 3365 * trim the pfn range to search based on pfnflag. pfnflag is set 3366 * when there have been previous page_get_contig_page failures to 3367 * limit the search. 3368 * 3369 * The high bit in pfnflag specifies the number of 'slots' in the 3370 * pfn range and the remainder of pfnflag specifies which slot. 3371 * For example, a value of 1010b would mean the second slot of 3372 * the pfn range that has been divided into 8 slots. 3373 */ 3374 if (pfnflag > 1) { 3375 int slots = 1 << (highbit(pfnflag) - 1); 3376 int slotid = pfnflag & (slots - 1); 3377 pgcnt_t szcpages; 3378 int slotlen; 3379 3380 pfnhi = P2ALIGN((pfnhi + 1), szcpgcnt) - 1; 3381 szcpages = ((pfnhi - pfnlo) + 1) / szcpgcnt; 3382 slotlen = howmany(szcpages, slots); 3383 /* skip if 'slotid' slot is empty */ 3384 if (slotid * slotlen >= szcpages) 3385 return (NULL); 3386 pfnlo = pfnlo + (((slotid * slotlen) % szcpages) * szcpgcnt); 3387 ASSERT(pfnlo < pfnhi); 3388 if (pfnhi > pfnlo + (slotlen * szcpgcnt)) 3389 pfnhi = pfnlo + (slotlen * szcpgcnt) - 1; 3390 } 3391 3392 memsegs_lock(0); 3393 3394 /* 3395 * loop through memsegs to look for contig page candidates 3396 */ 3397 3398 for (mseg = memsegs; mseg != NULL; mseg = mseg->next) { 3399 if (pfnhi < mseg->pages_base || pfnlo >= mseg->pages_end) { 3400 /* no overlap */ 3401 continue; 3402 } 3403 3404 if (mseg->pages_end - mseg->pages_base < szcpgcnt) 3405 /* mseg too small */ 3406 continue; 3407 3408 /* 3409 * trim off kernel cage pages from pfn range and check for 3410 * a trimmed pfn range returned that does not span the 3411 * desired large page size. 3412 */ 3413 if (kcage_on) { 3414 if (trimkcage(mseg, &lo, &hi, pfnlo, pfnhi) == 0 || 3415 ((hi - lo) + 1) < szcpgcnt) 3416 continue; 3417 } else { 3418 lo = MAX(pfnlo, mseg->pages_base); 3419 hi = MIN(pfnhi, (mseg->pages_end - 1)); 3420 } 3421 3422 /* round to szcpgcnt boundaries */ 3423 lo = P2ROUNDUP(lo, szcpgcnt); 3424 3425 MEM_NODE_ITERATOR_INIT(lo, mnode, &it); 3426 hi = P2ALIGN((hi + 1), szcpgcnt) - 1; 3427 3428 if (hi <= lo) 3429 continue; 3430 3431 /* 3432 * set lo to point to the pfn for the desired bin. 
Large 3433 * page sizes may only have a single page color 3434 */ 3435 skip = szcpgcnt; 3436 if (ceq_mask > 0 || interleaved_mnodes) { 3437 /* set lo to point at appropriate color */ 3438 if (((PFN_2_COLOR(lo, szc, &it) ^ bin) & ceq_mask) || 3439 (interleaved_mnodes && 3440 PFN_2_MEM_NODE(lo) != mnode)) { 3441 PAGE_NEXT_PFN_FOR_COLOR(lo, szc, bin, ceq_mask, 3442 color_mask, &it); 3443 } 3444 if (hi <= lo) 3445 /* mseg cannot satisfy color request */ 3446 continue; 3447 } 3448 3449 /* randomly choose a point between lo and hi to begin search */ 3450 3451 randpfn = (pfn_t)GETTICK(); 3452 randpfn = ((randpfn % (hi - lo)) + lo) & ~(skip - 1); 3453 MEM_NODE_ITERATOR_INIT(randpfn, mnode, &it); 3454 if (ceq_mask || interleaved_mnodes) { 3455 if (randpfn != (pfn_t)-1) 3456 PAGE_NEXT_PFN_FOR_COLOR(randpfn, szc, bin, 3457 ceq_mask, color_mask, &it); 3458 if (randpfn >= hi) { 3459 randpfn = lo; 3460 MEM_NODE_ITERATOR_INIT(randpfn, mnode, &it); 3461 } 3462 } 3463 randpp = mseg->pages + (randpfn - mseg->pages_base); 3464 3465 ASSERT(randpp->p_pagenum == randpfn); 3466 3467 pp = randpp; 3468 endpp = mseg->pages + (hi - mseg->pages_base) + 1; 3469 3470 ASSERT(randpp + szcpgcnt <= endpp); 3471 3472 do { 3473 ASSERT(!(pp->p_pagenum & szcpgmask)); 3474 ASSERT(((PP_2_BIN(pp) ^ bin) & ceq_mask) == 0); 3475 3476 if (page_trylock_contig_pages(mnode, pp, szc, flags)) { 3477 /* pages unlocked by page_claim on failure */ 3478 if (page_claim_contig_pages(pp, szc, flags)) { 3479 memsegs_unlock(0); 3480 return (pp); 3481 } 3482 } 3483 3484 if (ceq_mask == 0 && !interleaved_mnodes) { 3485 pp += skip; 3486 } else { 3487 pfn_t pfn = pp->p_pagenum; 3488 3489 PAGE_NEXT_PFN_FOR_COLOR(pfn, szc, bin, 3490 ceq_mask, color_mask, &it); 3491 if (pfn == (pfn_t)-1) { 3492 pp = endpp; 3493 } else { 3494 pp = mseg->pages + 3495 (pfn - mseg->pages_base); 3496 } 3497 } 3498 if (pp >= endpp) { 3499 /* start from the beginning */ 3500 MEM_NODE_ITERATOR_INIT(lo, mnode, &it); 3501 pp = mseg->pages + (lo - mseg->pages_base); 3502 ASSERT(pp->p_pagenum == lo); 3503 ASSERT(pp + szcpgcnt <= endpp); 3504 } 3505 } while (pp != randpp); 3506 } 3507 memsegs_unlock(0); 3508 return (NULL); 3509 } 3510 3511 3512 /* 3513 * controlling routine that searches through physical memory in an attempt to 3514 * claim a large page based on the input parameters. 3515 * on the page free lists. 3516 * 3517 * calls page_geti_contig_pages with an initial pfn range from the mnode 3518 * and mtype. page_geti_contig_pages will trim off the parts of the pfn range 3519 * that overlaps with the kernel cage or does not match the requested page 3520 * color if PG_MATCH_COLOR is set. Since this search is very expensive, 3521 * page_geti_contig_pages may further limit the search range based on 3522 * previous failure counts (pgcpfailcnt[]). 3523 * 3524 * for PGI_PGCPSZC0 requests, page_get_contig_pages will relocate a base 3525 * pagesize page that satisfies mtype. 
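 * For example, once pgcpfailcnt[szc] has grown to 8, the pfn range is carved
 * into 8 slots and only one of them is searched per call; each successful
 * allocation halves the count (8, 4, 2, ...), widening later searches back
 * toward the full range.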
3526 */ 3527 page_t * 3528 page_get_contig_pages(int mnode, uint_t bin, int mtype, uchar_t szc, 3529 uint_t flags) 3530 { 3531 pfn_t pfnlo, pfnhi; /* contig pages pfn range */ 3532 page_t *pp; 3533 pgcnt_t pfnflag = 0; /* no limit on search if 0 */ 3534 3535 VM_STAT_ADD(vmm_vmstats.pgcp_alloc[szc]); 3536 3537 /* no allocations from cage */ 3538 flags |= PGI_NOCAGE; 3539 3540 /* LINTED */ 3541 MTYPE_START(mnode, mtype, flags); 3542 if (mtype < 0) { /* mnode does not have memory in mtype range */ 3543 VM_STAT_ADD(vmm_vmstats.pgcp_allocempty[szc]); 3544 return (NULL); 3545 } 3546 3547 ASSERT(szc > 0 || (flags & PGI_PGCPSZC0)); 3548 3549 /* do not limit search and ignore color if hi pri */ 3550 3551 if (pgcplimitsearch && ((flags & PGI_PGCPHIPRI) == 0)) 3552 pfnflag = pgcpfailcnt[szc]; 3553 3554 /* remove color match to improve chances */ 3555 3556 if (flags & PGI_PGCPHIPRI || pfnflag) 3557 flags &= ~PG_MATCH_COLOR; 3558 3559 do { 3560 /* get pfn range based on mnode and mtype */ 3561 MNODETYPE_2_PFN(mnode, mtype, pfnlo, pfnhi); 3562 3563 ASSERT(pfnhi >= pfnlo); 3564 3565 pp = page_geti_contig_pages(mnode, bin, szc, flags, 3566 pfnlo, pfnhi, pfnflag); 3567 3568 if (pp != NULL) { 3569 pfnflag = pgcpfailcnt[szc]; 3570 if (pfnflag) { 3571 /* double the search size */ 3572 pgcpfailcnt[szc] = pfnflag >> 1; 3573 } 3574 VM_STAT_ADD(vmm_vmstats.pgcp_allocok[szc]); 3575 return (pp); 3576 } 3577 MTYPE_NEXT(mnode, mtype, flags); 3578 } while (mtype >= 0); 3579 3580 VM_STAT_ADD(vmm_vmstats.pgcp_allocfailed[szc]); 3581 return (NULL); 3582 } 3583 3584 #if defined(__i386) || defined(__amd64) 3585 /* 3586 * Determine the likelihood of finding/coalescing a szc page. 3587 * Return 0 if the likelihood is small otherwise return 1. 3588 * 3589 * For now, be conservative and check only 1g pages and return 0 3590 * if there had been previous coalescing failures and the szc pages 3591 * needed to satisfy request would exhaust most of freemem. 3592 */ 3593 int 3594 page_chk_freelist(uint_t szc) 3595 { 3596 pgcnt_t pgcnt; 3597 3598 if (szc <= 1) 3599 return (1); 3600 3601 pgcnt = page_get_pagecnt(szc); 3602 if (pgcpfailcnt[szc] && pgcnt + throttlefree >= freemem) { 3603 VM_STAT_ADD(vmm_vmstats.pcf_deny[szc]); 3604 return (0); 3605 } 3606 VM_STAT_ADD(vmm_vmstats.pcf_allow[szc]); 3607 return (1); 3608 } 3609 #endif 3610 3611 /* 3612 * Find the `best' page on the freelist for this (vp,off) (as,vaddr) pair. 3613 * 3614 * Does its own locking and accounting. 3615 * If PG_MATCH_COLOR is set, then NULL will be returned if there are no 3616 * pages of the proper color even if there are pages of a different color. 3617 * 3618 * Finds a page, removes it, THEN locks it. 3619 */ 3620 3621 /*ARGSUSED*/ 3622 page_t * 3623 page_get_freelist(struct vnode *vp, u_offset_t off, struct seg *seg, 3624 caddr_t vaddr, size_t size, uint_t flags, struct lgrp *lgrp) 3625 { 3626 struct as *as = seg->s_as; 3627 page_t *pp = NULL; 3628 ulong_t bin; 3629 uchar_t szc; 3630 int mnode; 3631 int mtype; 3632 page_t *(*page_get_func)(int, uint_t, int, uchar_t, uint_t); 3633 lgrp_mnode_cookie_t lgrp_cookie; 3634 3635 page_get_func = page_get_mnode_freelist; 3636 3637 /* 3638 * If we aren't passed a specific lgroup, or passed a freed lgrp 3639 * assume we wish to allocate near to the current thread's home. 
3640 */ 3641 if (!LGRP_EXISTS(lgrp)) 3642 lgrp = lgrp_home_lgrp(); 3643 3644 if (kcage_on) { 3645 if ((flags & (PG_NORELOC | PG_PANIC)) == PG_NORELOC && 3646 kcage_freemem < kcage_throttlefree + btop(size) && 3647 curthread != kcage_cageout_thread) { 3648 /* 3649 * Set a "reserve" of kcage_throttlefree pages for 3650 * PG_PANIC and cageout thread allocations. 3651 * 3652 * Everybody else has to serialize in 3653 * page_create_get_something() to get a cage page, so 3654 * that we don't deadlock cageout! 3655 */ 3656 return (NULL); 3657 } 3658 } else { 3659 flags &= ~PG_NORELOC; 3660 flags |= PGI_NOCAGE; 3661 } 3662 3663 /* LINTED */ 3664 MTYPE_INIT(mtype, vp, vaddr, flags, size); 3665 3666 /* 3667 * Convert size to page size code. 3668 */ 3669 if ((szc = page_szc(size)) == (uchar_t)-1) 3670 panic("page_get_freelist: illegal page size request"); 3671 ASSERT(szc < mmu_page_sizes); 3672 3673 VM_STAT_ADD(vmm_vmstats.pgf_alloc[szc]); 3674 3675 /* LINTED */ 3676 AS_2_BIN(as, seg, vp, vaddr, bin, szc); 3677 3678 ASSERT(bin < PAGE_GET_PAGECOLORS(szc)); 3679 3680 /* 3681 * Try to get a local page first, but try remote if we can't 3682 * get a page of the right color. 3683 */ 3684 pgretry: 3685 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL); 3686 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { 3687 pp = page_get_func(mnode, bin, mtype, szc, flags); 3688 if (pp != NULL) { 3689 VM_STAT_ADD(vmm_vmstats.pgf_allocok[szc]); 3690 DTRACE_PROBE4(page__get, 3691 lgrp_t *, lgrp, 3692 int, mnode, 3693 ulong_t, bin, 3694 uint_t, flags); 3695 return (pp); 3696 } 3697 } 3698 ASSERT(pp == NULL); 3699 3700 /* 3701 * for non-SZC0 PAGESIZE requests, check cachelist before checking 3702 * remote free lists. Caller expected to call page_get_cachelist which 3703 * will check local cache lists and remote free lists. 3704 */ 3705 if (szc == 0 && ((flags & PGI_PGCPSZC0) == 0)) { 3706 VM_STAT_ADD(vmm_vmstats.pgf_allocdeferred); 3707 return (NULL); 3708 } 3709 3710 ASSERT(szc > 0 || (flags & PGI_PGCPSZC0)); 3711 3712 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1); 3713 3714 if (!(flags & PG_LOCAL)) { 3715 /* 3716 * Try to get a non-local freelist page. 3717 */ 3718 LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie); 3719 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { 3720 pp = page_get_func(mnode, bin, mtype, szc, flags); 3721 if (pp != NULL) { 3722 DTRACE_PROBE4(page__get, 3723 lgrp_t *, lgrp, 3724 int, mnode, 3725 ulong_t, bin, 3726 uint_t, flags); 3727 VM_STAT_ADD(vmm_vmstats.pgf_allocokrem[szc]); 3728 return (pp); 3729 } 3730 } 3731 ASSERT(pp == NULL); 3732 } 3733 3734 /* 3735 * when the cage is off chances are page_get_contig_pages() will fail 3736 * to lock a large page chunk therefore when the cage is off it's not 3737 * called by default. this can be changed via /etc/system. 3738 * 3739 * page_get_contig_pages() also called to acquire a base pagesize page 3740 * for page_create_get_something(). 3741 */ 3742 if (!(flags & PG_NORELOC) && (pg_contig_disable == 0) && 3743 (kcage_on || pg_lpgcreate_nocage || szc == 0) && 3744 (page_get_func != page_get_contig_pages)) { 3745 3746 VM_STAT_ADD(vmm_vmstats.pgf_allocretry[szc]); 3747 page_get_func = page_get_contig_pages; 3748 goto pgretry; 3749 } 3750 3751 if (!(flags & PG_LOCAL) && pgcplimitsearch && 3752 page_get_func == page_get_contig_pages) 3753 SETPGCPFAILCNT(szc); 3754 3755 VM_STAT_ADD(vmm_vmstats.pgf_allocfailed[szc]); 3756 return (NULL); 3757 } 3758 3759 /* 3760 * Find the `best' page on the cachelist for this (vp,off) (as,vaddr) pair. 
3761 * 3762 * Does its own locking. 3763 * If PG_MATCH_COLOR is set, then NULL will be returned if there are no 3764 * pages of the proper color even if there are pages of a different color. 3765 * Otherwise, scan the bins for ones with pages. For each bin with pages, 3766 * try to lock one of them. If no page can be locked, try the 3767 * next bin. Return NULL if a page cannot be found and locked. 3768 * 3769 * Finds a page, tries to lock it, then removes it. 3770 */ 3771 3772 /*ARGSUSED*/ 3773 page_t * 3774 page_get_cachelist(struct vnode *vp, u_offset_t off, struct seg *seg, 3775 caddr_t vaddr, uint_t flags, struct lgrp *lgrp) 3776 { 3777 page_t *pp; 3778 struct as *as = seg->s_as; 3779 ulong_t bin; 3780 /*LINTED*/ 3781 int mnode; 3782 int mtype; 3783 lgrp_mnode_cookie_t lgrp_cookie; 3784 3785 /* 3786 * If we aren't passed a specific lgroup, or passed a freed lgrp 3787 * assume we wish to allocate near to the current thread's home. 3788 */ 3789 if (!LGRP_EXISTS(lgrp)) 3790 lgrp = lgrp_home_lgrp(); 3791 3792 if (!kcage_on) { 3793 flags &= ~PG_NORELOC; 3794 flags |= PGI_NOCAGE; 3795 } 3796 3797 if ((flags & (PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == PG_NORELOC && 3798 kcage_freemem <= kcage_throttlefree) { 3799 /* 3800 * Reserve kcage_throttlefree pages for critical kernel 3801 * threads. 3802 * 3803 * Everybody else has to go to page_create_get_something() 3804 * to get a cage page, so we don't deadlock cageout. 3805 */ 3806 return (NULL); 3807 } 3808 3809 /* LINTED */ 3810 AS_2_BIN(as, seg, vp, vaddr, bin, 0); 3811 3812 ASSERT(bin < PAGE_GET_PAGECOLORS(0)); 3813 3814 /* LINTED */ 3815 MTYPE_INIT(mtype, vp, vaddr, flags, MMU_PAGESIZE); 3816 3817 VM_STAT_ADD(vmm_vmstats.pgc_alloc); 3818 3819 /* 3820 * Try local cachelists first 3821 */ 3822 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL); 3823 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { 3824 pp = page_get_mnode_cachelist(bin, flags, mnode, mtype); 3825 if (pp != NULL) { 3826 VM_STAT_ADD(vmm_vmstats.pgc_allocok); 3827 DTRACE_PROBE4(page__get, 3828 lgrp_t *, lgrp, 3829 int, mnode, 3830 ulong_t, bin, 3831 uint_t, flags); 3832 return (pp); 3833 } 3834 } 3835 3836 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1); 3837 3838 /* 3839 * Try freelists/cachelists that are farther away. 3840 * This is our only chance to allocate remote pages for PAGESIZE 3841 * requests.
3842 */ 3843 LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie); 3844 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { 3845 pp = page_get_mnode_freelist(mnode, bin, mtype, 3846 0, flags); 3847 if (pp != NULL) { 3848 VM_STAT_ADD(vmm_vmstats.pgc_allocokdeferred); 3849 DTRACE_PROBE4(page__get, 3850 lgrp_t *, lgrp, 3851 int, mnode, 3852 ulong_t, bin, 3853 uint_t, flags); 3854 return (pp); 3855 } 3856 pp = page_get_mnode_cachelist(bin, flags, mnode, mtype); 3857 if (pp != NULL) { 3858 VM_STAT_ADD(vmm_vmstats.pgc_allocokrem); 3859 DTRACE_PROBE4(page__get, 3860 lgrp_t *, lgrp, 3861 int, mnode, 3862 ulong_t, bin, 3863 uint_t, flags); 3864 return (pp); 3865 } 3866 } 3867 3868 VM_STAT_ADD(vmm_vmstats.pgc_allocfailed); 3869 return (NULL); 3870 } 3871 3872 page_t * 3873 page_get_mnode_cachelist(uint_t bin, uint_t flags, int mnode, int mtype) 3874 { 3875 kmutex_t *pcm; 3876 page_t *pp, *first_pp; 3877 uint_t sbin; 3878 int plw_initialized; 3879 page_list_walker_t plw; 3880 3881 VM_STAT_ADD(vmm_vmstats.pgmc_alloc); 3882 3883 /* LINTED */ 3884 MTYPE_START(mnode, mtype, flags); 3885 if (mtype < 0) { /* mnode does not have memory in mtype range */ 3886 VM_STAT_ADD(vmm_vmstats.pgmc_allocempty); 3887 return (NULL); 3888 } 3889 3890 try_again: 3891 3892 plw_initialized = 0; 3893 plw.plw_ceq_dif = 1; 3894 3895 /* 3896 * Only hold one cachelist lock at a time, that way we 3897 * can start anywhere and not have to worry about lock 3898 * ordering. 3899 */ 3900 3901 for (plw.plw_count = 0; 3902 plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) { 3903 sbin = bin; 3904 do { 3905 3906 if (!PAGE_CACHELISTS(mnode, bin, mtype)) 3907 goto bin_empty_1; 3908 pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST); 3909 mutex_enter(pcm); 3910 pp = PAGE_CACHELISTS(mnode, bin, mtype); 3911 if (pp == NULL) 3912 goto bin_empty_0; 3913 3914 first_pp = pp; 3915 ASSERT(pp->p_vnode); 3916 ASSERT(PP_ISAGED(pp) == 0); 3917 ASSERT(pp->p_szc == 0); 3918 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode); 3919 while (!page_trylock(pp, SE_EXCL)) { 3920 pp = pp->p_next; 3921 ASSERT(pp->p_szc == 0); 3922 if (pp == first_pp) { 3923 /* 3924 * We have searched the complete list! 3925 * And all of them (might only be one) 3926 * are locked. This can happen since 3927 * these pages can also be found via 3928 * the hash list. When found via the 3929 * hash list, they are locked first, 3930 * then removed. We give up to let the 3931 * other thread run. 3932 */ 3933 pp = NULL; 3934 break; 3935 } 3936 ASSERT(pp->p_vnode); 3937 ASSERT(PP_ISFREE(pp)); 3938 ASSERT(PP_ISAGED(pp) == 0); 3939 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == 3940 mnode); 3941 } 3942 3943 if (pp) { 3944 page_t **ppp; 3945 /* 3946 * Found and locked a page. 3947 * Pull it off the list. 3948 */ 3949 ASSERT(mtype == PP_2_MTYPE(pp)); 3950 ppp = &PAGE_CACHELISTS(mnode, bin, mtype); 3951 page_sub(ppp, pp); 3952 /* 3953 * Subtract counters before releasing pcm mutex 3954 * to avoid a race with page_freelist_coalesce 3955 * and page_freelist_split. 3956 */ 3957 page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST); 3958 mutex_exit(pcm); 3959 ASSERT(pp->p_vnode); 3960 ASSERT(PP_ISAGED(pp) == 0); 3961 #if defined(__sparc) 3962 ASSERT(!kcage_on || 3963 (flags & PG_NORELOC) == 0 || 3964 PP_ISNORELOC(pp)); 3965 if (PP_ISNORELOC(pp)) { 3966 kcage_freemem_sub(1); 3967 } 3968 #endif 3969 VM_STAT_ADD(vmm_vmstats. 
pgmc_allocok); 3970 return (pp); 3971 } 3972 bin_empty_0: 3973 mutex_exit(pcm); 3974 bin_empty_1: 3975 if (plw_initialized == 0) { 3976 page_list_walk_init(0, flags, bin, 0, 1, &plw); 3977 plw_initialized = 1; 3978 } 3979 /* calculate the next bin with equivalent color */ 3980 bin = ADD_MASKED(bin, plw.plw_bin_step, 3981 plw.plw_ceq_mask[0], plw.plw_color_mask); 3982 } while (sbin != bin); 3983 3984 if (plw.plw_ceq_dif > 1) 3985 bin = page_list_walk_next_bin(0, bin, &plw); 3986 } 3987 3988 MTYPE_NEXT(mnode, mtype, flags); 3989 if (mtype >= 0) 3990 goto try_again; 3991 3992 VM_STAT_ADD(vmm_vmstats.pgmc_allocfailed); 3993 return (NULL); 3994 } 3995 3996 #ifdef DEBUG 3997 #define REPL_PAGE_STATS 3998 #endif /* DEBUG */ 3999 4000 #ifdef REPL_PAGE_STATS 4001 struct repl_page_stats { 4002 uint_t ngets; 4003 uint_t ngets_noreloc; 4004 uint_t npgr_noreloc; 4005 uint_t nnopage_first; 4006 uint_t nnopage; 4007 uint_t nhashout; 4008 uint_t nnofree; 4009 uint_t nnext_pp; 4010 } repl_page_stats; 4011 #define REPL_STAT_INCR(v) atomic_add_32(&repl_page_stats.v, 1) 4012 #else /* REPL_PAGE_STATS */ 4013 #define REPL_STAT_INCR(v) 4014 #endif /* REPL_PAGE_STATS */ 4015 4016 int pgrppgcp; 4017 4018 /* 4019 * The freemem accounting must be done by the caller. 4020 * First we try to get a replacement page of the same size as like_pp, 4021 * if that is not possible, then we just get a set of discontiguous 4022 * PAGESIZE pages. 4023 */ 4024 page_t * 4025 page_get_replacement_page(page_t *orig_like_pp, struct lgrp *lgrp_target, 4026 uint_t pgrflags) 4027 { 4028 page_t *like_pp; 4029 page_t *pp, *pplist; 4030 page_t *pl = NULL; 4031 ulong_t bin; 4032 int mnode, page_mnode; 4033 int szc; 4034 spgcnt_t npgs, pg_cnt; 4035 pfn_t pfnum; 4036 int mtype; 4037 int flags = 0; 4038 lgrp_mnode_cookie_t lgrp_cookie; 4039 lgrp_t *lgrp; 4040 4041 REPL_STAT_INCR(ngets); 4042 like_pp = orig_like_pp; 4043 ASSERT(PAGE_EXCL(like_pp)); 4044 4045 szc = like_pp->p_szc; 4046 npgs = page_get_pagecnt(szc); 4047 /* 4048 * Now we reset like_pp to the base page_t. 4049 * That way, we won't walk past the end of this 'szc' page. 4050 */ 4051 pfnum = PFN_BASE(like_pp->p_pagenum, szc); 4052 like_pp = page_numtopp_nolock(pfnum); 4053 ASSERT(like_pp->p_szc == szc); 4054 4055 if (PP_ISNORELOC(like_pp)) { 4056 ASSERT(kcage_on); 4057 REPL_STAT_INCR(ngets_noreloc); 4058 flags = PGI_RELOCONLY; 4059 } else if (pgrflags & PGR_NORELOC) { 4060 ASSERT(kcage_on); 4061 REPL_STAT_INCR(npgr_noreloc); 4062 flags = PG_NORELOC; 4063 } 4064 4065 /* 4066 * Kernel pages must always be replaced with the same size 4067 * pages, since we cannot properly handle demotion of kernel 4068 * pages. 4069 */ 4070 if (PP_ISKAS(like_pp)) 4071 pgrflags |= PGR_SAMESZC; 4072 4073 /* LINTED */ 4074 MTYPE_PGR_INIT(mtype, flags, like_pp, page_mnode, npgs); 4075 4076 while (npgs) { 4077 pplist = NULL; 4078 for (;;) { 4079 pg_cnt = page_get_pagecnt(szc); 4080 bin = PP_2_BIN(like_pp); 4081 ASSERT(like_pp->p_szc == orig_like_pp->p_szc); 4082 ASSERT(pg_cnt <= npgs); 4083 4084 /* 4085 * If an lgroup was specified, try to get the 4086 * page from that lgroup. 4087 * NOTE: Must be careful with code below because 4088 * lgroup may disappear and reappear since there 4089 * is no locking for lgroup here. 4090 */ 4091 if (LGRP_EXISTS(lgrp_target)) { 4092 /* 4093 * Keep local variable for lgroup separate 4094 * from lgroup argument since this code should 4095 * only be exercised when lgroup argument 4096 * exists.... 
4097 				 */
4098 				lgrp = lgrp_target;
4099 
4100 				/* Try the lgroup's freelists first */
4101 				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4102 				    LGRP_SRCH_LOCAL);
4103 				while ((pplist == NULL) &&
4104 				    (mnode = lgrp_memnode_choose(&lgrp_cookie))
4105 				    != -1) {
4106 					pplist =
4107 					    page_get_mnode_freelist(mnode, bin,
4108 					    mtype, szc, flags);
4109 				}
4110 
4111 				/*
4112 				 * Now try its cachelists if this is a
4113 				 * small page. Don't need to do it for
4114 				 * larger ones since page_freelist_coalesce()
4115 				 * already failed.
4116 				 */
4117 				if (pplist != NULL || szc != 0)
4118 					break;
4119 
4120 				/* Now try its cachelists */
4121 				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4122 				    LGRP_SRCH_LOCAL);
4123 
4124 				while ((pplist == NULL) &&
4125 				    (mnode = lgrp_memnode_choose(&lgrp_cookie))
4126 				    != -1) {
4127 					pplist =
4128 					    page_get_mnode_cachelist(bin, flags,
4129 					    mnode, mtype);
4130 				}
4131 				if (pplist != NULL) {
4132 					page_hashout(pplist, NULL);
4133 					PP_SETAGED(pplist);
4134 					REPL_STAT_INCR(nhashout);
4135 					break;
4136 				}
4137 				/* Done looking in this lgroup. Bail out. */
4138 				break;
4139 			}
4140 
4141 			/*
4142 			 * No lgroup was specified (or lgroup was removed by
4143 			 * DR), so just try to get the page as close to
4144 			 * like_pp's mnode as possible.
4145 			 * First try the local freelist...
4146 			 */
4147 			mnode = PP_2_MEM_NODE(like_pp);
4148 			pplist = page_get_mnode_freelist(mnode, bin,
4149 			    mtype, szc, flags);
4150 			if (pplist != NULL)
4151 				break;
4152 
4153 			REPL_STAT_INCR(nnofree);
4154 
4155 			/*
4156 			 * ...then the local cachelist. Don't need to do it for
4157 			 * larger pages because page_freelist_coalesce() already
4158 			 * failed there anyway.
4159 			 */
4160 			if (szc == 0) {
4161 				pplist = page_get_mnode_cachelist(bin, flags,
4162 				    mnode, mtype);
4163 				if (pplist != NULL) {
4164 					page_hashout(pplist, NULL);
4165 					PP_SETAGED(pplist);
4166 					REPL_STAT_INCR(nhashout);
4167 					break;
4168 				}
4169 			}
4170 
4171 			/* Now try remote freelists */
4172 			page_mnode = mnode;
4173 			lgrp =
4174 			    lgrp_hand_to_lgrp(MEM_NODE_2_LGRPHAND(page_mnode));
4175 			LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4176 			    LGRP_SRCH_HIER);
4177 			while (pplist == NULL &&
4178 			    (mnode = lgrp_memnode_choose(&lgrp_cookie))
4179 			    != -1) {
4180 				/*
4181 				 * Skip local mnode.
4182 				 */
4183 				if ((mnode == page_mnode) ||
4184 				    (mem_node_config[mnode].exists == 0))
4185 					continue;
4186 
4187 				pplist = page_get_mnode_freelist(mnode,
4188 				    bin, mtype, szc, flags);
4189 			}
4190 
4191 			if (pplist != NULL)
4192 				break;
4193 
4194 
4195 			/* Now try remote cachelists */
4196 			LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4197 			    LGRP_SRCH_HIER);
4198 			while (pplist == NULL && szc == 0) {
4199 				mnode = lgrp_memnode_choose(&lgrp_cookie);
4200 				if (mnode == -1)
4201 					break;
4202 				/*
4203 				 * Skip local mnode.
4204 				 */
4205 				if ((mnode == page_mnode) ||
4206 				    (mem_node_config[mnode].exists == 0))
4207 					continue;
4208 
4209 				pplist = page_get_mnode_cachelist(bin,
4210 				    flags, mnode, mtype);
4211 
4212 				if (pplist != NULL) {
4213 					page_hashout(pplist, NULL);
4214 					PP_SETAGED(pplist);
4215 					REPL_STAT_INCR(nhashout);
4216 					break;
4217 				}
4218 			}
4219 
4220 			/*
4221 			 * Break out of the while loop under the following cases:
4222 			 * - If we successfully got a page.
4223 			 * - If pgrflags specified only returning a specific
4224 			 *   page size and we could not find that page size.
4225 			 * - If we could not satisfy the request with PAGESIZE
4226 			 *   or larger pages.
4227 			 */
4228 			if (pplist != NULL || szc == 0)
4229 				break;
4230 
4231 			if ((pgrflags & PGR_SAMESZC) || pgrppgcp) {
4232 				/* try to find contig page */
4233 
4234 				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4235 				    LGRP_SRCH_HIER);
4236 
4237 				while ((pplist == NULL) &&
4238 				    (mnode =
4239 				    lgrp_memnode_choose(&lgrp_cookie))
4240 				    != -1) {
4241 					pplist = page_get_contig_pages(
4242 					    mnode, bin, mtype, szc,
4243 					    flags | PGI_PGCPHIPRI);
4244 				}
4245 				break;
4246 			}
4247 
4248 			/*
4249 			 * The correct thing to do here is try the next
4250 			 * page size down using szc--. Due to a bug
4251 			 * with the processing of HAT_RELOAD_SHARE
4252 			 * where the sfmmu_ttecnt arrays of all
4253 			 * hats sharing an ISM segment don't get updated,
4254 			 * using intermediate size pages for relocation
4255 			 * can lead to continuous page faults.
4256 			 */
4257 			szc = 0;
4258 		}
4259 
4260 		if (pplist != NULL) {
4261 			DTRACE_PROBE4(page__get,
4262 			    lgrp_t *, lgrp,
4263 			    int, mnode,
4264 			    ulong_t, bin,
4265 			    uint_t, flags);
4266 
4267 			while (pplist != NULL && pg_cnt--) {
4268 				ASSERT(pplist != NULL);
4269 				pp = pplist;
4270 				page_sub(&pplist, pp);
4271 				PP_CLRFREE(pp);
4272 				PP_CLRAGED(pp);
4273 				page_list_concat(&pl, &pp);
4274 				npgs--;
4275 				like_pp = like_pp + 1;
4276 				REPL_STAT_INCR(nnext_pp);
4277 			}
4278 			ASSERT(pg_cnt == 0);
4279 		} else {
4280 			break;
4281 		}
4282 	}
4283 
4284 	if (npgs) {
4285 		/*
4286 		 * We were unable to allocate the necessary number
4287 		 * of pages.
4288 		 * We need to free up any pages already placed on pl.
4289 		 */
4290 		REPL_STAT_INCR(nnopage);
4291 		page_free_replacement_page(pl);
4292 		return (NULL);
4293 	} else {
4294 		return (pl);
4295 	}
4296 }
4297 
4298 /*
4299  * Demote a free large page to its constituent pages.
4300  */
4301 void
4302 page_demote_free_pages(page_t *pp)
4303 {
4304 
4305 	int mnode;
4306 
4307 	ASSERT(pp != NULL);
4308 	ASSERT(PAGE_LOCKED(pp));
4309 	ASSERT(PP_ISFREE(pp));
4310 	ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);
4311 
4312 	mnode = PP_2_MEM_NODE(pp);
4313 	page_freelist_lock(mnode);
4314 	if (pp->p_szc != 0) {
4315 		(void) page_demote(mnode, PFN_BASE(pp->p_pagenum,
4316 		    pp->p_szc), pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
4317 	}
4318 	page_freelist_unlock(mnode);
4319 	ASSERT(pp->p_szc == 0);
4320 }
4321 
4322 /*
4323  * Factor in colorequiv to check additional 'equivalent' bins.
4324  * colorequiv may be set in /etc/system.
4325  */
4326 void
4327 page_set_colorequiv_arr(void)
4328 {
4329 	if (colorequiv > 1) {
4330 		int i;
4331 		uint_t sv_a = lowbit(colorequiv) - 1;
4332 
4333 		if (sv_a > 15)
4334 			sv_a = 15;
4335 
4336 		for (i = 0; i < MMU_PAGE_SIZES; i++) {
4337 			uint_t colors;
4338 			uint_t a = sv_a;
4339 
4340 			if ((colors = hw_page_array[i].hp_colors) <= 1) {
4341 				continue;
4342 			}
4343 			while ((colors >> a) == 0)
4344 				a--;
4345 			if ((a << 4) > colorequivszc[i]) {
4346 				colorequivszc[i] = (a << 4);
4347 			}
4348 		}
4349 	}
4350 }
4351 
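/*
 * Illustrative sketch (never compiled; COLOREQUIV_EXAMPLE is not defined
 * anywhere) of how the value packed by page_set_colorequiv_arr() above can
 * be unpacked.  The upper nibble of colorequivszc[szc] holds a shift 'a';
 * colors that differ only in their top 'a' bits are treated as equivalent.
 * The real consumers are the platform color/bin macros in vm_dep.h;
 * example_ceq_mask() below is a hypothetical helper that only demonstrates
 * the nibble decoding, assuming a power-of-two number of colors.
 */
#ifdef COLOREQUIV_EXAMPLE
static uint_t
example_ceq_mask(uchar_t szc)
{
	uint_t ncolors = hw_page_array[szc].hp_colors;
	uint_t a = colorequivszc[szc] >> 4;	/* high nibble: bits to drop */

	/* keep only the low-order color bits; equivalent bins share these */
	return ((ncolors - 1) >> a);
}
#endif	/* COLOREQUIV_EXAMPLE */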
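/*
 * Illustrative sketch (never compiled; REPLACEMENT_EXAMPLE is not defined
 * anywhere) of the calling pattern page_get_replacement_page() expects:
 * the caller already holds the target page SE_EXCL, requests replacement
 * pages (here forcing the same page size with PGR_SAMESZC), hands them to
 * some relocation step, and returns them with page_free_replacement_page()
 * if that step fails.  example_replace_page() and the 'move' callback are
 * hypothetical stand-ins for the real relocation code paths.
 */
#ifdef REPLACEMENT_EXAMPLE
static int
example_replace_page(page_t *targ, lgrp_t *lgrp,
    int (*move)(page_t *, page_t *))
{
	page_t *repl;

	ASSERT(PAGE_EXCL(targ));

	/* freemem accounting is the caller's job, as noted above */
	repl = page_get_replacement_page(targ, lgrp, PGR_SAMESZC);
	if (repl == NULL)
		return (-1);

	if (move(targ, repl) != 0) {
		/* relocation failed; give the unused replacement back */
		page_free_replacement_page(repl);
		return (-1);
	}
	return (0);
}
#endif	/* REPLACEMENT_EXAMPLE */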