1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. 23 */ 24 25 /* 26 * Copyright 2012 Joyent, Inc. All rights reserved. 27 */ 28 29 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 30 /* All Rights Reserved */ 31 32 /* 33 * Portions of this source code were derived from Berkeley 4.3 BSD 34 * under license from the Regents of the University of California. 35 */ 36 37 38 /* 39 * This file contains common functions to access and manage the page lists. 40 * Many of these routines originated from platform dependent modules 41 * (sun4/vm/vm_dep.c, i86pc/vm/vm_machdep.c) and modified to function in 42 * a platform independent manner. 43 * 44 * vm/vm_dep.h provides for platform specific support. 
45 */ 46 47 #include <sys/types.h> 48 #include <sys/debug.h> 49 #include <sys/cmn_err.h> 50 #include <sys/systm.h> 51 #include <sys/atomic.h> 52 #include <sys/sysmacros.h> 53 #include <vm/as.h> 54 #include <vm/page.h> 55 #include <vm/seg_kmem.h> 56 #include <vm/seg_vn.h> 57 #include <sys/vmsystm.h> 58 #include <sys/memnode.h> 59 #include <vm/vm_dep.h> 60 #include <sys/lgrp.h> 61 #include <sys/mem_config.h> 62 #include <sys/callb.h> 63 #include <sys/mem_cage.h> 64 #include <sys/sdt.h> 65 #include <sys/dumphdr.h> 66 #include <sys/swap.h> 67 68 extern uint_t vac_colors; 69 70 #define MAX_PRAGMA_ALIGN 128 71 72 /* vm_cpu_data0 for the boot cpu before kmem is initialized */ 73 74 #if L2CACHE_ALIGN_MAX <= MAX_PRAGMA_ALIGN 75 #pragma align L2CACHE_ALIGN_MAX(vm_cpu_data0) 76 #else 77 #pragma align MAX_PRAGMA_ALIGN(vm_cpu_data0) 78 #endif 79 char vm_cpu_data0[VM_CPU_DATA_PADSIZE]; 80 81 /* 82 * number of page colors equivalent to reqested color in page_get routines. 83 * If set, keeps large pages intact longer and keeps MPO allocation 84 * from the local mnode in favor of acquiring the 'correct' page color from 85 * a demoted large page or from a remote mnode. 86 */ 87 uint_t colorequiv; 88 89 /* 90 * color equivalency mask for each page size. 91 * Mask is computed based on cpu L2$ way sizes and colorequiv global. 92 * High 4 bits determine the number of high order bits of the color to ignore. 93 * Low 4 bits determines number of low order bits of color to ignore (it's only 94 * relevant for hashed index based page coloring). 95 */ 96 uchar_t colorequivszc[MMU_PAGE_SIZES]; 97 98 /* 99 * if set, specifies the percentage of large pages that are free from within 100 * a large page region before attempting to lock those pages for 101 * page_get_contig_pages processing. 102 * 103 * Should be turned on when kpr is available when page_trylock_contig_pages 104 * can be more selective. 
105 */ 106 107 int ptcpthreshold; 108 109 /* 110 * Limit page get contig page search based on failure cnts in pgcpfailcnt[]. 111 * Enabled by default via pgcplimitsearch. 112 * 113 * pgcpfailcnt[] is bounded by PGCPFAILMAX (>= 1/2 of installed 114 * memory). When reached, pgcpfailcnt[] is reset to 1/2 of this upper 115 * bound. This upper bound range guarantees: 116 * - all large page 'slots' will be searched over time 117 * - the minimum (1) large page candidates considered on each pgcp call 118 * - count doesn't wrap around to 0 119 */ 120 pgcnt_t pgcpfailcnt[MMU_PAGE_SIZES]; 121 int pgcplimitsearch = 1; 122 123 #define PGCPFAILMAX (1 << (highbit(physinstalled) - 1)) 124 #define SETPGCPFAILCNT(szc) \ 125 if (++pgcpfailcnt[szc] >= PGCPFAILMAX) \ 126 pgcpfailcnt[szc] = PGCPFAILMAX / 2; 127 128 #ifdef VM_STATS 129 struct vmm_vmstats_str vmm_vmstats; 130 131 #endif /* VM_STATS */ 132 133 #if defined(__sparc) 134 #define LPGCREATE 0 135 #else 136 /* enable page_get_contig_pages */ 137 #define LPGCREATE 1 138 #endif 139 140 int pg_contig_disable; 141 int pg_lpgcreate_nocage = LPGCREATE; 142 143 /* 144 * page_freelist_split pfn flag to signify no lo or hi pfn requirement. 145 */ 146 #define PFNNULL 0 147 148 /* Flags involved in promotion and demotion routines */ 149 #define PC_FREE 0x1 /* put page on freelist */ 150 #define PC_ALLOC 0x2 /* return page for allocation */ 151 152 /* 153 * Flag for page_demote to be used with PC_FREE to denote that we don't care 154 * what the color is as the color parameter to the function is ignored. 155 */ 156 #define PC_NO_COLOR (-1) 157 158 /* mtype value for page_promote to use when mtype does not matter */ 159 #define PC_MTYPE_ANY (-1) 160 161 /* 162 * page counters candidates info 163 * See page_ctrs_cands comment below for more details. 
164 * fields are as follows: 165 * pcc_pages_free: # pages which freelist coalesce can create 166 * pcc_color_free: pointer to page free counts per color 167 */ 168 typedef struct pcc_info { 169 pgcnt_t pcc_pages_free; 170 pgcnt_t *pcc_color_free; 171 uint_t pad[12]; 172 } pcc_info_t; 173 174 /* 175 * On big machines it can take a long time to check page_counters 176 * arrays. page_ctrs_cands is a summary array whose elements are a dynamically 177 * updated sum of all elements of the corresponding page_counters arrays. 178 * page_freelist_coalesce() searches page_counters only if an appropriate 179 * element of page_ctrs_cands array is greater than 0. 180 * 181 * page_ctrs_cands is indexed by mutex (i), region (r), mnode (m), mrange (g) 182 */ 183 pcc_info_t **page_ctrs_cands[NPC_MUTEX][MMU_PAGE_SIZES]; 184 185 /* 186 * Return in val the total number of free pages which can be created 187 * for the given mnode (m), mrange (g), and region size (r) 188 */ 189 #define PGCTRS_CANDS_GETVALUE(m, g, r, val) { \ 190 int i; \ 191 val = 0; \ 192 for (i = 0; i < NPC_MUTEX; i++) { \ 193 val += page_ctrs_cands[i][(r)][(m)][(g)].pcc_pages_free; \ 194 } \ 195 } 196 197 /* 198 * Return in val the total number of free pages which can be created 199 * for the given mnode (m), mrange (g), region size (r), and color (c) 200 */ 201 #define PGCTRS_CANDS_GETVALUECOLOR(m, g, r, c, val) { \ 202 int i; \ 203 val = 0; \ 204 ASSERT((c) < PAGE_GET_PAGECOLORS(r)); \ 205 for (i = 0; i < NPC_MUTEX; i++) { \ 206 val += \ 207 page_ctrs_cands[i][(r)][(m)][(g)].pcc_color_free[(c)]; \ 208 } \ 209 } 210 211 /* 212 * We can only allow a single thread to update a counter within the physical 213 * range of the largest supported page size. That is the finest granularity 214 * possible since the counter values are dependent on each other 215 * as you move accross region sizes. PP_CTR_LOCK_INDX is used to determine the 216 * ctr_mutex lock index for a particular physical range. 
217 */ 218 static kmutex_t *ctr_mutex[NPC_MUTEX]; 219 220 #define PP_CTR_LOCK_INDX(pp) \ 221 (((pp)->p_pagenum >> \ 222 (PAGE_BSZS_SHIFT(mmu_page_sizes - 1))) & (NPC_MUTEX - 1)) 223 224 #define INVALID_COLOR 0xffffffff 225 #define INVALID_MASK 0xffffffff 226 227 /* 228 * Local functions prototypes. 229 */ 230 231 void page_ctr_add(int, int, page_t *, int); 232 void page_ctr_add_internal(int, int, page_t *, int); 233 void page_ctr_sub(int, int, page_t *, int); 234 void page_ctr_sub_internal(int, int, page_t *, int); 235 void page_freelist_lock(int); 236 void page_freelist_unlock(int); 237 page_t *page_promote(int, pfn_t, uchar_t, int, int); 238 page_t *page_demote(int, pfn_t, pfn_t, uchar_t, uchar_t, int, int); 239 page_t *page_freelist_split(uchar_t, 240 uint_t, int, int, pfn_t, pfn_t, page_list_walker_t *); 241 page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int); 242 static int page_trylock_cons(page_t *pp, se_t se); 243 244 /* 245 * The page_counters array below is used to keep track of free contiguous 246 * physical memory. A hw_page_map_t will be allocated per mnode per szc. 247 * This contains an array of counters, the size of the array, a shift value 248 * used to convert a pagenum into a counter array index or vice versa, as 249 * well as a cache of the last successful index to be promoted to a larger 250 * page size. As an optimization, we keep track of the last successful index 251 * to be promoted per page color for the given size region, and this is 252 * allocated dynamically based upon the number of colors for a given 253 * region size. 254 * 255 * Conceptually, the page counters are represented as: 256 * 257 * page_counters[region_size][mnode] 258 * 259 * region_size: size code of a candidate larger page made up 260 * of contiguous free smaller pages. 261 * 262 * page_counters[region_size][mnode].hpm_counters[index]: 263 * represents how many (region_size - 1) pages either 264 * exist or can be created within the given index range. 
265 * 266 * Let's look at a sparc example: 267 * If we want to create a free 512k page, we look at region_size 2 268 * for the mnode we want. We calculate the index and look at a specific 269 * hpm_counters location. If we see 8 (FULL_REGION_CNT on sparc) at 270 * this location, it means that 8 64k pages either exist or can be created 271 * from 8K pages in order to make a single free 512k page at the given 272 * index. Note that when a region is full, it will contribute to the 273 * counts in the region above it. Thus we will not know what page 274 * size the free pages will be which can be promoted to this new free 275 * page unless we look at all regions below the current region. 276 */ 277 278 /* 279 * Note: hpmctr_t is defined in platform vm_dep.h 280 * hw_page_map_t contains all the information needed for the page_counters 281 * logic. The fields are as follows: 282 * 283 * hpm_counters: dynamically allocated array to hold counter data 284 * hpm_entries: entries in hpm_counters 285 * hpm_shift: shift for pnum/array index conv 286 * hpm_base: PFN mapped to counter index 0 287 * hpm_color_current: last index in counter array for this color at 288 * which we successfully created a large page 289 */ 290 typedef struct hw_page_map { 291 hpmctr_t *hpm_counters; 292 size_t hpm_entries; 293 int hpm_shift; 294 pfn_t hpm_base; 295 size_t *hpm_color_current[MAX_MNODE_MRANGES]; 296 #if defined(__sparc) 297 uint_t pad[4]; 298 #endif 299 } hw_page_map_t; 300 301 /* 302 * Element zero is not used, but is allocated for convenience. 303 */ 304 static hw_page_map_t *page_counters[MMU_PAGE_SIZES]; 305 306 /* 307 * Cached value of MNODE_RANGE_CNT(mnode). 308 * This is a function call in x86. 309 */ 310 static int mnode_nranges[MAX_MEM_NODES]; 311 static int mnode_maxmrange[MAX_MEM_NODES]; 312 313 /* 314 * The following macros are convenient ways to get access to the individual 315 * elements of the page_counters arrays. 
They can be used on both 316 * the left side and right side of equations. 317 */ 318 #define PAGE_COUNTERS(mnode, rg_szc, idx) \ 319 (page_counters[(rg_szc)][(mnode)].hpm_counters[(idx)]) 320 321 #define PAGE_COUNTERS_COUNTERS(mnode, rg_szc) \ 322 (page_counters[(rg_szc)][(mnode)].hpm_counters) 323 324 #define PAGE_COUNTERS_SHIFT(mnode, rg_szc) \ 325 (page_counters[(rg_szc)][(mnode)].hpm_shift) 326 327 #define PAGE_COUNTERS_ENTRIES(mnode, rg_szc) \ 328 (page_counters[(rg_szc)][(mnode)].hpm_entries) 329 330 #define PAGE_COUNTERS_BASE(mnode, rg_szc) \ 331 (page_counters[(rg_szc)][(mnode)].hpm_base) 332 333 #define PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, rg_szc, g) \ 334 (page_counters[(rg_szc)][(mnode)].hpm_color_current[(g)]) 335 336 #define PAGE_COUNTERS_CURRENT_COLOR(mnode, rg_szc, color, mrange) \ 337 (page_counters[(rg_szc)][(mnode)]. \ 338 hpm_color_current[(mrange)][(color)]) 339 340 #define PNUM_TO_IDX(mnode, rg_szc, pnum) \ 341 (((pnum) - PAGE_COUNTERS_BASE((mnode), (rg_szc))) >> \ 342 PAGE_COUNTERS_SHIFT((mnode), (rg_szc))) 343 344 #define IDX_TO_PNUM(mnode, rg_szc, index) \ 345 (PAGE_COUNTERS_BASE((mnode), (rg_szc)) + \ 346 ((index) << PAGE_COUNTERS_SHIFT((mnode), (rg_szc)))) 347 348 /* 349 * Protects the hpm_counters and hpm_color_current memory from changing while 350 * looking at page counters information. 351 * Grab the write lock to modify what these fields point at. 352 * Grab the read lock to prevent any pointers from changing. 353 * The write lock can not be held during memory allocation due to a possible 354 * recursion deadlock with trying to grab the read lock while the 355 * write lock is already held. 356 */ 357 krwlock_t page_ctrs_rwlock[MAX_MEM_NODES]; 358 359 360 /* 361 * initialize cpu_vm_data to point at cache aligned vm_cpu_data_t. 
362 */ 363 void 364 cpu_vm_data_init(struct cpu *cp) 365 { 366 if (cp == CPU0) { 367 cp->cpu_vm_data = (void *)&vm_cpu_data0; 368 } else { 369 void *kmptr; 370 int align; 371 size_t sz; 372 373 align = (L2CACHE_ALIGN) ? L2CACHE_ALIGN : L2CACHE_ALIGN_MAX; 374 sz = P2ROUNDUP(sizeof (vm_cpu_data_t), align) + align; 375 kmptr = kmem_zalloc(sz, KM_SLEEP); 376 cp->cpu_vm_data = (void *) P2ROUNDUP((uintptr_t)kmptr, align); 377 ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr = kmptr; 378 ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize = sz; 379 } 380 } 381 382 /* 383 * free cpu_vm_data 384 */ 385 void 386 cpu_vm_data_destroy(struct cpu *cp) 387 { 388 if (cp->cpu_seqid && cp->cpu_vm_data) { 389 ASSERT(cp != CPU0); 390 kmem_free(((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr, 391 ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize); 392 } 393 cp->cpu_vm_data = NULL; 394 } 395 396 397 /* 398 * page size to page size code 399 */ 400 int 401 page_szc(size_t pagesize) 402 { 403 int i = 0; 404 405 while (hw_page_array[i].hp_size) { 406 if (pagesize == hw_page_array[i].hp_size) 407 return (i); 408 i++; 409 } 410 return (-1); 411 } 412 413 /* 414 * page size to page size code with the restriction that it be a supported 415 * user page size. If it's not a supported user page size, -1 will be returned. 416 */ 417 int 418 page_szc_user_filtered(size_t pagesize) 419 { 420 int szc = page_szc(pagesize); 421 if ((szc != -1) && (SZC_2_USERSZC(szc) != -1)) { 422 return (szc); 423 } 424 return (-1); 425 } 426 427 /* 428 * Return how many page sizes are available for the user to use. This is 429 * what the hardware supports and not based upon how the OS implements the 430 * support of different page sizes. 431 * 432 * If legacy is non-zero, return the number of pagesizes available to legacy 433 * applications. The number of legacy page sizes might be less than the 434 * exported user page sizes. 
This is to prevent legacy applications that 435 * use the largest page size returned from getpagesizes(3c) from inadvertantly 436 * using the 'new' large pagesizes. 437 */ 438 uint_t 439 page_num_user_pagesizes(int legacy) 440 { 441 if (legacy) 442 return (mmu_legacy_page_sizes); 443 return (mmu_exported_page_sizes); 444 } 445 446 uint_t 447 page_num_pagesizes(void) 448 { 449 return (mmu_page_sizes); 450 } 451 452 /* 453 * returns the count of the number of base pagesize pages associated with szc 454 */ 455 pgcnt_t 456 page_get_pagecnt(uint_t szc) 457 { 458 if (szc >= mmu_page_sizes) 459 panic("page_get_pagecnt: out of range %d", szc); 460 return (hw_page_array[szc].hp_pgcnt); 461 } 462 463 size_t 464 page_get_pagesize(uint_t szc) 465 { 466 if (szc >= mmu_page_sizes) 467 panic("page_get_pagesize: out of range %d", szc); 468 return (hw_page_array[szc].hp_size); 469 } 470 471 /* 472 * Return the size of a page based upon the index passed in. An index of 473 * zero refers to the smallest page size in the system, and as index increases 474 * it refers to the next larger supported page size in the system. 475 * Note that szc and userszc may not be the same due to unsupported szc's on 476 * some systems. 
477 */ 478 size_t 479 page_get_user_pagesize(uint_t userszc) 480 { 481 uint_t szc = USERSZC_2_SZC(userszc); 482 483 if (szc >= mmu_page_sizes) 484 panic("page_get_user_pagesize: out of range %d", szc); 485 return (hw_page_array[szc].hp_size); 486 } 487 488 uint_t 489 page_get_shift(uint_t szc) 490 { 491 if (szc >= mmu_page_sizes) 492 panic("page_get_shift: out of range %d", szc); 493 return (PAGE_GET_SHIFT(szc)); 494 } 495 496 uint_t 497 page_get_pagecolors(uint_t szc) 498 { 499 if (szc >= mmu_page_sizes) 500 panic("page_get_pagecolors: out of range %d", szc); 501 return (PAGE_GET_PAGECOLORS(szc)); 502 } 503 504 /* 505 * this assigns the desired equivalent color after a split 506 */ 507 uint_t 508 page_correct_color(uchar_t szc, uchar_t nszc, uint_t color, 509 uint_t ncolor, uint_t ceq_mask) 510 { 511 ASSERT(nszc > szc); 512 ASSERT(szc < mmu_page_sizes); 513 ASSERT(color < PAGE_GET_PAGECOLORS(szc)); 514 ASSERT(ncolor < PAGE_GET_PAGECOLORS(nszc)); 515 516 color &= ceq_mask; 517 ncolor = PAGE_CONVERT_COLOR(ncolor, szc, nszc); 518 return (color | (ncolor & ~ceq_mask)); 519 } 520 521 /* 522 * The interleaved_mnodes flag is set when mnodes overlap in 523 * the physbase..physmax range, but have disjoint slices. 524 * In this case hpm_counters is shared by all mnodes. 525 * This flag is set dynamically by the platform. 526 */ 527 int interleaved_mnodes = 0; 528 529 /* 530 * Called by startup(). 531 * Size up the per page size free list counters based on physmax 532 * of each node and max_mem_nodes. 533 * 534 * If interleaved_mnodes is set we need to find the first mnode that 535 * exists. hpm_counters for the first mnode will then be shared by 536 * all other mnodes. If interleaved_mnodes is not set, just set 537 * first=mnode each time. That means there will be no sharing. 
538 */ 539 size_t 540 page_ctrs_sz(void) 541 { 542 int r; /* region size */ 543 int mnode; 544 int firstmn; /* first mnode that exists */ 545 int nranges; 546 pfn_t physbase; 547 pfn_t physmax; 548 uint_t ctrs_sz = 0; 549 int i; 550 pgcnt_t colors_per_szc[MMU_PAGE_SIZES]; 551 552 /* 553 * We need to determine how many page colors there are for each 554 * page size in order to allocate memory for any color specific 555 * arrays. 556 */ 557 for (i = 0; i < mmu_page_sizes; i++) { 558 colors_per_szc[i] = PAGE_GET_PAGECOLORS(i); 559 } 560 561 for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) { 562 563 pgcnt_t r_pgcnt; 564 pfn_t r_base; 565 pgcnt_t r_align; 566 567 if (mem_node_config[mnode].exists == 0) 568 continue; 569 570 HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn); 571 nranges = MNODE_RANGE_CNT(mnode); 572 mnode_nranges[mnode] = nranges; 573 mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode); 574 575 /* 576 * determine size needed for page counter arrays with 577 * base aligned to large page size. 578 */ 579 for (r = 1; r < mmu_page_sizes; r++) { 580 /* add in space for hpm_color_current */ 581 ctrs_sz += sizeof (size_t) * 582 colors_per_szc[r] * nranges; 583 584 if (firstmn != mnode) 585 continue; 586 587 /* add in space for hpm_counters */ 588 r_align = page_get_pagecnt(r); 589 r_base = physbase; 590 r_base &= ~(r_align - 1); 591 r_pgcnt = howmany(physmax - r_base + 1, r_align); 592 593 /* 594 * Round up to always allocate on pointer sized 595 * boundaries. 
596 */ 597 ctrs_sz += P2ROUNDUP((r_pgcnt * sizeof (hpmctr_t)), 598 sizeof (hpmctr_t *)); 599 } 600 } 601 602 for (r = 1; r < mmu_page_sizes; r++) { 603 ctrs_sz += (max_mem_nodes * sizeof (hw_page_map_t)); 604 } 605 606 /* add in space for page_ctrs_cands and pcc_color_free */ 607 ctrs_sz += sizeof (pcc_info_t *) * max_mem_nodes * 608 mmu_page_sizes * NPC_MUTEX; 609 610 for (mnode = 0; mnode < max_mem_nodes; mnode++) { 611 612 if (mem_node_config[mnode].exists == 0) 613 continue; 614 615 nranges = mnode_nranges[mnode]; 616 ctrs_sz += sizeof (pcc_info_t) * nranges * 617 mmu_page_sizes * NPC_MUTEX; 618 for (r = 1; r < mmu_page_sizes; r++) { 619 ctrs_sz += sizeof (pgcnt_t) * nranges * 620 colors_per_szc[r] * NPC_MUTEX; 621 } 622 } 623 624 /* ctr_mutex */ 625 ctrs_sz += (max_mem_nodes * NPC_MUTEX * sizeof (kmutex_t)); 626 627 /* size for page list counts */ 628 PLCNT_SZ(ctrs_sz); 629 630 /* 631 * add some slop for roundups. page_ctrs_alloc will roundup the start 632 * address of the counters to ecache_alignsize boundary for every 633 * memory node. 634 */ 635 return (ctrs_sz + max_mem_nodes * L2CACHE_ALIGN); 636 } 637 638 caddr_t 639 page_ctrs_alloc(caddr_t alloc_base) 640 { 641 int mnode; 642 int mrange, nranges; 643 int r; /* region size */ 644 int i; 645 int firstmn; /* first mnode that exists */ 646 pfn_t physbase; 647 pfn_t physmax; 648 pgcnt_t colors_per_szc[MMU_PAGE_SIZES]; 649 650 /* 651 * We need to determine how many page colors there are for each 652 * page size in order to allocate memory for any color specific 653 * arrays. 
654 */ 655 for (i = 0; i < mmu_page_sizes; i++) { 656 colors_per_szc[i] = PAGE_GET_PAGECOLORS(i); 657 } 658 659 for (r = 1; r < mmu_page_sizes; r++) { 660 page_counters[r] = (hw_page_map_t *)alloc_base; 661 alloc_base += (max_mem_nodes * sizeof (hw_page_map_t)); 662 } 663 664 /* page_ctrs_cands and pcc_color_free array */ 665 for (i = 0; i < NPC_MUTEX; i++) { 666 for (r = 1; r < mmu_page_sizes; r++) { 667 668 page_ctrs_cands[i][r] = (pcc_info_t **)alloc_base; 669 alloc_base += sizeof (pcc_info_t *) * max_mem_nodes; 670 671 for (mnode = 0; mnode < max_mem_nodes; mnode++) { 672 pcc_info_t *pi; 673 674 if (mem_node_config[mnode].exists == 0) 675 continue; 676 677 nranges = mnode_nranges[mnode]; 678 679 pi = (pcc_info_t *)alloc_base; 680 alloc_base += sizeof (pcc_info_t) * nranges; 681 page_ctrs_cands[i][r][mnode] = pi; 682 683 for (mrange = 0; mrange < nranges; mrange++) { 684 pi->pcc_color_free = 685 (pgcnt_t *)alloc_base; 686 alloc_base += sizeof (pgcnt_t) * 687 colors_per_szc[r]; 688 pi++; 689 } 690 } 691 } 692 } 693 694 /* ctr_mutex */ 695 for (i = 0; i < NPC_MUTEX; i++) { 696 ctr_mutex[i] = (kmutex_t *)alloc_base; 697 alloc_base += (max_mem_nodes * sizeof (kmutex_t)); 698 } 699 700 /* initialize page list counts */ 701 PLCNT_INIT(alloc_base); 702 703 for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) { 704 705 pgcnt_t r_pgcnt; 706 pfn_t r_base; 707 pgcnt_t r_align; 708 int r_shift; 709 int nranges = mnode_nranges[mnode]; 710 711 if (mem_node_config[mnode].exists == 0) 712 continue; 713 714 HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn); 715 716 for (r = 1; r < mmu_page_sizes; r++) { 717 /* 718 * the page_counters base has to be aligned to the 719 * page count of page size code r otherwise the counts 720 * will cross large page boundaries. 
721 */ 722 r_align = page_get_pagecnt(r); 723 r_base = physbase; 724 /* base needs to be aligned - lower to aligned value */ 725 r_base &= ~(r_align - 1); 726 r_pgcnt = howmany(physmax - r_base + 1, r_align); 727 r_shift = PAGE_BSZS_SHIFT(r); 728 729 PAGE_COUNTERS_SHIFT(mnode, r) = r_shift; 730 PAGE_COUNTERS_ENTRIES(mnode, r) = r_pgcnt; 731 PAGE_COUNTERS_BASE(mnode, r) = r_base; 732 for (mrange = 0; mrange < nranges; mrange++) { 733 PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, 734 r, mrange) = (size_t *)alloc_base; 735 alloc_base += sizeof (size_t) * 736 colors_per_szc[r]; 737 } 738 for (i = 0; i < colors_per_szc[r]; i++) { 739 uint_t color_mask = colors_per_szc[r] - 1; 740 pfn_t pfnum = r_base; 741 size_t idx; 742 int mrange; 743 MEM_NODE_ITERATOR_DECL(it); 744 745 MEM_NODE_ITERATOR_INIT(pfnum, mnode, r, &it); 746 if (pfnum == (pfn_t)-1) { 747 idx = 0; 748 } else { 749 PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i, 750 color_mask, color_mask, &it); 751 idx = PNUM_TO_IDX(mnode, r, pfnum); 752 idx = (idx >= r_pgcnt) ? 0 : idx; 753 } 754 for (mrange = 0; mrange < nranges; mrange++) { 755 PAGE_COUNTERS_CURRENT_COLOR(mnode, 756 r, i, mrange) = idx; 757 } 758 } 759 760 /* hpm_counters may be shared by all mnodes */ 761 if (firstmn == mnode) { 762 PAGE_COUNTERS_COUNTERS(mnode, r) = 763 (hpmctr_t *)alloc_base; 764 alloc_base += 765 P2ROUNDUP((sizeof (hpmctr_t) * r_pgcnt), 766 sizeof (hpmctr_t *)); 767 } else { 768 PAGE_COUNTERS_COUNTERS(mnode, r) = 769 PAGE_COUNTERS_COUNTERS(firstmn, r); 770 } 771 772 /* 773 * Verify that PNUM_TO_IDX and IDX_TO_PNUM 774 * satisfy the identity requirement. 775 * We should be able to go from one to the other 776 * and get consistent values. 777 */ 778 ASSERT(PNUM_TO_IDX(mnode, r, 779 (IDX_TO_PNUM(mnode, r, 0))) == 0); 780 ASSERT(IDX_TO_PNUM(mnode, r, 781 (PNUM_TO_IDX(mnode, r, r_base))) == r_base); 782 } 783 /* 784 * Roundup the start address of the page_counters to 785 * cache aligned boundary for every memory node. 
786 * page_ctrs_sz() has added some slop for these roundups. 787 */ 788 alloc_base = (caddr_t)P2ROUNDUP((uintptr_t)alloc_base, 789 L2CACHE_ALIGN); 790 } 791 792 /* Initialize other page counter specific data structures. */ 793 for (mnode = 0; mnode < MAX_MEM_NODES; mnode++) { 794 rw_init(&page_ctrs_rwlock[mnode], NULL, RW_DEFAULT, NULL); 795 } 796 797 return (alloc_base); 798 } 799 800 /* 801 * Functions to adjust region counters for each size free list. 802 * Caller is responsible to acquire the ctr_mutex lock if necessary and 803 * thus can be called during startup without locks. 804 */ 805 /* ARGSUSED */ 806 void 807 page_ctr_add_internal(int mnode, int mtype, page_t *pp, int flags) 808 { 809 ssize_t r; /* region size */ 810 ssize_t idx; 811 pfn_t pfnum; 812 int lckidx; 813 814 ASSERT(mnode == PP_2_MEM_NODE(pp)); 815 ASSERT(mtype == PP_2_MTYPE(pp)); 816 817 ASSERT(pp->p_szc < mmu_page_sizes); 818 819 PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags); 820 821 /* no counter update needed for largest page size */ 822 if (pp->p_szc >= mmu_page_sizes - 1) { 823 return; 824 } 825 826 r = pp->p_szc + 1; 827 pfnum = pp->p_pagenum; 828 lckidx = PP_CTR_LOCK_INDX(pp); 829 830 /* 831 * Increment the count of free pages for the current 832 * region. Continue looping up in region size incrementing 833 * count if the preceeding region is full. 
834 */ 835 while (r < mmu_page_sizes) { 836 idx = PNUM_TO_IDX(mnode, r, pfnum); 837 838 ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r)); 839 ASSERT(PAGE_COUNTERS(mnode, r, idx) < FULL_REGION_CNT(r)); 840 841 if (++PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r)) { 842 break; 843 } else { 844 int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r)); 845 pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode] 846 [MTYPE_2_MRANGE(mnode, root_mtype)]; 847 848 cand->pcc_pages_free++; 849 cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]++; 850 } 851 r++; 852 } 853 } 854 855 void 856 page_ctr_add(int mnode, int mtype, page_t *pp, int flags) 857 { 858 int lckidx = PP_CTR_LOCK_INDX(pp); 859 kmutex_t *lock = &ctr_mutex[lckidx][mnode]; 860 861 mutex_enter(lock); 862 page_ctr_add_internal(mnode, mtype, pp, flags); 863 mutex_exit(lock); 864 } 865 866 void 867 page_ctr_sub_internal(int mnode, int mtype, page_t *pp, int flags) 868 { 869 int lckidx; 870 ssize_t r; /* region size */ 871 ssize_t idx; 872 pfn_t pfnum; 873 874 ASSERT(mnode == PP_2_MEM_NODE(pp)); 875 ASSERT(mtype == PP_2_MTYPE(pp)); 876 877 ASSERT(pp->p_szc < mmu_page_sizes); 878 879 PLCNT_DECR(pp, mnode, mtype, pp->p_szc, flags); 880 881 /* no counter update needed for largest page size */ 882 if (pp->p_szc >= mmu_page_sizes - 1) { 883 return; 884 } 885 886 r = pp->p_szc + 1; 887 pfnum = pp->p_pagenum; 888 lckidx = PP_CTR_LOCK_INDX(pp); 889 890 /* 891 * Decrement the count of free pages for the current 892 * region. Continue looping up in region size decrementing 893 * count if the preceeding region was full. 
894 */ 895 while (r < mmu_page_sizes) { 896 idx = PNUM_TO_IDX(mnode, r, pfnum); 897 898 ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r)); 899 ASSERT(PAGE_COUNTERS(mnode, r, idx) > 0); 900 901 if (--PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r) - 1) { 902 break; 903 } else { 904 int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r)); 905 pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode] 906 [MTYPE_2_MRANGE(mnode, root_mtype)]; 907 908 ASSERT(cand->pcc_pages_free != 0); 909 ASSERT(cand->pcc_color_free[PP_2_BIN_SZC(pp, r)] != 0); 910 911 cand->pcc_pages_free--; 912 cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]--; 913 } 914 r++; 915 } 916 } 917 918 void 919 page_ctr_sub(int mnode, int mtype, page_t *pp, int flags) 920 { 921 int lckidx = PP_CTR_LOCK_INDX(pp); 922 kmutex_t *lock = &ctr_mutex[lckidx][mnode]; 923 924 mutex_enter(lock); 925 page_ctr_sub_internal(mnode, mtype, pp, flags); 926 mutex_exit(lock); 927 } 928 929 /* 930 * Adjust page counters following a memory attach, since typically the 931 * size of the array needs to change, and the PFN to counter index 932 * mapping needs to change. 933 * 934 * It is possible this mnode did not exist at startup. In that case 935 * allocate pcc_info_t and pcc_color_free arrays. Also, allow for nranges 936 * to change (a theoretical possibility on x86), which means pcc_color_free 937 * arrays must be extended. 
938 */ 939 uint_t 940 page_ctrs_adjust(int mnode) 941 { 942 pgcnt_t npgs; 943 int r; /* region size */ 944 int i; 945 size_t pcsz, old_csz; 946 hpmctr_t *new_ctr, *old_ctr; 947 pfn_t oldbase, newbase; 948 pfn_t physbase, physmax; 949 size_t old_npgs; 950 hpmctr_t *ctr_cache[MMU_PAGE_SIZES]; 951 size_t size_cache[MMU_PAGE_SIZES]; 952 size_t *color_cache[MMU_PAGE_SIZES][MAX_MNODE_MRANGES]; 953 size_t *old_color_array[MAX_MNODE_MRANGES]; 954 pgcnt_t colors_per_szc[MMU_PAGE_SIZES]; 955 pcc_info_t **cands_cache; 956 pcc_info_t *old_pi, *pi; 957 pgcnt_t *pgcntp; 958 int nr, old_nranges, mrange, nranges = MNODE_RANGE_CNT(mnode); 959 int cands_cache_nranges; 960 int old_maxmrange, new_maxmrange; 961 int rc = 0; 962 int oldmnode; 963 964 cands_cache = kmem_zalloc(sizeof (pcc_info_t *) * NPC_MUTEX * 965 MMU_PAGE_SIZES, KM_NOSLEEP); 966 if (cands_cache == NULL) 967 return (ENOMEM); 968 969 i = -1; 970 HPM_COUNTERS_LIMITS(mnode, physbase, physmax, i); 971 972 newbase = physbase & ~PC_BASE_ALIGN_MASK; 973 npgs = roundup(physmax, PC_BASE_ALIGN) - newbase; 974 975 /* prepare to free non-null pointers on the way out */ 976 cands_cache_nranges = nranges; 977 bzero(ctr_cache, sizeof (ctr_cache)); 978 bzero(color_cache, sizeof (color_cache)); 979 980 /* 981 * We need to determine how many page colors there are for each 982 * page size in order to allocate memory for any color specific 983 * arrays. 984 */ 985 for (r = 0; r < mmu_page_sizes; r++) { 986 colors_per_szc[r] = PAGE_GET_PAGECOLORS(r); 987 } 988 989 /* 990 * Preallocate all of the new hpm_counters arrays as we can't 991 * hold the page_ctrs_rwlock as a writer and allocate memory. 992 * If we can't allocate all of the arrays, undo our work so far 993 * and return failure. 
994 */ 995 for (r = 1; r < mmu_page_sizes; r++) { 996 pcsz = npgs >> PAGE_BSZS_SHIFT(r); 997 size_cache[r] = pcsz; 998 ctr_cache[r] = kmem_zalloc(pcsz * 999 sizeof (hpmctr_t), KM_NOSLEEP); 1000 if (ctr_cache[r] == NULL) { 1001 rc = ENOMEM; 1002 goto cleanup; 1003 } 1004 } 1005 1006 /* 1007 * Preallocate all of the new color current arrays as we can't 1008 * hold the page_ctrs_rwlock as a writer and allocate memory. 1009 * If we can't allocate all of the arrays, undo our work so far 1010 * and return failure. 1011 */ 1012 for (r = 1; r < mmu_page_sizes; r++) { 1013 for (mrange = 0; mrange < nranges; mrange++) { 1014 color_cache[r][mrange] = kmem_zalloc(sizeof (size_t) * 1015 colors_per_szc[r], KM_NOSLEEP); 1016 if (color_cache[r][mrange] == NULL) { 1017 rc = ENOMEM; 1018 goto cleanup; 1019 } 1020 } 1021 } 1022 1023 /* 1024 * Preallocate all of the new pcc_info_t arrays as we can't 1025 * hold the page_ctrs_rwlock as a writer and allocate memory. 1026 * If we can't allocate all of the arrays, undo our work so far 1027 * and return failure. 1028 */ 1029 for (r = 1; r < mmu_page_sizes; r++) { 1030 for (i = 0; i < NPC_MUTEX; i++) { 1031 pi = kmem_zalloc(nranges * sizeof (pcc_info_t), 1032 KM_NOSLEEP); 1033 if (pi == NULL) { 1034 rc = ENOMEM; 1035 goto cleanup; 1036 } 1037 cands_cache[i * MMU_PAGE_SIZES + r] = pi; 1038 1039 for (mrange = 0; mrange < nranges; mrange++, pi++) { 1040 pgcntp = kmem_zalloc(colors_per_szc[r] * 1041 sizeof (pgcnt_t), KM_NOSLEEP); 1042 if (pgcntp == NULL) { 1043 rc = ENOMEM; 1044 goto cleanup; 1045 } 1046 pi->pcc_color_free = pgcntp; 1047 } 1048 } 1049 } 1050 1051 /* 1052 * Grab the write lock to prevent others from walking these arrays 1053 * while we are modifying them. 1054 */ 1055 PAGE_CTRS_WRITE_LOCK(mnode); 1056 1057 /* 1058 * For interleaved mnodes, find the first mnode 1059 * with valid page counters since the current 1060 * mnode may have just been added and not have 1061 * valid page counters. 
1062 */ 1063 if (interleaved_mnodes) { 1064 for (i = 0; i < max_mem_nodes; i++) 1065 if (PAGE_COUNTERS_COUNTERS(i, 1) != NULL) 1066 break; 1067 ASSERT(i < max_mem_nodes); 1068 oldmnode = i; 1069 } else 1070 oldmnode = mnode; 1071 1072 old_nranges = mnode_nranges[mnode]; 1073 cands_cache_nranges = old_nranges; 1074 mnode_nranges[mnode] = nranges; 1075 old_maxmrange = mnode_maxmrange[mnode]; 1076 mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode); 1077 new_maxmrange = mnode_maxmrange[mnode]; 1078 1079 for (r = 1; r < mmu_page_sizes; r++) { 1080 PAGE_COUNTERS_SHIFT(mnode, r) = PAGE_BSZS_SHIFT(r); 1081 old_ctr = PAGE_COUNTERS_COUNTERS(oldmnode, r); 1082 old_csz = PAGE_COUNTERS_ENTRIES(oldmnode, r); 1083 oldbase = PAGE_COUNTERS_BASE(oldmnode, r); 1084 old_npgs = old_csz << PAGE_COUNTERS_SHIFT(oldmnode, r); 1085 for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) { 1086 old_color_array[mrange] = 1087 PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, 1088 r, mrange); 1089 } 1090 1091 pcsz = npgs >> PAGE_COUNTERS_SHIFT(mnode, r); 1092 new_ctr = ctr_cache[r]; 1093 ctr_cache[r] = NULL; 1094 if (old_ctr != NULL && 1095 (oldbase + old_npgs > newbase) && 1096 (newbase + npgs > oldbase)) { 1097 /* 1098 * Map the intersection of the old and new 1099 * counters into the new array. 
1100 */ 1101 size_t offset; 1102 if (newbase > oldbase) { 1103 offset = (newbase - oldbase) >> 1104 PAGE_COUNTERS_SHIFT(mnode, r); 1105 bcopy(old_ctr + offset, new_ctr, 1106 MIN(pcsz, (old_csz - offset)) * 1107 sizeof (hpmctr_t)); 1108 } else { 1109 offset = (oldbase - newbase) >> 1110 PAGE_COUNTERS_SHIFT(mnode, r); 1111 bcopy(old_ctr, new_ctr + offset, 1112 MIN(pcsz - offset, old_csz) * 1113 sizeof (hpmctr_t)); 1114 } 1115 } 1116 1117 PAGE_COUNTERS_COUNTERS(mnode, r) = new_ctr; 1118 PAGE_COUNTERS_ENTRIES(mnode, r) = pcsz; 1119 PAGE_COUNTERS_BASE(mnode, r) = newbase; 1120 1121 /* update shared hpm_counters in other mnodes */ 1122 if (interleaved_mnodes) { 1123 for (i = 0; i < max_mem_nodes; i++) { 1124 if ((i == mnode) || 1125 (mem_node_config[i].exists == 0)) 1126 continue; 1127 ASSERT( 1128 PAGE_COUNTERS_COUNTERS(i, r) == old_ctr || 1129 PAGE_COUNTERS_COUNTERS(i, r) == NULL); 1130 PAGE_COUNTERS_COUNTERS(i, r) = new_ctr; 1131 PAGE_COUNTERS_ENTRIES(i, r) = pcsz; 1132 PAGE_COUNTERS_BASE(i, r) = newbase; 1133 } 1134 } 1135 1136 for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) { 1137 PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r, mrange) = 1138 color_cache[r][mrange]; 1139 color_cache[r][mrange] = NULL; 1140 } 1141 /* 1142 * for now, just reset on these events as it's probably 1143 * not worthwhile to try and optimize this. 1144 */ 1145 for (i = 0; i < colors_per_szc[r]; i++) { 1146 uint_t color_mask = colors_per_szc[r] - 1; 1147 int mlo = interleaved_mnodes ? 0 : mnode; 1148 int mhi = interleaved_mnodes ? 
max_mem_nodes : 1149 (mnode + 1); 1150 int m; 1151 pfn_t pfnum; 1152 size_t idx; 1153 MEM_NODE_ITERATOR_DECL(it); 1154 1155 for (m = mlo; m < mhi; m++) { 1156 if (mem_node_config[m].exists == 0) 1157 continue; 1158 pfnum = newbase; 1159 MEM_NODE_ITERATOR_INIT(pfnum, m, r, &it); 1160 if (pfnum == (pfn_t)-1) { 1161 idx = 0; 1162 } else { 1163 PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i, 1164 color_mask, color_mask, &it); 1165 idx = PNUM_TO_IDX(m, r, pfnum); 1166 idx = (idx < pcsz) ? idx : 0; 1167 } 1168 for (mrange = 0; mrange < nranges; mrange++) { 1169 if (PAGE_COUNTERS_CURRENT_COLOR_ARRAY(m, 1170 r, mrange) != NULL) 1171 PAGE_COUNTERS_CURRENT_COLOR(m, 1172 r, i, mrange) = idx; 1173 } 1174 } 1175 } 1176 1177 /* cache info for freeing out of the critical path */ 1178 if ((caddr_t)old_ctr >= kernelheap && 1179 (caddr_t)old_ctr < ekernelheap) { 1180 ctr_cache[r] = old_ctr; 1181 size_cache[r] = old_csz; 1182 } 1183 for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) { 1184 size_t *tmp = old_color_array[mrange]; 1185 if ((caddr_t)tmp >= kernelheap && 1186 (caddr_t)tmp < ekernelheap) { 1187 color_cache[r][mrange] = tmp; 1188 } 1189 } 1190 /* 1191 * Verify that PNUM_TO_IDX and IDX_TO_PNUM 1192 * satisfy the identity requirement. 1193 * We should be able to go from one to the other 1194 * and get consistent values. 
1195 */ 1196 ASSERT(PNUM_TO_IDX(mnode, r, 1197 (IDX_TO_PNUM(mnode, r, 0))) == 0); 1198 ASSERT(IDX_TO_PNUM(mnode, r, 1199 (PNUM_TO_IDX(mnode, r, newbase))) == newbase); 1200 1201 /* pcc_info_t and pcc_color_free */ 1202 for (i = 0; i < NPC_MUTEX; i++) { 1203 pcc_info_t *epi; 1204 pcc_info_t *eold_pi; 1205 1206 pi = cands_cache[i * MMU_PAGE_SIZES + r]; 1207 old_pi = page_ctrs_cands[i][r][mnode]; 1208 page_ctrs_cands[i][r][mnode] = pi; 1209 cands_cache[i * MMU_PAGE_SIZES + r] = old_pi; 1210 1211 /* preserve old pcc_color_free values, if any */ 1212 if (old_pi == NULL) 1213 continue; 1214 1215 /* 1216 * when/if x86 does DR, must account for 1217 * possible change in range index when 1218 * preserving pcc_info 1219 */ 1220 epi = &pi[nranges]; 1221 eold_pi = &old_pi[old_nranges]; 1222 if (new_maxmrange > old_maxmrange) { 1223 pi += new_maxmrange - old_maxmrange; 1224 } else if (new_maxmrange < old_maxmrange) { 1225 old_pi += old_maxmrange - new_maxmrange; 1226 } 1227 for (; pi < epi && old_pi < eold_pi; pi++, old_pi++) { 1228 pcc_info_t tmp = *pi; 1229 *pi = *old_pi; 1230 *old_pi = tmp; 1231 } 1232 } 1233 } 1234 PAGE_CTRS_WRITE_UNLOCK(mnode); 1235 1236 /* 1237 * Now that we have dropped the write lock, it is safe to free all 1238 * of the memory we have cached above. 1239 * We come thru here to free memory when pre-alloc fails, and also to 1240 * free old pointers which were recorded while locked. 
1241 */ 1242 cleanup: 1243 for (r = 1; r < mmu_page_sizes; r++) { 1244 if (ctr_cache[r] != NULL) { 1245 kmem_free(ctr_cache[r], 1246 size_cache[r] * sizeof (hpmctr_t)); 1247 } 1248 for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) { 1249 if (color_cache[r][mrange] != NULL) { 1250 kmem_free(color_cache[r][mrange], 1251 colors_per_szc[r] * sizeof (size_t)); 1252 } 1253 } 1254 for (i = 0; i < NPC_MUTEX; i++) { 1255 pi = cands_cache[i * MMU_PAGE_SIZES + r]; 1256 if (pi == NULL) 1257 continue; 1258 nr = cands_cache_nranges; 1259 for (mrange = 0; mrange < nr; mrange++, pi++) { 1260 pgcntp = pi->pcc_color_free; 1261 if (pgcntp == NULL) 1262 continue; 1263 if ((caddr_t)pgcntp >= kernelheap && 1264 (caddr_t)pgcntp < ekernelheap) { 1265 kmem_free(pgcntp, 1266 colors_per_szc[r] * 1267 sizeof (pgcnt_t)); 1268 } 1269 } 1270 pi = cands_cache[i * MMU_PAGE_SIZES + r]; 1271 if ((caddr_t)pi >= kernelheap && 1272 (caddr_t)pi < ekernelheap) { 1273 kmem_free(pi, nr * sizeof (pcc_info_t)); 1274 } 1275 } 1276 } 1277 1278 kmem_free(cands_cache, 1279 sizeof (pcc_info_t *) * NPC_MUTEX * MMU_PAGE_SIZES); 1280 return (rc); 1281 } 1282 1283 /* 1284 * Cleanup the hpm_counters field in the page counters 1285 * array. 1286 */ 1287 void 1288 page_ctrs_cleanup(void) 1289 { 1290 int r; /* region size */ 1291 int i; /* mnode index */ 1292 1293 /* 1294 * Get the page counters write lock while we are 1295 * setting the page hpm_counters field to NULL 1296 * for non-existent mnodes. 
1297 */ 1298 for (i = 0; i < max_mem_nodes; i++) { 1299 PAGE_CTRS_WRITE_LOCK(i); 1300 if (mem_node_config[i].exists) { 1301 PAGE_CTRS_WRITE_UNLOCK(i); 1302 continue; 1303 } 1304 for (r = 1; r < mmu_page_sizes; r++) { 1305 PAGE_COUNTERS_COUNTERS(i, r) = NULL; 1306 } 1307 PAGE_CTRS_WRITE_UNLOCK(i); 1308 } 1309 } 1310 1311 #ifdef DEBUG 1312 1313 /* 1314 * confirm pp is a large page corresponding to szc 1315 */ 1316 void 1317 chk_lpg(page_t *pp, uchar_t szc) 1318 { 1319 spgcnt_t npgs = page_get_pagecnt(pp->p_szc); 1320 uint_t noreloc; 1321 1322 if (npgs == 1) { 1323 ASSERT(pp->p_szc == 0); 1324 ASSERT(pp->p_next == pp); 1325 ASSERT(pp->p_prev == pp); 1326 return; 1327 } 1328 1329 ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL); 1330 ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL); 1331 1332 ASSERT(IS_P2ALIGNED(pp->p_pagenum, npgs)); 1333 ASSERT(pp->p_pagenum == (pp->p_next->p_pagenum - 1)); 1334 ASSERT(pp->p_prev->p_pagenum == (pp->p_pagenum + (npgs - 1))); 1335 ASSERT(pp->p_prev == (pp + (npgs - 1))); 1336 1337 /* 1338 * Check list of pages. 
1339 */ 1340 noreloc = PP_ISNORELOC(pp); 1341 while (npgs--) { 1342 if (npgs != 0) { 1343 ASSERT(pp->p_pagenum == pp->p_next->p_pagenum - 1); 1344 ASSERT(pp->p_next == (pp + 1)); 1345 } 1346 ASSERT(pp->p_szc == szc); 1347 ASSERT(PP_ISFREE(pp)); 1348 ASSERT(PP_ISAGED(pp)); 1349 ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL); 1350 ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL); 1351 ASSERT(pp->p_vnode == NULL); 1352 ASSERT(PP_ISNORELOC(pp) == noreloc); 1353 1354 pp = pp->p_next; 1355 } 1356 } 1357 #endif /* DEBUG */ 1358 1359 void 1360 page_freelist_lock(int mnode) 1361 { 1362 int i; 1363 for (i = 0; i < NPC_MUTEX; i++) { 1364 mutex_enter(FPC_MUTEX(mnode, i)); 1365 mutex_enter(CPC_MUTEX(mnode, i)); 1366 } 1367 } 1368 1369 void 1370 page_freelist_unlock(int mnode) 1371 { 1372 int i; 1373 for (i = 0; i < NPC_MUTEX; i++) { 1374 mutex_exit(FPC_MUTEX(mnode, i)); 1375 mutex_exit(CPC_MUTEX(mnode, i)); 1376 } 1377 } 1378 1379 /* 1380 * add pp to the specified page list. Defaults to head of the page list 1381 * unless PG_LIST_TAIL is specified. 1382 */ 1383 void 1384 page_list_add(page_t *pp, int flags) 1385 { 1386 page_t **ppp; 1387 kmutex_t *pcm; 1388 uint_t bin, mtype; 1389 int mnode; 1390 1391 ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT)); 1392 ASSERT(PP_ISFREE(pp)); 1393 ASSERT(!hat_page_is_mapped(pp)); 1394 ASSERT(hat_page_getshare(pp) == 0); 1395 1396 /* 1397 * Large pages should be freed via page_list_add_pages(). 1398 */ 1399 ASSERT(pp->p_szc == 0); 1400 1401 /* 1402 * Don't need to lock the freelist first here 1403 * because the page isn't on the freelist yet. 1404 * This means p_szc can't change on us. 1405 */ 1406 1407 bin = PP_2_BIN(pp); 1408 mnode = PP_2_MEM_NODE(pp); 1409 mtype = PP_2_MTYPE(pp); 1410 1411 if (flags & PG_LIST_ISINIT) { 1412 /* 1413 * PG_LIST_ISINIT is set during system startup (ie. 
single 1414 * threaded), add a page to the free list and add to the 1415 * the free region counters w/o any locking 1416 */ 1417 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype); 1418 1419 /* inline version of page_add() */ 1420 if (*ppp != NULL) { 1421 pp->p_next = *ppp; 1422 pp->p_prev = (*ppp)->p_prev; 1423 (*ppp)->p_prev = pp; 1424 pp->p_prev->p_next = pp; 1425 } else 1426 *ppp = pp; 1427 1428 page_ctr_add_internal(mnode, mtype, pp, flags); 1429 VM_STAT_ADD(vmm_vmstats.pladd_free[0]); 1430 } else { 1431 pcm = PC_BIN_MUTEX(mnode, bin, flags); 1432 1433 if (flags & PG_FREE_LIST) { 1434 VM_STAT_ADD(vmm_vmstats.pladd_free[0]); 1435 ASSERT(PP_ISAGED(pp)); 1436 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype); 1437 1438 } else { 1439 VM_STAT_ADD(vmm_vmstats.pladd_cache); 1440 ASSERT(pp->p_vnode); 1441 ASSERT((pp->p_offset & PAGEOFFSET) == 0); 1442 ppp = &PAGE_CACHELISTS(mnode, bin, mtype); 1443 } 1444 mutex_enter(pcm); 1445 page_add(ppp, pp); 1446 1447 if (flags & PG_LIST_TAIL) 1448 *ppp = (*ppp)->p_next; 1449 /* 1450 * Add counters before releasing pcm mutex to avoid a race with 1451 * page_freelist_coalesce and page_freelist_split. 1452 */ 1453 page_ctr_add(mnode, mtype, pp, flags); 1454 mutex_exit(pcm); 1455 } 1456 1457 1458 #if defined(__sparc) 1459 if (PP_ISNORELOC(pp)) { 1460 kcage_freemem_add(1); 1461 } 1462 #endif 1463 /* 1464 * It is up to the caller to unlock the page! 1465 */ 1466 ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT)); 1467 } 1468 1469 1470 #ifdef __sparc 1471 /* 1472 * This routine is only used by kcage_init during system startup. 1473 * It performs the function of page_list_sub/PP_SETNORELOC/page_list_add 1474 * without the overhead of taking locks and updating counters. 1475 */ 1476 void 1477 page_list_noreloc_startup(page_t *pp) 1478 { 1479 page_t **ppp; 1480 uint_t bin; 1481 int mnode; 1482 int mtype; 1483 int flags = 0; 1484 1485 /* 1486 * If this is a large page on the freelist then 1487 * break it up into smaller pages. 
1488 */ 1489 if (pp->p_szc != 0) 1490 page_boot_demote(pp); 1491 1492 /* 1493 * Get list page is currently on. 1494 */ 1495 bin = PP_2_BIN(pp); 1496 mnode = PP_2_MEM_NODE(pp); 1497 mtype = PP_2_MTYPE(pp); 1498 ASSERT(mtype == MTYPE_RELOC); 1499 ASSERT(pp->p_szc == 0); 1500 1501 if (PP_ISAGED(pp)) { 1502 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype); 1503 flags |= PG_FREE_LIST; 1504 } else { 1505 ppp = &PAGE_CACHELISTS(mnode, bin, mtype); 1506 flags |= PG_CACHE_LIST; 1507 } 1508 1509 ASSERT(*ppp != NULL); 1510 1511 /* 1512 * Delete page from current list. 1513 */ 1514 if (*ppp == pp) 1515 *ppp = pp->p_next; /* go to next page */ 1516 if (*ppp == pp) { 1517 *ppp = NULL; /* page list is gone */ 1518 } else { 1519 pp->p_prev->p_next = pp->p_next; 1520 pp->p_next->p_prev = pp->p_prev; 1521 } 1522 1523 /* 1524 * Decrement page counters 1525 */ 1526 page_ctr_sub_internal(mnode, mtype, pp, flags); 1527 1528 /* 1529 * Set no reloc for cage initted pages. 1530 */ 1531 PP_SETNORELOC(pp); 1532 1533 mtype = PP_2_MTYPE(pp); 1534 ASSERT(mtype == MTYPE_NORELOC); 1535 1536 /* 1537 * Get new list for page. 1538 */ 1539 if (PP_ISAGED(pp)) { 1540 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype); 1541 } else { 1542 ppp = &PAGE_CACHELISTS(mnode, bin, mtype); 1543 } 1544 1545 /* 1546 * Insert page on new list. 
1547 */ 1548 if (*ppp == NULL) { 1549 *ppp = pp; 1550 pp->p_next = pp->p_prev = pp; 1551 } else { 1552 pp->p_next = *ppp; 1553 pp->p_prev = (*ppp)->p_prev; 1554 (*ppp)->p_prev = pp; 1555 pp->p_prev->p_next = pp; 1556 } 1557 1558 /* 1559 * Increment page counters 1560 */ 1561 page_ctr_add_internal(mnode, mtype, pp, flags); 1562 1563 /* 1564 * Update cage freemem counter 1565 */ 1566 atomic_inc_ulong(&kcage_freemem); 1567 } 1568 #else /* __sparc */ 1569 1570 /* ARGSUSED */ 1571 void 1572 page_list_noreloc_startup(page_t *pp) 1573 { 1574 panic("page_list_noreloc_startup: should be here only for sparc"); 1575 } 1576 #endif 1577 1578 void 1579 page_list_add_pages(page_t *pp, int flags) 1580 { 1581 kmutex_t *pcm; 1582 pgcnt_t pgcnt; 1583 uint_t bin, mtype, i; 1584 int mnode; 1585 1586 /* default to freelist/head */ 1587 ASSERT((flags & (PG_CACHE_LIST | PG_LIST_TAIL)) == 0); 1588 1589 CHK_LPG(pp, pp->p_szc); 1590 VM_STAT_ADD(vmm_vmstats.pladd_free[pp->p_szc]); 1591 1592 bin = PP_2_BIN(pp); 1593 mnode = PP_2_MEM_NODE(pp); 1594 mtype = PP_2_MTYPE(pp); 1595 1596 if (flags & PG_LIST_ISINIT) { 1597 ASSERT(pp->p_szc == mmu_page_sizes - 1); 1598 page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp); 1599 ASSERT(!PP_ISNORELOC(pp)); 1600 PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags); 1601 } else { 1602 1603 ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes); 1604 1605 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST); 1606 1607 mutex_enter(pcm); 1608 page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp); 1609 page_ctr_add(mnode, mtype, pp, PG_FREE_LIST); 1610 mutex_exit(pcm); 1611 1612 pgcnt = page_get_pagecnt(pp->p_szc); 1613 #if defined(__sparc) 1614 if (PP_ISNORELOC(pp)) 1615 kcage_freemem_add(pgcnt); 1616 #endif 1617 for (i = 0; i < pgcnt; i++, pp++) 1618 page_unlock_nocapture(pp); 1619 } 1620 } 1621 1622 /* 1623 * During boot, need to demote a large page to base 1624 * pagesize pages for seg_kmem for use in boot_alloc() 1625 */ 1626 void 1627 
page_boot_demote(page_t *pp) 1628 { 1629 ASSERT(pp->p_szc != 0); 1630 ASSERT(PP_ISFREE(pp)); 1631 ASSERT(PP_ISAGED(pp)); 1632 1633 (void) page_demote(PP_2_MEM_NODE(pp), 1634 PFN_BASE(pp->p_pagenum, pp->p_szc), 0, pp->p_szc, 0, PC_NO_COLOR, 1635 PC_FREE); 1636 1637 ASSERT(PP_ISFREE(pp)); 1638 ASSERT(PP_ISAGED(pp)); 1639 ASSERT(pp->p_szc == 0); 1640 } 1641 1642 /* 1643 * Take a particular page off of whatever freelist the page 1644 * is claimed to be on. 1645 * 1646 * NOTE: Only used for PAGESIZE pages. 1647 */ 1648 void 1649 page_list_sub(page_t *pp, int flags) 1650 { 1651 int bin; 1652 uint_t mtype; 1653 int mnode; 1654 kmutex_t *pcm; 1655 page_t **ppp; 1656 1657 ASSERT(PAGE_EXCL(pp)); 1658 ASSERT(PP_ISFREE(pp)); 1659 1660 /* 1661 * The p_szc field can only be changed by page_promote() 1662 * and page_demote(). Only free pages can be promoted and 1663 * demoted and the free list MUST be locked during these 1664 * operations. So to prevent a race in page_list_sub() 1665 * between computing which bin of the freelist lock to 1666 * grab and actually grabing the lock we check again that 1667 * the bin we locked is still the correct one. Notice that 1668 * the p_szc field could have actually changed on us but 1669 * if the bin happens to still be the same we are safe. 1670 */ 1671 try_again: 1672 bin = PP_2_BIN(pp); 1673 mnode = PP_2_MEM_NODE(pp); 1674 pcm = PC_BIN_MUTEX(mnode, bin, flags); 1675 mutex_enter(pcm); 1676 if (PP_2_BIN(pp) != bin) { 1677 mutex_exit(pcm); 1678 goto try_again; 1679 } 1680 mtype = PP_2_MTYPE(pp); 1681 1682 if (flags & PG_FREE_LIST) { 1683 VM_STAT_ADD(vmm_vmstats.plsub_free[0]); 1684 ASSERT(PP_ISAGED(pp)); 1685 ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype); 1686 } else { 1687 VM_STAT_ADD(vmm_vmstats.plsub_cache); 1688 ASSERT(!PP_ISAGED(pp)); 1689 ppp = &PAGE_CACHELISTS(mnode, bin, mtype); 1690 } 1691 1692 /* 1693 * Common PAGESIZE case. 1694 * 1695 * Note that we locked the freelist. 
This prevents 1696 * any page promotion/demotion operations. Therefore 1697 * the p_szc will not change until we drop pcm mutex. 1698 */ 1699 if (pp->p_szc == 0) { 1700 page_sub(ppp, pp); 1701 /* 1702 * Subtract counters before releasing pcm mutex 1703 * to avoid race with page_freelist_coalesce. 1704 */ 1705 page_ctr_sub(mnode, mtype, pp, flags); 1706 mutex_exit(pcm); 1707 1708 #if defined(__sparc) 1709 if (PP_ISNORELOC(pp)) { 1710 kcage_freemem_sub(1); 1711 } 1712 #endif 1713 return; 1714 } 1715 1716 /* 1717 * Large pages on the cache list are not supported. 1718 */ 1719 if (flags & PG_CACHE_LIST) 1720 panic("page_list_sub: large page on cachelist"); 1721 1722 /* 1723 * Slow but rare. 1724 * 1725 * Somebody wants this particular page which is part 1726 * of a large page. In this case we just demote the page 1727 * if it's on the freelist. 1728 * 1729 * We have to drop pcm before locking the entire freelist. 1730 * Once we have re-locked the freelist check to make sure 1731 * the page hasn't already been demoted or completely 1732 * freed. 1733 */ 1734 mutex_exit(pcm); 1735 page_freelist_lock(mnode); 1736 if (pp->p_szc != 0) { 1737 /* 1738 * Large page is on freelist. 1739 */ 1740 (void) page_demote(mnode, PFN_BASE(pp->p_pagenum, pp->p_szc), 1741 0, pp->p_szc, 0, PC_NO_COLOR, PC_FREE); 1742 } 1743 ASSERT(PP_ISFREE(pp)); 1744 ASSERT(PP_ISAGED(pp)); 1745 ASSERT(pp->p_szc == 0); 1746 1747 /* 1748 * Subtract counters before releasing pcm mutex 1749 * to avoid race with page_freelist_coalesce. 
1750 */ 1751 bin = PP_2_BIN(pp); 1752 mtype = PP_2_MTYPE(pp); 1753 ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype); 1754 1755 page_sub(ppp, pp); 1756 page_ctr_sub(mnode, mtype, pp, flags); 1757 page_freelist_unlock(mnode); 1758 1759 #if defined(__sparc) 1760 if (PP_ISNORELOC(pp)) { 1761 kcage_freemem_sub(1); 1762 } 1763 #endif 1764 } 1765 1766 void 1767 page_list_sub_pages(page_t *pp, uint_t szc) 1768 { 1769 kmutex_t *pcm; 1770 uint_t bin, mtype; 1771 int mnode; 1772 1773 ASSERT(PAGE_EXCL(pp)); 1774 ASSERT(PP_ISFREE(pp)); 1775 ASSERT(PP_ISAGED(pp)); 1776 1777 /* 1778 * See comment in page_list_sub(). 1779 */ 1780 try_again: 1781 bin = PP_2_BIN(pp); 1782 mnode = PP_2_MEM_NODE(pp); 1783 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST); 1784 mutex_enter(pcm); 1785 if (PP_2_BIN(pp) != bin) { 1786 mutex_exit(pcm); 1787 goto try_again; 1788 } 1789 1790 /* 1791 * If we're called with a page larger than szc or it got 1792 * promoted above szc before we locked the freelist then 1793 * drop pcm and re-lock entire freelist. If page still larger 1794 * than szc then demote it. 
1795 */ 1796 if (pp->p_szc > szc) { 1797 mutex_exit(pcm); 1798 pcm = NULL; 1799 page_freelist_lock(mnode); 1800 if (pp->p_szc > szc) { 1801 VM_STAT_ADD(vmm_vmstats.plsubpages_szcbig); 1802 (void) page_demote(mnode, 1803 PFN_BASE(pp->p_pagenum, pp->p_szc), 0, 1804 pp->p_szc, szc, PC_NO_COLOR, PC_FREE); 1805 } 1806 bin = PP_2_BIN(pp); 1807 } 1808 ASSERT(PP_ISFREE(pp)); 1809 ASSERT(PP_ISAGED(pp)); 1810 ASSERT(pp->p_szc <= szc); 1811 ASSERT(pp == PP_PAGEROOT(pp)); 1812 1813 VM_STAT_ADD(vmm_vmstats.plsub_free[pp->p_szc]); 1814 1815 mtype = PP_2_MTYPE(pp); 1816 if (pp->p_szc != 0) { 1817 page_vpsub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp); 1818 CHK_LPG(pp, pp->p_szc); 1819 } else { 1820 VM_STAT_ADD(vmm_vmstats.plsubpages_szc0); 1821 page_sub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp); 1822 } 1823 page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST); 1824 1825 if (pcm != NULL) { 1826 mutex_exit(pcm); 1827 } else { 1828 page_freelist_unlock(mnode); 1829 } 1830 1831 #if defined(__sparc) 1832 if (PP_ISNORELOC(pp)) { 1833 pgcnt_t pgcnt; 1834 1835 pgcnt = page_get_pagecnt(pp->p_szc); 1836 kcage_freemem_sub(pgcnt); 1837 } 1838 #endif 1839 } 1840 1841 /* 1842 * Add the page to the front of a linked list of pages 1843 * using the p_next & p_prev pointers for the list. 1844 * The caller is responsible for protecting the list pointers. 1845 */ 1846 void 1847 mach_page_add(page_t **ppp, page_t *pp) 1848 { 1849 if (*ppp == NULL) { 1850 pp->p_next = pp->p_prev = pp; 1851 } else { 1852 pp->p_next = *ppp; 1853 pp->p_prev = (*ppp)->p_prev; 1854 (*ppp)->p_prev = pp; 1855 pp->p_prev->p_next = pp; 1856 } 1857 *ppp = pp; 1858 } 1859 1860 /* 1861 * Remove this page from a linked list of pages 1862 * using the p_next & p_prev pointers for the list. 1863 * 1864 * The caller is responsible for protecting the list pointers. 
1865 */ 1866 void 1867 mach_page_sub(page_t **ppp, page_t *pp) 1868 { 1869 ASSERT(PP_ISFREE(pp)); 1870 1871 if (*ppp == NULL || pp == NULL) 1872 panic("mach_page_sub"); 1873 1874 if (*ppp == pp) 1875 *ppp = pp->p_next; /* go to next page */ 1876 1877 if (*ppp == pp) 1878 *ppp = NULL; /* page list is gone */ 1879 else { 1880 pp->p_prev->p_next = pp->p_next; 1881 pp->p_next->p_prev = pp->p_prev; 1882 } 1883 pp->p_prev = pp->p_next = pp; /* make pp a list of one */ 1884 } 1885 1886 /* 1887 * Routine fsflush uses to gradually coalesce the free list into larger pages. 1888 */ 1889 void 1890 page_promote_size(page_t *pp, uint_t cur_szc) 1891 { 1892 pfn_t pfn; 1893 int mnode; 1894 int idx; 1895 int new_szc = cur_szc + 1; 1896 int full = FULL_REGION_CNT(new_szc); 1897 1898 pfn = page_pptonum(pp); 1899 mnode = PFN_2_MEM_NODE(pfn); 1900 1901 page_freelist_lock(mnode); 1902 1903 idx = PNUM_TO_IDX(mnode, new_szc, pfn); 1904 if (PAGE_COUNTERS(mnode, new_szc, idx) == full) 1905 (void) page_promote(mnode, pfn, new_szc, PC_FREE, PC_MTYPE_ANY); 1906 1907 page_freelist_unlock(mnode); 1908 } 1909 1910 static uint_t page_promote_err; 1911 static uint_t page_promote_noreloc_err; 1912 1913 /* 1914 * Create a single larger page (of szc new_szc) from smaller contiguous pages 1915 * for the given mnode starting at pfnum. Pages involved are on the freelist 1916 * before the call and may be returned to the caller if requested, otherwise 1917 * they will be placed back on the freelist. 1918 * If flags is PC_ALLOC, then the large page will be returned to the user in 1919 * a state which is consistent with a page being taken off the freelist. If 1920 * we failed to lock the new large page, then we will return NULL to the 1921 * caller and put the large page on the freelist instead. 1922 * If flags is PC_FREE, then the large page will be placed on the freelist, 1923 * and NULL will be returned. 
1924 * The caller is responsible for locking the freelist as well as any other 1925 * accounting which needs to be done for a returned page. 1926 * 1927 * RFE: For performance pass in pp instead of pfnum so 1928 * we can avoid excessive calls to page_numtopp_nolock(). 1929 * This would depend on an assumption that all contiguous 1930 * pages are in the same memseg so we can just add/dec 1931 * our pp. 1932 * 1933 * Lock ordering: 1934 * 1935 * There is a potential but rare deadlock situation 1936 * for page promotion and demotion operations. The problem 1937 * is there are two paths into the freelist manager and 1938 * they have different lock orders: 1939 * 1940 * page_create() 1941 * lock freelist 1942 * page_lock(EXCL) 1943 * unlock freelist 1944 * return 1945 * caller drops page_lock 1946 * 1947 * page_free() and page_reclaim() 1948 * caller grabs page_lock(EXCL) 1949 * 1950 * lock freelist 1951 * unlock freelist 1952 * drop page_lock 1953 * 1954 * What prevents a thread in page_create() from deadlocking 1955 * with a thread freeing or reclaiming the same page is the 1956 * page_trylock() in page_get_freelist(). If the trylock fails 1957 * it skips the page. 1958 * 1959 * The lock ordering for promotion and demotion is the same as 1960 * for page_create(). Since the same deadlock could occur during 1961 * page promotion and freeing or reclaiming of a page on the 1962 * cache list we might have to fail the operation and undo what 1963 * have done so far. Again this is rare. 1964 */ 1965 page_t * 1966 page_promote(int mnode, pfn_t pfnum, uchar_t new_szc, int flags, int mtype) 1967 { 1968 page_t *pp, *pplist, *tpp, *start_pp; 1969 pgcnt_t new_npgs, npgs; 1970 uint_t bin; 1971 pgcnt_t tmpnpgs, pages_left; 1972 uint_t noreloc; 1973 int which_list; 1974 ulong_t index; 1975 kmutex_t *phm; 1976 1977 /* 1978 * General algorithm: 1979 * Find the starting page 1980 * Walk each page struct removing it from the freelist, 1981 * and linking it to all the other pages removed. 
1982 * Once all pages are off the freelist, 1983 * walk the list, modifying p_szc to new_szc and what 1984 * ever other info needs to be done to create a large free page. 1985 * According to the flags, either return the page or put it 1986 * on the freelist. 1987 */ 1988 1989 start_pp = page_numtopp_nolock(pfnum); 1990 ASSERT(start_pp && (start_pp->p_pagenum == pfnum)); 1991 new_npgs = page_get_pagecnt(new_szc); 1992 ASSERT(IS_P2ALIGNED(pfnum, new_npgs)); 1993 1994 /* don't return page of the wrong mtype */ 1995 if (mtype != PC_MTYPE_ANY && mtype != PP_2_MTYPE(start_pp)) 1996 return (NULL); 1997 1998 /* 1999 * Loop through smaller pages to confirm that all pages 2000 * give the same result for PP_ISNORELOC(). 2001 * We can check this reliably here as the protocol for setting 2002 * P_NORELOC requires pages to be taken off the free list first. 2003 */ 2004 noreloc = PP_ISNORELOC(start_pp); 2005 for (pp = start_pp + new_npgs; --pp > start_pp; ) { 2006 if (noreloc != PP_ISNORELOC(pp)) { 2007 page_promote_noreloc_err++; 2008 page_promote_err++; 2009 return (NULL); 2010 } 2011 } 2012 2013 pages_left = new_npgs; 2014 pplist = NULL; 2015 pp = start_pp; 2016 2017 /* Loop around coalescing the smaller pages into a big page. */ 2018 while (pages_left) { 2019 /* 2020 * Remove from the freelist. 2021 */ 2022 ASSERT(PP_ISFREE(pp)); 2023 bin = PP_2_BIN(pp); 2024 ASSERT(mnode == PP_2_MEM_NODE(pp)); 2025 mtype = PP_2_MTYPE(pp); 2026 if (PP_ISAGED(pp)) { 2027 2028 /* 2029 * PG_FREE_LIST 2030 */ 2031 if (pp->p_szc) { 2032 page_vpsub(&PAGE_FREELISTS(mnode, 2033 pp->p_szc, bin, mtype), pp); 2034 } else { 2035 mach_page_sub(&PAGE_FREELISTS(mnode, 0, 2036 bin, mtype), pp); 2037 } 2038 which_list = PG_FREE_LIST; 2039 } else { 2040 ASSERT(pp->p_szc == 0); 2041 2042 /* 2043 * PG_CACHE_LIST 2044 * 2045 * Since this page comes from the 2046 * cachelist, we must destroy the 2047 * vnode association. 
2048 */ 2049 if (!page_trylock(pp, SE_EXCL)) { 2050 goto fail_promote; 2051 } 2052 2053 /* 2054 * We need to be careful not to deadlock 2055 * with another thread in page_lookup(). 2056 * The page_lookup() thread could be holding 2057 * the same phm that we need if the two 2058 * pages happen to hash to the same phm lock. 2059 * At this point we have locked the entire 2060 * freelist and page_lookup() could be trying 2061 * to grab a freelist lock. 2062 */ 2063 index = PAGE_HASH_FUNC(pp->p_vnode, pp->p_offset); 2064 phm = PAGE_HASH_MUTEX(index); 2065 if (!mutex_tryenter(phm)) { 2066 page_unlock_nocapture(pp); 2067 goto fail_promote; 2068 } 2069 2070 mach_page_sub(&PAGE_CACHELISTS(mnode, bin, mtype), pp); 2071 page_hashout(pp, phm); 2072 mutex_exit(phm); 2073 PP_SETAGED(pp); 2074 page_unlock_nocapture(pp); 2075 which_list = PG_CACHE_LIST; 2076 } 2077 page_ctr_sub(mnode, mtype, pp, which_list); 2078 2079 /* 2080 * Concatenate the smaller page(s) onto 2081 * the large page list. 2082 */ 2083 tmpnpgs = npgs = page_get_pagecnt(pp->p_szc); 2084 pages_left -= npgs; 2085 tpp = pp; 2086 while (npgs--) { 2087 tpp->p_szc = new_szc; 2088 tpp = tpp->p_next; 2089 } 2090 page_list_concat(&pplist, &pp); 2091 pp += tmpnpgs; 2092 } 2093 CHK_LPG(pplist, new_szc); 2094 2095 /* 2096 * return the page to the user if requested 2097 * in the properly locked state. 2098 */ 2099 if (flags == PC_ALLOC && (page_trylock_cons(pplist, SE_EXCL))) { 2100 return (pplist); 2101 } 2102 2103 /* 2104 * Otherwise place the new large page on the freelist 2105 */ 2106 bin = PP_2_BIN(pplist); 2107 mnode = PP_2_MEM_NODE(pplist); 2108 mtype = PP_2_MTYPE(pplist); 2109 page_vpadd(&PAGE_FREELISTS(mnode, new_szc, bin, mtype), pplist); 2110 2111 page_ctr_add(mnode, mtype, pplist, PG_FREE_LIST); 2112 return (NULL); 2113 2114 fail_promote: 2115 /* 2116 * A thread must have still been freeing or 2117 * reclaiming the page on the cachelist. 
	 * To prevent a deadlock undo what we have
	 * done so far and return failure. This
	 * situation can only happen while promoting
	 * PAGESIZE pages.
	 */
	page_promote_err++;
	/*
	 * Return every constituent page collected so far to the szc 0
	 * freelist, restoring its per-page size code and counters.
	 */
	while (pplist) {
		pp = pplist;
		mach_page_sub(&pplist, pp);
		pp->p_szc = 0;
		bin = PP_2_BIN(pp);
		mtype = PP_2_MTYPE(pp);
		mach_page_add(&PAGE_FREELISTS(mnode, 0, bin, mtype), pp);
		page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
	}
	return (NULL);

}

/*
 * Break up a large page into smaller size pages.
 * Pages involved are on the freelist before the call and may
 * be returned to the caller if requested, otherwise they will
 * be placed back on the freelist.
 * The caller is responsible for locking the freelist as well as any other
 * accounting which needs to be done for a returned page.
 * If flags is not PC_ALLOC, the color argument is ignored, and thus
 * technically, any value may be passed in but PC_NO_COLOR is the standard
 * which should be followed for clarity's sake.
 * Returns a page whose pfn is < pfnmax
 *
 * mnode	memory node the large page lives on (asserted, not looked up)
 * pfnum	pfn of the base constituent page of the large page
 * pfnmax	if non-zero, only a page with pfn < pfnmax may be returned
 * cur_szc	current size code of the page (must be > new_szc)
 * new_szc	target size code to demote to
 * color	desired color of the returned page (PC_ALLOC only)
 * flags	PC_ALLOC to try to return a locked page, PC_FREE otherwise
 */
page_t *
page_demote(int mnode, pfn_t pfnum, pfn_t pfnmax, uchar_t cur_szc,
    uchar_t new_szc, int color, int flags)
{
	page_t	*pp, *pplist, *npplist;
	pgcnt_t	npgs, n;
	uint_t	bin;
	uint_t	mtype;
	page_t	*ret_pp = NULL;

	ASSERT(cur_szc != 0);
	ASSERT(new_szc < cur_szc);

	pplist = page_numtopp_nolock(pfnum);
	ASSERT(pplist != NULL);

	ASSERT(pplist->p_szc == cur_szc);

	/* Pull the whole large page off its current-size freelist. */
	bin = PP_2_BIN(pplist);
	ASSERT(mnode == PP_2_MEM_NODE(pplist));
	mtype = PP_2_MTYPE(pplist);
	page_vpsub(&PAGE_FREELISTS(mnode, cur_szc, bin, mtype), pplist);

	CHK_LPG(pplist, cur_szc);
	page_ctr_sub(mnode, mtype, pplist, PG_FREE_LIST);

	/*
	 * Number of PAGESIZE pages for smaller new_szc
	 * page.
	 */
	npgs = page_get_pagecnt(new_szc);

	while (pplist) {
		pp = pplist;

		ASSERT(pp->p_szc == cur_szc);

		/*
		 * We either break it up into PAGESIZE pages or larger.
		 */
		if (npgs == 1) {	/* PAGESIZE case */
			mach_page_sub(&pplist, pp);
			ASSERT(pp->p_szc == cur_szc);
			ASSERT(new_szc == 0);
			ASSERT(mnode == PP_2_MEM_NODE(pp));
			pp->p_szc = new_szc;
			bin = PP_2_BIN(pp);
			/*
			 * Hand this page back to the caller if it matches the
			 * requested color/pfn limits and we can lock it;
			 * otherwise put it on the szc 0 freelist.
			 */
			if ((bin == color) && (flags == PC_ALLOC) &&
			    (ret_pp == NULL) && (pfnmax == 0 ||
			    pp->p_pagenum < pfnmax) &&
			    page_trylock_cons(pp, SE_EXCL)) {
				ret_pp = pp;
			} else {
				mtype = PP_2_MTYPE(pp);
				mach_page_add(&PAGE_FREELISTS(mnode, 0, bin,
				    mtype), pp);
				page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
			}
		} else {
			page_t *try_to_return_this_page = NULL;
			int count = 0;

			/*
			 * Break down into smaller lists of pages.
			 */
			page_list_break(&pplist, &npplist, npgs);

			pp = pplist;
			n = npgs;
			while (n--) {
				ASSERT(pp->p_szc == cur_szc);
				/*
				 * Check whether all the pages in this list
				 * fit the request criteria.
				 */
				if (pfnmax == 0 || pp->p_pagenum < pfnmax) {
					count++;
				}
				pp->p_szc = new_szc;
				pp = pp->p_next;
			}

			/*
			 * pp has wrapped back to the head of the broken-off
			 * sublist here (circular list), so this re-checks the
			 * head page's pfn along with the count gathered above.
			 */
			if (count == npgs &&
			    (pfnmax == 0 || pp->p_pagenum < pfnmax)) {
				try_to_return_this_page = pp;
			}

			CHK_LPG(pplist, new_szc);

			bin = PP_2_BIN(pplist);
			if (try_to_return_this_page)
				ASSERT(mnode ==
				    PP_2_MEM_NODE(try_to_return_this_page));
			if ((bin == color) && (flags == PC_ALLOC) &&
			    (ret_pp == NULL) && try_to_return_this_page &&
			    page_trylock_cons(try_to_return_this_page,
			    SE_EXCL)) {
				ret_pp = try_to_return_this_page;
			} else {
				/* put the new_szc page back on its freelist */
				mtype = PP_2_MTYPE(pp);
				page_vpadd(&PAGE_FREELISTS(mnode, new_szc,
				    bin, mtype), pplist);

				page_ctr_add(mnode, mtype, pplist,
				    PG_FREE_LIST);
			}
			pplist = npplist;
		}
	}
	return (ret_pp);
}

/* set to 1 to disable large-page coalescing (tunable) */
int mpss_coalesce_disable = 0;

/*
 * Coalesce free pages into a page of the given szc and color if possible.
 * Return the pointer to the page created, otherwise, return NULL.
 *
 * If pfnhi is non-zero, search for large page with pfn range less than pfnhi.
 *
 * mnode	memory node to search
 * szc		size code of the page to build
 * color/ceq_mask  desired color and the equivalency mask applied to it
 * mtype	memory type (range) to search within the mnode
 * pfnhi	upper pfn bound, or PFNNULL for no bound
 *
 * Holds page_ctrs_rwlock[mnode] as reader for the whole search so the
 * dynamically allocated page_counters arrays cannot be freed underneath us.
 */
page_t *
page_freelist_coalesce(int mnode, uchar_t szc, uint_t color, uint_t ceq_mask,
    int mtype, pfn_t pfnhi)
{
	int	r = szc;		/* region size */
	int	mrange;
	uint_t	full, bin, color_mask, wrap = 0;
	pfn_t	pfnum, lo, hi;
	size_t	len, idx, idx0;
	pgcnt_t	cands = 0, szcpgcnt = page_get_pagecnt(szc);
	page_t	*ret_pp;
	MEM_NODE_ITERATOR_DECL(it);
#if defined(__sparc)
	pfn_t pfnum0, nlo, nhi;
#endif

	if (mpss_coalesce_disable) {
		ASSERT(szc < MMU_PAGE_SIZES);
		VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[szc][0]);
		return (NULL);
	}

	ASSERT(szc < mmu_page_sizes);
	color_mask = PAGE_GET_PAGECOLORS(szc) - 1;
	ASSERT(ceq_mask <= color_mask);
	ASSERT(color <= color_mask);
	/* only the equivalency-significant color bits matter below */
	color &= ceq_mask;

	/* Prevent page_counters dynamic memory from being freed */
	rw_enter(&page_ctrs_rwlock[mnode], RW_READER);

	mrange = MTYPE_2_MRANGE(mnode, mtype);
	ASSERT(mrange < mnode_nranges[mnode]);
	VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[r][mrange]);

	/* get pfn range for mtype */
	len = PAGE_COUNTERS_ENTRIES(mnode, r);
	MNODETYPE_2_PFN(mnode, mtype, lo, hi);
	hi++;		/* make hi exclusive */

	/* use lower limit if given */
	if (pfnhi != PFNNULL && pfnhi < hi)
		hi = pfnhi;

	/* round to szcpgcnt boundaries */
	lo = P2ROUNDUP(lo, szcpgcnt);
	MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it);
	if (lo == (pfn_t)-1) {
		/* iterator found no valid starting pfn on this mnode */
		rw_exit(&page_ctrs_rwlock[mnode]);
		return (NULL);
	}
	hi = hi & ~(szcpgcnt - 1);

	/* set lo to the closest pfn of the right color */
	if (((PFN_2_COLOR(lo, szc, &it) ^ color) & ceq_mask) ||
	    (interleaved_mnodes && PFN_2_MEM_NODE(lo) != mnode)) {
		PAGE_NEXT_PFN_FOR_COLOR(lo, szc, color, ceq_mask, color_mask,
		    &it);
	}

	if (hi <= lo) {
		rw_exit(&page_ctrs_rwlock[mnode]);
		return (NULL);
	}

	full = FULL_REGION_CNT(r);

	/* calculate the number of page candidates and initial search index */
	bin = color;
	idx0 = (size_t)(-1);
	do {
		pgcnt_t acand;

		PGCTRS_CANDS_GETVALUECOLOR(mnode, mrange, r, bin, acand);
		if (acand) {
			/*
			 * Start the search at the lowest stashed position
			 * among all bins equivalent to the requested color.
			 */
			idx = PAGE_COUNTERS_CURRENT_COLOR(mnode,
			    r, bin, mrange);
			idx0 = MIN(idx0, idx);
			cands += acand;
		}
		bin = ADD_MASKED(bin, 1, ceq_mask, color_mask);
	} while (bin != color);

	if (cands == 0) {
		/* no fully-free regions of this color anywhere in range */
		VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip[r][mrange]);
		rw_exit(&page_ctrs_rwlock[mnode]);
		return (NULL);
	}

	pfnum = IDX_TO_PNUM(mnode, r, idx0);
	if (pfnum < lo || pfnum >= hi) {
		pfnum = lo;
	} else {
		MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
		if (pfnum == (pfn_t)-1) {
			pfnum = lo;
			MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
			ASSERT(pfnum != (pfn_t)-1);
		} else if ((PFN_2_COLOR(pfnum, szc, &it) ^ color) & ceq_mask ||
		    (interleaved_mnodes && PFN_2_MEM_NODE(pfnum) != mnode)) {
			/* invalid color, get the closest correct pfn */
			PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask,
			    color_mask, &it);
			if (pfnum >= hi) {
				pfnum = lo;
				MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
			}
		}
	}

	/* set starting index */
	idx0 = PNUM_TO_IDX(mnode, r, pfnum);
	ASSERT(idx0 < len);

#if defined(__sparc)
	pfnum0 = pfnum;		/* page corresponding to idx0 */
	nhi = 0;		/* search kcage ranges */
#endif

	/*
	 * Circular scan of the region counters starting at idx0; "wrap"
	 * counts passes around the [lo, hi) window so we stop after at
	 * most one full cycle back past the starting index.
	 */
	for (idx = idx0; wrap == 0 || (idx < idx0 && wrap < 2); ) {

#if defined(__sparc)
		/*
		 * Find lowest intersection of kcage ranges and mnode.
		 * MTYPE_NORELOC means look in the cage, otherwise outside.
		 */
		if (nhi <= pfnum) {
			if (kcage_next_range(mtype == MTYPE_NORELOC, pfnum,
			    (wrap == 0 ? hi : pfnum0), &nlo, &nhi))
				goto wrapit;

			/* jump to the next page in the range */
			if (pfnum < nlo) {
				pfnum = P2ROUNDUP(nlo, szcpgcnt);
				MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
				idx = PNUM_TO_IDX(mnode, r, pfnum);
				if (idx >= len || pfnum >= hi)
					goto wrapit;
				if ((PFN_2_COLOR(pfnum, szc, &it) ^ color) &
				    ceq_mask)
					goto next;
				if (interleaved_mnodes &&
				    PFN_2_MEM_NODE(pfnum) != mnode)
					goto next;
			}
		}
#endif

		if (PAGE_COUNTERS(mnode, r, idx) != full)
			goto next;

		/*
		 * RFE: For performance maybe we can do something less
		 * brutal than locking the entire freelist. So far
		 * this doesn't seem to be a performance problem?
		 */
		page_freelist_lock(mnode);
		/* re-check the counter now that the freelist is locked */
		if (PAGE_COUNTERS(mnode, r, idx) == full) {
			ret_pp =
			    page_promote(mnode, pfnum, r, PC_ALLOC, mtype);
			if (ret_pp != NULL) {
				VM_STAT_ADD(vmm_vmstats.pfc_coalok[r][mrange]);
				PAGE_COUNTERS_CURRENT_COLOR(mnode, r,
				    PFN_2_COLOR(pfnum, szc, &it), mrange) = idx;
				page_freelist_unlock(mnode);
				rw_exit(&page_ctrs_rwlock[mnode]);
#if defined(__sparc)
				if (PP_ISNORELOC(ret_pp)) {
					pgcnt_t npgs;

					npgs = page_get_pagecnt(ret_pp->p_szc);
					kcage_freemem_sub(npgs);
				}
#endif
				return (ret_pp);
			}
		} else {
			VM_STAT_ADD(vmm_vmstats.page_ctrs_changed[r][mrange]);
		}

		page_freelist_unlock(mnode);
		/*
		 * No point looking for another page if we've
		 * already tried all of the ones that
		 * page_ctr_cands indicated. Stash off where we left
		 * off.
		 * Note: this is not exact since we don't hold the
		 * page_freelist_locks before we initially get the
		 * value of cands for performance reasons, but should
		 * be a decent approximation.
		 */
		if (--cands == 0) {
			PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color, mrange) =
			    idx;
			break;
		}
next:
		PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask,
		    color_mask, &it);
		idx = PNUM_TO_IDX(mnode, r, pfnum);
		if (idx >= len || pfnum >= hi) {
wrapit:
			pfnum = lo;
			MEM_NODE_ITERATOR_INIT(pfnum, mnode, szc, &it);
			idx = PNUM_TO_IDX(mnode, r, pfnum);
			wrap++;
#if defined(__sparc)
			nhi = 0;	/* search kcage ranges */
#endif
		}
	}

	rw_exit(&page_ctrs_rwlock[mnode]);
	VM_STAT_ADD(vmm_vmstats.page_ctrs_failed[r][mrange]);
	return (NULL);
}

/*
 * For the given mnode, promote as many small pages to large pages as possible.
 * mnode can be -1, which means do them all.
 * With interleaved mnodes the shared hpm_counters force a scan of all mnodes
 * regardless of the argument.
 */
void
page_freelist_coalesce_all(int mnode)
{
	int	r;		/* region size */
	int	idx, full;
	size_t	len;
	int doall = interleaved_mnodes || mnode < 0;
	int mlo = doall ? 0 : mnode;
	int mhi = doall ? max_mem_nodes : (mnode + 1);

	VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce_all);

	if (mpss_coalesce_disable) {
		return;
	}

	/*
	 * Lock the entire freelist and coalesce what we can.
	 *
	 * Always promote to the largest page possible
	 * first to reduce the number of page promotions.
	 */
	for (mnode = mlo; mnode < mhi; mnode++) {
		rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
		page_freelist_lock(mnode);
	}
	for (r = mmu_page_sizes - 1; r > 0; r--) {
		for (mnode = mlo; mnode < mhi; mnode++) {
			pgcnt_t cands = 0;
			int mrange, nranges = mnode_nranges[mnode];

			/* any fully-free regions of size r on this mnode? */
			for (mrange = 0; mrange < nranges; mrange++) {
				PGCTRS_CANDS_GETVALUE(mnode, mrange, r, cands);
				if (cands != 0)
					break;
			}
			if (cands == 0) {
				VM_STAT_ADD(vmm_vmstats.
				    page_ctrs_cands_skip_all);
				continue;
			}

			full = FULL_REGION_CNT(r);
			len = PAGE_COUNTERS_ENTRIES(mnode, r);

			for (idx = 0; idx < len; idx++) {
				if (PAGE_COUNTERS(mnode, r, idx) == full) {
					pfn_t pfnum =
					    IDX_TO_PNUM(mnode, r, idx);
					int tmnode = interleaved_mnodes ?
					    PFN_2_MEM_NODE(pfnum) : mnode;

					ASSERT(pfnum >=
					    mem_node_config[tmnode].physbase &&
					    pfnum <
					    mem_node_config[tmnode].physmax);

					(void) page_promote(tmnode,
					    pfnum, r, PC_FREE, PC_MTYPE_ANY);
				}
			}
			/* shared hpm_counters covers all mnodes, so we quit */
			if (interleaved_mnodes)
				break;
		}
	}
	for (mnode = mlo; mnode < mhi; mnode++) {
		page_freelist_unlock(mnode);
		rw_exit(&page_ctrs_rwlock[mnode]);
	}
}

/*
 * This is where all policies for moving pages around
 * to different page size free lists are implemented.
 * Returns the page acquired on success, NULL on failure.
 *
 * So far these are the priorities for this algorithm in descending
 * order:
 *
 *	 1) When servicing a request try to do so with a free page
 *	    from next size up. Helps defer fragmentation as long
 *	    as possible.
 *
 *	 2) Page coalesce on demand. Only when a freelist
 *	    larger than PAGESIZE is empty and step 1
 *	    will not work since all larger size lists are
 *	    also empty.
 *
 * If pfnhi is non-zero, search for large page with pfn range less than pfnhi.
 */

page_t *
page_freelist_split(uchar_t szc, uint_t color, int mnode, int mtype,
    pfn_t pfnlo, pfn_t pfnhi, page_list_walker_t *plw)
{
	uchar_t nszc = szc + 1;		/* start one size above the request */
	uint_t	bin, sbin, bin_prev;
	page_t	*pp, *firstpp;
	page_t	*ret_pp = NULL;
	uint_t	color_mask;

	/* already at the largest supported page size - nothing to split */
	if (nszc == mmu_page_sizes)
		return (NULL);

	ASSERT(nszc < mmu_page_sizes);
	color_mask = PAGE_GET_PAGECOLORS(nszc) - 1;
	bin = sbin = PAGE_GET_NSZ_COLOR(szc, color);
	bin_prev = (plw->plw_bin_split_prev == color) ? INVALID_COLOR :
	    PAGE_GET_NSZ_COLOR(szc, plw->plw_bin_split_prev);

	VM_STAT_ADD(vmm_vmstats.pfs_req[szc]);
	/*
	 * First try to break up a larger page to fill current size freelist.
	 */
	while (plw->plw_bins[nszc] != 0) {

		ASSERT(nszc < mmu_page_sizes);

		/*
		 * If page found then demote it.
		 */
		if (PAGE_FREELISTS(mnode, nszc, bin, mtype)) {
			page_freelist_lock(mnode);
			firstpp = pp = PAGE_FREELISTS(mnode, nszc, bin, mtype);

			/*
			 * If pfnhi is not PFNNULL, look for large page below
			 * pfnhi. PFNNULL signifies no pfn requirement.
			 */
			if (pp &&
			    ((pfnhi != PFNNULL && pp->p_pagenum >= pfnhi) ||
			    (pfnlo != PFNNULL && pp->p_pagenum < pfnlo))) {
				/*
				 * Walk the circular freelist (p_vpnext links
				 * the base pages of large pages) for a page
				 * inside the [pfnlo, pfnhi) window.
				 */
				do {
					pp = pp->p_vpnext;
					if (pp == firstpp) {
						pp = NULL;
						break;
					}
				} while ((pfnhi != PFNNULL &&
				    pp->p_pagenum >= pfnhi) ||
				    (pfnlo != PFNNULL &&
				    pp->p_pagenum < pfnlo));

				if (pfnhi != PFNNULL && pp != NULL)
					ASSERT(pp->p_pagenum < pfnhi);

				if (pfnlo != PFNNULL && pp != NULL)
					ASSERT(pp->p_pagenum >= pfnlo);
			}
			if (pp) {
				uint_t ccolor = page_correct_color(szc, nszc,
				    color, bin, plw->plw_ceq_mask[szc]);

				ASSERT(pp->p_szc == nszc);
				VM_STAT_ADD(vmm_vmstats.pfs_demote[nszc]);
				ret_pp = page_demote(mnode, pp->p_pagenum,
				    pfnhi, pp->p_szc, szc, ccolor, PC_ALLOC);
				if (ret_pp) {
					page_freelist_unlock(mnode);
#if defined(__sparc)
					if (PP_ISNORELOC(ret_pp)) {
						pgcnt_t npgs;

						npgs = page_get_pagecnt(
						    ret_pp->p_szc);
						kcage_freemem_sub(npgs);
					}
#endif
					return (ret_pp);
				}
			}
			page_freelist_unlock(mnode);
		}

		/* loop through next size bins */
		bin = ADD_MASKED(bin, 1, plw->plw_ceq_mask[nszc], color_mask);
		plw->plw_bins[nszc]--;

		if (bin == sbin) {
			uchar_t nnszc = nszc + 1;

			/* we are done with this page size - check next */
			if (plw->plw_bins[nnszc] == 0)
				/* we have already checked next size bins */
				break;

			bin = sbin = PAGE_GET_NSZ_COLOR(nszc, bin);
			if (bin_prev != INVALID_COLOR) {
				bin_prev = PAGE_GET_NSZ_COLOR(nszc, bin_prev);
				if (!((bin ^ bin_prev) &
				    plw->plw_ceq_mask[nnszc]))
					break;
			}
			ASSERT(nnszc < mmu_page_sizes);
			color_mask = PAGE_GET_PAGECOLORS(nnszc) - 1;
			nszc = nnszc;
			ASSERT(nszc < mmu_page_sizes);
		}
	}

	return (ret_pp);
}

/*
 * Helper routine used only by the freelist code to lock
 * a page.
 * If the page is a large page then it succeeds in
 * locking all the constituent pages or none at all.
 * Returns 1 on success, 0 on failure.
 */
static int
page_trylock_cons(page_t *pp, se_t se)
{
	page_t	*tpp, *first_pp = pp;

	/*
	 * Fail if can't lock first or only page.
	 */
	if (!page_trylock(pp, se)) {
		return (0);
	}

	/*
	 * PAGESIZE: common case.
	 */
	if (pp->p_szc == 0) {
		return (1);
	}

	/*
	 * Large page case.  Lock every constituent page on the circular
	 * p_next list.
	 */
	tpp = pp->p_next;
	while (tpp != pp) {
		if (!page_trylock(tpp, se)) {
			/*
			 * On failure unlock what we have locked so far.
			 * We want to avoid attempting to capture these
			 * pages as the pcm mutex may be held which could
			 * lead to a recursive mutex panic.
			 */
			while (first_pp != tpp) {
				page_unlock_nocapture(first_pp);
				first_pp = first_pp->p_next;
			}
			return (0);
		}
		tpp = tpp->p_next;
	}
	return (1);
}

/*
 * init context for walking page lists
 * Called when a page of the given szc is unavailable. Sets markers
 * for the beginning of the search to detect when search has
 * completed a full cycle. Sets flags for splitting larger pages
 * and coalescing smaller pages. Page walking proceeds until a page
 * of the desired equivalent color is found.
 */
void
page_list_walk_init(uchar_t szc, uint_t flags, uint_t bin, int can_split,
    int use_ceq, page_list_walker_t *plw)
{
	uint_t	nszc, ceq_mask, colors;
	uchar_t	ceq = use_ceq ? colorequivszc[szc] : 0;

	ASSERT(szc < mmu_page_sizes);
	colors = PAGE_GET_PAGECOLORS(szc);

	plw->plw_colors = colors;
	plw->plw_color_mask = colors - 1;
	plw->plw_bin_marker = plw->plw_bin0 = bin;
	plw->plw_bin_split_prev = bin;
	/* szc 0 steps by vac_colors so the walk preserves vac color */
	plw->plw_bin_step = (szc == 0) ? vac_colors : 1;

	/*
	 * if vac aliasing is possible make sure lower order color
	 * bits are never ignored
	 */
	if (vac_colors > 1)
		ceq &= 0xf0;

	/*
	 * calculate the number of non-equivalent colors and
	 * color equivalency mask
	 */
	plw->plw_ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf));
	ASSERT(szc > 0 || plw->plw_ceq_dif >= vac_colors);
	ASSERT(plw->plw_ceq_dif > 0);
	plw->plw_ceq_mask[szc] = (plw->plw_ceq_dif - 1) << (ceq & 0xf);

	if (flags & PG_MATCH_COLOR) {
		if (cpu_page_colors <  0) {
			/*
			 * this is a heterogeneous machine with different CPUs
			 * having different size e$ (not supported for ni2/rock)
			 */
			uint_t cpucolors = CPUSETSIZE() >> PAGE_GET_SHIFT(szc);
			cpucolors = MAX(cpucolors, 1);
			ceq_mask = plw->plw_color_mask & (cpucolors - 1);
			plw->plw_ceq_mask[szc] =
			    MIN(ceq_mask, plw->plw_ceq_mask[szc]);
		}
		plw->plw_ceq_dif = 1;
	}

	/* we can split pages in the freelist, but not the cachelist */
	if (can_split) {
		plw->plw_do_split = (szc + 1 < mmu_page_sizes) ? 1 : 0;

		/* set next szc color masks and number of free list bins */
		for (nszc = szc + 1; nszc < mmu_page_sizes; nszc++, szc++) {
			plw->plw_ceq_mask[nszc] = PAGE_GET_NSZ_MASK(szc,
			    plw->plw_ceq_mask[szc]);
			plw->plw_bins[nszc] = PAGE_GET_PAGECOLORS(nszc);
		}
		plw->plw_ceq_mask[nszc] = INVALID_MASK;
		plw->plw_bins[nszc] = 0;

	} else {
		ASSERT(szc == 0);
		plw->plw_do_split = 0;
		plw->plw_bins[1] = 0;
		plw->plw_ceq_mask[1] = INVALID_MASK;
	}
}

/*
 * set mark to flag where next split should occur
 */
#define	PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw) {		     \
	uint_t bin_nsz = PAGE_GET_NSZ_COLOR(szc, bin);			     \
	uint_t bin0_nsz = PAGE_GET_NSZ_COLOR(szc, plw->plw_bin0);	     \
	uint_t neq_mask = ~plw->plw_ceq_mask[nszc] & plw->plw_color_mask;    \
	plw->plw_split_next =						     \
		INC_MASKED(bin_nsz, neq_mask, plw->plw_color_mask);	     \
	if (!((plw->plw_split_next ^ bin0_nsz) & plw->plw_ceq_mask[nszc])) { \
		plw->plw_split_next =					     \
		INC_MASKED(plw->plw_split_next,				     \
		    neq_mask, plw->plw_color_mask);			     \
	}								     \
}

/*
 * Advance the page list walk to the next bin to try, updating the walker's
 * split/marker state.  Returns the next bin of equivalent color.
 */
uint_t
page_list_walk_next_bin(uchar_t szc, uint_t bin, page_list_walker_t *plw)
{
	uint_t	neq_mask = ~plw->plw_ceq_mask[szc] & plw->plw_color_mask;
	uint_t	bin0_nsz, nbin_nsz, nbin0, nbin;
	uchar_t	nszc = szc + 1;

	nbin = ADD_MASKED(bin,
	    plw->plw_bin_step, neq_mask, plw->plw_color_mask);

	if (plw->plw_do_split) {
		plw->plw_bin_split_prev = bin;
		PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw);
		plw->plw_do_split = 0;
	}

	if (szc == 0) {
		if (plw->plw_count != 0 || plw->plw_ceq_dif == vac_colors) {
			/* skip over the starting bin if we come back to it */
			if (nbin == plw->plw_bin0 &&
			    (vac_colors == 1 || nbin != plw->plw_bin_marker)) {
				nbin = ADD_MASKED(nbin, plw->plw_bin_step,
				    neq_mask, plw->plw_color_mask);
				plw->plw_bin_split_prev = plw->plw_bin0;
			}

			if (vac_colors > 1 && nbin == plw->plw_bin_marker) {
				plw->plw_bin_marker =
				    nbin = INC_MASKED(nbin, neq_mask,
				    plw->plw_color_mask);
				plw->plw_bin_split_prev = plw->plw_bin0;
				/*
				 * large pages all have the same vac color
				 * so by now we should be done with next
				 * size page splitting process
				 */
				ASSERT(plw->plw_bins[1] == 0);
				plw->plw_do_split = 0;
				return (nbin);
			}

		} else {
			uint_t bin_jump = (vac_colors == 1) ?
			    (BIN_STEP & ~3) - (plw->plw_bin0 & 3) : BIN_STEP;

			bin_jump &= ~(vac_colors - 1);

			nbin0 = ADD_MASKED(plw->plw_bin0, bin_jump, neq_mask,
			    plw->plw_color_mask);

			if ((nbin0 ^ plw->plw_bin0) & plw->plw_ceq_mask[szc]) {

				plw->plw_bin_marker = nbin = nbin0;

				if (plw->plw_bins[nszc] != 0) {
					/*
					 * check if next page size bin is the
					 * same as the next page size bin for
					 * bin0
					 */
					nbin_nsz = PAGE_GET_NSZ_COLOR(szc,
					    nbin);
					bin0_nsz = PAGE_GET_NSZ_COLOR(szc,
					    plw->plw_bin0);

					if ((bin0_nsz ^ nbin_nsz) &
					    plw->plw_ceq_mask[nszc])
						plw->plw_do_split = 1;
				}
				return (nbin);
			}
		}
	}

	if (plw->plw_bins[nszc] != 0) {
		nbin_nsz = PAGE_GET_NSZ_COLOR(szc, nbin);
		if (!((plw->plw_split_next ^ nbin_nsz) &
		    plw->plw_ceq_mask[nszc]))
			plw->plw_do_split = 1;
	}

	return (nbin);
}

/*
 * Allocate a page of size 'szc' and color 'bin' from the freelists of
 * memory node 'mnode', walking equivalent-color bins and, if necessary,
 * splitting larger pages or coalescing smaller ones.  Returns the page
 * locked SE_EXCL, or NULL if nothing could be found.
 */
page_t *
page_get_mnode_freelist(int mnode, uint_t bin, int mtype, uchar_t szc,
    uint_t flags)
{
	kmutex_t		*pcm;
	page_t			*pp, *first_pp;
	uint_t			sbin;
	int			plw_initialized;
	page_list_walker_t	plw;

	ASSERT(szc < mmu_page_sizes);

	VM_STAT_ADD(vmm_vmstats.pgmf_alloc[szc]);

	MTYPE_START(mnode, mtype, flags);
	if (mtype < 0) {	/* mnode does not have memory in mtype range */
		VM_STAT_ADD(vmm_vmstats.pgmf_allocempty[szc]);
		return (NULL);
	}
try_again:

	plw_initialized = 0;
	plw.plw_ceq_dif = 1;

	/*
	 * Only hold one freelist lock at a time, that way we
	 * can start anywhere and not have to worry about lock
	 * ordering.
	 */
	for (plw.plw_count = 0;
	    plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) {
		sbin = bin;
		do {
			if (!PAGE_FREELISTS(mnode, szc, bin, mtype))
				goto bin_empty_1;

			pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
			mutex_enter(pcm);
			/* re-check under the bin mutex */
			pp = PAGE_FREELISTS(mnode, szc, bin, mtype);
			if (pp == NULL)
				goto bin_empty_0;

			/*
			 * These were set before the page
			 * was put on the free list,
			 * they must still be set.
			 */
			ASSERT(PP_ISFREE(pp));
			ASSERT(PP_ISAGED(pp));
			ASSERT(pp->p_vnode == NULL);
			ASSERT(pp->p_hash == NULL);
			ASSERT(pp->p_offset == (u_offset_t)-1);
			ASSERT(pp->p_szc == szc);
			ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);

			/*
			 * Walk down the hash chain.
			 * 8k pages are linked on p_next
			 * and p_prev fields. Large pages
			 * are a contiguous group of
			 * constituent pages linked together
			 * on their p_next and p_prev fields.
			 * The large pages are linked together
			 * on the hash chain using p_vpnext
			 * p_vpprev of the base constituent
			 * page of each large page.
			 */
			first_pp = pp;
			while (IS_DUMP_PAGE(pp) || !page_trylock_cons(pp,
			    SE_EXCL)) {
				if (szc == 0) {
					pp = pp->p_next;
				} else {
					pp = pp->p_vpnext;
				}

				ASSERT(PP_ISFREE(pp));
				ASSERT(PP_ISAGED(pp));
				ASSERT(pp->p_vnode == NULL);
				ASSERT(pp->p_hash == NULL);
				ASSERT(pp->p_offset == (u_offset_t)-1);
				ASSERT(pp->p_szc == szc);
				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);

				/* wrapped all the way around - bin exhausted */
				if (pp == first_pp)
					goto bin_empty_0;
			}

			ASSERT(pp != NULL);
			ASSERT(mtype == PP_2_MTYPE(pp));
			ASSERT(pp->p_szc == szc);
			if (szc == 0) {
				page_sub(&PAGE_FREELISTS(mnode,
				    szc, bin, mtype), pp);
			} else {
				page_vpsub(&PAGE_FREELISTS(mnode,
				    szc, bin, mtype), pp);
				CHK_LPG(pp, szc);
			}
			page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);

			if ((PP_ISFREE(pp) == 0) || (PP_ISAGED(pp) == 0))
				panic("free page is not. pp %p", (void *)pp);
			mutex_exit(pcm);

#if defined(__sparc)
			ASSERT(!kcage_on || PP_ISNORELOC(pp) ||
			    (flags & PG_NORELOC) == 0);

			if (PP_ISNORELOC(pp))
				kcage_freemem_sub(page_get_pagecnt(szc));
#endif
			VM_STAT_ADD(vmm_vmstats.pgmf_allocok[szc]);
			return (pp);

bin_empty_0:
			mutex_exit(pcm);
bin_empty_1:
			if (plw_initialized == 0) {
				page_list_walk_init(szc, flags, bin, 1, 1,
				    &plw);
				plw_initialized = 1;
				ASSERT(plw.plw_colors <=
				    PAGE_GET_PAGECOLORS(szc));
				ASSERT(plw.plw_colors > 0);
				ASSERT((plw.plw_colors &
				    (plw.plw_colors - 1)) == 0);
				ASSERT(bin < plw.plw_colors);
				ASSERT(plw.plw_ceq_mask[szc] < plw.plw_colors);
			}
			/* calculate the next bin with equivalent color */
			bin = ADD_MASKED(bin, plw.plw_bin_step,
			    plw.plw_ceq_mask[szc], plw.plw_color_mask);
		} while (sbin != bin);

		/*
		 * color bins are all empty if color match. Try and
		 * satisfy the request by breaking up or coalescing
		 * pages from a different size freelist of the correct
		 * color that satisfies the ORIGINAL color requested.
		 * If that fails then try pages of the same size but
		 * different colors assuming we are not called with
		 * PG_MATCH_COLOR.
		 */
		if (plw.plw_do_split &&
		    (pp = page_freelist_split(szc, bin, mnode,
		    mtype, PFNNULL, PFNNULL, &plw)) != NULL)
			return (pp);

		if (szc > 0 && (pp = page_freelist_coalesce(mnode, szc,
		    bin, plw.plw_ceq_mask[szc], mtype, PFNNULL)) != NULL)
			return (pp);

		if (plw.plw_ceq_dif > 1)
			bin = page_list_walk_next_bin(szc, bin, &plw);
	}

	/* if allowed, cycle through additional mtypes */
	MTYPE_NEXT(mnode, mtype, flags);
	if (mtype >= 0)
		goto try_again;

	VM_STAT_ADD(vmm_vmstats.pgmf_allocfailed[szc]);

	return (NULL);
}

/*
 * Returns the count of free pages for 'pp' with size code 'szc'.
 * Note: This function does not return an exact value as the page freelist
 * locks are not held and thus the values in the page_counters may be
 * changing as we walk through the data.
 */
static int
page_freecnt(int mnode, page_t *pp, uchar_t szc)
{
	pgcnt_t	pgfree;
	pgcnt_t cnt;
	ssize_t	r = szc;	/* region size */
	ssize_t	idx;
	int	i;
	int	full, range;

	/* Make sure pagenum passed in is aligned properly */
	ASSERT((pp->p_pagenum & (PNUM_SIZE(szc) - 1)) == 0);
	ASSERT(szc > 0);

	/* Prevent page_counters dynamic memory from being freed */
	rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
	idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum);
	cnt = PAGE_COUNTERS(mnode, r, idx);
	pgfree = cnt << PNUM_SHIFT(r - 1);
	range = FULL_REGION_CNT(szc);

	/* Check for completely full region */
	if (cnt == range) {
		rw_exit(&page_ctrs_rwlock[mnode]);
		return (pgfree);
	}

	/*
	 * Walk down through the smaller region sizes, adding in free pages
	 * not already counted at a larger size.
	 */
	while (--r > 0) {
		idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum);
		full = FULL_REGION_CNT(r);
		for (i = 0; i < range; i++, idx++) {
			cnt = PAGE_COUNTERS(mnode, r, idx);
			/*
			 * If cnt here is full, that means we have already
			 * accounted for these pages earlier.
			 */
			if (cnt != full) {
				pgfree += (cnt << PNUM_SHIFT(r - 1));
			}
		}
		range *= full;
	}
	rw_exit(&page_ctrs_rwlock[mnode]);
	return (pgfree);
}

/*
 * Called from page_geti_contig_pages to exclusively lock constituent pages
 * starting from 'spp' for page size code 'szc'.
 *
 * If 'ptcpthreshold' is set, the number of free pages needed in the 'szc'
 * region needs to be greater than or equal to the threshold.
 *
 * Returns 1 with all pgcnt pages locked SE_EXCL, 0 with none locked.
 */
static int
page_trylock_contig_pages(int mnode, page_t *spp, uchar_t szc, int flags)
{
	pgcnt_t	pgcnt = PNUM_SIZE(szc);
	pgcnt_t pgfree, i;
	page_t *pp;

	VM_STAT_ADD(vmm_vmstats.ptcp[szc]);


	/* high-priority callers skip the free-count heuristic entirely */
	if ((ptcpthreshold == 0) || (flags & PGI_PGCPHIPRI))
		goto skipptcpcheck;
	/*
	 * check if there are sufficient free pages available before attempting
	 * to trylock. Count is approximate as page counters can change.
	 */
	pgfree = page_freecnt(mnode, spp, szc);

	/* attempt to trylock if there are sufficient already free pages */
	if (pgfree < pgcnt/ptcpthreshold) {
		VM_STAT_ADD(vmm_vmstats.ptcpfreethresh[szc]);
		return (0);
	}

skipptcpcheck:

	for (i = 0; i < pgcnt; i++) {
		pp = &spp[i];
		if (!page_trylock(pp, SE_EXCL)) {
			VM_STAT_ADD(vmm_vmstats.ptcpfailexcl[szc]);
			/* back out: unlock everything locked so far */
			while (--i != (pgcnt_t)-1) {
				pp = &spp[i];
				ASSERT(PAGE_EXCL(pp));
				page_unlock_nocapture(pp);
			}
			return (0);
		}
		ASSERT(spp[i].p_pagenum == spp->p_pagenum + i);
		/*
		 * An in-use page at or above the target size cannot be
		 * claimed; this can only happen on the first constituent.
		 */
		if ((pp->p_szc > szc || (szc && pp->p_szc == szc)) &&
		    !PP_ISFREE(pp)) {
			VM_STAT_ADD(vmm_vmstats.ptcpfailszc[szc]);
			ASSERT(i == 0);
			page_unlock_nocapture(pp);
			return (0);
		}

		/*
		 * If a page has been marked non-relocatable or has been
		 * explicitly locked in memory, we don't want to relocate it;
		 * unlock the pages and fail the operation.
		 */
		if (PP_ISNORELOC(pp) ||
		    pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
			VM_STAT_ADD(vmm_vmstats.ptcpfailcage[szc]);
			/* back out: unlock this page and all earlier ones */
			while (i != (pgcnt_t)-1) {
				pp = &spp[i];
				ASSERT(PAGE_EXCL(pp));
				page_unlock_nocapture(pp);
				i--;
			}
			return (0);
		}
	}
	VM_STAT_ADD(vmm_vmstats.ptcpok[szc]);
	return (1);
}

/*
 * Claim large page pointed to by 'pp'. 'pp' is the starting set
 * of 'szc' constituent pages that had been locked exclusively previously.
 * Will attempt to relocate constituent pages in use.
 *
 * Returns the assembled large page (as a freelist of szc pages) on success,
 * or NULL with all the constituent pages unlocked/freed on failure.
 */
static page_t *
page_claim_contig_pages(page_t *pp, uchar_t szc, int flags)
{
	spgcnt_t	pgcnt, npgs, i;
	page_t		*targpp, *rpp, *hpp;
	page_t		*replpp = NULL;
	page_t		*pplist = NULL;

	ASSERT(pp != NULL);

	pgcnt = page_get_pagecnt(szc);
	while (pgcnt) {
		ASSERT(PAGE_EXCL(pp));
		ASSERT(!PP_ISNORELOC(pp));
		if (PP_ISFREE(pp)) {
			/*
			 * If this is a PG_FREE_LIST page then its
			 * size code can change underneath us due to
			 * page promotion or demotion. As an optimization
			 * use page_list_sub_pages() instead of
			 * page_list_sub().
			 */
			if (PP_ISAGED(pp)) {
				page_list_sub_pages(pp, szc);
				if (pp->p_szc == szc) {
					/* already a whole szc page - done */
					return (pp);
				}
				ASSERT(pp->p_szc < szc);
				npgs = page_get_pagecnt(pp->p_szc);
				hpp = pp;
				for (i = 0; i < npgs; i++, pp++) {
					pp->p_szc = szc;
				}
				page_list_concat(&pplist, &hpp);
				pgcnt -= npgs;
				continue;
			}
			/* free page on the cachelist: hash it out and claim */
			ASSERT(!PP_ISAGED(pp));
			ASSERT(pp->p_szc == 0);
			page_list_sub(pp, PG_CACHE_LIST);
			page_hashout(pp, NULL);
			PP_SETAGED(pp);
			pp->p_szc = szc;
			page_list_concat(&pplist, &pp);
			pp++;
			pgcnt--;
			continue;
		}
		npgs = page_get_pagecnt(pp->p_szc);

		/*
		 * page_create_wait freemem accounting done by caller of
		 * page_get_freelist and not necessary to call it prior to
		 * calling page_get_replacement_page.
		 *
		 * page_get_replacement_page can call page_get_contig_pages
		 * to acquire a large page (szc > 0); the replacement must be
		 * smaller than the contig page size to avoid looping or
		 * szc == 0 and PGI_PGCPSZC0 is set.
		 */
		if (pp->p_szc < szc || (szc == 0 && (flags & PGI_PGCPSZC0))) {
			replpp = page_get_replacement_page(pp, NULL, 0);
			if (replpp) {
				npgs = page_get_pagecnt(pp->p_szc);
				ASSERT(npgs <= pgcnt);
				targpp = pp;
			}
		}

		/*
		 * If replacement is NULL or do_page_relocate fails, fail
		 * coalescing of pages.
		 */
		if (replpp == NULL || (do_page_relocate(&targpp, &replpp, 0,
		    &npgs, NULL) != 0)) {
			/*
			 * Unlock un-processed target list
			 */
			while (pgcnt--) {
				ASSERT(PAGE_EXCL(pp));
				page_unlock_nocapture(pp);
				pp++;
			}
			/*
			 * Free the processed target list.
			 */
			while (pplist) {
				pp = pplist;
				page_sub(&pplist, pp);
				ASSERT(PAGE_EXCL(pp));
				ASSERT(pp->p_szc == szc);
				ASSERT(PP_ISFREE(pp));
				ASSERT(PP_ISAGED(pp));
				pp->p_szc = 0;
				page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
				page_unlock_nocapture(pp);
			}

			if (replpp != NULL)
				page_free_replacement_page(replpp);

			return (NULL);
		}
		ASSERT(pp == targpp);

		/* LINTED */
		ASSERT(hpp = pp); /* That's right, it's an assignment */

		pp += npgs;
		pgcnt -= npgs;

		/*
		 * Relocation succeeded: mark the relocated targets free and
		 * part of the new large page, and release each replacement.
		 */
		while (npgs--) {
			ASSERT(PAGE_EXCL(targpp));
			ASSERT(!PP_ISFREE(targpp));
			ASSERT(!PP_ISNORELOC(targpp));
			PP_SETFREE(targpp);
			ASSERT(PP_ISAGED(targpp));
			ASSERT(targpp->p_szc < szc || (szc == 0 &&
			    (flags & PGI_PGCPSZC0)));
			targpp->p_szc = szc;
			targpp = targpp->p_next;

			rpp = replpp;
			ASSERT(rpp != NULL);
			page_sub(&replpp, rpp);
			ASSERT(PAGE_EXCL(rpp));
			ASSERT(!PP_ISFREE(rpp));
			page_unlock_nocapture(rpp);
		}
		ASSERT(targpp == hpp);
		ASSERT(replpp == NULL);
		page_list_concat(&pplist, &targpp);
	}
	CHK_LPG(pplist, szc);
	return (pplist);
}

/*
 * Trim kernel cage from pfnlo-pfnhi and store result in lo-hi.
Return code 3354 * of 0 means nothing left after trim. 3355 */ 3356 int 3357 trimkcage(struct memseg *mseg, pfn_t *lo, pfn_t *hi, pfn_t pfnlo, pfn_t pfnhi) 3358 { 3359 pfn_t kcagepfn; 3360 int decr; 3361 int rc = 0; 3362 3363 if (PP_ISNORELOC(mseg->pages)) { 3364 if (PP_ISNORELOC(mseg->epages - 1) == 0) { 3365 3366 /* lower part of this mseg inside kernel cage */ 3367 decr = kcage_current_pfn(&kcagepfn); 3368 3369 /* kernel cage may have transitioned past mseg */ 3370 if (kcagepfn >= mseg->pages_base && 3371 kcagepfn < mseg->pages_end) { 3372 ASSERT(decr == 0); 3373 *lo = MAX(kcagepfn, pfnlo); 3374 *hi = MIN(pfnhi, (mseg->pages_end - 1)); 3375 rc = 1; 3376 } 3377 } 3378 /* else entire mseg in the cage */ 3379 } else { 3380 if (PP_ISNORELOC(mseg->epages - 1)) { 3381 3382 /* upper part of this mseg inside kernel cage */ 3383 decr = kcage_current_pfn(&kcagepfn); 3384 3385 /* kernel cage may have transitioned past mseg */ 3386 if (kcagepfn >= mseg->pages_base && 3387 kcagepfn < mseg->pages_end) { 3388 ASSERT(decr); 3389 *hi = MIN(kcagepfn, pfnhi); 3390 *lo = MAX(pfnlo, mseg->pages_base); 3391 rc = 1; 3392 } 3393 } else { 3394 /* entire mseg outside of kernel cage */ 3395 *lo = MAX(pfnlo, mseg->pages_base); 3396 *hi = MIN(pfnhi, (mseg->pages_end - 1)); 3397 rc = 1; 3398 } 3399 } 3400 return (rc); 3401 } 3402 3403 /* 3404 * called from page_get_contig_pages to search 'pfnlo' thru 'pfnhi' to claim a 3405 * page with size code 'szc'. Claiming such a page requires acquiring 3406 * exclusive locks on all constituent pages (page_trylock_contig_pages), 3407 * relocating pages in use and concatenating these constituent pages into a 3408 * large page. 3409 * 3410 * The page lists do not have such a large page and page_freelist_split has 3411 * already failed to demote larger pages and/or coalesce smaller free pages. 3412 * 3413 * 'flags' may specify PG_COLOR_MATCH which would limit the search of large 3414 * pages with the same color as 'bin'. 
 *
 * 'pfnflag' specifies the subset of the pfn range to search.
 */

static page_t *
page_geti_contig_pages(int mnode, uint_t bin, uchar_t szc, int flags,
    pfn_t pfnlo, pfn_t pfnhi, pgcnt_t pfnflag)
{
	struct memseg *mseg;
	pgcnt_t szcpgcnt = page_get_pagecnt(szc);
	pgcnt_t szcpgmask = szcpgcnt - 1;
	pfn_t randpfn;
	page_t *pp, *randpp, *endpp;
	uint_t colors, ceq_mask;
	/* LINTED : set but not used in function */
	uint_t color_mask;
	pfn_t hi, lo;
	uint_t skip;
	MEM_NODE_ITERATOR_DECL(it);

	ASSERT(szc != 0 || (flags & PGI_PGCPSZC0));

	/* a candidate large page must start on an szc boundary */
	pfnlo = P2ROUNDUP(pfnlo, szcpgcnt);

	if ((pfnhi - pfnlo) + 1 < szcpgcnt || pfnlo >= pfnhi)
		return (NULL);

	ASSERT(szc < mmu_page_sizes);

	colors = PAGE_GET_PAGECOLORS(szc);
	color_mask = colors - 1;
	if ((colors > 1) && (flags & PG_MATCH_COLOR)) {
		/*
		 * colorequivszc encodes high/low color bits to ignore;
		 * derive the equivalency mask from it.
		 */
		uchar_t ceq = colorequivszc[szc];
		uint_t ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf));

		ASSERT(ceq_dif > 0);
		ceq_mask = (ceq_dif - 1) << (ceq & 0xf);
	} else {
		ceq_mask = 0;
	}

	ASSERT(bin < colors);

	/* clear "non-significant" color bits */
	bin &= ceq_mask;

	/*
	 * trim the pfn range to search based on pfnflag. pfnflag is set
	 * when there have been previous page_get_contig_page failures to
	 * limit the search.
	 *
	 * The high bit in pfnflag specifies the number of 'slots' in the
	 * pfn range and the remainder of pfnflag specifies which slot.
	 * For example, a value of 1010b would mean the second slot of
	 * the pfn range that has been divided into 8 slots.
	 */
	if (pfnflag > 1) {
		int slots = 1 << (highbit(pfnflag) - 1);
		int slotid = pfnflag & (slots - 1);
		pgcnt_t szcpages;
		int slotlen;

		pfnhi = P2ALIGN((pfnhi + 1), szcpgcnt) - 1;
		szcpages = ((pfnhi - pfnlo) + 1) / szcpgcnt;
		slotlen = howmany(szcpages, slots);
		/* skip if 'slotid' slot is empty */
		if (slotid * slotlen >= szcpages)
			return (NULL);
		pfnlo = pfnlo + (((slotid * slotlen) % szcpages) * szcpgcnt);
		ASSERT(pfnlo < pfnhi);
		if (pfnhi > pfnlo + (slotlen * szcpgcnt))
			pfnhi = pfnlo + (slotlen * szcpgcnt) - 1;
	}

	/*
	 * This routine can be called recursively so we shouldn't
	 * acquire a reader lock if a write request is pending. This
	 * could lead to a deadlock with the DR thread.
	 *
	 * Returning NULL informs the caller that we could not get
	 * a contig page with the required characteristics.
	 */

	if (!memsegs_trylock(0))
		return (NULL);

	/*
	 * loop through memsegs to look for contig page candidates
	 */

	for (mseg = memsegs; mseg != NULL; mseg = mseg->next) {
		if (pfnhi < mseg->pages_base || pfnlo >= mseg->pages_end) {
			/* no overlap */
			continue;
		}

		if (mseg->pages_end - mseg->pages_base < szcpgcnt)
			/* mseg too small */
			continue;

		/*
		 * trim off kernel cage pages from pfn range and check for
		 * a trimmed pfn range returned that does not span the
		 * desired large page size.
		 */
		if (kcage_on) {
			if (trimkcage(mseg, &lo, &hi, pfnlo, pfnhi) == 0 ||
			    lo >= hi || ((hi - lo) + 1) < szcpgcnt)
				continue;
		} else {
			lo = MAX(pfnlo, mseg->pages_base);
			hi = MIN(pfnhi, (mseg->pages_end - 1));
		}

		/* round to szcpgcnt boundaries */
		lo = P2ROUNDUP(lo, szcpgcnt);

		MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it);
		hi = P2ALIGN((hi + 1), szcpgcnt) - 1;

		if (hi <= lo)
			continue;

		/*
		 * set lo to point to the pfn for the desired bin. Large
		 * page sizes may only have a single page color
		 */
		skip = szcpgcnt;
		if (ceq_mask > 0 || interleaved_mnodes) {
			/* set lo to point at appropriate color */
			if (((PFN_2_COLOR(lo, szc, &it) ^ bin) & ceq_mask) ||
			    (interleaved_mnodes &&
			    PFN_2_MEM_NODE(lo) != mnode)) {
				PAGE_NEXT_PFN_FOR_COLOR(lo, szc, bin, ceq_mask,
				    color_mask, &it);
			}
			if (hi <= lo)
				/* mseg cannot satisfy color request */
				continue;
		}

		/* randomly choose a point between lo and hi to begin search */

		randpfn = (pfn_t)GETTICK();
		randpfn = ((randpfn % (hi - lo)) + lo) & ~(skip - 1);
		MEM_NODE_ITERATOR_INIT(randpfn, mnode, szc, &it);
		if (ceq_mask || interleaved_mnodes || randpfn == (pfn_t)-1) {
			if (randpfn != (pfn_t)-1) {
				PAGE_NEXT_PFN_FOR_COLOR(randpfn, szc, bin,
				    ceq_mask, color_mask, &it);
			}
			if (randpfn >= hi) {
				randpfn = lo;
				MEM_NODE_ITERATOR_INIT(randpfn, mnode, szc,
				    &it);
			}
		}
		randpp = mseg->pages + (randpfn - mseg->pages_base);

		ASSERT(randpp->p_pagenum == randpfn);

		pp = randpp;
		endpp = mseg->pages + (hi - mseg->pages_base) + 1;

		ASSERT(randpp + szcpgcnt <= endpp);

		/* circular scan from the random start back to itself */
		do {
			ASSERT(!(pp->p_pagenum & szcpgmask));
			ASSERT(((PP_2_BIN(pp) ^ bin) & ceq_mask) == 0);

			if (page_trylock_contig_pages(mnode, pp, szc, flags)) {
				/* pages unlocked by page_claim on failure */
				if (page_claim_contig_pages(pp, szc, flags)) {
					memsegs_unlock(0);
					return (pp);
				}
			}

			if (ceq_mask == 0 && !interleaved_mnodes) {
				pp += skip;
			} else {
				pfn_t pfn = pp->p_pagenum;

				PAGE_NEXT_PFN_FOR_COLOR(pfn, szc, bin,
				    ceq_mask, color_mask, &it);
				if (pfn == (pfn_t)-1) {
					pp = endpp;
				} else {
					pp = mseg->pages +
					    (pfn - mseg->pages_base);
				}
			}
			if (pp >= endpp) {
				/* start from the beginning */
				MEM_NODE_ITERATOR_INIT(lo, mnode, szc, &it);
				pp = mseg->pages + (lo - mseg->pages_base);
				ASSERT(pp->p_pagenum == lo);
				ASSERT(pp + szcpgcnt <= endpp);
			}
		} while (pp != randpp);
	}
	memsegs_unlock(0);
	return (NULL);
}


/*
 * controlling routine that searches through physical memory in an attempt to
 * claim a large page based on the input parameters that was not found
 * on the page free lists.
 *
 * calls page_geti_contig_pages with an initial pfn range from the mnode
 * and mtype. page_geti_contig_pages will trim off the parts of the pfn range
 * that overlaps with the kernel cage or does not match the requested page
 * color if PG_MATCH_COLOR is set. Since this search is very expensive,
 * page_geti_contig_pages may further limit the search range based on
 * previous failure counts (pgcpfailcnt[]).
 *
 * for PGI_PGCPSZC0 requests, page_get_contig_pages will relocate a base
 * pagesize page that satisfies mtype.
3635 */ 3636 page_t * 3637 page_get_contig_pages(int mnode, uint_t bin, int mtype, uchar_t szc, 3638 uint_t flags) 3639 { 3640 pfn_t pfnlo, pfnhi; /* contig pages pfn range */ 3641 page_t *pp; 3642 pgcnt_t pfnflag = 0; /* no limit on search if 0 */ 3643 3644 VM_STAT_ADD(vmm_vmstats.pgcp_alloc[szc]); 3645 3646 /* no allocations from cage */ 3647 flags |= PGI_NOCAGE; 3648 3649 /* LINTED */ 3650 MTYPE_START(mnode, mtype, flags); 3651 if (mtype < 0) { /* mnode does not have memory in mtype range */ 3652 VM_STAT_ADD(vmm_vmstats.pgcp_allocempty[szc]); 3653 return (NULL); 3654 } 3655 3656 ASSERT(szc > 0 || (flags & PGI_PGCPSZC0)); 3657 3658 /* do not limit search and ignore color if hi pri */ 3659 3660 if (pgcplimitsearch && ((flags & PGI_PGCPHIPRI) == 0)) 3661 pfnflag = pgcpfailcnt[szc]; 3662 3663 /* remove color match to improve chances */ 3664 3665 if (flags & PGI_PGCPHIPRI || pfnflag) 3666 flags &= ~PG_MATCH_COLOR; 3667 3668 do { 3669 /* get pfn range based on mnode and mtype */ 3670 MNODETYPE_2_PFN(mnode, mtype, pfnlo, pfnhi); 3671 3672 ASSERT(pfnhi >= pfnlo); 3673 3674 pp = page_geti_contig_pages(mnode, bin, szc, flags, 3675 pfnlo, pfnhi, pfnflag); 3676 3677 if (pp != NULL) { 3678 pfnflag = pgcpfailcnt[szc]; 3679 if (pfnflag) { 3680 /* double the search size */ 3681 pgcpfailcnt[szc] = pfnflag >> 1; 3682 } 3683 VM_STAT_ADD(vmm_vmstats.pgcp_allocok[szc]); 3684 return (pp); 3685 } 3686 MTYPE_NEXT(mnode, mtype, flags); 3687 } while (mtype >= 0); 3688 3689 VM_STAT_ADD(vmm_vmstats.pgcp_allocfailed[szc]); 3690 return (NULL); 3691 } 3692 3693 #if defined(__i386) || defined(__amd64) 3694 /* 3695 * Determine the likelihood of finding/coalescing a szc page. 3696 * Return 0 if the likelihood is small otherwise return 1. 3697 * 3698 * For now, be conservative and check only 1g pages and return 0 3699 * if there had been previous coalescing failures and the szc pages 3700 * needed to satisfy request would exhaust most of freemem. 
 */
int
page_chk_freelist(uint_t szc)
{
	pgcnt_t pgcnt;

	/* small page sizes are always considered likely */
	if (szc <= 1)
		return (1);

	pgcnt = page_get_pagecnt(szc);
	if (pgcpfailcnt[szc] && pgcnt + throttlefree >= freemem) {
		VM_STAT_ADD(vmm_vmstats.pcf_deny[szc]);
		return (0);
	}
	VM_STAT_ADD(vmm_vmstats.pcf_allow[szc]);
	return (1);
}
#endif

/*
 * Find the `best' page on the freelist for this (vp,off) (as,vaddr) pair.
 *
 * Does its own locking and accounting.
 * If PG_MATCH_COLOR is set, then NULL will be returned if there are no
 * pages of the proper color even if there are pages of a different color.
 *
 * Finds a page, removes it, THEN locks it.
 */

/*ARGSUSED*/
page_t *
page_get_freelist(struct vnode *vp, u_offset_t off, struct seg *seg,
    caddr_t vaddr, size_t size, uint_t flags, struct lgrp *lgrp)
{
	struct as *as = seg->s_as;
	page_t *pp = NULL;
	ulong_t bin;
	uchar_t szc;
	int mnode;
	int mtype;
	page_t *(*page_get_func)(int, uint_t, int, uchar_t, uint_t);
	lgrp_mnode_cookie_t lgrp_cookie;

	/* start with the cheap per-mnode freelist search */
	page_get_func = page_get_mnode_freelist;

	/*
	 * If we aren't passed a specific lgroup, or passed a freed lgrp
	 * assume we wish to allocate near to the current thread's home.
	 */
	if (!LGRP_EXISTS(lgrp))
		lgrp = lgrp_home_lgrp();

	if (kcage_on) {
		if ((flags & (PG_NORELOC | PG_PANIC)) == PG_NORELOC &&
		    kcage_freemem < kcage_throttlefree + btop(size) &&
		    curthread != kcage_cageout_thread) {
			/*
			 * Set a "reserve" of kcage_throttlefree pages for
			 * PG_PANIC and cageout thread allocations.
			 *
			 * Everybody else has to serialize in
			 * page_create_get_something() to get a cage page, so
			 * that we don't deadlock cageout!
			 */
			return (NULL);
		}
	} else {
		/* no cage: relocatable requests only */
		flags &= ~PG_NORELOC;
		flags |= PGI_NOCAGE;
	}

	/* LINTED */
	MTYPE_INIT(mtype, vp, vaddr, flags, size);

	/*
	 * Convert size to page size code.
	 */
	if ((szc = page_szc(size)) == (uchar_t)-1)
		panic("page_get_freelist: illegal page size request");
	ASSERT(szc < mmu_page_sizes);

	VM_STAT_ADD(vmm_vmstats.pgf_alloc[szc]);

	/* LINTED */
	AS_2_BIN(as, seg, vp, vaddr, bin, szc);

	ASSERT(bin < PAGE_GET_PAGECOLORS(szc));

	/*
	 * Try to get a local page first, but try remote if we can't
	 * get a page of the right color.
	 */
pgretry:
	LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL);
	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
		pp = page_get_func(mnode, bin, mtype, szc, flags);
		if (pp != NULL) {
			VM_STAT_ADD(vmm_vmstats.pgf_allocok[szc]);
			DTRACE_PROBE4(page__get,
			    lgrp_t *, lgrp,
			    int, mnode,
			    ulong_t, bin,
			    uint_t, flags);
			return (pp);
		}
	}
	ASSERT(pp == NULL);

	/*
	 * for non-SZC0 PAGESIZE requests, check cachelist before checking
	 * remote free lists. Caller expected to call page_get_cachelist which
	 * will check local cache lists and remote free lists.
	 */
	if (szc == 0 && ((flags & PGI_PGCPSZC0) == 0)) {
		VM_STAT_ADD(vmm_vmstats.pgf_allocdeferred);
		return (NULL);
	}

	ASSERT(szc > 0 || (flags & PGI_PGCPSZC0));

	lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);

	if (!(flags & PG_LOCAL)) {
		/*
		 * Try to get a non-local freelist page.
		 */
		LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie);
		while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
			pp = page_get_func(mnode, bin, mtype, szc, flags);
			if (pp != NULL) {
				DTRACE_PROBE4(page__get,
				    lgrp_t *, lgrp,
				    int, mnode,
				    ulong_t, bin,
				    uint_t, flags);
				VM_STAT_ADD(vmm_vmstats.pgf_allocokrem[szc]);
				return (pp);
			}
		}
		ASSERT(pp == NULL);
	}

	/*
	 * when the cage is off chances are page_get_contig_pages() will fail
	 * to lock a large page chunk therefore when the cage is off it's not
	 * called by default. this can be changed via /etc/system.
	 *
	 * page_get_contig_pages() also called to acquire a base pagesize page
	 * for page_create_get_something().
	 */
	if (!(flags & PG_NORELOC) && (pg_contig_disable == 0) &&
	    (kcage_on || pg_lpgcreate_nocage || szc == 0) &&
	    (page_get_func != page_get_contig_pages)) {

		/* retry the whole search with the expensive path */
		VM_STAT_ADD(vmm_vmstats.pgf_allocretry[szc]);
		page_get_func = page_get_contig_pages;
		goto pgretry;
	}

	if (!(flags & PG_LOCAL) && pgcplimitsearch &&
	    page_get_func == page_get_contig_pages)
		SETPGCPFAILCNT(szc);

	VM_STAT_ADD(vmm_vmstats.pgf_allocfailed[szc]);
	return (NULL);
}

/*
 * Find the `best' page on the cachelist for this (vp,off) (as,vaddr) pair.
 *
 * Does its own locking.
 * If PG_MATCH_COLOR is set, then NULL will be returned if there are no
 * pages of the proper color even if there are pages of a different color.
 * Otherwise, scan the bins for ones with pages. For each bin with pages,
 * try to lock one of them. If no page can be locked, try the
 * next bin. Return NULL if a page cannot be found and locked.
 *
 * Finds a page, tries to lock it, then removes it.
 */

/*ARGSUSED*/
page_t *
page_get_cachelist(struct vnode *vp, u_offset_t off, struct seg *seg,
    caddr_t vaddr, uint_t flags, struct lgrp *lgrp)
{
	page_t *pp;
	struct as *as = seg->s_as;
	ulong_t bin;
	/*LINTED*/
	int mnode;
	int mtype;
	lgrp_mnode_cookie_t lgrp_cookie;

	/*
	 * If we aren't passed a specific lgroup, or passed a freed lgrp
	 * assume we wish to allocate near to the current thread's home.
	 */
	if (!LGRP_EXISTS(lgrp))
		lgrp = lgrp_home_lgrp();

	if (!kcage_on) {
		flags &= ~PG_NORELOC;
		flags |= PGI_NOCAGE;
	}

	if ((flags & (PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == PG_NORELOC &&
	    kcage_freemem <= kcage_throttlefree) {
		/*
		 * Reserve kcage_throttlefree pages for critical kernel
		 * threads.
		 *
		 * Everybody else has to go to page_create_get_something()
		 * to get a cage page, so we don't deadlock cageout.
		 */
		return (NULL);
	}

	/* cachelist pages are always base pagesize, hence szc 0 below */
	/* LINTED */
	AS_2_BIN(as, seg, vp, vaddr, bin, 0);

	ASSERT(bin < PAGE_GET_PAGECOLORS(0));

	/* LINTED */
	MTYPE_INIT(mtype, vp, vaddr, flags, MMU_PAGESIZE);

	VM_STAT_ADD(vmm_vmstats.pgc_alloc);

	/*
	 * Try local cachelists first
	 */
	LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL);
	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
		pp = page_get_mnode_cachelist(bin, flags, mnode, mtype);
		if (pp != NULL) {
			VM_STAT_ADD(vmm_vmstats.pgc_allocok);
			DTRACE_PROBE4(page__get,
			    lgrp_t *, lgrp,
			    int, mnode,
			    ulong_t, bin,
			    uint_t, flags);
			return (pp);
		}
	}

	lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);

	/*
	 * Try freelists/cachelists that are farther away
	 * This is our only chance to allocate remote pages for PAGESIZE
	 * requests.
	 */
	LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie);
	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
		pp = page_get_mnode_freelist(mnode, bin, mtype,
		    0, flags);
		if (pp != NULL) {
			VM_STAT_ADD(vmm_vmstats.pgc_allocokdeferred);
			DTRACE_PROBE4(page__get,
			    lgrp_t *, lgrp,
			    int, mnode,
			    ulong_t, bin,
			    uint_t, flags);
			return (pp);
		}
		pp = page_get_mnode_cachelist(bin, flags, mnode, mtype);
		if (pp != NULL) {
			VM_STAT_ADD(vmm_vmstats.pgc_allocokrem);
			DTRACE_PROBE4(page__get,
			    lgrp_t *, lgrp,
			    int, mnode,
			    ulong_t, bin,
			    uint_t, flags);
			return (pp);
		}
	}

	VM_STAT_ADD(vmm_vmstats.pgc_allocfailed);
	return (NULL);
}

/*
 * Search the cachelists of the given mnode/mtype for a base pagesize
 * page in a bin color-equivalent to 'bin'. On success the page has been
 * removed from its cachelist and is returned SE_EXCL locked; returns
 * NULL if no page could be found and locked.
 */
page_t *
page_get_mnode_cachelist(uint_t bin, uint_t flags, int mnode, int mtype)
{
	kmutex_t *pcm;
	page_t *pp, *first_pp;
	uint_t sbin;
	int plw_initialized;
	page_list_walker_t plw;

	VM_STAT_ADD(vmm_vmstats.pgmc_alloc);

	/* LINTED */
	MTYPE_START(mnode, mtype, flags);
	if (mtype < 0) {	/* mnode does not have memory in mtype range */
		VM_STAT_ADD(vmm_vmstats.pgmc_allocempty);
		return (NULL);
	}

try_again:

	plw_initialized = 0;
	plw.plw_ceq_dif = 1;

	/*
	 * Only hold one cachelist lock at a time, that way we
	 * can start anywhere and not have to worry about lock
	 * ordering.
	 */

	for (plw.plw_count = 0;
	    plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) {
		sbin = bin;
		do {

			if (!PAGE_CACHELISTS(mnode, bin, mtype))
				goto bin_empty_1;
			pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST);
			mutex_enter(pcm);
			pp = PAGE_CACHELISTS(mnode, bin, mtype);
			if (pp == NULL)
				goto bin_empty_0;

			first_pp = pp;
			ASSERT(pp->p_vnode);
			ASSERT(PP_ISAGED(pp) == 0);
			ASSERT(pp->p_szc == 0);
			ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
			while (IS_DUMP_PAGE(pp) || !page_trylock(pp, SE_EXCL)) {
				pp = pp->p_next;
				ASSERT(pp->p_szc == 0);
				if (pp == first_pp) {
					/*
					 * We have searched the complete list!
					 * And all of them (might only be one)
					 * are locked. This can happen since
					 * these pages can also be found via
					 * the hash list. When found via the
					 * hash list, they are locked first,
					 * then removed. We give up to let the
					 * other thread run.
					 */
					pp = NULL;
					break;
				}
				ASSERT(pp->p_vnode);
				ASSERT(PP_ISFREE(pp));
				ASSERT(PP_ISAGED(pp) == 0);
				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) ==
				    mnode);
			}

			if (pp) {
				page_t **ppp;
				/*
				 * Found and locked a page.
				 * Pull it off the list.
				 */
				ASSERT(mtype == PP_2_MTYPE(pp));
				ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
				page_sub(ppp, pp);
				/*
				 * Subtract counters before releasing pcm mutex
				 * to avoid a race with page_freelist_coalesce
				 * and page_freelist_split.
				 */
				page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST);
				mutex_exit(pcm);
				ASSERT(pp->p_vnode);
				ASSERT(PP_ISAGED(pp) == 0);
#if defined(__sparc)
				ASSERT(!kcage_on ||
				    (flags & PG_NORELOC) == 0 ||
				    PP_ISNORELOC(pp));
				if (PP_ISNORELOC(pp)) {
					kcage_freemem_sub(1);
				}
#endif
				VM_STAT_ADD(vmm_vmstats.pgmc_allocok);
				return (pp);
			}
bin_empty_0:
			mutex_exit(pcm);
bin_empty_1:
			if (plw_initialized == 0) {
				page_list_walk_init(0, flags, bin, 0, 1, &plw);
				plw_initialized = 1;
			}
			/* calculate the next bin with equivalent color */
			bin = ADD_MASKED(bin, plw.plw_bin_step,
			    plw.plw_ceq_mask[0], plw.plw_color_mask);
		} while (sbin != bin);

		if (plw.plw_ceq_dif > 1)
			bin = page_list_walk_next_bin(0, bin, &plw);
	}

	MTYPE_NEXT(mnode, mtype, flags);
	if (mtype >= 0)
		goto try_again;

	VM_STAT_ADD(vmm_vmstats.pgmc_allocfailed);
	return (NULL);
}

#ifdef DEBUG
#define	REPL_PAGE_STATS
#endif /* DEBUG */

#ifdef REPL_PAGE_STATS
/* counters for page_get_replacement_page(), DEBUG kernels only */
struct repl_page_stats {
	uint_t ngets;
	uint_t ngets_noreloc;
	uint_t npgr_noreloc;
	uint_t nnopage_first;
	uint_t nnopage;
	uint_t nhashout;
	uint_t nnofree;
	uint_t nnext_pp;
} repl_page_stats;
#define	REPL_STAT_INCR(v)	atomic_inc_32(&repl_page_stats.v)
#else /* REPL_PAGE_STATS */
#define	REPL_STAT_INCR(v)
#endif /* REPL_PAGE_STATS */

int	pgrppgcp;

/*
 * The freemem accounting must be done by the caller.
 * First we try to get a replacement page of the same size as like_pp,
 * if that is not possible, then we just get a set of discontiguous
 * PAGESIZE pages.
 */
page_t *
page_get_replacement_page(page_t *orig_like_pp, struct lgrp *lgrp_target,
    uint_t pgrflags)
{
	page_t *like_pp;
	page_t *pp, *pplist;
	page_t *pl = NULL;
	ulong_t bin;
	int mnode, page_mnode;
	int szc;
	spgcnt_t npgs, pg_cnt;
	pfn_t pfnum;
	int mtype;
	int flags = 0;
	lgrp_mnode_cookie_t lgrp_cookie;
	lgrp_t *lgrp;

	REPL_STAT_INCR(ngets);
	like_pp = orig_like_pp;
	ASSERT(PAGE_EXCL(like_pp));

	szc = like_pp->p_szc;
	npgs = page_get_pagecnt(szc);
	/*
	 * Now we reset like_pp to the base page_t.
	 * That way, we won't walk past the end of this 'szc' page.
	 */
	pfnum = PFN_BASE(like_pp->p_pagenum, szc);
	like_pp = page_numtopp_nolock(pfnum);
	ASSERT(like_pp->p_szc == szc);

	if (PP_ISNORELOC(like_pp)) {
		ASSERT(kcage_on);
		REPL_STAT_INCR(ngets_noreloc);
		flags = PGI_RELOCONLY;
	} else if (pgrflags & PGR_NORELOC) {
		ASSERT(kcage_on);
		REPL_STAT_INCR(npgr_noreloc);
		flags = PG_NORELOC;
	}

	/*
	 * Kernel pages must always be replaced with the same size
	 * pages, since we cannot properly handle demotion of kernel
	 * pages.
	 */
	if (PP_ISKAS(like_pp))
		pgrflags |= PGR_SAMESZC;

	/* LINTED */
	MTYPE_PGR_INIT(mtype, flags, like_pp, page_mnode, npgs);

	while (npgs) {
		pplist = NULL;
		for (;;) {
			pg_cnt = page_get_pagecnt(szc);
			bin = PP_2_BIN(like_pp);
			ASSERT(like_pp->p_szc == orig_like_pp->p_szc);
			ASSERT(pg_cnt <= npgs);

			/*
			 * If an lgroup was specified, try to get the
			 * page from that lgroup.
			 * NOTE: Must be careful with code below because
			 * lgroup may disappear and reappear since there
			 * is no locking for lgroup here.
			 */
			if (LGRP_EXISTS(lgrp_target)) {
				/*
				 * Keep local variable for lgroup separate
				 * from lgroup argument since this code should
				 * only be exercised when lgroup argument
				 * exists....
				 */
				lgrp = lgrp_target;

				/* Try the lgroup's freelists first */
				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
				    LGRP_SRCH_LOCAL);
				while ((pplist == NULL) &&
				    (mnode = lgrp_memnode_choose(&lgrp_cookie))
				    != -1) {
					pplist =
					    page_get_mnode_freelist(mnode, bin,
					    mtype, szc, flags);
				}

				/*
				 * Now try its cachelists if this is a
				 * small page. Don't need to do it for
				 * larger ones since page_freelist_coalesce()
				 * already failed.
				 */
				if (pplist != NULL || szc != 0)
					break;

				/* Now try its cachelists */
				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
				    LGRP_SRCH_LOCAL);

				while ((pplist == NULL) &&
				    (mnode = lgrp_memnode_choose(&lgrp_cookie))
				    != -1) {
					pplist =
					    page_get_mnode_cachelist(bin, flags,
					    mnode, mtype);
				}
				if (pplist != NULL) {
					page_hashout(pplist, NULL);
					PP_SETAGED(pplist);
					REPL_STAT_INCR(nhashout);
					break;
				}
				/* Done looking in this lgroup. Bail out. */
				break;
			}

			/*
			 * No lgroup was specified (or lgroup was removed by
			 * DR), so just try to get the page as close to
			 * like_pp's mnode as possible.
			 * First try the local freelist...
			 */
			mnode = PP_2_MEM_NODE(like_pp);
			pplist = page_get_mnode_freelist(mnode, bin,
			    mtype, szc, flags);
			if (pplist != NULL)
				break;

			REPL_STAT_INCR(nnofree);

			/*
			 * ...then the local cachelist. Don't need to do it for
			 * larger pages cause page_freelist_coalesce() already
			 * failed there anyway.
			 */
			if (szc == 0) {
				pplist = page_get_mnode_cachelist(bin, flags,
				    mnode, mtype);
				if (pplist != NULL) {
					page_hashout(pplist, NULL);
					PP_SETAGED(pplist);
					REPL_STAT_INCR(nhashout);
					break;
				}
			}

			/* Now try remote freelists */
			page_mnode = mnode;
			lgrp =
			    lgrp_hand_to_lgrp(MEM_NODE_2_LGRPHAND(page_mnode));
			LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
			    LGRP_SRCH_HIER);
			while (pplist == NULL &&
			    (mnode = lgrp_memnode_choose(&lgrp_cookie))
			    != -1) {
				/*
				 * Skip local mnode.
				 */
				if ((mnode == page_mnode) ||
				    (mem_node_config[mnode].exists == 0))
					continue;

				pplist = page_get_mnode_freelist(mnode,
				    bin, mtype, szc, flags);
			}

			if (pplist != NULL)
				break;


			/* Now try remote cachelists */
			LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
			    LGRP_SRCH_HIER);
			while (pplist == NULL && szc == 0) {
				mnode = lgrp_memnode_choose(&lgrp_cookie);
				if (mnode == -1)
					break;
				/*
				 * Skip local mnode.
				 */
				if ((mnode == page_mnode) ||
				    (mem_node_config[mnode].exists == 0))
					continue;

				pplist = page_get_mnode_cachelist(bin,
				    flags, mnode, mtype);

				if (pplist != NULL) {
					page_hashout(pplist, NULL);
					PP_SETAGED(pplist);
					REPL_STAT_INCR(nhashout);
					break;
				}
			}

			/*
			 * Break out of while loop under the following cases:
			 * - If we successfully got a page.
			 * - If pgrflags specified only returning a specific
			 *   page size and we could not find that page size.
			 * - If we could not satisfy the request with PAGESIZE
			 *   or larger pages.
			 */
			if (pplist != NULL || szc == 0)
				break;

			if ((pgrflags & PGR_SAMESZC) || pgrppgcp) {
				/* try to find contig page */

				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
				    LGRP_SRCH_HIER);

				while ((pplist == NULL) &&
				    (mnode =
				    lgrp_memnode_choose(&lgrp_cookie))
				    != -1) {
					pplist = page_get_contig_pages(
					    mnode, bin, mtype, szc,
					    flags | PGI_PGCPHIPRI);
				}
				break;
			}

			/*
			 * The correct thing to do here is try the next
			 * page size down using szc--. Due to a bug
			 * with the processing of HAT_RELOAD_SHARE
			 * where the sfmmu_ttecnt arrays of all
			 * hats sharing an ISM segment don't get updated,
			 * using intermediate size pages for relocation
			 * can lead to continuous page faults.
			 */
			szc = 0;
		}

		if (pplist != NULL) {
			DTRACE_PROBE4(page__get,
			    lgrp_t *, lgrp,
			    int, mnode,
			    ulong_t, bin,
			    uint_t, flags);

			/* move up to pg_cnt pages onto the result list */
			while (pplist != NULL && pg_cnt--) {
				ASSERT(pplist != NULL);
				pp = pplist;
				page_sub(&pplist, pp);
				PP_CLRFREE(pp);
				PP_CLRAGED(pp);
				page_list_concat(&pl, &pp);
				npgs--;
				like_pp = like_pp + 1;
				REPL_STAT_INCR(nnext_pp);
			}
			ASSERT(pg_cnt == 0);
		} else {
			break;
		}
	}

	if (npgs) {
		/*
		 * We were unable to allocate the necessary number
		 * of pages.
		 * We need to free up any pl.
		 */
		REPL_STAT_INCR(nnopage);
		page_free_replacement_page(pl);
		return (NULL);
	} else {
		return (pl);
	}
}

/*
 * demote a free large page to its constituent pages
 */
void
page_demote_free_pages(page_t *pp)
{

	int mnode;

	ASSERT(pp != NULL);
	ASSERT(PAGE_LOCKED(pp));
	ASSERT(PP_ISFREE(pp));
	ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);

	mnode = PP_2_MEM_NODE(pp);
	page_freelist_lock(mnode);
	/* re-check under the freelist lock; another thread may have demoted */
	if (pp->p_szc != 0) {
		(void) page_demote(mnode, PFN_BASE(pp->p_pagenum,
		    pp->p_szc), 0, pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
	}
	page_freelist_unlock(mnode);
	ASSERT(pp->p_szc == 0);
}

/*
 * Factor in colorequiv to check additional 'equivalent' bins.
 * colorequiv may be set in /etc/system
 */
void
page_set_colorequiv_arr(void)
{
	if (colorequiv > 1) {
		int i;
		uint_t sv_a = lowbit(colorequiv) - 1;

		/* the high nibble of colorequivszc can hold at most 15 */
		if (sv_a > 15)
			sv_a = 15;

		for (i = 0; i < MMU_PAGE_SIZES; i++) {
			uint_t colors;
			uint_t a = sv_a;

			if ((colors = hw_page_array[i].hp_colors) <= 1) {
				continue;
			}
			/* clamp so at least one color bit remains */
			while ((colors >> a) == 0)
				a--;
			if ((a << 4) > colorequivszc[i]) {
				colorequivszc[i] = (a << 4);
			}
		}
	}
}