/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989  AT&T	*/
/*	  All Rights Reserved	*/

/*
 * Portions of this source code were derived from Berkeley 4.3 BSD
 * under license from the Regents of the University of California.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * This file contains common functions to access and manage the page lists.
 * Many of these routines originated from platform dependent modules
 * (sun4/vm/vm_dep.c, i86pc/vm/vm_machdep.c) and were modified to function in
 * a platform independent manner.
 *
 * vm/vm_dep.h provides for platform specific support.
 */

#include <sys/types.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/systm.h>
#include <sys/atomic.h>
#include <sys/sysmacros.h>
#include <vm/as.h>
#include <vm/page.h>
#include <vm/seg_kmem.h>
#include <vm/seg_vn.h>
#include <sys/memnode.h>
#include <vm/vm_dep.h>
#include <sys/lgrp.h>
#include <sys/mem_config.h>
#include <sys/callb.h>
#include <sys/mem_cage.h>
#include <sys/sdt.h>

extern uint_t	vac_colors;

#define	MAX_PRAGMA_ALIGN	128

/* vm_cpu_data0 for the boot cpu before kmem is initialized */

#if L2CACHE_ALIGN_MAX <= MAX_PRAGMA_ALIGN
#pragma align L2CACHE_ALIGN_MAX(vm_cpu_data0)
#else
#pragma align MAX_PRAGMA_ALIGN(vm_cpu_data0)
#endif
char		vm_cpu_data0[VM_CPU_DATA_PADSIZE];

/*
 * number of page colors equivalent to requested color in page_get routines.
 * If set, keeps large pages intact longer and keeps MPO allocation
 * from the local mnode in favor of acquiring the 'correct' page color from
 * a demoted large page or from a remote mnode.
 */
uint_t	colorequiv;

/*
 * color equivalency mask for each page size.
 * Mask is computed based on cpu L2$ way sizes and colorequiv global.
 * High 4 bits determine the number of high order bits of the color to ignore.
 * Low 4 bits determine the number of low order bits of color to ignore (it's
 * only relevant for hashed index based page coloring).
 */
uchar_t colorequivszc[MMU_PAGE_SIZES];

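/*
 * For illustration: with the encoding above, a colorequivszc[] entry of
 * (hypothetically) 0x02 would treat the two low order color bits as
 * equivalent, while 0x20 would treat the two high order color bits as
 * equivalent.  The actual values are derived at startup from the cpu
 * L2$ way sizes and the colorequiv tunable.
 */
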
/*
 * if set, specifies the percentage of large pages that are free from within
 * a large page region before attempting to lock those pages for
 * page_get_contig_pages processing.
 *
 * Should be turned on when kpr is available, since page_trylock_contig_pages
 * can then be more selective.
 */

int	ptcpthreshold;

/*
 * Limit page get contig page search based on failure cnts in pgcpfailcnt[].
 * Enabled by default via pgcplimitsearch.
 *
 * pgcpfailcnt[] is bounded by PGCPFAILMAX (>= 1/2 of installed
 * memory). When reached, pgcpfailcnt[] is reset to 1/2 of this upper
 * bound. This upper bound range guarantees:
 *    - all large page 'slots' will be searched over time
 *    - at least one (1) large page candidate is considered on each pgcp call
 *    - the count doesn't wrap around to 0
 */
pgcnt_t	pgcpfailcnt[MMU_PAGE_SIZES];
int	pgcplimitsearch = 1;

#define	PGCPFAILMAX		(1 << (highbit(physinstalled) - 1))
#define	SETPGCPFAILCNT(szc)						\
	if (++pgcpfailcnt[szc] >= PGCPFAILMAX)				\
		pgcpfailcnt[szc] = PGCPFAILMAX / 2;

#ifdef VM_STATS
struct vmm_vmstats_str  vmm_vmstats;

#endif /* VM_STATS */

#if defined(__sparc)
#define	LPGCREATE	0
#else
/* enable page_get_contig_pages */
#define	LPGCREATE	1
#endif

int pg_contig_disable;
int pg_lpgcreate_nocage = LPGCREATE;

/*
 * page_freelist_split pfn flag to signify no hi pfn requirement.
 */
#define	PFNNULL		0

/* Flags involved in promotion and demotion routines */
#define	PC_FREE		0x1	/* put page on freelist */
#define	PC_ALLOC	0x2	/* return page for allocation */

/*
 * Flag for page_demote to be used with PC_FREE to denote that we don't care
 * what the color is as the color parameter to the function is ignored.
 */
#define	PC_NO_COLOR	(-1)

/* mtype value for page_promote to use when mtype does not matter */
#define	PC_MTYPE_ANY	(-1)

/*
 * page counters candidates info
 * See page_ctrs_cands comment below for more details.
 * fields are as follows:
 *	pcc_pages_free:		# pages which freelist coalesce can create
 *	pcc_color_free:		pointer to page free counts per color
 */
typedef struct pcc_info {
	pgcnt_t	pcc_pages_free;
	pgcnt_t	*pcc_color_free;
} pcc_info_t;

/*
 * On big machines it can take a long time to check page_counters
 * arrays. page_ctrs_cands is a summary array whose elements are a dynamically
 * updated sum of all elements of the corresponding page_counters arrays.
 * page_freelist_coalesce() searches page_counters only if an appropriate
 * element of page_ctrs_cands array is greater than 0.
 *
 * page_ctrs_cands is indexed by mutex (i), region (r), mnode (m), mrange (g)
 */
pcc_info_t **page_ctrs_cands[NPC_MUTEX][MMU_PAGE_SIZES];

/*
 * Return in val the total number of free pages which can be created
 * for the given mnode (m), mrange (g), and region size (r)
 */
#define	PGCTRS_CANDS_GETVALUE(m, g, r, val) {				\
	int i;								\
	val = 0;							\
	for (i = 0; i < NPC_MUTEX; i++) {				\
	    val += page_ctrs_cands[i][(r)][(m)][(g)].pcc_pages_free;	\
	}								\
}

/*
 * Return in val the total number of free pages which can be created
 * for the given mnode (m), mrange (g), region size (r), and color (c)
 */
#define	PGCTRS_CANDS_GETVALUECOLOR(m, g, r, c, val) {			\
	int i;								\
	val = 0;							\
	ASSERT((c) < PAGE_GET_PAGECOLORS(r));				\
	for (i = 0; i < NPC_MUTEX; i++) {				\
	    val +=							\
		page_ctrs_cands[i][(r)][(m)][(g)].pcc_color_free[(c)];	\
	}								\
}

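/*
 * For illustration, callers consult these summaries before touching the
 * much larger page_counters arrays; page_freelist_coalesce() below does
 * roughly:
 *
 *	pgcnt_t acand;
 *	PGCTRS_CANDS_GETVALUECOLOR(mnode, mrange, r, bin, acand);
 *	if (acand != 0)
 *		(search page_counters for a full region of this color)
 *
 * and skips the counter walk entirely when every candidate sum is zero.
 */
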
/*
 * We can only allow a single thread to update a counter within the physical
 * range of the largest supported page size. That is the finest granularity
 * possible since the counter values are dependent on each other
 * as you move across region sizes. PP_CTR_LOCK_INDX is used to determine the
 * ctr_mutex lock index for a particular physical range.
 */
static kmutex_t	*ctr_mutex[NPC_MUTEX];

#define	PP_CTR_LOCK_INDX(pp)						\
	(((pp)->p_pagenum >>						\
	    (PAGE_BSZS_SHIFT(mmu_page_sizes - 1))) & (NPC_MUTEX - 1))

#define	INVALID_COLOR	0xffffffff
#define	INVALID_MASK	0xffffffff

/*
 * Local functions prototypes.
 */

void page_ctr_add(int, int, page_t *, int);
void page_ctr_add_internal(int, int, page_t *, int);
void page_ctr_sub(int, int, page_t *, int);
void page_ctr_sub_internal(int, int, page_t *, int);
void page_freelist_lock(int);
void page_freelist_unlock(int);
page_t *page_promote(int, pfn_t, uchar_t, int, int);
page_t *page_demote(int, pfn_t, uchar_t, uchar_t, int, int);
page_t *page_freelist_split(uchar_t,
    uint_t, int, int, pfn_t, page_list_walker_t *);
page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int);
static int page_trylock_cons(page_t *pp, se_t se);

/*
 * The page_counters array below is used to keep track of free contiguous
 * physical memory.  A hw_page_map_t will be allocated per mnode per szc.
 * This contains an array of counters, the size of the array, a shift value
 * used to convert a pagenum into a counter array index or vice versa, as
 * well as a cache of the last successful index to be promoted to a larger
 * page size.  As an optimization, we keep track of the last successful index
 * to be promoted per page color for the given size region, and this is
 * allocated dynamically based upon the number of colors for a given
 * region size.
 *
 * Conceptually, the page counters are represented as:
 *
 *	page_counters[region_size][mnode]
 *
 *	region_size:	size code of a candidate larger page made up
 *			of contiguous free smaller pages.
 *
 *	page_counters[region_size][mnode].hpm_counters[index]:
 *		represents how many (region_size - 1) pages either
 *		exist or can be created within the given index range.
 *
 * Let's look at a sparc example:
 *	If we want to create a free 512k page, we look at region_size 2
 *	for the mnode we want.  We calculate the index and look at a specific
 *	hpm_counters location.  If we see 8 (FULL_REGION_CNT on sparc) at
 *	this location, it means that 8 64k pages either exist or can be created
 *	from 8K pages in order to make a single free 512k page at the given
 *	index.  Note that when a region is full, it will contribute to the
 *	counts in the region above it.  Thus we will not know what page
 *	size the free pages will be which can be promoted to this new free
 *	page unless we look at all regions below the current region.
 */

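/*
 * Continuing the sparc example above: when the 8th 64k region within a
 * 512k region becomes free, the 512k counter reaches FULL_REGION_CNT and
 * that 512k region is in turn counted toward the next larger region size
 * and toward the page_ctrs_cands summaries.  page_ctr_add_internal() and
 * page_ctr_sub_internal() below implement this upward propagation.
 */
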
/*
 * Note: hpmctr_t is defined in platform vm_dep.h
 * hw_page_map_t contains all the information needed for the page_counters
 * logic. The fields are as follows:
 *
 *	hpm_counters:	dynamically allocated array to hold counter data
 *	hpm_entries:	entries in hpm_counters
 *	hpm_shift:	shift for pnum/array index conv
 *	hpm_base:	PFN mapped to counter index 0
 *	hpm_color_current:	last index in counter array for this color at
 *				which we successfully created a large page
 */
typedef struct hw_page_map {
	hpmctr_t	*hpm_counters;
	size_t		hpm_entries;
	int		hpm_shift;
	pfn_t		hpm_base;
	size_t		*hpm_color_current[MAX_MNODE_MRANGES];
} hw_page_map_t;

/*
 * Element zero is not used, but is allocated for convenience.
 */
static hw_page_map_t *page_counters[MMU_PAGE_SIZES];

/*
 * Cached value of MNODE_RANGE_CNT(mnode).
 * This is a function call on x86.
 */
static int mnode_nranges[MAX_MEM_NODES];
static int mnode_maxmrange[MAX_MEM_NODES];

/*
 * The following macros are convenient ways to get access to the individual
 * elements of the page_counters arrays. They can be used on both
 * the left side and right side of equations.
 */
#define	PAGE_COUNTERS(mnode, rg_szc, idx)			\
	(page_counters[(rg_szc)][(mnode)].hpm_counters[(idx)])

#define	PAGE_COUNTERS_COUNTERS(mnode, rg_szc)			\
	(page_counters[(rg_szc)][(mnode)].hpm_counters)

#define	PAGE_COUNTERS_SHIFT(mnode, rg_szc)			\
	(page_counters[(rg_szc)][(mnode)].hpm_shift)

#define	PAGE_COUNTERS_ENTRIES(mnode, rg_szc)			\
	(page_counters[(rg_szc)][(mnode)].hpm_entries)

#define	PAGE_COUNTERS_BASE(mnode, rg_szc)			\
	(page_counters[(rg_szc)][(mnode)].hpm_base)

#define	PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, rg_szc, g)	\
	(page_counters[(rg_szc)][(mnode)].hpm_color_current[(g)])

#define	PAGE_COUNTERS_CURRENT_COLOR(mnode, rg_szc, color, mrange)	\
	(page_counters[(rg_szc)][(mnode)].				\
	hpm_color_current[(mrange)][(color)])

#define	PNUM_TO_IDX(mnode, rg_szc, pnum)			\
	(((pnum) - PAGE_COUNTERS_BASE((mnode), (rg_szc))) >>	\
	PAGE_COUNTERS_SHIFT((mnode), (rg_szc)))

#define	IDX_TO_PNUM(mnode, rg_szc, index)			\
	(PAGE_COUNTERS_BASE((mnode), (rg_szc)) +		\
	((index) << PAGE_COUNTERS_SHIFT((mnode), (rg_szc))))

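/*
 * For illustration (hypothetical values): with an hpm_base of 0x1000 and
 * an hpm_shift of 3 (8 base pages per region), pfn 0x1020 maps to counter
 * index (0x1020 - 0x1000) >> 3 == 4, and index 4 maps back to pfn
 * 0x1000 + (4 << 3) == 0x1020.  page_ctrs_alloc() and page_ctrs_adjust()
 * ASSERT this round-trip identity.
 */
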
/*
 * Protects the hpm_counters and hpm_color_current memory from changing while
 * looking at page counters information.
 * Grab the write lock to modify what these fields point at.
 * Grab the read lock to prevent any pointers from changing.
 * The write lock can not be held during memory allocation due to a possible
 * recursion deadlock with trying to grab the read lock while the
 * write lock is already held.
 */
krwlock_t page_ctrs_rwlock[MAX_MEM_NODES];


/*
 * initialize cpu_vm_data to point at cache aligned vm_cpu_data_t.
 */
void
cpu_vm_data_init(struct cpu *cp)
{
	if (cp == CPU0) {
		cp->cpu_vm_data = (void *)&vm_cpu_data0;
	} else {
		void	*kmptr;
		int	align;
		size_t	sz;

		align = (L2CACHE_ALIGN) ? L2CACHE_ALIGN : L2CACHE_ALIGN_MAX;
		sz = P2ROUNDUP(sizeof (vm_cpu_data_t), align) + align;
		kmptr = kmem_zalloc(sz, KM_SLEEP);
		cp->cpu_vm_data = (void *) P2ROUNDUP((uintptr_t)kmptr, align);
		((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr = kmptr;
		((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize = sz;
	}
}

/*
 * free cpu_vm_data
 */
void
cpu_vm_data_destroy(struct cpu *cp)
{
	if (cp->cpu_seqid && cp->cpu_vm_data) {
		ASSERT(cp != CPU0);
		kmem_free(((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr,
		    ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize);
	}
	cp->cpu_vm_data = NULL;
}


/*
 * page size to page size code
 */
int
page_szc(size_t pagesize)
{
	int	i = 0;

	while (hw_page_array[i].hp_size) {
		if (pagesize == hw_page_array[i].hp_size)
			return (i);
		i++;
	}
	return (-1);
}

/*
 * page size to page size code with the restriction that it be a supported
 * user page size. If it's not a supported user page size, -1 will be returned.
 */
int
page_szc_user_filtered(size_t pagesize)
{
	int szc = page_szc(pagesize);
	if ((szc != -1) && (SZC_2_USERSZC(szc) != -1)) {
		return (szc);
	}
	return (-1);
}

/*
 * Return how many page sizes are available for the user to use.  This is
 * what the hardware supports and not based upon how the OS implements the
 * support of different page sizes.
 */
uint_t
page_num_user_pagesizes(void)
{
	return (mmu_exported_page_sizes);
}

uint_t
page_num_pagesizes(void)
{
	return (mmu_page_sizes);
}

/*
 * returns the number of base pagesize pages associated with szc
 */
pgcnt_t
page_get_pagecnt(uint_t szc)
{
	if (szc >= mmu_page_sizes)
		panic("page_get_pagecnt: out of range %d", szc);
	return (hw_page_array[szc].hp_pgcnt);
}

size_t
page_get_pagesize(uint_t szc)
{
	if (szc >= mmu_page_sizes)
		panic("page_get_pagesize: out of range %d", szc);
	return (hw_page_array[szc].hp_size);
}

/*
 * Return the size of a page based upon the index passed in.  An index of
 * zero refers to the smallest page size in the system, and as index increases
 * it refers to the next larger supported page size in the system.
 * Note that szc and userszc may not be the same due to unsupported szc's on
 * some systems.
 */
size_t
page_get_user_pagesize(uint_t userszc)
{
	uint_t szc = USERSZC_2_SZC(userszc);

	if (szc >= mmu_page_sizes)
		panic("page_get_user_pagesize: out of range %d", szc);
	return (hw_page_array[szc].hp_size);
}

uint_t
page_get_shift(uint_t szc)
{
	if (szc >= mmu_page_sizes)
		panic("page_get_shift: out of range %d", szc);
	return (PAGE_GET_SHIFT(szc));
}

uint_t
page_get_pagecolors(uint_t szc)
{
	if (szc >= mmu_page_sizes)
		panic("page_get_pagecolors: out of range %d", szc);
	return (PAGE_GET_PAGECOLORS(szc));
}

/*
 * this assigns the desired equivalent color after a split
 */
uint_t
page_correct_color(uchar_t szc, uchar_t nszc, uint_t color,
    uint_t ncolor, uint_t ceq_mask)
{
	ASSERT(nszc > szc);
	ASSERT(szc < mmu_page_sizes);
	ASSERT(color < PAGE_GET_PAGECOLORS(szc));
	ASSERT(ncolor < PAGE_GET_PAGECOLORS(nszc));

	color &= ceq_mask;
	ncolor <<= PAGE_GET_COLOR_SHIFT(szc, nszc);
	return (color | (ncolor & ~ceq_mask));
}

/*
 * Called by startup().
 * Size up the per page size free list counters based on physmax
 * of each node and max_mem_nodes.
 */
size_t
page_ctrs_sz(void)
{
	int	r;		/* region size */
	int	mnode;
	int	nranges;
	uint_t	ctrs_sz = 0;
	int	i;
	pgcnt_t colors_per_szc[MMU_PAGE_SIZES];

	/*
	 * We need to determine how many page colors there are for each
	 * page size in order to allocate memory for any color specific
	 * arrays.
	 */
	for (i = 0; i < mmu_page_sizes; i++) {
		colors_per_szc[i] = PAGE_GET_PAGECOLORS(i);
	}

	for (mnode = 0; mnode < max_mem_nodes; mnode++) {

		pgcnt_t r_pgcnt;
		pfn_t   r_base;
		pgcnt_t r_align;

		if (mem_node_config[mnode].exists == 0)
			continue;

		nranges = MNODE_RANGE_CNT(mnode);
		mnode_nranges[mnode] = nranges;
		mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode);

		/*
		 * determine size needed for page counter arrays with
		 * base aligned to large page size.
		 */
		for (r = 1; r < mmu_page_sizes; r++) {
			/* add in space for hpm_counters */
			r_align = page_get_pagecnt(r);
			r_base = mem_node_config[mnode].physbase;
			r_base &= ~(r_align - 1);
			r_pgcnt = howmany(mem_node_config[mnode].physmax -
			    r_base + 1, r_align);
			/*
			 * Round up to always allocate on pointer sized
			 * boundaries.
			 */
			ctrs_sz += P2ROUNDUP((r_pgcnt * sizeof (hpmctr_t)),
			    sizeof (hpmctr_t *));

			/* add in space for hpm_color_current */
			ctrs_sz += sizeof (size_t) *
			    colors_per_szc[r] * nranges;
		}
	}

	for (r = 1; r < mmu_page_sizes; r++) {
		ctrs_sz += (max_mem_nodes * sizeof (hw_page_map_t));
	}

	/* add in space for page_ctrs_cands and pcc_color_free */
	ctrs_sz += sizeof (pcc_info_t *) * max_mem_nodes *
	    mmu_page_sizes * NPC_MUTEX;

	for (mnode = 0; mnode < max_mem_nodes; mnode++) {

		if (mem_node_config[mnode].exists == 0)
			continue;

		nranges = mnode_nranges[mnode];
		ctrs_sz += sizeof (pcc_info_t) * nranges *
		    mmu_page_sizes * NPC_MUTEX;
		for (r = 1; r < mmu_page_sizes; r++) {
			ctrs_sz += sizeof (pgcnt_t) * nranges *
			    colors_per_szc[r] * NPC_MUTEX;
		}
	}

	/* ctr_mutex */
	ctrs_sz += (max_mem_nodes * NPC_MUTEX * sizeof (kmutex_t));

	/* size for page list counts */
	PLCNT_SZ(ctrs_sz);

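	/*
	 * Note that page_ctrs_alloc() below carves these same structures
	 * out of the buffer in the same order they are sized here, so the
	 * two routines must be kept in sync if this layout changes.
	 */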
	/*
	 * add some slop for roundups. page_ctrs_alloc will round up the start
	 * address of the counters to ecache_alignsize boundary for every
	 * memory node.
	 */
	return (ctrs_sz + max_mem_nodes * L2CACHE_ALIGN);
}

caddr_t
page_ctrs_alloc(caddr_t alloc_base)
{
	int	mnode;
	int	mrange, nranges;
	int	r;		/* region size */
	int	i;
	pgcnt_t colors_per_szc[MMU_PAGE_SIZES];

	/*
	 * We need to determine how many page colors there are for each
	 * page size in order to allocate memory for any color specific
	 * arrays.
	 */
	for (i = 0; i < mmu_page_sizes; i++) {
		colors_per_szc[i] = PAGE_GET_PAGECOLORS(i);
	}

	for (r = 1; r < mmu_page_sizes; r++) {
		page_counters[r] = (hw_page_map_t *)alloc_base;
		alloc_base += (max_mem_nodes * sizeof (hw_page_map_t));
	}

	/* page_ctrs_cands and pcc_color_free array */
	for (i = 0; i < NPC_MUTEX; i++) {
		for (r = 1; r < mmu_page_sizes; r++) {

			page_ctrs_cands[i][r] = (pcc_info_t **)alloc_base;
			alloc_base += sizeof (pcc_info_t *) * max_mem_nodes;

			for (mnode = 0; mnode < max_mem_nodes; mnode++) {
				pcc_info_t *pi;

				if (mem_node_config[mnode].exists == 0)
					continue;

				nranges = mnode_nranges[mnode];

				pi = (pcc_info_t *)alloc_base;
				alloc_base += sizeof (pcc_info_t) * nranges;
				page_ctrs_cands[i][r][mnode] = pi;

				for (mrange = 0; mrange < nranges; mrange++) {
					pi->pcc_color_free =
					    (pgcnt_t *)alloc_base;
					alloc_base += sizeof (pgcnt_t) *
					    colors_per_szc[r];
					pi++;
				}
			}
		}
	}

	/* ctr_mutex */
	for (i = 0; i < NPC_MUTEX; i++) {
		ctr_mutex[i] = (kmutex_t *)alloc_base;
		alloc_base += (max_mem_nodes * sizeof (kmutex_t));
	}

	/* initialize page list counts */
	PLCNT_INIT(alloc_base);

	for (mnode = 0; mnode < max_mem_nodes; mnode++) {

		pgcnt_t r_pgcnt;
		pfn_t	r_base;
		pgcnt_t r_align;
		int	r_shift;
		int	nranges = mnode_nranges[mnode];

		if (mem_node_config[mnode].exists == 0)
			continue;

		for (r = 1; r < mmu_page_sizes; r++) {
			/*
			 * the page_counters base has to be aligned to the
			 * page count of page size code r otherwise the counts
			 * will cross large page boundaries.
			 */
			r_align = page_get_pagecnt(r);
			r_base = mem_node_config[mnode].physbase;
			/* base needs to be aligned - lower to aligned value */
			r_base &= ~(r_align - 1);
			r_pgcnt = howmany(mem_node_config[mnode].physmax -
			    r_base + 1, r_align);
			r_shift = PAGE_BSZS_SHIFT(r);

			PAGE_COUNTERS_SHIFT(mnode, r) = r_shift;
			PAGE_COUNTERS_ENTRIES(mnode, r) = r_pgcnt;
			PAGE_COUNTERS_BASE(mnode, r) = r_base;
			for (mrange = 0; mrange < nranges; mrange++) {
				PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode,
				    r, mrange) = (size_t *)alloc_base;
				alloc_base += sizeof (size_t) *
				    colors_per_szc[r];
			}
			for (i = 0; i < colors_per_szc[r]; i++) {
				uint_t color_mask = colors_per_szc[r] - 1;
				pfn_t  pfnum = r_base;
				size_t idx;
				int mrange;

				PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i,
				    color_mask, color_mask);
				idx = PNUM_TO_IDX(mnode, r, pfnum);
				idx = (idx >= r_pgcnt) ? 0 : idx;
				for (mrange = 0; mrange < nranges; mrange++) {
					PAGE_COUNTERS_CURRENT_COLOR(mnode,
					    r, i, mrange) = idx;
				}
			}
			PAGE_COUNTERS_COUNTERS(mnode, r) =
			    (hpmctr_t *)alloc_base;
			/*
			 * Round up to make alloc_base always be aligned on
			 * a pointer boundary.
			 */
			alloc_base += P2ROUNDUP((sizeof (hpmctr_t) * r_pgcnt),
			    sizeof (hpmctr_t *));

			/*
			 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
			 * satisfy the identity requirement.
			 * We should be able to go from one to the other
			 * and get consistent values.
			 */
			ASSERT(PNUM_TO_IDX(mnode, r,
			    (IDX_TO_PNUM(mnode, r, 0))) == 0);
			ASSERT(IDX_TO_PNUM(mnode, r,
			    (PNUM_TO_IDX(mnode, r, r_base))) == r_base);
		}
		/*
		 * Round up the start address of the page_counters to
		 * cache aligned boundary for every memory node.
		 * page_ctrs_sz() has added some slop for these roundups.
		 */
		alloc_base = (caddr_t)P2ROUNDUP((uintptr_t)alloc_base,
		    L2CACHE_ALIGN);
	}

	/* Initialize other page counter specific data structures. */
	for (mnode = 0; mnode < MAX_MEM_NODES; mnode++) {
		rw_init(&page_ctrs_rwlock[mnode], NULL, RW_DEFAULT, NULL);
	}

	return (alloc_base);
}

/*
 * Functions to adjust region counters for each size free list.
 * Caller is responsible to acquire the ctr_mutex lock if necessary and
 * thus can be called during startup without locks.
 */
/* ARGSUSED */
void
page_ctr_add_internal(int mnode, int mtype, page_t *pp, int flags)
{
	ssize_t		r;	/* region size */
	ssize_t		idx;
	pfn_t		pfnum;
	int		lckidx;

	ASSERT(mnode == PP_2_MEM_NODE(pp));
	ASSERT(mtype == PP_2_MTYPE(pp));

	ASSERT(pp->p_szc < mmu_page_sizes);

	PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);

	/* no counter update needed for largest page size */
	if (pp->p_szc >= mmu_page_sizes - 1) {
		return;
	}

	r = pp->p_szc + 1;
	pfnum = pp->p_pagenum;
	lckidx = PP_CTR_LOCK_INDX(pp);

	/*
	 * Increment the count of free pages for the current
	 * region. Continue looping up in region size incrementing
	 * count if the preceding region is full.
	 */
	while (r < mmu_page_sizes) {
		idx = PNUM_TO_IDX(mnode, r, pfnum);

		ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
		ASSERT(PAGE_COUNTERS(mnode, r, idx) < FULL_REGION_CNT(r));

		if (++PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r)) {
			break;
		} else {
			int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r));
			pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode]
			    [MTYPE_2_MRANGE(mnode, root_mtype)];

			cand->pcc_pages_free++;
			cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]++;
		}
		r++;
	}
}

void
page_ctr_add(int mnode, int mtype, page_t *pp, int flags)
{
	int		lckidx = PP_CTR_LOCK_INDX(pp);
	kmutex_t	*lock = &ctr_mutex[lckidx][mnode];

	mutex_enter(lock);
	page_ctr_add_internal(mnode, mtype, pp, flags);
	mutex_exit(lock);
}

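/*
 * Note the symmetry with page_ctr_sub() below: an increment that brings a
 * region counter up to FULL_REGION_CNT(r) bumps the page_ctrs_cands
 * candidate counts for that region size, and a decrement that takes the
 * counter back below FULL_REGION_CNT(r) drops the same candidate counts,
 * so the candidate totals track how many fully free regions of each size
 * page_freelist_coalesce() could currently reconstruct.
 */
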
void
page_ctr_sub_internal(int mnode, int mtype, page_t *pp, int flags)
{
	int		lckidx;
	ssize_t		r;	/* region size */
	ssize_t		idx;
	pfn_t		pfnum;

	ASSERT(mnode == PP_2_MEM_NODE(pp));
	ASSERT(mtype == PP_2_MTYPE(pp));

	ASSERT(pp->p_szc < mmu_page_sizes);

	PLCNT_DECR(pp, mnode, mtype, pp->p_szc, flags);

	/* no counter update needed for largest page size */
	if (pp->p_szc >= mmu_page_sizes - 1) {
		return;
	}

	r = pp->p_szc + 1;
	pfnum = pp->p_pagenum;
	lckidx = PP_CTR_LOCK_INDX(pp);

	/*
	 * Decrement the count of free pages for the current
	 * region. Continue looping up in region size decrementing
	 * count if the preceding region was full.
	 */
	while (r < mmu_page_sizes) {
		idx = PNUM_TO_IDX(mnode, r, pfnum);

		ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
		ASSERT(PAGE_COUNTERS(mnode, r, idx) > 0);

		if (--PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r) - 1) {
			break;
		} else {
			int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r));
			pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode]
			    [MTYPE_2_MRANGE(mnode, root_mtype)];

			ASSERT(cand->pcc_pages_free != 0);
			ASSERT(cand->pcc_color_free[PP_2_BIN_SZC(pp, r)] != 0);

			cand->pcc_pages_free--;
			cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]--;
		}
		r++;
	}
}

void
page_ctr_sub(int mnode, int mtype, page_t *pp, int flags)
{
	int		lckidx = PP_CTR_LOCK_INDX(pp);
	kmutex_t	*lock = &ctr_mutex[lckidx][mnode];

	mutex_enter(lock);
	page_ctr_sub_internal(mnode, mtype, pp, flags);
	mutex_exit(lock);
}

/*
 * Adjust page counters following a memory attach, since typically the
 * size of the array needs to change, and the PFN to counter index
 * mapping needs to change.
 *
 * It is possible this mnode did not exist at startup. In that case
 * allocate pcc_info_t and pcc_color_free arrays. Also, allow for nranges
 * to change (a theoretical possibility on x86), which means pcc_color_free
 * arrays must be extended.
 */
uint_t
page_ctrs_adjust(int mnode)
{
	pgcnt_t npgs;
	int	r;		/* region size */
	int	i;
	size_t	pcsz, old_csz;
	hpmctr_t *new_ctr, *old_ctr;
	pfn_t	oldbase, newbase;
	size_t	old_npgs;
	hpmctr_t *ctr_cache[MMU_PAGE_SIZES];
	size_t	size_cache[MMU_PAGE_SIZES];
	size_t	*color_cache[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
	size_t	*old_color_array[MAX_MNODE_MRANGES];
	pgcnt_t	colors_per_szc[MMU_PAGE_SIZES];
	pcc_info_t **cands_cache;
	pcc_info_t *old_pi, *pi;
	pgcnt_t *pgcntp;
	int nr, old_nranges, mrange, nranges = MNODE_RANGE_CNT(mnode);
	int cands_cache_nranges;
	int old_maxmrange, new_maxmrange;
	int rc = 0;

	newbase = mem_node_config[mnode].physbase & ~PC_BASE_ALIGN_MASK;
	npgs = roundup(mem_node_config[mnode].physmax,
	    PC_BASE_ALIGN) - newbase;

	cands_cache = kmem_zalloc(sizeof (pcc_info_t *) * NPC_MUTEX *
	    MMU_PAGE_SIZES, KM_NOSLEEP);
	if (cands_cache == NULL)
		return (ENOMEM);

	/* prepare to free non-null pointers on the way out */
	cands_cache_nranges = nranges;
	bzero(ctr_cache, sizeof (ctr_cache));
	bzero(color_cache, sizeof (color_cache));

	/*
	 * We need to determine how many page colors there are for each
	 * page size in order to allocate memory for any color specific
	 * arrays.
	 */
	for (r = 0; r < mmu_page_sizes; r++) {
		colors_per_szc[r] = PAGE_GET_PAGECOLORS(r);
	}

	/*
	 * Preallocate all of the new hpm_counters arrays as we can't
	 * hold the page_ctrs_rwlock as a writer and allocate memory.
	 * If we can't allocate all of the arrays, undo our work so far
	 * and return failure.
	 */
	for (r = 1; r < mmu_page_sizes; r++) {
		pcsz = npgs >> PAGE_BSZS_SHIFT(r);
		size_cache[r] = pcsz;
		ctr_cache[r] = kmem_zalloc(pcsz *
		    sizeof (hpmctr_t), KM_NOSLEEP);
		if (ctr_cache[r] == NULL) {
			rc = ENOMEM;
			goto cleanup;
		}
	}

	/*
	 * Preallocate all of the new color current arrays as we can't
	 * hold the page_ctrs_rwlock as a writer and allocate memory.
	 * If we can't allocate all of the arrays, undo our work so far
	 * and return failure.
	 */
	for (r = 1; r < mmu_page_sizes; r++) {
		for (mrange = 0; mrange < nranges; mrange++) {
			color_cache[r][mrange] = kmem_zalloc(sizeof (size_t) *
			    colors_per_szc[r], KM_NOSLEEP);
			if (color_cache[r][mrange] == NULL) {
				rc = ENOMEM;
				goto cleanup;
			}
		}
	}

	/*
	 * Preallocate all of the new pcc_info_t arrays as we can't
	 * hold the page_ctrs_rwlock as a writer and allocate memory.
	 * If we can't allocate all of the arrays, undo our work so far
	 * and return failure.
	 */
	for (r = 1; r < mmu_page_sizes; r++) {
		for (i = 0; i < NPC_MUTEX; i++) {
			pi = kmem_zalloc(nranges * sizeof (pcc_info_t),
			    KM_NOSLEEP);
			if (pi == NULL) {
				rc = ENOMEM;
				goto cleanup;
			}
			cands_cache[i * MMU_PAGE_SIZES + r] = pi;

			for (mrange = 0; mrange < nranges; mrange++, pi++) {
				pgcntp = kmem_zalloc(colors_per_szc[r] *
				    sizeof (pgcnt_t), KM_NOSLEEP);
				if (pgcntp == NULL) {
					rc = ENOMEM;
					goto cleanup;
				}
				pi->pcc_color_free = pgcntp;
			}
		}
	}

	/*
	 * Grab the write lock to prevent others from walking these arrays
	 * while we are modifying them.
	 */
	rw_enter(&page_ctrs_rwlock[mnode], RW_WRITER);
	page_freelist_lock(mnode);

	old_nranges = mnode_nranges[mnode];
	cands_cache_nranges = old_nranges;
	mnode_nranges[mnode] = nranges;
	old_maxmrange = mnode_maxmrange[mnode];
	mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode);
	new_maxmrange = mnode_maxmrange[mnode];

	for (r = 1; r < mmu_page_sizes; r++) {
		PAGE_COUNTERS_SHIFT(mnode, r) = PAGE_BSZS_SHIFT(r);
		old_ctr = PAGE_COUNTERS_COUNTERS(mnode, r);
		old_csz = PAGE_COUNTERS_ENTRIES(mnode, r);
		oldbase = PAGE_COUNTERS_BASE(mnode, r);
		old_npgs = old_csz << PAGE_COUNTERS_SHIFT(mnode, r);
		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
			old_color_array[mrange] =
			    PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode,
			    r, mrange);
		}

		pcsz = npgs >> PAGE_COUNTERS_SHIFT(mnode, r);
		new_ctr = ctr_cache[r];
		ctr_cache[r] = NULL;
		if (old_ctr != NULL &&
		    (oldbase + old_npgs > newbase) &&
		    (newbase + npgs > oldbase)) {
			/*
			 * Map the intersection of the old and new
			 * counters into the new array.
			 */
			size_t offset;
			if (newbase > oldbase) {
				offset = (newbase - oldbase) >>
				    PAGE_COUNTERS_SHIFT(mnode, r);
				bcopy(old_ctr + offset, new_ctr,
				    MIN(pcsz, (old_csz - offset)) *
				    sizeof (hpmctr_t));
			} else {
				offset = (oldbase - newbase) >>
				    PAGE_COUNTERS_SHIFT(mnode, r);
				bcopy(old_ctr, new_ctr + offset,
				    MIN(pcsz - offset, old_csz) *
				    sizeof (hpmctr_t));
			}
		}

		PAGE_COUNTERS_COUNTERS(mnode, r) = new_ctr;
		PAGE_COUNTERS_ENTRIES(mnode, r) = pcsz;
		PAGE_COUNTERS_BASE(mnode, r) = newbase;
		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
			PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r, mrange) =
			    color_cache[r][mrange];
			color_cache[r][mrange] = NULL;
		}
		/*
		 * for now, just reset on these events as it's probably
		 * not worthwhile to try and optimize this.
		 */
		for (i = 0; i < colors_per_szc[r]; i++) {
			uint_t color_mask = colors_per_szc[r] - 1;
			pfn_t  pfnum = newbase;
			size_t idx;

			PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i, color_mask,
			    color_mask);
			idx = PNUM_TO_IDX(mnode, r, pfnum);
			idx = (idx < pcsz) ? idx : 0;
			for (mrange = 0; mrange < nranges; mrange++) {
				PAGE_COUNTERS_CURRENT_COLOR(mnode,
				    r, i, mrange) = idx;
			}
		}

		/* cache info for freeing out of the critical path */
		if ((caddr_t)old_ctr >= kernelheap &&
		    (caddr_t)old_ctr < ekernelheap) {
			ctr_cache[r] = old_ctr;
			size_cache[r] = old_csz;
		}
		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
			size_t *tmp = old_color_array[mrange];
			if ((caddr_t)tmp >= kernelheap &&
			    (caddr_t)tmp < ekernelheap) {
				color_cache[r][mrange] = tmp;
			}
		}
		/*
		 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
		 * satisfy the identity requirement.
		 * We should be able to go from one to the other
		 * and get consistent values.
		 */
		ASSERT(PNUM_TO_IDX(mnode, r,
		    (IDX_TO_PNUM(mnode, r, 0))) == 0);
		ASSERT(IDX_TO_PNUM(mnode, r,
		    (PNUM_TO_IDX(mnode, r, newbase))) == newbase);

		/* pcc_info_t and pcc_color_free */
		for (i = 0; i < NPC_MUTEX; i++) {
			pcc_info_t *epi;
			pcc_info_t *eold_pi;

			pi = cands_cache[i * MMU_PAGE_SIZES + r];
			old_pi = page_ctrs_cands[i][r][mnode];
			page_ctrs_cands[i][r][mnode] = pi;
			cands_cache[i * MMU_PAGE_SIZES + r] = old_pi;

			/* preserve old pcc_color_free values, if any */
			if (old_pi == NULL)
				continue;

			/*
			 * when/if x86 does DR, must account for
			 * possible change in range index when
			 * preserving pcc_info
			 */
			epi = &pi[nranges];
			eold_pi = &old_pi[old_nranges];
			if (new_maxmrange > old_maxmrange) {
				pi += new_maxmrange - old_maxmrange;
			} else if (new_maxmrange < old_maxmrange) {
				old_pi += old_maxmrange - new_maxmrange;
			}
			for (; pi < epi && old_pi < eold_pi; pi++, old_pi++) {
				pcc_info_t tmp = *pi;
				*pi = *old_pi;
				*old_pi = tmp;
			}
		}
	}
	page_freelist_unlock(mnode);
	rw_exit(&page_ctrs_rwlock[mnode]);

	/*
	 * Now that we have dropped the write lock, it is safe to free all
	 * of the memory we have cached above.
	 * We come through here to free memory when pre-alloc fails, and also
	 * to free old pointers which were recorded while locked.
	 */
cleanup:
	for (r = 1; r < mmu_page_sizes; r++) {
		if (ctr_cache[r] != NULL) {
			kmem_free(ctr_cache[r],
			    size_cache[r] * sizeof (hpmctr_t));
		}
		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
			if (color_cache[r][mrange] != NULL) {
				kmem_free(color_cache[r][mrange],
				    colors_per_szc[r] * sizeof (size_t));
			}
		}
		for (i = 0; i < NPC_MUTEX; i++) {
			pi = cands_cache[i * MMU_PAGE_SIZES + r];
			if (pi == NULL)
				continue;
			nr = cands_cache_nranges;
			for (mrange = 0; mrange < nr; mrange++, pi++) {
				pgcntp = pi->pcc_color_free;
				if (pgcntp == NULL)
					continue;
				if ((caddr_t)pgcntp >= kernelheap &&
				    (caddr_t)pgcntp < ekernelheap) {
					kmem_free(pgcntp,
					    colors_per_szc[r] *
					    sizeof (pgcnt_t));
				}
			}
			pi = cands_cache[i * MMU_PAGE_SIZES + r];
			if ((caddr_t)pi >= kernelheap &&
			    (caddr_t)pi < ekernelheap) {
				kmem_free(pi, nr * sizeof (pcc_info_t));
			}
		}
	}

	kmem_free(cands_cache,
	    sizeof (pcc_info_t *) * NPC_MUTEX * MMU_PAGE_SIZES);
	return (rc);
}


#ifdef DEBUG

/*
 * confirm pp is a large page corresponding to szc
 */
void
chk_lpg(page_t *pp, uchar_t szc)
{
	spgcnt_t npgs = page_get_pagecnt(pp->p_szc);
	uint_t noreloc;

	if (npgs == 1) {
		ASSERT(pp->p_szc == 0);
		ASSERT(pp->p_next == pp);
		ASSERT(pp->p_prev == pp);
		return;
	}

	ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL);
	ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL);

	ASSERT(IS_P2ALIGNED(pp->p_pagenum, npgs));
	ASSERT(pp->p_pagenum == (pp->p_next->p_pagenum - 1));
	ASSERT(pp->p_prev->p_pagenum == (pp->p_pagenum + (npgs - 1)));
	ASSERT(pp->p_prev == (pp + (npgs - 1)));

	/*
	 * Check list of pages.
	 */
	noreloc = PP_ISNORELOC(pp);
	while (npgs--) {
		if (npgs != 0) {
			ASSERT(pp->p_pagenum == pp->p_next->p_pagenum - 1);
			ASSERT(pp->p_next == (pp + 1));
		}
		ASSERT(pp->p_szc == szc);
		ASSERT(PP_ISFREE(pp));
		ASSERT(PP_ISAGED(pp));
		ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL);
		ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL);
		ASSERT(pp->p_vnode == NULL);
		ASSERT(PP_ISNORELOC(pp) == noreloc);

		pp = pp->p_next;
	}
}
#endif /* DEBUG */

void
page_freelist_lock(int mnode)
{
	int i;
	for (i = 0; i < NPC_MUTEX; i++) {
		mutex_enter(FPC_MUTEX(mnode, i));
		mutex_enter(CPC_MUTEX(mnode, i));
	}
}

void
page_freelist_unlock(int mnode)
{
	int i;
	for (i = 0; i < NPC_MUTEX; i++) {
		mutex_exit(FPC_MUTEX(mnode, i));
		mutex_exit(CPC_MUTEX(mnode, i));
	}
}

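/*
 * Note that both routines above visit the FPC/CPC mutexes in the same
 * fixed order (index 0 .. NPC_MUTEX - 1, freelist before cachelist at
 * each index), so threads serializing on the entire freelist this way
 * cannot deadlock against one another.
 */
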
/*
 * add pp to the specified page list. Defaults to head of the page list
 * unless PG_LIST_TAIL is specified.
 */
void
page_list_add(page_t *pp, int flags)
{
	page_t		**ppp;
	kmutex_t	*pcm;
	uint_t		bin, mtype;
	int		mnode;

	ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
	ASSERT(PP_ISFREE(pp));
	ASSERT(!hat_page_is_mapped(pp));
	ASSERT(hat_page_getshare(pp) == 0);

	/*
	 * Large pages should be freed via page_list_add_pages().
	 */
	ASSERT(pp->p_szc == 0);

	/*
	 * Don't need to lock the freelist first here
	 * because the page isn't on the freelist yet.
	 * This means p_szc can't change on us.
	 */

	bin = PP_2_BIN(pp);
	mnode = PP_2_MEM_NODE(pp);
	mtype = PP_2_MTYPE(pp);

	if (flags & PG_LIST_ISINIT) {
		/*
		 * PG_LIST_ISINIT is set during system startup (i.e. single
		 * threaded); add a page to the free list and add to the
		 * free region counters w/o any locking
		 */
		ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);

		/* inline version of page_add() */
		if (*ppp != NULL) {
			pp->p_next = *ppp;
			pp->p_prev = (*ppp)->p_prev;
			(*ppp)->p_prev = pp;
			pp->p_prev->p_next = pp;
		} else
			*ppp = pp;

		page_ctr_add_internal(mnode, mtype, pp, flags);
		VM_STAT_ADD(vmm_vmstats.pladd_free[0]);
	} else {
		pcm = PC_BIN_MUTEX(mnode, bin, flags);

		if (flags & PG_FREE_LIST) {
			VM_STAT_ADD(vmm_vmstats.pladd_free[0]);
			ASSERT(PP_ISAGED(pp));
			ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);

		} else {
			VM_STAT_ADD(vmm_vmstats.pladd_cache);
			ASSERT(pp->p_vnode);
			ASSERT((pp->p_offset & PAGEOFFSET) == 0);
			ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
		}
		mutex_enter(pcm);
		page_add(ppp, pp);

		if (flags & PG_LIST_TAIL)
			*ppp = (*ppp)->p_next;
		/*
		 * Add counters before releasing pcm mutex to avoid a race with
		 * page_freelist_coalesce and page_freelist_split.
		 */
		page_ctr_add(mnode, mtype, pp, flags);
		mutex_exit(pcm);
	}


#if defined(__sparc)
	if (PP_ISNORELOC(pp)) {
		kcage_freemem_add(1);
	}
#endif
	/*
	 * It is up to the caller to unlock the page!
	 */
	ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
}


#ifdef __sparc
/*
 * This routine is only used by kcage_init during system startup.
 * It performs the function of page_list_sub/PP_SETNORELOC/page_list_add
 * without the overhead of taking locks and updating counters.
 */
void
page_list_noreloc_startup(page_t *pp)
{
	page_t		**ppp;
	uint_t		bin;
	int		mnode;
	int		mtype;
	int		flags = 0;

	/*
	 * If this is a large page on the freelist then
	 * break it up into smaller pages.
	 */
	if (pp->p_szc != 0)
		page_boot_demote(pp);

	/*
	 * Get the list the page is currently on.
	 */
	bin = PP_2_BIN(pp);
	mnode = PP_2_MEM_NODE(pp);
	mtype = PP_2_MTYPE(pp);
	ASSERT(mtype == MTYPE_RELOC);
	ASSERT(pp->p_szc == 0);

	if (PP_ISAGED(pp)) {
		ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
		flags |= PG_FREE_LIST;
	} else {
		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
		flags |= PG_CACHE_LIST;
	}

	ASSERT(*ppp != NULL);

	/*
	 * Delete page from current list.
	 */
	if (*ppp == pp)
		*ppp = pp->p_next;		/* go to next page */
	if (*ppp == pp) {
		*ppp = NULL;			/* page list is gone */
	} else {
		pp->p_prev->p_next = pp->p_next;
		pp->p_next->p_prev = pp->p_prev;
	}

	/*
	 * Decrement page counters
	 */
	page_ctr_sub_internal(mnode, mtype, pp, flags);

	/*
	 * Set no reloc for cage initted pages.
	 */
	PP_SETNORELOC(pp);

	mtype = PP_2_MTYPE(pp);
	ASSERT(mtype == MTYPE_NORELOC);

	/*
	 * Insert page on new list.
	 */
	if (*ppp == NULL) {
		*ppp = pp;
		pp->p_next = pp->p_prev = pp;
	} else {
		pp->p_next = *ppp;
		pp->p_prev = (*ppp)->p_prev;
		(*ppp)->p_prev = pp;
		pp->p_prev->p_next = pp;
	}

	/*
	 * Increment page counters
	 */
	page_ctr_add_internal(mnode, mtype, pp, flags);

	/*
	 * Update cage freemem counter
	 */
	atomic_add_long(&kcage_freemem, 1);
}
#else	/* __sparc */

/* ARGSUSED */
void
page_list_noreloc_startup(page_t *pp)
{
	panic("page_list_noreloc_startup: should be here only for sparc");
}
#endif

void
page_list_add_pages(page_t *pp, int flags)
{
	kmutex_t *pcm;
	pgcnt_t	pgcnt;
	uint_t	bin, mtype, i;
	int	mnode;

	/* default to freelist/head */
	ASSERT((flags & (PG_CACHE_LIST | PG_LIST_TAIL)) == 0);

	CHK_LPG(pp, pp->p_szc);
	VM_STAT_ADD(vmm_vmstats.pladd_free[pp->p_szc]);

	bin = PP_2_BIN(pp);
	mnode = PP_2_MEM_NODE(pp);
	mtype = PP_2_MTYPE(pp);

	if (flags & PG_LIST_ISINIT) {
		ASSERT(pp->p_szc == mmu_page_sizes - 1);
		page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
		ASSERT(!PP_ISNORELOC(pp));
		PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);
	} else {

		ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);

		pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);

		mutex_enter(pcm);
		page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
		page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
		mutex_exit(pcm);

		pgcnt = page_get_pagecnt(pp->p_szc);
#if defined(__sparc)
		if (PP_ISNORELOC(pp))
			kcage_freemem_add(pgcnt);
#endif
		for (i = 0; i < pgcnt; i++, pp++)
			page_unlock_nocapture(pp);
	}
}

/*
 * During boot, need to demote a large page to base
 * pagesize pages for seg_kmem for use in boot_alloc()
 */
void
page_boot_demote(page_t *pp)
{
	ASSERT(pp->p_szc != 0);
	ASSERT(PP_ISFREE(pp));
	ASSERT(PP_ISAGED(pp));

	(void) page_demote(PP_2_MEM_NODE(pp),
	    PFN_BASE(pp->p_pagenum, pp->p_szc), pp->p_szc, 0, PC_NO_COLOR,
	    PC_FREE);

	ASSERT(PP_ISFREE(pp));
	ASSERT(PP_ISAGED(pp));
	ASSERT(pp->p_szc == 0);
}

/*
 * Take a particular page off of whatever freelist the page
 * is claimed to be on.
 *
 * NOTE: Only used for PAGESIZE pages.
 */
void
page_list_sub(page_t *pp, int flags)
{
	int		bin;
	uint_t		mtype;
	int		mnode;
	kmutex_t	*pcm;
	page_t		**ppp;

	ASSERT(PAGE_EXCL(pp));
	ASSERT(PP_ISFREE(pp));

	/*
	 * The p_szc field can only be changed by page_promote()
	 * and page_demote(). Only free pages can be promoted and
	 * demoted and the free list MUST be locked during these
	 * operations. So to prevent a race in page_list_sub()
	 * between computing which bin of the freelist lock to
	 * grab and actually grabbing the lock we check again that
	 * the bin we locked is still the correct one. Notice that
	 * the p_szc field could have actually changed on us but
	 * if the bin happens to still be the same we are safe.
	 */
try_again:
	bin = PP_2_BIN(pp);
	mnode = PP_2_MEM_NODE(pp);
	pcm = PC_BIN_MUTEX(mnode, bin, flags);
	mutex_enter(pcm);
	if (PP_2_BIN(pp) != bin) {
		mutex_exit(pcm);
		goto try_again;
	}
	mtype = PP_2_MTYPE(pp);

	if (flags & PG_FREE_LIST) {
		VM_STAT_ADD(vmm_vmstats.plsub_free[0]);
		ASSERT(PP_ISAGED(pp));
		ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);
	} else {
		VM_STAT_ADD(vmm_vmstats.plsub_cache);
		ASSERT(!PP_ISAGED(pp));
		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
	}

	/*
	 * Common PAGESIZE case.
	 *
	 * Note that we locked the freelist. This prevents
	 * any page promotion/demotion operations. Therefore
	 * the p_szc will not change until we drop pcm mutex.
	 */
	if (pp->p_szc == 0) {
		page_sub(ppp, pp);
		/*
		 * Subtract counters before releasing pcm mutex
		 * to avoid race with page_freelist_coalesce.
		 */
		page_ctr_sub(mnode, mtype, pp, flags);
		mutex_exit(pcm);

#if defined(__sparc)
		if (PP_ISNORELOC(pp)) {
			kcage_freemem_sub(1);
		}
#endif
		return;
	}

	/*
	 * Large pages on the cache list are not supported.
	 */
	if (flags & PG_CACHE_LIST)
		panic("page_list_sub: large page on cachelist");

	/*
	 * Slow but rare.
	 *
	 * Somebody wants this particular page which is part
	 * of a large page. In this case we just demote the page
	 * if it's on the freelist.
	 *
	 * We have to drop pcm before locking the entire freelist.
	 * Once we have re-locked the freelist check to make sure
	 * the page hasn't already been demoted or completely
	 * freed.
	 */
	mutex_exit(pcm);
	page_freelist_lock(mnode);
	if (pp->p_szc != 0) {
		/*
		 * Large page is on freelist.
		 */
		(void) page_demote(mnode, PFN_BASE(pp->p_pagenum, pp->p_szc),
		    pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
	}
	ASSERT(PP_ISFREE(pp));
	ASSERT(PP_ISAGED(pp));
	ASSERT(pp->p_szc == 0);

	/*
	 * Subtract counters before releasing pcm mutex
	 * to avoid race with page_freelist_coalesce.
	 */
	bin = PP_2_BIN(pp);
	mtype = PP_2_MTYPE(pp);
	ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);

	page_sub(ppp, pp);
	page_ctr_sub(mnode, mtype, pp, flags);
	page_freelist_unlock(mnode);

#if defined(__sparc)
	if (PP_ISNORELOC(pp)) {
		kcage_freemem_sub(1);
	}
#endif
}

void
page_list_sub_pages(page_t *pp, uint_t szc)
{
	kmutex_t *pcm;
	uint_t	bin, mtype;
	int	mnode;

	ASSERT(PAGE_EXCL(pp));
	ASSERT(PP_ISFREE(pp));
	ASSERT(PP_ISAGED(pp));

	/*
	 * See comment in page_list_sub().
	 */
try_again:
	bin = PP_2_BIN(pp);
	mnode = PP_2_MEM_NODE(pp);
	pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
	mutex_enter(pcm);
	if (PP_2_BIN(pp) != bin) {
		mutex_exit(pcm);
		goto try_again;
	}

	/*
	 * If we're called with a page larger than szc or it got
	 * promoted above szc before we locked the freelist then
	 * drop pcm and re-lock entire freelist. If page still larger
	 * than szc then demote it.
	 */
	if (pp->p_szc > szc) {
		mutex_exit(pcm);
		pcm = NULL;
		page_freelist_lock(mnode);
		if (pp->p_szc > szc) {
			VM_STAT_ADD(vmm_vmstats.plsubpages_szcbig);
			(void) page_demote(mnode,
			    PFN_BASE(pp->p_pagenum, pp->p_szc),
			    pp->p_szc, szc, PC_NO_COLOR, PC_FREE);
		}
		bin = PP_2_BIN(pp);
	}
	ASSERT(PP_ISFREE(pp));
	ASSERT(PP_ISAGED(pp));
	ASSERT(pp->p_szc <= szc);
	ASSERT(pp == PP_PAGEROOT(pp));

	VM_STAT_ADD(vmm_vmstats.plsub_free[pp->p_szc]);

	mtype = PP_2_MTYPE(pp);
	if (pp->p_szc != 0) {
		page_vpsub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
		CHK_LPG(pp, pp->p_szc);
	} else {
		VM_STAT_ADD(vmm_vmstats.plsubpages_szc0);
		page_sub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
	}
	page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);

	if (pcm != NULL) {
		mutex_exit(pcm);
	} else {
		page_freelist_unlock(mnode);
	}

#if defined(__sparc)
	if (PP_ISNORELOC(pp)) {
		pgcnt_t	pgcnt;

		pgcnt = page_get_pagecnt(pp->p_szc);
		kcage_freemem_sub(pgcnt);
	}
#endif
}

/*
 * Add the page to the front of a linked list of pages
 * using the p_next & p_prev pointers for the list.
 * The caller is responsible for protecting the list pointers.
 */
void
mach_page_add(page_t **ppp, page_t *pp)
{
	if (*ppp == NULL) {
		pp->p_next = pp->p_prev = pp;
	} else {
		pp->p_next = *ppp;
		pp->p_prev = (*ppp)->p_prev;
		(*ppp)->p_prev = pp;
		pp->p_prev->p_next = pp;
	}
	*ppp = pp;
}

/*
 * Remove this page from a linked list of pages
 * using the p_next & p_prev pointers for the list.
 *
 * The caller is responsible for protecting the list pointers.
 */
void
mach_page_sub(page_t **ppp, page_t *pp)
{
	ASSERT(PP_ISFREE(pp));

	if (*ppp == NULL || pp == NULL)
		panic("mach_page_sub");

	if (*ppp == pp)
		*ppp = pp->p_next;		/* go to next page */

	if (*ppp == pp)
		*ppp = NULL;			/* page list is gone */
	else {
		pp->p_prev->p_next = pp->p_next;
		pp->p_next->p_prev = pp->p_prev;
	}
	pp->p_prev = pp->p_next = pp;		/* make pp a list of one */
}

/*
 * Routine fsflush uses to gradually coalesce the free list into larger pages.
 */
void
page_promote_size(page_t *pp, uint_t cur_szc)
{
	pfn_t pfn;
	int mnode;
	int idx;
	int new_szc = cur_szc + 1;
	int full = FULL_REGION_CNT(new_szc);

	pfn = page_pptonum(pp);
	mnode = PFN_2_MEM_NODE(pfn);

	page_freelist_lock(mnode);

	idx = PNUM_TO_IDX(mnode, new_szc, pfn);
	if (PAGE_COUNTERS(mnode, new_szc, idx) == full)
		(void) page_promote(mnode, pfn, new_szc, PC_FREE, PC_MTYPE_ANY);

	page_freelist_unlock(mnode);
}

static uint_t page_promote_err;
static uint_t page_promote_noreloc_err;

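/*
 * Note that page_promote_size() above only attempts a promotion when the
 * region counter already reads FULL_REGION_CNT(new_szc), i.e. when the
 * page counters indicate that every constituent piece of the candidate
 * large page is free, so fsflush's gradual coalescing avoids walking
 * freelists for regions that cannot yet be promoted.
 */
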
/*
 * Create a single larger page (of szc new_szc) from smaller contiguous pages
 * for the given mnode starting at pfnum. Pages involved are on the freelist
 * before the call and may be returned to the caller if requested, otherwise
 * they will be placed back on the freelist.
 * If flags is PC_ALLOC, then the large page will be returned to the user in
 * a state which is consistent with a page being taken off the freelist.  If
 * we failed to lock the new large page, then we will return NULL to the
 * caller and put the large page on the freelist instead.
 * If flags is PC_FREE, then the large page will be placed on the freelist,
 * and NULL will be returned.
 * The caller is responsible for locking the freelist as well as any other
 * accounting which needs to be done for a returned page.
 *
 * RFE: For performance pass in pp instead of pfnum so
 *	we can avoid excessive calls to page_numtopp_nolock().
 *	This would depend on an assumption that all contiguous
 *	pages are in the same memseg so we can just add/dec
 *	our pp.
 *
 * Lock ordering:
 *
 *	There is a potential but rare deadlock situation
 *	for page promotion and demotion operations. The problem
 *	is there are two paths into the freelist manager and
 *	they have different lock orders:
 *
 *	page_create()
 *		lock freelist
 *		page_lock(EXCL)
 *		unlock freelist
 *		return
 *		caller drops page_lock
 *
 *	page_free() and page_reclaim()
 *		caller grabs page_lock(EXCL)
 *
 *		lock freelist
 *		unlock freelist
 *		drop page_lock
 *
 *	What prevents a thread in page_create() from deadlocking
 *	with a thread freeing or reclaiming the same page is the
 *	page_trylock() in page_get_freelist(). If the trylock fails
 *	it skips the page.
 *
 *	The lock ordering for promotion and demotion is the same as
 *	for page_create(). Since the same deadlock could occur during
 *	page promotion and freeing or reclaiming of a page on the
 *	cache list we might have to fail the operation and undo what
 *	we have done so far. Again this is rare.
 */
page_t *
page_promote(int mnode, pfn_t pfnum, uchar_t new_szc, int flags, int mtype)
{
	page_t		*pp, *pplist, *tpp, *start_pp;
	pgcnt_t		new_npgs, npgs;
	uint_t		bin;
	pgcnt_t		tmpnpgs, pages_left;
	uint_t		noreloc;
	int		which_list;
	ulong_t		index;
	kmutex_t	*phm;

	/*
	 * General algorithm:
	 * Find the starting page
	 * Walk each page struct removing it from the freelist,
	 * and linking it to all the other pages removed.
	 * Once all pages are off the freelist,
	 * walk the list, modifying p_szc to new_szc and whatever
	 * other info needs to be done to create a large free page.
	 * According to the flags, either return the page or put it
	 * on the freelist.
	 */

	start_pp = page_numtopp_nolock(pfnum);
	ASSERT(start_pp && (start_pp->p_pagenum == pfnum));
	new_npgs = page_get_pagecnt(new_szc);
	ASSERT(IS_P2ALIGNED(pfnum, new_npgs));

	/* don't return page of the wrong mtype */
	if (mtype != PC_MTYPE_ANY && mtype != PP_2_MTYPE(start_pp))
		return (NULL);

	/*
	 * Loop through smaller pages to confirm that all pages
	 * give the same result for PP_ISNORELOC().
	 * We can check this reliably here as the protocol for setting
	 * P_NORELOC requires pages to be taken off the free list first.
	 */
	noreloc = PP_ISNORELOC(start_pp);
	for (pp = start_pp + new_npgs; --pp > start_pp; ) {
		if (noreloc != PP_ISNORELOC(pp)) {
			page_promote_noreloc_err++;
			page_promote_err++;
			return (NULL);
		}
	}

	pages_left = new_npgs;
	pplist = NULL;
	pp = start_pp;

	/* Loop around coalescing the smaller pages into a big page. */
	while (pages_left) {
		/*
		 * Remove from the freelist.
		 */
		ASSERT(PP_ISFREE(pp));
		bin = PP_2_BIN(pp);
		ASSERT(mnode == PP_2_MEM_NODE(pp));
		mtype = PP_2_MTYPE(pp);
		if (PP_ISAGED(pp)) {

			/*
			 * PG_FREE_LIST
			 */
			if (pp->p_szc) {
				page_vpsub(&PAGE_FREELISTS(mnode,
				    pp->p_szc, bin, mtype), pp);
			} else {
				mach_page_sub(&PAGE_FREELISTS(mnode, 0,
				    bin, mtype), pp);
			}
			which_list = PG_FREE_LIST;
		} else {
			ASSERT(pp->p_szc == 0);

			/*
			 * PG_CACHE_LIST
			 *
			 * Since this page comes from the
			 * cachelist, we must destroy the
			 * vnode association.
			 */
			if (!page_trylock(pp, SE_EXCL)) {
				goto fail_promote;
			}

			/*
			 * We need to be careful not to deadlock
			 * with another thread in page_lookup().
			 * The page_lookup() thread could be holding
			 * the same phm that we need if the two
			 * pages happen to hash to the same phm lock.
			 * At this point we have locked the entire
			 * freelist and page_lookup() could be trying
			 * to grab a freelist lock.
			 */
			index = PAGE_HASH_FUNC(pp->p_vnode, pp->p_offset);
			phm = PAGE_HASH_MUTEX(index);
			if (!mutex_tryenter(phm)) {
				page_unlock_nocapture(pp);
				goto fail_promote;
			}

			mach_page_sub(&PAGE_CACHELISTS(mnode, bin, mtype), pp);
			page_hashout(pp, phm);
			mutex_exit(phm);
			PP_SETAGED(pp);
			page_unlock_nocapture(pp);
			which_list = PG_CACHE_LIST;
		}
		page_ctr_sub(mnode, mtype, pp, which_list);

		/*
		 * Concatenate the smaller page(s) onto
		 * the large page list.
		 */
		tmpnpgs = npgs = page_get_pagecnt(pp->p_szc);
		pages_left -= npgs;
		tpp = pp;
		while (npgs--) {
			tpp->p_szc = new_szc;
			tpp = tpp->p_next;
		}
		page_list_concat(&pplist, &pp);
		pp += tmpnpgs;
	}
	CHK_LPG(pplist, new_szc);

	/*
	 * return the page to the user if requested
	 * in the properly locked state.
	 */
	if (flags == PC_ALLOC && (page_trylock_cons(pplist, SE_EXCL))) {
		return (pplist);
	}

	/*
	 * Otherwise place the new large page on the freelist
	 */
	bin = PP_2_BIN(pplist);
	mnode = PP_2_MEM_NODE(pplist);
	mtype = PP_2_MTYPE(pplist);
	page_vpadd(&PAGE_FREELISTS(mnode, new_szc, bin, mtype), pplist);

	page_ctr_add(mnode, mtype, pplist, PG_FREE_LIST);
	return (NULL);

fail_promote:
	/*
	 * A thread must have still been freeing or
	 * reclaiming the page on the cachelist.
	 * To prevent a deadlock undo what we have
	 * done so far and return failure. This
	 * situation can only happen while promoting
	 * PAGESIZE pages.
	 */
	page_promote_err++;
	while (pplist) {
		pp = pplist;
		mach_page_sub(&pplist, pp);
		pp->p_szc = 0;
		bin = PP_2_BIN(pp);
		mtype = PP_2_MTYPE(pp);
		mach_page_add(&PAGE_FREELISTS(mnode, 0, bin, mtype), pp);
		page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
	}
	return (NULL);

}

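/*
 * For illustration, a typical "free the pieces" caller of page_demote()
 * below is page_boot_demote() above, which breaks a free large page all
 * the way down to base pages with:
 *
 *	(void) page_demote(PP_2_MEM_NODE(pp),
 *	    PFN_BASE(pp->p_pagenum, pp->p_szc), pp->p_szc, 0,
 *	    PC_NO_COLOR, PC_FREE);
 *
 * i.e. no particular color is requested and nothing is returned locked.
 */
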
2016 * If flags is not PC_ALLOC, the color argument is ignored, and thus 2017 * technically, any value may be passed in but PC_NO_COLOR is the standard 2018 * which should be followed for clarity's sake. 2019 */ 2020 page_t * 2021 page_demote(int mnode, pfn_t pfnum, uchar_t cur_szc, uchar_t new_szc, 2022 int color, int flags) 2023 { 2024 page_t *pp, *pplist, *npplist; 2025 pgcnt_t npgs, n; 2026 uint_t bin; 2027 uint_t mtype; 2028 page_t *ret_pp = NULL; 2029 2030 ASSERT(cur_szc != 0); 2031 ASSERT(new_szc < cur_szc); 2032 2033 pplist = page_numtopp_nolock(pfnum); 2034 ASSERT(pplist != NULL); 2035 2036 ASSERT(pplist->p_szc == cur_szc); 2037 2038 bin = PP_2_BIN(pplist); 2039 ASSERT(mnode == PP_2_MEM_NODE(pplist)); 2040 mtype = PP_2_MTYPE(pplist); 2041 page_vpsub(&PAGE_FREELISTS(mnode, cur_szc, bin, mtype), pplist); 2042 2043 CHK_LPG(pplist, cur_szc); 2044 page_ctr_sub(mnode, mtype, pplist, PG_FREE_LIST); 2045 2046 /* 2047 * Number of PAGESIZE pages for smaller new_szc 2048 * page. 2049 */ 2050 npgs = page_get_pagecnt(new_szc); 2051 2052 while (pplist) { 2053 pp = pplist; 2054 2055 ASSERT(pp->p_szc == cur_szc); 2056 2057 /* 2058 * We either break it up into PAGESIZE pages or larger. 2059 */ 2060 if (npgs == 1) { /* PAGESIZE case */ 2061 mach_page_sub(&pplist, pp); 2062 ASSERT(pp->p_szc == cur_szc); 2063 ASSERT(new_szc == 0); 2064 ASSERT(mnode == PP_2_MEM_NODE(pp)); 2065 pp->p_szc = new_szc; 2066 bin = PP_2_BIN(pp); 2067 if ((bin == color) && (flags == PC_ALLOC) && 2068 (ret_pp == NULL) && 2069 page_trylock_cons(pp, SE_EXCL)) { 2070 ret_pp = pp; 2071 } else { 2072 mtype = PP_2_MTYPE(pp); 2073 mach_page_add(&PAGE_FREELISTS(mnode, 0, bin, 2074 mtype), pp); 2075 page_ctr_add(mnode, mtype, pp, PG_FREE_LIST); 2076 } 2077 } else { 2078 2079 /* 2080 * Break down into smaller lists of pages. 2081 */ 2082 page_list_break(&pplist, &npplist, npgs); 2083 2084 pp = pplist; 2085 n = npgs; 2086 while (n--) { 2087 ASSERT(pp->p_szc == cur_szc); 2088 pp->p_szc = new_szc; 2089 pp = pp->p_next; 2090 } 2091 2092 CHK_LPG(pplist, new_szc); 2093 2094 bin = PP_2_BIN(pplist); 2095 ASSERT(mnode == PP_2_MEM_NODE(pp)); 2096 if ((bin == color) && (flags == PC_ALLOC) && 2097 (ret_pp == NULL) && 2098 page_trylock_cons(pp, SE_EXCL)) { 2099 ret_pp = pp; 2100 } else { 2101 mtype = PP_2_MTYPE(pp); 2102 page_vpadd(&PAGE_FREELISTS(mnode, new_szc, 2103 bin, mtype), pplist); 2104 2105 page_ctr_add(mnode, mtype, pplist, 2106 PG_FREE_LIST); 2107 } 2108 pplist = npplist; 2109 } 2110 } 2111 return (ret_pp); 2112 } 2113 2114 int mpss_coalesce_disable = 0; 2115 2116 /* 2117 * Coalesce free pages into a page of the given szc and color if possible. 2118 * Return the pointer to the page created, otherwise, return NULL. 2119 * 2120 * If pfnhi is non-zero, search for large page with pfn range less than pfnhi. 
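 * The search below walks the page_counters for region size szc starting
 * near a cursor remembered from earlier searches, looking for a fully
 * free region of an equivalent color; when one is found the freelist is
 * locked, the counter is re-checked, and page_promote() builds the large
 * page. The scan gives up once all candidates reported by
 * page_ctrs_cands have been visited or it has cycled back to its
 * starting index.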
2121 */ 2122 page_t * 2123 page_freelist_coalesce(int mnode, uchar_t szc, uint_t color, uint_t ceq_mask, 2124 int mtype, pfn_t pfnhi) 2125 { 2126 int r = szc; /* region size */ 2127 int mrange; 2128 uint_t full, bin, color_mask, wrap = 0; 2129 pfn_t pfnum, lo, hi; 2130 size_t len, idx, idx0; 2131 pgcnt_t cands = 0, szcpgcnt = page_get_pagecnt(szc); 2132 page_t *ret_pp; 2133 #if defined(__sparc) 2134 pfn_t pfnum0, nlo, nhi; 2135 #endif 2136 2137 if (mpss_coalesce_disable) { 2138 ASSERT(szc < MMU_PAGE_SIZES); 2139 VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[szc][0]); 2140 return (NULL); 2141 } 2142 2143 ASSERT(szc < mmu_page_sizes); 2144 color_mask = PAGE_GET_PAGECOLORS(szc) - 1; 2145 ASSERT(ceq_mask <= color_mask); 2146 ASSERT(color <= color_mask); 2147 color &= ceq_mask; 2148 2149 /* Prevent page_counters dynamic memory from being freed */ 2150 rw_enter(&page_ctrs_rwlock[mnode], RW_READER); 2151 2152 mrange = MTYPE_2_MRANGE(mnode, mtype); 2153 ASSERT(mrange < mnode_nranges[mnode]); 2154 VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[r][mrange]); 2155 2156 /* get pfn range for mtype */ 2157 len = PAGE_COUNTERS_ENTRIES(mnode, r); 2158 #if defined(__sparc) 2159 lo = PAGE_COUNTERS_BASE(mnode, r); 2160 hi = IDX_TO_PNUM(mnode, r, len); 2161 #else 2162 MNODETYPE_2_PFN(mnode, mtype, lo, hi); 2163 hi++; 2164 #endif 2165 2166 /* use lower limit if given */ 2167 if (pfnhi != PFNNULL && pfnhi < hi) 2168 hi = pfnhi; 2169 2170 /* round to szcpgcnt boundaries */ 2171 lo = P2ROUNDUP(lo, szcpgcnt); 2172 hi = hi & ~(szcpgcnt - 1); 2173 2174 /* set lo to the closest pfn of the right color */ 2175 if ((PFN_2_COLOR(lo, szc) ^ color) & ceq_mask) { 2176 PAGE_NEXT_PFN_FOR_COLOR(lo, szc, color, ceq_mask, color_mask); 2177 } 2178 2179 if (hi <= lo) { 2180 rw_exit(&page_ctrs_rwlock[mnode]); 2181 return (NULL); 2182 } 2183 2184 full = FULL_REGION_CNT(r); 2185 2186 /* calculate the number of page candidates and initial search index */ 2187 bin = color; 2188 idx0 = (size_t)(-1); 2189 do { 2190 pgcnt_t acand; 2191 2192 PGCTRS_CANDS_GETVALUECOLOR(mnode, mrange, r, bin, acand); 2193 if (acand) { 2194 idx = PAGE_COUNTERS_CURRENT_COLOR(mnode, 2195 r, bin, mrange); 2196 idx0 = MIN(idx0, idx); 2197 cands += acand; 2198 } 2199 bin = ADD_MASKED(bin, 1, ceq_mask, color_mask); 2200 } while (bin != color); 2201 2202 if (cands == 0) { 2203 VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip[r][mrange]); 2204 rw_exit(&page_ctrs_rwlock[mnode]); 2205 return (NULL); 2206 } 2207 2208 pfnum = IDX_TO_PNUM(mnode, r, idx0); 2209 if (pfnum < lo || pfnum >= hi) { 2210 pfnum = lo; 2211 } else if ((PFN_2_COLOR(pfnum, szc) ^ color) & ceq_mask) { 2212 /* pfnum has invalid color get the closest correct pfn */ 2213 PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask, 2214 color_mask); 2215 pfnum = (pfnum >= hi) ? lo : pfnum; 2216 } 2217 2218 /* set starting index */ 2219 idx0 = PNUM_TO_IDX(mnode, r, pfnum); 2220 ASSERT(idx0 < len); 2221 2222 #if defined(__sparc) 2223 pfnum0 = pfnum; /* page corresponding to idx0 */ 2224 nhi = 0; /* search kcage ranges */ 2225 #endif 2226 2227 for (idx = idx0; wrap == 0 || (idx < idx0 && wrap < 2); ) { 2228 2229 #if defined(__sparc) 2230 /* 2231 * Find lowest intersection of kcage ranges and mnode. 2232 * MTYPE_NORELOC means look in the cage, otherwise outside. 2233 */ 2234 if (nhi <= pfnum) { 2235 if (kcage_next_range(mtype == MTYPE_NORELOC, pfnum, 2236 (wrap == 0 ? 
hi : pfnum0), &nlo, &nhi)) 2237 goto wrapit; 2238 2239 /* jump to the next page in the range */ 2240 if (pfnum < nlo) { 2241 pfnum = P2ROUNDUP(nlo, szcpgcnt); 2242 idx = PNUM_TO_IDX(mnode, r, pfnum); 2243 if (idx >= len || pfnum >= hi) 2244 goto wrapit; 2245 if ((PFN_2_COLOR(pfnum, szc) ^ color) & 2246 ceq_mask) 2247 goto next; 2248 } 2249 } 2250 #endif 2251 2252 if (PAGE_COUNTERS(mnode, r, idx) != full) 2253 goto next; 2254 2255 /* 2256 * RFE: For performance maybe we can do something less 2257 * brutal than locking the entire freelist. So far 2258 * this doesn't seem to be a performance problem? 2259 */ 2260 page_freelist_lock(mnode); 2261 if (PAGE_COUNTERS(mnode, r, idx) == full) { 2262 ret_pp = 2263 page_promote(mnode, pfnum, r, PC_ALLOC, mtype); 2264 if (ret_pp != NULL) { 2265 VM_STAT_ADD(vmm_vmstats.pfc_coalok[r][mrange]); 2266 PAGE_COUNTERS_CURRENT_COLOR(mnode, r, 2267 PFN_2_COLOR(pfnum, szc), mrange) = idx; 2268 page_freelist_unlock(mnode); 2269 rw_exit(&page_ctrs_rwlock[mnode]); 2270 #if defined(__sparc) 2271 if (PP_ISNORELOC(ret_pp)) { 2272 pgcnt_t npgs; 2273 2274 npgs = page_get_pagecnt(ret_pp->p_szc); 2275 kcage_freemem_sub(npgs); 2276 } 2277 #endif 2278 return (ret_pp); 2279 } 2280 } else { 2281 VM_STAT_ADD(vmm_vmstats.page_ctrs_changed[r][mrange]); 2282 } 2283 2284 page_freelist_unlock(mnode); 2285 /* 2286 * No point looking for another page if we've 2287 * already tried all of the ones that 2288 * page_ctr_cands indicated. Stash off where we left 2289 * off. 2290 * Note: this is not exact since we don't hold the 2291 * page_freelist_locks before we initially get the 2292 * value of cands for performance reasons, but should 2293 * be a decent approximation. 2294 */ 2295 if (--cands == 0) { 2296 PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color, mrange) = 2297 idx; 2298 break; 2299 } 2300 next: 2301 PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask, 2302 color_mask); 2303 idx = PNUM_TO_IDX(mnode, r, pfnum); 2304 if (idx >= len || pfnum >= hi) { 2305 wrapit: 2306 pfnum = lo; 2307 idx = PNUM_TO_IDX(mnode, r, pfnum); 2308 wrap++; 2309 #if defined(__sparc) 2310 nhi = 0; /* search kcage ranges */ 2311 #endif 2312 } 2313 } 2314 2315 rw_exit(&page_ctrs_rwlock[mnode]); 2316 VM_STAT_ADD(vmm_vmstats.page_ctrs_failed[r][mrange]); 2317 return (NULL); 2318 } 2319 2320 /* 2321 * For the given mnode, promote as many small pages to large pages as possible. 2322 */ 2323 void 2324 page_freelist_coalesce_all(int mnode) 2325 { 2326 int r; /* region size */ 2327 int idx, full; 2328 pfn_t pfnum; 2329 size_t len; 2330 2331 VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce_all); 2332 2333 if (mpss_coalesce_disable) { 2334 return; 2335 } 2336 2337 /* 2338 * Lock the entire freelist and coalesce what we can. 2339 * 2340 * Always promote to the largest page possible 2341 * first to reduce the number of page promotions. 
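 * That is, region sizes are walked from mmu_page_sizes - 1 down to 1,
 * and a size is skipped entirely when page_ctrs_cands reports no
 * candidate regions in any mrange of this mnode.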
2342 */ 2343 rw_enter(&page_ctrs_rwlock[mnode], RW_READER); 2344 page_freelist_lock(mnode); 2345 for (r = mmu_page_sizes - 1; r > 0; r--) { 2346 pgcnt_t cands = 0; 2347 int mrange, nranges = mnode_nranges[mnode]; 2348 2349 for (mrange = 0; mrange < nranges; mrange++) { 2350 PGCTRS_CANDS_GETVALUE(mnode, mrange, r, cands); 2351 if (cands != 0) 2352 break; 2353 } 2354 if (cands == 0) { 2355 VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip_all); 2356 continue; 2357 } 2358 2359 full = FULL_REGION_CNT(r); 2360 len = PAGE_COUNTERS_ENTRIES(mnode, r); 2361 2362 for (idx = 0; idx < len; idx++) { 2363 if (PAGE_COUNTERS(mnode, r, idx) == full) { 2364 pfnum = IDX_TO_PNUM(mnode, r, idx); 2365 ASSERT(pfnum >= 2366 mem_node_config[mnode].physbase && 2367 pfnum < 2368 mem_node_config[mnode].physmax); 2369 (void) page_promote(mnode, 2370 pfnum, r, PC_FREE, PC_MTYPE_ANY); 2371 } 2372 } 2373 } 2374 page_freelist_unlock(mnode); 2375 rw_exit(&page_ctrs_rwlock[mnode]); 2376 } 2377 2378 /* 2379 * This is where all polices for moving pages around 2380 * to different page size free lists is implemented. 2381 * Returns 1 on success, 0 on failure. 2382 * 2383 * So far these are the priorities for this algorithm in descending 2384 * order: 2385 * 2386 * 1) When servicing a request try to do so with a free page 2387 * from next size up. Helps defer fragmentation as long 2388 * as possible. 2389 * 2390 * 2) Page coalesce on demand. Only when a freelist 2391 * larger than PAGESIZE is empty and step 1 2392 * will not work since all larger size lists are 2393 * also empty. 2394 * 2395 * If pfnhi is non-zero, search for large page with pfn range less than pfnhi. 2396 */ 2397 2398 page_t * 2399 page_freelist_split(uchar_t szc, uint_t color, int mnode, int mtype, 2400 pfn_t pfnhi, page_list_walker_t *plw) 2401 { 2402 uchar_t nszc = szc + 1; 2403 uint_t bin, sbin, bin_prev; 2404 page_t *pp, *firstpp; 2405 page_t *ret_pp = NULL; 2406 uint_t color_mask; 2407 2408 if (nszc == mmu_page_sizes) 2409 return (NULL); 2410 2411 ASSERT(nszc < mmu_page_sizes); 2412 color_mask = PAGE_GET_PAGECOLORS(nszc) - 1; 2413 bin = sbin = PAGE_GET_NSZ_COLOR(szc, color); 2414 bin_prev = (plw->plw_bin_split_prev == color) ? INVALID_COLOR : 2415 PAGE_GET_NSZ_COLOR(szc, plw->plw_bin_split_prev); 2416 2417 VM_STAT_ADD(vmm_vmstats.pfs_req[szc]); 2418 /* 2419 * First try to break up a larger page to fill current size freelist. 2420 */ 2421 while (plw->plw_bins[nszc] != 0) { 2422 2423 ASSERT(nszc < mmu_page_sizes); 2424 2425 /* 2426 * If page found then demote it. 2427 */ 2428 if (PAGE_FREELISTS(mnode, nszc, bin, mtype)) { 2429 page_freelist_lock(mnode); 2430 firstpp = pp = PAGE_FREELISTS(mnode, nszc, bin, mtype); 2431 2432 /* 2433 * If pfnhi is not PFNNULL, look for large page below 2434 * pfnhi. PFNNULL signifies no pfn requirement. 
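 * The bin's freelist is circular on p_vpnext, so the walk below gives up
 * when it comes back around to firstpp without finding a page below
 * pfnhi.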
2435 */ 2436 if (pfnhi != PFNNULL && pp->p_pagenum >= pfnhi) { 2437 do { 2438 pp = pp->p_vpnext; 2439 if (pp == firstpp) { 2440 pp = NULL; 2441 break; 2442 } 2443 } while (pp->p_pagenum >= pfnhi); 2444 } 2445 if (pp) { 2446 uint_t ccolor = page_correct_color(szc, nszc, 2447 color, bin, plw->plw_ceq_mask[szc]); 2448 2449 ASSERT(pp->p_szc == nszc); 2450 VM_STAT_ADD(vmm_vmstats.pfs_demote[nszc]); 2451 ret_pp = page_demote(mnode, pp->p_pagenum, 2452 pp->p_szc, szc, ccolor, PC_ALLOC); 2453 if (ret_pp) { 2454 page_freelist_unlock(mnode); 2455 #if defined(__sparc) 2456 if (PP_ISNORELOC(ret_pp)) { 2457 pgcnt_t npgs; 2458 2459 npgs = page_get_pagecnt( 2460 ret_pp->p_szc); 2461 kcage_freemem_sub(npgs); 2462 } 2463 #endif 2464 return (ret_pp); 2465 } 2466 } 2467 page_freelist_unlock(mnode); 2468 } 2469 2470 /* loop through next size bins */ 2471 bin = ADD_MASKED(bin, 1, plw->plw_ceq_mask[nszc], color_mask); 2472 plw->plw_bins[nszc]--; 2473 2474 if (bin == sbin) { 2475 uchar_t nnszc = nszc + 1; 2476 2477 /* we are done with this page size - check next */ 2478 if (plw->plw_bins[nnszc] == 0) 2479 /* we have already checked next size bins */ 2480 break; 2481 2482 bin = sbin = PAGE_GET_NSZ_COLOR(nszc, bin); 2483 if (bin_prev != INVALID_COLOR) { 2484 bin_prev = PAGE_GET_NSZ_COLOR(nszc, bin_prev); 2485 if (!((bin ^ bin_prev) & 2486 plw->plw_ceq_mask[nnszc])) 2487 break; 2488 } 2489 ASSERT(nnszc < mmu_page_sizes); 2490 color_mask = PAGE_GET_PAGECOLORS(nnszc) - 1; 2491 nszc = nnszc; 2492 ASSERT(nszc < mmu_page_sizes); 2493 } 2494 } 2495 2496 return (ret_pp); 2497 } 2498 2499 /* 2500 * Helper routine used only by the freelist code to lock 2501 * a page. If the page is a large page then it succeeds in 2502 * locking all the constituent pages or none at all. 2503 * Returns 1 on sucess, 0 on failure. 2504 */ 2505 static int 2506 page_trylock_cons(page_t *pp, se_t se) 2507 { 2508 page_t *tpp, *first_pp = pp; 2509 2510 /* 2511 * Fail if can't lock first or only page. 2512 */ 2513 if (!page_trylock(pp, se)) { 2514 return (0); 2515 } 2516 2517 /* 2518 * PAGESIZE: common case. 2519 */ 2520 if (pp->p_szc == 0) { 2521 return (1); 2522 } 2523 2524 /* 2525 * Large page case. 2526 */ 2527 tpp = pp->p_next; 2528 while (tpp != pp) { 2529 if (!page_trylock(tpp, se)) { 2530 /* 2531 * On failure unlock what we have locked so far. 2532 * We want to avoid attempting to capture these 2533 * pages as the pcm mutex may be held which could 2534 * lead to a recursive mutex panic. 2535 */ 2536 while (first_pp != tpp) { 2537 page_unlock_nocapture(first_pp); 2538 first_pp = first_pp->p_next; 2539 } 2540 return (0); 2541 } 2542 tpp = tpp->p_next; 2543 } 2544 return (1); 2545 } 2546 2547 /* 2548 * init context for walking page lists 2549 * Called when a page of the given szc in unavailable. Sets markers 2550 * for the beginning of the search to detect when search has 2551 * completed a full cycle. Sets flags for splitting larger pages 2552 * and coalescing smaller pages. Page walking procedes until a page 2553 * of the desired equivalent color is found. 2554 */ 2555 void 2556 page_list_walk_init(uchar_t szc, uint_t flags, uint_t bin, int can_split, 2557 int use_ceq, page_list_walker_t *plw) 2558 { 2559 uint_t nszc, ceq_mask, colors; 2560 uchar_t ceq = use_ceq ? 
colorequivszc[szc] : 0; 2561 2562 ASSERT(szc < mmu_page_sizes); 2563 colors = PAGE_GET_PAGECOLORS(szc); 2564 2565 plw->plw_colors = colors; 2566 plw->plw_color_mask = colors - 1; 2567 plw->plw_bin_marker = plw->plw_bin0 = bin; 2568 plw->plw_bin_split_prev = bin; 2569 plw->plw_bin_step = (szc == 0) ? vac_colors : 1; 2570 2571 /* 2572 * if vac aliasing is possible make sure lower order color 2573 * bits are never ignored 2574 */ 2575 if (vac_colors > 1) 2576 ceq &= 0xf0; 2577 2578 /* 2579 * calculate the number of non-equivalent colors and 2580 * color equivalency mask 2581 */ 2582 plw->plw_ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf)); 2583 ASSERT(szc > 0 || plw->plw_ceq_dif >= vac_colors); 2584 ASSERT(plw->plw_ceq_dif > 0); 2585 plw->plw_ceq_mask[szc] = (plw->plw_ceq_dif - 1) << (ceq & 0xf); 2586 2587 if (flags & PG_MATCH_COLOR) { 2588 if (cpu_page_colors < 0) { 2589 /* 2590 * this is a heterogeneous machine with different CPUs 2591 * having different size e$ (not supported for ni2/rock 2592 */ 2593 uint_t cpucolors = CPUSETSIZE() >> PAGE_GET_SHIFT(szc); 2594 cpucolors = MAX(cpucolors, 1); 2595 ceq_mask = plw->plw_color_mask & (cpucolors - 1); 2596 plw->plw_ceq_mask[szc] = 2597 MIN(ceq_mask, plw->plw_ceq_mask[szc]); 2598 } 2599 plw->plw_ceq_dif = 1; 2600 } 2601 2602 /* we can split pages in the freelist, but not the cachelist */ 2603 if (can_split) { 2604 plw->plw_do_split = (szc + 1 < mmu_page_sizes) ? 1 : 0; 2605 2606 /* calculate next sizes color masks and number of free list bins */ 2607 for (nszc = szc + 1; nszc < mmu_page_sizes; nszc++, szc++) { 2608 plw->plw_ceq_mask[nszc] = PAGE_GET_NSZ_MASK(szc, 2609 plw->plw_ceq_mask[szc]); 2610 plw->plw_bins[nszc] = PAGE_GET_PAGECOLORS(nszc); 2611 } 2612 plw->plw_ceq_mask[nszc] = INVALID_MASK; 2613 plw->plw_bins[nszc] = 0; 2614 2615 } else { 2616 ASSERT(szc == 0); 2617 plw->plw_do_split = 0; 2618 plw->plw_bins[1] = 0; 2619 plw->plw_ceq_mask[1] = INVALID_MASK; 2620 } 2621 } 2622 2623 /* 2624 * set mark to flag where next split should occur 2625 */ 2626 #define PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw) { \ 2627 uint_t bin_nsz = PAGE_GET_NSZ_COLOR(szc, bin); \ 2628 uint_t bin0_nsz = PAGE_GET_NSZ_COLOR(szc, plw->plw_bin0); \ 2629 uint_t neq_mask = ~plw->plw_ceq_mask[nszc] & plw->plw_color_mask; \ 2630 plw->plw_split_next = \ 2631 INC_MASKED(bin_nsz, neq_mask, plw->plw_color_mask); \ 2632 if (!((plw->plw_split_next ^ bin0_nsz) & plw->plw_ceq_mask[nszc])) { \ 2633 plw->plw_split_next = \ 2634 INC_MASKED(plw->plw_split_next, \ 2635 neq_mask, plw->plw_color_mask); \ 2636 } \ 2637 } 2638 2639 uint_t 2640 page_list_walk_next_bin(uchar_t szc, uint_t bin, page_list_walker_t *plw) 2641 { 2642 uint_t neq_mask = ~plw->plw_ceq_mask[szc] & plw->plw_color_mask; 2643 uint_t bin0_nsz, nbin_nsz, nbin0, nbin; 2644 uchar_t nszc = szc + 1; 2645 2646 nbin = ADD_MASKED(bin, 2647 plw->plw_bin_step, neq_mask, plw->plw_color_mask); 2648 2649 if (plw->plw_do_split) { 2650 plw->plw_bin_split_prev = bin; 2651 PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw); 2652 plw->plw_do_split = 0; 2653 } 2654 2655 if (szc == 0) { 2656 if (plw->plw_count != 0 || plw->plw_ceq_dif == vac_colors) { 2657 if (nbin == plw->plw_bin0 && 2658 (vac_colors == 1 || nbin != plw->plw_bin_marker)) { 2659 nbin = ADD_MASKED(nbin, plw->plw_bin_step, 2660 neq_mask, plw->plw_color_mask); 2661 plw->plw_bin_split_prev = plw->plw_bin0; 2662 } 2663 2664 if (vac_colors > 1 && nbin == plw->plw_bin_marker) { 2665 plw->plw_bin_marker = 2666 nbin = INC_MASKED(nbin, neq_mask, 2667 plw->plw_color_mask); 2668 
plw->plw_bin_split_prev = plw->plw_bin0; 2669 /* 2670 * large pages all have the same vac color 2671 * so by now we should be done with next 2672 * size page splitting process 2673 */ 2674 ASSERT(plw->plw_bins[1] == 0); 2675 plw->plw_do_split = 0; 2676 return (nbin); 2677 } 2678 2679 } else { 2680 uint_t bin_jump = (vac_colors == 1) ? 2681 (BIN_STEP & ~3) - (plw->plw_bin0 & 3) : BIN_STEP; 2682 2683 bin_jump &= ~(vac_colors - 1); 2684 2685 nbin0 = ADD_MASKED(plw->plw_bin0, bin_jump, neq_mask, 2686 plw->plw_color_mask); 2687 2688 if ((nbin0 ^ plw->plw_bin0) & plw->plw_ceq_mask[szc]) { 2689 2690 plw->plw_bin_marker = nbin = nbin0; 2691 2692 if (plw->plw_bins[nszc] != 0) { 2693 /* 2694 * check if next page size bin is the 2695 * same as the next page size bin for 2696 * bin0 2697 */ 2698 nbin_nsz = PAGE_GET_NSZ_COLOR(szc, 2699 nbin); 2700 bin0_nsz = PAGE_GET_NSZ_COLOR(szc, 2701 plw->plw_bin0); 2702 2703 if ((bin0_nsz ^ nbin_nsz) & 2704 plw->plw_ceq_mask[nszc]) 2705 plw->plw_do_split = 1; 2706 } 2707 return (nbin); 2708 } 2709 } 2710 } 2711 2712 if (plw->plw_bins[nszc] != 0) { 2713 nbin_nsz = PAGE_GET_NSZ_COLOR(szc, nbin); 2714 if (!((plw->plw_split_next ^ nbin_nsz) & 2715 plw->plw_ceq_mask[nszc])) 2716 plw->plw_do_split = 1; 2717 } 2718 2719 return (nbin); 2720 } 2721 2722 page_t * 2723 page_get_mnode_freelist(int mnode, uint_t bin, int mtype, uchar_t szc, 2724 uint_t flags) 2725 { 2726 kmutex_t *pcm; 2727 page_t *pp, *first_pp; 2728 uint_t sbin; 2729 int plw_initialized; 2730 page_list_walker_t plw; 2731 2732 ASSERT(szc < mmu_page_sizes); 2733 2734 VM_STAT_ADD(vmm_vmstats.pgmf_alloc[szc]); 2735 2736 MTYPE_START(mnode, mtype, flags); 2737 if (mtype < 0) { /* mnode does not have memory in mtype range */ 2738 VM_STAT_ADD(vmm_vmstats.pgmf_allocempty[szc]); 2739 return (NULL); 2740 } 2741 try_again: 2742 2743 plw_initialized = 0; 2744 plw.plw_ceq_dif = 1; 2745 2746 /* 2747 * Only hold one freelist lock at a time, that way we 2748 * can start anywhere and not have to worry about lock 2749 * ordering. 2750 */ 2751 for (plw.plw_count = 0; 2752 plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) { 2753 sbin = bin; 2754 do { 2755 if (!PAGE_FREELISTS(mnode, szc, bin, mtype)) 2756 goto bin_empty_1; 2757 2758 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST); 2759 mutex_enter(pcm); 2760 pp = PAGE_FREELISTS(mnode, szc, bin, mtype); 2761 if (pp == NULL) 2762 goto bin_empty_0; 2763 2764 /* 2765 * These were set before the page 2766 * was put on the free list, 2767 * they must still be set. 2768 */ 2769 ASSERT(PP_ISFREE(pp)); 2770 ASSERT(PP_ISAGED(pp)); 2771 ASSERT(pp->p_vnode == NULL); 2772 ASSERT(pp->p_hash == NULL); 2773 ASSERT(pp->p_offset == (u_offset_t)-1); 2774 ASSERT(pp->p_szc == szc); 2775 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode); 2776 2777 /* 2778 * Walk down the hash chain. 2779 * 8k pages are linked on p_next 2780 * and p_prev fields. Large pages 2781 * are a contiguous group of 2782 * constituent pages linked together 2783 * on their p_next and p_prev fields. 2784 * The large pages are linked together 2785 * on the hash chain using p_vpnext 2786 * p_vpprev of the base constituent 2787 * page of each large page. 
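 * Roughly, for large pages with two constituents each:
 *
 *   bin:   base0  <-- p_vpnext/p_vpprev -->  base1  <--> ...
 *            |                                 |
 *       p_next/p_prev ring                p_next/p_prev ring
 *            |                                 |
 *          const0                            const1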
2788 */ 2789 first_pp = pp; 2790 while (!page_trylock_cons(pp, SE_EXCL)) { 2791 if (szc == 0) { 2792 pp = pp->p_next; 2793 } else { 2794 pp = pp->p_vpnext; 2795 } 2796 2797 ASSERT(PP_ISFREE(pp)); 2798 ASSERT(PP_ISAGED(pp)); 2799 ASSERT(pp->p_vnode == NULL); 2800 ASSERT(pp->p_hash == NULL); 2801 ASSERT(pp->p_offset == (u_offset_t)-1); 2802 ASSERT(pp->p_szc == szc); 2803 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode); 2804 2805 if (pp == first_pp) 2806 goto bin_empty_0; 2807 } 2808 2809 ASSERT(pp != NULL); 2810 ASSERT(mtype == PP_2_MTYPE(pp)); 2811 ASSERT(pp->p_szc == szc); 2812 if (szc == 0) { 2813 page_sub(&PAGE_FREELISTS(mnode, 2814 szc, bin, mtype), pp); 2815 } else { 2816 page_vpsub(&PAGE_FREELISTS(mnode, 2817 szc, bin, mtype), pp); 2818 CHK_LPG(pp, szc); 2819 } 2820 page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST); 2821 2822 if ((PP_ISFREE(pp) == 0) || (PP_ISAGED(pp) == 0)) 2823 panic("free page is not. pp %p", (void *)pp); 2824 mutex_exit(pcm); 2825 2826 #if defined(__sparc) 2827 ASSERT(!kcage_on || PP_ISNORELOC(pp) || 2828 (flags & PG_NORELOC) == 0); 2829 2830 if (PP_ISNORELOC(pp)) 2831 kcage_freemem_sub(page_get_pagecnt(szc)); 2832 #endif 2833 VM_STAT_ADD(vmm_vmstats.pgmf_allocok[szc]); 2834 return (pp); 2835 2836 bin_empty_0: 2837 mutex_exit(pcm); 2838 bin_empty_1: 2839 if (plw_initialized == 0) { 2840 page_list_walk_init(szc, flags, bin, 1, 1, 2841 &plw); 2842 plw_initialized = 1; 2843 ASSERT(plw.plw_colors <= 2844 PAGE_GET_PAGECOLORS(szc)); 2845 ASSERT(plw.plw_colors > 0); 2846 ASSERT((plw.plw_colors & 2847 (plw.plw_colors - 1)) == 0); 2848 ASSERT(bin < plw.plw_colors); 2849 ASSERT(plw.plw_ceq_mask[szc] < plw.plw_colors); 2850 } 2851 /* calculate the next bin with equivalent color */ 2852 bin = ADD_MASKED(bin, plw.plw_bin_step, 2853 plw.plw_ceq_mask[szc], plw.plw_color_mask); 2854 } while (sbin != bin); 2855 2856 /* 2857 * color bins are all empty if color match. Try and 2858 * satisfy the request by breaking up or coalescing 2859 * pages from a different size freelist of the correct 2860 * color that satisfies the ORIGINAL color requested. 2861 * If that fails then try pages of the same size but 2862 * different colors assuming we are not called with 2863 * PG_MATCH_COLOR. 2864 */ 2865 if (plw.plw_do_split && 2866 (pp = page_freelist_split(szc, bin, mnode, 2867 mtype, PFNNULL, &plw)) != NULL) 2868 return (pp); 2869 2870 if (szc > 0 && (pp = page_freelist_coalesce(mnode, szc, 2871 bin, plw.plw_ceq_mask[szc], mtype, PFNNULL)) != NULL) 2872 return (pp); 2873 2874 if (plw.plw_ceq_dif > 1) 2875 bin = page_list_walk_next_bin(szc, bin, &plw); 2876 } 2877 2878 /* if allowed, cycle through additional mtypes */ 2879 MTYPE_NEXT(mnode, mtype, flags); 2880 if (mtype >= 0) 2881 goto try_again; 2882 2883 VM_STAT_ADD(vmm_vmstats.pgmf_allocfailed[szc]); 2884 2885 return (NULL); 2886 } 2887 2888 /* 2889 * Returns the count of free pages for 'pp' with size code 'szc'. 2890 * Note: This function does not return an exact value as the page freelist 2891 * locks are not held and thus the values in the page_counters may be 2892 * changing as we walk through the data. 
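 * The estimate below takes the szc-level counter (scaled to PAGESIZE
 * pages) and, unless the region is already completely free, adds in the
 * partially filled sub-regions at each smaller region size, skipping
 * sub-regions that are full since their pages were already counted at
 * the larger size.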
2893 */ 2894 static int 2895 page_freecnt(int mnode, page_t *pp, uchar_t szc) 2896 { 2897 pgcnt_t pgfree; 2898 pgcnt_t cnt; 2899 ssize_t r = szc; /* region size */ 2900 ssize_t idx; 2901 int i; 2902 int full, range; 2903 2904 /* Make sure pagenum passed in is aligned properly */ 2905 ASSERT((pp->p_pagenum & (PNUM_SIZE(szc) - 1)) == 0); 2906 ASSERT(szc > 0); 2907 2908 /* Prevent page_counters dynamic memory from being freed */ 2909 rw_enter(&page_ctrs_rwlock[mnode], RW_READER); 2910 idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum); 2911 cnt = PAGE_COUNTERS(mnode, r, idx); 2912 pgfree = cnt << PNUM_SHIFT(r - 1); 2913 range = FULL_REGION_CNT(szc); 2914 2915 /* Check for completely full region */ 2916 if (cnt == range) { 2917 rw_exit(&page_ctrs_rwlock[mnode]); 2918 return (pgfree); 2919 } 2920 2921 while (--r > 0) { 2922 idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum); 2923 full = FULL_REGION_CNT(r); 2924 for (i = 0; i < range; i++, idx++) { 2925 cnt = PAGE_COUNTERS(mnode, r, idx); 2926 /* 2927 * If cnt here is full, that means we have already 2928 * accounted for these pages earlier. 2929 */ 2930 if (cnt != full) { 2931 pgfree += (cnt << PNUM_SHIFT(r - 1)); 2932 } 2933 } 2934 range *= full; 2935 } 2936 rw_exit(&page_ctrs_rwlock[mnode]); 2937 return (pgfree); 2938 } 2939 2940 /* 2941 * Called from page_geti_contig_pages to exclusively lock constituent pages 2942 * starting from 'spp' for page size code 'szc'. 2943 * 2944 * If 'ptcpthreshold' is set, the number of free pages needed in the 'szc' 2945 * region needs to be greater than or equal to the threshold. 2946 */ 2947 static int 2948 page_trylock_contig_pages(int mnode, page_t *spp, uchar_t szc, int flags) 2949 { 2950 pgcnt_t pgcnt = PNUM_SIZE(szc); 2951 pgcnt_t pgfree, i; 2952 page_t *pp; 2953 2954 VM_STAT_ADD(vmm_vmstats.ptcp[szc]); 2955 2956 2957 if ((ptcpthreshold == 0) || (flags & PGI_PGCPHIPRI)) 2958 goto skipptcpcheck; 2959 /* 2960 * check if there are sufficient free pages available before attempting 2961 * to trylock. Count is approximate as page counters can change. 2962 */ 2963 pgfree = page_freecnt(mnode, spp, szc); 2964 2965 /* attempt to trylock if there are sufficient already free pages */ 2966 if (pgfree < pgcnt/ptcpthreshold) { 2967 VM_STAT_ADD(vmm_vmstats.ptcpfreethresh[szc]); 2968 return (0); 2969 } 2970 2971 skipptcpcheck: 2972 2973 for (i = 0; i < pgcnt; i++) { 2974 pp = &spp[i]; 2975 if (!page_trylock(pp, SE_EXCL)) { 2976 VM_STAT_ADD(vmm_vmstats.ptcpfailexcl[szc]); 2977 while (--i != (pgcnt_t)-1) { 2978 pp = &spp[i]; 2979 ASSERT(PAGE_EXCL(pp)); 2980 page_unlock_nocapture(pp); 2981 } 2982 return (0); 2983 } 2984 ASSERT(spp[i].p_pagenum == spp->p_pagenum + i); 2985 if ((pp->p_szc > szc || (szc && pp->p_szc == szc)) && 2986 !PP_ISFREE(pp)) { 2987 VM_STAT_ADD(vmm_vmstats.ptcpfailszc[szc]); 2988 ASSERT(i == 0); 2989 page_unlock_nocapture(pp); 2990 return (0); 2991 } 2992 if (PP_ISNORELOC(pp)) { 2993 VM_STAT_ADD(vmm_vmstats.ptcpfailcage[szc]); 2994 while (i != (pgcnt_t)-1) { 2995 pp = &spp[i]; 2996 ASSERT(PAGE_EXCL(pp)); 2997 page_unlock_nocapture(pp); 2998 i--; 2999 } 3000 return (0); 3001 } 3002 } 3003 VM_STAT_ADD(vmm_vmstats.ptcpok[szc]); 3004 return (1); 3005 } 3006 3007 /* 3008 * Claim large page pointed to by 'pp'. 'pp' is the starting set 3009 * of 'szc' constituent pages that had been locked exclusively previously. 3010 * Will attempt to relocate constituent pages in use. 
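 * Constituent pages that are already free are simply pulled off the free
 * or cache lists; in-use pages are relocated by acquiring a replacement
 * page and calling do_page_relocate(), and the whole claim is abandoned
 * if either step fails.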
3011 */ 3012 static page_t * 3013 page_claim_contig_pages(page_t *pp, uchar_t szc, int flags) 3014 { 3015 spgcnt_t pgcnt, npgs, i; 3016 page_t *targpp, *rpp, *hpp; 3017 page_t *replpp = NULL; 3018 page_t *pplist = NULL; 3019 3020 ASSERT(pp != NULL); 3021 3022 pgcnt = page_get_pagecnt(szc); 3023 while (pgcnt) { 3024 ASSERT(PAGE_EXCL(pp)); 3025 ASSERT(!PP_ISNORELOC(pp)); 3026 if (PP_ISFREE(pp)) { 3027 /* 3028 * If this is a PG_FREE_LIST page then its 3029 * size code can change underneath us due to 3030 * page promotion or demotion. As an optimzation 3031 * use page_list_sub_pages() instead of 3032 * page_list_sub(). 3033 */ 3034 if (PP_ISAGED(pp)) { 3035 page_list_sub_pages(pp, szc); 3036 if (pp->p_szc == szc) { 3037 return (pp); 3038 } 3039 ASSERT(pp->p_szc < szc); 3040 npgs = page_get_pagecnt(pp->p_szc); 3041 hpp = pp; 3042 for (i = 0; i < npgs; i++, pp++) { 3043 pp->p_szc = szc; 3044 } 3045 page_list_concat(&pplist, &hpp); 3046 pgcnt -= npgs; 3047 continue; 3048 } 3049 ASSERT(!PP_ISAGED(pp)); 3050 ASSERT(pp->p_szc == 0); 3051 page_list_sub(pp, PG_CACHE_LIST); 3052 page_hashout(pp, NULL); 3053 PP_SETAGED(pp); 3054 pp->p_szc = szc; 3055 page_list_concat(&pplist, &pp); 3056 pp++; 3057 pgcnt--; 3058 continue; 3059 } 3060 npgs = page_get_pagecnt(pp->p_szc); 3061 3062 /* 3063 * page_create_wait freemem accounting done by caller of 3064 * page_get_freelist and not necessary to call it prior to 3065 * calling page_get_replacement_page. 3066 * 3067 * page_get_replacement_page can call page_get_contig_pages 3068 * to acquire a large page (szc > 0); the replacement must be 3069 * smaller than the contig page size to avoid looping or 3070 * szc == 0 and PGI_PGCPSZC0 is set. 3071 */ 3072 if (pp->p_szc < szc || (szc == 0 && (flags & PGI_PGCPSZC0))) { 3073 replpp = page_get_replacement_page(pp, NULL, 0); 3074 if (replpp) { 3075 npgs = page_get_pagecnt(pp->p_szc); 3076 ASSERT(npgs <= pgcnt); 3077 targpp = pp; 3078 } 3079 } 3080 3081 /* 3082 * If replacement is NULL or do_page_relocate fails, fail 3083 * coalescing of pages. 3084 */ 3085 if (replpp == NULL || (do_page_relocate(&targpp, &replpp, 0, 3086 &npgs, NULL) != 0)) { 3087 /* 3088 * Unlock un-processed target list 3089 */ 3090 while (pgcnt--) { 3091 ASSERT(PAGE_EXCL(pp)); 3092 page_unlock_nocapture(pp); 3093 pp++; 3094 } 3095 /* 3096 * Free the processed target list. 
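 * (Pages already re-tagged to szc are reset to p_szc 0 and put back on
 * the PAGESIZE freelist before returning failure.)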
3097 */ 3098 while (pplist) { 3099 pp = pplist; 3100 page_sub(&pplist, pp); 3101 ASSERT(PAGE_EXCL(pp)); 3102 ASSERT(pp->p_szc == szc); 3103 ASSERT(PP_ISFREE(pp)); 3104 ASSERT(PP_ISAGED(pp)); 3105 pp->p_szc = 0; 3106 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL); 3107 page_unlock_nocapture(pp); 3108 } 3109 3110 if (replpp != NULL) 3111 page_free_replacement_page(replpp); 3112 3113 return (NULL); 3114 } 3115 ASSERT(pp == targpp); 3116 3117 /* LINTED */ 3118 ASSERT(hpp = pp); /* That's right, it's an assignment */ 3119 3120 pp += npgs; 3121 pgcnt -= npgs; 3122 3123 while (npgs--) { 3124 ASSERT(PAGE_EXCL(targpp)); 3125 ASSERT(!PP_ISFREE(targpp)); 3126 ASSERT(!PP_ISNORELOC(targpp)); 3127 PP_SETFREE(targpp); 3128 ASSERT(PP_ISAGED(targpp)); 3129 ASSERT(targpp->p_szc < szc || (szc == 0 && 3130 (flags & PGI_PGCPSZC0))); 3131 targpp->p_szc = szc; 3132 targpp = targpp->p_next; 3133 3134 rpp = replpp; 3135 ASSERT(rpp != NULL); 3136 page_sub(&replpp, rpp); 3137 ASSERT(PAGE_EXCL(rpp)); 3138 ASSERT(!PP_ISFREE(rpp)); 3139 page_unlock_nocapture(rpp); 3140 } 3141 ASSERT(targpp == hpp); 3142 ASSERT(replpp == NULL); 3143 page_list_concat(&pplist, &targpp); 3144 } 3145 CHK_LPG(pplist, szc); 3146 return (pplist); 3147 } 3148 3149 /* 3150 * Trim kernel cage from pfnlo-pfnhi and store result in lo-hi. Return code 3151 * of 0 means nothing left after trim. 3152 */ 3153 int 3154 trimkcage(struct memseg *mseg, pfn_t *lo, pfn_t *hi, pfn_t pfnlo, pfn_t pfnhi) 3155 { 3156 pfn_t kcagepfn; 3157 int decr; 3158 int rc = 0; 3159 3160 if (PP_ISNORELOC(mseg->pages)) { 3161 if (PP_ISNORELOC(mseg->epages - 1) == 0) { 3162 3163 /* lower part of this mseg inside kernel cage */ 3164 decr = kcage_current_pfn(&kcagepfn); 3165 3166 /* kernel cage may have transitioned past mseg */ 3167 if (kcagepfn >= mseg->pages_base && 3168 kcagepfn < mseg->pages_end) { 3169 ASSERT(decr == 0); 3170 *lo = kcagepfn; 3171 *hi = MIN(pfnhi, 3172 (mseg->pages_end - 1)); 3173 rc = 1; 3174 } 3175 } 3176 /* else entire mseg in the cage */ 3177 } else { 3178 if (PP_ISNORELOC(mseg->epages - 1)) { 3179 3180 /* upper part of this mseg inside kernel cage */ 3181 decr = kcage_current_pfn(&kcagepfn); 3182 3183 /* kernel cage may have transitioned past mseg */ 3184 if (kcagepfn >= mseg->pages_base && 3185 kcagepfn < mseg->pages_end) { 3186 ASSERT(decr); 3187 *hi = kcagepfn; 3188 *lo = MAX(pfnlo, mseg->pages_base); 3189 rc = 1; 3190 } 3191 } else { 3192 /* entire mseg outside of kernel cage */ 3193 *lo = MAX(pfnlo, mseg->pages_base); 3194 *hi = MIN(pfnhi, (mseg->pages_end - 1)); 3195 rc = 1; 3196 } 3197 } 3198 return (rc); 3199 } 3200 3201 /* 3202 * called from page_get_contig_pages to search 'pfnlo' thru 'pfnhi' to claim a 3203 * page with size code 'szc'. Claiming such a page requires acquiring 3204 * exclusive locks on all constituent pages (page_trylock_contig_pages), 3205 * relocating pages in use and concatenating these constituent pages into a 3206 * large page. 3207 * 3208 * The page lists do not have such a large page and page_freelist_split has 3209 * already failed to demote larger pages and/or coalesce smaller free pages. 3210 * 3211 * 'flags' may specify PG_COLOR_MATCH which would limit the search of large 3212 * pages with the same color as 'bin'. 3213 * 3214 * 'pfnflag' specifies the subset of the pfn range to search. 
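 * Within each candidate memseg the search starts at a pseudo-randomly
 * chosen, properly aligned pfn of the requested color and walks forward,
 * wrapping back to the low end of the trimmed range, until it either
 * claims a page or returns to its starting point.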
3215 */ 3216 3217 3218 static page_t * 3219 page_geti_contig_pages(int mnode, uint_t bin, uchar_t szc, int flags, 3220 pfn_t pfnlo, pfn_t pfnhi, pgcnt_t pfnflag) 3221 { 3222 struct memseg *mseg; 3223 pgcnt_t szcpgcnt = page_get_pagecnt(szc); 3224 pgcnt_t szcpgmask = szcpgcnt - 1; 3225 pfn_t randpfn; 3226 page_t *pp, *randpp, *endpp; 3227 uint_t colors, ceq_mask; 3228 /* LINTED : set but not used in function */ 3229 uint_t color_mask; 3230 pfn_t hi, lo; 3231 uint_t skip; 3232 3233 ASSERT(szc != 0 || (flags & PGI_PGCPSZC0)); 3234 3235 if ((pfnhi - pfnlo) + 1 < szcpgcnt) 3236 return (NULL); 3237 3238 ASSERT(szc < mmu_page_sizes); 3239 3240 colors = PAGE_GET_PAGECOLORS(szc); 3241 color_mask = colors - 1; 3242 if ((colors > 1) && (flags & PG_MATCH_COLOR)) { 3243 uchar_t ceq = colorequivszc[szc]; 3244 uint_t ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf)); 3245 3246 ASSERT(ceq_dif > 0); 3247 ceq_mask = (ceq_dif - 1) << (ceq & 0xf); 3248 } else { 3249 ceq_mask = 0; 3250 } 3251 3252 ASSERT(bin < colors); 3253 3254 /* clear "non-significant" color bits */ 3255 bin &= ceq_mask; 3256 3257 /* 3258 * trim the pfn range to search based on pfnflag. pfnflag is set 3259 * when there have been previous page_get_contig_page failures to 3260 * limit the search. 3261 * 3262 * The high bit in pfnflag specifies the number of 'slots' in the 3263 * pfn range and the remainder of pfnflag specifies which slot. 3264 * For example, a value of 1010b would mean the second slot of 3265 * the pfn range that has been divided into 8 slots. 3266 */ 3267 if (pfnflag > 1) { 3268 int slots = 1 << (highbit(pfnflag) - 1); 3269 int slotid = pfnflag & (slots - 1); 3270 pgcnt_t szcpages; 3271 int slotlen; 3272 3273 pfnlo = P2ROUNDUP(pfnlo, szcpgcnt); 3274 pfnhi = pfnhi & ~(szcpgcnt - 1); 3275 3276 szcpages = ((pfnhi - pfnlo) + 1) / szcpgcnt; 3277 slotlen = howmany(szcpages, slots); 3278 pfnlo = pfnlo + (((slotid * slotlen) % szcpages) * szcpgcnt); 3279 ASSERT(pfnlo < pfnhi); 3280 if (pfnhi > pfnlo + (slotlen * szcpgcnt)) 3281 pfnhi = pfnlo + (slotlen * szcpgcnt); 3282 } 3283 3284 memsegs_lock(0); 3285 3286 /* 3287 * loop through memsegs to look for contig page candidates 3288 */ 3289 3290 for (mseg = memsegs; mseg != NULL; mseg = mseg->next) { 3291 if (pfnhi < mseg->pages_base || pfnlo >= mseg->pages_end) { 3292 /* no overlap */ 3293 continue; 3294 } 3295 3296 if (mseg->pages_end - mseg->pages_base < szcpgcnt) 3297 /* mseg too small */ 3298 continue; 3299 3300 /* trim off kernel cage pages from pfn range */ 3301 if (kcage_on) { 3302 if (trimkcage(mseg, &lo, &hi, pfnlo, pfnhi) == 0) 3303 continue; 3304 } else { 3305 lo = MAX(pfnlo, mseg->pages_base); 3306 hi = MIN(pfnhi, (mseg->pages_end - 1)); 3307 } 3308 3309 /* round to szcpgcnt boundaries */ 3310 lo = P2ROUNDUP(lo, szcpgcnt); 3311 hi = hi & ~(szcpgcnt - 1); 3312 3313 if (hi <= lo) 3314 continue; 3315 3316 /* 3317 * set lo to point to the pfn for the desired bin. Large 3318 * page sizes may only have a single page color 3319 */ 3320 skip = szcpgcnt; 3321 if (ceq_mask > 0) { 3322 /* set lo to point at appropriate color */ 3323 PAGE_NEXT_PFN_FOR_COLOR(lo, szc, bin, ceq_mask, 3324 color_mask); 3325 if (hi <= lo) 3326 /* mseg cannot satisfy color request */ 3327 continue; 3328 } 3329 3330 /* randomly choose a point between lo and hi to begin search */ 3331 3332 randpfn = (pfn_t)GETTICK(); 3333 randpfn = ((randpfn % (hi - lo)) + lo) & ~(skip - 1); 3334 if (ceq_mask) { 3335 PAGE_NEXT_PFN_FOR_COLOR(randpfn, szc, bin, ceq_mask, 3336 color_mask); 3337 randpfn = (randpfn >= hi) ? 
lo : randpfn; 3338 } 3339 randpp = mseg->pages + (randpfn - mseg->pages_base); 3340 3341 ASSERT(randpp->p_pagenum == randpfn); 3342 3343 pp = randpp; 3344 endpp = mseg->pages + (hi - mseg->pages_base); 3345 3346 ASSERT(randpp + szcpgcnt <= endpp); 3347 3348 do { 3349 ASSERT(!(pp->p_pagenum & szcpgmask)); 3350 ASSERT(((PP_2_BIN(pp) ^ bin) & ceq_mask) == 0); 3351 3352 if (page_trylock_contig_pages(mnode, pp, szc, flags)) { 3353 /* pages unlocked by page_claim on failure */ 3354 if (page_claim_contig_pages(pp, szc, flags)) { 3355 memsegs_unlock(0); 3356 return (pp); 3357 } 3358 } 3359 3360 if (ceq_mask == 0) { 3361 pp += skip; 3362 } else { 3363 pfn_t pfn = pp->p_pagenum; 3364 3365 PAGE_NEXT_PFN_FOR_COLOR(pfn, szc, bin, 3366 ceq_mask, color_mask); 3367 pp = mseg->pages + (pfn - mseg->pages_base); 3368 } 3369 if (pp >= endpp) { 3370 /* start from the beginning */ 3371 pp = mseg->pages + (lo - mseg->pages_base); 3372 ASSERT(pp->p_pagenum == lo); 3373 ASSERT(pp + szcpgcnt <= endpp); 3374 } 3375 } while (pp != randpp); 3376 } 3377 memsegs_unlock(0); 3378 return (NULL); 3379 } 3380 3381 3382 /* 3383 * controlling routine that searches through physical memory in an attempt to 3384 * claim a large page based on the input parameters. 3385 * on the page free lists. 3386 * 3387 * calls page_geti_contig_pages with an initial pfn range from the mnode 3388 * and mtype. page_geti_contig_pages will trim off the parts of the pfn range 3389 * that overlaps with the kernel cage or does not match the requested page 3390 * color if PG_MATCH_COLOR is set. Since this search is very expensive, 3391 * page_geti_contig_pages may further limit the search range based on 3392 * previous failure counts (pgcpfailcnt[]). 3393 * 3394 * for PGI_PGCPSZC0 requests, page_get_contig_pages will relocate a base 3395 * pagesize page that satisfies mtype. 3396 */ 3397 page_t * 3398 page_get_contig_pages(int mnode, uint_t bin, int mtype, uchar_t szc, 3399 uint_t flags) 3400 { 3401 pfn_t pfnlo, pfnhi; /* contig pages pfn range */ 3402 page_t *pp; 3403 pgcnt_t pfnflag = 0; /* no limit on search if 0 */ 3404 3405 VM_STAT_ADD(vmm_vmstats.pgcp_alloc[szc]); 3406 3407 /* no allocations from cage */ 3408 flags |= PGI_NOCAGE; 3409 3410 /* LINTED */ 3411 MTYPE_START(mnode, mtype, flags); 3412 if (mtype < 0) { /* mnode does not have memory in mtype range */ 3413 VM_STAT_ADD(vmm_vmstats.pgcp_allocempty[szc]); 3414 return (NULL); 3415 } 3416 3417 ASSERT(szc > 0 || (flags & PGI_PGCPSZC0)); 3418 3419 /* do not limit search and ignore color if hi pri */ 3420 3421 if (pgcplimitsearch && ((flags & PGI_PGCPHIPRI) == 0)) 3422 pfnflag = pgcpfailcnt[szc]; 3423 3424 /* remove color match to improve chances */ 3425 3426 if (flags & PGI_PGCPHIPRI || pfnflag) 3427 flags &= ~PG_MATCH_COLOR; 3428 3429 do { 3430 /* get pfn range based on mnode and mtype */ 3431 MNODETYPE_2_PFN(mnode, mtype, pfnlo, pfnhi); 3432 3433 ASSERT(pfnhi >= pfnlo); 3434 3435 pp = page_geti_contig_pages(mnode, bin, szc, flags, 3436 pfnlo, pfnhi, pfnflag); 3437 3438 if (pp != NULL) { 3439 pfnflag = pgcpfailcnt[szc]; 3440 if (pfnflag) { 3441 /* double the search size */ 3442 pgcpfailcnt[szc] = pfnflag >> 1; 3443 } 3444 VM_STAT_ADD(vmm_vmstats.pgcp_allocok[szc]); 3445 return (pp); 3446 } 3447 MTYPE_NEXT(mnode, mtype, flags); 3448 } while (mtype >= 0); 3449 3450 VM_STAT_ADD(vmm_vmstats.pgcp_allocfailed[szc]); 3451 return (NULL); 3452 } 3453 3454 3455 /* 3456 * Find the `best' page on the freelist for this (vp,off) (as,vaddr) pair. 3457 * 3458 * Does its own locking and accounting. 
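 * Allocation order: local mnode freelists are tried first; an ordinary
 * PAGESIZE request then defers to page_get_cachelist(), while larger (or
 * PGI_PGCPSZC0) requests go on to remote freelists and finally, when the
 * cage permits, retry everything with page_get_contig_pages().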
3459 * If PG_MATCH_COLOR is set, then NULL will be returned if there are no 3460 * pages of the proper color even if there are pages of a different color. 3461 * 3462 * Finds a page, removes it, THEN locks it. 3463 */ 3464 3465 /*ARGSUSED*/ 3466 page_t * 3467 page_get_freelist(struct vnode *vp, u_offset_t off, struct seg *seg, 3468 caddr_t vaddr, size_t size, uint_t flags, struct lgrp *lgrp) 3469 { 3470 struct as *as = seg->s_as; 3471 page_t *pp = NULL; 3472 ulong_t bin; 3473 uchar_t szc; 3474 int mnode; 3475 int mtype; 3476 page_t *(*page_get_func)(int, uint_t, int, uchar_t, uint_t); 3477 lgrp_mnode_cookie_t lgrp_cookie; 3478 3479 page_get_func = page_get_mnode_freelist; 3480 3481 /* 3482 * If we aren't passed a specific lgroup, or passed a freed lgrp 3483 * assume we wish to allocate near to the current thread's home. 3484 */ 3485 if (!LGRP_EXISTS(lgrp)) 3486 lgrp = lgrp_home_lgrp(); 3487 3488 if (kcage_on) { 3489 if ((flags & (PG_NORELOC | PG_PANIC)) == PG_NORELOC && 3490 kcage_freemem < kcage_throttlefree + btop(size) && 3491 curthread != kcage_cageout_thread) { 3492 /* 3493 * Set a "reserve" of kcage_throttlefree pages for 3494 * PG_PANIC and cageout thread allocations. 3495 * 3496 * Everybody else has to serialize in 3497 * page_create_get_something() to get a cage page, so 3498 * that we don't deadlock cageout! 3499 */ 3500 return (NULL); 3501 } 3502 } else { 3503 flags &= ~PG_NORELOC; 3504 flags |= PGI_NOCAGE; 3505 } 3506 3507 /* LINTED */ 3508 MTYPE_INIT(mtype, vp, vaddr, flags, size); 3509 3510 /* 3511 * Convert size to page size code. 3512 */ 3513 if ((szc = page_szc(size)) == (uchar_t)-1) 3514 panic("page_get_freelist: illegal page size request"); 3515 ASSERT(szc < mmu_page_sizes); 3516 3517 VM_STAT_ADD(vmm_vmstats.pgf_alloc[szc]); 3518 3519 /* LINTED */ 3520 AS_2_BIN(as, seg, vp, vaddr, bin, szc); 3521 3522 ASSERT(bin < PAGE_GET_PAGECOLORS(szc)); 3523 3524 /* 3525 * Try to get a local page first, but try remote if we can't 3526 * get a page of the right color. 3527 */ 3528 pgretry: 3529 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL); 3530 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { 3531 pp = page_get_func(mnode, bin, mtype, szc, flags); 3532 if (pp != NULL) { 3533 VM_STAT_ADD(vmm_vmstats.pgf_allocok[szc]); 3534 DTRACE_PROBE4(page__get, 3535 lgrp_t *, lgrp, 3536 int, mnode, 3537 ulong_t, bin, 3538 uint_t, flags); 3539 return (pp); 3540 } 3541 } 3542 ASSERT(pp == NULL); 3543 3544 /* 3545 * for non-SZC0 PAGESIZE requests, check cachelist before checking 3546 * remote free lists. Caller expected to call page_get_cachelist which 3547 * will check local cache lists and remote free lists. 3548 */ 3549 if (szc == 0 && ((flags & PGI_PGCPSZC0) == 0)) { 3550 VM_STAT_ADD(vmm_vmstats.pgf_allocdeferred); 3551 return (NULL); 3552 } 3553 3554 ASSERT(szc > 0 || (flags & PGI_PGCPSZC0)); 3555 3556 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1); 3557 3558 /* 3559 * Try to get a non-local freelist page. 
3560 */ 3561 LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie); 3562 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { 3563 pp = page_get_func(mnode, bin, mtype, szc, flags); 3564 if (pp != NULL) { 3565 DTRACE_PROBE4(page__get, 3566 lgrp_t *, lgrp, 3567 int, mnode, 3568 ulong_t, bin, 3569 uint_t, flags); 3570 VM_STAT_ADD(vmm_vmstats.pgf_allocokrem[szc]); 3571 return (pp); 3572 } 3573 } 3574 3575 ASSERT(pp == NULL); 3576 3577 /* 3578 * when the cage is off chances are page_get_contig_pages() will fail 3579 * to lock a large page chunk therefore when the cage is off it's not 3580 * called by default. this can be changed via /etc/system. 3581 * 3582 * page_get_contig_pages() also called to acquire a base pagesize page 3583 * for page_create_get_something(). 3584 */ 3585 if (!(flags & PG_NORELOC) && (pg_contig_disable == 0) && 3586 (kcage_on || pg_lpgcreate_nocage || szc == 0) && 3587 (page_get_func != page_get_contig_pages)) { 3588 3589 VM_STAT_ADD(vmm_vmstats.pgf_allocretry[szc]); 3590 page_get_func = page_get_contig_pages; 3591 goto pgretry; 3592 } 3593 3594 if (pgcplimitsearch && page_get_func == page_get_contig_pages) 3595 SETPGCPFAILCNT(szc); 3596 3597 VM_STAT_ADD(vmm_vmstats.pgf_allocfailed[szc]); 3598 return (NULL); 3599 } 3600 3601 /* 3602 * Find the `best' page on the cachelist for this (vp,off) (as,vaddr) pair. 3603 * 3604 * Does its own locking. 3605 * If PG_MATCH_COLOR is set, then NULL will be returned if there are no 3606 * pages of the proper color even if there are pages of a different color. 3607 * Otherwise, scan the bins for ones with pages. For each bin with pages, 3608 * try to lock one of them. If no page can be locked, try the 3609 * next bin. Return NULL if a page can not be found and locked. 3610 * 3611 * Finds a pages, trys to lock it, then removes it. 3612 */ 3613 3614 /*ARGSUSED*/ 3615 page_t * 3616 page_get_cachelist(struct vnode *vp, u_offset_t off, struct seg *seg, 3617 caddr_t vaddr, uint_t flags, struct lgrp *lgrp) 3618 { 3619 page_t *pp; 3620 struct as *as = seg->s_as; 3621 ulong_t bin; 3622 /*LINTED*/ 3623 int mnode; 3624 int mtype; 3625 lgrp_mnode_cookie_t lgrp_cookie; 3626 3627 /* 3628 * If we aren't passed a specific lgroup, or pasased a freed lgrp 3629 * assume we wish to allocate near to the current thread's home. 3630 */ 3631 if (!LGRP_EXISTS(lgrp)) 3632 lgrp = lgrp_home_lgrp(); 3633 3634 if (!kcage_on) { 3635 flags &= ~PG_NORELOC; 3636 flags |= PGI_NOCAGE; 3637 } 3638 3639 if ((flags & (PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == PG_NORELOC && 3640 kcage_freemem <= kcage_throttlefree) { 3641 /* 3642 * Reserve kcage_throttlefree pages for critical kernel 3643 * threads. 3644 * 3645 * Everybody else has to go to page_create_get_something() 3646 * to get a cage page, so we don't deadlock cageout. 
3647 */ 3648 return (NULL); 3649 } 3650 3651 /* LINTED */ 3652 AS_2_BIN(as, seg, vp, vaddr, bin, 0); 3653 3654 ASSERT(bin < PAGE_GET_PAGECOLORS(0)); 3655 3656 /* LINTED */ 3657 MTYPE_INIT(mtype, vp, vaddr, flags, MMU_PAGESIZE); 3658 3659 VM_STAT_ADD(vmm_vmstats.pgc_alloc); 3660 3661 /* 3662 * Try local cachelists first 3663 */ 3664 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL); 3665 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { 3666 pp = page_get_mnode_cachelist(bin, flags, mnode, mtype); 3667 if (pp != NULL) { 3668 VM_STAT_ADD(vmm_vmstats.pgc_allocok); 3669 DTRACE_PROBE4(page__get, 3670 lgrp_t *, lgrp, 3671 int, mnode, 3672 ulong_t, bin, 3673 uint_t, flags); 3674 return (pp); 3675 } 3676 } 3677 3678 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1); 3679 3680 /* 3681 * Try freelists/cachelists that are farther away 3682 * This is our only chance to allocate remote pages for PAGESIZE 3683 * requests. 3684 */ 3685 LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie); 3686 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { 3687 pp = page_get_mnode_freelist(mnode, bin, mtype, 3688 0, flags); 3689 if (pp != NULL) { 3690 VM_STAT_ADD(vmm_vmstats.pgc_allocokdeferred); 3691 DTRACE_PROBE4(page__get, 3692 lgrp_t *, lgrp, 3693 int, mnode, 3694 ulong_t, bin, 3695 uint_t, flags); 3696 return (pp); 3697 } 3698 pp = page_get_mnode_cachelist(bin, flags, mnode, mtype); 3699 if (pp != NULL) { 3700 VM_STAT_ADD(vmm_vmstats.pgc_allocokrem); 3701 DTRACE_PROBE4(page__get, 3702 lgrp_t *, lgrp, 3703 int, mnode, 3704 ulong_t, bin, 3705 uint_t, flags); 3706 return (pp); 3707 } 3708 } 3709 3710 VM_STAT_ADD(vmm_vmstats.pgc_allocfailed); 3711 return (NULL); 3712 } 3713 3714 page_t * 3715 page_get_mnode_cachelist(uint_t bin, uint_t flags, int mnode, int mtype) 3716 { 3717 kmutex_t *pcm; 3718 page_t *pp, *first_pp; 3719 uint_t sbin; 3720 int plw_initialized; 3721 page_list_walker_t plw; 3722 3723 VM_STAT_ADD(vmm_vmstats.pgmc_alloc); 3724 3725 /* LINTED */ 3726 MTYPE_START(mnode, mtype, flags); 3727 if (mtype < 0) { /* mnode does not have memory in mtype range */ 3728 VM_STAT_ADD(vmm_vmstats.pgmc_allocempty); 3729 return (NULL); 3730 } 3731 3732 try_again: 3733 3734 plw_initialized = 0; 3735 plw.plw_ceq_dif = 1; 3736 3737 /* 3738 * Only hold one cachelist lock at a time, that way we 3739 * can start anywhere and not have to worry about lock 3740 * ordering. 3741 */ 3742 3743 for (plw.plw_count = 0; 3744 plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) { 3745 sbin = bin; 3746 do { 3747 3748 if (!PAGE_CACHELISTS(mnode, bin, mtype)) 3749 goto bin_empty_1; 3750 pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST); 3751 mutex_enter(pcm); 3752 pp = PAGE_CACHELISTS(mnode, bin, mtype); 3753 if (pp == NULL) 3754 goto bin_empty_0; 3755 3756 first_pp = pp; 3757 ASSERT(pp->p_vnode); 3758 ASSERT(PP_ISAGED(pp) == 0); 3759 ASSERT(pp->p_szc == 0); 3760 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode); 3761 while (!page_trylock(pp, SE_EXCL)) { 3762 pp = pp->p_next; 3763 ASSERT(pp->p_szc == 0); 3764 if (pp == first_pp) { 3765 /* 3766 * We have searched the complete list! 3767 * And all of them (might only be one) 3768 * are locked. This can happen since 3769 * these pages can also be found via 3770 * the hash list. When found via the 3771 * hash list, they are locked first, 3772 * then removed. We give up to let the 3773 * other thread run. 
3774 */ 3775 pp = NULL; 3776 break; 3777 } 3778 ASSERT(pp->p_vnode); 3779 ASSERT(PP_ISFREE(pp)); 3780 ASSERT(PP_ISAGED(pp) == 0); 3781 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == 3782 mnode); 3783 } 3784 3785 if (pp) { 3786 page_t **ppp; 3787 /* 3788 * Found and locked a page. 3789 * Pull it off the list. 3790 */ 3791 ASSERT(mtype == PP_2_MTYPE(pp)); 3792 ppp = &PAGE_CACHELISTS(mnode, bin, mtype); 3793 page_sub(ppp, pp); 3794 /* 3795 * Subtract counters before releasing pcm mutex 3796 * to avoid a race with page_freelist_coalesce 3797 * and page_freelist_split. 3798 */ 3799 page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST); 3800 mutex_exit(pcm); 3801 ASSERT(pp->p_vnode); 3802 ASSERT(PP_ISAGED(pp) == 0); 3803 #if defined(__sparc) 3804 ASSERT(!kcage_on || 3805 (flags & PG_NORELOC) == 0 || 3806 PP_ISNORELOC(pp)); 3807 if (PP_ISNORELOC(pp)) { 3808 kcage_freemem_sub(1); 3809 } 3810 #endif 3811 VM_STAT_ADD(vmm_vmstats. pgmc_allocok); 3812 return (pp); 3813 } 3814 bin_empty_0: 3815 mutex_exit(pcm); 3816 bin_empty_1: 3817 if (plw_initialized == 0) { 3818 page_list_walk_init(0, flags, bin, 0, 1, &plw); 3819 plw_initialized = 1; 3820 } 3821 /* calculate the next bin with equivalent color */ 3822 bin = ADD_MASKED(bin, plw.plw_bin_step, 3823 plw.plw_ceq_mask[0], plw.plw_color_mask); 3824 } while (sbin != bin); 3825 3826 if (plw.plw_ceq_dif > 1) 3827 bin = page_list_walk_next_bin(0, bin, &plw); 3828 } 3829 3830 MTYPE_NEXT(mnode, mtype, flags); 3831 if (mtype >= 0) 3832 goto try_again; 3833 3834 VM_STAT_ADD(vmm_vmstats.pgmc_allocfailed); 3835 return (NULL); 3836 } 3837 3838 #ifdef DEBUG 3839 #define REPL_PAGE_STATS 3840 #endif /* DEBUG */ 3841 3842 #ifdef REPL_PAGE_STATS 3843 struct repl_page_stats { 3844 uint_t ngets; 3845 uint_t ngets_noreloc; 3846 uint_t npgr_noreloc; 3847 uint_t nnopage_first; 3848 uint_t nnopage; 3849 uint_t nhashout; 3850 uint_t nnofree; 3851 uint_t nnext_pp; 3852 } repl_page_stats; 3853 #define REPL_STAT_INCR(v) atomic_add_32(&repl_page_stats.v, 1) 3854 #else /* REPL_PAGE_STATS */ 3855 #define REPL_STAT_INCR(v) 3856 #endif /* REPL_PAGE_STATS */ 3857 3858 int pgrppgcp; 3859 3860 /* 3861 * The freemem accounting must be done by the caller. 3862 * First we try to get a replacement page of the same size as like_pp, 3863 * if that is not possible, then we just get a set of discontiguous 3864 * PAGESIZE pages. 3865 */ 3866 page_t * 3867 page_get_replacement_page(page_t *orig_like_pp, struct lgrp *lgrp_target, 3868 uint_t pgrflags) 3869 { 3870 page_t *like_pp; 3871 page_t *pp, *pplist; 3872 page_t *pl = NULL; 3873 ulong_t bin; 3874 int mnode, page_mnode; 3875 int szc; 3876 spgcnt_t npgs, pg_cnt; 3877 pfn_t pfnum; 3878 int mtype; 3879 int flags = 0; 3880 lgrp_mnode_cookie_t lgrp_cookie; 3881 lgrp_t *lgrp; 3882 3883 REPL_STAT_INCR(ngets); 3884 like_pp = orig_like_pp; 3885 ASSERT(PAGE_EXCL(like_pp)); 3886 3887 szc = like_pp->p_szc; 3888 npgs = page_get_pagecnt(szc); 3889 /* 3890 * Now we reset like_pp to the base page_t. 3891 * That way, we won't walk past the end of this 'szc' page. 
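 * (PFN_BASE aligns the pfn down to an szc boundary, so like_pp ends up
 * pointing at constituent 0 of the large page.)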
3892 */ 3893 pfnum = PFN_BASE(like_pp->p_pagenum, szc); 3894 like_pp = page_numtopp_nolock(pfnum); 3895 ASSERT(like_pp->p_szc == szc); 3896 3897 if (PP_ISNORELOC(like_pp)) { 3898 ASSERT(kcage_on); 3899 REPL_STAT_INCR(ngets_noreloc); 3900 flags = PGI_RELOCONLY; 3901 } else if (pgrflags & PGR_NORELOC) { 3902 ASSERT(kcage_on); 3903 REPL_STAT_INCR(npgr_noreloc); 3904 flags = PG_NORELOC; 3905 } 3906 3907 /* 3908 * Kernel pages must always be replaced with the same size 3909 * pages, since we cannot properly handle demotion of kernel 3910 * pages. 3911 */ 3912 if (PP_ISKAS(like_pp)) 3913 pgrflags |= PGR_SAMESZC; 3914 3915 /* LINTED */ 3916 MTYPE_PGR_INIT(mtype, flags, like_pp, page_mnode, npgs); 3917 3918 while (npgs) { 3919 pplist = NULL; 3920 for (;;) { 3921 pg_cnt = page_get_pagecnt(szc); 3922 bin = PP_2_BIN(like_pp); 3923 ASSERT(like_pp->p_szc == orig_like_pp->p_szc); 3924 ASSERT(pg_cnt <= npgs); 3925 3926 /* 3927 * If an lgroup was specified, try to get the 3928 * page from that lgroup. 3929 * NOTE: Must be careful with code below because 3930 * lgroup may disappear and reappear since there 3931 * is no locking for lgroup here. 3932 */ 3933 if (LGRP_EXISTS(lgrp_target)) { 3934 /* 3935 * Keep local variable for lgroup separate 3936 * from lgroup argument since this code should 3937 * only be exercised when lgroup argument 3938 * exists.... 3939 */ 3940 lgrp = lgrp_target; 3941 3942 /* Try the lgroup's freelists first */ 3943 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, 3944 LGRP_SRCH_LOCAL); 3945 while ((pplist == NULL) && 3946 (mnode = lgrp_memnode_choose(&lgrp_cookie)) 3947 != -1) { 3948 pplist = page_get_mnode_freelist( 3949 mnode, bin, mtype, szc, 3950 flags); 3951 } 3952 3953 /* 3954 * Now try it's cachelists if this is a 3955 * small page. Don't need to do it for 3956 * larger ones since page_freelist_coalesce() 3957 * already failed. 3958 */ 3959 if (pplist != NULL || szc != 0) 3960 break; 3961 3962 /* Now try it's cachelists */ 3963 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, 3964 LGRP_SRCH_LOCAL); 3965 3966 while ((pplist == NULL) && 3967 (mnode = lgrp_memnode_choose(&lgrp_cookie)) 3968 != -1) { 3969 pplist = page_get_mnode_cachelist( 3970 bin, flags, mnode, mtype); 3971 } 3972 if (pplist != NULL) { 3973 page_hashout(pplist, NULL); 3974 PP_SETAGED(pplist); 3975 REPL_STAT_INCR(nhashout); 3976 break; 3977 } 3978 /* Done looking in this lgroup. Bail out. */ 3979 break; 3980 } 3981 3982 /* 3983 * No lgroup was specified (or lgroup was removed by 3984 * DR, so just try to get the page as close to 3985 * like_pp's mnode as possible. 3986 * First try the local freelist... 3987 */ 3988 mnode = PP_2_MEM_NODE(like_pp); 3989 pplist = page_get_mnode_freelist(mnode, bin, 3990 mtype, szc, flags); 3991 if (pplist != NULL) 3992 break; 3993 3994 REPL_STAT_INCR(nnofree); 3995 3996 /* 3997 * ...then the local cachelist. Don't need to do it for 3998 * larger pages cause page_freelist_coalesce() already 3999 * failed there anyway. 4000 */ 4001 if (szc == 0) { 4002 pplist = page_get_mnode_cachelist(bin, flags, 4003 mnode, mtype); 4004 if (pplist != NULL) { 4005 page_hashout(pplist, NULL); 4006 PP_SETAGED(pplist); 4007 REPL_STAT_INCR(nhashout); 4008 break; 4009 } 4010 } 4011 4012 /* Now try remote freelists */ 4013 page_mnode = mnode; 4014 lgrp = 4015 lgrp_hand_to_lgrp(MEM_NODE_2_LGRPHAND(page_mnode)); 4016 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, 4017 LGRP_SRCH_HIER); 4018 while (pplist == NULL && 4019 (mnode = lgrp_memnode_choose(&lgrp_cookie)) 4020 != -1) { 4021 /* 4022 * Skip local mnode. 
4023 */ 4024 if ((mnode == page_mnode) || 4025 (mem_node_config[mnode].exists == 0)) 4026 continue; 4027 4028 pplist = page_get_mnode_freelist(mnode, 4029 bin, mtype, szc, flags); 4030 } 4031 4032 if (pplist != NULL) 4033 break; 4034 4035 4036 /* Now try remote cachelists */ 4037 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, 4038 LGRP_SRCH_HIER); 4039 while (pplist == NULL && szc == 0) { 4040 mnode = lgrp_memnode_choose(&lgrp_cookie); 4041 if (mnode == -1) 4042 break; 4043 /* 4044 * Skip local mnode. 4045 */ 4046 if ((mnode == page_mnode) || 4047 (mem_node_config[mnode].exists == 0)) 4048 continue; 4049 4050 pplist = page_get_mnode_cachelist(bin, 4051 flags, mnode, mtype); 4052 4053 if (pplist != NULL) { 4054 page_hashout(pplist, NULL); 4055 PP_SETAGED(pplist); 4056 REPL_STAT_INCR(nhashout); 4057 break; 4058 } 4059 } 4060 4061 /* 4062 * Break out of while loop under the following cases: 4063 * - If we successfully got a page. 4064 * - If pgrflags specified only returning a specific 4065 * page size and we could not find that page size. 4066 * - If we could not satisfy the request with PAGESIZE 4067 * or larger pages. 4068 */ 4069 if (pplist != NULL || szc == 0) 4070 break; 4071 4072 if ((pgrflags & PGR_SAMESZC) || pgrppgcp) { 4073 /* try to find contig page */ 4074 4075 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, 4076 LGRP_SRCH_HIER); 4077 4078 while ((pplist == NULL) && 4079 (mnode = 4080 lgrp_memnode_choose(&lgrp_cookie)) 4081 != -1) { 4082 pplist = page_get_contig_pages( 4083 mnode, bin, mtype, szc, 4084 flags | PGI_PGCPHIPRI); 4085 } 4086 break; 4087 } 4088 4089 /* 4090 * The correct thing to do here is try the next 4091 * page size down using szc--. Due to a bug 4092 * with the processing of HAT_RELOAD_SHARE 4093 * where the sfmmu_ttecnt arrays of all 4094 * hats sharing an ISM segment don't get updated, 4095 * using intermediate size pages for relocation 4096 * can lead to continuous page faults. 4097 */ 4098 szc = 0; 4099 } 4100 4101 if (pplist != NULL) { 4102 DTRACE_PROBE4(page__get, 4103 lgrp_t *, lgrp, 4104 int, mnode, 4105 ulong_t, bin, 4106 uint_t, flags); 4107 4108 while (pplist != NULL && pg_cnt--) { 4109 ASSERT(pplist != NULL); 4110 pp = pplist; 4111 page_sub(&pplist, pp); 4112 PP_CLRFREE(pp); 4113 PP_CLRAGED(pp); 4114 page_list_concat(&pl, &pp); 4115 npgs--; 4116 like_pp = like_pp + 1; 4117 REPL_STAT_INCR(nnext_pp); 4118 } 4119 ASSERT(pg_cnt == 0); 4120 } else { 4121 break; 4122 } 4123 } 4124 4125 if (npgs) { 4126 /* 4127 * We were unable to allocate the necessary number 4128 * of pages. 4129 * We need to free up any pl. 4130 */ 4131 REPL_STAT_INCR(nnopage); 4132 page_free_replacement_page(pl); 4133 return (NULL); 4134 } else { 4135 return (pl); 4136 } 4137 } 4138 4139 /* 4140 * demote a free large page to it's constituent pages 4141 */ 4142 void 4143 page_demote_free_pages(page_t *pp) 4144 { 4145 4146 int mnode; 4147 4148 ASSERT(pp != NULL); 4149 ASSERT(PAGE_LOCKED(pp)); 4150 ASSERT(PP_ISFREE(pp)); 4151 ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes); 4152 4153 mnode = PP_2_MEM_NODE(pp); 4154 page_freelist_lock(mnode); 4155 if (pp->p_szc != 0) { 4156 (void) page_demote(mnode, PFN_BASE(pp->p_pagenum, 4157 pp->p_szc), pp->p_szc, 0, PC_NO_COLOR, PC_FREE); 4158 } 4159 page_freelist_unlock(mnode); 4160 ASSERT(pp->p_szc == 0); 4161 } 4162 4163 /* 4164 * Factor in colorequiv to check additional 'equivalent' bins. 
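 * For example, colorequiv set to 4 gives sv_a = 2, so page sizes with at
 * least four colors get colorequivszc[] raised to 0x20; sizes with fewer
 * colors have the shift clamped to their number of color bits.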
4165 * colorequiv may be set in /etc/system 4166 */ 4167 void 4168 page_set_colorequiv_arr(void) 4169 { 4170 if (colorequiv > 1) { 4171 int i; 4172 uint_t sv_a = lowbit(colorequiv) - 1; 4173 4174 if (sv_a > 15) 4175 sv_a = 15; 4176 4177 for (i = 0; i < MMU_PAGE_SIZES; i++) { 4178 uint_t colors; 4179 uint_t a = sv_a; 4180 4181 if ((colors = hw_page_array[i].hp_colors) <= 1) { 4182 continue; 4183 } 4184 while ((colors >> a) == 0) 4185 a--; 4186 if ((a << 4) > colorequivszc[i]) { 4187 colorequivszc[i] = (a << 4); 4188 } 4189 } 4190 } 4191 } 4192