/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * Portions of this source code were derived from Berkeley 4.3 BSD
 * under license from the Regents of the University of California.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * This file contains common functions to access and manage the page lists.
 * Many of these routines originated from platform dependent modules
 * (sun4/vm/vm_dep.c, i86pc/vm/vm_machdep.c) and were modified to function in
 * a platform independent manner.
 *
 * vm/vm_dep.h provides for platform specific support.
 */

#include <sys/types.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/systm.h>
#include <sys/atomic.h>
#include <sys/sysmacros.h>
#include <vm/as.h>
#include <vm/page.h>
#include <vm/seg_kmem.h>
#include <vm/seg_vn.h>
#include <sys/memnode.h>
#include <vm/vm_dep.h>
#include <sys/lgrp.h>
#include <sys/mem_config.h>
#include <sys/callb.h>
#include <sys/mem_cage.h>
#include <sys/sdt.h>

extern uint_t	vac_colors;

#define	MAX_PRAGMA_ALIGN	128

/* vm_cpu_data0 for the boot cpu before kmem is initialized */

#if L2CACHE_ALIGN_MAX <= MAX_PRAGMA_ALIGN
#pragma align L2CACHE_ALIGN_MAX(vm_cpu_data0)
#else
#pragma align MAX_PRAGMA_ALIGN(vm_cpu_data0)
#endif
char		vm_cpu_data0[VM_CPU_DATA_PADSIZE];

/*
 * Number of page colors equivalent to the requested color in page_get
 * routines.  If set, keeps large pages intact longer and keeps MPO
 * allocation from the local mnode in favor of acquiring the 'correct'
 * page color from a demoted large page or from a remote mnode.
 */
uint_t	colorequiv;

/*
 * Color equivalency mask for each page size.
 * The mask is computed based on cpu L2$ way sizes and the colorequiv global.
 * The high 4 bits determine the number of high order bits of the color to
 * ignore.  The low 4 bits determine the number of low order bits of the
 * color to ignore (only relevant for hashed index based page coloring).
 */
uchar_t colorequivszc[MMU_PAGE_SIZES];
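
/*
 * Worked example of the encoding above (hypothetical value, not a
 * default): a colorequivszc[] entry of 0x21 has a high nibble of 2 and
 * a low nibble of 1, so the 2 high order bits and the 1 low order bit
 * of a page color are ignored when deciding whether two colors are
 * equivalent for that page size.
 */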

/*
 * If set, specifies the percentage of large pages that are free from within
 * a large page region before attempting to lock those pages for
 * page_get_contig_pages processing.
 *
 * Should be turned on when kpr is available, since page_trylock_contig_pages
 * can then be more selective.
 */
int	ptcpthreshold;

/*
 * Limit page get contig page search based on failure counts in
 * pgcpfailcnt[].  Enabled by default via pgcplimitsearch.
 *
 * pgcpfailcnt[] is bounded by PGCPFAILMAX (>= 1/2 of installed
 * memory). When reached, pgcpfailcnt[] is reset to 1/2 of this upper
 * bound. This upper bound range guarantees:
 *	- all large page 'slots' will be searched over time
 *	- at least one large page candidate is considered on each pgcp call
 *	- the count doesn't wrap around to 0
 */
pgcnt_t	pgcpfailcnt[MMU_PAGE_SIZES];
int	pgcplimitsearch = 1;

#define	PGCPFAILMAX		(1 << (highbit(physinstalled) - 1))
#define	SETPGCPFAILCNT(szc)						\
	if (++pgcpfailcnt[szc] >= PGCPFAILMAX)				\
		pgcpfailcnt[szc] = PGCPFAILMAX / 2;

#ifdef VM_STATS
struct vmm_vmstats_str  vmm_vmstats;

#endif /* VM_STATS */

#if defined(__sparc)
#define	LPGCREATE	0
#else
/* enable page_get_contig_pages */
#define	LPGCREATE	1
#endif

int pg_contig_disable;
int pg_lpgcreate_nocage = LPGCREATE;

/*
 * page_freelist_split pfn flag to signify no hi pfn requirement.
 */
#define	PFNNULL		0

/* Flags involved in promotion and demotion routines */
#define	PC_FREE		0x1	/* put page on freelist */
#define	PC_ALLOC	0x2	/* return page for allocation */

/*
 * Flag for page_demote to be used with PC_FREE to denote that we don't care
 * what the color is as the color parameter to the function is ignored.
 */
#define	PC_NO_COLOR	(-1)

/* mtype value for page_promote to use when mtype does not matter */
#define	PC_MTYPE_ANY	(-1)

/*
 * page counters candidates info
 * See page_ctrs_cands comment below for more details.
 * fields are as follows:
 *	pcc_pages_free:		# pages which freelist coalesce can create
 *	pcc_color_free:		pointer to page free counts per color
 */
typedef struct pcc_info {
	pgcnt_t	pcc_pages_free;
	pgcnt_t	*pcc_color_free;
} pcc_info_t;

/*
 * On big machines it can take a long time to check page_counters
 * arrays. page_ctrs_cands is a summary array whose elements are a dynamically
 * updated sum of all elements of the corresponding page_counters arrays.
 * page_freelist_coalesce() searches page_counters only if an appropriate
 * element of page_ctrs_cands array is greater than 0.
 *
 * page_ctrs_cands is indexed by mutex (i), region (r), mnode (m), mrange (g)
 */
pcc_info_t **page_ctrs_cands[NPC_MUTEX][MMU_PAGE_SIZES];

/*
 * Return in val the total number of free pages which can be created
 * for the given mnode (m), mrange (g), and region size (r)
 */
#define	PGCTRS_CANDS_GETVALUE(m, g, r, val) {				\
	int i;								\
	val = 0;							\
	for (i = 0; i < NPC_MUTEX; i++) {				\
	    val += page_ctrs_cands[i][(r)][(m)][(g)].pcc_pages_free;	\
	}								\
}

/*
 * Return in val the total number of free pages which can be created
 * for the given mnode (m), mrange (g), region size (r), and color (c)
 */
#define	PGCTRS_CANDS_GETVALUECOLOR(m, g, r, c, val) {			\
	int i;								\
	val = 0;							\
	ASSERT((c) < PAGE_GET_PAGECOLORS(r));				\
	for (i = 0; i < NPC_MUTEX; i++) {				\
	    val +=							\
		page_ctrs_cands[i][(r)][(m)][(g)].pcc_color_free[(c)];	\
	}								\
}
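
/*
 * A minimal usage sketch for the macros above (the local variable is
 * illustrative only): a caller can test whether a coalesce attempt for
 * region size r can possibly succeed before touching page_counters:
 *
 *	pgcnt_t cands;
 *
 *	PGCTRS_CANDS_GETVALUE(mnode, mrange, r, cands);
 *	if (cands == 0)
 *		skip the expensive page_counters search
 *
 * page_freelist_coalesce() performs this kind of early-out check.
 */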

/*
 * We can only allow a single thread to update a counter within the physical
 * range of the largest supported page size. That is the finest granularity
 * possible since the counter values are dependent on each other
 * as you move across region sizes. PP_CTR_LOCK_INDX is used to determine the
 * ctr_mutex lock index for a particular physical range.
 */
static kmutex_t	*ctr_mutex[NPC_MUTEX];

#define	PP_CTR_LOCK_INDX(pp)						\
	(((pp)->p_pagenum >>						\
	    (PAGE_BSZS_SHIFT(mmu_page_sizes - 1))) & (NPC_MUTEX - 1))

#define	INVALID_COLOR	0xffffffff
#define	INVALID_MASK	0xffffffff

/*
 * Local function prototypes.
 */

void page_ctr_add(int, int, page_t *, int);
void page_ctr_add_internal(int, int, page_t *, int);
void page_ctr_sub(int, int, page_t *, int);
void page_ctr_sub_internal(int, int, page_t *, int);
void page_freelist_lock(int);
void page_freelist_unlock(int);
page_t *page_promote(int, pfn_t, uchar_t, int, int);
page_t *page_demote(int, pfn_t, uchar_t, uchar_t, int, int);
page_t *page_freelist_split(uchar_t,
    uint_t, int, int, pfn_t, page_list_walker_t *);
page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int);
static int page_trylock_cons(page_t *pp, se_t se);

/*
 * The page_counters array below is used to keep track of free contiguous
 * physical memory.  A hw_page_map_t will be allocated per mnode per szc.
 * This contains an array of counters, the size of the array, a shift value
 * used to convert a pagenum into a counter array index or vice versa, as
 * well as a cache of the last successful index to be promoted to a larger
 * page size.  As an optimization, we keep track of the last successful index
 * to be promoted per page color for the given size region, and this is
 * allocated dynamically based upon the number of colors for a given
 * region size.
 *
 * Conceptually, the page counters are represented as:
 *
 *	page_counters[region_size][mnode]
 *
 *	region_size:	size code of a candidate larger page made up
 *			of contiguous free smaller pages.
 *
 *	page_counters[region_size][mnode].hpm_counters[index]:
 *		represents how many (region_size - 1) pages either
 *		exist or can be created within the given index range.
 *
 * Let's look at a sparc example:
 *	If we want to create a free 512k page, we look at region_size 2
 *	for the mnode we want.  We calculate the index and look at a
 *	specific hpm_counters location.  If we see 8 (FULL_REGION_CNT on
 *	sparc) at this location, it means that 8 64k pages either exist or
 *	can be created from 8K pages in order to make a single free 512k
 *	page at the given index.  Note that when a region is full, it will
 *	contribute to the counts in the region above it.  Thus we will not
 *	know what page size the free pages will be which can be promoted
 *	to this new free page unless we look at all regions below the
 *	current region.
 */
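
/*
 * Worked example of the pfn/index mapping described above (values are
 * hypothetical): with hpm_base == 0x1000 and hpm_shift == 3 for some
 * region size, pfn 0x1028 maps to counter index (0x1028 - 0x1000) >> 3,
 * which is 5, and index 5 maps back to pfn 0x1000 + (5 << 3) == 0x1028.
 * The PNUM_TO_IDX and IDX_TO_PNUM macros below implement exactly this
 * arithmetic and must remain inverses of each other for region-aligned
 * pfns; page_ctrs_alloc() ASSERTs that identity.
 */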

/*
 * Note: hpmctr_t is defined in platform vm_dep.h
 * hw_page_map_t contains all the information needed for the page_counters
 * logic. The fields are as follows:
 *
 *	hpm_counters:	dynamically allocated array to hold counter data
 *	hpm_entries:	entries in hpm_counters
 *	hpm_shift:	shift for pnum/array index conv
 *	hpm_base:	PFN mapped to counter index 0
 *	hpm_color_current:	last index in counter array for this color at
 *				which we successfully created a large page
 */
typedef struct hw_page_map {
	hpmctr_t	*hpm_counters;
	size_t		hpm_entries;
	int		hpm_shift;
	pfn_t		hpm_base;
	size_t		*hpm_color_current[MAX_MNODE_MRANGES];
} hw_page_map_t;

/*
 * Element zero is not used, but is allocated for convenience.
 */
static hw_page_map_t *page_counters[MMU_PAGE_SIZES];

/*
 * Cached value of MNODE_RANGE_CNT(mnode).
 * This is a function call on x86.
 */
static int mnode_nranges[MAX_MEM_NODES];
static int mnode_maxmrange[MAX_MEM_NODES];

/*
 * The following macros are convenient ways to get access to the individual
 * elements of the page_counters arrays.  They can be used on both
 * the left side and right side of equations.
 */
#define	PAGE_COUNTERS(mnode, rg_szc, idx)			\
	(page_counters[(rg_szc)][(mnode)].hpm_counters[(idx)])

#define	PAGE_COUNTERS_COUNTERS(mnode, rg_szc)			\
	(page_counters[(rg_szc)][(mnode)].hpm_counters)

#define	PAGE_COUNTERS_SHIFT(mnode, rg_szc)			\
	(page_counters[(rg_szc)][(mnode)].hpm_shift)

#define	PAGE_COUNTERS_ENTRIES(mnode, rg_szc)			\
	(page_counters[(rg_szc)][(mnode)].hpm_entries)

#define	PAGE_COUNTERS_BASE(mnode, rg_szc)			\
	(page_counters[(rg_szc)][(mnode)].hpm_base)

#define	PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, rg_szc, g)	\
	(page_counters[(rg_szc)][(mnode)].hpm_color_current[(g)])

#define	PAGE_COUNTERS_CURRENT_COLOR(mnode, rg_szc, color, mrange)	\
	(page_counters[(rg_szc)][(mnode)].				\
	hpm_color_current[(mrange)][(color)])

#define	PNUM_TO_IDX(mnode, rg_szc, pnum)			\
	(((pnum) - PAGE_COUNTERS_BASE((mnode), (rg_szc))) >>	\
	    PAGE_COUNTERS_SHIFT((mnode), (rg_szc)))

#define	IDX_TO_PNUM(mnode, rg_szc, index)			\
	(PAGE_COUNTERS_BASE((mnode), (rg_szc)) +		\
	    ((index) << PAGE_COUNTERS_SHIFT((mnode), (rg_szc))))

/*
 * Protects the hpm_counters and hpm_color_current memory from changing while
 * looking at page counters information.
 * Grab the write lock to modify what these fields point at.
 * Grab the read lock to prevent any pointers from changing.
 * The write lock cannot be held during memory allocation due to a possible
 * recursion deadlock with trying to grab the read lock while the
 * write lock is already held.
 */
krwlock_t page_ctrs_rwlock[MAX_MEM_NODES];
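
/*
 * The allocation below uses the common over-allocate-then-align
 * pattern, since kmem_zalloc() makes no L2-cache alignment guarantee
 * (illustrative restatement of the code that follows):
 *
 *	sz = P2ROUNDUP(sizeof (vm_cpu_data_t), align) + align;
 *	kmptr = kmem_zalloc(sz, KM_SLEEP);
 *	aligned = (void *)P2ROUNDUP((uintptr_t)kmptr, align);
 *
 * The raw pointer and size are stashed in vc_kmptr and vc_kmsize so
 * that cpu_vm_data_destroy() can free the original allocation.
 */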

/*
 * initialize cpu_vm_data to point at a cache aligned vm_cpu_data_t.
 */
void
cpu_vm_data_init(struct cpu *cp)
{
	if (cp == CPU0) {
		cp->cpu_vm_data = (void *)&vm_cpu_data0;
	} else {
		void	*kmptr;
		int	align;
		size_t	sz;

		align = (L2CACHE_ALIGN) ? L2CACHE_ALIGN : L2CACHE_ALIGN_MAX;
		sz = P2ROUNDUP(sizeof (vm_cpu_data_t), align) + align;
		kmptr = kmem_zalloc(sz, KM_SLEEP);
		cp->cpu_vm_data = (void *) P2ROUNDUP((uintptr_t)kmptr, align);
		((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr = kmptr;
		((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize = sz;
	}
}

/*
 * free cpu_vm_data
 */
void
cpu_vm_data_destroy(struct cpu *cp)
{
	if (cp->cpu_seqid && cp->cpu_vm_data) {
		ASSERT(cp != CPU0);
		kmem_free(((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr,
		    ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize);
	}
	cp->cpu_vm_data = NULL;
}

/*
 * page size to page size code
 */
int
page_szc(size_t pagesize)
{
	int	i = 0;

	while (hw_page_array[i].hp_size) {
		if (pagesize == hw_page_array[i].hp_size)
			return (i);
		i++;
	}
	return (-1);
}

/*
 * page size to page size code with the restriction that it be a supported
 * user page size. If it's not a supported user page size, -1 will be returned.
 */
int
page_szc_user_filtered(size_t pagesize)
{
	int szc = page_szc(pagesize);
	if ((szc != -1) && (SZC_2_USERSZC(szc) != -1)) {
		return (szc);
	}
	return (-1);
}

/*
 * Return how many page sizes are available for the user to use.  This is
 * what the hardware supports and not based upon how the OS implements the
 * support of different page sizes.
 */
uint_t
page_num_user_pagesizes(void)
{
	return (mmu_exported_page_sizes);
}

uint_t
page_num_pagesizes(void)
{
	return (mmu_page_sizes);
}

/*
 * returns the count of the number of base pagesize pages associated with szc
 */
pgcnt_t
page_get_pagecnt(uint_t szc)
{
	if (szc >= mmu_page_sizes)
		panic("page_get_pagecnt: out of range %d", szc);
	return (hw_page_array[szc].hp_pgcnt);
}

size_t
page_get_pagesize(uint_t szc)
{
	if (szc >= mmu_page_sizes)
		panic("page_get_pagesize: out of range %d", szc);
	return (hw_page_array[szc].hp_size);
}
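
/*
 * page_szc() and page_get_pagesize() are inverses over the supported
 * sizes; a minimal sketch of the round trip (illustrative only):
 *
 *	int szc = page_szc(MMU_PAGESIZE);	szc should be 0
 *	ASSERT(page_get_pagesize(szc) == MMU_PAGESIZE);
 *
 * page_szc() returns -1 for any size not present in hw_page_array[].
 */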

/*
 * Return the size of a page based upon the index passed in.  An index of
 * zero refers to the smallest page size in the system, and as the index
 * increases it refers to the next larger supported page size in the system.
 * Note that szc and userszc may not be the same due to unsupported szc's on
 * some systems.
 */
size_t
page_get_user_pagesize(uint_t userszc)
{
	uint_t szc = USERSZC_2_SZC(userszc);

	if (szc >= mmu_page_sizes)
		panic("page_get_user_pagesize: out of range %d", szc);
	return (hw_page_array[szc].hp_size);
}

uint_t
page_get_shift(uint_t szc)
{
	if (szc >= mmu_page_sizes)
		panic("page_get_shift: out of range %d", szc);
	return (PAGE_GET_SHIFT(szc));
}

uint_t
page_get_pagecolors(uint_t szc)
{
	if (szc >= mmu_page_sizes)
		panic("page_get_pagecolors: out of range %d", szc);
	return (PAGE_GET_PAGECOLORS(szc));
}

/*
 * this assigns the desired equivalent color after a split
 */
uint_t
page_correct_color(uchar_t szc, uchar_t nszc, uint_t color,
    uint_t ncolor, uint_t ceq_mask)
{
	ASSERT(nszc > szc);
	ASSERT(szc < mmu_page_sizes);
	ASSERT(color < PAGE_GET_PAGECOLORS(szc));
	ASSERT(ncolor < PAGE_GET_PAGECOLORS(nszc));

	color &= ceq_mask;
	ncolor = PAGE_CONVERT_COLOR(ncolor, szc, nszc);
	return (color | (ncolor & ~ceq_mask));
}

/*
 * The interleaved_mnodes flag is set when mnodes overlap in
 * the physbase..physmax range, but have disjoint slices.
 * In this case hpm_counters is shared by all mnodes.
 * This flag is set dynamically by the platform.
 */
int interleaved_mnodes = 0;

/*
 * Called by startup().
 * Size up the per page size free list counters based on physmax
 * of each node and max_mem_nodes.
 *
 * If interleaved_mnodes is set we need to find the first mnode that
 * exists. hpm_counters for the first mnode will then be shared by
 * all other mnodes. If interleaved_mnodes is not set, just set
 * first=mnode each time. That means there will be no sharing.
 */
size_t
page_ctrs_sz(void)
{
	int	r;		/* region size */
	int	mnode;
	int	firstmn;	/* first mnode that exists */
	int	nranges;
	pfn_t	physbase;
	pfn_t	physmax;
	uint_t	ctrs_sz = 0;
	int	i;
	pgcnt_t colors_per_szc[MMU_PAGE_SIZES];

	/*
	 * We need to determine how many page colors there are for each
	 * page size in order to allocate memory for any color specific
	 * arrays.
	 */
	for (i = 0; i < mmu_page_sizes; i++) {
		colors_per_szc[i] = PAGE_GET_PAGECOLORS(i);
	}

	for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) {

		pgcnt_t r_pgcnt;
		pfn_t   r_base;
		pgcnt_t r_align;

		if (mem_node_config[mnode].exists == 0)
			continue;

		HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn);
		nranges = MNODE_RANGE_CNT(mnode);
		mnode_nranges[mnode] = nranges;
		mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode);

		/*
		 * determine size needed for page counter arrays with
		 * base aligned to large page size.
		 */
		for (r = 1; r < mmu_page_sizes; r++) {
			/* add in space for hpm_color_current */
			ctrs_sz += sizeof (size_t) *
			    colors_per_szc[r] * nranges;

			if (firstmn != mnode)
				continue;

			/* add in space for hpm_counters */
			r_align = page_get_pagecnt(r);
			r_base = physbase;
			r_base &= ~(r_align - 1);
			r_pgcnt = howmany(physmax - r_base + 1, r_align);

			/*
			 * Round up to always allocate on pointer sized
			 * boundaries.
			 */
			ctrs_sz += P2ROUNDUP((r_pgcnt * sizeof (hpmctr_t)),
			    sizeof (hpmctr_t *));
		}
	}

	for (r = 1; r < mmu_page_sizes; r++) {
		ctrs_sz += (max_mem_nodes * sizeof (hw_page_map_t));
	}

	/* add in space for page_ctrs_cands and pcc_color_free */
	ctrs_sz += sizeof (pcc_info_t *) * max_mem_nodes *
	    mmu_page_sizes * NPC_MUTEX;

	for (mnode = 0; mnode < max_mem_nodes; mnode++) {

		if (mem_node_config[mnode].exists == 0)
			continue;

		nranges = mnode_nranges[mnode];
		ctrs_sz += sizeof (pcc_info_t) * nranges *
		    mmu_page_sizes * NPC_MUTEX;
		for (r = 1; r < mmu_page_sizes; r++) {
			ctrs_sz += sizeof (pgcnt_t) * nranges *
			    colors_per_szc[r] * NPC_MUTEX;
		}
	}

	/* ctr_mutex */
	ctrs_sz += (max_mem_nodes * NPC_MUTEX * sizeof (kmutex_t));

	/* size for page list counts */
	PLCNT_SZ(ctrs_sz);

	/*
	 * add some slop for roundups. page_ctrs_alloc will roundup the start
	 * address of the counters to ecache_alignsize boundary for every
	 * memory node.
	 */
	return (ctrs_sz + max_mem_nodes * L2CACHE_ALIGN);
}

caddr_t
page_ctrs_alloc(caddr_t alloc_base)
{
	int	mnode;
	int	mrange, nranges;
	int	r;		/* region size */
	int	i;
	int	firstmn;	/* first mnode that exists */
	pfn_t	physbase;
	pfn_t	physmax;
	pgcnt_t colors_per_szc[MMU_PAGE_SIZES];

	/*
	 * We need to determine how many page colors there are for each
	 * page size in order to allocate memory for any color specific
	 * arrays.
	 */
	for (i = 0; i < mmu_page_sizes; i++) {
		colors_per_szc[i] = PAGE_GET_PAGECOLORS(i);
	}

	for (r = 1; r < mmu_page_sizes; r++) {
		page_counters[r] = (hw_page_map_t *)alloc_base;
		alloc_base += (max_mem_nodes * sizeof (hw_page_map_t));
	}

	/* page_ctrs_cands and pcc_color_free array */
	for (i = 0; i < NPC_MUTEX; i++) {
		for (r = 1; r < mmu_page_sizes; r++) {

			page_ctrs_cands[i][r] = (pcc_info_t **)alloc_base;
			alloc_base += sizeof (pcc_info_t *) * max_mem_nodes;

			for (mnode = 0; mnode < max_mem_nodes; mnode++) {
				pcc_info_t *pi;

				if (mem_node_config[mnode].exists == 0)
					continue;

				nranges = mnode_nranges[mnode];

				pi = (pcc_info_t *)alloc_base;
				alloc_base += sizeof (pcc_info_t) * nranges;
				page_ctrs_cands[i][r][mnode] = pi;

				for (mrange = 0; mrange < nranges; mrange++) {
					pi->pcc_color_free =
					    (pgcnt_t *)alloc_base;
					alloc_base += sizeof (pgcnt_t) *
					    colors_per_szc[r];
					pi++;
				}
			}
		}
	}

	/* ctr_mutex */
	for (i = 0; i < NPC_MUTEX; i++) {
		ctr_mutex[i] = (kmutex_t *)alloc_base;
		alloc_base += (max_mem_nodes * sizeof (kmutex_t));
	}

	/* initialize page list counts */
	PLCNT_INIT(alloc_base);

	for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) {

		pgcnt_t r_pgcnt;
		pfn_t	r_base;
		pgcnt_t r_align;
		int	r_shift;
		int	nranges = mnode_nranges[mnode];

		if (mem_node_config[mnode].exists == 0)
			continue;

		HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn);

		for (r = 1; r < mmu_page_sizes; r++) {
			/*
			 * the page_counters base has to be aligned to the
			 * page count of page size code r otherwise the counts
			 * will cross large page boundaries.
			 */
			r_align = page_get_pagecnt(r);
			r_base = physbase;
			/* base needs to be aligned - lower to aligned value */
			r_base &= ~(r_align - 1);
			r_pgcnt = howmany(physmax - r_base + 1, r_align);
			r_shift = PAGE_BSZS_SHIFT(r);

			PAGE_COUNTERS_SHIFT(mnode, r) = r_shift;
			PAGE_COUNTERS_ENTRIES(mnode, r) = r_pgcnt;
			PAGE_COUNTERS_BASE(mnode, r) = r_base;
			for (mrange = 0; mrange < nranges; mrange++) {
				PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode,
				    r, mrange) = (size_t *)alloc_base;
				alloc_base += sizeof (size_t) *
				    colors_per_szc[r];
			}
			for (i = 0; i < colors_per_szc[r]; i++) {
				uint_t color_mask = colors_per_szc[r] - 1;
				pfn_t  pfnum = r_base;
				size_t idx;
				int mrange;
				MEM_NODE_ITERATOR_DECL(it);

				MEM_NODE_ITERATOR_INIT(pfnum, mnode, &it);
				ASSERT(pfnum != (pfn_t)-1);
				PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i,
				    color_mask, color_mask, &it);
				idx = PNUM_TO_IDX(mnode, r, pfnum);
				idx = (idx >= r_pgcnt) ? 0 : idx;
				for (mrange = 0; mrange < nranges; mrange++) {
					PAGE_COUNTERS_CURRENT_COLOR(mnode,
					    r, i, mrange) = idx;
				}
			}

			/* hpm_counters may be shared by all mnodes */
			if (firstmn == mnode) {
				PAGE_COUNTERS_COUNTERS(mnode, r) =
				    (hpmctr_t *)alloc_base;
				alloc_base +=
				    P2ROUNDUP((sizeof (hpmctr_t) * r_pgcnt),
				    sizeof (hpmctr_t *));
			} else {
				PAGE_COUNTERS_COUNTERS(mnode, r) =
				    PAGE_COUNTERS_COUNTERS(firstmn, r);
			}

			/*
			 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
			 * satisfy the identity requirement.
			 * We should be able to go from one to the other
			 * and get consistent values.
			 */
			ASSERT(PNUM_TO_IDX(mnode, r,
			    (IDX_TO_PNUM(mnode, r, 0))) == 0);
			ASSERT(IDX_TO_PNUM(mnode, r,
			    (PNUM_TO_IDX(mnode, r, r_base))) == r_base);
		}
		/*
		 * Roundup the start address of the page_counters to
		 * cache aligned boundary for every memory node.
		 * page_ctrs_sz() has added some slop for these roundups.
		 */
		alloc_base = (caddr_t)P2ROUNDUP((uintptr_t)alloc_base,
		    L2CACHE_ALIGN);
	}

	/* Initialize other page counter specific data structures. */
	for (mnode = 0; mnode < MAX_MEM_NODES; mnode++) {
		rw_init(&page_ctrs_rwlock[mnode], NULL, RW_DEFAULT, NULL);
	}

	return (alloc_base);
}

/*
 * Functions to adjust region counters for each size free list.
 * The caller is responsible for acquiring the ctr_mutex lock if necessary;
 * these can therefore be called during startup without locks.
 */
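
/*
 * Worked example of the cascading update performed by these functions
 * (hypothetical sparc numbers, FULL_REGION_CNT == 8): freeing the 8K
 * page that completes a fully free 64K region brings that region's
 * counter up to 8; the region is now "full", so the counter for the
 * enclosing 512K region is incremented as well, and so on up the
 * region sizes until a counter is still short of FULL_REGION_CNT(r).
 */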

/* ARGSUSED */
void
page_ctr_add_internal(int mnode, int mtype, page_t *pp, int flags)
{
	ssize_t		r;	/* region size */
	ssize_t		idx;
	pfn_t		pfnum;
	int		lckidx;

	ASSERT(mnode == PP_2_MEM_NODE(pp));
	ASSERT(mtype == PP_2_MTYPE(pp));

	ASSERT(pp->p_szc < mmu_page_sizes);

	PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);

	/* no counter update needed for largest page size */
	if (pp->p_szc >= mmu_page_sizes - 1) {
		return;
	}

	r = pp->p_szc + 1;
	pfnum = pp->p_pagenum;
	lckidx = PP_CTR_LOCK_INDX(pp);

	/*
	 * Increment the count of free pages for the current
	 * region. Continue looping up in region size incrementing
	 * count if the preceding region is full.
	 */
	while (r < mmu_page_sizes) {
		idx = PNUM_TO_IDX(mnode, r, pfnum);

		ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
		ASSERT(PAGE_COUNTERS(mnode, r, idx) < FULL_REGION_CNT(r));

		if (++PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r)) {
			break;
		} else {
			int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r));
			pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode]
			    [MTYPE_2_MRANGE(mnode, root_mtype)];

			cand->pcc_pages_free++;
			cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]++;
		}
		r++;
	}
}

void
page_ctr_add(int mnode, int mtype, page_t *pp, int flags)
{
	int		lckidx = PP_CTR_LOCK_INDX(pp);
	kmutex_t	*lock = &ctr_mutex[lckidx][mnode];

	mutex_enter(lock);
	page_ctr_add_internal(mnode, mtype, pp, flags);
	mutex_exit(lock);
}

void
page_ctr_sub_internal(int mnode, int mtype, page_t *pp, int flags)
{
	int		lckidx;
	ssize_t		r;	/* region size */
	ssize_t		idx;
	pfn_t		pfnum;

	ASSERT(mnode == PP_2_MEM_NODE(pp));
	ASSERT(mtype == PP_2_MTYPE(pp));

	ASSERT(pp->p_szc < mmu_page_sizes);

	PLCNT_DECR(pp, mnode, mtype, pp->p_szc, flags);

	/* no counter update needed for largest page size */
	if (pp->p_szc >= mmu_page_sizes - 1) {
		return;
	}

	r = pp->p_szc + 1;
	pfnum = pp->p_pagenum;
	lckidx = PP_CTR_LOCK_INDX(pp);

	/*
	 * Decrement the count of free pages for the current
	 * region. Continue looping up in region size decrementing
	 * count if the preceding region was full.
	 */
	while (r < mmu_page_sizes) {
		idx = PNUM_TO_IDX(mnode, r, pfnum);

		ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
		ASSERT(PAGE_COUNTERS(mnode, r, idx) > 0);

		if (--PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r) - 1) {
			break;
		} else {
			int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r));
			pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode]
			    [MTYPE_2_MRANGE(mnode, root_mtype)];

			ASSERT(cand->pcc_pages_free != 0);
			ASSERT(cand->pcc_color_free[PP_2_BIN_SZC(pp, r)] != 0);

			cand->pcc_pages_free--;
			cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]--;
		}
		r++;
	}
}

void
page_ctr_sub(int mnode, int mtype, page_t *pp, int flags)
{
	int		lckidx = PP_CTR_LOCK_INDX(pp);
	kmutex_t	*lock = &ctr_mutex[lckidx][mnode];

	mutex_enter(lock);
	page_ctr_sub_internal(mnode, mtype, pp, flags);
	mutex_exit(lock);
}
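
/*
 * These locked wrappers pair with the per-bin freelist mutexes; a
 * minimal sketch of a free-path caller (illustrative only, pcm is the
 * bin mutex for pp):
 *
 *	mutex_enter(pcm);
 *	page_add(ppp, pp);
 *	page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
 *	mutex_exit(pcm);
 *
 * The counters are updated before pcm is dropped to avoid racing with
 * page_freelist_coalesce() and page_freelist_split(); page_list_add()
 * below follows this pattern.
 */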

/*
 * Adjust page counters following a memory attach, since typically the
 * size of the array needs to change, and the PFN to counter index
 * mapping needs to change.
 *
 * It is possible this mnode did not exist at startup. In that case
 * allocate pcc_info_t and pcc_color_free arrays. Also, allow for nranges
 * to change (a theoretical possibility on x86), which means pcc_color_free
 * arrays must be extended.
 */
uint_t
page_ctrs_adjust(int mnode)
{
	pgcnt_t npgs;
	int	r;		/* region size */
	int	i;
	size_t	pcsz, old_csz;
	hpmctr_t *new_ctr, *old_ctr;
	pfn_t	oldbase, newbase;
	pfn_t	physbase, physmax;
	size_t	old_npgs;
	hpmctr_t *ctr_cache[MMU_PAGE_SIZES];
	size_t	size_cache[MMU_PAGE_SIZES];
	size_t	*color_cache[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
	size_t	*old_color_array[MAX_MNODE_MRANGES];
	pgcnt_t	colors_per_szc[MMU_PAGE_SIZES];
	pcc_info_t **cands_cache;
	pcc_info_t *old_pi, *pi;
	pgcnt_t *pgcntp;
	int nr, old_nranges, mrange, nranges = MNODE_RANGE_CNT(mnode);
	int cands_cache_nranges;
	int old_maxmrange, new_maxmrange;
	int rc = 0;

	cands_cache = kmem_zalloc(sizeof (pcc_info_t *) * NPC_MUTEX *
	    MMU_PAGE_SIZES, KM_NOSLEEP);
	if (cands_cache == NULL)
		return (ENOMEM);

	i = -1;
	HPM_COUNTERS_LIMITS(mnode, physbase, physmax, i);

	newbase = physbase & ~PC_BASE_ALIGN_MASK;
	npgs = roundup(physmax, PC_BASE_ALIGN) - newbase;

	/* prepare to free non-null pointers on the way out */
	cands_cache_nranges = nranges;
	bzero(ctr_cache, sizeof (ctr_cache));
	bzero(color_cache, sizeof (color_cache));

	/*
	 * We need to determine how many page colors there are for each
	 * page size in order to allocate memory for any color specific
	 * arrays.
	 */
	for (r = 0; r < mmu_page_sizes; r++) {
		colors_per_szc[r] = PAGE_GET_PAGECOLORS(r);
	}

	/*
	 * Preallocate all of the new hpm_counters arrays as we can't
	 * hold the page_ctrs_rwlock as a writer and allocate memory.
	 * If we can't allocate all of the arrays, undo our work so far
	 * and return failure.
	 */
	for (r = 1; r < mmu_page_sizes; r++) {
		pcsz = npgs >> PAGE_BSZS_SHIFT(r);
		size_cache[r] = pcsz;
		ctr_cache[r] = kmem_zalloc(pcsz *
		    sizeof (hpmctr_t), KM_NOSLEEP);
		if (ctr_cache[r] == NULL) {
			rc = ENOMEM;
			goto cleanup;
		}
	}

	/*
	 * Preallocate all of the new color current arrays as we can't
	 * hold the page_ctrs_rwlock as a writer and allocate memory.
	 * If we can't allocate all of the arrays, undo our work so far
	 * and return failure.
	 */
	for (r = 1; r < mmu_page_sizes; r++) {
		for (mrange = 0; mrange < nranges; mrange++) {
			color_cache[r][mrange] = kmem_zalloc(sizeof (size_t) *
			    colors_per_szc[r], KM_NOSLEEP);
			if (color_cache[r][mrange] == NULL) {
				rc = ENOMEM;
				goto cleanup;
			}
		}
	}

	/*
	 * Preallocate all of the new pcc_info_t arrays as we can't
	 * hold the page_ctrs_rwlock as a writer and allocate memory.
	 * If we can't allocate all of the arrays, undo our work so far
	 * and return failure.
	 */
	for (r = 1; r < mmu_page_sizes; r++) {
		for (i = 0; i < NPC_MUTEX; i++) {
			pi = kmem_zalloc(nranges * sizeof (pcc_info_t),
			    KM_NOSLEEP);
			if (pi == NULL) {
				rc = ENOMEM;
				goto cleanup;
			}
			cands_cache[i * MMU_PAGE_SIZES + r] = pi;

			for (mrange = 0; mrange < nranges; mrange++, pi++) {
				pgcntp = kmem_zalloc(colors_per_szc[r] *
				    sizeof (pgcnt_t), KM_NOSLEEP);
				if (pgcntp == NULL) {
					rc = ENOMEM;
					goto cleanup;
				}
				pi->pcc_color_free = pgcntp;
			}
		}
	}

	/*
	 * Grab the write lock to prevent others from walking these arrays
	 * while we are modifying them.
	 */
	PAGE_CTRS_WRITE_LOCK(mnode);

	old_nranges = mnode_nranges[mnode];
	cands_cache_nranges = old_nranges;
	mnode_nranges[mnode] = nranges;
	old_maxmrange = mnode_maxmrange[mnode];
	mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode);
	new_maxmrange = mnode_maxmrange[mnode];

	for (r = 1; r < mmu_page_sizes; r++) {
		PAGE_COUNTERS_SHIFT(mnode, r) = PAGE_BSZS_SHIFT(r);
		old_ctr = PAGE_COUNTERS_COUNTERS(mnode, r);
		old_csz = PAGE_COUNTERS_ENTRIES(mnode, r);
		oldbase = PAGE_COUNTERS_BASE(mnode, r);
		old_npgs = old_csz << PAGE_COUNTERS_SHIFT(mnode, r);
		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
			old_color_array[mrange] =
			    PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode,
			    r, mrange);
		}

		pcsz = npgs >> PAGE_COUNTERS_SHIFT(mnode, r);
		new_ctr = ctr_cache[r];
		ctr_cache[r] = NULL;
		if (old_ctr != NULL &&
		    (oldbase + old_npgs > newbase) &&
		    (newbase + npgs > oldbase)) {
			/*
			 * Map the intersection of the old and new
			 * counters into the new array.
			 */
			size_t offset;
			if (newbase > oldbase) {
				offset = (newbase - oldbase) >>
				    PAGE_COUNTERS_SHIFT(mnode, r);
				bcopy(old_ctr + offset, new_ctr,
				    MIN(pcsz, (old_csz - offset)) *
				    sizeof (hpmctr_t));
			} else {
				offset = (oldbase - newbase) >>
				    PAGE_COUNTERS_SHIFT(mnode, r);
				bcopy(old_ctr, new_ctr + offset,
				    MIN(pcsz - offset, old_csz) *
				    sizeof (hpmctr_t));
			}
		}

		PAGE_COUNTERS_COUNTERS(mnode, r) = new_ctr;
		PAGE_COUNTERS_ENTRIES(mnode, r) = pcsz;
		PAGE_COUNTERS_BASE(mnode, r) = newbase;

		/* update shared hpm_counters in other mnodes */
		if (interleaved_mnodes) {
			for (i = 0; i < max_mem_nodes; i++) {
				if (i == mnode)
					continue;
				if (mem_node_config[i].exists == 0)
					continue;
				ASSERT(PAGE_COUNTERS_COUNTERS(i, r) == old_ctr);
				PAGE_COUNTERS_COUNTERS(i, r) = new_ctr;
				PAGE_COUNTERS_ENTRIES(i, r) = pcsz;
				PAGE_COUNTERS_BASE(i, r) = newbase;
			}
		}

		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
			PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r, mrange) =
			    color_cache[r][mrange];
			color_cache[r][mrange] = NULL;
		}
		/*
		 * for now, just reset on these events as it's probably
		 * not worthwhile to try and optimize this.
		 */
		for (i = 0; i < colors_per_szc[r]; i++) {
			uint_t color_mask = colors_per_szc[r] - 1;
			int mlo = interleaved_mnodes ? 0 : mnode;
			int mhi = interleaved_mnodes ? max_mem_nodes :
			    (mnode + 1);
			int m;
			pfn_t  pfnum = newbase;
			size_t idx;
			MEM_NODE_ITERATOR_DECL(it);

			for (m = mlo; m < mhi; m++) {
				if (mem_node_config[m].exists == 0)
					continue;
				MEM_NODE_ITERATOR_INIT(pfnum, m, &it);
				ASSERT(pfnum != (pfn_t)-1);
				PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i, color_mask,
				    color_mask, &it);
				idx = PNUM_TO_IDX(m, r, pfnum);
				idx = (idx < pcsz) ? idx : 0;
				for (mrange = 0; mrange < nranges; mrange++) {
					PAGE_COUNTERS_CURRENT_COLOR(m,
					    r, i, mrange) = idx;
				}
			}
		}

		/* cache info for freeing out of the critical path */
		if ((caddr_t)old_ctr >= kernelheap &&
		    (caddr_t)old_ctr < ekernelheap) {
			ctr_cache[r] = old_ctr;
			size_cache[r] = old_csz;
		}
		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
			size_t *tmp = old_color_array[mrange];
			if ((caddr_t)tmp >= kernelheap &&
			    (caddr_t)tmp < ekernelheap) {
				color_cache[r][mrange] = tmp;
			}
		}
		/*
		 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
		 * satisfy the identity requirement.
		 * We should be able to go from one to the other
		 * and get consistent values.
		 */
		ASSERT(PNUM_TO_IDX(mnode, r,
		    (IDX_TO_PNUM(mnode, r, 0))) == 0);
		ASSERT(IDX_TO_PNUM(mnode, r,
		    (PNUM_TO_IDX(mnode, r, newbase))) == newbase);

		/* pcc_info_t and pcc_color_free */
		for (i = 0; i < NPC_MUTEX; i++) {
			pcc_info_t *epi;
			pcc_info_t *eold_pi;

			pi = cands_cache[i * MMU_PAGE_SIZES + r];
			old_pi = page_ctrs_cands[i][r][mnode];
			page_ctrs_cands[i][r][mnode] = pi;
			cands_cache[i * MMU_PAGE_SIZES + r] = old_pi;

			/* preserve old pcc_color_free values, if any */
			if (old_pi == NULL)
				continue;

			/*
			 * when/if x86 does DR, must account for
			 * possible change in range index when
			 * preserving pcc_info
			 */
			epi = &pi[nranges];
			eold_pi = &old_pi[old_nranges];
			if (new_maxmrange > old_maxmrange) {
				pi += new_maxmrange - old_maxmrange;
			} else if (new_maxmrange < old_maxmrange) {
				old_pi += old_maxmrange - new_maxmrange;
			}
			for (; pi < epi && old_pi < eold_pi; pi++, old_pi++) {
				pcc_info_t tmp = *pi;
				*pi = *old_pi;
				*old_pi = tmp;
			}
		}
	}
	PAGE_CTRS_WRITE_UNLOCK(mnode);

	/*
	 * Now that we have dropped the write lock, it is safe to free all
	 * of the memory we have cached above.
	 * We come through here to free memory when pre-alloc fails, and also
	 * to free old pointers which were recorded while locked.
	 */
cleanup:
	for (r = 1; r < mmu_page_sizes; r++) {
		if (ctr_cache[r] != NULL) {
			kmem_free(ctr_cache[r],
			    size_cache[r] * sizeof (hpmctr_t));
		}
		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
			if (color_cache[r][mrange] != NULL) {
				kmem_free(color_cache[r][mrange],
				    colors_per_szc[r] * sizeof (size_t));
			}
		}
		for (i = 0; i < NPC_MUTEX; i++) {
			pi = cands_cache[i * MMU_PAGE_SIZES + r];
			if (pi == NULL)
				continue;
			nr = cands_cache_nranges;
			for (mrange = 0; mrange < nr; mrange++, pi++) {
				pgcntp = pi->pcc_color_free;
				if (pgcntp == NULL)
					continue;
				if ((caddr_t)pgcntp >= kernelheap &&
				    (caddr_t)pgcntp < ekernelheap) {
					kmem_free(pgcntp,
					    colors_per_szc[r] *
					    sizeof (pgcnt_t));
				}
			}
			pi = cands_cache[i * MMU_PAGE_SIZES + r];
			if ((caddr_t)pi >= kernelheap &&
			    (caddr_t)pi < ekernelheap) {
				kmem_free(pi, nr * sizeof (pcc_info_t));
			}
		}
	}

	kmem_free(cands_cache,
	    sizeof (pcc_info_t *) * NPC_MUTEX * MMU_PAGE_SIZES);
	return (rc);
}


#ifdef DEBUG

/*
 * confirm pp is a large page corresponding to szc
 */
void
chk_lpg(page_t *pp, uchar_t szc)
{
	spgcnt_t npgs = page_get_pagecnt(pp->p_szc);
	uint_t noreloc;

	if (npgs == 1) {
		ASSERT(pp->p_szc == 0);
		ASSERT(pp->p_next == pp);
		ASSERT(pp->p_prev == pp);
		return;
	}

	ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL);
	ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL);

	ASSERT(IS_P2ALIGNED(pp->p_pagenum, npgs));
	ASSERT(pp->p_pagenum == (pp->p_next->p_pagenum - 1));
	ASSERT(pp->p_prev->p_pagenum == (pp->p_pagenum + (npgs - 1)));
	ASSERT(pp->p_prev == (pp + (npgs - 1)));

	/*
	 * Check list of pages.
	 */
	noreloc = PP_ISNORELOC(pp);
	while (npgs--) {
		if (npgs != 0) {
			ASSERT(pp->p_pagenum == pp->p_next->p_pagenum - 1);
			ASSERT(pp->p_next == (pp + 1));
		}
		ASSERT(pp->p_szc == szc);
		ASSERT(PP_ISFREE(pp));
		ASSERT(PP_ISAGED(pp));
		ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL);
		ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL);
		ASSERT(pp->p_vnode == NULL);
		ASSERT(PP_ISNORELOC(pp) == noreloc);

		pp = pp->p_next;
	}
}
#endif /* DEBUG */

void
page_freelist_lock(int mnode)
{
	int i;
	for (i = 0; i < NPC_MUTEX; i++) {
		mutex_enter(FPC_MUTEX(mnode, i));
		mutex_enter(CPC_MUTEX(mnode, i));
	}
}

void
page_freelist_unlock(int mnode)
{
	int i;
	for (i = 0; i < NPC_MUTEX; i++) {
		mutex_exit(FPC_MUTEX(mnode, i));
		mutex_exit(CPC_MUTEX(mnode, i));
	}
}
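
/*
 * page_freelist_lock() takes every freelist and cachelist mutex for
 * the mnode, which is what promotion and demotion need since they can
 * touch any bin. A typical caller (see page_list_sub() below) drops
 * its single bin mutex first and then takes the full set:
 *
 *	mutex_exit(pcm);
 *	page_freelist_lock(mnode);
 *	... demote or promote ...
 *	page_freelist_unlock(mnode);
 */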

/*
 * add pp to the specified page list. Defaults to head of the page list
 * unless PG_LIST_TAIL is specified.
 */
void
page_list_add(page_t *pp, int flags)
{
	page_t		**ppp;
	kmutex_t	*pcm;
	uint_t		bin, mtype;
	int		mnode;

	ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
	ASSERT(PP_ISFREE(pp));
	ASSERT(!hat_page_is_mapped(pp));
	ASSERT(hat_page_getshare(pp) == 0);

	/*
	 * Large pages should be freed via page_list_add_pages().
	 */
	ASSERT(pp->p_szc == 0);

	/*
	 * Don't need to lock the freelist first here
	 * because the page isn't on the freelist yet.
	 * This means p_szc can't change on us.
	 */

	bin = PP_2_BIN(pp);
	mnode = PP_2_MEM_NODE(pp);
	mtype = PP_2_MTYPE(pp);

	if (flags & PG_LIST_ISINIT) {
		/*
		 * PG_LIST_ISINIT is set during system startup (i.e. single
		 * threaded), add a page to the free list and the free
		 * region counters w/o any locking
		 */
		ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);

		/* inline version of page_add() */
		if (*ppp != NULL) {
			pp->p_next = *ppp;
			pp->p_prev = (*ppp)->p_prev;
			(*ppp)->p_prev = pp;
			pp->p_prev->p_next = pp;
		} else
			*ppp = pp;

		page_ctr_add_internal(mnode, mtype, pp, flags);
		VM_STAT_ADD(vmm_vmstats.pladd_free[0]);
	} else {
		pcm = PC_BIN_MUTEX(mnode, bin, flags);

		if (flags & PG_FREE_LIST) {
			VM_STAT_ADD(vmm_vmstats.pladd_free[0]);
			ASSERT(PP_ISAGED(pp));
			ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);

		} else {
			VM_STAT_ADD(vmm_vmstats.pladd_cache);
			ASSERT(pp->p_vnode);
			ASSERT((pp->p_offset & PAGEOFFSET) == 0);
			ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
		}
		mutex_enter(pcm);
		page_add(ppp, pp);

		if (flags & PG_LIST_TAIL)
			*ppp = (*ppp)->p_next;
		/*
		 * Add counters before releasing pcm mutex to avoid a race with
		 * page_freelist_coalesce and page_freelist_split.
		 */
		page_ctr_add(mnode, mtype, pp, flags);
		mutex_exit(pcm);
	}


#if defined(__sparc)
	if (PP_ISNORELOC(pp)) {
		kcage_freemem_add(1);
	}
#endif
	/*
	 * It is up to the caller to unlock the page!
	 */
	ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
}
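
/*
 * The inline insertion above maintains a circular doubly linked list
 * headed by *ppp. For example (illustrative), inserting pp into a list
 * that currently holds a single page q leaves:
 *
 *	pp->p_next == q, pp->p_prev == q,
 *	q->p_next == pp, q->p_prev == pp
 *
 * PG_LIST_TAIL is implemented by advancing the head pointer one node
 * after a head insert, since the head's p_prev is the tail of a
 * circular list.
 */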

#ifdef __sparc
/*
 * This routine is only used by kcage_init during system startup.
 * It performs the function of page_list_sub/PP_SETNORELOC/page_list_add
 * without the overhead of taking locks and updating counters.
 */
void
page_list_noreloc_startup(page_t *pp)
{
	page_t		**ppp;
	uint_t		bin;
	int		mnode;
	int		mtype;
	int		flags = 0;

	/*
	 * If this is a large page on the freelist then
	 * break it up into smaller pages.
	 */
	if (pp->p_szc != 0)
		page_boot_demote(pp);

	/*
	 * Get the list the page is currently on.
	 */
	bin = PP_2_BIN(pp);
	mnode = PP_2_MEM_NODE(pp);
	mtype = PP_2_MTYPE(pp);
	ASSERT(mtype == MTYPE_RELOC);
	ASSERT(pp->p_szc == 0);

	if (PP_ISAGED(pp)) {
		ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
		flags |= PG_FREE_LIST;
	} else {
		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
		flags |= PG_CACHE_LIST;
	}

	ASSERT(*ppp != NULL);

	/*
	 * Delete page from current list.
	 */
	if (*ppp == pp)
		*ppp = pp->p_next;		/* go to next page */
	if (*ppp == pp) {
		*ppp = NULL;			/* page list is gone */
	} else {
		pp->p_prev->p_next = pp->p_next;
		pp->p_next->p_prev = pp->p_prev;
	}

	/*
	 * Decrement page counters
	 */
	page_ctr_sub_internal(mnode, mtype, pp, flags);

	/*
	 * Set no reloc for cage initialized pages.
	 */
	PP_SETNORELOC(pp);

	mtype = PP_2_MTYPE(pp);
	ASSERT(mtype == MTYPE_NORELOC);

	/*
	 * Get new list for page.
	 */
	if (PP_ISAGED(pp)) {
		ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
	} else {
		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
	}

	/*
	 * Insert page on new list.
	 */
	if (*ppp == NULL) {
		*ppp = pp;
		pp->p_next = pp->p_prev = pp;
	} else {
		pp->p_next = *ppp;
		pp->p_prev = (*ppp)->p_prev;
		(*ppp)->p_prev = pp;
		pp->p_prev->p_next = pp;
	}

	/*
	 * Increment page counters
	 */
	page_ctr_add_internal(mnode, mtype, pp, flags);

	/*
	 * Update cage freemem counter
	 */
	atomic_add_long(&kcage_freemem, 1);
}
#else /* __sparc */

/* ARGSUSED */
void
page_list_noreloc_startup(page_t *pp)
{
	panic("page_list_noreloc_startup: should be here only for sparc");
}
#endif

void
page_list_add_pages(page_t *pp, int flags)
{
	kmutex_t *pcm;
	pgcnt_t	pgcnt;
	uint_t	bin, mtype, i;
	int	mnode;

	/* default to freelist/head */
	ASSERT((flags & (PG_CACHE_LIST | PG_LIST_TAIL)) == 0);

	CHK_LPG(pp, pp->p_szc);
	VM_STAT_ADD(vmm_vmstats.pladd_free[pp->p_szc]);

	bin = PP_2_BIN(pp);
	mnode = PP_2_MEM_NODE(pp);
	mtype = PP_2_MTYPE(pp);

	if (flags & PG_LIST_ISINIT) {
		ASSERT(pp->p_szc == mmu_page_sizes - 1);
		page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
		ASSERT(!PP_ISNORELOC(pp));
		PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);
	} else {

		ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);

		pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);

		mutex_enter(pcm);
		page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
		page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
		mutex_exit(pcm);

		pgcnt = page_get_pagecnt(pp->p_szc);
#if defined(__sparc)
		if (PP_ISNORELOC(pp))
			kcage_freemem_add(pgcnt);
#endif
		for (i = 0; i < pgcnt; i++, pp++)
			page_unlock_nocapture(pp);
	}
}

/*
 * During boot, need to demote a large page to base
 * pagesize pages for seg_kmem for use in boot_alloc()
 */
void
page_boot_demote(page_t *pp)
{
	ASSERT(pp->p_szc != 0);
	ASSERT(PP_ISFREE(pp));
	ASSERT(PP_ISAGED(pp));

	(void) page_demote(PP_2_MEM_NODE(pp),
	    PFN_BASE(pp->p_pagenum, pp->p_szc), pp->p_szc, 0, PC_NO_COLOR,
	    PC_FREE);

	ASSERT(PP_ISFREE(pp));
	ASSERT(PP_ISAGED(pp));
	ASSERT(pp->p_szc == 0);
}

/*
 * Take a particular page off of whatever freelist the page
 * is claimed to be on.
 *
 * NOTE: Only used for PAGESIZE pages.
 */
void
page_list_sub(page_t *pp, int flags)
{
	int		bin;
	uint_t		mtype;
	int		mnode;
	kmutex_t	*pcm;
	page_t		**ppp;

	ASSERT(PAGE_EXCL(pp));
	ASSERT(PP_ISFREE(pp));

	/*
	 * The p_szc field can only be changed by page_promote()
	 * and page_demote(). Only free pages can be promoted and
	 * demoted and the free list MUST be locked during these
	 * operations. So to prevent a race in page_list_sub()
	 * between computing which bin of the freelist lock to
	 * grab and actually grabbing the lock we check again that
	 * the bin we locked is still the correct one. Notice that
	 * the p_szc field could have actually changed on us but
	 * if the bin happens to still be the same we are safe.
	 */
try_again:
	bin = PP_2_BIN(pp);
	mnode = PP_2_MEM_NODE(pp);
	pcm = PC_BIN_MUTEX(mnode, bin, flags);
	mutex_enter(pcm);
	if (PP_2_BIN(pp) != bin) {
		mutex_exit(pcm);
		goto try_again;
	}
	mtype = PP_2_MTYPE(pp);

	if (flags & PG_FREE_LIST) {
		VM_STAT_ADD(vmm_vmstats.plsub_free[0]);
		ASSERT(PP_ISAGED(pp));
		ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);
	} else {
		VM_STAT_ADD(vmm_vmstats.plsub_cache);
		ASSERT(!PP_ISAGED(pp));
		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
	}

	/*
	 * Common PAGESIZE case.
	 *
	 * Note that we locked the freelist. This prevents
	 * any page promotion/demotion operations. Therefore
	 * the p_szc will not change until we drop pcm mutex.
	 */
	if (pp->p_szc == 0) {
		page_sub(ppp, pp);
		/*
		 * Subtract counters before releasing pcm mutex
		 * to avoid race with page_freelist_coalesce.
		 */
		page_ctr_sub(mnode, mtype, pp, flags);
		mutex_exit(pcm);

#if defined(__sparc)
		if (PP_ISNORELOC(pp)) {
			kcage_freemem_sub(1);
		}
#endif
		return;
	}

	/*
	 * Large pages on the cache list are not supported.
	 */
	if (flags & PG_CACHE_LIST)
		panic("page_list_sub: large page on cachelist");

	/*
	 * Slow but rare.
	 *
	 * Somebody wants this particular page which is part
	 * of a large page. In this case we just demote the page
	 * if it's on the freelist.
	 *
	 * We have to drop pcm before locking the entire freelist.
	 * Once we have re-locked the freelist check to make sure
	 * the page hasn't already been demoted or completely
	 * freed.
	 */
	mutex_exit(pcm);
	page_freelist_lock(mnode);
	if (pp->p_szc != 0) {
		/*
		 * Large page is on freelist.
		 */
		(void) page_demote(mnode, PFN_BASE(pp->p_pagenum, pp->p_szc),
		    pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
	}
	ASSERT(PP_ISFREE(pp));
	ASSERT(PP_ISAGED(pp));
	ASSERT(pp->p_szc == 0);

	/*
	 * Subtract counters before releasing the freelist locks
	 * to avoid race with page_freelist_coalesce.
	 */
	bin = PP_2_BIN(pp);
	mtype = PP_2_MTYPE(pp);
	ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);

	page_sub(ppp, pp);
	page_ctr_sub(mnode, mtype, pp, flags);
	page_freelist_unlock(mnode);

#if defined(__sparc)
	if (PP_ISNORELOC(pp)) {
		kcage_freemem_sub(1);
	}
#endif
}

void
page_list_sub_pages(page_t *pp, uint_t szc)
{
	kmutex_t *pcm;
	uint_t	bin, mtype;
	int	mnode;

	ASSERT(PAGE_EXCL(pp));
	ASSERT(PP_ISFREE(pp));
	ASSERT(PP_ISAGED(pp));

	/*
	 * See comment in page_list_sub().
	 */
try_again:
	bin = PP_2_BIN(pp);
	mnode = PP_2_MEM_NODE(pp);
	pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
	mutex_enter(pcm);
	if (PP_2_BIN(pp) != bin) {
		mutex_exit(pcm);
		goto try_again;
	}

	/*
	 * If we're called with a page larger than szc or it got
	 * promoted above szc before we locked the freelist then
	 * drop pcm and re-lock entire freelist. If page still larger
	 * than szc then demote it.
	 */
	if (pp->p_szc > szc) {
		mutex_exit(pcm);
		pcm = NULL;
		page_freelist_lock(mnode);
		if (pp->p_szc > szc) {
			VM_STAT_ADD(vmm_vmstats.plsubpages_szcbig);
			(void) page_demote(mnode,
			    PFN_BASE(pp->p_pagenum, pp->p_szc),
			    pp->p_szc, szc, PC_NO_COLOR, PC_FREE);
		}
		bin = PP_2_BIN(pp);
	}
	ASSERT(PP_ISFREE(pp));
	ASSERT(PP_ISAGED(pp));
	ASSERT(pp->p_szc <= szc);
	ASSERT(pp == PP_PAGEROOT(pp));

	VM_STAT_ADD(vmm_vmstats.plsub_free[pp->p_szc]);

	mtype = PP_2_MTYPE(pp);
	if (pp->p_szc != 0) {
		page_vpsub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
		CHK_LPG(pp, pp->p_szc);
	} else {
		VM_STAT_ADD(vmm_vmstats.plsubpages_szc0);
		page_sub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
	}
	page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);

	if (pcm != NULL) {
		mutex_exit(pcm);
	} else {
		page_freelist_unlock(mnode);
	}

#if defined(__sparc)
	if (PP_ISNORELOC(pp)) {
		pgcnt_t	pgcnt;

		pgcnt = page_get_pagecnt(pp->p_szc);
		kcage_freemem_sub(pgcnt);
	}
#endif
}

/*
 * Add the page to the front of a linked list of pages
 * using the p_next & p_prev pointers for the list.
 * The caller is responsible for protecting the list pointers.
 */
void
mach_page_add(page_t **ppp, page_t *pp)
{
	if (*ppp == NULL) {
		pp->p_next = pp->p_prev = pp;
	} else {
		pp->p_next = *ppp;
		pp->p_prev = (*ppp)->p_prev;
		(*ppp)->p_prev = pp;
		pp->p_prev->p_next = pp;
	}
	*ppp = pp;
}

/*
 * Remove this page from a linked list of pages
 * using the p_next & p_prev pointers for the list.
 *
 * The caller is responsible for protecting the list pointers.
 */
void
mach_page_sub(page_t **ppp, page_t *pp)
{
	ASSERT(PP_ISFREE(pp));

	if (*ppp == NULL || pp == NULL)
		panic("mach_page_sub");

	if (*ppp == pp)
		*ppp = pp->p_next;		/* go to next page */

	if (*ppp == pp)
		*ppp = NULL;			/* page list is gone */
	else {
		pp->p_prev->p_next = pp->p_next;
		pp->p_next->p_prev = pp->p_prev;
	}
	pp->p_prev = pp->p_next = pp;		/* make pp a list of one */
}

/*
 * Routine fsflush uses to gradually coalesce the free list into larger pages.
 */
void
page_promote_size(page_t *pp, uint_t cur_szc)
{
	pfn_t pfn;
	int mnode;
	int idx;
	int new_szc = cur_szc + 1;
	int full = FULL_REGION_CNT(new_szc);

	pfn = page_pptonum(pp);
	mnode = PFN_2_MEM_NODE(pfn);

	page_freelist_lock(mnode);

	idx = PNUM_TO_IDX(mnode, new_szc, pfn);
	if (PAGE_COUNTERS(mnode, new_szc, idx) == full)
		(void) page_promote(mnode, pfn, new_szc, PC_FREE, PC_MTYPE_ANY);

	page_freelist_unlock(mnode);
}

static uint_t page_promote_err;
static uint_t page_promote_noreloc_err;
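
/*
 * Worked example for the page_promote_size() check above (hypothetical
 * sparc numbers): promoting to new_szc == 1 (64K) requires
 * FULL_REGION_CNT(1) == 8 free 8K pages. Only when the counter for the
 * surrounding 64K region has reached 8 is the relatively expensive
 * page_promote() call made under the freelist lock.
 */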

/*
 * Create a single larger page (of szc new_szc) from smaller contiguous pages
 * for the given mnode starting at pfnum. Pages involved are on the freelist
 * before the call and may be returned to the caller if requested, otherwise
 * they will be placed back on the freelist.
 * If flags is PC_ALLOC, then the large page will be returned to the user in
 * a state which is consistent with a page being taken off the freelist.  If
 * we failed to lock the new large page, then we will return NULL to the
 * caller and put the large page on the freelist instead.
 * If flags is PC_FREE, then the large page will be placed on the freelist,
 * and NULL will be returned.
 * The caller is responsible for locking the freelist as well as any other
 * accounting which needs to be done for a returned page.
 *
 * RFE: For performance pass in pp instead of pfnum so
 *	we can avoid excessive calls to page_numtopp_nolock().
 *	This would depend on an assumption that all contiguous
 *	pages are in the same memseg so we can just add/dec
 *	our pp.
 *
 * Lock ordering:
 *
 *	There is a potential but rare deadlock situation
 *	for page promotion and demotion operations. The problem
 *	is there are two paths into the freelist manager and
 *	they have different lock orders:
 *
 *	page_create()
 *		lock freelist
 *		page_lock(EXCL)
 *		unlock freelist
 *		return
 *		caller drops page_lock
 *
 *	page_free() and page_reclaim()
 *		caller grabs page_lock(EXCL)
 *
 *		lock freelist
 *		unlock freelist
 *		drop page_lock
 *
 *	What prevents a thread in page_create() from deadlocking
 *	with a thread freeing or reclaiming the same page is the
 *	page_trylock() in page_get_freelist(). If the trylock fails
 *	it skips the page.
 *
 *	The lock ordering for promotion and demotion is the same as
 *	for page_create(). Since the same deadlock could occur during
 *	page promotion and freeing or reclaiming of a page on the
 *	cache list we might have to fail the operation and undo what
 *	we have done so far. Again this is rare.
 */
page_t *
page_promote(int mnode, pfn_t pfnum, uchar_t new_szc, int flags, int mtype)
{
	page_t		*pp, *pplist, *tpp, *start_pp;
	pgcnt_t		new_npgs, npgs;
	uint_t		bin;
	pgcnt_t		tmpnpgs, pages_left;
	uint_t		noreloc;
	int		which_list;
	ulong_t		index;
	kmutex_t	*phm;

	/*
	 * General algorithm:
	 * Find the starting page
	 * Walk each page struct removing it from the freelist,
	 * and linking it to all the other pages removed.
	 * Once all pages are off the freelist,
	 * walk the list, modifying p_szc to new_szc and whatever
	 * other info needs to be done to create a large free page.
	 * According to the flags, either return the page or put it
	 * on the freelist.
	 */

	start_pp = page_numtopp_nolock(pfnum);
	ASSERT(start_pp && (start_pp->p_pagenum == pfnum));
	new_npgs = page_get_pagecnt(new_szc);
	ASSERT(IS_P2ALIGNED(pfnum, new_npgs));

	/* don't return page of the wrong mtype */
	if (mtype != PC_MTYPE_ANY && mtype != PP_2_MTYPE(start_pp))
		return (NULL);

	/*
	 * Loop through smaller pages to confirm that all pages
	 * give the same result for PP_ISNORELOC().
	 * We can check this reliably here as the protocol for setting
	 * P_NORELOC requires pages to be taken off the free list first.
	 */
	noreloc = PP_ISNORELOC(start_pp);
	for (pp = start_pp + new_npgs; --pp > start_pp; ) {
		if (noreloc != PP_ISNORELOC(pp)) {
			page_promote_noreloc_err++;
			page_promote_err++;
			return (NULL);
		}
	}

	pages_left = new_npgs;
	pplist = NULL;
	pp = start_pp;

	/* Loop around coalescing the smaller pages into a big page. */
*/
1948 while (pages_left) {
1949 /*
1950 * Remove from the freelist.
1951 */
1952 ASSERT(PP_ISFREE(pp));
1953 bin = PP_2_BIN(pp);
1954 ASSERT(mnode == PP_2_MEM_NODE(pp));
1955 mtype = PP_2_MTYPE(pp);
1956 if (PP_ISAGED(pp)) {
1957
1958 /*
1959 * PG_FREE_LIST
1960 */
1961 if (pp->p_szc) {
1962 page_vpsub(&PAGE_FREELISTS(mnode,
1963 pp->p_szc, bin, mtype), pp);
1964 } else {
1965 mach_page_sub(&PAGE_FREELISTS(mnode, 0,
1966 bin, mtype), pp);
1967 }
1968 which_list = PG_FREE_LIST;
1969 } else {
1970 ASSERT(pp->p_szc == 0);
1971
1972 /*
1973 * PG_CACHE_LIST
1974 *
1975 * Since this page comes from the
1976 * cachelist, we must destroy the
1977 * vnode association.
1978 */
1979 if (!page_trylock(pp, SE_EXCL)) {
1980 goto fail_promote;
1981 }
1982
1983 /*
1984 * We need to be careful not to deadlock
1985 * with another thread in page_lookup().
1986 * The page_lookup() thread could be holding
1987 * the same phm that we need if the two
1988 * pages happen to hash to the same phm lock.
1989 * At this point we have locked the entire
1990 * freelist and page_lookup() could be trying
1991 * to grab a freelist lock.
1992 */
1993 index = PAGE_HASH_FUNC(pp->p_vnode, pp->p_offset);
1994 phm = PAGE_HASH_MUTEX(index);
1995 if (!mutex_tryenter(phm)) {
1996 page_unlock_nocapture(pp);
1997 goto fail_promote;
1998 }
1999
2000 mach_page_sub(&PAGE_CACHELISTS(mnode, bin, mtype), pp);
2001 page_hashout(pp, phm);
2002 mutex_exit(phm);
2003 PP_SETAGED(pp);
2004 page_unlock_nocapture(pp);
2005 which_list = PG_CACHE_LIST;
2006 }
2007 page_ctr_sub(mnode, mtype, pp, which_list);
2008
2009 /*
2010 * Concatenate the smaller page(s) onto
2011 * the large page list.
2012 */
2013 tmpnpgs = npgs = page_get_pagecnt(pp->p_szc);
2014 pages_left -= npgs;
2015 tpp = pp;
2016 while (npgs--) {
2017 tpp->p_szc = new_szc;
2018 tpp = tpp->p_next;
2019 }
2020 page_list_concat(&pplist, &pp);
2021 pp += tmpnpgs;
2022 }
2023 CHK_LPG(pplist, new_szc);
2024
2025 /*
2026 * return the page to the user if requested
2027 * in the properly locked state.
2028 */
2029 if (flags == PC_ALLOC && (page_trylock_cons(pplist, SE_EXCL))) {
2030 return (pplist);
2031 }
2032
2033 /*
2034 * Otherwise place the new large page on the freelist
2035 */
2036 bin = PP_2_BIN(pplist);
2037 mnode = PP_2_MEM_NODE(pplist);
2038 mtype = PP_2_MTYPE(pplist);
2039 page_vpadd(&PAGE_FREELISTS(mnode, new_szc, bin, mtype), pplist);
2040
2041 page_ctr_add(mnode, mtype, pplist, PG_FREE_LIST);
2042 return (NULL);
2043
2044 fail_promote:
2045 /*
2046 * A thread must have still been freeing or
2047 * reclaiming the page on the cachelist.
2048 * To prevent a deadlock undo what we have
2049 * done so far and return failure. This
2050 * situation can only happen while promoting
2051 * PAGESIZE pages.
2052 */
2053 page_promote_err++;
2054 while (pplist) {
2055 pp = pplist;
2056 mach_page_sub(&pplist, pp);
2057 pp->p_szc = 0;
2058 bin = PP_2_BIN(pp);
2059 mtype = PP_2_MTYPE(pp);
2060 mach_page_add(&PAGE_FREELISTS(mnode, 0, bin, mtype), pp);
2061 page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
2062 }
2063 return (NULL);
2064
2065 }
2066
2067 /*
2068 * Break up a large page into smaller size pages.
2069 * Pages involved are on the freelist before the call and may
2070 * be returned to the caller if requested, otherwise they will
2071 * be placed back on the freelist.
2072 * The caller is responsible for locking the freelist as well as any other
2073 * accounting which needs to be done for a returned page.
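 * A typical caller that only needs the large page broken up (for
 * example, page_list_sub() above) does not care about color and frees
 * the pieces, roughly:
 *
 *	(void) page_demote(mnode, PFN_BASE(pp->p_pagenum, pp->p_szc),
 *	    pp->p_szc, szc, PC_NO_COLOR, PC_FREE);
 *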
2074 * If flags is not PC_ALLOC, the color argument is ignored, and thus 2075 * technically, any value may be passed in but PC_NO_COLOR is the standard 2076 * which should be followed for clarity's sake. 2077 */ 2078 page_t * 2079 page_demote(int mnode, pfn_t pfnum, uchar_t cur_szc, uchar_t new_szc, 2080 int color, int flags) 2081 { 2082 page_t *pp, *pplist, *npplist; 2083 pgcnt_t npgs, n; 2084 uint_t bin; 2085 uint_t mtype; 2086 page_t *ret_pp = NULL; 2087 2088 ASSERT(cur_szc != 0); 2089 ASSERT(new_szc < cur_szc); 2090 2091 pplist = page_numtopp_nolock(pfnum); 2092 ASSERT(pplist != NULL); 2093 2094 ASSERT(pplist->p_szc == cur_szc); 2095 2096 bin = PP_2_BIN(pplist); 2097 ASSERT(mnode == PP_2_MEM_NODE(pplist)); 2098 mtype = PP_2_MTYPE(pplist); 2099 page_vpsub(&PAGE_FREELISTS(mnode, cur_szc, bin, mtype), pplist); 2100 2101 CHK_LPG(pplist, cur_szc); 2102 page_ctr_sub(mnode, mtype, pplist, PG_FREE_LIST); 2103 2104 /* 2105 * Number of PAGESIZE pages for smaller new_szc 2106 * page. 2107 */ 2108 npgs = page_get_pagecnt(new_szc); 2109 2110 while (pplist) { 2111 pp = pplist; 2112 2113 ASSERT(pp->p_szc == cur_szc); 2114 2115 /* 2116 * We either break it up into PAGESIZE pages or larger. 2117 */ 2118 if (npgs == 1) { /* PAGESIZE case */ 2119 mach_page_sub(&pplist, pp); 2120 ASSERT(pp->p_szc == cur_szc); 2121 ASSERT(new_szc == 0); 2122 ASSERT(mnode == PP_2_MEM_NODE(pp)); 2123 pp->p_szc = new_szc; 2124 bin = PP_2_BIN(pp); 2125 if ((bin == color) && (flags == PC_ALLOC) && 2126 (ret_pp == NULL) && 2127 page_trylock_cons(pp, SE_EXCL)) { 2128 ret_pp = pp; 2129 } else { 2130 mtype = PP_2_MTYPE(pp); 2131 mach_page_add(&PAGE_FREELISTS(mnode, 0, bin, 2132 mtype), pp); 2133 page_ctr_add(mnode, mtype, pp, PG_FREE_LIST); 2134 } 2135 } else { 2136 2137 /* 2138 * Break down into smaller lists of pages. 2139 */ 2140 page_list_break(&pplist, &npplist, npgs); 2141 2142 pp = pplist; 2143 n = npgs; 2144 while (n--) { 2145 ASSERT(pp->p_szc == cur_szc); 2146 pp->p_szc = new_szc; 2147 pp = pp->p_next; 2148 } 2149 2150 CHK_LPG(pplist, new_szc); 2151 2152 bin = PP_2_BIN(pplist); 2153 ASSERT(mnode == PP_2_MEM_NODE(pp)); 2154 if ((bin == color) && (flags == PC_ALLOC) && 2155 (ret_pp == NULL) && 2156 page_trylock_cons(pp, SE_EXCL)) { 2157 ret_pp = pp; 2158 } else { 2159 mtype = PP_2_MTYPE(pp); 2160 page_vpadd(&PAGE_FREELISTS(mnode, new_szc, 2161 bin, mtype), pplist); 2162 2163 page_ctr_add(mnode, mtype, pplist, 2164 PG_FREE_LIST); 2165 } 2166 pplist = npplist; 2167 } 2168 } 2169 return (ret_pp); 2170 } 2171 2172 int mpss_coalesce_disable = 0; 2173 2174 /* 2175 * Coalesce free pages into a page of the given szc and color if possible. 2176 * Return the pointer to the page created, otherwise, return NULL. 2177 * 2178 * If pfnhi is non-zero, search for large page with pfn range less than pfnhi. 
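 *
 * Roughly how it works (informal sketch): page_ctrs_cands is consulted
 * first to see whether any fully populated szc region exists for this
 * mnode/mtype at an equivalent color; if so, the page_counters array is
 * walked circularly from a stashed per-color hint, and any index whose
 * count equals FULL_REGION_CNT(szc) is handed to page_promote() with
 * PC_ALLOC.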
2179 */ 2180 page_t * 2181 page_freelist_coalesce(int mnode, uchar_t szc, uint_t color, uint_t ceq_mask, 2182 int mtype, pfn_t pfnhi) 2183 { 2184 int r = szc; /* region size */ 2185 int mrange; 2186 uint_t full, bin, color_mask, wrap = 0; 2187 pfn_t pfnum, lo, hi; 2188 size_t len, idx, idx0; 2189 pgcnt_t cands = 0, szcpgcnt = page_get_pagecnt(szc); 2190 page_t *ret_pp; 2191 MEM_NODE_ITERATOR_DECL(it); 2192 #if defined(__sparc) 2193 pfn_t pfnum0, nlo, nhi; 2194 #endif 2195 2196 if (mpss_coalesce_disable) { 2197 ASSERT(szc < MMU_PAGE_SIZES); 2198 VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[szc][0]); 2199 return (NULL); 2200 } 2201 2202 ASSERT(szc < mmu_page_sizes); 2203 color_mask = PAGE_GET_PAGECOLORS(szc) - 1; 2204 ASSERT(ceq_mask <= color_mask); 2205 ASSERT(color <= color_mask); 2206 color &= ceq_mask; 2207 2208 /* Prevent page_counters dynamic memory from being freed */ 2209 rw_enter(&page_ctrs_rwlock[mnode], RW_READER); 2210 2211 mrange = MTYPE_2_MRANGE(mnode, mtype); 2212 ASSERT(mrange < mnode_nranges[mnode]); 2213 VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[r][mrange]); 2214 2215 /* get pfn range for mtype */ 2216 len = PAGE_COUNTERS_ENTRIES(mnode, r); 2217 #if defined(__sparc) 2218 lo = PAGE_COUNTERS_BASE(mnode, r); 2219 hi = IDX_TO_PNUM(mnode, r, len); 2220 #else 2221 MNODETYPE_2_PFN(mnode, mtype, lo, hi); 2222 hi++; 2223 #endif 2224 2225 /* use lower limit if given */ 2226 if (pfnhi != PFNNULL && pfnhi < hi) 2227 hi = pfnhi; 2228 2229 /* round to szcpgcnt boundaries */ 2230 lo = P2ROUNDUP(lo, szcpgcnt); 2231 MEM_NODE_ITERATOR_INIT(lo, mnode, &it); 2232 ASSERT(lo != (pfn_t)-1); 2233 hi = hi & ~(szcpgcnt - 1); 2234 2235 /* set lo to the closest pfn of the right color */ 2236 if (((PFN_2_COLOR(lo, szc, &it) ^ color) & ceq_mask) || 2237 (interleaved_mnodes && PFN_2_MEM_NODE(lo) != mnode)) { 2238 PAGE_NEXT_PFN_FOR_COLOR(lo, szc, color, ceq_mask, color_mask, 2239 &it); 2240 } 2241 2242 if (hi <= lo) { 2243 rw_exit(&page_ctrs_rwlock[mnode]); 2244 return (NULL); 2245 } 2246 2247 full = FULL_REGION_CNT(r); 2248 2249 /* calculate the number of page candidates and initial search index */ 2250 bin = color; 2251 idx0 = (size_t)(-1); 2252 do { 2253 pgcnt_t acand; 2254 2255 PGCTRS_CANDS_GETVALUECOLOR(mnode, mrange, r, bin, acand); 2256 if (acand) { 2257 idx = PAGE_COUNTERS_CURRENT_COLOR(mnode, 2258 r, bin, mrange); 2259 idx0 = MIN(idx0, idx); 2260 cands += acand; 2261 } 2262 bin = ADD_MASKED(bin, 1, ceq_mask, color_mask); 2263 } while (bin != color); 2264 2265 if (cands == 0) { 2266 VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip[r][mrange]); 2267 rw_exit(&page_ctrs_rwlock[mnode]); 2268 return (NULL); 2269 } 2270 2271 pfnum = IDX_TO_PNUM(mnode, r, idx0); 2272 if (pfnum < lo || pfnum >= hi) { 2273 pfnum = lo; 2274 } else { 2275 MEM_NODE_ITERATOR_INIT(pfnum, mnode, &it); 2276 if (pfnum == (pfn_t)-1) { 2277 pfnum = lo; 2278 MEM_NODE_ITERATOR_INIT(pfnum, mnode, &it); 2279 ASSERT(pfnum != (pfn_t)-1); 2280 } else if ((PFN_2_COLOR(pfnum, szc, &it) ^ color) & ceq_mask || 2281 (interleaved_mnodes && PFN_2_MEM_NODE(pfnum) != mnode)) { 2282 /* invalid color, get the closest correct pfn */ 2283 PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask, 2284 color_mask, &it); 2285 if (pfnum >= hi) { 2286 pfnum = lo; 2287 MEM_NODE_ITERATOR_INIT(pfnum, mnode, &it); 2288 } 2289 } 2290 } 2291 2292 /* set starting index */ 2293 idx0 = PNUM_TO_IDX(mnode, r, pfnum); 2294 ASSERT(idx0 < len); 2295 2296 #if defined(__sparc) 2297 pfnum0 = pfnum; /* page corresponding to idx0 */ 2298 nhi = 0; /* search kcage ranges */ 2299 #endif 
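	/*
	 * Walk the counters circularly starting at idx0; "wrap" counts
	 * trips past the end of the array, so the loop below gives up
	 * once it has come all the way back around to the starting
	 * index.
	 */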
2300 2301 for (idx = idx0; wrap == 0 || (idx < idx0 && wrap < 2); ) { 2302 2303 #if defined(__sparc) 2304 /* 2305 * Find lowest intersection of kcage ranges and mnode. 2306 * MTYPE_NORELOC means look in the cage, otherwise outside. 2307 */ 2308 if (nhi <= pfnum) { 2309 if (kcage_next_range(mtype == MTYPE_NORELOC, pfnum, 2310 (wrap == 0 ? hi : pfnum0), &nlo, &nhi)) 2311 goto wrapit; 2312 2313 /* jump to the next page in the range */ 2314 if (pfnum < nlo) { 2315 pfnum = P2ROUNDUP(nlo, szcpgcnt); 2316 MEM_NODE_ITERATOR_INIT(pfnum, mnode, &it); 2317 idx = PNUM_TO_IDX(mnode, r, pfnum); 2318 if (idx >= len || pfnum >= hi) 2319 goto wrapit; 2320 if ((PFN_2_COLOR(pfnum, szc, &it) ^ color) & 2321 ceq_mask) 2322 goto next; 2323 if (interleaved_mnodes && 2324 PFN_2_MEM_NODE(pfnum) != mnode) 2325 goto next; 2326 } 2327 } 2328 #endif 2329 2330 if (PAGE_COUNTERS(mnode, r, idx) != full) 2331 goto next; 2332 2333 /* 2334 * RFE: For performance maybe we can do something less 2335 * brutal than locking the entire freelist. So far 2336 * this doesn't seem to be a performance problem? 2337 */ 2338 page_freelist_lock(mnode); 2339 if (PAGE_COUNTERS(mnode, r, idx) == full) { 2340 ret_pp = 2341 page_promote(mnode, pfnum, r, PC_ALLOC, mtype); 2342 if (ret_pp != NULL) { 2343 VM_STAT_ADD(vmm_vmstats.pfc_coalok[r][mrange]); 2344 PAGE_COUNTERS_CURRENT_COLOR(mnode, r, 2345 PFN_2_COLOR(pfnum, szc, &it), mrange) = idx; 2346 page_freelist_unlock(mnode); 2347 rw_exit(&page_ctrs_rwlock[mnode]); 2348 #if defined(__sparc) 2349 if (PP_ISNORELOC(ret_pp)) { 2350 pgcnt_t npgs; 2351 2352 npgs = page_get_pagecnt(ret_pp->p_szc); 2353 kcage_freemem_sub(npgs); 2354 } 2355 #endif 2356 return (ret_pp); 2357 } 2358 } else { 2359 VM_STAT_ADD(vmm_vmstats.page_ctrs_changed[r][mrange]); 2360 } 2361 2362 page_freelist_unlock(mnode); 2363 /* 2364 * No point looking for another page if we've 2365 * already tried all of the ones that 2366 * page_ctr_cands indicated. Stash off where we left 2367 * off. 2368 * Note: this is not exact since we don't hold the 2369 * page_freelist_locks before we initially get the 2370 * value of cands for performance reasons, but should 2371 * be a decent approximation. 2372 */ 2373 if (--cands == 0) { 2374 PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color, mrange) = 2375 idx; 2376 break; 2377 } 2378 next: 2379 PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask, 2380 color_mask, &it); 2381 idx = PNUM_TO_IDX(mnode, r, pfnum); 2382 if (idx >= len || pfnum >= hi) { 2383 wrapit: 2384 pfnum = lo; 2385 MEM_NODE_ITERATOR_INIT(pfnum, mnode, &it); 2386 idx = PNUM_TO_IDX(mnode, r, pfnum); 2387 wrap++; 2388 #if defined(__sparc) 2389 nhi = 0; /* search kcage ranges */ 2390 #endif 2391 } 2392 } 2393 2394 rw_exit(&page_ctrs_rwlock[mnode]); 2395 VM_STAT_ADD(vmm_vmstats.page_ctrs_failed[r][mrange]); 2396 return (NULL); 2397 } 2398 2399 /* 2400 * For the given mnode, promote as many small pages to large pages as possible. 2401 * mnode can be -1, which means do them all 2402 */ 2403 void 2404 page_freelist_coalesce_all(int mnode) 2405 { 2406 int r; /* region size */ 2407 int idx, full; 2408 size_t len; 2409 int doall = interleaved_mnodes || mnode < 0; 2410 int mlo = doall ? 0 : mnode; 2411 int mhi = doall ? max_mem_nodes : (mnode + 1); 2412 2413 VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce_all); 2414 2415 if (mpss_coalesce_disable) { 2416 return; 2417 } 2418 2419 /* 2420 * Lock the entire freelist and coalesce what we can. 2421 * 2422 * Always promote to the largest page possible 2423 * first to reduce the number of page promotions. 
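 * (Informally, working from the largest region size down means a fully
 * free large region is promoted in one step instead of being rebuilt
 * from freshly promoted smaller regions.)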
2424 */
2425 for (mnode = mlo; mnode < mhi; mnode++) {
2426 rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
2427 page_freelist_lock(mnode);
2428 }
2429 for (r = mmu_page_sizes - 1; r > 0; r--) {
2430 for (mnode = mlo; mnode < mhi; mnode++) {
2431 pgcnt_t cands = 0;
2432 int mrange, nranges = mnode_nranges[mnode];
2433
2434 for (mrange = 0; mrange < nranges; mrange++) {
2435 PGCTRS_CANDS_GETVALUE(mnode, mrange, r, cands);
2436 if (cands != 0)
2437 break;
2438 }
2439 if (cands == 0) {
2440 VM_STAT_ADD(vmm_vmstats.
2441 page_ctrs_cands_skip_all);
2442 continue;
2443 }
2444
2445 full = FULL_REGION_CNT(r);
2446 len = PAGE_COUNTERS_ENTRIES(mnode, r);
2447
2448 for (idx = 0; idx < len; idx++) {
2449 if (PAGE_COUNTERS(mnode, r, idx) == full) {
2450 pfn_t pfnum =
2451 IDX_TO_PNUM(mnode, r, idx);
2452 int tmnode = interleaved_mnodes ?
2453 PFN_2_MEM_NODE(pfnum) : mnode;
2454
2455 ASSERT(pfnum >=
2456 mem_node_config[tmnode].physbase &&
2457 pfnum <
2458 mem_node_config[tmnode].physmax);
2459
2460 (void) page_promote(tmnode,
2461 pfnum, r, PC_FREE, PC_MTYPE_ANY);
2462 }
2463 }
2464 /* shared hpm_counters covers all mnodes, so we quit */
2465 if (interleaved_mnodes)
2466 break;
2467 }
2468 }
2469 for (mnode = mlo; mnode < mhi; mnode++) {
2470 page_freelist_unlock(mnode);
2471 rw_exit(&page_ctrs_rwlock[mnode]);
2472 }
2473 }
2474
2475 /*
2476 * This is where all policies for moving pages around
2477 * to different page size free lists are implemented.
2478 * Returns 1 on success, 0 on failure.
2479 *
2480 * So far these are the priorities for this algorithm in descending
2481 * order:
2482 *
2483 * 1) When servicing a request try to do so with a free page
2484 * from next size up. Helps defer fragmentation as long
2485 * as possible.
2486 *
2487 * 2) Page coalesce on demand. Only when a freelist
2488 * larger than PAGESIZE is empty and step 1
2489 * will not work since all larger size lists are
2490 * also empty.
2491 *
2492 * If pfnhi is non-zero, search for large page with pfn range less than pfnhi.
2493 */
2494
2495 page_t *
2496 page_freelist_split(uchar_t szc, uint_t color, int mnode, int mtype,
2497 pfn_t pfnhi, page_list_walker_t *plw)
2498 {
2499 uchar_t nszc = szc + 1;
2500 uint_t bin, sbin, bin_prev;
2501 page_t *pp, *firstpp;
2502 page_t *ret_pp = NULL;
2503 uint_t color_mask;
2504
2505 if (nszc == mmu_page_sizes)
2506 return (NULL);
2507
2508 ASSERT(nszc < mmu_page_sizes);
2509 color_mask = PAGE_GET_PAGECOLORS(nszc) - 1;
2510 bin = sbin = PAGE_GET_NSZ_COLOR(szc, color);
2511 bin_prev = (plw->plw_bin_split_prev == color) ? INVALID_COLOR :
2512 PAGE_GET_NSZ_COLOR(szc, plw->plw_bin_split_prev);
2513
2514 VM_STAT_ADD(vmm_vmstats.pfs_req[szc]);
2515 /*
2516 * First try to break up a larger page to fill current size freelist.
2517 */
2518 while (plw->plw_bins[nszc] != 0) {
2519
2520 ASSERT(nszc < mmu_page_sizes);
2521
2522 /*
2523 * If page found then demote it.
2524 */
2525 if (PAGE_FREELISTS(mnode, nszc, bin, mtype)) {
2526 page_freelist_lock(mnode);
2527 firstpp = pp = PAGE_FREELISTS(mnode, nszc, bin, mtype);
2528
2529 /*
2530 * If pfnhi is not PFNNULL, look for large page below
2531 * pfnhi. PFNNULL signifies no pfn requirement.
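 * The bin's large pages form a circular list linked by p_vpnext, so
 * the walk below gives up if it wraps back around to firstpp.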
2532 */
2533 if (pfnhi != PFNNULL && pp->p_pagenum >= pfnhi) {
2534 do {
2535 pp = pp->p_vpnext;
2536 if (pp == firstpp) {
2537 pp = NULL;
2538 break;
2539 }
2540 } while (pp->p_pagenum >= pfnhi);
2541 }
2542 if (pp) {
2543 uint_t ccolor = page_correct_color(szc, nszc,
2544 color, bin, plw->plw_ceq_mask[szc]);
2545
2546 ASSERT(pp->p_szc == nszc);
2547 VM_STAT_ADD(vmm_vmstats.pfs_demote[nszc]);
2548 ret_pp = page_demote(mnode, pp->p_pagenum,
2549 pp->p_szc, szc, ccolor, PC_ALLOC);
2550 if (ret_pp) {
2551 page_freelist_unlock(mnode);
2552 #if defined(__sparc)
2553 if (PP_ISNORELOC(ret_pp)) {
2554 pgcnt_t npgs;
2555
2556 npgs = page_get_pagecnt(
2557 ret_pp->p_szc);
2558 kcage_freemem_sub(npgs);
2559 }
2560 #endif
2561 return (ret_pp);
2562 }
2563 }
2564 page_freelist_unlock(mnode);
2565 }
2566
2567 /* loop through next size bins */
2568 bin = ADD_MASKED(bin, 1, plw->plw_ceq_mask[nszc], color_mask);
2569 plw->plw_bins[nszc]--;
2570
2571 if (bin == sbin) {
2572 uchar_t nnszc = nszc + 1;
2573
2574 /* we are done with this page size - check next */
2575 if (plw->plw_bins[nnszc] == 0)
2576 /* we have already checked next size bins */
2577 break;
2578
2579 bin = sbin = PAGE_GET_NSZ_COLOR(nszc, bin);
2580 if (bin_prev != INVALID_COLOR) {
2581 bin_prev = PAGE_GET_NSZ_COLOR(nszc, bin_prev);
2582 if (!((bin ^ bin_prev) &
2583 plw->plw_ceq_mask[nnszc]))
2584 break;
2585 }
2586 ASSERT(nnszc < mmu_page_sizes);
2587 color_mask = PAGE_GET_PAGECOLORS(nnszc) - 1;
2588 nszc = nnszc;
2589 ASSERT(nszc < mmu_page_sizes);
2590 }
2591 }
2592
2593 return (ret_pp);
2594 }
2595
2596 /*
2597 * Helper routine used only by the freelist code to lock
2598 * a page. If the page is a large page then it succeeds in
2599 * locking all the constituent pages or none at all.
2600 * Returns 1 on success, 0 on failure.
2601 */
2602 static int
2603 page_trylock_cons(page_t *pp, se_t se)
2604 {
2605 page_t *tpp, *first_pp = pp;
2606
2607 /*
2608 * Fail if can't lock first or only page.
2609 */
2610 if (!page_trylock(pp, se)) {
2611 return (0);
2612 }
2613
2614 /*
2615 * PAGESIZE: common case.
2616 */
2617 if (pp->p_szc == 0) {
2618 return (1);
2619 }
2620
2621 /*
2622 * Large page case.
2623 */
2624 tpp = pp->p_next;
2625 while (tpp != pp) {
2626 if (!page_trylock(tpp, se)) {
2627 /*
2628 * On failure unlock what we have locked so far.
2629 * We want to avoid attempting to capture these
2630 * pages as the pcm mutex may be held which could
2631 * lead to a recursive mutex panic.
2632 */
2633 while (first_pp != tpp) {
2634 page_unlock_nocapture(first_pp);
2635 first_pp = first_pp->p_next;
2636 }
2637 return (0);
2638 }
2639 tpp = tpp->p_next;
2640 }
2641 return (1);
2642 }
2643
2644 /*
2645 * init context for walking page lists
2646 * Called when a page of the given szc is unavailable. Sets markers
2647 * for the beginning of the search to detect when search has
2648 * completed a full cycle. Sets flags for splitting larger pages
2649 * and coalescing smaller pages. Page walking proceeds until a page
2650 * of the desired equivalent color is found.
2651 */
2652 void
2653 page_list_walk_init(uchar_t szc, uint_t flags, uint_t bin, int can_split,
2654 int use_ceq, page_list_walker_t *plw)
2655 {
2656 uint_t nszc, ceq_mask, colors;
2657 uchar_t ceq = use_ceq ?
colorequivszc[szc] : 0;
2658
2659 ASSERT(szc < mmu_page_sizes);
2660 colors = PAGE_GET_PAGECOLORS(szc);
2661
2662 plw->plw_colors = colors;
2663 plw->plw_color_mask = colors - 1;
2664 plw->plw_bin_marker = plw->plw_bin0 = bin;
2665 plw->plw_bin_split_prev = bin;
2666 plw->plw_bin_step = (szc == 0) ? vac_colors : 1;
2667
2668 /*
2669 * if vac aliasing is possible make sure lower order color
2670 * bits are never ignored
2671 */
2672 if (vac_colors > 1)
2673 ceq &= 0xf0;
2674
2675 /*
2676 * calculate the number of non-equivalent colors and
2677 * color equivalency mask
2678 */
2679 plw->plw_ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf));
2680 ASSERT(szc > 0 || plw->plw_ceq_dif >= vac_colors);
2681 ASSERT(plw->plw_ceq_dif > 0);
2682 plw->plw_ceq_mask[szc] = (plw->plw_ceq_dif - 1) << (ceq & 0xf);
2683
2684 if (flags & PG_MATCH_COLOR) {
2685 if (cpu_page_colors < 0) {
2686 /*
2687 * this is a heterogeneous machine with different CPUs
2688 * having different size e$ (not supported for ni2/rock)
2689 */
2690 uint_t cpucolors = CPUSETSIZE() >> PAGE_GET_SHIFT(szc);
2691 cpucolors = MAX(cpucolors, 1);
2692 ceq_mask = plw->plw_color_mask & (cpucolors - 1);
2693 plw->plw_ceq_mask[szc] =
2694 MIN(ceq_mask, plw->plw_ceq_mask[szc]);
2695 }
2696 plw->plw_ceq_dif = 1;
2697 }
2698
2699 /* we can split pages in the freelist, but not the cachelist */
2700 if (can_split) {
2701 plw->plw_do_split = (szc + 1 < mmu_page_sizes) ? 1 : 0;
2702
2703 /* set next szc color masks and number of free list bins */
2704 for (nszc = szc + 1; nszc < mmu_page_sizes; nszc++, szc++) {
2705 plw->plw_ceq_mask[nszc] = PAGE_GET_NSZ_MASK(szc,
2706 plw->plw_ceq_mask[szc]);
2707 plw->plw_bins[nszc] = PAGE_GET_PAGECOLORS(nszc);
2708 }
2709 plw->plw_ceq_mask[nszc] = INVALID_MASK;
2710 plw->plw_bins[nszc] = 0;
2711
2712 } else {
2713 ASSERT(szc == 0);
2714 plw->plw_do_split = 0;
2715 plw->plw_bins[1] = 0;
2716 plw->plw_ceq_mask[1] = INVALID_MASK;
2717 }
2718 }
2719
2720 /*
2721 * set mark to flag where next split should occur
2722 */
2723 #define PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw) { \
2724 uint_t bin_nsz = PAGE_GET_NSZ_COLOR(szc, bin); \
2725 uint_t bin0_nsz = PAGE_GET_NSZ_COLOR(szc, plw->plw_bin0); \
2726 uint_t neq_mask = ~plw->plw_ceq_mask[nszc] & plw->plw_color_mask; \
2727 plw->plw_split_next = \
2728 INC_MASKED(bin_nsz, neq_mask, plw->plw_color_mask); \
2729 if (!((plw->plw_split_next ^ bin0_nsz) & plw->plw_ceq_mask[nszc])) { \
2730 plw->plw_split_next = \
2731 INC_MASKED(plw->plw_split_next, \
2732 neq_mask, plw->plw_color_mask); \
2733 } \
2734 }
2735
2736 uint_t
2737 page_list_walk_next_bin(uchar_t szc, uint_t bin, page_list_walker_t *plw)
2738 {
2739 uint_t neq_mask = ~plw->plw_ceq_mask[szc] & plw->plw_color_mask;
2740 uint_t bin0_nsz, nbin_nsz, nbin0, nbin;
2741 uchar_t nszc = szc + 1;
2742
2743 nbin = ADD_MASKED(bin,
2744 plw->plw_bin_step, neq_mask, plw->plw_color_mask);
2745
2746 if (plw->plw_do_split) {
2747 plw->plw_bin_split_prev = bin;
2748 PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw);
2749 plw->plw_do_split = 0;
2750 }
2751
2752 if (szc == 0) {
2753 if (plw->plw_count != 0 || plw->plw_ceq_dif == vac_colors) {
2754 if (nbin == plw->plw_bin0 &&
2755 (vac_colors == 1 || nbin != plw->plw_bin_marker)) {
2756 nbin = ADD_MASKED(nbin, plw->plw_bin_step,
2757 neq_mask, plw->plw_color_mask);
2758 plw->plw_bin_split_prev = plw->plw_bin0;
2759 }
2760
2761 if (vac_colors > 1 && nbin == plw->plw_bin_marker) {
2762 plw->plw_bin_marker =
2763 nbin = INC_MASKED(nbin, neq_mask,
2764 plw->plw_color_mask);
2765
plw->plw_bin_split_prev = plw->plw_bin0; 2766 /* 2767 * large pages all have the same vac color 2768 * so by now we should be done with next 2769 * size page splitting process 2770 */ 2771 ASSERT(plw->plw_bins[1] == 0); 2772 plw->plw_do_split = 0; 2773 return (nbin); 2774 } 2775 2776 } else { 2777 uint_t bin_jump = (vac_colors == 1) ? 2778 (BIN_STEP & ~3) - (plw->plw_bin0 & 3) : BIN_STEP; 2779 2780 bin_jump &= ~(vac_colors - 1); 2781 2782 nbin0 = ADD_MASKED(plw->plw_bin0, bin_jump, neq_mask, 2783 plw->plw_color_mask); 2784 2785 if ((nbin0 ^ plw->plw_bin0) & plw->plw_ceq_mask[szc]) { 2786 2787 plw->plw_bin_marker = nbin = nbin0; 2788 2789 if (plw->plw_bins[nszc] != 0) { 2790 /* 2791 * check if next page size bin is the 2792 * same as the next page size bin for 2793 * bin0 2794 */ 2795 nbin_nsz = PAGE_GET_NSZ_COLOR(szc, 2796 nbin); 2797 bin0_nsz = PAGE_GET_NSZ_COLOR(szc, 2798 plw->plw_bin0); 2799 2800 if ((bin0_nsz ^ nbin_nsz) & 2801 plw->plw_ceq_mask[nszc]) 2802 plw->plw_do_split = 1; 2803 } 2804 return (nbin); 2805 } 2806 } 2807 } 2808 2809 if (plw->plw_bins[nszc] != 0) { 2810 nbin_nsz = PAGE_GET_NSZ_COLOR(szc, nbin); 2811 if (!((plw->plw_split_next ^ nbin_nsz) & 2812 plw->plw_ceq_mask[nszc])) 2813 plw->plw_do_split = 1; 2814 } 2815 2816 return (nbin); 2817 } 2818 2819 page_t * 2820 page_get_mnode_freelist(int mnode, uint_t bin, int mtype, uchar_t szc, 2821 uint_t flags) 2822 { 2823 kmutex_t *pcm; 2824 page_t *pp, *first_pp; 2825 uint_t sbin; 2826 int plw_initialized; 2827 page_list_walker_t plw; 2828 2829 ASSERT(szc < mmu_page_sizes); 2830 2831 VM_STAT_ADD(vmm_vmstats.pgmf_alloc[szc]); 2832 2833 MTYPE_START(mnode, mtype, flags); 2834 if (mtype < 0) { /* mnode does not have memory in mtype range */ 2835 VM_STAT_ADD(vmm_vmstats.pgmf_allocempty[szc]); 2836 return (NULL); 2837 } 2838 try_again: 2839 2840 plw_initialized = 0; 2841 plw.plw_ceq_dif = 1; 2842 2843 /* 2844 * Only hold one freelist lock at a time, that way we 2845 * can start anywhere and not have to worry about lock 2846 * ordering. 2847 */ 2848 for (plw.plw_count = 0; 2849 plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) { 2850 sbin = bin; 2851 do { 2852 if (!PAGE_FREELISTS(mnode, szc, bin, mtype)) 2853 goto bin_empty_1; 2854 2855 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST); 2856 mutex_enter(pcm); 2857 pp = PAGE_FREELISTS(mnode, szc, bin, mtype); 2858 if (pp == NULL) 2859 goto bin_empty_0; 2860 2861 /* 2862 * These were set before the page 2863 * was put on the free list, 2864 * they must still be set. 2865 */ 2866 ASSERT(PP_ISFREE(pp)); 2867 ASSERT(PP_ISAGED(pp)); 2868 ASSERT(pp->p_vnode == NULL); 2869 ASSERT(pp->p_hash == NULL); 2870 ASSERT(pp->p_offset == (u_offset_t)-1); 2871 ASSERT(pp->p_szc == szc); 2872 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode); 2873 2874 /* 2875 * Walk down the hash chain. 2876 * 8k pages are linked on p_next 2877 * and p_prev fields. Large pages 2878 * are a contiguous group of 2879 * constituent pages linked together 2880 * on their p_next and p_prev fields. 2881 * The large pages are linked together 2882 * on the hash chain using p_vpnext 2883 * p_vpprev of the base constituent 2884 * page of each large page. 
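 *
 * An informal picture of one large page freelist bin:
 *
 *	bin --> [lpA base] <-vp-> [lpB base] <-vp-> ...
 *	            |
 *	            +-- constituent pages of lpA, circularly
 *	                linked on p_next/p_prev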
2885 */ 2886 first_pp = pp; 2887 while (!page_trylock_cons(pp, SE_EXCL)) { 2888 if (szc == 0) { 2889 pp = pp->p_next; 2890 } else { 2891 pp = pp->p_vpnext; 2892 } 2893 2894 ASSERT(PP_ISFREE(pp)); 2895 ASSERT(PP_ISAGED(pp)); 2896 ASSERT(pp->p_vnode == NULL); 2897 ASSERT(pp->p_hash == NULL); 2898 ASSERT(pp->p_offset == (u_offset_t)-1); 2899 ASSERT(pp->p_szc == szc); 2900 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode); 2901 2902 if (pp == first_pp) 2903 goto bin_empty_0; 2904 } 2905 2906 ASSERT(pp != NULL); 2907 ASSERT(mtype == PP_2_MTYPE(pp)); 2908 ASSERT(pp->p_szc == szc); 2909 if (szc == 0) { 2910 page_sub(&PAGE_FREELISTS(mnode, 2911 szc, bin, mtype), pp); 2912 } else { 2913 page_vpsub(&PAGE_FREELISTS(mnode, 2914 szc, bin, mtype), pp); 2915 CHK_LPG(pp, szc); 2916 } 2917 page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST); 2918 2919 if ((PP_ISFREE(pp) == 0) || (PP_ISAGED(pp) == 0)) 2920 panic("free page is not. pp %p", (void *)pp); 2921 mutex_exit(pcm); 2922 2923 #if defined(__sparc) 2924 ASSERT(!kcage_on || PP_ISNORELOC(pp) || 2925 (flags & PG_NORELOC) == 0); 2926 2927 if (PP_ISNORELOC(pp)) 2928 kcage_freemem_sub(page_get_pagecnt(szc)); 2929 #endif 2930 VM_STAT_ADD(vmm_vmstats.pgmf_allocok[szc]); 2931 return (pp); 2932 2933 bin_empty_0: 2934 mutex_exit(pcm); 2935 bin_empty_1: 2936 if (plw_initialized == 0) { 2937 page_list_walk_init(szc, flags, bin, 1, 1, 2938 &plw); 2939 plw_initialized = 1; 2940 ASSERT(plw.plw_colors <= 2941 PAGE_GET_PAGECOLORS(szc)); 2942 ASSERT(plw.plw_colors > 0); 2943 ASSERT((plw.plw_colors & 2944 (plw.plw_colors - 1)) == 0); 2945 ASSERT(bin < plw.plw_colors); 2946 ASSERT(plw.plw_ceq_mask[szc] < plw.plw_colors); 2947 } 2948 /* calculate the next bin with equivalent color */ 2949 bin = ADD_MASKED(bin, plw.plw_bin_step, 2950 plw.plw_ceq_mask[szc], plw.plw_color_mask); 2951 } while (sbin != bin); 2952 2953 /* 2954 * color bins are all empty if color match. Try and 2955 * satisfy the request by breaking up or coalescing 2956 * pages from a different size freelist of the correct 2957 * color that satisfies the ORIGINAL color requested. 2958 * If that fails then try pages of the same size but 2959 * different colors assuming we are not called with 2960 * PG_MATCH_COLOR. 2961 */ 2962 if (plw.plw_do_split && 2963 (pp = page_freelist_split(szc, bin, mnode, 2964 mtype, PFNNULL, &plw)) != NULL) 2965 return (pp); 2966 2967 if (szc > 0 && (pp = page_freelist_coalesce(mnode, szc, 2968 bin, plw.plw_ceq_mask[szc], mtype, PFNNULL)) != NULL) 2969 return (pp); 2970 2971 if (plw.plw_ceq_dif > 1) 2972 bin = page_list_walk_next_bin(szc, bin, &plw); 2973 } 2974 2975 /* if allowed, cycle through additional mtypes */ 2976 MTYPE_NEXT(mnode, mtype, flags); 2977 if (mtype >= 0) 2978 goto try_again; 2979 2980 VM_STAT_ADD(vmm_vmstats.pgmf_allocfailed[szc]); 2981 2982 return (NULL); 2983 } 2984 2985 /* 2986 * Returns the count of free pages for 'pp' with size code 'szc'. 2987 * Note: This function does not return an exact value as the page freelist 2988 * locks are not held and thus the values in the page_counters may be 2989 * changing as we walk through the data. 
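 *
 * As a rough example of the arithmetic: a counter value of cnt at
 * region size r accounts for cnt fully free regions of size (r - 1),
 * i.e. about (cnt << PNUM_SHIFT(r - 1)) PAGESIZE pages, which is what
 * the first step below computes.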
2990 */ 2991 static int 2992 page_freecnt(int mnode, page_t *pp, uchar_t szc) 2993 { 2994 pgcnt_t pgfree; 2995 pgcnt_t cnt; 2996 ssize_t r = szc; /* region size */ 2997 ssize_t idx; 2998 int i; 2999 int full, range; 3000 3001 /* Make sure pagenum passed in is aligned properly */ 3002 ASSERT((pp->p_pagenum & (PNUM_SIZE(szc) - 1)) == 0); 3003 ASSERT(szc > 0); 3004 3005 /* Prevent page_counters dynamic memory from being freed */ 3006 rw_enter(&page_ctrs_rwlock[mnode], RW_READER); 3007 idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum); 3008 cnt = PAGE_COUNTERS(mnode, r, idx); 3009 pgfree = cnt << PNUM_SHIFT(r - 1); 3010 range = FULL_REGION_CNT(szc); 3011 3012 /* Check for completely full region */ 3013 if (cnt == range) { 3014 rw_exit(&page_ctrs_rwlock[mnode]); 3015 return (pgfree); 3016 } 3017 3018 while (--r > 0) { 3019 idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum); 3020 full = FULL_REGION_CNT(r); 3021 for (i = 0; i < range; i++, idx++) { 3022 cnt = PAGE_COUNTERS(mnode, r, idx); 3023 /* 3024 * If cnt here is full, that means we have already 3025 * accounted for these pages earlier. 3026 */ 3027 if (cnt != full) { 3028 pgfree += (cnt << PNUM_SHIFT(r - 1)); 3029 } 3030 } 3031 range *= full; 3032 } 3033 rw_exit(&page_ctrs_rwlock[mnode]); 3034 return (pgfree); 3035 } 3036 3037 /* 3038 * Called from page_geti_contig_pages to exclusively lock constituent pages 3039 * starting from 'spp' for page size code 'szc'. 3040 * 3041 * If 'ptcpthreshold' is set, the number of free pages needed in the 'szc' 3042 * region needs to be greater than or equal to the threshold. 3043 */ 3044 static int 3045 page_trylock_contig_pages(int mnode, page_t *spp, uchar_t szc, int flags) 3046 { 3047 pgcnt_t pgcnt = PNUM_SIZE(szc); 3048 pgcnt_t pgfree, i; 3049 page_t *pp; 3050 3051 VM_STAT_ADD(vmm_vmstats.ptcp[szc]); 3052 3053 3054 if ((ptcpthreshold == 0) || (flags & PGI_PGCPHIPRI)) 3055 goto skipptcpcheck; 3056 /* 3057 * check if there are sufficient free pages available before attempting 3058 * to trylock. Count is approximate as page counters can change. 3059 */ 3060 pgfree = page_freecnt(mnode, spp, szc); 3061 3062 /* attempt to trylock if there are sufficient already free pages */ 3063 if (pgfree < pgcnt/ptcpthreshold) { 3064 VM_STAT_ADD(vmm_vmstats.ptcpfreethresh[szc]); 3065 return (0); 3066 } 3067 3068 skipptcpcheck: 3069 3070 for (i = 0; i < pgcnt; i++) { 3071 pp = &spp[i]; 3072 if (!page_trylock(pp, SE_EXCL)) { 3073 VM_STAT_ADD(vmm_vmstats.ptcpfailexcl[szc]); 3074 while (--i != (pgcnt_t)-1) { 3075 pp = &spp[i]; 3076 ASSERT(PAGE_EXCL(pp)); 3077 page_unlock_nocapture(pp); 3078 } 3079 return (0); 3080 } 3081 ASSERT(spp[i].p_pagenum == spp->p_pagenum + i); 3082 if ((pp->p_szc > szc || (szc && pp->p_szc == szc)) && 3083 !PP_ISFREE(pp)) { 3084 VM_STAT_ADD(vmm_vmstats.ptcpfailszc[szc]); 3085 ASSERT(i == 0); 3086 page_unlock_nocapture(pp); 3087 return (0); 3088 } 3089 if (PP_ISNORELOC(pp)) { 3090 VM_STAT_ADD(vmm_vmstats.ptcpfailcage[szc]); 3091 while (i != (pgcnt_t)-1) { 3092 pp = &spp[i]; 3093 ASSERT(PAGE_EXCL(pp)); 3094 page_unlock_nocapture(pp); 3095 i--; 3096 } 3097 return (0); 3098 } 3099 } 3100 VM_STAT_ADD(vmm_vmstats.ptcpok[szc]); 3101 return (1); 3102 } 3103 3104 /* 3105 * Claim large page pointed to by 'pp'. 'pp' is the starting set 3106 * of 'szc' constituent pages that had been locked exclusively previously. 3107 * Will attempt to relocate constituent pages in use. 
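 * Constituent pages that are already free are pulled straight off the
 * free or cache lists; in-use constituents are relocated via
 * page_get_replacement_page() and do_page_relocate().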
3108 */
3109 static page_t *
3110 page_claim_contig_pages(page_t *pp, uchar_t szc, int flags)
3111 {
3112 spgcnt_t pgcnt, npgs, i;
3113 page_t *targpp, *rpp, *hpp;
3114 page_t *replpp = NULL;
3115 page_t *pplist = NULL;
3116
3117 ASSERT(pp != NULL);
3118
3119 pgcnt = page_get_pagecnt(szc);
3120 while (pgcnt) {
3121 ASSERT(PAGE_EXCL(pp));
3122 ASSERT(!PP_ISNORELOC(pp));
3123 if (PP_ISFREE(pp)) {
3124 /*
3125 * If this is a PG_FREE_LIST page then its
3126 * size code can change underneath us due to
3127 * page promotion or demotion. As an optimization
3128 * use page_list_sub_pages() instead of
3129 * page_list_sub().
3130 */
3131 if (PP_ISAGED(pp)) {
3132 page_list_sub_pages(pp, szc);
3133 if (pp->p_szc == szc) {
3134 return (pp);
3135 }
3136 ASSERT(pp->p_szc < szc);
3137 npgs = page_get_pagecnt(pp->p_szc);
3138 hpp = pp;
3139 for (i = 0; i < npgs; i++, pp++) {
3140 pp->p_szc = szc;
3141 }
3142 page_list_concat(&pplist, &hpp);
3143 pgcnt -= npgs;
3144 continue;
3145 }
3146 ASSERT(!PP_ISAGED(pp));
3147 ASSERT(pp->p_szc == 0);
3148 page_list_sub(pp, PG_CACHE_LIST);
3149 page_hashout(pp, NULL);
3150 PP_SETAGED(pp);
3151 pp->p_szc = szc;
3152 page_list_concat(&pplist, &pp);
3153 pp++;
3154 pgcnt--;
3155 continue;
3156 }
3157 npgs = page_get_pagecnt(pp->p_szc);
3158
3159 /*
3160 * page_create_wait freemem accounting done by caller of
3161 * page_get_freelist and not necessary to call it prior to
3162 * calling page_get_replacement_page.
3163 *
3164 * page_get_replacement_page can call page_get_contig_pages
3165 * to acquire a large page (szc > 0); the replacement must be
3166 * smaller than the contig page size to avoid looping or
3167 * szc == 0 and PGI_PGCPSZC0 is set.
3168 */
3169 if (pp->p_szc < szc || (szc == 0 && (flags & PGI_PGCPSZC0))) {
3170 replpp = page_get_replacement_page(pp, NULL, 0);
3171 if (replpp) {
3172 npgs = page_get_pagecnt(pp->p_szc);
3173 ASSERT(npgs <= pgcnt);
3174 targpp = pp;
3175 }
3176 }
3177
3178 /*
3179 * If replacement is NULL or do_page_relocate fails, fail
3180 * coalescing of pages.
3181 */
3182 if (replpp == NULL || (do_page_relocate(&targpp, &replpp, 0,
3183 &npgs, NULL) != 0)) {
3184 /*
3185 * Unlock un-processed target list
3186 */
3187 while (pgcnt--) {
3188 ASSERT(PAGE_EXCL(pp));
3189 page_unlock_nocapture(pp);
3190 pp++;
3191 }
3192 /*
3193 * Free the processed target list.
3194 */
3195 while (pplist) {
3196 pp = pplist;
3197 page_sub(&pplist, pp);
3198 ASSERT(PAGE_EXCL(pp));
3199 ASSERT(pp->p_szc == szc);
3200 ASSERT(PP_ISFREE(pp));
3201 ASSERT(PP_ISAGED(pp));
3202 pp->p_szc = 0;
3203 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
3204 page_unlock_nocapture(pp);
3205 }
3206
3207 if (replpp != NULL)
3208 page_free_replacement_page(replpp);
3209
3210 return (NULL);
3211 }
3212 ASSERT(pp == targpp);
3213
3214 /* LINTED */
3215 ASSERT(hpp = pp); /* That's right, it's an assignment */
3216
3217 pp += npgs;
3218 pgcnt -= npgs;
3219
3220 while (npgs--) {
3221 ASSERT(PAGE_EXCL(targpp));
3222 ASSERT(!PP_ISFREE(targpp));
3223 ASSERT(!PP_ISNORELOC(targpp));
3224 PP_SETFREE(targpp);
3225 ASSERT(PP_ISAGED(targpp));
3226 ASSERT(targpp->p_szc < szc || (szc == 0 &&
3227 (flags & PGI_PGCPSZC0)));
3228 targpp->p_szc = szc;
3229 targpp = targpp->p_next;
3230
3231 rpp = replpp;
3232 ASSERT(rpp != NULL);
3233 page_sub(&replpp, rpp);
3234 ASSERT(PAGE_EXCL(rpp));
3235 ASSERT(!PP_ISFREE(rpp));
3236 page_unlock_nocapture(rpp);
3237 }
3238 ASSERT(targpp == hpp);
3239 ASSERT(replpp == NULL);
3240 page_list_concat(&pplist, &targpp);
3241 }
3242 CHK_LPG(pplist, szc);
3243 return (pplist);
3244 }
3245
3246 /*
3247 * Trim kernel cage from pfnlo-pfnhi and store result in lo-hi. Return code
3248 * of 0 means nothing left after trim.
3249 */
3250 int
3251 trimkcage(struct memseg *mseg, pfn_t *lo, pfn_t *hi, pfn_t pfnlo, pfn_t pfnhi)
3252 {
3253 pfn_t kcagepfn;
3254 int decr;
3255 int rc = 0;
3256
3257 if (PP_ISNORELOC(mseg->pages)) {
3258 if (PP_ISNORELOC(mseg->epages - 1) == 0) {
3259
3260 /* lower part of this mseg inside kernel cage */
3261 decr = kcage_current_pfn(&kcagepfn);
3262
3263 /* kernel cage may have transitioned past mseg */
3264 if (kcagepfn >= mseg->pages_base &&
3265 kcagepfn < mseg->pages_end) {
3266 ASSERT(decr == 0);
3267 *lo = kcagepfn;
3268 *hi = MIN(pfnhi,
3269 (mseg->pages_end - 1));
3270 rc = 1;
3271 }
3272 }
3273 /* else entire mseg in the cage */
3274 } else {
3275 if (PP_ISNORELOC(mseg->epages - 1)) {
3276
3277 /* upper part of this mseg inside kernel cage */
3278 decr = kcage_current_pfn(&kcagepfn);
3279
3280 /* kernel cage may have transitioned past mseg */
3281 if (kcagepfn >= mseg->pages_base &&
3282 kcagepfn < mseg->pages_end) {
3283 ASSERT(decr);
3284 *hi = kcagepfn;
3285 *lo = MAX(pfnlo, mseg->pages_base);
3286 rc = 1;
3287 }
3288 } else {
3289 /* entire mseg outside of kernel cage */
3290 *lo = MAX(pfnlo, mseg->pages_base);
3291 *hi = MIN(pfnhi, (mseg->pages_end - 1));
3292 rc = 1;
3293 }
3294 }
3295 return (rc);
3296 }
3297
3298 /*
3299 * called from page_get_contig_pages to search 'pfnlo' thru 'pfnhi' to claim a
3300 * page with size code 'szc'. Claiming such a page requires acquiring
3301 * exclusive locks on all constituent pages (page_trylock_contig_pages),
3302 * relocating pages in use and concatenating these constituent pages into a
3303 * large page.
3304 *
3305 * The page lists do not have such a large page and page_freelist_split has
3306 * already failed to demote larger pages and/or coalesce smaller free pages.
3307 *
3308 * 'flags' may specify PG_MATCH_COLOR which would limit the search to large
3309 * pages with the same color as 'bin'.
3310 *
3311 * 'pfnflag' specifies the subset of the pfn range to search.
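 *
 * For example (sketch of the slot computation below): pfnflag == 0b1010
 * gives slots = 1 << (highbit(0b1010) - 1) = 8 and
 * slotid = 0b1010 & (8 - 1) = 2, selecting slot 2 of a pfn range
 * divided into 8 equal slots.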
3312 */ 3313 3314 3315 static page_t * 3316 page_geti_contig_pages(int mnode, uint_t bin, uchar_t szc, int flags, 3317 pfn_t pfnlo, pfn_t pfnhi, pgcnt_t pfnflag) 3318 { 3319 struct memseg *mseg; 3320 pgcnt_t szcpgcnt = page_get_pagecnt(szc); 3321 pgcnt_t szcpgmask = szcpgcnt - 1; 3322 pfn_t randpfn; 3323 page_t *pp, *randpp, *endpp; 3324 uint_t colors, ceq_mask; 3325 /* LINTED : set but not used in function */ 3326 uint_t color_mask; 3327 pfn_t hi, lo; 3328 uint_t skip; 3329 MEM_NODE_ITERATOR_DECL(it); 3330 3331 ASSERT(szc != 0 || (flags & PGI_PGCPSZC0)); 3332 3333 if ((pfnhi - pfnlo) + 1 < szcpgcnt) 3334 return (NULL); 3335 3336 ASSERT(szc < mmu_page_sizes); 3337 3338 colors = PAGE_GET_PAGECOLORS(szc); 3339 color_mask = colors - 1; 3340 if ((colors > 1) && (flags & PG_MATCH_COLOR)) { 3341 uchar_t ceq = colorequivszc[szc]; 3342 uint_t ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf)); 3343 3344 ASSERT(ceq_dif > 0); 3345 ceq_mask = (ceq_dif - 1) << (ceq & 0xf); 3346 } else { 3347 ceq_mask = 0; 3348 } 3349 3350 ASSERT(bin < colors); 3351 3352 /* clear "non-significant" color bits */ 3353 bin &= ceq_mask; 3354 3355 /* 3356 * trim the pfn range to search based on pfnflag. pfnflag is set 3357 * when there have been previous page_get_contig_page failures to 3358 * limit the search. 3359 * 3360 * The high bit in pfnflag specifies the number of 'slots' in the 3361 * pfn range and the remainder of pfnflag specifies which slot. 3362 * For example, a value of 1010b would mean the second slot of 3363 * the pfn range that has been divided into 8 slots. 3364 */ 3365 if (pfnflag > 1) { 3366 int slots = 1 << (highbit(pfnflag) - 1); 3367 int slotid = pfnflag & (slots - 1); 3368 pgcnt_t szcpages; 3369 int slotlen; 3370 3371 pfnlo = P2ROUNDUP(pfnlo, szcpgcnt); 3372 pfnhi = pfnhi & ~(szcpgcnt - 1); 3373 3374 szcpages = ((pfnhi - pfnlo) + 1) / szcpgcnt; 3375 slotlen = howmany(szcpages, slots); 3376 pfnlo = pfnlo + (((slotid * slotlen) % szcpages) * szcpgcnt); 3377 ASSERT(pfnlo < pfnhi); 3378 if (pfnhi > pfnlo + (slotlen * szcpgcnt)) 3379 pfnhi = pfnlo + (slotlen * szcpgcnt); 3380 } 3381 3382 memsegs_lock(0); 3383 3384 /* 3385 * loop through memsegs to look for contig page candidates 3386 */ 3387 3388 for (mseg = memsegs; mseg != NULL; mseg = mseg->next) { 3389 if (pfnhi < mseg->pages_base || pfnlo >= mseg->pages_end) { 3390 /* no overlap */ 3391 continue; 3392 } 3393 3394 if (mseg->pages_end - mseg->pages_base < szcpgcnt) 3395 /* mseg too small */ 3396 continue; 3397 3398 /* trim off kernel cage pages from pfn range */ 3399 if (kcage_on) { 3400 if (trimkcage(mseg, &lo, &hi, pfnlo, pfnhi) == 0) 3401 continue; 3402 } else { 3403 lo = MAX(pfnlo, mseg->pages_base); 3404 hi = MIN(pfnhi, (mseg->pages_end - 1)); 3405 } 3406 3407 /* round to szcpgcnt boundaries */ 3408 lo = P2ROUNDUP(lo, szcpgcnt); 3409 MEM_NODE_ITERATOR_INIT(lo, mnode, &it); 3410 hi = hi & ~(szcpgcnt - 1); 3411 3412 if (hi <= lo) 3413 continue; 3414 3415 /* 3416 * set lo to point to the pfn for the desired bin. 
Large
3417 * page sizes may only have a single page color
3418 */
3419 skip = szcpgcnt;
3420 if (ceq_mask > 0 || interleaved_mnodes) {
3421 /* set lo to point at appropriate color */
3422 if (((PFN_2_COLOR(lo, szc, &it) ^ bin) & ceq_mask) ||
3423 (interleaved_mnodes &&
3424 PFN_2_MEM_NODE(lo) != mnode)) {
3425 PAGE_NEXT_PFN_FOR_COLOR(lo, szc, bin, ceq_mask,
3426 color_mask, &it);
3427 }
3428 if (hi <= lo)
3429 /* mseg cannot satisfy color request */
3430 continue;
3431 }
3432
3433 /* randomly choose a point between lo and hi to begin search */
3434
3435 randpfn = (pfn_t)GETTICK();
3436 randpfn = ((randpfn % (hi - lo)) + lo) & ~(skip - 1);
3437 MEM_NODE_ITERATOR_INIT(randpfn, mnode, &it);
3438 if (ceq_mask || interleaved_mnodes) {
3439 if (randpfn != (pfn_t)-1)
3440 PAGE_NEXT_PFN_FOR_COLOR(randpfn, szc, bin,
3441 ceq_mask, color_mask, &it);
3442 if (randpfn >= hi) {
3443 randpfn = lo;
3444 MEM_NODE_ITERATOR_INIT(randpfn, mnode, &it);
3445 }
3446 }
3447 randpp = mseg->pages + (randpfn - mseg->pages_base);
3448
3449 ASSERT(randpp->p_pagenum == randpfn);
3450
3451 pp = randpp;
3452 endpp = mseg->pages + (hi - mseg->pages_base);
3453
3454 ASSERT(randpp + szcpgcnt <= endpp);
3455
3456 do {
3457 ASSERT(!(pp->p_pagenum & szcpgmask));
3458 ASSERT(((PP_2_BIN(pp) ^ bin) & ceq_mask) == 0);
3459
3460 if (page_trylock_contig_pages(mnode, pp, szc, flags)) {
3461 /* pages unlocked by page_claim on failure */
3462 if (page_claim_contig_pages(pp, szc, flags)) {
3463 memsegs_unlock(0);
3464 return (pp);
3465 }
3466 }
3467
3468 if (ceq_mask == 0 && !interleaved_mnodes) {
3469 pp += skip;
3470 } else {
3471 pfn_t pfn = pp->p_pagenum;
3472
3473 PAGE_NEXT_PFN_FOR_COLOR(pfn, szc, bin,
3474 ceq_mask, color_mask, &it);
3475 if (pfn == (pfn_t)-1) {
3476 pp = endpp;
3477 } else {
3478 pp = mseg->pages +
3479 (pfn - mseg->pages_base);
3480 }
3481 }
3482 if (pp >= endpp) {
3483 /* start from the beginning */
3484 MEM_NODE_ITERATOR_INIT(lo, mnode, &it);
3485 pp = mseg->pages + (lo - mseg->pages_base);
3486 ASSERT(pp->p_pagenum == lo);
3487 ASSERT(pp + szcpgcnt <= endpp);
3488 }
3489 } while (pp != randpp);
3490 }
3491 memsegs_unlock(0);
3492 return (NULL);
3493 }
3494
3495
3496 /*
3497 * controlling routine that searches through physical memory in an attempt to
3498 * claim a large page based on the input parameters, used when the request
3499 * could not be satisfied directly from the page free lists.
3500 *
3501 * calls page_geti_contig_pages with an initial pfn range from the mnode
3502 * and mtype. page_geti_contig_pages will trim off the parts of the pfn range
3503 * that overlap with the kernel cage or do not match the requested page
3504 * color if PG_MATCH_COLOR is set. Since this search is very expensive,
3505 * page_geti_contig_pages may further limit the search range based on
3506 * previous failure counts (pgcpfailcnt[]).
3507 *
3508 * for PGI_PGCPSZC0 requests, page_get_contig_pages will relocate a base
3509 * pagesize page that satisfies mtype.
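 *
 * On success pgcpfailcnt[szc] is halved, which doubles the slice of the
 * pfn range searched on the next call; failures bump it back up via
 * SETPGCPFAILCNT() in page_get_freelist().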
3510 */ 3511 page_t * 3512 page_get_contig_pages(int mnode, uint_t bin, int mtype, uchar_t szc, 3513 uint_t flags) 3514 { 3515 pfn_t pfnlo, pfnhi; /* contig pages pfn range */ 3516 page_t *pp; 3517 pgcnt_t pfnflag = 0; /* no limit on search if 0 */ 3518 3519 VM_STAT_ADD(vmm_vmstats.pgcp_alloc[szc]); 3520 3521 /* no allocations from cage */ 3522 flags |= PGI_NOCAGE; 3523 3524 /* LINTED */ 3525 MTYPE_START(mnode, mtype, flags); 3526 if (mtype < 0) { /* mnode does not have memory in mtype range */ 3527 VM_STAT_ADD(vmm_vmstats.pgcp_allocempty[szc]); 3528 return (NULL); 3529 } 3530 3531 ASSERT(szc > 0 || (flags & PGI_PGCPSZC0)); 3532 3533 /* do not limit search and ignore color if hi pri */ 3534 3535 if (pgcplimitsearch && ((flags & PGI_PGCPHIPRI) == 0)) 3536 pfnflag = pgcpfailcnt[szc]; 3537 3538 /* remove color match to improve chances */ 3539 3540 if (flags & PGI_PGCPHIPRI || pfnflag) 3541 flags &= ~PG_MATCH_COLOR; 3542 3543 do { 3544 /* get pfn range based on mnode and mtype */ 3545 MNODETYPE_2_PFN(mnode, mtype, pfnlo, pfnhi); 3546 3547 ASSERT(pfnhi >= pfnlo); 3548 3549 pp = page_geti_contig_pages(mnode, bin, szc, flags, 3550 pfnlo, pfnhi, pfnflag); 3551 3552 if (pp != NULL) { 3553 pfnflag = pgcpfailcnt[szc]; 3554 if (pfnflag) { 3555 /* double the search size */ 3556 pgcpfailcnt[szc] = pfnflag >> 1; 3557 } 3558 VM_STAT_ADD(vmm_vmstats.pgcp_allocok[szc]); 3559 return (pp); 3560 } 3561 MTYPE_NEXT(mnode, mtype, flags); 3562 } while (mtype >= 0); 3563 3564 VM_STAT_ADD(vmm_vmstats.pgcp_allocfailed[szc]); 3565 return (NULL); 3566 } 3567 3568 3569 /* 3570 * Find the `best' page on the freelist for this (vp,off) (as,vaddr) pair. 3571 * 3572 * Does its own locking and accounting. 3573 * If PG_MATCH_COLOR is set, then NULL will be returned if there are no 3574 * pages of the proper color even if there are pages of a different color. 3575 * 3576 * Finds a page, removes it, THEN locks it. 3577 */ 3578 3579 /*ARGSUSED*/ 3580 page_t * 3581 page_get_freelist(struct vnode *vp, u_offset_t off, struct seg *seg, 3582 caddr_t vaddr, size_t size, uint_t flags, struct lgrp *lgrp) 3583 { 3584 struct as *as = seg->s_as; 3585 page_t *pp = NULL; 3586 ulong_t bin; 3587 uchar_t szc; 3588 int mnode; 3589 int mtype; 3590 page_t *(*page_get_func)(int, uint_t, int, uchar_t, uint_t); 3591 lgrp_mnode_cookie_t lgrp_cookie; 3592 3593 page_get_func = page_get_mnode_freelist; 3594 3595 /* 3596 * If we aren't passed a specific lgroup, or passed a freed lgrp 3597 * assume we wish to allocate near to the current thread's home. 3598 */ 3599 if (!LGRP_EXISTS(lgrp)) 3600 lgrp = lgrp_home_lgrp(); 3601 3602 if (kcage_on) { 3603 if ((flags & (PG_NORELOC | PG_PANIC)) == PG_NORELOC && 3604 kcage_freemem < kcage_throttlefree + btop(size) && 3605 curthread != kcage_cageout_thread) { 3606 /* 3607 * Set a "reserve" of kcage_throttlefree pages for 3608 * PG_PANIC and cageout thread allocations. 3609 * 3610 * Everybody else has to serialize in 3611 * page_create_get_something() to get a cage page, so 3612 * that we don't deadlock cageout! 3613 */ 3614 return (NULL); 3615 } 3616 } else { 3617 flags &= ~PG_NORELOC; 3618 flags |= PGI_NOCAGE; 3619 } 3620 3621 /* LINTED */ 3622 MTYPE_INIT(mtype, vp, vaddr, flags, size); 3623 3624 /* 3625 * Convert size to page size code. 
3626 */
3627 if ((szc = page_szc(size)) == (uchar_t)-1)
3628 panic("page_get_freelist: illegal page size request");
3629 ASSERT(szc < mmu_page_sizes);
3630
3631 VM_STAT_ADD(vmm_vmstats.pgf_alloc[szc]);
3632
3633 /* LINTED */
3634 AS_2_BIN(as, seg, vp, vaddr, bin, szc);
3635
3636 ASSERT(bin < PAGE_GET_PAGECOLORS(szc));
3637
3638 /*
3639 * Try to get a local page first, but try remote if we can't
3640 * get a page of the right color.
3641 */
3642 pgretry:
3643 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL);
3644 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3645 pp = page_get_func(mnode, bin, mtype, szc, flags);
3646 if (pp != NULL) {
3647 VM_STAT_ADD(vmm_vmstats.pgf_allocok[szc]);
3648 DTRACE_PROBE4(page__get,
3649 lgrp_t *, lgrp,
3650 int, mnode,
3651 ulong_t, bin,
3652 uint_t, flags);
3653 return (pp);
3654 }
3655 }
3656 ASSERT(pp == NULL);
3657
3658 /*
3659 * for non-SZC0 PAGESIZE requests, check cachelist before checking
3660 * remote free lists. Caller expected to call page_get_cachelist which
3661 * will check local cache lists and remote free lists.
3662 */
3663 if (szc == 0 && ((flags & PGI_PGCPSZC0) == 0)) {
3664 VM_STAT_ADD(vmm_vmstats.pgf_allocdeferred);
3665 return (NULL);
3666 }
3667
3668 ASSERT(szc > 0 || (flags & PGI_PGCPSZC0));
3669
3670 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
3671
3672 if (!(flags & PG_LOCAL)) {
3673 /*
3674 * Try to get a non-local freelist page.
3675 */
3676 LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie);
3677 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3678 pp = page_get_func(mnode, bin, mtype, szc, flags);
3679 if (pp != NULL) {
3680 DTRACE_PROBE4(page__get,
3681 lgrp_t *, lgrp,
3682 int, mnode,
3683 ulong_t, bin,
3684 uint_t, flags);
3685 VM_STAT_ADD(vmm_vmstats.pgf_allocokrem[szc]);
3686 return (pp);
3687 }
3688 }
3689 ASSERT(pp == NULL);
3690 }
3691
3692 /*
3693 * When the cage is off, chances are page_get_contig_pages() will fail
3694 * to lock a large page chunk, so it is not called by default when the
3695 * cage is off; this can be changed via /etc/system.
3696 *
3697 * page_get_contig_pages() is also called to acquire a base pagesize
3698 * page for page_create_get_something().
3699 */
3700 if (!(flags & PG_NORELOC) && (pg_contig_disable == 0) &&
3701 (kcage_on || pg_lpgcreate_nocage || szc == 0) &&
3702 (page_get_func != page_get_contig_pages)) {
3703
3704 VM_STAT_ADD(vmm_vmstats.pgf_allocretry[szc]);
3705 page_get_func = page_get_contig_pages;
3706 goto pgretry;
3707 }
3708
3709 if (!(flags & PG_LOCAL) && pgcplimitsearch &&
3710 page_get_func == page_get_contig_pages)
3711 SETPGCPFAILCNT(szc);
3712
3713 VM_STAT_ADD(vmm_vmstats.pgf_allocfailed[szc]);
3714 return (NULL);
3715 }
3716
3717 /*
3718 * Find the `best' page on the cachelist for this (vp,off) (as,vaddr) pair.
3719 *
3720 * Does its own locking.
3721 * If PG_MATCH_COLOR is set, then NULL will be returned if there are no
3722 * pages of the proper color even if there are pages of a different color.
3723 * Otherwise, scan the bins for ones with pages. For each bin with pages,
3724 * try to lock one of them. If no page can be locked, try the
3725 * next bin. Return NULL if a page can not be found and locked.
3726 *
3727 * Finds a page, tries to lock it, then removes it.
3728 */
3729
3730 /*ARGSUSED*/
3731 page_t *
3732 page_get_cachelist(struct vnode *vp, u_offset_t off, struct seg *seg,
3733 caddr_t vaddr, uint_t flags, struct lgrp *lgrp)
3734 {
3735 page_t *pp;
3736 struct as *as = seg->s_as;
3737 ulong_t bin;
3738 /*LINTED*/
3739 int mnode;
3740 int mtype;
3741 lgrp_mnode_cookie_t lgrp_cookie;
3742
3743 /*
3744 * If we aren't passed a specific lgroup, or passed a freed lgrp
3745 * assume we wish to allocate near to the current thread's home.
3746 */
3747 if (!LGRP_EXISTS(lgrp))
3748 lgrp = lgrp_home_lgrp();
3749
3750 if (!kcage_on) {
3751 flags &= ~PG_NORELOC;
3752 flags |= PGI_NOCAGE;
3753 }
3754
3755 if ((flags & (PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == PG_NORELOC &&
3756 kcage_freemem <= kcage_throttlefree) {
3757 /*
3758 * Reserve kcage_throttlefree pages for critical kernel
3759 * threads.
3760 *
3761 * Everybody else has to go to page_create_get_something()
3762 * to get a cage page, so we don't deadlock cageout.
3763 */
3764 return (NULL);
3765 }
3766
3767 /* LINTED */
3768 AS_2_BIN(as, seg, vp, vaddr, bin, 0);
3769
3770 ASSERT(bin < PAGE_GET_PAGECOLORS(0));
3771
3772 /* LINTED */
3773 MTYPE_INIT(mtype, vp, vaddr, flags, MMU_PAGESIZE);
3774
3775 VM_STAT_ADD(vmm_vmstats.pgc_alloc);
3776
3777 /*
3778 * Try local cachelists first
3779 */
3780 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL);
3781 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3782 pp = page_get_mnode_cachelist(bin, flags, mnode, mtype);
3783 if (pp != NULL) {
3784 VM_STAT_ADD(vmm_vmstats.pgc_allocok);
3785 DTRACE_PROBE4(page__get,
3786 lgrp_t *, lgrp,
3787 int, mnode,
3788 ulong_t, bin,
3789 uint_t, flags);
3790 return (pp);
3791 }
3792 }
3793
3794 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
3795
3796 /*
3797 * Try freelists/cachelists that are farther away
3798 * This is our only chance to allocate remote pages for PAGESIZE
3799 * requests.
3800 */
3801 LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie);
3802 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3803 pp = page_get_mnode_freelist(mnode, bin, mtype,
3804 0, flags);
3805 if (pp != NULL) {
3806 VM_STAT_ADD(vmm_vmstats.pgc_allocokdeferred);
3807 DTRACE_PROBE4(page__get,
3808 lgrp_t *, lgrp,
3809 int, mnode,
3810 ulong_t, bin,
3811 uint_t, flags);
3812 return (pp);
3813 }
3814 pp = page_get_mnode_cachelist(bin, flags, mnode, mtype);
3815 if (pp != NULL) {
3816 VM_STAT_ADD(vmm_vmstats.pgc_allocokrem);
3817 DTRACE_PROBE4(page__get,
3818 lgrp_t *, lgrp,
3819 int, mnode,
3820 ulong_t, bin,
3821 uint_t, flags);
3822 return (pp);
3823 }
3824 }
3825
3826 VM_STAT_ADD(vmm_vmstats.pgc_allocfailed);
3827 return (NULL);
3828 }
3829
3830 page_t *
3831 page_get_mnode_cachelist(uint_t bin, uint_t flags, int mnode, int mtype)
3832 {
3833 kmutex_t *pcm;
3834 page_t *pp, *first_pp;
3835 uint_t sbin;
3836 int plw_initialized;
3837 page_list_walker_t plw;
3838
3839 VM_STAT_ADD(vmm_vmstats.pgmc_alloc);
3840
3841 /* LINTED */
3842 MTYPE_START(mnode, mtype, flags);
3843 if (mtype < 0) { /* mnode does not have memory in mtype range */
3844 VM_STAT_ADD(vmm_vmstats.pgmc_allocempty);
3845 return (NULL);
3846 }
3847
3848 try_again:
3849
3850 plw_initialized = 0;
3851 plw.plw_ceq_dif = 1;
3852
3853 /*
3854 * Only hold one cachelist lock at a time, that way we
3855 * can start anywhere and not have to worry about lock
3856 * ordering.
3857 */
3858
3859 for (plw.plw_count = 0;
3860 plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) {
3861 sbin = bin;
3862 do {
3863
3864 if (!PAGE_CACHELISTS(mnode, bin, mtype))
3865 goto bin_empty_1;
3866 pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST);
3867 mutex_enter(pcm);
3868 pp = PAGE_CACHELISTS(mnode, bin, mtype);
3869 if (pp == NULL)
3870 goto bin_empty_0;
3871
3872 first_pp = pp;
3873 ASSERT(pp->p_vnode);
3874 ASSERT(PP_ISAGED(pp) == 0);
3875 ASSERT(pp->p_szc == 0);
3876 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
3877 while (!page_trylock(pp, SE_EXCL)) {
3878 pp = pp->p_next;
3879 ASSERT(pp->p_szc == 0);
3880 if (pp == first_pp) {
3881 /*
3882 * We have searched the complete list!
3883 * And all of them (might only be one)
3884 * are locked. This can happen since
3885 * these pages can also be found via
3886 * the hash list. When found via the
3887 * hash list, they are locked first,
3888 * then removed. We give up to let the
3889 * other thread run.
3890 */
3891 pp = NULL;
3892 break;
3893 }
3894 ASSERT(pp->p_vnode);
3895 ASSERT(PP_ISFREE(pp));
3896 ASSERT(PP_ISAGED(pp) == 0);
3897 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) ==
3898 mnode);
3899 }
3900
3901 if (pp) {
3902 page_t **ppp;
3903 /*
3904 * Found and locked a page.
3905 * Pull it off the list.
3906 */
3907 ASSERT(mtype == PP_2_MTYPE(pp));
3908 ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
3909 page_sub(ppp, pp);
3910 /*
3911 * Subtract counters before releasing pcm mutex
3912 * to avoid a race with page_freelist_coalesce
3913 * and page_freelist_split.
3914 */
3915 page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST);
3916 mutex_exit(pcm);
3917 ASSERT(pp->p_vnode);
3918 ASSERT(PP_ISAGED(pp) == 0);
3919 #if defined(__sparc)
3920 ASSERT(!kcage_on ||
3921 (flags & PG_NORELOC) == 0 ||
3922 PP_ISNORELOC(pp));
3923 if (PP_ISNORELOC(pp)) {
3924 kcage_freemem_sub(1);
3925 }
3926 #endif
3927 VM_STAT_ADD(vmm_vmstats.pgmc_allocok);
3928 return (pp);
3929 }
3930 bin_empty_0:
3931 mutex_exit(pcm);
3932 bin_empty_1:
3933 if (plw_initialized == 0) {
3934 page_list_walk_init(0, flags, bin, 0, 1, &plw);
3935 plw_initialized = 1;
3936 }
3937 /* calculate the next bin with equivalent color */
3938 bin = ADD_MASKED(bin, plw.plw_bin_step,
3939 plw.plw_ceq_mask[0], plw.plw_color_mask);
3940 } while (sbin != bin);
3941
3942 if (plw.plw_ceq_dif > 1)
3943 bin = page_list_walk_next_bin(0, bin, &plw);
3944 }
3945
3946 MTYPE_NEXT(mnode, mtype, flags);
3947 if (mtype >= 0)
3948 goto try_again;
3949
3950 VM_STAT_ADD(vmm_vmstats.pgmc_allocfailed);
3951 return (NULL);
3952 }
3953
3954 #ifdef DEBUG
3955 #define REPL_PAGE_STATS
3956 #endif /* DEBUG */
3957
3958 #ifdef REPL_PAGE_STATS
3959 struct repl_page_stats {
3960 uint_t ngets;
3961 uint_t ngets_noreloc;
3962 uint_t npgr_noreloc;
3963 uint_t nnopage_first;
3964 uint_t nnopage;
3965 uint_t nhashout;
3966 uint_t nnofree;
3967 uint_t nnext_pp;
3968 } repl_page_stats;
3969 #define REPL_STAT_INCR(v) atomic_add_32(&repl_page_stats.v, 1)
3970 #else /* REPL_PAGE_STATS */
3971 #define REPL_STAT_INCR(v)
3972 #endif /* REPL_PAGE_STATS */
3973
3974 int pgrppgcp;
3975
3976 /*
3977 * The freemem accounting must be done by the caller.
3978 * First we try to get a replacement page of the same size as like_pp,
3979 * if that is not possible, then we just get a set of discontiguous
3980 * PAGESIZE pages.
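 *
 * Rough order of attack when no target lgroup is given: like_pp's local
 * freelist, then (for szc 0 only) the local cachelist, then remote
 * freelists and cachelists, and finally page_get_contig_pages() when a
 * same size replacement is required; otherwise the request falls back
 * to PAGESIZE pages.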

/*
 * The freemem accounting must be done by the caller.
 * First we try to get a replacement page of the same size as like_pp;
 * if that is not possible, then we just get a set of discontiguous
 * PAGESIZE pages.
 */
page_t *
page_get_replacement_page(page_t *orig_like_pp, struct lgrp *lgrp_target,
    uint_t pgrflags)
{
	page_t		*like_pp;
	page_t		*pp, *pplist;
	page_t		*pl = NULL;
	ulong_t		bin;
	int		mnode, page_mnode;
	int		szc;
	spgcnt_t	npgs, pg_cnt;
	pfn_t		pfnum;
	int		mtype;
	int		flags = 0;
	lgrp_mnode_cookie_t	lgrp_cookie;
	lgrp_t		*lgrp;

	REPL_STAT_INCR(ngets);
	like_pp = orig_like_pp;
	ASSERT(PAGE_EXCL(like_pp));

	szc = like_pp->p_szc;
	npgs = page_get_pagecnt(szc);
	/*
	 * Now we reset like_pp to the base page_t.
	 * That way, we won't walk past the end of this 'szc' page.
	 */
	pfnum = PFN_BASE(like_pp->p_pagenum, szc);
	like_pp = page_numtopp_nolock(pfnum);
	ASSERT(like_pp->p_szc == szc);

	if (PP_ISNORELOC(like_pp)) {
		ASSERT(kcage_on);
		REPL_STAT_INCR(ngets_noreloc);
		flags = PGI_RELOCONLY;
	} else if (pgrflags & PGR_NORELOC) {
		ASSERT(kcage_on);
		REPL_STAT_INCR(npgr_noreloc);
		flags = PG_NORELOC;
	}

	/*
	 * Kernel pages must always be replaced with the same size
	 * pages, since we cannot properly handle demotion of kernel
	 * pages.
	 */
	if (PP_ISKAS(like_pp))
		pgrflags |= PGR_SAMESZC;

	/* LINTED */
	MTYPE_PGR_INIT(mtype, flags, like_pp, page_mnode, npgs);

	while (npgs) {
		pplist = NULL;
		for (;;) {
			pg_cnt = page_get_pagecnt(szc);
			bin = PP_2_BIN(like_pp);
			ASSERT(like_pp->p_szc == orig_like_pp->p_szc);
			ASSERT(pg_cnt <= npgs);

			/*
			 * If an lgroup was specified, try to get the
			 * page from that lgroup.
			 * NOTE: Must be careful with code below because
			 * lgroup may disappear and reappear since there
			 * is no locking for lgroup here.
			 */
			if (LGRP_EXISTS(lgrp_target)) {
				/*
				 * Keep local variable for lgroup separate
				 * from lgroup argument since this code should
				 * only be exercised when lgroup argument
				 * exists....
				 */
				lgrp = lgrp_target;

				/* Try the lgroup's freelists first */
				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
				    LGRP_SRCH_LOCAL);
				while ((pplist == NULL) &&
				    (mnode = lgrp_memnode_choose(&lgrp_cookie))
				    != -1) {
					pplist =
					    page_get_mnode_freelist(mnode, bin,
					    mtype, szc, flags);
				}

				/*
				 * Now try its cachelists if this is a
				 * small page. Don't need to do it for
				 * larger ones since page_freelist_coalesce()
				 * already failed.
				 */
				if (pplist != NULL || szc != 0)
					break;

				/* Now try its cachelists */
				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
				    LGRP_SRCH_LOCAL);

				while ((pplist == NULL) &&
				    (mnode = lgrp_memnode_choose(&lgrp_cookie))
				    != -1) {
					pplist =
					    page_get_mnode_cachelist(bin, flags,
					    mnode, mtype);
				}
				if (pplist != NULL) {
					page_hashout(pplist, NULL);
					PP_SETAGED(pplist);
					REPL_STAT_INCR(nhashout);
					break;
				}
				/* Done looking in this lgroup. Bail out. */
				break;
			}

			/*
			 * No lgroup was specified (or the lgroup was removed
			 * by DR), so just try to get the page as close to
			 * like_pp's mnode as possible.
			 * First try the local freelist...
			 */
			mnode = PP_2_MEM_NODE(like_pp);
			pplist = page_get_mnode_freelist(mnode, bin,
			    mtype, szc, flags);
			if (pplist != NULL)
				break;

			REPL_STAT_INCR(nnofree);

			/*
			 * ...then the local cachelist. Don't need to do it
			 * for larger pages because page_freelist_coalesce()
			 * already failed there anyway.
			 */
			if (szc == 0) {
				pplist = page_get_mnode_cachelist(bin, flags,
				    mnode, mtype);
				if (pplist != NULL) {
					page_hashout(pplist, NULL);
					PP_SETAGED(pplist);
					REPL_STAT_INCR(nhashout);
					break;
				}
			}
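
			/*
			 * At this point both the local freelist and, for
			 * PAGESIZE requests, the local cachelist have come
			 * up empty; widen the search to the other memory
			 * nodes in the lgroup hierarchy. (Cachelist pages
			 * taken above still carried a vnode identity, which
			 * is why they were page_hashout()ed and marked aged
			 * before being used as replacements.)
			 */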
			/* Now try remote freelists */
			page_mnode = mnode;
			lgrp =
			    lgrp_hand_to_lgrp(MEM_NODE_2_LGRPHAND(page_mnode));
			LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
			    LGRP_SRCH_HIER);
			while (pplist == NULL &&
			    (mnode = lgrp_memnode_choose(&lgrp_cookie))
			    != -1) {
				/*
				 * Skip local mnode.
				 */
				if ((mnode == page_mnode) ||
				    (mem_node_config[mnode].exists == 0))
					continue;

				pplist = page_get_mnode_freelist(mnode,
				    bin, mtype, szc, flags);
			}

			if (pplist != NULL)
				break;

			/* Now try remote cachelists */
			LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
			    LGRP_SRCH_HIER);
			while (pplist == NULL && szc == 0) {
				mnode = lgrp_memnode_choose(&lgrp_cookie);
				if (mnode == -1)
					break;
				/*
				 * Skip local mnode.
				 */
				if ((mnode == page_mnode) ||
				    (mem_node_config[mnode].exists == 0))
					continue;

				pplist = page_get_mnode_cachelist(bin,
				    flags, mnode, mtype);

				if (pplist != NULL) {
					page_hashout(pplist, NULL);
					PP_SETAGED(pplist);
					REPL_STAT_INCR(nhashout);
					break;
				}
			}

			/*
			 * Break out of while loop under the following cases:
			 * - If we successfully got a page.
			 * - If pgrflags specified only returning a specific
			 *   page size and we could not find that page size.
			 * - If we could not satisfy the request with PAGESIZE
			 *   or larger pages.
			 */
			if (pplist != NULL || szc == 0)
				break;

			if ((pgrflags & PGR_SAMESZC) || pgrppgcp) {
				/* try to find contig page */

				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
				    LGRP_SRCH_HIER);

				while ((pplist == NULL) &&
				    (mnode =
				    lgrp_memnode_choose(&lgrp_cookie))
				    != -1) {
					pplist = page_get_contig_pages(
					    mnode, bin, mtype, szc,
					    flags | PGI_PGCPHIPRI);
				}
				break;
			}

			/*
			 * The correct thing to do here is try the next
			 * page size down using szc--. Due to a bug
			 * with the processing of HAT_RELOAD_SHARE
			 * where the sfmmu_ttecnt arrays of all
			 * hats sharing an ISM segment don't get updated,
			 * using intermediate size pages for relocation
			 * can lead to continuous page faults.
			 */
			szc = 0;
		}
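
		/*
		 * If the loop above produced a list, peel off pg_cnt
		 * constituent pages, clear their free/aged state, and
		 * link them onto pl; like_pp advances page by page so
		 * the next iteration replaces the next chunk of the
		 * original large page. If it produced nothing, give up
		 * and undo any partial work below.
		 */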
		if (pplist != NULL) {
			DTRACE_PROBE4(page__get,
			    lgrp_t *, lgrp,
			    int, mnode,
			    ulong_t, bin,
			    uint_t, flags);

			while (pplist != NULL && pg_cnt--) {
				ASSERT(pplist != NULL);
				pp = pplist;
				page_sub(&pplist, pp);
				PP_CLRFREE(pp);
				PP_CLRAGED(pp);
				page_list_concat(&pl, &pp);
				npgs--;
				like_pp = like_pp + 1;
				REPL_STAT_INCR(nnext_pp);
			}
			ASSERT(pg_cnt == 0);
		} else {
			break;
		}
	}

	if (npgs) {
		/*
		 * We were unable to allocate the necessary number
		 * of pages.
		 * We need to free up any pl.
		 */
		REPL_STAT_INCR(nnopage);
		page_free_replacement_page(pl);
		return (NULL);
	} else {
		return (pl);
	}
}

/*
 * Demote a free large page to its constituent pages.
 */
void
page_demote_free_pages(page_t *pp)
{
	int mnode;

	ASSERT(pp != NULL);
	ASSERT(PAGE_LOCKED(pp));
	ASSERT(PP_ISFREE(pp));
	ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);

	mnode = PP_2_MEM_NODE(pp);
	page_freelist_lock(mnode);
	if (pp->p_szc != 0) {
		(void) page_demote(mnode, PFN_BASE(pp->p_pagenum,
		    pp->p_szc), pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
	}
	page_freelist_unlock(mnode);
	ASSERT(pp->p_szc == 0);
}

/*
 * Factor in colorequiv to check additional 'equivalent' bins.
 * colorequiv may be set in /etc/system.
 */
void
page_set_colorequiv_arr(void)
{
	if (colorequiv > 1) {
		int i;
		uint_t sv_a = lowbit(colorequiv) - 1;

		if (sv_a > 15)
			sv_a = 15;

		for (i = 0; i < MMU_PAGE_SIZES; i++) {
			uint_t colors;
			uint_t a = sv_a;

			if ((colors = hw_page_array[i].hp_colors) <= 1) {
				continue;
			}
			while ((colors >> a) == 0)
				a--;
			if ((a << 4) > colorequivszc[i]) {
				colorequivszc[i] = (a << 4);
			}
		}
	}
}
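
/*
 * Worked example (illustrative, an editorial addition): with
 * colorequiv set to 4 in /etc/system, lowbit(4) is 3, so sv_a is 2.
 * For a page size with 32 freelist colors, (32 >> 2) is nonzero, so
 * colorequivszc[i] becomes (2 << 4) = 0x20: the top two color bits
 * are ignored and the colors are treated as equivalence classes of
 * four. For a page size with only 2 colors, the while loop first
 * drops 'a' to 1, ensuring 'a' never exceeds the number of color
 * bits available for that page size.
 */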