1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 28 /* All Rights Reserved */ 29 30 /* 31 * Portions of this source code were derived from Berkeley 4.3 BSD 32 * under license from the Regents of the University of California. 33 */ 34 35 #pragma ident "%Z%%M% %I% %E% SMI" 36 37 /* 38 * This file contains common functions to access and manage the page lists. 39 * Many of these routines originated from platform dependent modules 40 * (sun4/vm/vm_dep.c, i86pc/vm/vm_machdep.c) and modified to function in 41 * a platform independent manner. 42 * 43 * vm/vm_dep.h provides for platform specific support. 44 */ 45 46 #include <sys/types.h> 47 #include <sys/debug.h> 48 #include <sys/cmn_err.h> 49 #include <sys/systm.h> 50 #include <sys/atomic.h> 51 #include <sys/sysmacros.h> 52 #include <vm/as.h> 53 #include <vm/page.h> 54 #include <vm/seg_kmem.h> 55 #include <vm/seg_vn.h> 56 #include <sys/memnode.h> 57 #include <vm/vm_dep.h> 58 #include <sys/lgrp.h> 59 #include <sys/mem_config.h> 60 #include <sys/callb.h> 61 #include <sys/mem_cage.h> 62 #include <sys/sdt.h> 63 64 extern uint_t vac_colors; 65 66 /* vm_cpu_data for the boot cpu before kmem is initialized */ 67 #pragma align L2CACHE_ALIGN_MAX(vm_cpu_data0) 68 char vm_cpu_data0[VM_CPU_DATA_PADSIZE]; 69 70 /* 71 * number of page colors equivalent to reqested color in page_get routines. 72 * If set, keeps large pages intact longer and keeps MPO allocation 73 * from the local mnode in favor of acquiring the 'correct' page color from 74 * a demoted large page or from a remote mnode. 75 */ 76 int colorequiv; 77 78 /* 79 * if set, specifies the percentage of large pages that are free from within 80 * a large page region before attempting to lock those pages for 81 * page_get_contig_pages processing. 82 * 83 * Should be turned on when kpr is available when page_trylock_contig_pages 84 * can be more selective. 85 */ 86 87 int ptcpthreshold; 88 89 /* 90 * Limit page get contig page search based on failure cnts in pgcpfailcnt[]. 91 * use slot 0 (base page size unused) to enable or disable limiting search. 92 * Enabled by default. 93 */ 94 int pgcpfailcnt[MMU_PAGE_SIZES]; 95 int pgcplimitsearch = 1; 96 97 #ifdef VM_STATS 98 struct vmm_vmstats_str vmm_vmstats; 99 100 #endif /* VM_STATS */ 101 102 #if defined(__sparc) 103 #define LPGCREATE 0 104 #else 105 /* enable page_get_contig_pages */ 106 #define LPGCREATE 1 107 #endif 108 109 int pg_contig_disable; 110 int pg_lpgcreate_nocage = LPGCREATE; 111 112 /* 113 * page_freelist_fill pfn flag to signify no hi pfn requirement. 
 */
#define	PFNNULL		0

/* Flags involved in promotion and demotion routines */
#define	PC_FREE		0x1	/* put page on freelist */
#define	PC_ALLOC	0x2	/* return page for allocation */

/*
 * Flag for page_demote to be used with PC_FREE to denote that we don't care
 * what the color is as the color parameter to the function is ignored.
 */
#define	PC_NO_COLOR	(-1)

/*
 * page counters candidates info
 * See page_ctrs_cands comment below for more details.
 * fields are as follows:
 *	pcc_pages_free:		# pages which freelist coalesce can create
 *	pcc_color_free_len:	number of elements in pcc_color_free array
 *	pcc_color_free:		pointer to page free counts per color
 */
typedef struct pcc_info {
	pgcnt_t	pcc_pages_free;
	int	pcc_color_free_len;
	pgcnt_t	*pcc_color_free;
} pcc_info_t;

/*
 * On big machines it can take a long time to check page_counters
 * arrays. page_ctrs_cands is a summary array whose elements are a dynamically
 * updated sum of all elements of the corresponding page_counters arrays.
 * page_freelist_coalesce() searches page_counters only if an appropriate
 * element of the page_ctrs_cands array is greater than 0.
 *
 * An extra dimension is used for page_ctrs_cands to spread the elements
 * over a few e$ cache lines to avoid serialization during the array
 * updates.
 */
#pragma	align 64(page_ctrs_cands)

static pcc_info_t *page_ctrs_cands[NPC_MUTEX][MMU_PAGE_SIZES];

/*
 * Return in val the total number of free pages which can be created
 * for the given mnode (m) and region size (r)
 */
#define	PGCTRS_CANDS_GETVALUE(m, r, val) {				\
	int i;								\
	val = 0;							\
	for (i = 0; i < NPC_MUTEX; i++) {				\
	    val += page_ctrs_cands[i][(r)][(m)].pcc_pages_free;		\
	}								\
}

/*
 * Return in val the total number of free pages which can be created
 * for the given mnode (m), region size (r), and color (c)
 */
#define	PGCTRS_CANDS_GETVALUECOLOR(m, r, c, val) {			\
	int i;								\
	val = 0;							\
	ASSERT((c) < page_ctrs_cands[0][(r)][(m)].pcc_color_free_len);	\
	for (i = 0; i < NPC_MUTEX; i++) {				\
	    val += page_ctrs_cands[i][(r)][(m)].pcc_color_free[(c)];	\
	}								\
}

/*
 * We can only allow a single thread to update a counter within the physical
 * range of the largest supported page size. That is the finest granularity
 * possible since the counter values are dependent on each other
 * as you move across region sizes. PP_CTR_LOCK_INDX is used to determine the
 * ctr_mutex lock index for a particular physical range.
 */
static kmutex_t	*ctr_mutex[NPC_MUTEX];

#define	PP_CTR_LOCK_INDX(pp)						\
	(((pp)->p_pagenum >>						\
	    (PAGE_BSZS_SHIFT(mmu_page_sizes - 1))) & (NPC_MUTEX - 1))
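
/*
 * Illustrative sketch (not part of the original logic): a counter update
 * for a page pp and region size r is serialized on one of the NPC_MUTEX
 * ctr_mutex locks, chosen by the physical range the page falls in, roughly:
 *
 *	int lckidx = PP_CTR_LOCK_INDX(pp);
 *	mutex_enter(&ctr_mutex[lckidx][mnode]);
 *	page_ctrs_cands[lckidx][r][mnode].pcc_pages_free++;
 *	mutex_exit(&ctr_mutex[lckidx][mnode]);
 *
 * PGCTRS_CANDS_GETVALUE() then sums the per-lock slices; when it is used
 * without the freelist locks held (as in page_freelist_coalesce()) the
 * value it returns is only an approximation.
 */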

/*
 * Local function prototypes.
 */

void page_ctr_add(int, int, page_t *, int);
void page_ctr_add_internal(int, int, page_t *, int);
void page_ctr_sub(int, int, page_t *, int);
uint_t page_convert_color(uchar_t, uchar_t, uint_t);
void page_freelist_lock(int);
void page_freelist_unlock(int);
page_t *page_promote(int, pfn_t, uchar_t, int);
page_t *page_demote(int, pfn_t, uchar_t, uchar_t, int, int);
page_t *page_freelist_fill(uchar_t, int, int, int, pfn_t);
page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int);
static int page_trylock_cons(page_t *pp, se_t se);

#define	PNUM_SIZE(szc)							\
	(hw_page_array[(szc)].hp_size >> hw_page_array[0].hp_shift)
#define	PNUM_SHIFT(szc)							\
	(hw_page_array[(szc)].hp_shift - hw_page_array[0].hp_shift)

/*
 * The page_counters array below is used to keep track of free contiguous
 * physical memory. A hw_page_map_t will be allocated per mnode per szc.
 * This contains an array of counters, the size of the array, a shift value
 * used to convert a pagenum into a counter array index or vice versa, as
 * well as a cache of the last successful index to be promoted to a larger
 * page size. As an optimization, we keep track of the last successful index
 * to be promoted per page color for the given size region, and this is
 * allocated dynamically based upon the number of colors for a given
 * region size.
 *
 * Conceptually, the page counters are represented as:
 *
 *	page_counters[region_size][mnode]
 *
 *	region_size:	size code of a candidate larger page made up
 *			of contiguous free smaller pages.
 *
 *	page_counters[region_size][mnode].hpm_counters[index]:
 *		represents how many (region_size - 1) pages either
 *		exist or can be created within the given index range.
 *
 * Let's look at a sparc example:
 *	If we want to create a free 512k page, we look at region_size 2
 *	for the mnode we want. We calculate the index and look at a specific
 *	hpm_counters location. If we see 8 (FULL_REGION_CNT on sparc) at
 *	this location, it means that 8 64k pages either exist or can be
 *	created from 8K pages in order to make a single free 512k page at
 *	the given index. Note that when a region is full, it will contribute
 *	to the counts in the region above it. Thus we will not know what
 *	page size the free pages will be which can be promoted to this new
 *	free page unless we look at all regions below the current region.
 */

/*
 * Note: hpmctr_t is defined in platform vm_dep.h
 * hw_page_map_t contains all the information needed for the page_counters
 * logic. The fields are as follows:
 *
 *	hpm_counters:	dynamically allocated array to hold counter data
 *	hpm_entries:	entries in hpm_counters
 *	hpm_shift:	shift for pnum/array index conversion
 *	hpm_base:	PFN mapped to counter index 0
 *	hpm_color_current_len:	# of elements in hpm_color_current "array"
 *				below
 *	hpm_color_current:	last index in counter array for this color at
 *				which we successfully created a large page
 */
typedef struct hw_page_map {
	hpmctr_t	*hpm_counters;
	size_t		hpm_entries;
	int		hpm_shift;
	pfn_t		hpm_base;
	size_t		hpm_color_current_len;
	size_t		*hpm_color_current;
} hw_page_map_t;

/*
 * Element zero is not used, but is allocated for convenience.
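 *
 * Illustrative example (not from the original source): with
 * hpm_base == 0x1000 and hpm_shift == 3 (eight base pages per counter),
 *
 *	PNUM_TO_IDX(mnode, r, 0x1027) == (0x1027 - 0x1000) >> 3 == 4
 *	IDX_TO_PNUM(mnode, r, 4)      == 0x1000 + (4 << 3)    == 0x1020
 *
 * i.e. the index identifies the region the pfn falls in, and converting
 * back yields the first pfn of that region.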
273 */ 274 static hw_page_map_t *page_counters[MMU_PAGE_SIZES]; 275 276 /* 277 * The following macros are convenient ways to get access to the individual 278 * elements of the page_counters arrays. They can be used on both 279 * the left side and right side of equations. 280 */ 281 #define PAGE_COUNTERS(mnode, rg_szc, idx) \ 282 (page_counters[(rg_szc)][(mnode)].hpm_counters[(idx)]) 283 284 #define PAGE_COUNTERS_COUNTERS(mnode, rg_szc) \ 285 (page_counters[(rg_szc)][(mnode)].hpm_counters) 286 287 #define PAGE_COUNTERS_SHIFT(mnode, rg_szc) \ 288 (page_counters[(rg_szc)][(mnode)].hpm_shift) 289 290 #define PAGE_COUNTERS_ENTRIES(mnode, rg_szc) \ 291 (page_counters[(rg_szc)][(mnode)].hpm_entries) 292 293 #define PAGE_COUNTERS_BASE(mnode, rg_szc) \ 294 (page_counters[(rg_szc)][(mnode)].hpm_base) 295 296 #define PAGE_COUNTERS_CURRENT_COLOR_LEN(mnode, rg_szc) \ 297 (page_counters[(rg_szc)][(mnode)].hpm_color_current_len) 298 299 #define PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, rg_szc) \ 300 (page_counters[(rg_szc)][(mnode)].hpm_color_current) 301 302 #define PAGE_COUNTERS_CURRENT_COLOR(mnode, rg_szc, color) \ 303 (page_counters[(rg_szc)][(mnode)].hpm_color_current[(color)]) 304 305 #define PNUM_TO_IDX(mnode, rg_szc, pnum) \ 306 (((pnum) - PAGE_COUNTERS_BASE((mnode), (rg_szc))) >> \ 307 PAGE_COUNTERS_SHIFT((mnode), (rg_szc))) 308 309 #define IDX_TO_PNUM(mnode, rg_szc, index) \ 310 (PAGE_COUNTERS_BASE((mnode), (rg_szc)) + \ 311 ((index) << PAGE_COUNTERS_SHIFT((mnode), (rg_szc)))) 312 313 /* 314 * Protects the hpm_counters and hpm_color_current memory from changing while 315 * looking at page counters information. 316 * Grab the write lock to modify what these fields point at. 317 * Grab the read lock to prevent any pointers from changing. 318 * The write lock can not be held during memory allocation due to a possible 319 * recursion deadlock with trying to grab the read lock while the 320 * write lock is already held. 321 */ 322 krwlock_t page_ctrs_rwlock[MAX_MEM_NODES]; 323 324 325 /* 326 * initialize cpu_vm_data to point at cache aligned vm_cpu_data_t. 327 */ 328 void 329 cpu_vm_data_init(struct cpu *cp) 330 { 331 int align = (L2CACHE_ALIGN) ? L2CACHE_ALIGN : L2CACHE_ALIGN_MAX; 332 333 ASSERT(L2CACHE_ALIGN <= L2CACHE_ALIGN_MAX); 334 335 if (cp == CPU0) { 336 cp->cpu_vm_data = (void *)&vm_cpu_data0; 337 } else { 338 void *kmptr; 339 340 kmptr = kmem_zalloc(VM_CPU_DATA_PADSIZE + align, KM_SLEEP); 341 cp->cpu_vm_data = (void *) P2ROUNDUP((uintptr_t)kmptr, align); 342 ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr = kmptr; 343 } 344 } 345 346 /* 347 * free cpu_vm_data 348 */ 349 void 350 cpu_vm_data_destroy(struct cpu *cp) 351 { 352 if (cp->cpu_seqid && cp->cpu_vm_data) { 353 ASSERT(cp != CPU0); 354 kmem_free(((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr, 355 VM_CPU_DATA_PADSIZE); 356 } 357 cp->cpu_vm_data = NULL; 358 } 359 360 361 /* 362 * page size to page size code 363 */ 364 int 365 page_szc(size_t pagesize) 366 { 367 int i = 0; 368 369 while (hw_page_array[i].hp_size) { 370 if (pagesize == hw_page_array[i].hp_size) 371 return (i); 372 i++; 373 } 374 return (-1); 375 } 376 377 /* 378 * page size to page size code with the restriction that it be a supported 379 * user page size. If it's not a supported user page size, -1 will be returned. 
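 *
 * For example (illustrative only, sizes are platform dependent), on a
 * platform whose hw_page_array holds 8K, 64K, 512K and 4M entries:
 *
 *	page_szc(0x10000) == 1		(64K is a supported size)
 *	page_szc(0x20000) == -1		(128K is not in hw_page_array)
 *
 * page_szc_user_filtered() additionally returns -1 for sizes the platform
 * supports internally but does not export to user programs.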
380 */ 381 int 382 page_szc_user_filtered(size_t pagesize) 383 { 384 int szc = page_szc(pagesize); 385 if ((szc != -1) && (SZC_2_USERSZC(szc) != -1)) { 386 return (szc); 387 } 388 return (-1); 389 } 390 391 /* 392 * Return how many page sizes are available for the user to use. This is 393 * what the hardware supports and not based upon how the OS implements the 394 * support of different page sizes. 395 */ 396 uint_t 397 page_num_user_pagesizes(void) 398 { 399 return (mmu_exported_page_sizes); 400 } 401 402 uint_t 403 page_num_pagesizes(void) 404 { 405 return (mmu_page_sizes); 406 } 407 408 /* 409 * returns the count of the number of base pagesize pages associated with szc 410 */ 411 pgcnt_t 412 page_get_pagecnt(uint_t szc) 413 { 414 if (szc >= mmu_page_sizes) 415 panic("page_get_pagecnt: out of range %d", szc); 416 return (hw_page_array[szc].hp_pgcnt); 417 } 418 419 size_t 420 page_get_pagesize(uint_t szc) 421 { 422 if (szc >= mmu_page_sizes) 423 panic("page_get_pagesize: out of range %d", szc); 424 return (hw_page_array[szc].hp_size); 425 } 426 427 /* 428 * Return the size of a page based upon the index passed in. An index of 429 * zero refers to the smallest page size in the system, and as index increases 430 * it refers to the next larger supported page size in the system. 431 * Note that szc and userszc may not be the same due to unsupported szc's on 432 * some systems. 433 */ 434 size_t 435 page_get_user_pagesize(uint_t userszc) 436 { 437 uint_t szc = USERSZC_2_SZC(userszc); 438 439 if (szc >= mmu_page_sizes) 440 panic("page_get_user_pagesize: out of range %d", szc); 441 return (hw_page_array[szc].hp_size); 442 } 443 444 uint_t 445 page_get_shift(uint_t szc) 446 { 447 if (szc >= mmu_page_sizes) 448 panic("page_get_shift: out of range %d", szc); 449 return (hw_page_array[szc].hp_shift); 450 } 451 452 uint_t 453 page_get_pagecolors(uint_t szc) 454 { 455 ASSERT(page_colors != 0); 456 return (MAX(page_colors >> PAGE_BSZS_SHIFT(szc), 1)); 457 } 458 459 /* 460 * Called by startup(). 461 * Size up the per page size free list counters based on physmax 462 * of each node and max_mem_nodes. 463 */ 464 size_t 465 page_ctrs_sz(void) 466 { 467 int r; /* region size */ 468 int mnode; 469 uint_t ctrs_sz = 0; 470 int i; 471 pgcnt_t colors_per_szc[MMU_PAGE_SIZES]; 472 473 /* 474 * We need to determine how many page colors there are for each 475 * page size in order to allocate memory for any color specific 476 * arrays. 477 */ 478 colors_per_szc[0] = page_colors; 479 for (i = 1; i < mmu_page_sizes; i++) { 480 colors_per_szc[i] = 481 page_convert_color(0, i, page_colors - 1) + 1; 482 } 483 484 for (mnode = 0; mnode < max_mem_nodes; mnode++) { 485 486 pgcnt_t r_pgcnt; 487 pfn_t r_base; 488 pgcnt_t r_align; 489 490 if (mem_node_config[mnode].exists == 0) 491 continue; 492 493 /* 494 * determine size needed for page counter arrays with 495 * base aligned to large page size. 496 */ 497 for (r = 1; r < mmu_page_sizes; r++) { 498 /* add in space for hpm_counters */ 499 r_align = page_get_pagecnt(r); 500 r_base = mem_node_config[mnode].physbase; 501 r_base &= ~(r_align - 1); 502 r_pgcnt = howmany(mem_node_config[mnode].physmax - 503 r_base, r_align); 504 /* 505 * Round up to always allocate on pointer sized 506 * boundaries. 
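			 *
			 * For example (illustrative only): on a 64-bit
			 * kernel with a one byte hpmctr_t, a region needing
			 * 13 counters is charged
			 * P2ROUNDUP(13 * sizeof (hpmctr_t), 8) == 16 bytes,
			 * so the next array starts pointer aligned.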
507 */ 508 ctrs_sz += P2ROUNDUP((r_pgcnt * sizeof (hpmctr_t)), 509 sizeof (hpmctr_t *)); 510 511 /* add in space for hpm_color_current */ 512 ctrs_sz += (colors_per_szc[r] * 513 sizeof (size_t)); 514 } 515 } 516 517 for (r = 1; r < mmu_page_sizes; r++) { 518 ctrs_sz += (max_mem_nodes * sizeof (hw_page_map_t)); 519 520 /* add in space for page_ctrs_cands */ 521 ctrs_sz += NPC_MUTEX * max_mem_nodes * (sizeof (pcc_info_t)); 522 ctrs_sz += NPC_MUTEX * max_mem_nodes * colors_per_szc[r] * 523 sizeof (pgcnt_t); 524 } 525 526 /* ctr_mutex */ 527 ctrs_sz += (max_mem_nodes * NPC_MUTEX * sizeof (kmutex_t)); 528 529 /* size for page list counts */ 530 PLCNT_SZ(ctrs_sz); 531 532 /* 533 * add some slop for roundups. page_ctrs_alloc will roundup the start 534 * address of the counters to ecache_alignsize boundary for every 535 * memory node. 536 */ 537 return (ctrs_sz + max_mem_nodes * L2CACHE_ALIGN); 538 } 539 540 caddr_t 541 page_ctrs_alloc(caddr_t alloc_base) 542 { 543 int mnode; 544 int r; /* region size */ 545 int i; 546 pgcnt_t colors_per_szc[MMU_PAGE_SIZES]; 547 548 /* 549 * We need to determine how many page colors there are for each 550 * page size in order to allocate memory for any color specific 551 * arrays. 552 */ 553 colors_per_szc[0] = page_colors; 554 for (i = 1; i < mmu_page_sizes; i++) { 555 colors_per_szc[i] = 556 page_convert_color(0, i, page_colors - 1) + 1; 557 } 558 559 for (r = 1; r < mmu_page_sizes; r++) { 560 page_counters[r] = (hw_page_map_t *)alloc_base; 561 alloc_base += (max_mem_nodes * sizeof (hw_page_map_t)); 562 } 563 564 /* page_ctrs_cands */ 565 for (r = 1; r < mmu_page_sizes; r++) { 566 for (i = 0; i < NPC_MUTEX; i++) { 567 page_ctrs_cands[i][r] = (pcc_info_t *)alloc_base; 568 alloc_base += max_mem_nodes * (sizeof (pcc_info_t)); 569 570 } 571 } 572 573 /* page_ctrs_cands pcc_color_free array */ 574 for (r = 1; r < mmu_page_sizes; r++) { 575 for (i = 0; i < NPC_MUTEX; i++) { 576 for (mnode = 0; mnode < max_mem_nodes; mnode++) { 577 page_ctrs_cands[i][r][mnode].pcc_color_free_len 578 = colors_per_szc[r]; 579 page_ctrs_cands[i][r][mnode].pcc_color_free = 580 (pgcnt_t *)alloc_base; 581 alloc_base += colors_per_szc[r] * 582 sizeof (pgcnt_t); 583 } 584 } 585 } 586 587 /* ctr_mutex */ 588 for (i = 0; i < NPC_MUTEX; i++) { 589 ctr_mutex[i] = (kmutex_t *)alloc_base; 590 alloc_base += (max_mem_nodes * sizeof (kmutex_t)); 591 } 592 593 /* initialize page list counts */ 594 PLCNT_INIT(alloc_base); 595 596 for (mnode = 0; mnode < max_mem_nodes; mnode++) { 597 598 pgcnt_t r_pgcnt; 599 pfn_t r_base; 600 pgcnt_t r_align; 601 int r_shift; 602 603 if (mem_node_config[mnode].exists == 0) 604 continue; 605 606 for (r = 1; r < mmu_page_sizes; r++) { 607 /* 608 * the page_counters base has to be aligned to the 609 * page count of page size code r otherwise the counts 610 * will cross large page boundaries. 
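		 *
		 * For example (illustrative only): with a region of 8 base
		 * pages (r_align == 8) and a node physbase of pfn 0x1003,
		 * r_base becomes 0x1003 & ~(8 - 1) == 0x1000, so counter
		 * index 0 starts on a large page boundary.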
611 */ 612 r_align = page_get_pagecnt(r); 613 r_base = mem_node_config[mnode].physbase; 614 /* base needs to be aligned - lower to aligned value */ 615 r_base &= ~(r_align - 1); 616 r_pgcnt = howmany(mem_node_config[mnode].physmax - 617 r_base, r_align); 618 r_shift = PAGE_BSZS_SHIFT(r); 619 620 PAGE_COUNTERS_SHIFT(mnode, r) = r_shift; 621 PAGE_COUNTERS_ENTRIES(mnode, r) = r_pgcnt; 622 PAGE_COUNTERS_BASE(mnode, r) = r_base; 623 PAGE_COUNTERS_CURRENT_COLOR_LEN(mnode, r) = 624 colors_per_szc[r]; 625 PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r) = 626 (size_t *)alloc_base; 627 alloc_base += (sizeof (size_t) * colors_per_szc[r]); 628 for (i = 0; i < colors_per_szc[r]; i++) { 629 PAGE_COUNTERS_CURRENT_COLOR(mnode, r, i) = i; 630 } 631 PAGE_COUNTERS_COUNTERS(mnode, r) = 632 (hpmctr_t *)alloc_base; 633 /* 634 * Round up to make alloc_base always be aligned on 635 * a pointer boundary. 636 */ 637 alloc_base += P2ROUNDUP((sizeof (hpmctr_t) * r_pgcnt), 638 sizeof (hpmctr_t *)); 639 640 /* 641 * Verify that PNUM_TO_IDX and IDX_TO_PNUM 642 * satisfy the identity requirement. 643 * We should be able to go from one to the other 644 * and get consistent values. 645 */ 646 ASSERT(PNUM_TO_IDX(mnode, r, 647 (IDX_TO_PNUM(mnode, r, 0))) == 0); 648 ASSERT(IDX_TO_PNUM(mnode, r, 649 (PNUM_TO_IDX(mnode, r, r_base))) == r_base); 650 } 651 /* 652 * Roundup the start address of the page_counters to 653 * cache aligned boundary for every memory node. 654 * page_ctrs_sz() has added some slop for these roundups. 655 */ 656 alloc_base = (caddr_t)P2ROUNDUP((uintptr_t)alloc_base, 657 L2CACHE_ALIGN); 658 } 659 660 /* Initialize other page counter specific data structures. */ 661 for (mnode = 0; mnode < MAX_MEM_NODES; mnode++) { 662 rw_init(&page_ctrs_rwlock[mnode], NULL, RW_DEFAULT, NULL); 663 } 664 665 return (alloc_base); 666 } 667 668 /* 669 * Functions to adjust region counters for each size free list. 670 * Caller is responsible to acquire the ctr_mutex lock if necessary and 671 * thus can be called during startup without locks. 672 */ 673 /* ARGSUSED */ 674 void 675 page_ctr_add_internal(int mnode, int mtype, page_t *pp, int flags) 676 { 677 ssize_t r; /* region size */ 678 ssize_t idx; 679 pfn_t pfnum; 680 int lckidx; 681 682 ASSERT(mnode == PP_2_MEM_NODE(pp)); 683 ASSERT(mtype == PP_2_MTYPE(pp)); 684 685 ASSERT(pp->p_szc < mmu_page_sizes); 686 687 PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags); 688 689 /* no counter update needed for largest page size */ 690 if (pp->p_szc >= mmu_page_sizes - 1) { 691 return; 692 } 693 694 r = pp->p_szc + 1; 695 pfnum = pp->p_pagenum; 696 lckidx = PP_CTR_LOCK_INDX(pp); 697 698 /* 699 * Increment the count of free pages for the current 700 * region. Continue looping up in region size incrementing 701 * count if the preceeding region is full. 702 */ 703 while (r < mmu_page_sizes) { 704 idx = PNUM_TO_IDX(mnode, r, pfnum); 705 706 ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r)); 707 ASSERT(PAGE_COUNTERS(mnode, r, idx) < FULL_REGION_CNT(r)); 708 709 if (++PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r)) 710 break; 711 712 page_ctrs_cands[lckidx][r][mnode].pcc_pages_free++; 713 page_ctrs_cands[lckidx][r][mnode]. 
714 pcc_color_free[PP_2_BIN_SZC(pp, r)]++; 715 r++; 716 } 717 } 718 719 void 720 page_ctr_add(int mnode, int mtype, page_t *pp, int flags) 721 { 722 int lckidx = PP_CTR_LOCK_INDX(pp); 723 kmutex_t *lock = &ctr_mutex[lckidx][mnode]; 724 725 mutex_enter(lock); 726 page_ctr_add_internal(mnode, mtype, pp, flags); 727 mutex_exit(lock); 728 } 729 730 void 731 page_ctr_sub(int mnode, int mtype, page_t *pp, int flags) 732 { 733 int lckidx; 734 kmutex_t *lock; 735 ssize_t r; /* region size */ 736 ssize_t idx; 737 pfn_t pfnum; 738 739 ASSERT(mnode == PP_2_MEM_NODE(pp)); 740 ASSERT(mtype == PP_2_MTYPE(pp)); 741 742 ASSERT(pp->p_szc < mmu_page_sizes); 743 744 PLCNT_DECR(pp, mnode, mtype, pp->p_szc, flags); 745 746 /* no counter update needed for largest page size */ 747 if (pp->p_szc >= mmu_page_sizes - 1) { 748 return; 749 } 750 751 r = pp->p_szc + 1; 752 pfnum = pp->p_pagenum; 753 lckidx = PP_CTR_LOCK_INDX(pp); 754 lock = &ctr_mutex[lckidx][mnode]; 755 756 /* 757 * Decrement the count of free pages for the current 758 * region. Continue looping up in region size decrementing 759 * count if the preceeding region was full. 760 */ 761 mutex_enter(lock); 762 while (r < mmu_page_sizes) { 763 idx = PNUM_TO_IDX(mnode, r, pfnum); 764 765 ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r)); 766 ASSERT(PAGE_COUNTERS(mnode, r, idx) > 0); 767 768 if (--PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r) - 1) { 769 break; 770 } 771 ASSERT(page_ctrs_cands[lckidx][r][mnode].pcc_pages_free != 0); 772 ASSERT(page_ctrs_cands[lckidx][r][mnode]. 773 pcc_color_free[PP_2_BIN_SZC(pp, r)] != 0); 774 775 page_ctrs_cands[lckidx][r][mnode].pcc_pages_free--; 776 page_ctrs_cands[lckidx][r][mnode]. 777 pcc_color_free[PP_2_BIN_SZC(pp, r)]--; 778 r++; 779 } 780 mutex_exit(lock); 781 } 782 783 /* 784 * Adjust page counters following a memory attach, since typically the 785 * size of the array needs to change, and the PFN to counter index 786 * mapping needs to change. 787 */ 788 uint_t 789 page_ctrs_adjust(int mnode) 790 { 791 pgcnt_t npgs; 792 int r; /* region size */ 793 int i; 794 size_t pcsz, old_csz; 795 hpmctr_t *new_ctr, *old_ctr; 796 pfn_t oldbase, newbase; 797 size_t old_npgs; 798 hpmctr_t *ctr_cache[MMU_PAGE_SIZES]; 799 size_t size_cache[MMU_PAGE_SIZES]; 800 size_t *color_cache[MMU_PAGE_SIZES]; 801 size_t *old_color_array; 802 pgcnt_t colors_per_szc[MMU_PAGE_SIZES]; 803 804 newbase = mem_node_config[mnode].physbase & ~PC_BASE_ALIGN_MASK; 805 npgs = roundup(mem_node_config[mnode].physmax, 806 PC_BASE_ALIGN) - newbase; 807 808 /* 809 * We need to determine how many page colors there are for each 810 * page size in order to allocate memory for any color specific 811 * arrays. 812 */ 813 colors_per_szc[0] = page_colors; 814 for (r = 1; r < mmu_page_sizes; r++) { 815 colors_per_szc[r] = 816 page_convert_color(0, r, page_colors - 1) + 1; 817 } 818 819 /* 820 * Preallocate all of the new hpm_counters arrays as we can't 821 * hold the page_ctrs_rwlock as a writer and allocate memory. 822 * If we can't allocate all of the arrays, undo our work so far 823 * and return failure. 
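	 *
	 * The pattern is roughly (illustrative only):
	 *
	 *	new = kmem_zalloc(sz, KM_NOSLEEP);	(may fail, no lock held)
	 *	rw_enter(&page_ctrs_rwlock[mnode], RW_WRITER);
	 *	swap the new array in, stash the old pointer
	 *	rw_exit(&page_ctrs_rwlock[mnode]);
	 *	kmem_free(old, old_sz);			(after lock is dropped)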
824 */ 825 for (r = 1; r < mmu_page_sizes; r++) { 826 pcsz = npgs >> PAGE_BSZS_SHIFT(r); 827 828 ctr_cache[r] = kmem_zalloc(pcsz * 829 sizeof (hpmctr_t), KM_NOSLEEP); 830 if (ctr_cache[r] == NULL) { 831 while (--r >= 1) { 832 kmem_free(ctr_cache[r], 833 size_cache[r] * sizeof (hpmctr_t)); 834 } 835 return (ENOMEM); 836 } 837 size_cache[r] = pcsz; 838 } 839 /* 840 * Preallocate all of the new color current arrays as we can't 841 * hold the page_ctrs_rwlock as a writer and allocate memory. 842 * If we can't allocate all of the arrays, undo our work so far 843 * and return failure. 844 */ 845 for (r = 1; r < mmu_page_sizes; r++) { 846 color_cache[r] = kmem_zalloc(sizeof (size_t) * 847 colors_per_szc[r], KM_NOSLEEP); 848 if (color_cache[r] == NULL) { 849 while (--r >= 1) { 850 kmem_free(color_cache[r], 851 colors_per_szc[r] * sizeof (size_t)); 852 } 853 for (r = 1; r < mmu_page_sizes; r++) { 854 kmem_free(ctr_cache[r], 855 size_cache[r] * sizeof (hpmctr_t)); 856 } 857 return (ENOMEM); 858 } 859 } 860 861 /* 862 * Grab the write lock to prevent others from walking these arrays 863 * while we are modifying them. 864 */ 865 rw_enter(&page_ctrs_rwlock[mnode], RW_WRITER); 866 page_freelist_lock(mnode); 867 for (r = 1; r < mmu_page_sizes; r++) { 868 PAGE_COUNTERS_SHIFT(mnode, r) = PAGE_BSZS_SHIFT(r); 869 old_ctr = PAGE_COUNTERS_COUNTERS(mnode, r); 870 old_csz = PAGE_COUNTERS_ENTRIES(mnode, r); 871 oldbase = PAGE_COUNTERS_BASE(mnode, r); 872 old_npgs = old_csz << PAGE_COUNTERS_SHIFT(mnode, r); 873 old_color_array = PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r); 874 875 pcsz = npgs >> PAGE_COUNTERS_SHIFT(mnode, r); 876 new_ctr = ctr_cache[r]; 877 ctr_cache[r] = NULL; 878 if (old_ctr != NULL && 879 (oldbase + old_npgs > newbase) && 880 (newbase + npgs > oldbase)) { 881 /* 882 * Map the intersection of the old and new 883 * counters into the new array. 884 */ 885 size_t offset; 886 if (newbase > oldbase) { 887 offset = (newbase - oldbase) >> 888 PAGE_COUNTERS_SHIFT(mnode, r); 889 bcopy(old_ctr + offset, new_ctr, 890 MIN(pcsz, (old_csz - offset)) * 891 sizeof (hpmctr_t)); 892 } else { 893 offset = (oldbase - newbase) >> 894 PAGE_COUNTERS_SHIFT(mnode, r); 895 bcopy(old_ctr, new_ctr + offset, 896 MIN(pcsz - offset, old_csz) * 897 sizeof (hpmctr_t)); 898 } 899 } 900 901 PAGE_COUNTERS_COUNTERS(mnode, r) = new_ctr; 902 PAGE_COUNTERS_ENTRIES(mnode, r) = pcsz; 903 PAGE_COUNTERS_BASE(mnode, r) = newbase; 904 PAGE_COUNTERS_CURRENT_COLOR_LEN(mnode, r) = colors_per_szc[r]; 905 PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r) = color_cache[r]; 906 color_cache[r] = NULL; 907 /* 908 * for now, just reset on these events as it's probably 909 * not worthwhile to try and optimize this. 910 */ 911 for (i = 0; i < colors_per_szc[r]; i++) { 912 PAGE_COUNTERS_CURRENT_COLOR(mnode, r, i) = i; 913 } 914 915 /* cache info for freeing out of the critical path */ 916 if ((caddr_t)old_ctr >= kernelheap && 917 (caddr_t)old_ctr < ekernelheap) { 918 ctr_cache[r] = old_ctr; 919 size_cache[r] = old_csz; 920 } 921 if ((caddr_t)old_color_array >= kernelheap && 922 (caddr_t)old_color_array < ekernelheap) { 923 color_cache[r] = old_color_array; 924 } 925 /* 926 * Verify that PNUM_TO_IDX and IDX_TO_PNUM 927 * satisfy the identity requirement. 928 * We should be able to go from one to the other 929 * and get consistent values. 
930 */ 931 ASSERT(PNUM_TO_IDX(mnode, r, 932 (IDX_TO_PNUM(mnode, r, 0))) == 0); 933 ASSERT(IDX_TO_PNUM(mnode, r, 934 (PNUM_TO_IDX(mnode, r, newbase))) == newbase); 935 } 936 page_freelist_unlock(mnode); 937 rw_exit(&page_ctrs_rwlock[mnode]); 938 939 /* 940 * Now that we have dropped the write lock, it is safe to free all 941 * of the memory we have cached above. 942 */ 943 for (r = 1; r < mmu_page_sizes; r++) { 944 if (ctr_cache[r] != NULL) { 945 kmem_free(ctr_cache[r], 946 size_cache[r] * sizeof (hpmctr_t)); 947 } 948 if (color_cache[r] != NULL) { 949 kmem_free(color_cache[r], 950 colors_per_szc[r] * sizeof (size_t)); 951 } 952 } 953 return (0); 954 } 955 956 /* 957 * color contains a valid color index or bin for cur_szc 958 */ 959 uint_t 960 page_convert_color(uchar_t cur_szc, uchar_t new_szc, uint_t color) 961 { 962 uint_t shift; 963 964 if (cur_szc > new_szc) { 965 shift = page_get_shift(cur_szc) - page_get_shift(new_szc); 966 return (color << shift); 967 } else if (cur_szc < new_szc) { 968 shift = page_get_shift(new_szc) - page_get_shift(cur_szc); 969 return (color >> shift); 970 } 971 return (color); 972 } 973 974 #ifdef DEBUG 975 976 /* 977 * confirm pp is a large page corresponding to szc 978 */ 979 void 980 chk_lpg(page_t *pp, uchar_t szc) 981 { 982 spgcnt_t npgs = page_get_pagecnt(pp->p_szc); 983 uint_t noreloc; 984 985 if (npgs == 1) { 986 ASSERT(pp->p_szc == 0); 987 ASSERT(pp->p_next == pp); 988 ASSERT(pp->p_prev == pp); 989 return; 990 } 991 992 ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL); 993 ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL); 994 995 ASSERT(IS_P2ALIGNED(pp->p_pagenum, npgs)); 996 ASSERT(pp->p_pagenum == (pp->p_next->p_pagenum - 1)); 997 ASSERT(pp->p_prev->p_pagenum == (pp->p_pagenum + (npgs - 1))); 998 ASSERT(pp->p_prev == (pp + (npgs - 1))); 999 1000 /* 1001 * Check list of pages. 1002 */ 1003 noreloc = PP_ISNORELOC(pp); 1004 while (npgs--) { 1005 if (npgs != 0) { 1006 ASSERT(pp->p_pagenum == pp->p_next->p_pagenum - 1); 1007 ASSERT(pp->p_next == (pp + 1)); 1008 } 1009 ASSERT(pp->p_szc == szc); 1010 ASSERT(PP_ISFREE(pp)); 1011 ASSERT(PP_ISAGED(pp)); 1012 ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL); 1013 ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL); 1014 ASSERT(pp->p_vnode == NULL); 1015 ASSERT(PP_ISNORELOC(pp) == noreloc); 1016 1017 pp = pp->p_next; 1018 } 1019 } 1020 #endif /* DEBUG */ 1021 1022 void 1023 page_freelist_lock(int mnode) 1024 { 1025 int i; 1026 for (i = 0; i < NPC_MUTEX; i++) { 1027 mutex_enter(FPC_MUTEX(mnode, i)); 1028 mutex_enter(CPC_MUTEX(mnode, i)); 1029 } 1030 } 1031 1032 void 1033 page_freelist_unlock(int mnode) 1034 { 1035 int i; 1036 for (i = 0; i < NPC_MUTEX; i++) { 1037 mutex_exit(FPC_MUTEX(mnode, i)); 1038 mutex_exit(CPC_MUTEX(mnode, i)); 1039 } 1040 } 1041 1042 /* 1043 * update the page list max counts for already allocated pages that has xfer'ed 1044 * (kcage_assimilate_page) between different mtypes. 1045 */ 1046 /* ARGSUSED */ 1047 void 1048 page_list_xfer(page_t *pp, int to_mtype, int from_mtype) 1049 { 1050 PLCNT_MAX_INCR(pp, PP_2_MEM_NODE(pp), to_mtype, pp->p_szc); 1051 PLCNT_MAX_DECR(pp, PP_2_MEM_NODE(pp), from_mtype, pp->p_szc); 1052 } 1053 1054 /* 1055 * add pp to the specified page list. Defaults to head of the page list 1056 * unless PG_LIST_TAIL is specified. 
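 *
 * Typical usage (illustrative only):
 *
 *	page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
 *
 * frees a PAGESIZE page to the tail of its freelist bin; the caller still
 * holds the page SE_EXCL locked on return (see the ASSERT at the end of
 * the routine).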
1057 */ 1058 void 1059 page_list_add(page_t *pp, int flags) 1060 { 1061 page_t **ppp; 1062 kmutex_t *pcm; 1063 uint_t bin, mtype; 1064 int mnode; 1065 1066 ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT)); 1067 ASSERT(PP_ISFREE(pp)); 1068 ASSERT(!hat_page_is_mapped(pp)); 1069 ASSERT(hat_page_getshare(pp) == 0); 1070 1071 /* 1072 * Large pages should be freed via page_list_add_pages(). 1073 */ 1074 ASSERT(pp->p_szc == 0); 1075 1076 /* 1077 * Don't need to lock the freelist first here 1078 * because the page isn't on the freelist yet. 1079 * This means p_szc can't change on us. 1080 */ 1081 1082 bin = PP_2_BIN(pp); 1083 mnode = PP_2_MEM_NODE(pp); 1084 mtype = PP_2_MTYPE(pp); 1085 1086 if (flags & PG_LIST_ISINIT) { 1087 /* 1088 * PG_LIST_ISINIT is set during system startup (ie. single 1089 * threaded), add a page to the free list and add to the 1090 * the free region counters w/o any locking 1091 */ 1092 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype); 1093 1094 /* inline version of page_add() */ 1095 if (*ppp != NULL) { 1096 pp->p_next = *ppp; 1097 pp->p_prev = (*ppp)->p_prev; 1098 (*ppp)->p_prev = pp; 1099 pp->p_prev->p_next = pp; 1100 } else 1101 *ppp = pp; 1102 1103 page_ctr_add_internal(mnode, mtype, pp, flags); 1104 VM_STAT_ADD(vmm_vmstats.pladd_free[0]); 1105 } else { 1106 pcm = PC_BIN_MUTEX(mnode, bin, flags); 1107 1108 if (flags & PG_FREE_LIST) { 1109 VM_STAT_ADD(vmm_vmstats.pladd_free[0]); 1110 ASSERT(PP_ISAGED(pp)); 1111 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype); 1112 1113 } else { 1114 VM_STAT_ADD(vmm_vmstats.pladd_cache); 1115 ASSERT(pp->p_vnode); 1116 ASSERT((pp->p_offset & PAGEOFFSET) == 0); 1117 ppp = &PAGE_CACHELISTS(mnode, bin, mtype); 1118 } 1119 mutex_enter(pcm); 1120 page_add(ppp, pp); 1121 1122 if (flags & PG_LIST_TAIL) 1123 *ppp = (*ppp)->p_next; 1124 /* 1125 * Add counters before releasing pcm mutex to avoid a race with 1126 * page_freelist_coalesce and page_freelist_fill. 1127 */ 1128 page_ctr_add(mnode, mtype, pp, flags); 1129 mutex_exit(pcm); 1130 } 1131 1132 1133 #if defined(__sparc) 1134 if (PP_ISNORELOC(pp)) { 1135 kcage_freemem_add(1); 1136 } 1137 #endif 1138 /* 1139 * It is up to the caller to unlock the page! 1140 */ 1141 ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT)); 1142 } 1143 1144 1145 #ifdef __sparc 1146 /* 1147 * This routine is only used by kcage_init during system startup. 1148 * It performs the function of page_list_sub/PP_SETNORELOC/page_list_add 1149 * without the overhead of taking locks and updating counters. 1150 */ 1151 void 1152 page_list_noreloc_startup(page_t *pp) 1153 { 1154 page_t **ppp; 1155 uint_t bin; 1156 int mnode; 1157 int mtype; 1158 int flags = PG_LIST_ISCAGE; 1159 1160 /* 1161 * If this is a large page on the freelist then 1162 * break it up into smaller pages. 1163 */ 1164 if (pp->p_szc != 0) 1165 page_boot_demote(pp); 1166 1167 /* 1168 * Get list page is currently on. 1169 */ 1170 bin = PP_2_BIN(pp); 1171 mnode = PP_2_MEM_NODE(pp); 1172 mtype = PP_2_MTYPE(pp); 1173 ASSERT(mtype == MTYPE_RELOC); 1174 ASSERT(pp->p_szc == 0); 1175 1176 if (PP_ISAGED(pp)) { 1177 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype); 1178 flags |= PG_FREE_LIST; 1179 } else { 1180 ppp = &PAGE_CACHELISTS(mnode, bin, mtype); 1181 flags |= PG_CACHE_LIST; 1182 } 1183 1184 ASSERT(*ppp != NULL); 1185 1186 /* 1187 * Delete page from current list. 
1188 */ 1189 if (*ppp == pp) 1190 *ppp = pp->p_next; /* go to next page */ 1191 if (*ppp == pp) { 1192 *ppp = NULL; /* page list is gone */ 1193 } else { 1194 pp->p_prev->p_next = pp->p_next; 1195 pp->p_next->p_prev = pp->p_prev; 1196 } 1197 1198 /* LINTED */ 1199 PLCNT_DECR(pp, mnode, mtype, 0, flags); 1200 1201 /* 1202 * Set no reloc for cage initted pages. 1203 */ 1204 PP_SETNORELOC(pp); 1205 1206 mtype = PP_2_MTYPE(pp); 1207 ASSERT(mtype == MTYPE_NORELOC); 1208 1209 /* 1210 * Get new list for page. 1211 */ 1212 if (PP_ISAGED(pp)) { 1213 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype); 1214 } else { 1215 ppp = &PAGE_CACHELISTS(mnode, bin, mtype); 1216 } 1217 1218 /* 1219 * Insert page on new list. 1220 */ 1221 if (*ppp == NULL) { 1222 *ppp = pp; 1223 pp->p_next = pp->p_prev = pp; 1224 } else { 1225 pp->p_next = *ppp; 1226 pp->p_prev = (*ppp)->p_prev; 1227 (*ppp)->p_prev = pp; 1228 pp->p_prev->p_next = pp; 1229 } 1230 1231 /* LINTED */ 1232 PLCNT_INCR(pp, mnode, mtype, 0, flags); 1233 1234 /* 1235 * Update cage freemem counter 1236 */ 1237 atomic_add_long(&kcage_freemem, 1); 1238 } 1239 #else /* __sparc */ 1240 1241 /* ARGSUSED */ 1242 void 1243 page_list_noreloc_startup(page_t *pp) 1244 { 1245 panic("page_list_noreloc_startup: should be here only for sparc"); 1246 } 1247 #endif 1248 1249 void 1250 page_list_add_pages(page_t *pp, int flags) 1251 { 1252 kmutex_t *pcm; 1253 pgcnt_t pgcnt; 1254 uint_t bin, mtype, i; 1255 int mnode; 1256 1257 /* default to freelist/head */ 1258 ASSERT((flags & (PG_CACHE_LIST | PG_LIST_TAIL)) == 0); 1259 1260 CHK_LPG(pp, pp->p_szc); 1261 VM_STAT_ADD(vmm_vmstats.pladd_free[pp->p_szc]); 1262 1263 bin = PP_2_BIN(pp); 1264 mnode = PP_2_MEM_NODE(pp); 1265 mtype = PP_2_MTYPE(pp); 1266 1267 if (flags & PG_LIST_ISINIT) { 1268 ASSERT(pp->p_szc == mmu_page_sizes - 1); 1269 page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp); 1270 ASSERT(!PP_ISNORELOC(pp)); 1271 PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags); 1272 } else { 1273 1274 ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes); 1275 1276 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST); 1277 1278 mutex_enter(pcm); 1279 page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp); 1280 page_ctr_add(mnode, mtype, pp, PG_FREE_LIST); 1281 mutex_exit(pcm); 1282 1283 pgcnt = page_get_pagecnt(pp->p_szc); 1284 #if defined(__sparc) 1285 if (PP_ISNORELOC(pp)) 1286 kcage_freemem_add(pgcnt); 1287 #endif 1288 for (i = 0; i < pgcnt; i++, pp++) 1289 page_unlock(pp); 1290 } 1291 } 1292 1293 /* 1294 * During boot, need to demote a large page to base 1295 * pagesize pages for seg_kmem for use in boot_alloc() 1296 */ 1297 void 1298 page_boot_demote(page_t *pp) 1299 { 1300 ASSERT(pp->p_szc != 0); 1301 ASSERT(PP_ISFREE(pp)); 1302 ASSERT(PP_ISAGED(pp)); 1303 1304 (void) page_demote(PP_2_MEM_NODE(pp), 1305 PFN_BASE(pp->p_pagenum, pp->p_szc), pp->p_szc, 0, PC_NO_COLOR, 1306 PC_FREE); 1307 1308 ASSERT(PP_ISFREE(pp)); 1309 ASSERT(PP_ISAGED(pp)); 1310 ASSERT(pp->p_szc == 0); 1311 } 1312 1313 /* 1314 * Take a particular page off of whatever freelist the page 1315 * is claimed to be on. 1316 * 1317 * NOTE: Only used for PAGESIZE pages. 1318 */ 1319 void 1320 page_list_sub(page_t *pp, int flags) 1321 { 1322 int bin; 1323 uint_t mtype; 1324 int mnode; 1325 kmutex_t *pcm; 1326 page_t **ppp; 1327 1328 ASSERT(PAGE_EXCL(pp)); 1329 ASSERT(PP_ISFREE(pp)); 1330 1331 /* 1332 * The p_szc field can only be changed by page_promote() 1333 * and page_demote(). 
Only free pages can be promoted and 1334 * demoted and the free list MUST be locked during these 1335 * operations. So to prevent a race in page_list_sub() 1336 * between computing which bin of the freelist lock to 1337 * grab and actually grabing the lock we check again that 1338 * the bin we locked is still the correct one. Notice that 1339 * the p_szc field could have actually changed on us but 1340 * if the bin happens to still be the same we are safe. 1341 */ 1342 try_again: 1343 bin = PP_2_BIN(pp); 1344 mnode = PP_2_MEM_NODE(pp); 1345 pcm = PC_BIN_MUTEX(mnode, bin, flags); 1346 mutex_enter(pcm); 1347 if (PP_2_BIN(pp) != bin) { 1348 mutex_exit(pcm); 1349 goto try_again; 1350 } 1351 mtype = PP_2_MTYPE(pp); 1352 1353 if (flags & PG_FREE_LIST) { 1354 VM_STAT_ADD(vmm_vmstats.plsub_free[0]); 1355 ASSERT(PP_ISAGED(pp)); 1356 ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype); 1357 } else { 1358 VM_STAT_ADD(vmm_vmstats.plsub_cache); 1359 ASSERT(!PP_ISAGED(pp)); 1360 ppp = &PAGE_CACHELISTS(mnode, bin, mtype); 1361 } 1362 1363 /* 1364 * Common PAGESIZE case. 1365 * 1366 * Note that we locked the freelist. This prevents 1367 * any page promotion/demotion operations. Therefore 1368 * the p_szc will not change until we drop pcm mutex. 1369 */ 1370 if (pp->p_szc == 0) { 1371 page_sub(ppp, pp); 1372 /* 1373 * Subtract counters before releasing pcm mutex 1374 * to avoid race with page_freelist_coalesce. 1375 */ 1376 page_ctr_sub(mnode, mtype, pp, flags); 1377 mutex_exit(pcm); 1378 1379 #if defined(__sparc) 1380 if (PP_ISNORELOC(pp)) { 1381 kcage_freemem_sub(1); 1382 } 1383 #endif 1384 return; 1385 } 1386 1387 /* 1388 * Large pages on the cache list are not supported. 1389 */ 1390 if (flags & PG_CACHE_LIST) 1391 panic("page_list_sub: large page on cachelist"); 1392 1393 /* 1394 * Slow but rare. 1395 * 1396 * Somebody wants this particular page which is part 1397 * of a large page. In this case we just demote the page 1398 * if it's on the freelist. 1399 * 1400 * We have to drop pcm before locking the entire freelist. 1401 * Once we have re-locked the freelist check to make sure 1402 * the page hasn't already been demoted or completely 1403 * freed. 1404 */ 1405 mutex_exit(pcm); 1406 page_freelist_lock(mnode); 1407 if (pp->p_szc != 0) { 1408 /* 1409 * Large page is on freelist. 1410 */ 1411 (void) page_demote(mnode, PFN_BASE(pp->p_pagenum, pp->p_szc), 1412 pp->p_szc, 0, PC_NO_COLOR, PC_FREE); 1413 } 1414 ASSERT(PP_ISFREE(pp)); 1415 ASSERT(PP_ISAGED(pp)); 1416 ASSERT(pp->p_szc == 0); 1417 1418 /* 1419 * Subtract counters before releasing pcm mutex 1420 * to avoid race with page_freelist_coalesce. 1421 */ 1422 bin = PP_2_BIN(pp); 1423 mtype = PP_2_MTYPE(pp); 1424 ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype); 1425 1426 page_sub(ppp, pp); 1427 page_ctr_sub(mnode, mtype, pp, flags); 1428 page_freelist_unlock(mnode); 1429 1430 #if defined(__sparc) 1431 if (PP_ISNORELOC(pp)) { 1432 kcage_freemem_sub(1); 1433 } 1434 #endif 1435 } 1436 1437 void 1438 page_list_sub_pages(page_t *pp, uint_t szc) 1439 { 1440 kmutex_t *pcm; 1441 uint_t bin, mtype; 1442 int mnode; 1443 1444 ASSERT(PAGE_EXCL(pp)); 1445 ASSERT(PP_ISFREE(pp)); 1446 ASSERT(PP_ISAGED(pp)); 1447 1448 /* 1449 * See comment in page_list_sub(). 
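	 *
	 * Briefly, the bin must be re-checked because p_szc (and hence the
	 * bin) can change while we are blocked on the bin mutex, e.g.
	 * (illustrative only):
	 *
	 *	this thread			another thread
	 *	bin = PP_2_BIN(pp);
	 *					promotion/demotion changes p_szc
	 *	mutex_enter(pcm);
	 *	PP_2_BIN(pp) != bin		-> drop pcm and try_again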
1450 */ 1451 try_again: 1452 bin = PP_2_BIN(pp); 1453 mnode = PP_2_MEM_NODE(pp); 1454 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST); 1455 mutex_enter(pcm); 1456 if (PP_2_BIN(pp) != bin) { 1457 mutex_exit(pcm); 1458 goto try_again; 1459 } 1460 1461 /* 1462 * If we're called with a page larger than szc or it got 1463 * promoted above szc before we locked the freelist then 1464 * drop pcm and re-lock entire freelist. If page still larger 1465 * than szc then demote it. 1466 */ 1467 if (pp->p_szc > szc) { 1468 mutex_exit(pcm); 1469 pcm = NULL; 1470 page_freelist_lock(mnode); 1471 if (pp->p_szc > szc) { 1472 VM_STAT_ADD(vmm_vmstats.plsubpages_szcbig); 1473 (void) page_demote(mnode, 1474 PFN_BASE(pp->p_pagenum, pp->p_szc), 1475 pp->p_szc, szc, PC_NO_COLOR, PC_FREE); 1476 } 1477 bin = PP_2_BIN(pp); 1478 } 1479 ASSERT(PP_ISFREE(pp)); 1480 ASSERT(PP_ISAGED(pp)); 1481 ASSERT(pp->p_szc <= szc); 1482 ASSERT(pp == PP_PAGEROOT(pp)); 1483 1484 VM_STAT_ADD(vmm_vmstats.plsub_free[pp->p_szc]); 1485 1486 mtype = PP_2_MTYPE(pp); 1487 if (pp->p_szc != 0) { 1488 page_vpsub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp); 1489 CHK_LPG(pp, pp->p_szc); 1490 } else { 1491 VM_STAT_ADD(vmm_vmstats.plsubpages_szc0); 1492 page_sub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp); 1493 } 1494 page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST); 1495 1496 if (pcm != NULL) { 1497 mutex_exit(pcm); 1498 } else { 1499 page_freelist_unlock(mnode); 1500 } 1501 1502 #if defined(__sparc) 1503 if (PP_ISNORELOC(pp)) { 1504 pgcnt_t pgcnt; 1505 1506 pgcnt = page_get_pagecnt(pp->p_szc); 1507 kcage_freemem_sub(pgcnt); 1508 } 1509 #endif 1510 } 1511 1512 /* 1513 * Add the page to the front of a linked list of pages 1514 * using the p_next & p_prev pointers for the list. 1515 * The caller is responsible for protecting the list pointers. 1516 */ 1517 void 1518 mach_page_add(page_t **ppp, page_t *pp) 1519 { 1520 if (*ppp == NULL) { 1521 pp->p_next = pp->p_prev = pp; 1522 } else { 1523 pp->p_next = *ppp; 1524 pp->p_prev = (*ppp)->p_prev; 1525 (*ppp)->p_prev = pp; 1526 pp->p_prev->p_next = pp; 1527 } 1528 *ppp = pp; 1529 } 1530 1531 /* 1532 * Remove this page from a linked list of pages 1533 * using the p_next & p_prev pointers for the list. 1534 * 1535 * The caller is responsible for protecting the list pointers. 1536 */ 1537 void 1538 mach_page_sub(page_t **ppp, page_t *pp) 1539 { 1540 ASSERT(PP_ISFREE(pp)); 1541 1542 if (*ppp == NULL || pp == NULL) 1543 panic("mach_page_sub"); 1544 1545 if (*ppp == pp) 1546 *ppp = pp->p_next; /* go to next page */ 1547 1548 if (*ppp == pp) 1549 *ppp = NULL; /* page list is gone */ 1550 else { 1551 pp->p_prev->p_next = pp->p_next; 1552 pp->p_next->p_prev = pp->p_prev; 1553 } 1554 pp->p_prev = pp->p_next = pp; /* make pp a list of one */ 1555 } 1556 1557 /* 1558 * Routine fsflush uses to gradually coalesce the free list into larger pages. 
 */
void
page_promote_size(page_t *pp, uint_t cur_szc)
{
	pfn_t pfn;
	int mnode;
	int idx;
	int new_szc = cur_szc + 1;
	int full = FULL_REGION_CNT(new_szc);

	pfn = page_pptonum(pp);
	mnode = PFN_2_MEM_NODE(pfn);

	page_freelist_lock(mnode);

	idx = PNUM_TO_IDX(mnode, new_szc, pfn);
	if (PAGE_COUNTERS(mnode, new_szc, idx) == full)
		(void) page_promote(mnode, pfn, new_szc, PC_FREE);

	page_freelist_unlock(mnode);
}

static uint_t page_promote_err;
static uint_t page_promote_noreloc_err;

/*
 * Create a single larger page (of szc new_szc) from smaller contiguous pages
 * for the given mnode starting at pfnum. Pages involved are on the freelist
 * before the call and may be returned to the caller if requested, otherwise
 * they will be placed back on the freelist.
 * If flags is PC_ALLOC, then the large page will be returned to the user in
 * a state which is consistent with a page being taken off the freelist. If
 * we failed to lock the new large page, then we will return NULL to the
 * caller and put the large page on the freelist instead.
 * If flags is PC_FREE, then the large page will be placed on the freelist,
 * and NULL will be returned.
 * The caller is responsible for locking the freelist as well as any other
 * accounting which needs to be done for a returned page.
 *
 * RFE: For performance pass in pp instead of pfnum so
 *	we can avoid excessive calls to page_numtopp_nolock().
 *	This would depend on an assumption that all contiguous
 *	pages are in the same memseg so we can just add/dec
 *	our pp.
 *
 * Lock ordering:
 *
 *	There is a potential but rare deadlock situation
 *	for page promotion and demotion operations. The problem
 *	is there are two paths into the freelist manager and
 *	they have different lock orders:
 *
 *	page_create()
 *		lock freelist
 *		page_lock(EXCL)
 *		unlock freelist
 *		return
 *		caller drops page_lock
 *
 *	page_free() and page_reclaim()
 *		caller grabs page_lock(EXCL)
 *
 *		lock freelist
 *		unlock freelist
 *		drop page_lock
 *
 *	What prevents a thread in page_create() from deadlocking
 *	with a thread freeing or reclaiming the same page is the
 *	page_trylock() in page_get_freelist(). If the trylock fails
 *	it skips the page.
 *
 *	The lock ordering for promotion and demotion is the same as
 *	for page_create(). Since the same deadlock could occur during
 *	page promotion and freeing or reclaiming of a page on the
 *	cache list we might have to fail the operation and undo what
 *	we have done so far. Again this is rare.
 */
page_t *
page_promote(int mnode, pfn_t pfnum, uchar_t new_szc, int flags)
{
	page_t *pp, *pplist, *tpp, *start_pp;
	pgcnt_t new_npgs, npgs;
	uint_t bin;
	pgcnt_t tmpnpgs, pages_left;
	uint_t mtype;
	uint_t noreloc;
	uint_t i;
	int which_list;
	ulong_t index;
	kmutex_t *phm;

	/*
	 * General algorithm:
	 * Find the starting page
	 * Walk each page struct removing it from the freelist,
	 * and linking it to all the other pages removed.
	 * Once all pages are off the freelist,
	 * walk the list, modifying p_szc to new_szc and whatever
	 * other info needs to be done to create a large free page.
1658 * According to the flags, either return the page or put it 1659 * on the freelist. 1660 */ 1661 1662 start_pp = page_numtopp_nolock(pfnum); 1663 ASSERT(start_pp && (start_pp->p_pagenum == pfnum)); 1664 new_npgs = page_get_pagecnt(new_szc); 1665 ASSERT(IS_P2ALIGNED(pfnum, new_npgs)); 1666 1667 /* 1668 * Loop through smaller pages to confirm that all pages 1669 * give the same result for PP_ISNORELOC(). 1670 * We can check this reliably here as the protocol for setting 1671 * P_NORELOC requires pages to be taken off the free list first. 1672 */ 1673 for (i = 0, pp = start_pp; i < new_npgs; i++, pp++) { 1674 if (pp == start_pp) { 1675 /* First page, set requirement. */ 1676 noreloc = PP_ISNORELOC(pp); 1677 } else if (noreloc != PP_ISNORELOC(pp)) { 1678 page_promote_noreloc_err++; 1679 page_promote_err++; 1680 return (NULL); 1681 } 1682 } 1683 1684 pages_left = new_npgs; 1685 pplist = NULL; 1686 pp = start_pp; 1687 1688 /* Loop around coalescing the smaller pages into a big page. */ 1689 while (pages_left) { 1690 /* 1691 * Remove from the freelist. 1692 */ 1693 ASSERT(PP_ISFREE(pp)); 1694 bin = PP_2_BIN(pp); 1695 ASSERT(mnode == PP_2_MEM_NODE(pp)); 1696 mtype = PP_2_MTYPE(pp); 1697 if (PP_ISAGED(pp)) { 1698 1699 /* 1700 * PG_FREE_LIST 1701 */ 1702 if (pp->p_szc) { 1703 page_vpsub(&PAGE_FREELISTS(mnode, 1704 pp->p_szc, bin, mtype), pp); 1705 } else { 1706 mach_page_sub(&PAGE_FREELISTS(mnode, 0, 1707 bin, mtype), pp); 1708 } 1709 which_list = PG_FREE_LIST; 1710 } else { 1711 ASSERT(pp->p_szc == 0); 1712 1713 /* 1714 * PG_CACHE_LIST 1715 * 1716 * Since this page comes from the 1717 * cachelist, we must destroy the 1718 * vnode association. 1719 */ 1720 if (!page_trylock(pp, SE_EXCL)) { 1721 goto fail_promote; 1722 } 1723 1724 /* 1725 * We need to be careful not to deadlock 1726 * with another thread in page_lookup(). 1727 * The page_lookup() thread could be holding 1728 * the same phm that we need if the two 1729 * pages happen to hash to the same phm lock. 1730 * At this point we have locked the entire 1731 * freelist and page_lookup() could be trying 1732 * to grab a freelist lock. 1733 */ 1734 index = PAGE_HASH_FUNC(pp->p_vnode, pp->p_offset); 1735 phm = PAGE_HASH_MUTEX(index); 1736 if (!mutex_tryenter(phm)) { 1737 page_unlock(pp); 1738 goto fail_promote; 1739 } 1740 1741 mach_page_sub(&PAGE_CACHELISTS(mnode, bin, mtype), pp); 1742 page_hashout(pp, phm); 1743 mutex_exit(phm); 1744 PP_SETAGED(pp); 1745 page_unlock(pp); 1746 which_list = PG_CACHE_LIST; 1747 } 1748 page_ctr_sub(mnode, mtype, pp, which_list); 1749 1750 /* 1751 * Concatenate the smaller page(s) onto 1752 * the large page list. 1753 */ 1754 tmpnpgs = npgs = page_get_pagecnt(pp->p_szc); 1755 pages_left -= npgs; 1756 tpp = pp; 1757 while (npgs--) { 1758 tpp->p_szc = new_szc; 1759 tpp = tpp->p_next; 1760 } 1761 page_list_concat(&pplist, &pp); 1762 pp += tmpnpgs; 1763 } 1764 CHK_LPG(pplist, new_szc); 1765 1766 /* 1767 * return the page to the user if requested 1768 * in the properly locked state. 
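	 *
	 * Illustrative caller-side sketch (not from the original source):
	 *
	 *	pp = page_promote(mnode, pfnum, new_szc, PC_ALLOC);
	 *	if (pp == NULL)
	 *		the large page (if created) went back on the freelist
	 *	else
	 *		pp and all constituent pages are held SE_EXCL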
1769 */ 1770 if (flags == PC_ALLOC && (page_trylock_cons(pplist, SE_EXCL))) { 1771 return (pplist); 1772 } 1773 1774 /* 1775 * Otherwise place the new large page on the freelist 1776 */ 1777 bin = PP_2_BIN(pplist); 1778 mnode = PP_2_MEM_NODE(pplist); 1779 mtype = PP_2_MTYPE(pplist); 1780 page_vpadd(&PAGE_FREELISTS(mnode, new_szc, bin, mtype), pplist); 1781 1782 page_ctr_add(mnode, mtype, pplist, PG_FREE_LIST); 1783 return (NULL); 1784 1785 fail_promote: 1786 /* 1787 * A thread must have still been freeing or 1788 * reclaiming the page on the cachelist. 1789 * To prevent a deadlock undo what we have 1790 * done sofar and return failure. This 1791 * situation can only happen while promoting 1792 * PAGESIZE pages. 1793 */ 1794 page_promote_err++; 1795 while (pplist) { 1796 pp = pplist; 1797 mach_page_sub(&pplist, pp); 1798 pp->p_szc = 0; 1799 bin = PP_2_BIN(pp); 1800 mtype = PP_2_MTYPE(pp); 1801 mach_page_add(&PAGE_FREELISTS(mnode, 0, bin, mtype), pp); 1802 page_ctr_add(mnode, mtype, pp, PG_FREE_LIST); 1803 } 1804 return (NULL); 1805 1806 } 1807 1808 /* 1809 * Break up a large page into smaller size pages. 1810 * Pages involved are on the freelist before the call and may 1811 * be returned to the caller if requested, otherwise they will 1812 * be placed back on the freelist. 1813 * The caller is responsible for locking the freelist as well as any other 1814 * accounting which needs to be done for a returned page. 1815 * If flags is not PC_ALLOC, the color argument is ignored, and thus 1816 * technically, any value may be passed in but PC_NO_COLOR is the standard 1817 * which should be followed for clarity's sake. 1818 */ 1819 page_t * 1820 page_demote(int mnode, pfn_t pfnum, uchar_t cur_szc, uchar_t new_szc, 1821 int color, int flags) 1822 { 1823 page_t *pp, *pplist, *npplist; 1824 pgcnt_t npgs, n; 1825 uint_t bin; 1826 uint_t mtype; 1827 page_t *ret_pp = NULL; 1828 1829 ASSERT(cur_szc != 0); 1830 ASSERT(new_szc < cur_szc); 1831 1832 pplist = page_numtopp_nolock(pfnum); 1833 ASSERT(pplist != NULL); 1834 1835 ASSERT(pplist->p_szc == cur_szc); 1836 1837 bin = PP_2_BIN(pplist); 1838 ASSERT(mnode == PP_2_MEM_NODE(pplist)); 1839 mtype = PP_2_MTYPE(pplist); 1840 page_vpsub(&PAGE_FREELISTS(mnode, cur_szc, bin, mtype), pplist); 1841 1842 CHK_LPG(pplist, cur_szc); 1843 page_ctr_sub(mnode, mtype, pplist, PG_FREE_LIST); 1844 1845 /* 1846 * Number of PAGESIZE pages for smaller new_szc 1847 * page. 1848 */ 1849 npgs = page_get_pagecnt(new_szc); 1850 1851 while (pplist) { 1852 pp = pplist; 1853 1854 ASSERT(pp->p_szc == cur_szc); 1855 1856 /* 1857 * We either break it up into PAGESIZE pages or larger. 1858 */ 1859 if (npgs == 1) { /* PAGESIZE case */ 1860 mach_page_sub(&pplist, pp); 1861 ASSERT(pp->p_szc == cur_szc); 1862 ASSERT(new_szc == 0); 1863 ASSERT(mnode == PP_2_MEM_NODE(pp)); 1864 pp->p_szc = new_szc; 1865 bin = PP_2_BIN(pp); 1866 if ((bin == color) && (flags == PC_ALLOC) && 1867 (ret_pp == NULL) && 1868 page_trylock_cons(pp, SE_EXCL)) { 1869 ret_pp = pp; 1870 } else { 1871 mtype = PP_2_MTYPE(pp); 1872 mach_page_add(&PAGE_FREELISTS(mnode, 0, bin, 1873 mtype), pp); 1874 page_ctr_add(mnode, mtype, pp, PG_FREE_LIST); 1875 } 1876 } else { 1877 1878 /* 1879 * Break down into smaller lists of pages. 
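			 *
			 * For example (illustrative only, sparc sizes): when
			 * demoting a 4M page (cur_szc 3) to 512K pages
			 * (new_szc 2), npgs == 64, so page_list_break()
			 * peels off 64 PAGESIZE page_t's at a time and each
			 * group becomes one free 512K page, 8 groups in all.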
1880 */ 1881 page_list_break(&pplist, &npplist, npgs); 1882 1883 pp = pplist; 1884 n = npgs; 1885 while (n--) { 1886 ASSERT(pp->p_szc == cur_szc); 1887 pp->p_szc = new_szc; 1888 pp = pp->p_next; 1889 } 1890 1891 CHK_LPG(pplist, new_szc); 1892 1893 bin = PP_2_BIN(pplist); 1894 ASSERT(mnode == PP_2_MEM_NODE(pp)); 1895 if ((bin == color) && (flags == PC_ALLOC) && 1896 (ret_pp == NULL) && 1897 page_trylock_cons(pp, SE_EXCL)) { 1898 ret_pp = pp; 1899 } else { 1900 mtype = PP_2_MTYPE(pp); 1901 page_vpadd(&PAGE_FREELISTS(mnode, new_szc, 1902 bin, mtype), pplist); 1903 1904 page_ctr_add(mnode, mtype, pplist, 1905 PG_FREE_LIST); 1906 } 1907 pplist = npplist; 1908 } 1909 } 1910 return (ret_pp); 1911 } 1912 1913 int mpss_coalesce_disable = 0; 1914 1915 /* 1916 * Coalesce free pages into a page of the given szc and color if possible. 1917 * Return the pointer to the page created, otherwise, return NULL. 1918 */ 1919 static page_t * 1920 page_freelist_coalesce(int mnode, uchar_t szc, int color) 1921 { 1922 int r; /* region size */ 1923 int idx, full, i; 1924 pfn_t pfnum; 1925 size_t len; 1926 size_t buckets_to_check; 1927 pgcnt_t cands; 1928 page_t *ret_pp; 1929 int color_stride; 1930 1931 VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce); 1932 1933 if (mpss_coalesce_disable) { 1934 return (NULL); 1935 } 1936 1937 r = szc; 1938 PGCTRS_CANDS_GETVALUECOLOR(mnode, r, color, cands); 1939 if (cands == 0) { 1940 VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip); 1941 return (NULL); 1942 } 1943 full = FULL_REGION_CNT(r); 1944 color_stride = (szc) ? page_convert_color(0, szc, page_colors - 1) + 1 : 1945 page_colors; 1946 1947 /* Prevent page_counters dynamic memory from being freed */ 1948 rw_enter(&page_ctrs_rwlock[mnode], RW_READER); 1949 len = PAGE_COUNTERS_ENTRIES(mnode, r); 1950 buckets_to_check = len / color_stride; 1951 idx = PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color); 1952 ASSERT((idx % color_stride) == color); 1953 idx += color_stride; 1954 if (idx >= len) 1955 idx = color; 1956 for (i = 0; i < buckets_to_check; i++) { 1957 if (PAGE_COUNTERS(mnode, r, idx) == full) { 1958 pfnum = IDX_TO_PNUM(mnode, r, idx); 1959 ASSERT(pfnum >= mem_node_config[mnode].physbase && 1960 pfnum < mem_node_config[mnode].physmax); 1961 /* 1962 * RFE: For performance maybe we can do something less 1963 * brutal than locking the entire freelist. So far 1964 * this doesn't seem to be a performance problem? 1965 */ 1966 page_freelist_lock(mnode); 1967 if (PAGE_COUNTERS(mnode, r, idx) != full) { 1968 VM_STAT_ADD(vmm_vmstats.page_ctrs_changed); 1969 goto skip_this_one; 1970 } 1971 ret_pp = page_promote(mnode, pfnum, r, PC_ALLOC); 1972 if (ret_pp != NULL) { 1973 PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color) = 1974 idx; 1975 page_freelist_unlock(mnode); 1976 rw_exit(&page_ctrs_rwlock[mnode]); 1977 #if defined(__sparc) 1978 if (PP_ISNORELOC(ret_pp)) { 1979 pgcnt_t npgs; 1980 1981 npgs = page_get_pagecnt(ret_pp->p_szc); 1982 kcage_freemem_sub(npgs); 1983 } 1984 #endif 1985 return (ret_pp); 1986 } 1987 skip_this_one: 1988 page_freelist_unlock(mnode); 1989 /* 1990 * No point looking for another page if we've 1991 * already tried all of the ones that 1992 * page_ctr_cands indicated. Stash off where we left 1993 * off. 1994 * Note: this is not exact since we don't hold the 1995 * page_freelist_locks before we initially get the 1996 * value of cands for performance reasons, but should 1997 * be a decent approximation. 
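			 *
			 * Illustrative only: with color_stride == 4 and
			 * color == 1, the walk visits indices 1, 5, 9, ...
			 * starting just past the last index that worked for
			 * this color, and wraps back to the color itself
			 * when it runs off the end of the counter array.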
1998 */ 1999 if (--cands == 0) { 2000 PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color) = 2001 idx; 2002 break; 2003 } 2004 } 2005 idx += color_stride; 2006 if (idx >= len) 2007 idx = color; 2008 } 2009 rw_exit(&page_ctrs_rwlock[mnode]); 2010 VM_STAT_ADD(vmm_vmstats.page_ctrs_failed); 2011 return (NULL); 2012 } 2013 2014 /* 2015 * For the given mnode, promote as many small pages to large pages as possible. 2016 */ 2017 void 2018 page_freelist_coalesce_all(int mnode) 2019 { 2020 int r; /* region size */ 2021 int idx, full; 2022 pfn_t pfnum; 2023 size_t len; 2024 2025 VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce_all); 2026 2027 if (mpss_coalesce_disable) { 2028 return; 2029 } 2030 2031 /* 2032 * Lock the entire freelist and coalesce what we can. 2033 * 2034 * Always promote to the largest page possible 2035 * first to reduce the number of page promotions. 2036 */ 2037 rw_enter(&page_ctrs_rwlock[mnode], RW_READER); 2038 page_freelist_lock(mnode); 2039 for (r = mmu_page_sizes - 1; r > 0; r--) { 2040 pgcnt_t cands; 2041 2042 PGCTRS_CANDS_GETVALUE(mnode, r, cands); 2043 if (cands == 0) { 2044 VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip_all); 2045 continue; 2046 } 2047 2048 full = FULL_REGION_CNT(r); 2049 len = PAGE_COUNTERS_ENTRIES(mnode, r); 2050 2051 for (idx = 0; idx < len; idx++) { 2052 if (PAGE_COUNTERS(mnode, r, idx) == full) { 2053 pfnum = IDX_TO_PNUM(mnode, r, idx); 2054 ASSERT(pfnum >= 2055 mem_node_config[mnode].physbase && 2056 pfnum < 2057 mem_node_config[mnode].physmax); 2058 (void) page_promote(mnode, pfnum, r, PC_FREE); 2059 } 2060 } 2061 } 2062 page_freelist_unlock(mnode); 2063 rw_exit(&page_ctrs_rwlock[mnode]); 2064 } 2065 2066 /* 2067 * This is where all polices for moving pages around 2068 * to different page size free lists is implemented. 2069 * Returns 1 on success, 0 on failure. 2070 * 2071 * So far these are the priorities for this algorithm in descending 2072 * order: 2073 * 2074 * 1) When servicing a request try to do so with a free page 2075 * from next size up. Helps defer fragmentation as long 2076 * as possible. 2077 * 2078 * 2) Page coalesce on demand. Only when a freelist 2079 * larger than PAGESIZE is empty and step 1 2080 * will not work since all larger size lists are 2081 * also empty. 2082 * 2083 * If pfnhi is non-zero, search for large page with pfn range less than pfnhi. 2084 */ 2085 page_t * 2086 page_freelist_fill(uchar_t szc, int color, int mnode, int mtype, pfn_t pfnhi) 2087 { 2088 uchar_t nszc = szc + 1; 2089 int bin; 2090 page_t *pp, *firstpp; 2091 page_t *ret_pp = NULL; 2092 2093 ASSERT(szc < mmu_page_sizes); 2094 2095 VM_STAT_ADD(vmm_vmstats.pff_req[szc]); 2096 /* 2097 * First try to break up a larger page to fill 2098 * current size freelist. 2099 */ 2100 while (nszc < mmu_page_sizes) { 2101 /* 2102 * If page found then demote it. 2103 */ 2104 bin = page_convert_color(szc, nszc, color); 2105 if (PAGE_FREELISTS(mnode, nszc, bin, mtype)) { 2106 page_freelist_lock(mnode); 2107 firstpp = pp = PAGE_FREELISTS(mnode, nszc, bin, mtype); 2108 2109 /* 2110 * If pfnhi is not PFNNULL, look for large page below 2111 * pfnhi. PFNNULL signifies no pfn requirement. 
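* The nszc freelist is circularly linked through p_vpnext, so walk
* forward from firstpp until a page below pfnhi is found; if the walk
* wraps back around to firstpp, no page in this bin qualifies and pp
* is set to NULL.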
2112 */ 2113 if (pfnhi != PFNNULL && pp->p_pagenum >= pfnhi) { 2114 do { 2115 pp = pp->p_vpnext; 2116 if (pp == firstpp) { 2117 pp = NULL; 2118 break; 2119 } 2120 } while (pp->p_pagenum >= pfnhi); 2121 } 2122 if (pp) { 2123 ASSERT(pp->p_szc == nszc); 2124 VM_STAT_ADD(vmm_vmstats.pff_demote[nszc]); 2125 ret_pp = page_demote(mnode, pp->p_pagenum, 2126 pp->p_szc, szc, color, PC_ALLOC); 2127 if (ret_pp) { 2128 page_freelist_unlock(mnode); 2129 #if defined(__sparc) 2130 if (PP_ISNORELOC(ret_pp)) { 2131 pgcnt_t npgs; 2132 2133 npgs = page_get_pagecnt( 2134 ret_pp->p_szc); 2135 kcage_freemem_sub(npgs); 2136 } 2137 #endif 2138 return (ret_pp); 2139 } 2140 } 2141 page_freelist_unlock(mnode); 2142 } 2143 nszc++; 2144 } 2145 2146 /* 2147 * Ok that didn't work. Time to coalesce. 2148 */ 2149 if (szc != 0) { 2150 ret_pp = page_freelist_coalesce(mnode, szc, color); 2151 VM_STAT_COND_ADD(ret_pp, vmm_vmstats.pff_coalok[szc]); 2152 } 2153 2154 return (ret_pp); 2155 } 2156 2157 /* 2158 * Helper routine used only by the freelist code to lock 2159 * a page. If the page is a large page then it succeeds in 2160 * locking all the constituent pages or none at all. 2161 * Returns 1 on sucess, 0 on failure. 2162 */ 2163 static int 2164 page_trylock_cons(page_t *pp, se_t se) 2165 { 2166 page_t *tpp, *first_pp = pp; 2167 2168 /* 2169 * Fail if can't lock first or only page. 2170 */ 2171 if (!page_trylock(pp, se)) { 2172 return (0); 2173 } 2174 2175 /* 2176 * PAGESIZE: common case. 2177 */ 2178 if (pp->p_szc == 0) { 2179 return (1); 2180 } 2181 2182 /* 2183 * Large page case. 2184 */ 2185 tpp = pp->p_next; 2186 while (tpp != pp) { 2187 if (!page_trylock(tpp, se)) { 2188 /* 2189 * On failure unlock what we 2190 * have locked so far. 2191 */ 2192 while (first_pp != tpp) { 2193 page_unlock(first_pp); 2194 first_pp = first_pp->p_next; 2195 } 2196 return (0); 2197 } 2198 tpp = tpp->p_next; 2199 } 2200 return (1); 2201 } 2202 2203 page_t * 2204 page_get_mnode_freelist(int mnode, uint_t bin, int mtype, uchar_t szc, 2205 uint_t flags) 2206 { 2207 kmutex_t *pcm; 2208 int i, fill_tried, fill_marker; 2209 page_t *pp, *first_pp; 2210 uint_t bin_marker; 2211 int colors, cpucolors; 2212 uchar_t nszc; 2213 uint_t nszc_color_shift; 2214 int nwaybins = 0, nwaycnt; 2215 2216 ASSERT(szc < mmu_page_sizes); 2217 2218 VM_STAT_ADD(vmm_vmstats.pgmf_alloc[szc]); 2219 2220 MTYPE_START(mnode, mtype, flags); 2221 if (mtype < 0) { /* mnode foes not have memory in mtype range */ 2222 VM_STAT_ADD(vmm_vmstats.pgmf_allocempty[szc]); 2223 return (NULL); 2224 } 2225 2226 /* 2227 * Set how many physical colors for this page size. 2228 */ 2229 colors = (szc) ? page_convert_color(0, szc, page_colors - 1) + 1 : 2230 page_colors; 2231 2232 nszc = MIN(szc + 1, mmu_page_sizes - 1); 2233 nszc_color_shift = page_get_shift(nszc) - page_get_shift(szc); 2234 2235 /* cpu_page_colors is non-zero if a page color may be in > 1 bin */ 2236 cpucolors = cpu_page_colors; 2237 2238 /* 2239 * adjust cpucolors to possibly check additional 'equivalent' bins 2240 * to try to minimize fragmentation of large pages by delaying calls 2241 * to page_freelist_fill. 
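* When colorequiv > 1, cpucolors is lowered to colors / colorequiv so
* that the color-match search below also steps through the other bins
* of the same equivalence group (multiples of cpucolors apart) before
* giving up and calling page_freelist_fill.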
2242 */ 2243 if (colorequiv > 1) { 2244 int equivcolors = colors / colorequiv; 2245 2246 if (equivcolors && (cpucolors == 0 || equivcolors < cpucolors)) 2247 cpucolors = equivcolors; 2248 } 2249 2250 ASSERT(colors <= page_colors); 2251 ASSERT(colors); 2252 ASSERT((colors & (colors - 1)) == 0); 2253 2254 ASSERT(bin < colors); 2255 2256 /* 2257 * Only hold one freelist lock at a time, that way we 2258 * can start anywhere and not have to worry about lock 2259 * ordering. 2260 */ 2261 big_try_again: 2262 fill_tried = 0; 2263 nwaycnt = 0; 2264 for (i = 0; i <= colors; i++) { 2265 try_again: 2266 ASSERT(bin < colors); 2267 if (PAGE_FREELISTS(mnode, szc, bin, mtype)) { 2268 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST); 2269 mutex_enter(pcm); 2270 pp = PAGE_FREELISTS(mnode, szc, bin, mtype); 2271 if (pp != NULL) { 2272 /* 2273 * These were set before the page 2274 * was put on the free list, 2275 * they must still be set. 2276 */ 2277 ASSERT(PP_ISFREE(pp)); 2278 ASSERT(PP_ISAGED(pp)); 2279 ASSERT(pp->p_vnode == NULL); 2280 ASSERT(pp->p_hash == NULL); 2281 ASSERT(pp->p_offset == (u_offset_t)-1); 2282 ASSERT(pp->p_szc == szc); 2283 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode); 2284 2285 /* 2286 * Walk down the hash chain. 2287 * 8k pages are linked on p_next 2288 * and p_prev fields. Large pages 2289 * are a contiguous group of 2290 * constituent pages linked together 2291 * on their p_next and p_prev fields. 2292 * The large pages are linked together 2293 * on the hash chain using p_vpnext 2294 * p_vpprev of the base constituent 2295 * page of each large page. 2296 */ 2297 first_pp = pp; 2298 while (!page_trylock_cons(pp, SE_EXCL)) { 2299 if (szc == 0) { 2300 pp = pp->p_next; 2301 } else { 2302 pp = pp->p_vpnext; 2303 } 2304 2305 ASSERT(PP_ISFREE(pp)); 2306 ASSERT(PP_ISAGED(pp)); 2307 ASSERT(pp->p_vnode == NULL); 2308 ASSERT(pp->p_hash == NULL); 2309 ASSERT(pp->p_offset == (u_offset_t)-1); 2310 ASSERT(pp->p_szc == szc); 2311 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == 2312 mnode); 2313 2314 if (pp == first_pp) { 2315 pp = NULL; 2316 break; 2317 } 2318 } 2319 2320 if (pp) { 2321 ASSERT(mtype == PP_2_MTYPE(pp)); 2322 ASSERT(pp->p_szc == szc); 2323 if (szc == 0) { 2324 page_sub(&PAGE_FREELISTS(mnode, 2325 szc, bin, mtype), pp); 2326 } else { 2327 page_vpsub(&PAGE_FREELISTS( 2328 mnode, szc, bin, mtype), 2329 pp); 2330 CHK_LPG(pp, szc); 2331 } 2332 page_ctr_sub(mnode, mtype, pp, 2333 PG_FREE_LIST); 2334 2335 if ((PP_ISFREE(pp) == 0) || 2336 (PP_ISAGED(pp) == 0)) 2337 panic("free page is not. pp %p", 2338 (void *)pp); 2339 mutex_exit(pcm); 2340 2341 #if defined(__sparc) 2342 ASSERT(!kcage_on || PP_ISNORELOC(pp) || 2343 (flags & PG_NORELOC) == 0); 2344 2345 if (PP_ISNORELOC(pp)) { 2346 pgcnt_t npgs; 2347 2348 npgs = page_get_pagecnt(szc); 2349 kcage_freemem_sub(npgs); 2350 } 2351 #endif 2352 VM_STAT_ADD(vmm_vmstats. 2353 pgmf_allocok[szc]); 2354 return (pp); 2355 } 2356 } 2357 mutex_exit(pcm); 2358 } 2359 2360 /* 2361 * Wow! The initial bin is empty. 2362 * If specific color is needed, check if page color may be 2363 * in other bins. cpucolors is: 2364 * 0 if the colors for this cpu is equal to page_colors. 2365 * This means that pages with a particular color are in a 2366 * single bin. 2367 * -1 if colors of cpus (cheetah+) are heterogenous. Need to 2368 * first determine the colors for the current cpu. 
2369 * >0 colors of all cpus are homogenous and < page_colors 2370 */ 2371 2372 if ((flags & PG_MATCH_COLOR) && (cpucolors != 0)) { 2373 if (!nwaybins) { 2374 /* 2375 * cpucolors is negative if ecache setsizes 2376 * are heterogenous. determine colors for this 2377 * particular cpu. 2378 */ 2379 if (cpucolors < 0) { 2380 cpucolors = CPUSETSIZE() / MMU_PAGESIZE; 2381 ASSERT(cpucolors > 0); 2382 nwaybins = colors / cpucolors; 2383 } else { 2384 nwaybins = colors / cpucolors; 2385 ASSERT(szc > 0 || nwaybins > 1); 2386 } 2387 if (nwaybins < 2) 2388 cpucolors = 0; 2389 } 2390 2391 if (cpucolors && (nwaycnt + 1 <= nwaybins)) { 2392 nwaycnt++; 2393 bin = (bin + (colors / nwaybins)) & 2394 (colors - 1); 2395 if (nwaycnt < nwaybins) { 2396 goto try_again; 2397 } 2398 } 2399 /* back to initial color if fall-thru */ 2400 } 2401 2402 /* 2403 * color bins are all empty if color match. Try and satisfy 2404 * the request by breaking up or coalescing pages from 2405 * a different size freelist of the correct color that 2406 * satisfies the ORIGINAL color requested. If that 2407 * fails then try pages of the same size but different 2408 * colors assuming we are not called with 2409 * PG_MATCH_COLOR. 2410 */ 2411 if (!fill_tried) { 2412 fill_tried = 1; 2413 fill_marker = bin >> nszc_color_shift; 2414 pp = page_freelist_fill(szc, bin, mnode, mtype, 2415 PFNNULL); 2416 if (pp != NULL) { 2417 return (pp); 2418 } 2419 } 2420 2421 if (flags & PG_MATCH_COLOR) 2422 break; 2423 2424 /* 2425 * Select next color bin to try. 2426 */ 2427 if (szc == 0) { 2428 /* 2429 * PAGESIZE page case. 2430 */ 2431 if (i == 0) { 2432 bin = (bin + BIN_STEP) & page_colors_mask; 2433 bin_marker = bin; 2434 } else { 2435 bin = (bin + vac_colors) & page_colors_mask; 2436 if (bin == bin_marker) { 2437 bin = (bin + 1) & page_colors_mask; 2438 bin_marker = bin; 2439 } 2440 } 2441 } else { 2442 /* 2443 * Large page case. 2444 */ 2445 bin = (bin + 1) & (colors - 1); 2446 } 2447 /* 2448 * If bin advanced to the next color bin of the 2449 * next larger pagesize, there is a chance the fill 2450 * could succeed. 2451 */ 2452 if (fill_marker != (bin >> nszc_color_shift)) 2453 fill_tried = 0; 2454 } 2455 2456 /* if allowed, cycle through additional mtypes */ 2457 MTYPE_NEXT(mnode, mtype, flags); 2458 if (mtype >= 0) 2459 goto big_try_again; 2460 2461 VM_STAT_ADD(vmm_vmstats.pgmf_allocfailed[szc]); 2462 2463 return (NULL); 2464 } 2465 2466 2467 /* 2468 * Returns the count of free pages for 'pp' with size code 'szc'. 2469 * Note: This function does not return an exact value as the page freelist 2470 * locks are not held and thus the values in the page_counters may be 2471 * changing as we walk through the data. 
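* The count is built top down: the szc-level counter contributes one
* fully free region of the next size down per unit, and for each
* smaller region size the counters of sub-regions that are not
* completely free are added in (fully free sub-regions were already
* accounted for at the level above).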
2472 */ 2473 static int 2474 page_freecnt(int mnode, page_t *pp, uchar_t szc) 2475 { 2476 pgcnt_t pgfree; 2477 pgcnt_t cnt; 2478 ssize_t r = szc; /* region size */ 2479 ssize_t idx; 2480 int i; 2481 int full, range; 2482 2483 /* Make sure pagenum passed in is aligned properly */ 2484 ASSERT((pp->p_pagenum & (PNUM_SIZE(szc) - 1)) == 0); 2485 ASSERT(szc > 0); 2486 2487 /* Prevent page_counters dynamic memory from being freed */ 2488 rw_enter(&page_ctrs_rwlock[mnode], RW_READER); 2489 idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum); 2490 cnt = PAGE_COUNTERS(mnode, r, idx); 2491 pgfree = cnt << PNUM_SHIFT(r - 1); 2492 range = FULL_REGION_CNT(szc); 2493 2494 /* Check for completely full region */ 2495 if (cnt == range) { 2496 rw_exit(&page_ctrs_rwlock[mnode]); 2497 return (pgfree); 2498 } 2499 2500 while (--r > 0) { 2501 idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum); 2502 full = FULL_REGION_CNT(r); 2503 for (i = 0; i < range; i++, idx++) { 2504 cnt = PAGE_COUNTERS(mnode, r, idx); 2505 /* 2506 * If cnt here is full, that means we have already 2507 * accounted for these pages earlier. 2508 */ 2509 if (cnt != full) { 2510 pgfree += (cnt << PNUM_SHIFT(r - 1)); 2511 } 2512 } 2513 range *= full; 2514 } 2515 rw_exit(&page_ctrs_rwlock[mnode]); 2516 return (pgfree); 2517 } 2518 2519 /* 2520 * Called from page_geti_contig_pages to exclusively lock constituent pages 2521 * starting from 'spp' for page size code 'szc'. 2522 * 2523 * If 'ptcpthreshold' is set, the number of free pages needed in the 'szc' 2524 * region needs to be greater than or equal to the threshold. 2525 */ 2526 static int 2527 page_trylock_contig_pages(int mnode, page_t *spp, uchar_t szc, int flags) 2528 { 2529 pgcnt_t pgcnt = PNUM_SIZE(szc); 2530 pgcnt_t pgfree, i; 2531 page_t *pp; 2532 2533 VM_STAT_ADD(vmm_vmstats.ptcp[szc]); 2534 2535 2536 if ((ptcpthreshold == 0) || (flags & PGI_PGCPHIPRI)) 2537 goto skipptcpcheck; 2538 /* 2539 * check if there are sufficient free pages available before attempting 2540 * to trylock. Count is approximate as page counters can change. 2541 */ 2542 pgfree = page_freecnt(mnode, spp, szc); 2543 2544 /* attempt to trylock if there are sufficient already free pages */ 2545 if (pgfree < pgcnt/ptcpthreshold) { 2546 VM_STAT_ADD(vmm_vmstats.ptcpfreethresh[szc]); 2547 return (0); 2548 } 2549 2550 skipptcpcheck: 2551 2552 for (i = 0; i < pgcnt; i++) { 2553 pp = &spp[i]; 2554 if (!page_trylock(pp, SE_EXCL)) { 2555 VM_STAT_ADD(vmm_vmstats.ptcpfailexcl[szc]); 2556 while (--i != (pgcnt_t)-1) { 2557 pp = &spp[i]; 2558 ASSERT(PAGE_EXCL(pp)); 2559 page_unlock(pp); 2560 } 2561 return (0); 2562 } 2563 ASSERT(spp[i].p_pagenum == spp->p_pagenum + i); 2564 if ((pp->p_szc > szc || (szc && pp->p_szc == szc)) && 2565 !PP_ISFREE(pp)) { 2566 VM_STAT_ADD(vmm_vmstats.ptcpfailszc[szc]); 2567 ASSERT(i == 0); 2568 page_unlock(pp); 2569 return (0); 2570 } 2571 if (PP_ISNORELOC(pp)) { 2572 VM_STAT_ADD(vmm_vmstats.ptcpfailcage[szc]); 2573 while (i != (pgcnt_t)-1) { 2574 pp = &spp[i]; 2575 ASSERT(PAGE_EXCL(pp)); 2576 page_unlock(pp); 2577 i--; 2578 } 2579 return (0); 2580 } 2581 } 2582 VM_STAT_ADD(vmm_vmstats.ptcpok[szc]); 2583 return (1); 2584 } 2585 2586 /* 2587 * Claim large page pointed to by 'pp'. 'pp' is the starting set 2588 * of 'szc' constituent pages that had been locked exclusively previously. 2589 * Will attempt to relocate constituent pages in use. 
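* Free constituent pages are simply pulled off their free or cache
* lists; in-use pages are given a replacement via
* page_get_replacement_page() and moved with do_page_relocate().  If
* a replacement cannot be found or relocation fails, everything
* claimed so far is returned to the PAGESIZE freelists and NULL is
* returned.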
2590 */ 2591 static page_t * 2592 page_claim_contig_pages(page_t *pp, uchar_t szc, int flags) 2593 { 2594 spgcnt_t pgcnt, npgs, i; 2595 page_t *targpp, *rpp, *hpp; 2596 page_t *replpp = NULL; 2597 page_t *pplist = NULL; 2598 2599 ASSERT(pp != NULL); 2600 2601 pgcnt = page_get_pagecnt(szc); 2602 while (pgcnt) { 2603 ASSERT(PAGE_EXCL(pp)); 2604 ASSERT(!PP_ISNORELOC(pp)); 2605 if (PP_ISFREE(pp)) { 2606 /* 2607 * If this is a PG_FREE_LIST page then its 2608 * size code can change underneath us due to 2609 * page promotion or demotion. As an optimzation 2610 * use page_list_sub_pages() instead of 2611 * page_list_sub(). 2612 */ 2613 if (PP_ISAGED(pp)) { 2614 page_list_sub_pages(pp, szc); 2615 if (pp->p_szc == szc) { 2616 return (pp); 2617 } 2618 ASSERT(pp->p_szc < szc); 2619 npgs = page_get_pagecnt(pp->p_szc); 2620 hpp = pp; 2621 for (i = 0; i < npgs; i++, pp++) { 2622 pp->p_szc = szc; 2623 } 2624 page_list_concat(&pplist, &hpp); 2625 pgcnt -= npgs; 2626 continue; 2627 } 2628 ASSERT(!PP_ISAGED(pp)); 2629 ASSERT(pp->p_szc == 0); 2630 page_list_sub(pp, PG_CACHE_LIST); 2631 page_hashout(pp, NULL); 2632 PP_SETAGED(pp); 2633 pp->p_szc = szc; 2634 page_list_concat(&pplist, &pp); 2635 pp++; 2636 pgcnt--; 2637 continue; 2638 } 2639 npgs = page_get_pagecnt(pp->p_szc); 2640 2641 /* 2642 * page_create_wait freemem accounting done by caller of 2643 * page_get_freelist and not necessary to call it prior to 2644 * calling page_get_replacement_page. 2645 * 2646 * page_get_replacement_page can call page_get_contig_pages 2647 * to acquire a large page (szc > 0); the replacement must be 2648 * smaller than the contig page size to avoid looping or 2649 * szc == 0 and PGI_PGCPSZC0 is set. 2650 */ 2651 if (pp->p_szc < szc || (szc == 0 && (flags & PGI_PGCPSZC0))) { 2652 replpp = page_get_replacement_page(pp, NULL, 0); 2653 if (replpp) { 2654 npgs = page_get_pagecnt(pp->p_szc); 2655 ASSERT(npgs <= pgcnt); 2656 targpp = pp; 2657 } 2658 } 2659 2660 /* 2661 * If replacement is NULL or do_page_relocate fails, fail 2662 * coalescing of pages. 2663 */ 2664 if (replpp == NULL || (do_page_relocate(&targpp, &replpp, 0, 2665 &npgs, NULL) != 0)) { 2666 /* 2667 * Unlock un-processed target list 2668 */ 2669 while (pgcnt--) { 2670 ASSERT(PAGE_EXCL(pp)); 2671 page_unlock(pp); 2672 pp++; 2673 } 2674 /* 2675 * Free the processed target list. 
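* Pages already gathered onto pplist are reset to PAGESIZE (p_szc 0),
* placed back on the free list and unlocked; any replacement pages
* that were allocated but not consumed are released through
* page_free_replacement_page().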
2676 */ 2677 while (pplist) { 2678 pp = pplist; 2679 page_sub(&pplist, pp); 2680 ASSERT(PAGE_EXCL(pp)); 2681 ASSERT(pp->p_szc == szc); 2682 ASSERT(PP_ISFREE(pp)); 2683 ASSERT(PP_ISAGED(pp)); 2684 pp->p_szc = 0; 2685 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL); 2686 page_unlock(pp); 2687 } 2688 2689 if (replpp != NULL) 2690 page_free_replacement_page(replpp); 2691 2692 return (NULL); 2693 } 2694 ASSERT(pp == targpp); 2695 2696 /* LINTED */ 2697 ASSERT(hpp = pp); /* That's right, it's an assignment */ 2698 2699 pp += npgs; 2700 pgcnt -= npgs; 2701 2702 while (npgs--) { 2703 ASSERT(PAGE_EXCL(targpp)); 2704 ASSERT(!PP_ISFREE(targpp)); 2705 ASSERT(!PP_ISNORELOC(targpp)); 2706 PP_SETFREE(targpp); 2707 ASSERT(PP_ISAGED(targpp)); 2708 ASSERT(targpp->p_szc < szc || (szc == 0 && 2709 (flags & PGI_PGCPSZC0))); 2710 targpp->p_szc = szc; 2711 targpp = targpp->p_next; 2712 2713 rpp = replpp; 2714 ASSERT(rpp != NULL); 2715 page_sub(&replpp, rpp); 2716 ASSERT(PAGE_EXCL(rpp)); 2717 ASSERT(!PP_ISFREE(rpp)); 2718 page_unlock(rpp); 2719 } 2720 ASSERT(targpp == hpp); 2721 ASSERT(replpp == NULL); 2722 page_list_concat(&pplist, &targpp); 2723 } 2724 CHK_LPG(pplist, szc); 2725 return (pplist); 2726 } 2727 2728 /* 2729 * Trim kernel cage from pfnlo-pfnhi and store result in lo-hi. Return code 2730 * of 0 means nothing left after trim. 2731 */ 2732 2733 int 2734 trimkcage(struct memseg *mseg, pfn_t *lo, pfn_t *hi, pfn_t pfnlo, pfn_t pfnhi) 2735 { 2736 pfn_t kcagepfn; 2737 int decr; 2738 int rc = 0; 2739 2740 if (PP_ISNORELOC(mseg->pages)) { 2741 if (PP_ISNORELOC(mseg->epages - 1) == 0) { 2742 2743 /* lower part of this mseg inside kernel cage */ 2744 decr = kcage_current_pfn(&kcagepfn); 2745 2746 /* kernel cage may have transitioned past mseg */ 2747 if (kcagepfn >= mseg->pages_base && 2748 kcagepfn < mseg->pages_end) { 2749 ASSERT(decr == 0); 2750 *lo = kcagepfn; 2751 *hi = MIN(pfnhi, 2752 (mseg->pages_end - 1)); 2753 rc = 1; 2754 } 2755 } 2756 /* else entire mseg in the cage */ 2757 } else { 2758 if (PP_ISNORELOC(mseg->epages - 1)) { 2759 2760 /* upper part of this mseg inside kernel cage */ 2761 decr = kcage_current_pfn(&kcagepfn); 2762 2763 /* kernel cage may have transitioned past mseg */ 2764 if (kcagepfn >= mseg->pages_base && 2765 kcagepfn < mseg->pages_end) { 2766 ASSERT(decr); 2767 *hi = kcagepfn; 2768 *lo = MAX(pfnlo, mseg->pages_base); 2769 rc = 1; 2770 } 2771 } else { 2772 /* entire mseg outside of kernel cage */ 2773 *lo = MAX(pfnlo, mseg->pages_base); 2774 *hi = MIN(pfnhi, (mseg->pages_end - 1)); 2775 rc = 1; 2776 } 2777 } 2778 return (rc); 2779 } 2780 2781 /* 2782 * called from page_get_contig_pages to search 'pfnlo' thru 'pfnhi' to "claim" a 2783 * page with size code 'szc'. Claiming such a page requires acquiring 2784 * exclusive locks on all constituent pages (page_trylock_contig_pages), 2785 * relocating pages in use and concatenating these constituent pages into a 2786 * large page. 2787 * 2788 * The page lists do not have such a large page and page_freelist_fill has 2789 * already failed to demote larger pages and/or coalesce smaller free pages. 2790 * 2791 * 'flags' may specify PG_COLOR_MATCH which would limit the search of large 2792 * pages with the same color as 'bin'. 2793 * 2794 * 'pfnflag' specifies the subset of the pfn range to search. 
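* The routine walks the memseg list, trims each memseg against
* [pfnlo, pfnhi] and the kernel cage, aligns the remaining range to
* szc boundaries, and then probes candidate large-page frames
* starting at a randomly chosen offset, wrapping around until the
* starting point is reached again.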
2795 */ 2796 2797 2798 static page_t * 2799 page_geti_contig_pages(int mnode, uint_t bin, uchar_t szc, int flags, 2800 pfn_t pfnlo, pfn_t pfnhi, int pfnflag) 2801 { 2802 struct memseg *mseg; 2803 pgcnt_t szcpgcnt = page_get_pagecnt(szc); 2804 pgcnt_t szcpgmask = szcpgcnt - 1; 2805 pfn_t randpfn; 2806 page_t *pp, *randpp, *endpp; 2807 uint_t colors; 2808 pfn_t hi, lo; 2809 uint_t skip; 2810 2811 ASSERT(szc != 0 || (flags & PGI_PGCPSZC0)); 2812 2813 if ((pfnhi - pfnlo) + 1 < szcpgcnt) 2814 return (NULL); 2815 2816 ASSERT(szc < mmu_page_sizes); 2817 2818 colors = (szc) ? page_convert_color(0, szc, page_colors - 1) + 1 : 2819 page_colors; 2820 2821 ASSERT(bin < colors); 2822 2823 /* 2824 * trim the pfn range to search based on pfnflag. pfnflag is set 2825 * when there have been previous page_get_contig_page failures to 2826 * limit the search. 2827 * 2828 * The high bit in pfnflag specifies the number of 'slots' in the 2829 * pfn range and the remainder of pfnflag specifies which slot. 2830 * For example, a value of 1010b would mean the second slot of 2831 * the pfn range that has been divided into 8 slots. 2832 */ 2833 if (pfnflag > 1) { 2834 int slots = 1 << (highbit(pfnflag) - 1); 2835 int slotid = pfnflag & (slots - 1); 2836 pgcnt_t szcpages; 2837 int slotlen; 2838 2839 pfnlo = P2ROUNDUP(pfnlo, szcpgcnt); 2840 pfnhi = pfnhi & ~(szcpgcnt - 1); 2841 2842 szcpages = ((pfnhi - pfnlo) + 1) / szcpgcnt; 2843 slotlen = howmany(szcpages, slots); 2844 pfnlo = pfnlo + (((slotid * slotlen) % szcpages) * szcpgcnt); 2845 ASSERT(pfnlo < pfnhi); 2846 if (pfnhi > pfnlo + (slotlen * szcpgcnt)) 2847 pfnhi = pfnlo + (slotlen * szcpgcnt); 2848 } 2849 2850 memsegs_lock(0); 2851 2852 /* 2853 * loop through memsegs to look for contig page candidates 2854 */ 2855 2856 for (mseg = memsegs; mseg != NULL; mseg = mseg->next) { 2857 if (pfnhi < mseg->pages_base || pfnlo >= mseg->pages_end) { 2858 /* no overlap */ 2859 continue; 2860 } 2861 2862 if (mseg->pages_end - mseg->pages_base < szcpgcnt) 2863 /* mseg too small */ 2864 continue; 2865 2866 /* trim off kernel cage pages from pfn range */ 2867 if (kcage_on) { 2868 if (trimkcage(mseg, &lo, &hi, pfnlo, pfnhi) == 0) 2869 continue; 2870 } else { 2871 lo = MAX(pfnlo, mseg->pages_base); 2872 hi = MIN(pfnhi, (mseg->pages_end - 1)); 2873 } 2874 2875 /* round to szcpgcnt boundaries */ 2876 lo = P2ROUNDUP(lo, szcpgcnt); 2877 hi = hi & ~(szcpgcnt - 1); 2878 2879 if (hi <= lo) 2880 continue; 2881 2882 /* 2883 * set lo to point to the pfn for the desired bin. Large 2884 * page sizes may only have a single page color 2885 */ 2886 if ((colors > 1) && (flags & PG_MATCH_COLOR)) { 2887 uint_t lobin; 2888 2889 /* 2890 * factor in colorequiv to check additional 2891 * 'equivalent' bins. 2892 */ 2893 if (colorequiv > 1 && colors > colorequiv) 2894 colors = colors / colorequiv; 2895 2896 /* determine bin that lo currently points to */ 2897 lobin = (lo & ((szcpgcnt * colors) - 1)) / szcpgcnt; 2898 2899 /* 2900 * set lo to point at appropriate color and set skip 2901 * to arrive at the next szc page of the same color. 
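* Physically contiguous szc-sized frames cycle through the page
* colors in order, so advancing lo by (bin - lobin) szc pages lands
* on the requested color, and stepping by colors * szcpgcnt pfns
* stays on it.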
2902 */ 2903 lo += ((bin - lobin) & (colors - 1)) * szcpgcnt; 2904 2905 skip = colors * szcpgcnt; 2906 } else { 2907 /* check all pages starting from lo */ 2908 skip = szcpgcnt; 2909 } 2910 if (hi <= lo) 2911 /* mseg cannot satisfy color request */ 2912 continue; 2913 2914 /* randomly choose a point between lo and hi to begin search */ 2915 2916 randpfn = (pfn_t)GETTICK(); 2917 randpfn = ((randpfn % (hi - lo)) + lo) & ~(skip - 1); 2918 randpp = mseg->pages + (randpfn - mseg->pages_base); 2919 2920 ASSERT(randpp->p_pagenum == randpfn); 2921 2922 pp = randpp; 2923 endpp = mseg->pages + (hi - mseg->pages_base); 2924 2925 ASSERT(randpp + szcpgcnt <= endpp); 2926 2927 do { 2928 ASSERT(!(pp->p_pagenum & szcpgmask)); 2929 ASSERT((flags & PG_MATCH_COLOR) == 0 || 2930 colorequiv > 1 || 2931 PP_2_BIN(pp) == bin); 2932 if (page_trylock_contig_pages(mnode, pp, szc, flags)) { 2933 /* pages unlocked by page_claim on failure */ 2934 if (page_claim_contig_pages(pp, szc, flags)) { 2935 memsegs_unlock(0); 2936 return (pp); 2937 } 2938 } 2939 2940 pp += skip; 2941 if (pp >= endpp) { 2942 /* start from the beginning */ 2943 pp = mseg->pages + (lo - mseg->pages_base); 2944 ASSERT(pp->p_pagenum == lo); 2945 ASSERT(pp + szcpgcnt <= endpp); 2946 } 2947 } while (pp != randpp); 2948 } 2949 memsegs_unlock(0); 2950 return (NULL); 2951 } 2952 2953 2954 /* 2955 * controlling routine that searches through physical memory in an attempt to 2956 * claim a large page based on the input parameters when such a page is not 2957 * already available on the page free lists. 2958 * 2959 * calls page_geti_contig_pages with an initial pfn range from the mnode 2960 * and mtype. page_geti_contig_pages will trim off the parts of the pfn range 2961 * that overlap with the kernel cage or do not match the requested page 2962 * color if PG_MATCH_COLOR is set. Since this search is very expensive, 2963 * page_geti_contig_pages may further limit the search range based on 2964 * previous failure counts (pgcpfailcnt[]). 2965 * 2966 * for PGI_PGCPSZC0 requests, page_get_contig_pages will relocate a base 2967 * pagesize page that satisfies mtype.
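* The search effort is adaptive: each failure causes the caller to
* bump pgcpfailcnt[szc], which narrows the slice of the pfn range
* searched next time (see the pfnflag handling above), while each
* success halves pgcpfailcnt[szc] to widen the slice again.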
2968 */ 2969 page_t * 2970 page_get_contig_pages(int mnode, uint_t bin, int mtype, uchar_t szc, 2971 uint_t flags) 2972 { 2973 pfn_t pfnlo, pfnhi; /* contig pages pfn range */ 2974 page_t *pp; 2975 int pfnflag = 0; /* no limit on search if 0 */ 2976 2977 VM_STAT_ADD(vmm_vmstats.pgcp_alloc[szc]); 2978 2979 /* LINTED */ 2980 MTYPE_START(mnode, mtype, flags); 2981 if (mtype < 0) { /* mnode does not have memory in mtype range */ 2982 VM_STAT_ADD(vmm_vmstats.pgcp_allocempty[szc]); 2983 return (NULL); 2984 } 2985 2986 ASSERT(szc > 0 || (flags & PGI_PGCPSZC0)); 2987 2988 /* no allocations from cage */ 2989 flags |= PGI_NOCAGE; 2990 2991 /* do not limit search and ignore color if hi pri */ 2992 2993 if (pgcplimitsearch && ((flags & PGI_PGCPHIPRI) == 0)) 2994 pfnflag = pgcpfailcnt[szc]; 2995 2996 /* remove color match to improve chances */ 2997 2998 if (flags & PGI_PGCPHIPRI || pfnflag) 2999 flags &= ~PG_MATCH_COLOR; 3000 3001 do { 3002 /* get pfn range based on mnode and mtype */ 3003 MNODETYPE_2_PFN(mnode, mtype, pfnlo, pfnhi); 3004 3005 ASSERT(pfnhi >= pfnlo); 3006 3007 pp = page_geti_contig_pages(mnode, bin, szc, flags, 3008 pfnlo, pfnhi, pfnflag); 3009 3010 if (pp != NULL) { 3011 pfnflag = pgcpfailcnt[szc]; 3012 if (pfnflag) { 3013 /* double the search size */ 3014 pgcpfailcnt[szc] = pfnflag >> 1; 3015 } 3016 VM_STAT_ADD(vmm_vmstats.pgcp_allocok[szc]); 3017 return (pp); 3018 } 3019 MTYPE_NEXT(mnode, mtype, flags); 3020 } while (mtype >= 0); 3021 3022 VM_STAT_ADD(vmm_vmstats.pgcp_allocfailed[szc]); 3023 return (NULL); 3024 } 3025 3026 3027 /* 3028 * Find the `best' page on the freelist for this (vp,off) (as,vaddr) pair. 3029 * 3030 * Does its own locking and accounting. 3031 * If PG_MATCH_COLOR is set, then NULL will be returned if there are no 3032 * pages of the proper color even if there are pages of a different color. 3033 * 3034 * Finds a page, removes it, THEN locks it. 3035 */ 3036 3037 /*ARGSUSED*/ 3038 page_t * 3039 page_get_freelist(struct vnode *vp, u_offset_t off, struct seg *seg, 3040 caddr_t vaddr, size_t size, uint_t flags, struct lgrp *lgrp) 3041 { 3042 struct as *as = seg->s_as; 3043 page_t *pp = NULL; 3044 ulong_t bin; 3045 uchar_t szc; 3046 int mnode; 3047 int mtype; 3048 page_t *(*page_get_func)(int, uint_t, int, uchar_t, uint_t); 3049 lgrp_mnode_cookie_t lgrp_cookie; 3050 3051 page_get_func = page_get_mnode_freelist; 3052 3053 /* 3054 * If we aren't passed a specific lgroup, or passed a freed lgrp 3055 * assume we wish to allocate near to the current thread's home. 3056 */ 3057 if (!LGRP_EXISTS(lgrp)) 3058 lgrp = lgrp_home_lgrp(); 3059 3060 if (kcage_on) { 3061 if ((flags & (PG_NORELOC | PG_PANIC)) == PG_NORELOC && 3062 kcage_freemem < kcage_throttlefree + btop(size) && 3063 curthread != kcage_cageout_thread) { 3064 /* 3065 * Set a "reserve" of kcage_throttlefree pages for 3066 * PG_PANIC and cageout thread allocations. 3067 * 3068 * Everybody else has to serialize in 3069 * page_create_get_something() to get a cage page, so 3070 * that we don't deadlock cageout! 3071 */ 3072 return (NULL); 3073 } 3074 } else { 3075 flags &= ~PG_NORELOC; 3076 flags |= PGI_NOCAGE; 3077 } 3078 3079 /* LINTED */ 3080 MTYPE_INIT(mtype, vp, vaddr, flags); 3081 3082 /* 3083 * Convert size to page size code. 
3084 */ 3085 if ((szc = page_szc(size)) == (uchar_t)-1) 3086 panic("page_get_freelist: illegal page size request"); 3087 ASSERT(szc < mmu_page_sizes); 3088 3089 VM_STAT_ADD(vmm_vmstats.pgf_alloc[szc]); 3090 3091 /* LINTED */ 3092 AS_2_BIN(as, seg, vp, vaddr, bin); 3093 3094 /* bin is for base pagesize color - convert if larger pagesize. */ 3095 if (szc) 3096 bin = page_convert_color(0, szc, bin); 3097 3098 /* 3099 * Try to get a local page first, but try remote if we can't 3100 * get a page of the right color. 3101 */ 3102 pgretry: 3103 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL); 3104 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { 3105 pp = page_get_func(mnode, bin, mtype, szc, flags); 3106 if (pp != NULL) { 3107 VM_STAT_ADD(vmm_vmstats.pgf_allocok[szc]); 3108 DTRACE_PROBE4(page__get, 3109 lgrp_t *, lgrp, 3110 int, mnode, 3111 ulong_t, bin, 3112 uint_t, flags); 3113 return (pp); 3114 } 3115 } 3116 ASSERT(pp == NULL); 3117 3118 /* 3119 * for non-SZC0 PAGESIZE requests, check cachelist before checking 3120 * remote free lists. Caller expected to call page_get_cachelist which 3121 * will check local cache lists and remote free lists. 3122 */ 3123 if (szc == 0 && ((flags & PGI_PGCPSZC0) == 0)) { 3124 VM_STAT_ADD(vmm_vmstats.pgf_allocdeferred); 3125 return (NULL); 3126 } 3127 3128 ASSERT(szc > 0 || (flags & PGI_PGCPSZC0)); 3129 3130 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1); 3131 3132 /* 3133 * Try to get a non-local freelist page. 3134 */ 3135 LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie); 3136 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { 3137 pp = page_get_func(mnode, bin, mtype, szc, flags); 3138 if (pp != NULL) { 3139 DTRACE_PROBE4(page__get, 3140 lgrp_t *, lgrp, 3141 int, mnode, 3142 ulong_t, bin, 3143 uint_t, flags); 3144 VM_STAT_ADD(vmm_vmstats.pgf_allocokrem[szc]); 3145 return (pp); 3146 } 3147 } 3148 3149 ASSERT(pp == NULL); 3150 3151 /* 3152 * when the cage is off chances are page_get_contig_pages() will fail 3153 * to lock a large page chunk therefore when the cage is off it's not 3154 * called by default. this can be changed via /etc/system. 3155 * 3156 * page_get_contig_pages() also called to acquire a base pagesize page 3157 * for page_create_get_something(). 3158 */ 3159 if (!(flags & PG_NORELOC) && (pg_contig_disable == 0) && 3160 (kcage_on || pg_lpgcreate_nocage || szc == 0) && 3161 (page_get_func != page_get_contig_pages)) { 3162 3163 VM_STAT_ADD(vmm_vmstats.pgf_allocretry[szc]); 3164 page_get_func = page_get_contig_pages; 3165 goto pgretry; 3166 } 3167 3168 if (pgcplimitsearch && page_get_func == page_get_contig_pages) 3169 pgcpfailcnt[szc]++; 3170 3171 VM_STAT_ADD(vmm_vmstats.pgf_allocfailed[szc]); 3172 return (NULL); 3173 } 3174 3175 /* 3176 * Find the `best' page on the cachelist for this (vp,off) (as,vaddr) pair. 3177 * 3178 * Does its own locking. 3179 * If PG_MATCH_COLOR is set, then NULL will be returned if there are no 3180 * pages of the proper color even if there are pages of a different color. 3181 * Otherwise, scan the bins for ones with pages. For each bin with pages, 3182 * try to lock one of them. If no page can be locked, try the 3183 * next bin. Return NULL if a page can not be found and locked. 3184 * 3185 * Finds a pages, trys to lock it, then removes it. 
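* Pages taken from a cachelist still carry their vnode/offset
* identity (PP_ISAGED is clear), so callers wanting an anonymous page
* must page_hashout() them, as page_get_replacement_page() does;
* pages obtained from a freelist instead are already anonymous.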
3186 */ 3187 3188 /*ARGSUSED*/ 3189 page_t * 3190 page_get_cachelist(struct vnode *vp, u_offset_t off, struct seg *seg, 3191 caddr_t vaddr, uint_t flags, struct lgrp *lgrp) 3192 { 3193 page_t *pp; 3194 struct as *as = seg->s_as; 3195 ulong_t bin; 3196 /*LINTED*/ 3197 int mnode; 3198 int mtype; 3199 lgrp_mnode_cookie_t lgrp_cookie; 3200 3201 /* 3202 * If we aren't passed a specific lgroup, or pasased a freed lgrp 3203 * assume we wish to allocate near to the current thread's home. 3204 */ 3205 if (!LGRP_EXISTS(lgrp)) 3206 lgrp = lgrp_home_lgrp(); 3207 3208 if (!kcage_on) { 3209 flags &= ~PG_NORELOC; 3210 flags |= PGI_NOCAGE; 3211 } 3212 3213 if ((flags & (PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == PG_NORELOC && 3214 kcage_freemem <= kcage_throttlefree) { 3215 /* 3216 * Reserve kcage_throttlefree pages for critical kernel 3217 * threads. 3218 * 3219 * Everybody else has to go to page_create_get_something() 3220 * to get a cage page, so we don't deadlock cageout. 3221 */ 3222 return (NULL); 3223 } 3224 3225 /* LINTED */ 3226 AS_2_BIN(as, seg, vp, vaddr, bin); 3227 3228 ASSERT(bin <= page_colors_mask); 3229 3230 /* LINTED */ 3231 MTYPE_INIT(mtype, vp, vaddr, flags); 3232 3233 VM_STAT_ADD(vmm_vmstats.pgc_alloc); 3234 3235 /* 3236 * Try local cachelists first 3237 */ 3238 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL); 3239 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { 3240 pp = page_get_mnode_cachelist(bin, flags, mnode, mtype); 3241 if (pp != NULL) { 3242 VM_STAT_ADD(vmm_vmstats.pgc_allocok); 3243 DTRACE_PROBE4(page__get, 3244 lgrp_t *, lgrp, 3245 int, mnode, 3246 ulong_t, bin, 3247 uint_t, flags); 3248 return (pp); 3249 } 3250 } 3251 3252 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1); 3253 3254 /* 3255 * Try freelists/cachelists that are farther away 3256 * This is our only chance to allocate remote pages for PAGESIZE 3257 * requests. 3258 */ 3259 LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie); 3260 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { 3261 pp = page_get_mnode_freelist(mnode, bin, mtype, 3262 0, flags); 3263 if (pp != NULL) { 3264 VM_STAT_ADD(vmm_vmstats.pgc_allocokdeferred); 3265 DTRACE_PROBE4(page__get, 3266 lgrp_t *, lgrp, 3267 int, mnode, 3268 ulong_t, bin, 3269 uint_t, flags); 3270 return (pp); 3271 } 3272 pp = page_get_mnode_cachelist(bin, flags, mnode, mtype); 3273 if (pp != NULL) { 3274 VM_STAT_ADD(vmm_vmstats.pgc_allocokrem); 3275 DTRACE_PROBE4(page__get, 3276 lgrp_t *, lgrp, 3277 int, mnode, 3278 ulong_t, bin, 3279 uint_t, flags); 3280 return (pp); 3281 } 3282 } 3283 3284 VM_STAT_ADD(vmm_vmstats.pgc_allocfailed); 3285 return (NULL); 3286 } 3287 3288 page_t * 3289 page_get_mnode_cachelist(uint_t bin, uint_t flags, int mnode, int mtype) 3290 { 3291 kmutex_t *pcm; 3292 int i; 3293 page_t *pp; 3294 page_t *first_pp; 3295 uint_t bin_marker; 3296 int nwaybins, nwaycnt; 3297 int cpucolors; 3298 3299 VM_STAT_ADD(vmm_vmstats.pgmc_alloc); 3300 3301 /* LINTED */ 3302 MTYPE_START(mnode, mtype, flags); 3303 if (mtype < 0) { /* mnode does not have memory in mtype range */ 3304 VM_STAT_ADD(vmm_vmstats.pgmc_allocempty); 3305 return (NULL); 3306 } 3307 3308 nwaybins = 0; 3309 cpucolors = cpu_page_colors; 3310 /* 3311 * adjust cpucolors to possibly check additional 'equivalent' bins 3312 * to try to minimize fragmentation of large pages by delaying calls 3313 * to page_freelist_fill. 
3314 */ 3315 if (colorequiv > 1) { 3316 int equivcolors = page_colors / colorequiv; 3317 3318 if (equivcolors && (cpucolors == 0 || equivcolors < cpucolors)) 3319 cpucolors = equivcolors; 3320 } 3321 3322 /* 3323 * Only hold one cachelist lock at a time, that way we 3324 * can start anywhere and not have to worry about lock 3325 * ordering. 3326 */ 3327 3328 big_try_again: 3329 nwaycnt = 0; 3330 for (i = 0; i <= page_colors; i++) { 3331 if (PAGE_CACHELISTS(mnode, bin, mtype)) { 3332 pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST); 3333 mutex_enter(pcm); 3334 pp = PAGE_CACHELISTS(mnode, bin, mtype); 3335 if (pp != NULL) { 3336 first_pp = pp; 3337 ASSERT(pp->p_vnode); 3338 ASSERT(PP_ISAGED(pp) == 0); 3339 ASSERT(pp->p_szc == 0); 3340 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode); 3341 while (!page_trylock(pp, SE_EXCL)) { 3342 pp = pp->p_next; 3343 ASSERT(pp->p_szc == 0); 3344 if (pp == first_pp) { 3345 /* 3346 * We have searched the 3347 * complete list! 3348 * And all of them (might 3349 * only be one) are locked. 3350 * This can happen since 3351 * these pages can also be 3352 * found via the hash list. 3353 * When found via the hash 3354 * list, they are locked 3355 * first, then removed. 3356 * We give up to let the 3357 * other thread run. 3358 */ 3359 pp = NULL; 3360 break; 3361 } 3362 ASSERT(pp->p_vnode); 3363 ASSERT(PP_ISFREE(pp)); 3364 ASSERT(PP_ISAGED(pp) == 0); 3365 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == 3366 mnode); 3367 } 3368 3369 if (pp) { 3370 page_t **ppp; 3371 /* 3372 * Found and locked a page. 3373 * Pull it off the list. 3374 */ 3375 ASSERT(mtype == PP_2_MTYPE(pp)); 3376 ppp = &PAGE_CACHELISTS(mnode, bin, 3377 mtype); 3378 page_sub(ppp, pp); 3379 /* 3380 * Subtract counters before releasing 3381 * pcm mutex to avoid a race with 3382 * page_freelist_coalesce and 3383 * page_freelist_fill. 3384 */ 3385 page_ctr_sub(mnode, mtype, pp, 3386 PG_CACHE_LIST); 3387 mutex_exit(pcm); 3388 ASSERT(pp->p_vnode); 3389 ASSERT(PP_ISAGED(pp) == 0); 3390 #if defined(__sparc) 3391 ASSERT(!kcage_on || 3392 (flags & PG_NORELOC) == 0 || 3393 PP_ISNORELOC(pp)); 3394 if (PP_ISNORELOC(pp)) { 3395 kcage_freemem_sub(1); 3396 } 3397 #endif 3398 VM_STAT_ADD(vmm_vmstats. 3399 pgmc_allocok); 3400 return (pp); 3401 } 3402 } 3403 mutex_exit(pcm); 3404 } 3405 3406 /* 3407 * Wow! The initial bin is empty or no page in the bin could 3408 * be locked. 3409 * 3410 * If specific color is needed, check if page color may be in 3411 * other bins. 
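* cpu_page_colors being non-zero means a page color may live in more
* than one bin; step the bin by page_colors / nwaybins to visit each
* equivalent bin before giving up on a strict color match.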
3412 */ 3413 if ((flags & PG_MATCH_COLOR) && (cpucolors != 0)) { 3414 if (!nwaybins) { 3415 if (cpucolors < 0) { 3416 cpucolors = CPUSETSIZE() / MMU_PAGESIZE; 3417 ASSERT(cpucolors > 0); 3418 nwaybins = page_colors / cpucolors; 3419 if (nwaybins < 2) 3420 cpucolors = 0; 3421 } else { 3422 nwaybins = page_colors / cpucolors; 3423 ASSERT(nwaybins > 1); 3424 } 3425 } 3426 3427 if (++nwaycnt >= nwaybins) { 3428 break; 3429 } 3430 bin = (bin + (page_colors / nwaybins)) & 3431 page_colors_mask; 3432 continue; 3433 } 3434 3435 if (i == 0) { 3436 bin = (bin + BIN_STEP) & page_colors_mask; 3437 bin_marker = bin; 3438 } else { 3439 bin = (bin + vac_colors) & page_colors_mask; 3440 if (bin == bin_marker) { 3441 bin = (bin + 1) & page_colors_mask; 3442 bin_marker = bin; 3443 } 3444 } 3445 } 3446 3447 MTYPE_NEXT(mnode, mtype, flags); 3448 if (mtype >= 0) 3449 goto big_try_again; 3450 3451 VM_STAT_ADD(vmm_vmstats.pgmc_allocfailed); 3452 return (NULL); 3453 } 3454 3455 #ifdef DEBUG 3456 #define REPL_PAGE_STATS 3457 #endif /* DEBUG */ 3458 3459 #ifdef REPL_PAGE_STATS 3460 struct repl_page_stats { 3461 uint_t ngets; 3462 uint_t ngets_noreloc; 3463 uint_t npgr_noreloc; 3464 uint_t nnopage_first; 3465 uint_t nnopage; 3466 uint_t nhashout; 3467 uint_t nnofree; 3468 uint_t nnext_pp; 3469 } repl_page_stats; 3470 #define REPL_STAT_INCR(v) atomic_add_32(&repl_page_stats.v, 1) 3471 #else /* REPL_PAGE_STATS */ 3472 #define REPL_STAT_INCR(v) 3473 #endif /* REPL_PAGE_STATS */ 3474 3475 int pgrppgcp; 3476 3477 /* 3478 * The freemem accounting must be done by the caller. 3479 * First we try to get a replacement page of the same size as like_pp, 3480 * if that is not possible, then we just get a set of discontiguous 3481 * PAGESIZE pages. 3482 */ 3483 page_t * 3484 page_get_replacement_page(page_t *orig_like_pp, struct lgrp *lgrp_target, 3485 uint_t pgrflags) 3486 { 3487 page_t *like_pp; 3488 page_t *pp, *pplist; 3489 page_t *pl = NULL; 3490 ulong_t bin; 3491 int mnode, page_mnode; 3492 int szc; 3493 spgcnt_t npgs, pg_cnt; 3494 pfn_t pfnum; 3495 int mtype; 3496 int flags = 0; 3497 lgrp_mnode_cookie_t lgrp_cookie; 3498 lgrp_t *lgrp; 3499 3500 REPL_STAT_INCR(ngets); 3501 like_pp = orig_like_pp; 3502 ASSERT(PAGE_EXCL(like_pp)); 3503 3504 szc = like_pp->p_szc; 3505 npgs = page_get_pagecnt(szc); 3506 /* 3507 * Now we reset like_pp to the base page_t. 3508 * That way, we won't walk past the end of this 'szc' page. 3509 */ 3510 pfnum = PFN_BASE(like_pp->p_pagenum, szc); 3511 like_pp = page_numtopp_nolock(pfnum); 3512 ASSERT(like_pp->p_szc == szc); 3513 3514 if (PP_ISNORELOC(like_pp)) { 3515 ASSERT(kcage_on); 3516 REPL_STAT_INCR(ngets_noreloc); 3517 flags = PGI_RELOCONLY; 3518 } else if (pgrflags & PGR_NORELOC) { 3519 ASSERT(kcage_on); 3520 REPL_STAT_INCR(npgr_noreloc); 3521 flags = PG_NORELOC; 3522 } 3523 3524 /* 3525 * Kernel pages must always be replaced with the same size 3526 * pages, since we cannot properly handle demotion of kernel 3527 * pages. 3528 */ 3529 if (like_pp->p_vnode == &kvp) 3530 pgrflags |= PGR_SAMESZC; 3531 3532 /* LINTED */ 3533 MTYPE_PGR_INIT(mtype, flags, like_pp, page_mnode); 3534 3535 while (npgs) { 3536 pplist = NULL; 3537 for (;;) { 3538 pg_cnt = page_get_pagecnt(szc); 3539 bin = PP_2_BIN(like_pp); 3540 ASSERT(like_pp->p_szc == orig_like_pp->p_szc); 3541 ASSERT(pg_cnt <= npgs); 3542 3543 /* 3544 * If an lgroup was specified, try to get the 3545 * page from that lgroup. 
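* If the specified lgroup cannot supply the page, the request fails
* here rather than falling back to other lgroups.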
3546 * NOTE: Must be careful with code below because 3547 * lgroup may disappear and reappear since there 3548 * is no locking for lgroup here. 3549 */ 3550 if (LGRP_EXISTS(lgrp_target)) { 3551 /* 3552 * Keep local variable for lgroup separate 3553 * from lgroup argument since this code should 3554 * only be exercised when lgroup argument 3555 * exists.... 3556 */ 3557 lgrp = lgrp_target; 3558 3559 /* Try the lgroup's freelists first */ 3560 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, 3561 LGRP_SRCH_LOCAL); 3562 while ((pplist == NULL) && 3563 (mnode = lgrp_memnode_choose(&lgrp_cookie)) 3564 != -1) { 3565 pplist = page_get_mnode_freelist( 3566 mnode, bin, mtype, szc, 3567 flags); 3568 } 3569 3570 /* 3571 * Now try it's cachelists if this is a 3572 * small page. Don't need to do it for 3573 * larger ones since page_freelist_coalesce() 3574 * already failed. 3575 */ 3576 if (pplist != NULL || szc != 0) 3577 break; 3578 3579 /* Now try it's cachelists */ 3580 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, 3581 LGRP_SRCH_LOCAL); 3582 3583 while ((pplist == NULL) && 3584 (mnode = lgrp_memnode_choose(&lgrp_cookie)) 3585 != -1) { 3586 pplist = page_get_mnode_cachelist( 3587 bin, flags, mnode, mtype); 3588 } 3589 if (pplist != NULL) { 3590 page_hashout(pplist, NULL); 3591 PP_SETAGED(pplist); 3592 REPL_STAT_INCR(nhashout); 3593 break; 3594 } 3595 /* Done looking in this lgroup. Bail out. */ 3596 break; 3597 } 3598 3599 /* 3600 * No lgroup was specified (or lgroup was removed by 3601 * DR, so just try to get the page as close to 3602 * like_pp's mnode as possible. 3603 * First try the local freelist... 3604 */ 3605 mnode = PP_2_MEM_NODE(like_pp); 3606 pplist = page_get_mnode_freelist(mnode, bin, 3607 mtype, szc, flags); 3608 if (pplist != NULL) 3609 break; 3610 3611 REPL_STAT_INCR(nnofree); 3612 3613 /* 3614 * ...then the local cachelist. Don't need to do it for 3615 * larger pages cause page_freelist_coalesce() already 3616 * failed there anyway. 3617 */ 3618 if (szc == 0) { 3619 pplist = page_get_mnode_cachelist(bin, flags, 3620 mnode, mtype); 3621 if (pplist != NULL) { 3622 page_hashout(pplist, NULL); 3623 PP_SETAGED(pplist); 3624 REPL_STAT_INCR(nhashout); 3625 break; 3626 } 3627 } 3628 3629 /* Now try remote freelists */ 3630 page_mnode = mnode; 3631 lgrp = 3632 lgrp_hand_to_lgrp(MEM_NODE_2_LGRPHAND(page_mnode)); 3633 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, 3634 LGRP_SRCH_HIER); 3635 while (pplist == NULL && 3636 (mnode = lgrp_memnode_choose(&lgrp_cookie)) 3637 != -1) { 3638 /* 3639 * Skip local mnode. 3640 */ 3641 if ((mnode == page_mnode) || 3642 (mem_node_config[mnode].exists == 0)) 3643 continue; 3644 3645 pplist = page_get_mnode_freelist(mnode, 3646 bin, mtype, szc, flags); 3647 } 3648 3649 if (pplist != NULL) 3650 break; 3651 3652 3653 /* Now try remote cachelists */ 3654 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, 3655 LGRP_SRCH_HIER); 3656 while (pplist == NULL && szc == 0) { 3657 mnode = lgrp_memnode_choose(&lgrp_cookie); 3658 if (mnode == -1) 3659 break; 3660 /* 3661 * Skip local mnode. 3662 */ 3663 if ((mnode == page_mnode) || 3664 (mem_node_config[mnode].exists == 0)) 3665 continue; 3666 3667 pplist = page_get_mnode_cachelist(bin, 3668 flags, mnode, mtype); 3669 3670 if (pplist != NULL) { 3671 page_hashout(pplist, NULL); 3672 PP_SETAGED(pplist); 3673 REPL_STAT_INCR(nhashout); 3674 break; 3675 } 3676 } 3677 3678 /* 3679 * Break out of while loop under the following cases: 3680 * - If we successfully got a page. 
3681 * - If pgrflags specified only returning a specific 3682 * page size and we could not find that page size. 3683 * - If we could not satisfy the request with PAGESIZE 3684 * or larger pages. 3685 */ 3686 if (pplist != NULL || szc == 0) 3687 break; 3688 3689 if ((pgrflags & PGR_SAMESZC) || pgrppgcp) { 3690 /* try to find contig page */ 3691 3692 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, 3693 LGRP_SRCH_HIER); 3694 3695 while ((pplist == NULL) && 3696 (mnode = 3697 lgrp_memnode_choose(&lgrp_cookie)) 3698 != -1) { 3699 pplist = page_get_contig_pages( 3700 mnode, bin, mtype, szc, 3701 flags | PGI_PGCPHIPRI); 3702 } 3703 break; 3704 } 3705 3706 /* 3707 * The correct thing to do here is try the next 3708 * page size down using szc--. Due to a bug 3709 * with the processing of HAT_RELOAD_SHARE 3710 * where the sfmmu_ttecnt arrays of all 3711 * hats sharing an ISM segment don't get updated, 3712 * using intermediate size pages for relocation 3713 * can lead to continuous page faults. 3714 */ 3715 szc = 0; 3716 } 3717 3718 if (pplist != NULL) { 3719 DTRACE_PROBE4(page__get, 3720 lgrp_t *, lgrp, 3721 int, mnode, 3722 ulong_t, bin, 3723 uint_t, flags); 3724 3725 while (pplist != NULL && pg_cnt--) { 3726 ASSERT(pplist != NULL); 3727 pp = pplist; 3728 page_sub(&pplist, pp); 3729 PP_CLRFREE(pp); 3730 PP_CLRAGED(pp); 3731 page_list_concat(&pl, &pp); 3732 npgs--; 3733 like_pp = like_pp + 1; 3734 REPL_STAT_INCR(nnext_pp); 3735 } 3736 ASSERT(pg_cnt == 0); 3737 } else { 3738 break; 3739 } 3740 } 3741 3742 if (npgs) { 3743 /* 3744 * We were unable to allocate the necessary number 3745 * of pages. 3746 * We need to free up any pl. 3747 */ 3748 REPL_STAT_INCR(nnopage); 3749 page_free_replacement_page(pl); 3750 return (NULL); 3751 } else { 3752 return (pl); 3753 } 3754 } 3755 3756 /* 3757 * demote a free large page to it's constituent pages 3758 */ 3759 void 3760 page_demote_free_pages(page_t *pp) 3761 { 3762 3763 int mnode; 3764 3765 ASSERT(pp != NULL); 3766 ASSERT(PAGE_LOCKED(pp)); 3767 ASSERT(PP_ISFREE(pp)); 3768 ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes); 3769 3770 mnode = PP_2_MEM_NODE(pp); 3771 page_freelist_lock(mnode); 3772 if (pp->p_szc != 0) { 3773 (void) page_demote(mnode, PFN_BASE(pp->p_pagenum, 3774 pp->p_szc), pp->p_szc, 0, PC_NO_COLOR, PC_FREE); 3775 } 3776 page_freelist_unlock(mnode); 3777 ASSERT(pp->p_szc == 0); 3778 } 3779
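/*
 * Usage sketch (hypothetical caller, not taken from this file): a thread
 * holding a locked, free large page that only needs PAGESIZE pages can
 * break it up in place:
 *
 *	if (pp->p_szc != 0 && PP_ISFREE(pp))
 *		page_demote_free_pages(pp);
 *	ASSERT(pp->p_szc == 0);
 *
 * The constituent pages end up back on the PAGESIZE freelists; the page
 * lock held by the caller on pp is not released here.
 */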