1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 28 /* All Rights Reserved */ 29 30 /* 31 * Portions of this source code were derived from Berkeley 4.3 BSD 32 * under license from the Regents of the University of California. 33 */ 34 35 #pragma ident "%Z%%M% %I% %E% SMI" 36 37 /* 38 * This file contains common functions to access and manage the page lists. 39 * Many of these routines originated from platform dependent modules 40 * (sun4/vm/vm_dep.c, i86pc/vm/vm_machdep.c) and modified to function in 41 * a platform independent manner. 42 * 43 * vm/vm_dep.h provides for platform specific support. 44 */ 45 46 #include <sys/types.h> 47 #include <sys/debug.h> 48 #include <sys/cmn_err.h> 49 #include <sys/systm.h> 50 #include <sys/atomic.h> 51 #include <sys/sysmacros.h> 52 #include <vm/as.h> 53 #include <vm/page.h> 54 #include <vm/seg_kmem.h> 55 #include <vm/seg_vn.h> 56 #include <sys/memnode.h> 57 #include <vm/vm_dep.h> 58 #include <sys/lgrp.h> 59 #include <sys/mem_config.h> 60 #include <sys/callb.h> 61 #include <sys/mem_cage.h> 62 #include <sys/sdt.h> 63 64 extern uint_t vac_colors; 65 66 #define MAX_PRAGMA_ALIGN 128 67 68 /* vm_cpu_data0 for the boot cpu before kmem is initialized */ 69 70 #if L2CACHE_ALIGN_MAX <= MAX_PRAGMA_ALIGN 71 #pragma align L2CACHE_ALIGN_MAX(vm_cpu_data0) 72 #else 73 #pragma align MAX_PRAGMA_ALIGN(vm_cpu_data0) 74 #endif 75 char vm_cpu_data0[VM_CPU_DATA_PADSIZE]; 76 77 /* 78 * number of page colors equivalent to reqested color in page_get routines. 79 * If set, keeps large pages intact longer and keeps MPO allocation 80 * from the local mnode in favor of acquiring the 'correct' page color from 81 * a demoted large page or from a remote mnode. 82 */ 83 int colorequiv; 84 85 /* 86 * if set, specifies the percentage of large pages that are free from within 87 * a large page region before attempting to lock those pages for 88 * page_get_contig_pages processing. 89 * 90 * Should be turned on when kpr is available when page_trylock_contig_pages 91 * can be more selective. 92 */ 93 94 int ptcpthreshold; 95 96 /* 97 * Limit page get contig page search based on failure cnts in pgcpfailcnt[]. 98 * Enabled by default via pgcplimitsearch. 99 * 100 * pgcpfailcnt[] is bounded by PGCPFAILMAX (>= 1/2 of installed 101 * memory). When reached, pgcpfailcnt[] is reset to 1/2 of this upper 102 * bound. 
This upper bound range guarantees:
 * - all large page 'slots' will be searched over time
 * - at least one (1) large page candidate is considered on each pgcp call
 * - the count doesn't wrap around to 0
 */
pgcnt_t	pgcpfailcnt[MMU_PAGE_SIZES];
int	pgcplimitsearch = 1;

#define	PGCPFAILMAX	(1 << (highbit(physinstalled) - 1))
#define	SETPGCPFAILCNT(szc)					\
	if (++pgcpfailcnt[szc] >= PGCPFAILMAX)			\
		pgcpfailcnt[szc] = PGCPFAILMAX / 2;

#ifdef VM_STATS
struct vmm_vmstats_str vmm_vmstats;

#endif /* VM_STATS */

#if defined(__sparc)
#define	LPGCREATE	0
#else
/* enable page_get_contig_pages */
#define	LPGCREATE	1
#endif

int pg_contig_disable;
int pg_lpgcreate_nocage = LPGCREATE;

/*
 * page_freelist_fill pfn flag to signify no hi pfn requirement.
 */
#define	PFNNULL		0

/* Flags involved in promotion and demotion routines */
#define	PC_FREE		0x1	/* put page on freelist */
#define	PC_ALLOC	0x2	/* return page for allocation */

/*
 * Flag for page_demote to be used with PC_FREE to denote that we don't care
 * what the color is as the color parameter to the function is ignored.
 */
#define	PC_NO_COLOR	(-1)

/*
 * page counters candidates info
 * See page_ctrs_cands comment below for more details.
 * fields are as follows:
 *	pcc_pages_free:		# pages which freelist coalesce can create
 *	pcc_color_free_len:	number of elements in pcc_color_free array
 *	pcc_color_free:		pointer to page free counts per color
 */
typedef struct pcc_info {
	pgcnt_t	pcc_pages_free;
	int	pcc_color_free_len;
	pgcnt_t	*pcc_color_free;
} pcc_info_t;

/*
 * On big machines it can take a long time to check page_counters
 * arrays. page_ctrs_cands is a summary array whose elements are a dynamically
 * updated sum of all elements of the corresponding page_counters arrays.
 * page_freelist_coalesce() searches page_counters only if an appropriate
 * element of page_ctrs_cands array is greater than 0.
 *
 * An extra dimension is used for page_ctrs_cands to spread the elements
 * over a few e$ cache lines to avoid serialization during the array
 * updates.
 */
#pragma	align 64(page_ctrs_cands)

static pcc_info_t *page_ctrs_cands[NPC_MUTEX][MMU_PAGE_SIZES];

/*
 * Return in val the total number of free pages which can be created
 * for the given mnode (m) and region size (r)
 */
#define	PGCTRS_CANDS_GETVALUE(m, r, val) {				\
	int i;								\
	val = 0;							\
	for (i = 0; i < NPC_MUTEX; i++) {				\
		val += page_ctrs_cands[i][(r)][(m)].pcc_pages_free;	\
	}								\
}

/*
 * Return in val the total number of free pages which can be created
 * for the given mnode (m), region size (r), and color (c)
 */
#define	PGCTRS_CANDS_GETVALUECOLOR(m, r, c, val) {			\
	int i;								\
	val = 0;							\
	ASSERT((c) < page_ctrs_cands[0][(r)][(m)].pcc_color_free_len);	\
	for (i = 0; i < NPC_MUTEX; i++) {				\
		val += page_ctrs_cands[i][(r)][(m)].pcc_color_free[(c)]; \
	}								\
}

/*
 * We can only allow a single thread to update a counter within the physical
 * range of the largest supported page size. That is the finest granularity
 * possible since the counter values are dependent on each other
 * as you move across region sizes. PP_CTR_LOCK_INDX is used to determine the
 * ctr_mutex lock index for a particular physical range.
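 *
 * Illustrative example (hypothetical values, not taken from this file):
 * with NPC_MUTEX == 4 and PAGE_BSZS_SHIFT(mmu_page_sizes - 1) == 9 (the
 * largest page spans 512 base pages), pfn 0x12345 selects lock index
 * (0x12345 >> 9) & 3 == 1, and every pfn inside that same
 * largest-page-sized physical range selects the same index, so a single
 * mutex serializes all counter updates for the range.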
205 */ 206 static kmutex_t *ctr_mutex[NPC_MUTEX]; 207 208 #define PP_CTR_LOCK_INDX(pp) \ 209 (((pp)->p_pagenum >> \ 210 (PAGE_BSZS_SHIFT(mmu_page_sizes - 1))) & (NPC_MUTEX - 1)) 211 212 /* 213 * Local functions prototypes. 214 */ 215 216 void page_ctr_add(int, int, page_t *, int); 217 void page_ctr_add_internal(int, int, page_t *, int); 218 void page_ctr_sub(int, int, page_t *, int); 219 uint_t page_convert_color(uchar_t, uchar_t, uint_t); 220 void page_freelist_lock(int); 221 void page_freelist_unlock(int); 222 page_t *page_promote(int, pfn_t, uchar_t, int); 223 page_t *page_demote(int, pfn_t, uchar_t, uchar_t, int, int); 224 page_t *page_freelist_fill(uchar_t, int, int, int, pfn_t); 225 page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int); 226 static int page_trylock_cons(page_t *pp, se_t se); 227 228 #define PNUM_SIZE(szc) \ 229 (hw_page_array[(szc)].hp_size >> hw_page_array[0].hp_shift) 230 #define PNUM_SHIFT(szc) \ 231 (hw_page_array[(szc)].hp_shift - hw_page_array[0].hp_shift) 232 233 /* 234 * The page_counters array below is used to keep track of free contiguous 235 * physical memory. A hw_page_map_t will be allocated per mnode per szc. 236 * This contains an array of counters, the size of the array, a shift value 237 * used to convert a pagenum into a counter array index or vice versa, as 238 * well as a cache of the last successful index to be promoted to a larger 239 * page size. As an optimization, we keep track of the last successful index 240 * to be promoted per page color for the given size region, and this is 241 * allocated dynamically based upon the number of colors for a given 242 * region size. 243 * 244 * Conceptually, the page counters are represented as: 245 * 246 * page_counters[region_size][mnode] 247 * 248 * region_size: size code of a candidate larger page made up 249 * of contiguous free smaller pages. 250 * 251 * page_counters[region_size][mnode].hpm_counters[index]: 252 * represents how many (region_size - 1) pages either 253 * exist or can be created within the given index range. 254 * 255 * Let's look at a sparc example: 256 * If we want to create a free 512k page, we look at region_size 2 257 * for the mnode we want. We calculate the index and look at a specific 258 * hpm_counters location. If we see 8 (FULL_REGION_CNT on sparc) at 259 * this location, it means that 8 64k pages either exist or can be created 260 * from 8K pages in order to make a single free 512k page at the given 261 * index. Note that when a region is full, it will contribute to the 262 * counts in the region above it. Thus we will not know what page 263 * size the free pages will be which can be promoted to this new free 264 * page unless we look at all regions below the current region. 265 */ 266 267 /* 268 * Note: hpmctr_t is defined in platform vm_dep.h 269 * hw_page_map_t contains all the information needed for the page_counters 270 * logic. 
The fields are as follows: 271 * 272 * hpm_counters: dynamically allocated array to hold counter data 273 * hpm_entries: entries in hpm_counters 274 * hpm_shift: shift for pnum/array index conv 275 * hpm_base: PFN mapped to counter index 0 276 * hpm_color_current_len: # of elements in hpm_color_current "array" below 277 * hpm_color_current: last index in counter array for this color at 278 * which we successfully created a large page 279 */ 280 typedef struct hw_page_map { 281 hpmctr_t *hpm_counters; 282 size_t hpm_entries; 283 int hpm_shift; 284 pfn_t hpm_base; 285 size_t hpm_color_current_len; 286 size_t *hpm_color_current; 287 } hw_page_map_t; 288 289 /* 290 * Element zero is not used, but is allocated for convenience. 291 */ 292 static hw_page_map_t *page_counters[MMU_PAGE_SIZES]; 293 294 /* 295 * The following macros are convenient ways to get access to the individual 296 * elements of the page_counters arrays. They can be used on both 297 * the left side and right side of equations. 298 */ 299 #define PAGE_COUNTERS(mnode, rg_szc, idx) \ 300 (page_counters[(rg_szc)][(mnode)].hpm_counters[(idx)]) 301 302 #define PAGE_COUNTERS_COUNTERS(mnode, rg_szc) \ 303 (page_counters[(rg_szc)][(mnode)].hpm_counters) 304 305 #define PAGE_COUNTERS_SHIFT(mnode, rg_szc) \ 306 (page_counters[(rg_szc)][(mnode)].hpm_shift) 307 308 #define PAGE_COUNTERS_ENTRIES(mnode, rg_szc) \ 309 (page_counters[(rg_szc)][(mnode)].hpm_entries) 310 311 #define PAGE_COUNTERS_BASE(mnode, rg_szc) \ 312 (page_counters[(rg_szc)][(mnode)].hpm_base) 313 314 #define PAGE_COUNTERS_CURRENT_COLOR_LEN(mnode, rg_szc) \ 315 (page_counters[(rg_szc)][(mnode)].hpm_color_current_len) 316 317 #define PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, rg_szc) \ 318 (page_counters[(rg_szc)][(mnode)].hpm_color_current) 319 320 #define PAGE_COUNTERS_CURRENT_COLOR(mnode, rg_szc, color) \ 321 (page_counters[(rg_szc)][(mnode)].hpm_color_current[(color)]) 322 323 #define PNUM_TO_IDX(mnode, rg_szc, pnum) \ 324 (((pnum) - PAGE_COUNTERS_BASE((mnode), (rg_szc))) >> \ 325 PAGE_COUNTERS_SHIFT((mnode), (rg_szc))) 326 327 #define IDX_TO_PNUM(mnode, rg_szc, index) \ 328 (PAGE_COUNTERS_BASE((mnode), (rg_szc)) + \ 329 ((index) << PAGE_COUNTERS_SHIFT((mnode), (rg_szc)))) 330 331 /* 332 * Protects the hpm_counters and hpm_color_current memory from changing while 333 * looking at page counters information. 334 * Grab the write lock to modify what these fields point at. 335 * Grab the read lock to prevent any pointers from changing. 336 * The write lock can not be held during memory allocation due to a possible 337 * recursion deadlock with trying to grab the read lock while the 338 * write lock is already held. 339 */ 340 krwlock_t page_ctrs_rwlock[MAX_MEM_NODES]; 341 342 343 /* 344 * initialize cpu_vm_data to point at cache aligned vm_cpu_data_t. 345 */ 346 void 347 cpu_vm_data_init(struct cpu *cp) 348 { 349 if (cp == CPU0) { 350 cp->cpu_vm_data = (void *)&vm_cpu_data0; 351 } else { 352 void *kmptr; 353 int align; 354 size_t sz; 355 356 align = (L2CACHE_ALIGN) ? 
L2CACHE_ALIGN : L2CACHE_ALIGN_MAX; 357 sz = P2ROUNDUP(sizeof (vm_cpu_data_t), align) + align; 358 kmptr = kmem_zalloc(sz, KM_SLEEP); 359 cp->cpu_vm_data = (void *) P2ROUNDUP((uintptr_t)kmptr, align); 360 ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr = kmptr; 361 ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize = sz; 362 } 363 } 364 365 /* 366 * free cpu_vm_data 367 */ 368 void 369 cpu_vm_data_destroy(struct cpu *cp) 370 { 371 if (cp->cpu_seqid && cp->cpu_vm_data) { 372 ASSERT(cp != CPU0); 373 kmem_free(((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr, 374 ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize); 375 } 376 cp->cpu_vm_data = NULL; 377 } 378 379 380 /* 381 * page size to page size code 382 */ 383 int 384 page_szc(size_t pagesize) 385 { 386 int i = 0; 387 388 while (hw_page_array[i].hp_size) { 389 if (pagesize == hw_page_array[i].hp_size) 390 return (i); 391 i++; 392 } 393 return (-1); 394 } 395 396 /* 397 * page size to page size code with the restriction that it be a supported 398 * user page size. If it's not a supported user page size, -1 will be returned. 399 */ 400 int 401 page_szc_user_filtered(size_t pagesize) 402 { 403 int szc = page_szc(pagesize); 404 if ((szc != -1) && (SZC_2_USERSZC(szc) != -1)) { 405 return (szc); 406 } 407 return (-1); 408 } 409 410 /* 411 * Return how many page sizes are available for the user to use. This is 412 * what the hardware supports and not based upon how the OS implements the 413 * support of different page sizes. 414 */ 415 uint_t 416 page_num_user_pagesizes(void) 417 { 418 return (mmu_exported_page_sizes); 419 } 420 421 uint_t 422 page_num_pagesizes(void) 423 { 424 return (mmu_page_sizes); 425 } 426 427 /* 428 * returns the count of the number of base pagesize pages associated with szc 429 */ 430 pgcnt_t 431 page_get_pagecnt(uint_t szc) 432 { 433 if (szc >= mmu_page_sizes) 434 panic("page_get_pagecnt: out of range %d", szc); 435 return (hw_page_array[szc].hp_pgcnt); 436 } 437 438 size_t 439 page_get_pagesize(uint_t szc) 440 { 441 if (szc >= mmu_page_sizes) 442 panic("page_get_pagesize: out of range %d", szc); 443 return (hw_page_array[szc].hp_size); 444 } 445 446 /* 447 * Return the size of a page based upon the index passed in. An index of 448 * zero refers to the smallest page size in the system, and as index increases 449 * it refers to the next larger supported page size in the system. 450 * Note that szc and userszc may not be the same due to unsupported szc's on 451 * some systems. 452 */ 453 size_t 454 page_get_user_pagesize(uint_t userszc) 455 { 456 uint_t szc = USERSZC_2_SZC(userszc); 457 458 if (szc >= mmu_page_sizes) 459 panic("page_get_user_pagesize: out of range %d", szc); 460 return (hw_page_array[szc].hp_size); 461 } 462 463 uint_t 464 page_get_shift(uint_t szc) 465 { 466 if (szc >= mmu_page_sizes) 467 panic("page_get_shift: out of range %d", szc); 468 return (hw_page_array[szc].hp_shift); 469 } 470 471 uint_t 472 page_get_pagecolors(uint_t szc) 473 { 474 ASSERT(page_colors != 0); 475 return (MAX(page_colors >> PAGE_BSZS_SHIFT(szc), 1)); 476 } 477 478 /* 479 * Called by startup(). 480 * Size up the per page size free list counters based on physmax 481 * of each node and max_mem_nodes. 482 */ 483 size_t 484 page_ctrs_sz(void) 485 { 486 int r; /* region size */ 487 int mnode; 488 uint_t ctrs_sz = 0; 489 int i; 490 pgcnt_t colors_per_szc[MMU_PAGE_SIZES]; 491 492 /* 493 * We need to determine how many page colors there are for each 494 * page size in order to allocate memory for any color specific 495 * arrays. 
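	 *
	 * Illustrative example (hypothetical values, not from this file):
	 * with page_colors == 32 for the base page size and a size code r
	 * whose page shift is 3 bits larger, page_convert_color(0, r, 31)
	 * yields 31 >> 3 == 3, so colors_per_szc[r] below becomes 3 + 1 == 4.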
496 */ 497 colors_per_szc[0] = page_colors; 498 for (i = 1; i < mmu_page_sizes; i++) { 499 colors_per_szc[i] = 500 page_convert_color(0, i, page_colors - 1) + 1; 501 } 502 503 for (mnode = 0; mnode < max_mem_nodes; mnode++) { 504 505 pgcnt_t r_pgcnt; 506 pfn_t r_base; 507 pgcnt_t r_align; 508 509 if (mem_node_config[mnode].exists == 0) 510 continue; 511 512 /* 513 * determine size needed for page counter arrays with 514 * base aligned to large page size. 515 */ 516 for (r = 1; r < mmu_page_sizes; r++) { 517 /* add in space for hpm_counters */ 518 r_align = page_get_pagecnt(r); 519 r_base = mem_node_config[mnode].physbase; 520 r_base &= ~(r_align - 1); 521 r_pgcnt = howmany(mem_node_config[mnode].physmax - 522 r_base + 1, r_align); 523 /* 524 * Round up to always allocate on pointer sized 525 * boundaries. 526 */ 527 ctrs_sz += P2ROUNDUP((r_pgcnt * sizeof (hpmctr_t)), 528 sizeof (hpmctr_t *)); 529 530 /* add in space for hpm_color_current */ 531 ctrs_sz += (colors_per_szc[r] * 532 sizeof (size_t)); 533 } 534 } 535 536 for (r = 1; r < mmu_page_sizes; r++) { 537 ctrs_sz += (max_mem_nodes * sizeof (hw_page_map_t)); 538 539 /* add in space for page_ctrs_cands */ 540 ctrs_sz += NPC_MUTEX * max_mem_nodes * (sizeof (pcc_info_t)); 541 ctrs_sz += NPC_MUTEX * max_mem_nodes * colors_per_szc[r] * 542 sizeof (pgcnt_t); 543 } 544 545 /* ctr_mutex */ 546 ctrs_sz += (max_mem_nodes * NPC_MUTEX * sizeof (kmutex_t)); 547 548 /* size for page list counts */ 549 PLCNT_SZ(ctrs_sz); 550 551 /* 552 * add some slop for roundups. page_ctrs_alloc will roundup the start 553 * address of the counters to ecache_alignsize boundary for every 554 * memory node. 555 */ 556 return (ctrs_sz + max_mem_nodes * L2CACHE_ALIGN); 557 } 558 559 caddr_t 560 page_ctrs_alloc(caddr_t alloc_base) 561 { 562 int mnode; 563 int r; /* region size */ 564 int i; 565 pgcnt_t colors_per_szc[MMU_PAGE_SIZES]; 566 567 /* 568 * We need to determine how many page colors there are for each 569 * page size in order to allocate memory for any color specific 570 * arrays. 
571 */ 572 colors_per_szc[0] = page_colors; 573 for (i = 1; i < mmu_page_sizes; i++) { 574 colors_per_szc[i] = 575 page_convert_color(0, i, page_colors - 1) + 1; 576 } 577 578 for (r = 1; r < mmu_page_sizes; r++) { 579 page_counters[r] = (hw_page_map_t *)alloc_base; 580 alloc_base += (max_mem_nodes * sizeof (hw_page_map_t)); 581 } 582 583 /* page_ctrs_cands */ 584 for (r = 1; r < mmu_page_sizes; r++) { 585 for (i = 0; i < NPC_MUTEX; i++) { 586 page_ctrs_cands[i][r] = (pcc_info_t *)alloc_base; 587 alloc_base += max_mem_nodes * (sizeof (pcc_info_t)); 588 589 } 590 } 591 592 /* page_ctrs_cands pcc_color_free array */ 593 for (r = 1; r < mmu_page_sizes; r++) { 594 for (i = 0; i < NPC_MUTEX; i++) { 595 for (mnode = 0; mnode < max_mem_nodes; mnode++) { 596 page_ctrs_cands[i][r][mnode].pcc_color_free_len 597 = colors_per_szc[r]; 598 page_ctrs_cands[i][r][mnode].pcc_color_free = 599 (pgcnt_t *)alloc_base; 600 alloc_base += colors_per_szc[r] * 601 sizeof (pgcnt_t); 602 } 603 } 604 } 605 606 /* ctr_mutex */ 607 for (i = 0; i < NPC_MUTEX; i++) { 608 ctr_mutex[i] = (kmutex_t *)alloc_base; 609 alloc_base += (max_mem_nodes * sizeof (kmutex_t)); 610 } 611 612 /* initialize page list counts */ 613 PLCNT_INIT(alloc_base); 614 615 for (mnode = 0; mnode < max_mem_nodes; mnode++) { 616 617 pgcnt_t r_pgcnt; 618 pfn_t r_base; 619 pgcnt_t r_align; 620 int r_shift; 621 622 if (mem_node_config[mnode].exists == 0) 623 continue; 624 625 for (r = 1; r < mmu_page_sizes; r++) { 626 /* 627 * the page_counters base has to be aligned to the 628 * page count of page size code r otherwise the counts 629 * will cross large page boundaries. 630 */ 631 r_align = page_get_pagecnt(r); 632 r_base = mem_node_config[mnode].physbase; 633 /* base needs to be aligned - lower to aligned value */ 634 r_base &= ~(r_align - 1); 635 r_pgcnt = howmany(mem_node_config[mnode].physmax - 636 r_base + 1, r_align); 637 r_shift = PAGE_BSZS_SHIFT(r); 638 639 PAGE_COUNTERS_SHIFT(mnode, r) = r_shift; 640 PAGE_COUNTERS_ENTRIES(mnode, r) = r_pgcnt; 641 PAGE_COUNTERS_BASE(mnode, r) = r_base; 642 PAGE_COUNTERS_CURRENT_COLOR_LEN(mnode, r) = 643 colors_per_szc[r]; 644 PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r) = 645 (size_t *)alloc_base; 646 alloc_base += (sizeof (size_t) * colors_per_szc[r]); 647 for (i = 0; i < colors_per_szc[r]; i++) { 648 PAGE_COUNTERS_CURRENT_COLOR(mnode, r, i) = i; 649 } 650 PAGE_COUNTERS_COUNTERS(mnode, r) = 651 (hpmctr_t *)alloc_base; 652 /* 653 * Round up to make alloc_base always be aligned on 654 * a pointer boundary. 655 */ 656 alloc_base += P2ROUNDUP((sizeof (hpmctr_t) * r_pgcnt), 657 sizeof (hpmctr_t *)); 658 659 /* 660 * Verify that PNUM_TO_IDX and IDX_TO_PNUM 661 * satisfy the identity requirement. 662 * We should be able to go from one to the other 663 * and get consistent values. 664 */ 665 ASSERT(PNUM_TO_IDX(mnode, r, 666 (IDX_TO_PNUM(mnode, r, 0))) == 0); 667 ASSERT(IDX_TO_PNUM(mnode, r, 668 (PNUM_TO_IDX(mnode, r, r_base))) == r_base); 669 } 670 /* 671 * Roundup the start address of the page_counters to 672 * cache aligned boundary for every memory node. 673 * page_ctrs_sz() has added some slop for these roundups. 674 */ 675 alloc_base = (caddr_t)P2ROUNDUP((uintptr_t)alloc_base, 676 L2CACHE_ALIGN); 677 } 678 679 /* Initialize other page counter specific data structures. */ 680 for (mnode = 0; mnode < MAX_MEM_NODES; mnode++) { 681 rw_init(&page_ctrs_rwlock[mnode], NULL, RW_DEFAULT, NULL); 682 } 683 684 return (alloc_base); 685 } 686 687 /* 688 * Functions to adjust region counters for each size free list. 
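 *
 * For illustration (hypothetical counts, not from this file): if
 * FULL_REGION_CNT(1) == 8 and freeing a base page raises a region-1
 * counter from 7 to 8, that region is now complete, so the matching
 * pcc_pages_free/pcc_color_free candidate counts are bumped and the walk
 * continues at the next larger region size; a free that only raises the
 * counter from 3 to 4 stops right there.
 *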
 * The caller is responsible for acquiring the ctr_mutex lock if necessary,
 * which allows these routines to be called during startup without locks.
 */
/* ARGSUSED */
void
page_ctr_add_internal(int mnode, int mtype, page_t *pp, int flags)
{
	ssize_t		r;	/* region size */
	ssize_t		idx;
	pfn_t		pfnum;
	int		lckidx;

	ASSERT(mnode == PP_2_MEM_NODE(pp));
	ASSERT(mtype == PP_2_MTYPE(pp));

	ASSERT(pp->p_szc < mmu_page_sizes);

	PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);

	/* no counter update needed for largest page size */
	if (pp->p_szc >= mmu_page_sizes - 1) {
		return;
	}

	r = pp->p_szc + 1;
	pfnum = pp->p_pagenum;
	lckidx = PP_CTR_LOCK_INDX(pp);

	/*
	 * Increment the count of free pages for the current
	 * region. Continue looping up in region size incrementing
	 * count if the preceding region is full.
	 */
	while (r < mmu_page_sizes) {
		idx = PNUM_TO_IDX(mnode, r, pfnum);

		ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
		ASSERT(PAGE_COUNTERS(mnode, r, idx) < FULL_REGION_CNT(r));

		if (++PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r))
			break;

		page_ctrs_cands[lckidx][r][mnode].pcc_pages_free++;
		page_ctrs_cands[lckidx][r][mnode].
		    pcc_color_free[PP_2_BIN_SZC(pp, r)]++;
		r++;
	}
}

void
page_ctr_add(int mnode, int mtype, page_t *pp, int flags)
{
	int		lckidx = PP_CTR_LOCK_INDX(pp);
	kmutex_t	*lock = &ctr_mutex[lckidx][mnode];

	mutex_enter(lock);
	page_ctr_add_internal(mnode, mtype, pp, flags);
	mutex_exit(lock);
}

void
page_ctr_sub(int mnode, int mtype, page_t *pp, int flags)
{
	int		lckidx;
	kmutex_t	*lock;
	ssize_t		r;	/* region size */
	ssize_t		idx;
	pfn_t		pfnum;

	ASSERT(mnode == PP_2_MEM_NODE(pp));
	ASSERT(mtype == PP_2_MTYPE(pp));

	ASSERT(pp->p_szc < mmu_page_sizes);

	PLCNT_DECR(pp, mnode, mtype, pp->p_szc, flags);

	/* no counter update needed for largest page size */
	if (pp->p_szc >= mmu_page_sizes - 1) {
		return;
	}

	r = pp->p_szc + 1;
	pfnum = pp->p_pagenum;
	lckidx = PP_CTR_LOCK_INDX(pp);
	lock = &ctr_mutex[lckidx][mnode];

	/*
	 * Decrement the count of free pages for the current
	 * region. Continue looping up in region size decrementing
	 * count if the preceding region was full.
	 */
	mutex_enter(lock);
	while (r < mmu_page_sizes) {
		idx = PNUM_TO_IDX(mnode, r, pfnum);

		ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
		ASSERT(PAGE_COUNTERS(mnode, r, idx) > 0);

		if (--PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r) - 1) {
			break;
		}
		ASSERT(page_ctrs_cands[lckidx][r][mnode].pcc_pages_free != 0);
		ASSERT(page_ctrs_cands[lckidx][r][mnode].
		    pcc_color_free[PP_2_BIN_SZC(pp, r)] != 0);

		page_ctrs_cands[lckidx][r][mnode].pcc_pages_free--;
		page_ctrs_cands[lckidx][r][mnode].
		    pcc_color_free[PP_2_BIN_SZC(pp, r)]--;
		r++;
	}
	mutex_exit(lock);
}

/*
 * Adjust page counters following a memory attach, since typically the
 * size of the array needs to change, and the PFN to counter index
 * mapping needs to change.
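 *
 * For illustration (hypothetical values, not from this file): with
 * hpm_base == 0x40000 and hpm_shift == 3, IDX_TO_PNUM(mnode, r, 0) is
 * 0x40000 and PNUM_TO_IDX(mnode, r, 0x40000) is 0 again; that round-trip
 * identity is what the ASSERTs below verify after each resize.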
806 */ 807 uint_t 808 page_ctrs_adjust(int mnode) 809 { 810 pgcnt_t npgs; 811 int r; /* region size */ 812 int i; 813 size_t pcsz, old_csz; 814 hpmctr_t *new_ctr, *old_ctr; 815 pfn_t oldbase, newbase; 816 size_t old_npgs; 817 hpmctr_t *ctr_cache[MMU_PAGE_SIZES]; 818 size_t size_cache[MMU_PAGE_SIZES]; 819 size_t *color_cache[MMU_PAGE_SIZES]; 820 size_t *old_color_array; 821 pgcnt_t colors_per_szc[MMU_PAGE_SIZES]; 822 823 newbase = mem_node_config[mnode].physbase & ~PC_BASE_ALIGN_MASK; 824 npgs = roundup(mem_node_config[mnode].physmax, 825 PC_BASE_ALIGN) - newbase; 826 827 /* 828 * We need to determine how many page colors there are for each 829 * page size in order to allocate memory for any color specific 830 * arrays. 831 */ 832 colors_per_szc[0] = page_colors; 833 for (r = 1; r < mmu_page_sizes; r++) { 834 colors_per_szc[r] = 835 page_convert_color(0, r, page_colors - 1) + 1; 836 } 837 838 /* 839 * Preallocate all of the new hpm_counters arrays as we can't 840 * hold the page_ctrs_rwlock as a writer and allocate memory. 841 * If we can't allocate all of the arrays, undo our work so far 842 * and return failure. 843 */ 844 for (r = 1; r < mmu_page_sizes; r++) { 845 pcsz = npgs >> PAGE_BSZS_SHIFT(r); 846 847 ctr_cache[r] = kmem_zalloc(pcsz * 848 sizeof (hpmctr_t), KM_NOSLEEP); 849 if (ctr_cache[r] == NULL) { 850 while (--r >= 1) { 851 kmem_free(ctr_cache[r], 852 size_cache[r] * sizeof (hpmctr_t)); 853 } 854 return (ENOMEM); 855 } 856 size_cache[r] = pcsz; 857 } 858 /* 859 * Preallocate all of the new color current arrays as we can't 860 * hold the page_ctrs_rwlock as a writer and allocate memory. 861 * If we can't allocate all of the arrays, undo our work so far 862 * and return failure. 863 */ 864 for (r = 1; r < mmu_page_sizes; r++) { 865 color_cache[r] = kmem_zalloc(sizeof (size_t) * 866 colors_per_szc[r], KM_NOSLEEP); 867 if (color_cache[r] == NULL) { 868 while (--r >= 1) { 869 kmem_free(color_cache[r], 870 colors_per_szc[r] * sizeof (size_t)); 871 } 872 for (r = 1; r < mmu_page_sizes; r++) { 873 kmem_free(ctr_cache[r], 874 size_cache[r] * sizeof (hpmctr_t)); 875 } 876 return (ENOMEM); 877 } 878 } 879 880 /* 881 * Grab the write lock to prevent others from walking these arrays 882 * while we are modifying them. 883 */ 884 rw_enter(&page_ctrs_rwlock[mnode], RW_WRITER); 885 page_freelist_lock(mnode); 886 for (r = 1; r < mmu_page_sizes; r++) { 887 PAGE_COUNTERS_SHIFT(mnode, r) = PAGE_BSZS_SHIFT(r); 888 old_ctr = PAGE_COUNTERS_COUNTERS(mnode, r); 889 old_csz = PAGE_COUNTERS_ENTRIES(mnode, r); 890 oldbase = PAGE_COUNTERS_BASE(mnode, r); 891 old_npgs = old_csz << PAGE_COUNTERS_SHIFT(mnode, r); 892 old_color_array = PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r); 893 894 pcsz = npgs >> PAGE_COUNTERS_SHIFT(mnode, r); 895 new_ctr = ctr_cache[r]; 896 ctr_cache[r] = NULL; 897 if (old_ctr != NULL && 898 (oldbase + old_npgs > newbase) && 899 (newbase + npgs > oldbase)) { 900 /* 901 * Map the intersection of the old and new 902 * counters into the new array. 
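			 *
			 * Illustrative example (hypothetical values): if
			 * newbase > oldbase, say oldbase == 0x10000,
			 * newbase == 0x18000 and the counter shift is 3,
			 * then offset == (0x18000 - 0x10000) >> 3 == 0x1000,
			 * i.e. old counter index 0x1000 lands at new index 0.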
903 */ 904 size_t offset; 905 if (newbase > oldbase) { 906 offset = (newbase - oldbase) >> 907 PAGE_COUNTERS_SHIFT(mnode, r); 908 bcopy(old_ctr + offset, new_ctr, 909 MIN(pcsz, (old_csz - offset)) * 910 sizeof (hpmctr_t)); 911 } else { 912 offset = (oldbase - newbase) >> 913 PAGE_COUNTERS_SHIFT(mnode, r); 914 bcopy(old_ctr, new_ctr + offset, 915 MIN(pcsz - offset, old_csz) * 916 sizeof (hpmctr_t)); 917 } 918 } 919 920 PAGE_COUNTERS_COUNTERS(mnode, r) = new_ctr; 921 PAGE_COUNTERS_ENTRIES(mnode, r) = pcsz; 922 PAGE_COUNTERS_BASE(mnode, r) = newbase; 923 PAGE_COUNTERS_CURRENT_COLOR_LEN(mnode, r) = colors_per_szc[r]; 924 PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r) = color_cache[r]; 925 color_cache[r] = NULL; 926 /* 927 * for now, just reset on these events as it's probably 928 * not worthwhile to try and optimize this. 929 */ 930 for (i = 0; i < colors_per_szc[r]; i++) { 931 PAGE_COUNTERS_CURRENT_COLOR(mnode, r, i) = i; 932 } 933 934 /* cache info for freeing out of the critical path */ 935 if ((caddr_t)old_ctr >= kernelheap && 936 (caddr_t)old_ctr < ekernelheap) { 937 ctr_cache[r] = old_ctr; 938 size_cache[r] = old_csz; 939 } 940 if ((caddr_t)old_color_array >= kernelheap && 941 (caddr_t)old_color_array < ekernelheap) { 942 color_cache[r] = old_color_array; 943 } 944 /* 945 * Verify that PNUM_TO_IDX and IDX_TO_PNUM 946 * satisfy the identity requirement. 947 * We should be able to go from one to the other 948 * and get consistent values. 949 */ 950 ASSERT(PNUM_TO_IDX(mnode, r, 951 (IDX_TO_PNUM(mnode, r, 0))) == 0); 952 ASSERT(IDX_TO_PNUM(mnode, r, 953 (PNUM_TO_IDX(mnode, r, newbase))) == newbase); 954 } 955 page_freelist_unlock(mnode); 956 rw_exit(&page_ctrs_rwlock[mnode]); 957 958 /* 959 * Now that we have dropped the write lock, it is safe to free all 960 * of the memory we have cached above. 961 */ 962 for (r = 1; r < mmu_page_sizes; r++) { 963 if (ctr_cache[r] != NULL) { 964 kmem_free(ctr_cache[r], 965 size_cache[r] * sizeof (hpmctr_t)); 966 } 967 if (color_cache[r] != NULL) { 968 kmem_free(color_cache[r], 969 colors_per_szc[r] * sizeof (size_t)); 970 } 971 } 972 return (0); 973 } 974 975 /* 976 * color contains a valid color index or bin for cur_szc 977 */ 978 uint_t 979 page_convert_color(uchar_t cur_szc, uchar_t new_szc, uint_t color) 980 { 981 uint_t shift; 982 983 if (cur_szc > new_szc) { 984 shift = page_get_shift(cur_szc) - page_get_shift(new_szc); 985 return (color << shift); 986 } else if (cur_szc < new_szc) { 987 shift = page_get_shift(new_szc) - page_get_shift(cur_szc); 988 return (color >> shift); 989 } 990 return (color); 991 } 992 993 #ifdef DEBUG 994 995 /* 996 * confirm pp is a large page corresponding to szc 997 */ 998 void 999 chk_lpg(page_t *pp, uchar_t szc) 1000 { 1001 spgcnt_t npgs = page_get_pagecnt(pp->p_szc); 1002 uint_t noreloc; 1003 1004 if (npgs == 1) { 1005 ASSERT(pp->p_szc == 0); 1006 ASSERT(pp->p_next == pp); 1007 ASSERT(pp->p_prev == pp); 1008 return; 1009 } 1010 1011 ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL); 1012 ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL); 1013 1014 ASSERT(IS_P2ALIGNED(pp->p_pagenum, npgs)); 1015 ASSERT(pp->p_pagenum == (pp->p_next->p_pagenum - 1)); 1016 ASSERT(pp->p_prev->p_pagenum == (pp->p_pagenum + (npgs - 1))); 1017 ASSERT(pp->p_prev == (pp + (npgs - 1))); 1018 1019 /* 1020 * Check list of pages. 
	 */
	noreloc = PP_ISNORELOC(pp);
	while (npgs--) {
		if (npgs != 0) {
			ASSERT(pp->p_pagenum == pp->p_next->p_pagenum - 1);
			ASSERT(pp->p_next == (pp + 1));
		}
		ASSERT(pp->p_szc == szc);
		ASSERT(PP_ISFREE(pp));
		ASSERT(PP_ISAGED(pp));
		ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL);
		ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL);
		ASSERT(pp->p_vnode == NULL);
		ASSERT(PP_ISNORELOC(pp) == noreloc);

		pp = pp->p_next;
	}
}
#endif /* DEBUG */

void
page_freelist_lock(int mnode)
{
	int i;
	for (i = 0; i < NPC_MUTEX; i++) {
		mutex_enter(FPC_MUTEX(mnode, i));
		mutex_enter(CPC_MUTEX(mnode, i));
	}
}

void
page_freelist_unlock(int mnode)
{
	int i;
	for (i = 0; i < NPC_MUTEX; i++) {
		mutex_exit(FPC_MUTEX(mnode, i));
		mutex_exit(CPC_MUTEX(mnode, i));
	}
}

/*
 * Update the page list max counts for already allocated pages that have
 * xfer'ed (kcage_assimilate_page) between different mtypes.
 */
/* ARGSUSED */
void
page_list_xfer(page_t *pp, int to_mtype, int from_mtype)
{
	PLCNT_MAX_INCR(pp, PP_2_MEM_NODE(pp), to_mtype, pp->p_szc);
	PLCNT_MAX_DECR(pp, PP_2_MEM_NODE(pp), from_mtype, pp->p_szc);
}

/*
 * Add pp to the specified page list. Defaults to head of the page list
 * unless PG_LIST_TAIL is specified.
 */
void
page_list_add(page_t *pp, int flags)
{
	page_t		**ppp;
	kmutex_t	*pcm;
	uint_t		bin, mtype;
	int		mnode;

	ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
	ASSERT(PP_ISFREE(pp));
	ASSERT(!hat_page_is_mapped(pp));
	ASSERT(hat_page_getshare(pp) == 0);

	/*
	 * Large pages should be freed via page_list_add_pages().
	 */
	ASSERT(pp->p_szc == 0);

	/*
	 * Don't need to lock the freelist first here
	 * because the page isn't on the freelist yet.
	 * This means p_szc can't change on us.
	 */

	bin = PP_2_BIN(pp);
	mnode = PP_2_MEM_NODE(pp);
	mtype = PP_2_MTYPE(pp);

	if (flags & PG_LIST_ISINIT) {
		/*
		 * PG_LIST_ISINIT is set during system startup (i.e., single
		 * threaded); add the page to the free list and to the free
		 * region counters without any locking.
		 */
		ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);

		/* inline version of page_add() */
		if (*ppp != NULL) {
			pp->p_next = *ppp;
			pp->p_prev = (*ppp)->p_prev;
			(*ppp)->p_prev = pp;
			pp->p_prev->p_next = pp;
		} else
			*ppp = pp;

		page_ctr_add_internal(mnode, mtype, pp, flags);
		VM_STAT_ADD(vmm_vmstats.pladd_free[0]);
	} else {
		pcm = PC_BIN_MUTEX(mnode, bin, flags);

		if (flags & PG_FREE_LIST) {
			VM_STAT_ADD(vmm_vmstats.pladd_free[0]);
			ASSERT(PP_ISAGED(pp));
			ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);

		} else {
			VM_STAT_ADD(vmm_vmstats.pladd_cache);
			ASSERT(pp->p_vnode);
			ASSERT((pp->p_offset & PAGEOFFSET) == 0);
			ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
		}
		mutex_enter(pcm);
		page_add(ppp, pp);

		if (flags & PG_LIST_TAIL)
			*ppp = (*ppp)->p_next;
		/*
		 * Add counters before releasing pcm mutex to avoid a race with
		 * page_freelist_coalesce and page_freelist_fill.
1146 */ 1147 page_ctr_add(mnode, mtype, pp, flags); 1148 mutex_exit(pcm); 1149 } 1150 1151 1152 #if defined(__sparc) 1153 if (PP_ISNORELOC(pp)) { 1154 kcage_freemem_add(1); 1155 } 1156 #endif 1157 /* 1158 * It is up to the caller to unlock the page! 1159 */ 1160 ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT)); 1161 } 1162 1163 1164 #ifdef __sparc 1165 /* 1166 * This routine is only used by kcage_init during system startup. 1167 * It performs the function of page_list_sub/PP_SETNORELOC/page_list_add 1168 * without the overhead of taking locks and updating counters. 1169 */ 1170 void 1171 page_list_noreloc_startup(page_t *pp) 1172 { 1173 page_t **ppp; 1174 uint_t bin; 1175 int mnode; 1176 int mtype; 1177 int flags = PG_LIST_ISCAGE; 1178 1179 /* 1180 * If this is a large page on the freelist then 1181 * break it up into smaller pages. 1182 */ 1183 if (pp->p_szc != 0) 1184 page_boot_demote(pp); 1185 1186 /* 1187 * Get list page is currently on. 1188 */ 1189 bin = PP_2_BIN(pp); 1190 mnode = PP_2_MEM_NODE(pp); 1191 mtype = PP_2_MTYPE(pp); 1192 ASSERT(mtype == MTYPE_RELOC); 1193 ASSERT(pp->p_szc == 0); 1194 1195 if (PP_ISAGED(pp)) { 1196 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype); 1197 flags |= PG_FREE_LIST; 1198 } else { 1199 ppp = &PAGE_CACHELISTS(mnode, bin, mtype); 1200 flags |= PG_CACHE_LIST; 1201 } 1202 1203 ASSERT(*ppp != NULL); 1204 1205 /* 1206 * Delete page from current list. 1207 */ 1208 if (*ppp == pp) 1209 *ppp = pp->p_next; /* go to next page */ 1210 if (*ppp == pp) { 1211 *ppp = NULL; /* page list is gone */ 1212 } else { 1213 pp->p_prev->p_next = pp->p_next; 1214 pp->p_next->p_prev = pp->p_prev; 1215 } 1216 1217 /* LINTED */ 1218 PLCNT_DECR(pp, mnode, mtype, 0, flags); 1219 1220 /* 1221 * Set no reloc for cage initted pages. 1222 */ 1223 PP_SETNORELOC(pp); 1224 1225 mtype = PP_2_MTYPE(pp); 1226 ASSERT(mtype == MTYPE_NORELOC); 1227 1228 /* 1229 * Get new list for page. 1230 */ 1231 if (PP_ISAGED(pp)) { 1232 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype); 1233 } else { 1234 ppp = &PAGE_CACHELISTS(mnode, bin, mtype); 1235 } 1236 1237 /* 1238 * Insert page on new list. 
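	 *
	 * (Both lists here are circular and doubly linked: an empty list
	 * is NULL, a single page points to itself through p_next/p_prev,
	 * and otherwise the page is linked in between the current head
	 * and its predecessor, leaving the head pointer unchanged.)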
	 */
	if (*ppp == NULL) {
		*ppp = pp;
		pp->p_next = pp->p_prev = pp;
	} else {
		pp->p_next = *ppp;
		pp->p_prev = (*ppp)->p_prev;
		(*ppp)->p_prev = pp;
		pp->p_prev->p_next = pp;
	}

	/* LINTED */
	PLCNT_INCR(pp, mnode, mtype, 0, flags);

	/*
	 * Update cage freemem counter
	 */
	atomic_add_long(&kcage_freemem, 1);
}
#else /* __sparc */

/* ARGSUSED */
void
page_list_noreloc_startup(page_t *pp)
{
	panic("page_list_noreloc_startup: should be here only for sparc");
}
#endif

void
page_list_add_pages(page_t *pp, int flags)
{
	kmutex_t *pcm;
	pgcnt_t	pgcnt;
	uint_t	bin, mtype, i;
	int	mnode;

	/* default to freelist/head */
	ASSERT((flags & (PG_CACHE_LIST | PG_LIST_TAIL)) == 0);

	CHK_LPG(pp, pp->p_szc);
	VM_STAT_ADD(vmm_vmstats.pladd_free[pp->p_szc]);

	bin = PP_2_BIN(pp);
	mnode = PP_2_MEM_NODE(pp);
	mtype = PP_2_MTYPE(pp);

	if (flags & PG_LIST_ISINIT) {
		ASSERT(pp->p_szc == mmu_page_sizes - 1);
		page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
		ASSERT(!PP_ISNORELOC(pp));
		PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);
	} else {

		ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);

		pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);

		mutex_enter(pcm);
		page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
		page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
		mutex_exit(pcm);

		pgcnt = page_get_pagecnt(pp->p_szc);
#if defined(__sparc)
		if (PP_ISNORELOC(pp))
			kcage_freemem_add(pgcnt);
#endif
		for (i = 0; i < pgcnt; i++, pp++)
			page_unlock_noretire(pp);
	}
}

/*
 * During boot, we need to demote a large page to base
 * pagesize pages for seg_kmem for use in boot_alloc().
 */
void
page_boot_demote(page_t *pp)
{
	ASSERT(pp->p_szc != 0);
	ASSERT(PP_ISFREE(pp));
	ASSERT(PP_ISAGED(pp));

	(void) page_demote(PP_2_MEM_NODE(pp),
	    PFN_BASE(pp->p_pagenum, pp->p_szc), pp->p_szc, 0, PC_NO_COLOR,
	    PC_FREE);

	ASSERT(PP_ISFREE(pp));
	ASSERT(PP_ISAGED(pp));
	ASSERT(pp->p_szc == 0);
}

/*
 * Take a particular page off of whatever freelist the page
 * is claimed to be on.
 *
 * NOTE: Only used for PAGESIZE pages.
 */
void
page_list_sub(page_t *pp, int flags)
{
	int		bin;
	uint_t		mtype;
	int		mnode;
	kmutex_t	*pcm;
	page_t		**ppp;

	ASSERT(PAGE_EXCL(pp));
	ASSERT(PP_ISFREE(pp));

	/*
	 * The p_szc field can only be changed by page_promote()
	 * and page_demote(). Only free pages can be promoted and
	 * demoted and the free list MUST be locked during these
	 * operations. So to prevent a race in page_list_sub()
	 * between computing which bin of the freelist lock to
	 * grab and actually grabbing the lock, we check again that
	 * the bin we locked is still the correct one. Notice that
	 * the p_szc field could have actually changed on us but
	 * if the bin happens to still be the same we are safe.
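	 *
	 * For illustration only (a restatement of the retry pattern used
	 * below, not additional code in this file):
	 *
	 *	for (;;) {
	 *		bin = PP_2_BIN(pp);
	 *		pcm = PC_BIN_MUTEX(mnode, bin, flags);
	 *		mutex_enter(pcm);
	 *		if (PP_2_BIN(pp) == bin)
	 *			break;
	 *		mutex_exit(pcm);
	 *	}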
1360 */ 1361 try_again: 1362 bin = PP_2_BIN(pp); 1363 mnode = PP_2_MEM_NODE(pp); 1364 pcm = PC_BIN_MUTEX(mnode, bin, flags); 1365 mutex_enter(pcm); 1366 if (PP_2_BIN(pp) != bin) { 1367 mutex_exit(pcm); 1368 goto try_again; 1369 } 1370 mtype = PP_2_MTYPE(pp); 1371 1372 if (flags & PG_FREE_LIST) { 1373 VM_STAT_ADD(vmm_vmstats.plsub_free[0]); 1374 ASSERT(PP_ISAGED(pp)); 1375 ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype); 1376 } else { 1377 VM_STAT_ADD(vmm_vmstats.plsub_cache); 1378 ASSERT(!PP_ISAGED(pp)); 1379 ppp = &PAGE_CACHELISTS(mnode, bin, mtype); 1380 } 1381 1382 /* 1383 * Common PAGESIZE case. 1384 * 1385 * Note that we locked the freelist. This prevents 1386 * any page promotion/demotion operations. Therefore 1387 * the p_szc will not change until we drop pcm mutex. 1388 */ 1389 if (pp->p_szc == 0) { 1390 page_sub(ppp, pp); 1391 /* 1392 * Subtract counters before releasing pcm mutex 1393 * to avoid race with page_freelist_coalesce. 1394 */ 1395 page_ctr_sub(mnode, mtype, pp, flags); 1396 mutex_exit(pcm); 1397 1398 #if defined(__sparc) 1399 if (PP_ISNORELOC(pp)) { 1400 kcage_freemem_sub(1); 1401 } 1402 #endif 1403 return; 1404 } 1405 1406 /* 1407 * Large pages on the cache list are not supported. 1408 */ 1409 if (flags & PG_CACHE_LIST) 1410 panic("page_list_sub: large page on cachelist"); 1411 1412 /* 1413 * Slow but rare. 1414 * 1415 * Somebody wants this particular page which is part 1416 * of a large page. In this case we just demote the page 1417 * if it's on the freelist. 1418 * 1419 * We have to drop pcm before locking the entire freelist. 1420 * Once we have re-locked the freelist check to make sure 1421 * the page hasn't already been demoted or completely 1422 * freed. 1423 */ 1424 mutex_exit(pcm); 1425 page_freelist_lock(mnode); 1426 if (pp->p_szc != 0) { 1427 /* 1428 * Large page is on freelist. 1429 */ 1430 (void) page_demote(mnode, PFN_BASE(pp->p_pagenum, pp->p_szc), 1431 pp->p_szc, 0, PC_NO_COLOR, PC_FREE); 1432 } 1433 ASSERT(PP_ISFREE(pp)); 1434 ASSERT(PP_ISAGED(pp)); 1435 ASSERT(pp->p_szc == 0); 1436 1437 /* 1438 * Subtract counters before releasing pcm mutex 1439 * to avoid race with page_freelist_coalesce. 1440 */ 1441 bin = PP_2_BIN(pp); 1442 mtype = PP_2_MTYPE(pp); 1443 ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype); 1444 1445 page_sub(ppp, pp); 1446 page_ctr_sub(mnode, mtype, pp, flags); 1447 page_freelist_unlock(mnode); 1448 1449 #if defined(__sparc) 1450 if (PP_ISNORELOC(pp)) { 1451 kcage_freemem_sub(1); 1452 } 1453 #endif 1454 } 1455 1456 void 1457 page_list_sub_pages(page_t *pp, uint_t szc) 1458 { 1459 kmutex_t *pcm; 1460 uint_t bin, mtype; 1461 int mnode; 1462 1463 ASSERT(PAGE_EXCL(pp)); 1464 ASSERT(PP_ISFREE(pp)); 1465 ASSERT(PP_ISAGED(pp)); 1466 1467 /* 1468 * See comment in page_list_sub(). 1469 */ 1470 try_again: 1471 bin = PP_2_BIN(pp); 1472 mnode = PP_2_MEM_NODE(pp); 1473 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST); 1474 mutex_enter(pcm); 1475 if (PP_2_BIN(pp) != bin) { 1476 mutex_exit(pcm); 1477 goto try_again; 1478 } 1479 1480 /* 1481 * If we're called with a page larger than szc or it got 1482 * promoted above szc before we locked the freelist then 1483 * drop pcm and re-lock entire freelist. If page still larger 1484 * than szc then demote it. 
1485 */ 1486 if (pp->p_szc > szc) { 1487 mutex_exit(pcm); 1488 pcm = NULL; 1489 page_freelist_lock(mnode); 1490 if (pp->p_szc > szc) { 1491 VM_STAT_ADD(vmm_vmstats.plsubpages_szcbig); 1492 (void) page_demote(mnode, 1493 PFN_BASE(pp->p_pagenum, pp->p_szc), 1494 pp->p_szc, szc, PC_NO_COLOR, PC_FREE); 1495 } 1496 bin = PP_2_BIN(pp); 1497 } 1498 ASSERT(PP_ISFREE(pp)); 1499 ASSERT(PP_ISAGED(pp)); 1500 ASSERT(pp->p_szc <= szc); 1501 ASSERT(pp == PP_PAGEROOT(pp)); 1502 1503 VM_STAT_ADD(vmm_vmstats.plsub_free[pp->p_szc]); 1504 1505 mtype = PP_2_MTYPE(pp); 1506 if (pp->p_szc != 0) { 1507 page_vpsub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp); 1508 CHK_LPG(pp, pp->p_szc); 1509 } else { 1510 VM_STAT_ADD(vmm_vmstats.plsubpages_szc0); 1511 page_sub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp); 1512 } 1513 page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST); 1514 1515 if (pcm != NULL) { 1516 mutex_exit(pcm); 1517 } else { 1518 page_freelist_unlock(mnode); 1519 } 1520 1521 #if defined(__sparc) 1522 if (PP_ISNORELOC(pp)) { 1523 pgcnt_t pgcnt; 1524 1525 pgcnt = page_get_pagecnt(pp->p_szc); 1526 kcage_freemem_sub(pgcnt); 1527 } 1528 #endif 1529 } 1530 1531 /* 1532 * Add the page to the front of a linked list of pages 1533 * using the p_next & p_prev pointers for the list. 1534 * The caller is responsible for protecting the list pointers. 1535 */ 1536 void 1537 mach_page_add(page_t **ppp, page_t *pp) 1538 { 1539 if (*ppp == NULL) { 1540 pp->p_next = pp->p_prev = pp; 1541 } else { 1542 pp->p_next = *ppp; 1543 pp->p_prev = (*ppp)->p_prev; 1544 (*ppp)->p_prev = pp; 1545 pp->p_prev->p_next = pp; 1546 } 1547 *ppp = pp; 1548 } 1549 1550 /* 1551 * Remove this page from a linked list of pages 1552 * using the p_next & p_prev pointers for the list. 1553 * 1554 * The caller is responsible for protecting the list pointers. 1555 */ 1556 void 1557 mach_page_sub(page_t **ppp, page_t *pp) 1558 { 1559 ASSERT(PP_ISFREE(pp)); 1560 1561 if (*ppp == NULL || pp == NULL) 1562 panic("mach_page_sub"); 1563 1564 if (*ppp == pp) 1565 *ppp = pp->p_next; /* go to next page */ 1566 1567 if (*ppp == pp) 1568 *ppp = NULL; /* page list is gone */ 1569 else { 1570 pp->p_prev->p_next = pp->p_next; 1571 pp->p_next->p_prev = pp->p_prev; 1572 } 1573 pp->p_prev = pp->p_next = pp; /* make pp a list of one */ 1574 } 1575 1576 /* 1577 * Routine fsflush uses to gradually coalesce the free list into larger pages. 1578 */ 1579 void 1580 page_promote_size(page_t *pp, uint_t cur_szc) 1581 { 1582 pfn_t pfn; 1583 int mnode; 1584 int idx; 1585 int new_szc = cur_szc + 1; 1586 int full = FULL_REGION_CNT(new_szc); 1587 1588 pfn = page_pptonum(pp); 1589 mnode = PFN_2_MEM_NODE(pfn); 1590 1591 page_freelist_lock(mnode); 1592 1593 idx = PNUM_TO_IDX(mnode, new_szc, pfn); 1594 if (PAGE_COUNTERS(mnode, new_szc, idx) == full) 1595 (void) page_promote(mnode, pfn, new_szc, PC_FREE); 1596 1597 page_freelist_unlock(mnode); 1598 } 1599 1600 static uint_t page_promote_err; 1601 static uint_t page_promote_noreloc_err; 1602 1603 /* 1604 * Create a single larger page (of szc new_szc) from smaller contiguous pages 1605 * for the given mnode starting at pfnum. Pages involved are on the freelist 1606 * before the call and may be returned to the caller if requested, otherwise 1607 * they will be placed back on the freelist. 1608 * If flags is PC_ALLOC, then the large page will be returned to the user in 1609 * a state which is consistent with a page being taken off the freelist. 
If we fail to lock the new large page,
 * we return NULL to the caller and put the large page on the freelist
 * instead.
 * If flags is PC_FREE, then the large page will be placed on the freelist,
 * and NULL will be returned.
 * The caller is responsible for locking the freelist as well as any other
 * accounting which needs to be done for a returned page.
 *
 * RFE: For performance pass in pp instead of pfnum so
 *	we can avoid excessive calls to page_numtopp_nolock().
 *	This would depend on an assumption that all contiguous
 *	pages are in the same memseg so we can just add/dec
 *	our pp.
 *
 * Lock ordering:
 *
 *	There is a potential but rare deadlock situation
 *	for page promotion and demotion operations. The problem
 *	is there are two paths into the freelist manager and
 *	they have different lock orders:
 *
 *	page_create()
 *		lock freelist
 *		page_lock(EXCL)
 *		unlock freelist
 *		return
 *		caller drops page_lock
 *
 *	page_free() and page_reclaim()
 *		caller grabs page_lock(EXCL)
 *
 *		lock freelist
 *		unlock freelist
 *		drop page_lock
 *
 *	What prevents a thread in page_create() from deadlocking
 *	with a thread freeing or reclaiming the same page is the
 *	page_trylock() in page_get_freelist(). If the trylock fails
 *	it skips the page.
 *
 *	The lock ordering for promotion and demotion is the same as
 *	for page_create(). Since the same deadlock could occur during
 *	page promotion and freeing or reclaiming of a page on the
 *	cache list, we might have to fail the operation and undo what
 *	we have done so far. Again this is rare.
 */
page_t *
page_promote(int mnode, pfn_t pfnum, uchar_t new_szc, int flags)
{
	page_t		*pp, *pplist, *tpp, *start_pp;
	pgcnt_t		new_npgs, npgs;
	uint_t		bin;
	pgcnt_t		tmpnpgs, pages_left;
	uint_t		mtype;
	uint_t		noreloc;
	uint_t		i;
	int		which_list;
	ulong_t		index;
	kmutex_t	*phm;

	/*
	 * General algorithm:
	 * Find the starting page
	 * Walk each page struct removing it from the freelist,
	 * and linking it to all the other pages removed.
	 * Once all pages are off the freelist,
	 * walk the list, modifying p_szc to new_szc and whatever
	 * other info needs to be done to create a large free page.
	 * According to the flags, either return the page or put it
	 * on the freelist.
	 */

	start_pp = page_numtopp_nolock(pfnum);
	ASSERT(start_pp && (start_pp->p_pagenum == pfnum));
	new_npgs = page_get_pagecnt(new_szc);
	ASSERT(IS_P2ALIGNED(pfnum, new_npgs));

	/*
	 * Loop through smaller pages to confirm that all pages
	 * give the same result for PP_ISNORELOC().
	 * We can check this reliably here as the protocol for setting
	 * P_NORELOC requires pages to be taken off the free list first.
	 */
	for (i = 0, pp = start_pp; i < new_npgs; i++, pp++) {
		if (pp == start_pp) {
			/* First page, set requirement. */
			noreloc = PP_ISNORELOC(pp);
		} else if (noreloc != PP_ISNORELOC(pp)) {
			page_promote_noreloc_err++;
			page_promote_err++;
			return (NULL);
		}
	}

	pages_left = new_npgs;
	pplist = NULL;
	pp = start_pp;

	/* Loop around coalescing the smaller pages into a big page. */
	while (pages_left) {
		/*
		 * Remove from the freelist.
1711 */ 1712 ASSERT(PP_ISFREE(pp)); 1713 bin = PP_2_BIN(pp); 1714 ASSERT(mnode == PP_2_MEM_NODE(pp)); 1715 mtype = PP_2_MTYPE(pp); 1716 if (PP_ISAGED(pp)) { 1717 1718 /* 1719 * PG_FREE_LIST 1720 */ 1721 if (pp->p_szc) { 1722 page_vpsub(&PAGE_FREELISTS(mnode, 1723 pp->p_szc, bin, mtype), pp); 1724 } else { 1725 mach_page_sub(&PAGE_FREELISTS(mnode, 0, 1726 bin, mtype), pp); 1727 } 1728 which_list = PG_FREE_LIST; 1729 } else { 1730 ASSERT(pp->p_szc == 0); 1731 1732 /* 1733 * PG_CACHE_LIST 1734 * 1735 * Since this page comes from the 1736 * cachelist, we must destroy the 1737 * vnode association. 1738 */ 1739 if (!page_trylock(pp, SE_EXCL)) { 1740 goto fail_promote; 1741 } 1742 1743 /* 1744 * We need to be careful not to deadlock 1745 * with another thread in page_lookup(). 1746 * The page_lookup() thread could be holding 1747 * the same phm that we need if the two 1748 * pages happen to hash to the same phm lock. 1749 * At this point we have locked the entire 1750 * freelist and page_lookup() could be trying 1751 * to grab a freelist lock. 1752 */ 1753 index = PAGE_HASH_FUNC(pp->p_vnode, pp->p_offset); 1754 phm = PAGE_HASH_MUTEX(index); 1755 if (!mutex_tryenter(phm)) { 1756 page_unlock_noretire(pp); 1757 goto fail_promote; 1758 } 1759 1760 mach_page_sub(&PAGE_CACHELISTS(mnode, bin, mtype), pp); 1761 page_hashout(pp, phm); 1762 mutex_exit(phm); 1763 PP_SETAGED(pp); 1764 page_unlock_noretire(pp); 1765 which_list = PG_CACHE_LIST; 1766 } 1767 page_ctr_sub(mnode, mtype, pp, which_list); 1768 1769 /* 1770 * Concatenate the smaller page(s) onto 1771 * the large page list. 1772 */ 1773 tmpnpgs = npgs = page_get_pagecnt(pp->p_szc); 1774 pages_left -= npgs; 1775 tpp = pp; 1776 while (npgs--) { 1777 tpp->p_szc = new_szc; 1778 tpp = tpp->p_next; 1779 } 1780 page_list_concat(&pplist, &pp); 1781 pp += tmpnpgs; 1782 } 1783 CHK_LPG(pplist, new_szc); 1784 1785 /* 1786 * return the page to the user if requested 1787 * in the properly locked state. 1788 */ 1789 if (flags == PC_ALLOC && (page_trylock_cons(pplist, SE_EXCL))) { 1790 return (pplist); 1791 } 1792 1793 /* 1794 * Otherwise place the new large page on the freelist 1795 */ 1796 bin = PP_2_BIN(pplist); 1797 mnode = PP_2_MEM_NODE(pplist); 1798 mtype = PP_2_MTYPE(pplist); 1799 page_vpadd(&PAGE_FREELISTS(mnode, new_szc, bin, mtype), pplist); 1800 1801 page_ctr_add(mnode, mtype, pplist, PG_FREE_LIST); 1802 return (NULL); 1803 1804 fail_promote: 1805 /* 1806 * A thread must have still been freeing or 1807 * reclaiming the page on the cachelist. 1808 * To prevent a deadlock undo what we have 1809 * done sofar and return failure. This 1810 * situation can only happen while promoting 1811 * PAGESIZE pages. 1812 */ 1813 page_promote_err++; 1814 while (pplist) { 1815 pp = pplist; 1816 mach_page_sub(&pplist, pp); 1817 pp->p_szc = 0; 1818 bin = PP_2_BIN(pp); 1819 mtype = PP_2_MTYPE(pp); 1820 mach_page_add(&PAGE_FREELISTS(mnode, 0, bin, mtype), pp); 1821 page_ctr_add(mnode, mtype, pp, PG_FREE_LIST); 1822 } 1823 return (NULL); 1824 1825 } 1826 1827 /* 1828 * Break up a large page into smaller size pages. 1829 * Pages involved are on the freelist before the call and may 1830 * be returned to the caller if requested, otherwise they will 1831 * be placed back on the freelist. 1832 * The caller is responsible for locking the freelist as well as any other 1833 * accounting which needs to be done for a returned page. 
1834 * If flags is not PC_ALLOC, the color argument is ignored, and thus 1835 * technically, any value may be passed in but PC_NO_COLOR is the standard 1836 * which should be followed for clarity's sake. 1837 */ 1838 page_t * 1839 page_demote(int mnode, pfn_t pfnum, uchar_t cur_szc, uchar_t new_szc, 1840 int color, int flags) 1841 { 1842 page_t *pp, *pplist, *npplist; 1843 pgcnt_t npgs, n; 1844 uint_t bin; 1845 uint_t mtype; 1846 page_t *ret_pp = NULL; 1847 1848 ASSERT(cur_szc != 0); 1849 ASSERT(new_szc < cur_szc); 1850 1851 pplist = page_numtopp_nolock(pfnum); 1852 ASSERT(pplist != NULL); 1853 1854 ASSERT(pplist->p_szc == cur_szc); 1855 1856 bin = PP_2_BIN(pplist); 1857 ASSERT(mnode == PP_2_MEM_NODE(pplist)); 1858 mtype = PP_2_MTYPE(pplist); 1859 page_vpsub(&PAGE_FREELISTS(mnode, cur_szc, bin, mtype), pplist); 1860 1861 CHK_LPG(pplist, cur_szc); 1862 page_ctr_sub(mnode, mtype, pplist, PG_FREE_LIST); 1863 1864 /* 1865 * Number of PAGESIZE pages for smaller new_szc 1866 * page. 1867 */ 1868 npgs = page_get_pagecnt(new_szc); 1869 1870 while (pplist) { 1871 pp = pplist; 1872 1873 ASSERT(pp->p_szc == cur_szc); 1874 1875 /* 1876 * We either break it up into PAGESIZE pages or larger. 1877 */ 1878 if (npgs == 1) { /* PAGESIZE case */ 1879 mach_page_sub(&pplist, pp); 1880 ASSERT(pp->p_szc == cur_szc); 1881 ASSERT(new_szc == 0); 1882 ASSERT(mnode == PP_2_MEM_NODE(pp)); 1883 pp->p_szc = new_szc; 1884 bin = PP_2_BIN(pp); 1885 if ((bin == color) && (flags == PC_ALLOC) && 1886 (ret_pp == NULL) && 1887 page_trylock_cons(pp, SE_EXCL)) { 1888 ret_pp = pp; 1889 } else { 1890 mtype = PP_2_MTYPE(pp); 1891 mach_page_add(&PAGE_FREELISTS(mnode, 0, bin, 1892 mtype), pp); 1893 page_ctr_add(mnode, mtype, pp, PG_FREE_LIST); 1894 } 1895 } else { 1896 1897 /* 1898 * Break down into smaller lists of pages. 1899 */ 1900 page_list_break(&pplist, &npplist, npgs); 1901 1902 pp = pplist; 1903 n = npgs; 1904 while (n--) { 1905 ASSERT(pp->p_szc == cur_szc); 1906 pp->p_szc = new_szc; 1907 pp = pp->p_next; 1908 } 1909 1910 CHK_LPG(pplist, new_szc); 1911 1912 bin = PP_2_BIN(pplist); 1913 ASSERT(mnode == PP_2_MEM_NODE(pp)); 1914 if ((bin == color) && (flags == PC_ALLOC) && 1915 (ret_pp == NULL) && 1916 page_trylock_cons(pp, SE_EXCL)) { 1917 ret_pp = pp; 1918 } else { 1919 mtype = PP_2_MTYPE(pp); 1920 page_vpadd(&PAGE_FREELISTS(mnode, new_szc, 1921 bin, mtype), pplist); 1922 1923 page_ctr_add(mnode, mtype, pplist, 1924 PG_FREE_LIST); 1925 } 1926 pplist = npplist; 1927 } 1928 } 1929 return (ret_pp); 1930 } 1931 1932 int mpss_coalesce_disable = 0; 1933 1934 /* 1935 * Coalesce free pages into a page of the given szc and color if possible. 1936 * Return the pointer to the page created, otherwise, return NULL. 1937 */ 1938 static page_t * 1939 page_freelist_coalesce(int mnode, uchar_t szc, int color) 1940 { 1941 int r; /* region size */ 1942 int idx, full, i; 1943 pfn_t pfnum; 1944 size_t len; 1945 size_t buckets_to_check; 1946 pgcnt_t cands; 1947 page_t *ret_pp; 1948 int color_stride; 1949 1950 VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce); 1951 1952 if (mpss_coalesce_disable) { 1953 return (NULL); 1954 } 1955 1956 r = szc; 1957 PGCTRS_CANDS_GETVALUECOLOR(mnode, r, color, cands); 1958 if (cands == 0) { 1959 VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip); 1960 return (NULL); 1961 } 1962 full = FULL_REGION_CNT(r); 1963 color_stride = (szc) ? 
page_convert_color(0, szc, page_colors - 1) + 1 : 1964 page_colors; 1965 1966 /* Prevent page_counters dynamic memory from being freed */ 1967 rw_enter(&page_ctrs_rwlock[mnode], RW_READER); 1968 len = PAGE_COUNTERS_ENTRIES(mnode, r); 1969 buckets_to_check = len / color_stride; 1970 idx = PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color); 1971 ASSERT((idx % color_stride) == color); 1972 idx += color_stride; 1973 if (idx >= len) 1974 idx = color; 1975 for (i = 0; i < buckets_to_check; i++) { 1976 if (PAGE_COUNTERS(mnode, r, idx) == full) { 1977 pfnum = IDX_TO_PNUM(mnode, r, idx); 1978 ASSERT(pfnum >= mem_node_config[mnode].physbase && 1979 pfnum < mem_node_config[mnode].physmax); 1980 /* 1981 * RFE: For performance maybe we can do something less 1982 * brutal than locking the entire freelist. So far 1983 * this doesn't seem to be a performance problem? 1984 */ 1985 page_freelist_lock(mnode); 1986 if (PAGE_COUNTERS(mnode, r, idx) != full) { 1987 VM_STAT_ADD(vmm_vmstats.page_ctrs_changed); 1988 goto skip_this_one; 1989 } 1990 ret_pp = page_promote(mnode, pfnum, r, PC_ALLOC); 1991 if (ret_pp != NULL) { 1992 PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color) = 1993 idx; 1994 page_freelist_unlock(mnode); 1995 rw_exit(&page_ctrs_rwlock[mnode]); 1996 #if defined(__sparc) 1997 if (PP_ISNORELOC(ret_pp)) { 1998 pgcnt_t npgs; 1999 2000 npgs = page_get_pagecnt(ret_pp->p_szc); 2001 kcage_freemem_sub(npgs); 2002 } 2003 #endif 2004 return (ret_pp); 2005 } 2006 skip_this_one: 2007 page_freelist_unlock(mnode); 2008 /* 2009 * No point looking for another page if we've 2010 * already tried all of the ones that 2011 * page_ctr_cands indicated. Stash off where we left 2012 * off. 2013 * Note: this is not exact since we don't hold the 2014 * page_freelist_locks before we initially get the 2015 * value of cands for performance reasons, but should 2016 * be a decent approximation. 2017 */ 2018 if (--cands == 0) { 2019 PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color) = 2020 idx; 2021 break; 2022 } 2023 } 2024 idx += color_stride; 2025 if (idx >= len) 2026 idx = color; 2027 } 2028 rw_exit(&page_ctrs_rwlock[mnode]); 2029 VM_STAT_ADD(vmm_vmstats.page_ctrs_failed); 2030 return (NULL); 2031 } 2032 2033 /* 2034 * For the given mnode, promote as many small pages to large pages as possible. 2035 */ 2036 void 2037 page_freelist_coalesce_all(int mnode) 2038 { 2039 int r; /* region size */ 2040 int idx, full; 2041 pfn_t pfnum; 2042 size_t len; 2043 2044 VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce_all); 2045 2046 if (mpss_coalesce_disable) { 2047 return; 2048 } 2049 2050 /* 2051 * Lock the entire freelist and coalesce what we can. 2052 * 2053 * Always promote to the largest page possible 2054 * first to reduce the number of page promotions. 
2055 */ 2056 rw_enter(&page_ctrs_rwlock[mnode], RW_READER); 2057 page_freelist_lock(mnode); 2058 for (r = mmu_page_sizes - 1; r > 0; r--) { 2059 pgcnt_t cands; 2060 2061 PGCTRS_CANDS_GETVALUE(mnode, r, cands); 2062 if (cands == 0) { 2063 VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip_all); 2064 continue; 2065 } 2066 2067 full = FULL_REGION_CNT(r); 2068 len = PAGE_COUNTERS_ENTRIES(mnode, r); 2069 2070 for (idx = 0; idx < len; idx++) { 2071 if (PAGE_COUNTERS(mnode, r, idx) == full) { 2072 pfnum = IDX_TO_PNUM(mnode, r, idx); 2073 ASSERT(pfnum >= 2074 mem_node_config[mnode].physbase && 2075 pfnum < 2076 mem_node_config[mnode].physmax); 2077 (void) page_promote(mnode, pfnum, r, PC_FREE); 2078 } 2079 } 2080 } 2081 page_freelist_unlock(mnode); 2082 rw_exit(&page_ctrs_rwlock[mnode]); 2083 } 2084 2085 /* 2086 * This is where all policies for moving pages around 2087 * to different page size free lists are implemented. 2088 * Returns a page of the requested size and color on success, otherwise NULL. 2089 * 2090 * So far these are the priorities for this algorithm in descending 2091 * order: 2092 * 2093 * 1) When servicing a request try to do so with a free page 2094 * from next size up. Helps defer fragmentation as long 2095 * as possible. 2096 * 2097 * 2) Page coalesce on demand. Only when a freelist 2098 * larger than PAGESIZE is empty and step 1 2099 * will not work since all larger size lists are 2100 * also empty. 2101 * 2102 * If pfnhi is non-zero, search for a large page with a pfn range below pfnhi. 2103 */ 2104 page_t * 2105 page_freelist_fill(uchar_t szc, int color, int mnode, int mtype, pfn_t pfnhi) 2106 { 2107 uchar_t nszc = szc + 1; 2108 int bin; 2109 page_t *pp, *firstpp; 2110 page_t *ret_pp = NULL; 2111 2112 ASSERT(szc < mmu_page_sizes); 2113 2114 VM_STAT_ADD(vmm_vmstats.pff_req[szc]); 2115 /* 2116 * First try to break up a larger page to fill 2117 * current size freelist. 2118 */ 2119 while (nszc < mmu_page_sizes) { 2120 /* 2121 * If page found then demote it. 2122 */ 2123 bin = page_convert_color(szc, nszc, color); 2124 if (PAGE_FREELISTS(mnode, nszc, bin, mtype)) { 2125 page_freelist_lock(mnode); 2126 firstpp = pp = PAGE_FREELISTS(mnode, nszc, bin, mtype); 2127 2128 /* 2129 * If pfnhi is not PFNNULL, look for large page below 2130 * pfnhi. PFNNULL signifies no pfn requirement. 2131 */ 2132 if (pfnhi != PFNNULL && pp->p_pagenum >= pfnhi) { 2133 do { 2134 pp = pp->p_vpnext; 2135 if (pp == firstpp) { 2136 pp = NULL; 2137 break; 2138 } 2139 } while (pp->p_pagenum >= pfnhi); 2140 } 2141 if (pp) { 2142 ASSERT(pp->p_szc == nszc); 2143 VM_STAT_ADD(vmm_vmstats.pff_demote[nszc]); 2144 ret_pp = page_demote(mnode, pp->p_pagenum, 2145 pp->p_szc, szc, color, PC_ALLOC); 2146 if (ret_pp) { 2147 page_freelist_unlock(mnode); 2148 #if defined(__sparc) 2149 if (PP_ISNORELOC(ret_pp)) { 2150 pgcnt_t npgs; 2151 2152 npgs = page_get_pagecnt( 2153 ret_pp->p_szc); 2154 kcage_freemem_sub(npgs); 2155 } 2156 #endif 2157 return (ret_pp); 2158 } 2159 } 2160 page_freelist_unlock(mnode); 2161 } 2162 nszc++; 2163 } 2164 2165 /* 2166 * Ok that didn't work. Time to coalesce. 2167 */ 2168 if (szc != 0) { 2169 ret_pp = page_freelist_coalesce(mnode, szc, color); 2170 VM_STAT_COND_ADD(ret_pp, vmm_vmstats.pff_coalok[szc]); 2171 } 2172 2173 return (ret_pp); 2174 } 2175 2176 /* 2177 * Helper routine used only by the freelist code to lock 2178 * a page. If the page is a large page then it succeeds in 2179 * locking all the constituent pages or none at all. 2180 * Returns 1 on success, 0 on failure.
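 *
 * A minimal caller sketch (hypothetical; 'pp' stands for any page the
 * freelist code is holding a pointer to, and the snippet is not part
 * of this file's build):
 */
#if 0
	/* lock the page and, if it is a large page, every constituent */
	if (!page_trylock_cons(pp, SE_EXCL)) {
		/* nothing remains locked on failure */
		return (NULL);
	}
#endif
/*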
2181 */ 2182 static int 2183 page_trylock_cons(page_t *pp, se_t se) 2184 { 2185 page_t *tpp, *first_pp = pp; 2186 2187 /* 2188 * Fail if can't lock first or only page. 2189 */ 2190 if (!page_trylock(pp, se)) { 2191 return (0); 2192 } 2193 2194 /* 2195 * PAGESIZE: common case. 2196 */ 2197 if (pp->p_szc == 0) { 2198 return (1); 2199 } 2200 2201 /* 2202 * Large page case. 2203 */ 2204 tpp = pp->p_next; 2205 while (tpp != pp) { 2206 if (!page_trylock(tpp, se)) { 2207 /* 2208 * On failure unlock what we 2209 * have locked so far. 2210 */ 2211 while (first_pp != tpp) { 2212 page_unlock_noretire(first_pp); 2213 first_pp = first_pp->p_next; 2214 } 2215 return (0); 2216 } 2217 tpp = tpp->p_next; 2218 } 2219 return (1); 2220 } 2221 2222 page_t * 2223 page_get_mnode_freelist(int mnode, uint_t bin, int mtype, uchar_t szc, 2224 uint_t flags) 2225 { 2226 kmutex_t *pcm; 2227 int i, fill_tried, fill_marker; 2228 page_t *pp, *first_pp; 2229 uint_t bin_marker; 2230 int colors, cpucolors; 2231 uchar_t nszc; 2232 uint_t nszc_color_shift; 2233 int nwaybins = 0, nwaycnt; 2234 2235 ASSERT(szc < mmu_page_sizes); 2236 2237 VM_STAT_ADD(vmm_vmstats.pgmf_alloc[szc]); 2238 2239 MTYPE_START(mnode, mtype, flags); 2240 if (mtype < 0) { /* mnode foes not have memory in mtype range */ 2241 VM_STAT_ADD(vmm_vmstats.pgmf_allocempty[szc]); 2242 return (NULL); 2243 } 2244 2245 /* 2246 * Set how many physical colors for this page size. 2247 */ 2248 colors = (szc) ? page_convert_color(0, szc, page_colors - 1) + 1 : 2249 page_colors; 2250 2251 nszc = MIN(szc + 1, mmu_page_sizes - 1); 2252 nszc_color_shift = page_get_shift(nszc) - page_get_shift(szc); 2253 2254 /* cpu_page_colors is non-zero if a page color may be in > 1 bin */ 2255 cpucolors = cpu_page_colors; 2256 2257 /* 2258 * adjust cpucolors to possibly check additional 'equivalent' bins 2259 * to try to minimize fragmentation of large pages by delaying calls 2260 * to page_freelist_fill. 2261 */ 2262 if (colorequiv > 1) { 2263 int equivcolors = colors / colorequiv; 2264 2265 if (equivcolors && (cpucolors == 0 || equivcolors < cpucolors)) 2266 cpucolors = equivcolors; 2267 } 2268 2269 ASSERT(colors <= page_colors); 2270 ASSERT(colors); 2271 ASSERT((colors & (colors - 1)) == 0); 2272 2273 ASSERT(bin < colors); 2274 2275 /* 2276 * Only hold one freelist lock at a time, that way we 2277 * can start anywhere and not have to worry about lock 2278 * ordering. 2279 */ 2280 big_try_again: 2281 fill_tried = 0; 2282 nwaycnt = 0; 2283 for (i = 0; i <= colors; i++) { 2284 try_again: 2285 ASSERT(bin < colors); 2286 if (PAGE_FREELISTS(mnode, szc, bin, mtype)) { 2287 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST); 2288 mutex_enter(pcm); 2289 pp = PAGE_FREELISTS(mnode, szc, bin, mtype); 2290 if (pp != NULL) { 2291 /* 2292 * These were set before the page 2293 * was put on the free list, 2294 * they must still be set. 2295 */ 2296 ASSERT(PP_ISFREE(pp)); 2297 ASSERT(PP_ISAGED(pp)); 2298 ASSERT(pp->p_vnode == NULL); 2299 ASSERT(pp->p_hash == NULL); 2300 ASSERT(pp->p_offset == (u_offset_t)-1); 2301 ASSERT(pp->p_szc == szc); 2302 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode); 2303 2304 /* 2305 * Walk down the hash chain. 2306 * 8k pages are linked on p_next 2307 * and p_prev fields. Large pages 2308 * are a contiguous group of 2309 * constituent pages linked together 2310 * on their p_next and p_prev fields. 2311 * The large pages are linked together 2312 * on the hash chain using p_vpnext 2313 * p_vpprev of the base constituent 2314 * page of each large page. 
2315 */ 2316 first_pp = pp; 2317 while (!page_trylock_cons(pp, SE_EXCL)) { 2318 if (szc == 0) { 2319 pp = pp->p_next; 2320 } else { 2321 pp = pp->p_vpnext; 2322 } 2323 2324 ASSERT(PP_ISFREE(pp)); 2325 ASSERT(PP_ISAGED(pp)); 2326 ASSERT(pp->p_vnode == NULL); 2327 ASSERT(pp->p_hash == NULL); 2328 ASSERT(pp->p_offset == (u_offset_t)-1); 2329 ASSERT(pp->p_szc == szc); 2330 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == 2331 mnode); 2332 2333 if (pp == first_pp) { 2334 pp = NULL; 2335 break; 2336 } 2337 } 2338 2339 if (pp) { 2340 ASSERT(mtype == PP_2_MTYPE(pp)); 2341 ASSERT(pp->p_szc == szc); 2342 if (szc == 0) { 2343 page_sub(&PAGE_FREELISTS(mnode, 2344 szc, bin, mtype), pp); 2345 } else { 2346 page_vpsub(&PAGE_FREELISTS( 2347 mnode, szc, bin, mtype), 2348 pp); 2349 CHK_LPG(pp, szc); 2350 } 2351 page_ctr_sub(mnode, mtype, pp, 2352 PG_FREE_LIST); 2353 2354 if ((PP_ISFREE(pp) == 0) || 2355 (PP_ISAGED(pp) == 0)) 2356 panic("free page is not. pp %p", 2357 (void *)pp); 2358 mutex_exit(pcm); 2359 2360 #if defined(__sparc) 2361 ASSERT(!kcage_on || PP_ISNORELOC(pp) || 2362 (flags & PG_NORELOC) == 0); 2363 2364 if (PP_ISNORELOC(pp)) { 2365 pgcnt_t npgs; 2366 2367 npgs = page_get_pagecnt(szc); 2368 kcage_freemem_sub(npgs); 2369 } 2370 #endif 2371 VM_STAT_ADD(vmm_vmstats. 2372 pgmf_allocok[szc]); 2373 return (pp); 2374 } 2375 } 2376 mutex_exit(pcm); 2377 } 2378 2379 /* 2380 * Wow! The initial bin is empty. 2381 * If specific color is needed, check if page color may be 2382 * in other bins. cpucolors is: 2383 * 0 if the colors for this cpu is equal to page_colors. 2384 * This means that pages with a particular color are in a 2385 * single bin. 2386 * -1 if colors of cpus (cheetah+) are heterogenous. Need to 2387 * first determine the colors for the current cpu. 2388 * >0 colors of all cpus are homogenous and < page_colors 2389 */ 2390 2391 if ((flags & PG_MATCH_COLOR) && (cpucolors != 0)) { 2392 if (!nwaybins) { 2393 /* 2394 * cpucolors is negative if ecache setsizes 2395 * are heterogenous. determine colors for this 2396 * particular cpu. 2397 */ 2398 if (cpucolors < 0) { 2399 cpucolors = CPUSETSIZE() / MMU_PAGESIZE; 2400 ASSERT(cpucolors > 0); 2401 nwaybins = colors / cpucolors; 2402 } else { 2403 nwaybins = colors / cpucolors; 2404 ASSERT(szc > 0 || nwaybins > 1); 2405 } 2406 if (nwaybins < 2) 2407 cpucolors = 0; 2408 } 2409 2410 if (cpucolors && (nwaycnt + 1 <= nwaybins)) { 2411 nwaycnt++; 2412 bin = (bin + (colors / nwaybins)) & 2413 (colors - 1); 2414 if (nwaycnt < nwaybins) { 2415 goto try_again; 2416 } 2417 } 2418 /* back to initial color if fall-thru */ 2419 } 2420 2421 /* 2422 * color bins are all empty if color match. Try and satisfy 2423 * the request by breaking up or coalescing pages from 2424 * a different size freelist of the correct color that 2425 * satisfies the ORIGINAL color requested. If that 2426 * fails then try pages of the same size but different 2427 * colors assuming we are not called with 2428 * PG_MATCH_COLOR. 2429 */ 2430 if (!fill_tried) { 2431 fill_tried = 1; 2432 fill_marker = bin >> nszc_color_shift; 2433 pp = page_freelist_fill(szc, bin, mnode, mtype, 2434 PFNNULL); 2435 if (pp != NULL) { 2436 return (pp); 2437 } 2438 } 2439 2440 if (flags & PG_MATCH_COLOR) 2441 break; 2442 2443 /* 2444 * Select next color bin to try. 2445 */ 2446 if (szc == 0) { 2447 /* 2448 * PAGESIZE page case. 
2449 */ 2450 if (i == 0) { 2451 bin = (bin + BIN_STEP) & page_colors_mask; 2452 bin_marker = bin; 2453 } else { 2454 bin = (bin + vac_colors) & page_colors_mask; 2455 if (bin == bin_marker) { 2456 bin = (bin + 1) & page_colors_mask; 2457 bin_marker = bin; 2458 } 2459 } 2460 } else { 2461 /* 2462 * Large page case. 2463 */ 2464 bin = (bin + 1) & (colors - 1); 2465 } 2466 /* 2467 * If bin advanced to the next color bin of the 2468 * next larger pagesize, there is a chance the fill 2469 * could succeed. 2470 */ 2471 if (fill_marker != (bin >> nszc_color_shift)) 2472 fill_tried = 0; 2473 } 2474 2475 /* if allowed, cycle through additional mtypes */ 2476 MTYPE_NEXT(mnode, mtype, flags); 2477 if (mtype >= 0) 2478 goto big_try_again; 2479 2480 VM_STAT_ADD(vmm_vmstats.pgmf_allocfailed[szc]); 2481 2482 return (NULL); 2483 } 2484 2485 2486 /* 2487 * Returns the count of free pages for 'pp' with size code 'szc'. 2488 * Note: This function does not return an exact value as the page freelist 2489 * locks are not held and thus the values in the page_counters may be 2490 * changing as we walk through the data. 2491 */ 2492 static int 2493 page_freecnt(int mnode, page_t *pp, uchar_t szc) 2494 { 2495 pgcnt_t pgfree; 2496 pgcnt_t cnt; 2497 ssize_t r = szc; /* region size */ 2498 ssize_t idx; 2499 int i; 2500 int full, range; 2501 2502 /* Make sure pagenum passed in is aligned properly */ 2503 ASSERT((pp->p_pagenum & (PNUM_SIZE(szc) - 1)) == 0); 2504 ASSERT(szc > 0); 2505 2506 /* Prevent page_counters dynamic memory from being freed */ 2507 rw_enter(&page_ctrs_rwlock[mnode], RW_READER); 2508 idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum); 2509 cnt = PAGE_COUNTERS(mnode, r, idx); 2510 pgfree = cnt << PNUM_SHIFT(r - 1); 2511 range = FULL_REGION_CNT(szc); 2512 2513 /* Check for completely full region */ 2514 if (cnt == range) { 2515 rw_exit(&page_ctrs_rwlock[mnode]); 2516 return (pgfree); 2517 } 2518 2519 while (--r > 0) { 2520 idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum); 2521 full = FULL_REGION_CNT(r); 2522 for (i = 0; i < range; i++, idx++) { 2523 cnt = PAGE_COUNTERS(mnode, r, idx); 2524 /* 2525 * If cnt here is full, that means we have already 2526 * accounted for these pages earlier. 2527 */ 2528 if (cnt != full) { 2529 pgfree += (cnt << PNUM_SHIFT(r - 1)); 2530 } 2531 } 2532 range *= full; 2533 } 2534 rw_exit(&page_ctrs_rwlock[mnode]); 2535 return (pgfree); 2536 } 2537 2538 /* 2539 * Called from page_geti_contig_pages to exclusively lock constituent pages 2540 * starting from 'spp' for page size code 'szc'. 2541 * 2542 * If 'ptcpthreshold' is set, the number of free pages needed in the 'szc' 2543 * region needs to be greater than or equal to the threshold. 2544 */ 2545 static int 2546 page_trylock_contig_pages(int mnode, page_t *spp, uchar_t szc, int flags) 2547 { 2548 pgcnt_t pgcnt = PNUM_SIZE(szc); 2549 pgcnt_t pgfree, i; 2550 page_t *pp; 2551 2552 VM_STAT_ADD(vmm_vmstats.ptcp[szc]); 2553 2554 2555 if ((ptcpthreshold == 0) || (flags & PGI_PGCPHIPRI)) 2556 goto skipptcpcheck; 2557 /* 2558 * check if there are sufficient free pages available before attempting 2559 * to trylock. Count is approximate as page counters can change. 
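 *
 * For example, with a (hypothetical) ptcpthreshold of 4, at least
 * pgcnt / 4 of the constituent pages must already be free; otherwise
 * the routine gives up below without attempting any trylock.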
2560 */ 2561 pgfree = page_freecnt(mnode, spp, szc); 2562 2563 /* attempt to trylock if there are sufficient already free pages */ 2564 if (pgfree < pgcnt/ptcpthreshold) { 2565 VM_STAT_ADD(vmm_vmstats.ptcpfreethresh[szc]); 2566 return (0); 2567 } 2568 2569 skipptcpcheck: 2570 2571 for (i = 0; i < pgcnt; i++) { 2572 pp = &spp[i]; 2573 if (!page_trylock(pp, SE_EXCL)) { 2574 VM_STAT_ADD(vmm_vmstats.ptcpfailexcl[szc]); 2575 while (--i != (pgcnt_t)-1) { 2576 pp = &spp[i]; 2577 ASSERT(PAGE_EXCL(pp)); 2578 page_unlock_noretire(pp); 2579 } 2580 return (0); 2581 } 2582 ASSERT(spp[i].p_pagenum == spp->p_pagenum + i); 2583 if ((pp->p_szc > szc || (szc && pp->p_szc == szc)) && 2584 !PP_ISFREE(pp)) { 2585 VM_STAT_ADD(vmm_vmstats.ptcpfailszc[szc]); 2586 ASSERT(i == 0); 2587 page_unlock_noretire(pp); 2588 return (0); 2589 } 2590 if (PP_ISNORELOC(pp)) { 2591 VM_STAT_ADD(vmm_vmstats.ptcpfailcage[szc]); 2592 while (i != (pgcnt_t)-1) { 2593 pp = &spp[i]; 2594 ASSERT(PAGE_EXCL(pp)); 2595 page_unlock_noretire(pp); 2596 i--; 2597 } 2598 return (0); 2599 } 2600 } 2601 VM_STAT_ADD(vmm_vmstats.ptcpok[szc]); 2602 return (1); 2603 } 2604 2605 /* 2606 * Claim large page pointed to by 'pp'. 'pp' is the starting set 2607 * of 'szc' constituent pages that had been locked exclusively previously. 2608 * Will attempt to relocate constituent pages in use. 2609 */ 2610 static page_t * 2611 page_claim_contig_pages(page_t *pp, uchar_t szc, int flags) 2612 { 2613 spgcnt_t pgcnt, npgs, i; 2614 page_t *targpp, *rpp, *hpp; 2615 page_t *replpp = NULL; 2616 page_t *pplist = NULL; 2617 2618 ASSERT(pp != NULL); 2619 2620 pgcnt = page_get_pagecnt(szc); 2621 while (pgcnt) { 2622 ASSERT(PAGE_EXCL(pp)); 2623 ASSERT(!PP_ISNORELOC(pp)); 2624 if (PP_ISFREE(pp)) { 2625 /* 2626 * If this is a PG_FREE_LIST page then its 2627 * size code can change underneath us due to 2628 * page promotion or demotion. As an optimzation 2629 * use page_list_sub_pages() instead of 2630 * page_list_sub(). 2631 */ 2632 if (PP_ISAGED(pp)) { 2633 page_list_sub_pages(pp, szc); 2634 if (pp->p_szc == szc) { 2635 return (pp); 2636 } 2637 ASSERT(pp->p_szc < szc); 2638 npgs = page_get_pagecnt(pp->p_szc); 2639 hpp = pp; 2640 for (i = 0; i < npgs; i++, pp++) { 2641 pp->p_szc = szc; 2642 } 2643 page_list_concat(&pplist, &hpp); 2644 pgcnt -= npgs; 2645 continue; 2646 } 2647 ASSERT(!PP_ISAGED(pp)); 2648 ASSERT(pp->p_szc == 0); 2649 page_list_sub(pp, PG_CACHE_LIST); 2650 page_hashout(pp, NULL); 2651 PP_SETAGED(pp); 2652 pp->p_szc = szc; 2653 page_list_concat(&pplist, &pp); 2654 pp++; 2655 pgcnt--; 2656 continue; 2657 } 2658 npgs = page_get_pagecnt(pp->p_szc); 2659 2660 /* 2661 * page_create_wait freemem accounting done by caller of 2662 * page_get_freelist and not necessary to call it prior to 2663 * calling page_get_replacement_page. 2664 * 2665 * page_get_replacement_page can call page_get_contig_pages 2666 * to acquire a large page (szc > 0); the replacement must be 2667 * smaller than the contig page size to avoid looping or 2668 * szc == 0 and PGI_PGCPSZC0 is set. 2669 */ 2670 if (pp->p_szc < szc || (szc == 0 && (flags & PGI_PGCPSZC0))) { 2671 replpp = page_get_replacement_page(pp, NULL, 0); 2672 if (replpp) { 2673 npgs = page_get_pagecnt(pp->p_szc); 2674 ASSERT(npgs <= pgcnt); 2675 targpp = pp; 2676 } 2677 } 2678 2679 /* 2680 * If replacement is NULL or do_page_relocate fails, fail 2681 * coalescing of pages. 
2682 */ 2683 if (replpp == NULL || (do_page_relocate(&targpp, &replpp, 0, 2684 &npgs, NULL) != 0)) { 2685 /* 2686 * Unlock un-processed target list 2687 */ 2688 while (pgcnt--) { 2689 ASSERT(PAGE_EXCL(pp)); 2690 page_unlock_noretire(pp); 2691 pp++; 2692 } 2693 /* 2694 * Free the processed target list. 2695 */ 2696 while (pplist) { 2697 pp = pplist; 2698 page_sub(&pplist, pp); 2699 ASSERT(PAGE_EXCL(pp)); 2700 ASSERT(pp->p_szc == szc); 2701 ASSERT(PP_ISFREE(pp)); 2702 ASSERT(PP_ISAGED(pp)); 2703 pp->p_szc = 0; 2704 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL); 2705 page_unlock_noretire(pp); 2706 } 2707 2708 if (replpp != NULL) 2709 page_free_replacement_page(replpp); 2710 2711 return (NULL); 2712 } 2713 ASSERT(pp == targpp); 2714 2715 /* LINTED */ 2716 ASSERT(hpp = pp); /* That's right, it's an assignment */ 2717 2718 pp += npgs; 2719 pgcnt -= npgs; 2720 2721 while (npgs--) { 2722 ASSERT(PAGE_EXCL(targpp)); 2723 ASSERT(!PP_ISFREE(targpp)); 2724 ASSERT(!PP_ISNORELOC(targpp)); 2725 PP_SETFREE(targpp); 2726 ASSERT(PP_ISAGED(targpp)); 2727 ASSERT(targpp->p_szc < szc || (szc == 0 && 2728 (flags & PGI_PGCPSZC0))); 2729 targpp->p_szc = szc; 2730 targpp = targpp->p_next; 2731 2732 rpp = replpp; 2733 ASSERT(rpp != NULL); 2734 page_sub(&replpp, rpp); 2735 ASSERT(PAGE_EXCL(rpp)); 2736 ASSERT(!PP_ISFREE(rpp)); 2737 page_unlock_noretire(rpp); 2738 } 2739 ASSERT(targpp == hpp); 2740 ASSERT(replpp == NULL); 2741 page_list_concat(&pplist, &targpp); 2742 } 2743 CHK_LPG(pplist, szc); 2744 return (pplist); 2745 } 2746 2747 /* 2748 * Trim kernel cage from pfnlo-pfnhi and store result in lo-hi. Return code 2749 * of 0 means nothing left after trim. 2750 */ 2751 2752 int 2753 trimkcage(struct memseg *mseg, pfn_t *lo, pfn_t *hi, pfn_t pfnlo, pfn_t pfnhi) 2754 { 2755 pfn_t kcagepfn; 2756 int decr; 2757 int rc = 0; 2758 2759 if (PP_ISNORELOC(mseg->pages)) { 2760 if (PP_ISNORELOC(mseg->epages - 1) == 0) { 2761 2762 /* lower part of this mseg inside kernel cage */ 2763 decr = kcage_current_pfn(&kcagepfn); 2764 2765 /* kernel cage may have transitioned past mseg */ 2766 if (kcagepfn >= mseg->pages_base && 2767 kcagepfn < mseg->pages_end) { 2768 ASSERT(decr == 0); 2769 *lo = kcagepfn; 2770 *hi = MIN(pfnhi, 2771 (mseg->pages_end - 1)); 2772 rc = 1; 2773 } 2774 } 2775 /* else entire mseg in the cage */ 2776 } else { 2777 if (PP_ISNORELOC(mseg->epages - 1)) { 2778 2779 /* upper part of this mseg inside kernel cage */ 2780 decr = kcage_current_pfn(&kcagepfn); 2781 2782 /* kernel cage may have transitioned past mseg */ 2783 if (kcagepfn >= mseg->pages_base && 2784 kcagepfn < mseg->pages_end) { 2785 ASSERT(decr); 2786 *hi = kcagepfn; 2787 *lo = MAX(pfnlo, mseg->pages_base); 2788 rc = 1; 2789 } 2790 } else { 2791 /* entire mseg outside of kernel cage */ 2792 *lo = MAX(pfnlo, mseg->pages_base); 2793 *hi = MIN(pfnhi, (mseg->pages_end - 1)); 2794 rc = 1; 2795 } 2796 } 2797 return (rc); 2798 } 2799 2800 /* 2801 * called from page_get_contig_pages to search 'pfnlo' thru 'pfnhi' to "claim" a 2802 * page with size code 'szc'. Claiming such a page requires acquiring 2803 * exclusive locks on all constituent pages (page_trylock_contig_pages), 2804 * relocating pages in use and concatenating these constituent pages into a 2805 * large page. 2806 * 2807 * The page lists do not have such a large page and page_freelist_fill has 2808 * already failed to demote larger pages and/or coalesce smaller free pages. 2809 * 2810 * 'flags' may specify PG_COLOR_MATCH which would limit the search of large 2811 * pages with the same color as 'bin'. 
2812 * 2813 * 'pfnflag' specifies the subset of the pfn range to search. 2814 */ 2815 2816 2817 static page_t * 2818 page_geti_contig_pages(int mnode, uint_t bin, uchar_t szc, int flags, 2819 pfn_t pfnlo, pfn_t pfnhi, pgcnt_t pfnflag) 2820 { 2821 struct memseg *mseg; 2822 pgcnt_t szcpgcnt = page_get_pagecnt(szc); 2823 pgcnt_t szcpgmask = szcpgcnt - 1; 2824 pfn_t randpfn; 2825 page_t *pp, *randpp, *endpp; 2826 uint_t colors; 2827 pfn_t hi, lo; 2828 uint_t skip; 2829 2830 ASSERT(szc != 0 || (flags & PGI_PGCPSZC0)); 2831 2832 if ((pfnhi - pfnlo) + 1 < szcpgcnt) 2833 return (NULL); 2834 2835 ASSERT(szc < mmu_page_sizes); 2836 2837 colors = (szc) ? page_convert_color(0, szc, page_colors - 1) + 1 : 2838 page_colors; 2839 2840 ASSERT(bin < colors); 2841 2842 /* 2843 * trim the pfn range to search based on pfnflag. pfnflag is set 2844 * when there have been previous page_get_contig_page failures to 2845 * limit the search. 2846 * 2847 * The high bit in pfnflag specifies the number of 'slots' in the 2848 * pfn range and the remainder of pfnflag specifies which slot. 2849 * For example, a value of 1010b would mean the second slot of 2850 * the pfn range that has been divided into 8 slots. 2851 */ 2852 if (pfnflag > 1) { 2853 int slots = 1 << (highbit(pfnflag) - 1); 2854 int slotid = pfnflag & (slots - 1); 2855 pgcnt_t szcpages; 2856 int slotlen; 2857 2858 pfnlo = P2ROUNDUP(pfnlo, szcpgcnt); 2859 pfnhi = pfnhi & ~(szcpgcnt - 1); 2860 2861 szcpages = ((pfnhi - pfnlo) + 1) / szcpgcnt; 2862 slotlen = howmany(szcpages, slots); 2863 pfnlo = pfnlo + (((slotid * slotlen) % szcpages) * szcpgcnt); 2864 ASSERT(pfnlo < pfnhi); 2865 if (pfnhi > pfnlo + (slotlen * szcpgcnt)) 2866 pfnhi = pfnlo + (slotlen * szcpgcnt); 2867 } 2868 2869 memsegs_lock(0); 2870 2871 /* 2872 * loop through memsegs to look for contig page candidates 2873 */ 2874 2875 for (mseg = memsegs; mseg != NULL; mseg = mseg->next) { 2876 if (pfnhi < mseg->pages_base || pfnlo >= mseg->pages_end) { 2877 /* no overlap */ 2878 continue; 2879 } 2880 2881 if (mseg->pages_end - mseg->pages_base < szcpgcnt) 2882 /* mseg too small */ 2883 continue; 2884 2885 /* trim off kernel cage pages from pfn range */ 2886 if (kcage_on) { 2887 if (trimkcage(mseg, &lo, &hi, pfnlo, pfnhi) == 0) 2888 continue; 2889 } else { 2890 lo = MAX(pfnlo, mseg->pages_base); 2891 hi = MIN(pfnhi, (mseg->pages_end - 1)); 2892 } 2893 2894 /* round to szcpgcnt boundaries */ 2895 lo = P2ROUNDUP(lo, szcpgcnt); 2896 hi = hi & ~(szcpgcnt - 1); 2897 2898 if (hi <= lo) 2899 continue; 2900 2901 /* 2902 * set lo to point to the pfn for the desired bin. Large 2903 * page sizes may only have a single page color 2904 */ 2905 if ((colors > 1) && (flags & PG_MATCH_COLOR)) { 2906 uint_t lobin; 2907 2908 /* 2909 * factor in colorequiv to check additional 2910 * 'equivalent' bins. 2911 */ 2912 if (colorequiv > 1 && colors > colorequiv) 2913 colors = colors / colorequiv; 2914 2915 /* determine bin that lo currently points to */ 2916 lobin = (lo & ((szcpgcnt * colors) - 1)) / szcpgcnt; 2917 2918 /* 2919 * set lo to point at appropriate color and set skip 2920 * to arrive at the next szc page of the same color. 
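 *
 * Worked example (hypothetical sizes): with szcpgcnt == 8 and
 * colors == 4, a lo of pfn 8 gives lobin == 1; for a requested bin
 * of 3, lo advances by ((3 - 1) & 3) * 8 == 16 pfns to pfn 24, which
 * lies in bin 3, and skip becomes 4 * 8 == 32 so every pfn visited
 * stays in that bin.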
2921 */ 2922 lo += ((bin - lobin) & (colors - 1)) * szcpgcnt; 2923 2924 skip = colors * szcpgcnt; 2925 } else { 2926 /* check all pages starting from lo */ 2927 skip = szcpgcnt; 2928 } 2929 if (hi <= lo) 2930 /* mseg cannot satisfy color request */ 2931 continue; 2932 2933 /* randomly choose a point between lo and hi to begin search */ 2934 2935 randpfn = (pfn_t)GETTICK(); 2936 randpfn = ((randpfn % (hi - lo)) + lo) & ~(skip - 1); 2937 randpp = mseg->pages + (randpfn - mseg->pages_base); 2938 2939 ASSERT(randpp->p_pagenum == randpfn); 2940 2941 pp = randpp; 2942 endpp = mseg->pages + (hi - mseg->pages_base); 2943 2944 ASSERT(randpp + szcpgcnt <= endpp); 2945 2946 do { 2947 ASSERT(!(pp->p_pagenum & szcpgmask)); 2948 ASSERT((flags & PG_MATCH_COLOR) == 0 || 2949 colorequiv > 1 || 2950 PP_2_BIN(pp) == bin); 2951 if (page_trylock_contig_pages(mnode, pp, szc, flags)) { 2952 /* pages unlocked by page_claim on failure */ 2953 if (page_claim_contig_pages(pp, szc, flags)) { 2954 memsegs_unlock(0); 2955 return (pp); 2956 } 2957 } 2958 2959 pp += skip; 2960 if (pp >= endpp) { 2961 /* start from the beginning */ 2962 pp = mseg->pages + (lo - mseg->pages_base); 2963 ASSERT(pp->p_pagenum == lo); 2964 ASSERT(pp + szcpgcnt <= endpp); 2965 } 2966 } while (pp != randpp); 2967 } 2968 memsegs_unlock(0); 2969 return (NULL); 2970 } 2971 2972 2973 /* 2974 * controlling routine that searches through physical memory in an attempt to 2975 * claim a large page based on the input parameters when no page of that 2976 * size is available on the page free lists. 2977 * 2978 * calls page_geti_contig_pages with an initial pfn range from the mnode 2979 * and mtype. page_geti_contig_pages will trim off the parts of the pfn range 2980 * that overlap with the kernel cage or does not match the requested page 2981 * color if PG_MATCH_COLOR is set. Since this search is very expensive, 2982 * page_geti_contig_pages may further limit the search range based on 2983 * previous failure counts (pgcpfailcnt[]). 2984 * 2985 * for PGI_PGCPSZC0 requests, page_get_contig_pages will relocate a base 2986 * pagesize page that satisfies mtype.
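 *
 * For example (hypothetical value): if pgcpfailcnt[szc] is 1010b, the
 * pfn range passed down is divided into 8 slots and only the slot
 * selected by the low-order bits is searched; each successful claim
 * halves pgcpfailcnt[szc], doubling the slot searched next time.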
2987 */ 2988 page_t * 2989 page_get_contig_pages(int mnode, uint_t bin, int mtype, uchar_t szc, 2990 uint_t flags) 2991 { 2992 pfn_t pfnlo, pfnhi; /* contig pages pfn range */ 2993 page_t *pp; 2994 pgcnt_t pfnflag = 0; /* no limit on search if 0 */ 2995 2996 VM_STAT_ADD(vmm_vmstats.pgcp_alloc[szc]); 2997 2998 /* LINTED */ 2999 MTYPE_START(mnode, mtype, flags); 3000 if (mtype < 0) { /* mnode does not have memory in mtype range */ 3001 VM_STAT_ADD(vmm_vmstats.pgcp_allocempty[szc]); 3002 return (NULL); 3003 } 3004 3005 ASSERT(szc > 0 || (flags & PGI_PGCPSZC0)); 3006 3007 /* no allocations from cage */ 3008 flags |= PGI_NOCAGE; 3009 3010 /* do not limit search and ignore color if hi pri */ 3011 3012 if (pgcplimitsearch && ((flags & PGI_PGCPHIPRI) == 0)) 3013 pfnflag = pgcpfailcnt[szc]; 3014 3015 /* remove color match to improve chances */ 3016 3017 if (flags & PGI_PGCPHIPRI || pfnflag) 3018 flags &= ~PG_MATCH_COLOR; 3019 3020 do { 3021 /* get pfn range based on mnode and mtype */ 3022 MNODETYPE_2_PFN(mnode, mtype, pfnlo, pfnhi); 3023 3024 ASSERT(pfnhi >= pfnlo); 3025 3026 pp = page_geti_contig_pages(mnode, bin, szc, flags, 3027 pfnlo, pfnhi, pfnflag); 3028 3029 if (pp != NULL) { 3030 pfnflag = pgcpfailcnt[szc]; 3031 if (pfnflag) { 3032 /* double the search size */ 3033 pgcpfailcnt[szc] = pfnflag >> 1; 3034 } 3035 VM_STAT_ADD(vmm_vmstats.pgcp_allocok[szc]); 3036 return (pp); 3037 } 3038 MTYPE_NEXT(mnode, mtype, flags); 3039 } while (mtype >= 0); 3040 3041 VM_STAT_ADD(vmm_vmstats.pgcp_allocfailed[szc]); 3042 return (NULL); 3043 } 3044 3045 3046 /* 3047 * Find the `best' page on the freelist for this (vp,off) (as,vaddr) pair. 3048 * 3049 * Does its own locking and accounting. 3050 * If PG_MATCH_COLOR is set, then NULL will be returned if there are no 3051 * pages of the proper color even if there are pages of a different color. 3052 * 3053 * Finds a page, removes it, THEN locks it. 3054 */ 3055 3056 /*ARGSUSED*/ 3057 page_t * 3058 page_get_freelist(struct vnode *vp, u_offset_t off, struct seg *seg, 3059 caddr_t vaddr, size_t size, uint_t flags, struct lgrp *lgrp) 3060 { 3061 struct as *as = seg->s_as; 3062 page_t *pp = NULL; 3063 ulong_t bin; 3064 uchar_t szc; 3065 int mnode; 3066 int mtype; 3067 page_t *(*page_get_func)(int, uint_t, int, uchar_t, uint_t); 3068 lgrp_mnode_cookie_t lgrp_cookie; 3069 3070 page_get_func = page_get_mnode_freelist; 3071 3072 /* 3073 * If we aren't passed a specific lgroup, or passed a freed lgrp 3074 * assume we wish to allocate near to the current thread's home. 3075 */ 3076 if (!LGRP_EXISTS(lgrp)) 3077 lgrp = lgrp_home_lgrp(); 3078 3079 if (kcage_on) { 3080 if ((flags & (PG_NORELOC | PG_PANIC)) == PG_NORELOC && 3081 kcage_freemem < kcage_throttlefree + btop(size) && 3082 curthread != kcage_cageout_thread) { 3083 /* 3084 * Set a "reserve" of kcage_throttlefree pages for 3085 * PG_PANIC and cageout thread allocations. 3086 * 3087 * Everybody else has to serialize in 3088 * page_create_get_something() to get a cage page, so 3089 * that we don't deadlock cageout! 3090 */ 3091 return (NULL); 3092 } 3093 } else { 3094 flags &= ~PG_NORELOC; 3095 flags |= PGI_NOCAGE; 3096 } 3097 3098 /* LINTED */ 3099 MTYPE_INIT(mtype, vp, vaddr, flags); 3100 3101 /* 3102 * Convert size to page size code. 
3103 */ 3104 if ((szc = page_szc(size)) == (uchar_t)-1) 3105 panic("page_get_freelist: illegal page size request"); 3106 ASSERT(szc < mmu_page_sizes); 3107 3108 VM_STAT_ADD(vmm_vmstats.pgf_alloc[szc]); 3109 3110 /* LINTED */ 3111 AS_2_BIN(as, seg, vp, vaddr, bin); 3112 3113 /* bin is for base pagesize color - convert if larger pagesize. */ 3114 if (szc) 3115 bin = page_convert_color(0, szc, bin); 3116 3117 /* 3118 * Try to get a local page first, but try remote if we can't 3119 * get a page of the right color. 3120 */ 3121 pgretry: 3122 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL); 3123 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { 3124 pp = page_get_func(mnode, bin, mtype, szc, flags); 3125 if (pp != NULL) { 3126 VM_STAT_ADD(vmm_vmstats.pgf_allocok[szc]); 3127 DTRACE_PROBE4(page__get, 3128 lgrp_t *, lgrp, 3129 int, mnode, 3130 ulong_t, bin, 3131 uint_t, flags); 3132 return (pp); 3133 } 3134 } 3135 ASSERT(pp == NULL); 3136 3137 /* 3138 * for non-SZC0 PAGESIZE requests, check cachelist before checking 3139 * remote free lists. Caller expected to call page_get_cachelist which 3140 * will check local cache lists and remote free lists. 3141 */ 3142 if (szc == 0 && ((flags & PGI_PGCPSZC0) == 0)) { 3143 VM_STAT_ADD(vmm_vmstats.pgf_allocdeferred); 3144 return (NULL); 3145 } 3146 3147 ASSERT(szc > 0 || (flags & PGI_PGCPSZC0)); 3148 3149 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1); 3150 3151 /* 3152 * Try to get a non-local freelist page. 3153 */ 3154 LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie); 3155 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { 3156 pp = page_get_func(mnode, bin, mtype, szc, flags); 3157 if (pp != NULL) { 3158 DTRACE_PROBE4(page__get, 3159 lgrp_t *, lgrp, 3160 int, mnode, 3161 ulong_t, bin, 3162 uint_t, flags); 3163 VM_STAT_ADD(vmm_vmstats.pgf_allocokrem[szc]); 3164 return (pp); 3165 } 3166 } 3167 3168 ASSERT(pp == NULL); 3169 3170 /* 3171 * When the cage is off, chances are page_get_contig_pages() will fail 3172 * to lock a large page chunk, so by default it is not called when the 3173 * cage is off. This can be changed via /etc/system. 3174 * 3175 * page_get_contig_pages() is also called to acquire a base pagesize page 3176 * for page_create_get_something(). 3177 */ 3178 if (!(flags & PG_NORELOC) && (pg_contig_disable == 0) && 3179 (kcage_on || pg_lpgcreate_nocage || szc == 0) && 3180 (page_get_func != page_get_contig_pages)) { 3181 3182 VM_STAT_ADD(vmm_vmstats.pgf_allocretry[szc]); 3183 page_get_func = page_get_contig_pages; 3184 goto pgretry; 3185 } 3186 3187 if (pgcplimitsearch && page_get_func == page_get_contig_pages) 3188 SETPGCPFAILCNT(szc); 3189 3190 VM_STAT_ADD(vmm_vmstats.pgf_allocfailed[szc]); 3191 return (NULL); 3192 } 3193 3194 /* 3195 * Find the `best' page on the cachelist for this (vp,off) (as,vaddr) pair. 3196 * 3197 * Does its own locking. 3198 * If PG_MATCH_COLOR is set, then NULL will be returned if there are no 3199 * pages of the proper color even if there are pages of a different color. 3200 * Otherwise, scan the bins for ones with pages. For each bin with pages, 3201 * try to lock one of them. If no page can be locked, try the 3202 * next bin. Return NULL if a page cannot be found and locked. 3203 * 3204 * Finds a page, tries to lock it, then removes it.
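 *
 * A hypothetical caller sketch (illustrative only, not compiled; the
 * variable names mirror the parameters below):
 */
#if 0
	/* PAGESIZE request: freelists first, then fall back to cachelists */
	pp = page_get_freelist(vp, off, seg, vaddr, PAGESIZE, flags, lgrp);
	if (pp == NULL)
		pp = page_get_cachelist(vp, off, seg, vaddr, flags, lgrp);
#endif
/*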
3205 */ 3206 3207 /*ARGSUSED*/ 3208 page_t * 3209 page_get_cachelist(struct vnode *vp, u_offset_t off, struct seg *seg, 3210 caddr_t vaddr, uint_t flags, struct lgrp *lgrp) 3211 { 3212 page_t *pp; 3213 struct as *as = seg->s_as; 3214 ulong_t bin; 3215 /*LINTED*/ 3216 int mnode; 3217 int mtype; 3218 lgrp_mnode_cookie_t lgrp_cookie; 3219 3220 /* 3221 * If we aren't passed a specific lgroup, or pasased a freed lgrp 3222 * assume we wish to allocate near to the current thread's home. 3223 */ 3224 if (!LGRP_EXISTS(lgrp)) 3225 lgrp = lgrp_home_lgrp(); 3226 3227 if (!kcage_on) { 3228 flags &= ~PG_NORELOC; 3229 flags |= PGI_NOCAGE; 3230 } 3231 3232 if ((flags & (PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == PG_NORELOC && 3233 kcage_freemem <= kcage_throttlefree) { 3234 /* 3235 * Reserve kcage_throttlefree pages for critical kernel 3236 * threads. 3237 * 3238 * Everybody else has to go to page_create_get_something() 3239 * to get a cage page, so we don't deadlock cageout. 3240 */ 3241 return (NULL); 3242 } 3243 3244 /* LINTED */ 3245 AS_2_BIN(as, seg, vp, vaddr, bin); 3246 3247 ASSERT(bin <= page_colors_mask); 3248 3249 /* LINTED */ 3250 MTYPE_INIT(mtype, vp, vaddr, flags); 3251 3252 VM_STAT_ADD(vmm_vmstats.pgc_alloc); 3253 3254 /* 3255 * Try local cachelists first 3256 */ 3257 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL); 3258 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { 3259 pp = page_get_mnode_cachelist(bin, flags, mnode, mtype); 3260 if (pp != NULL) { 3261 VM_STAT_ADD(vmm_vmstats.pgc_allocok); 3262 DTRACE_PROBE4(page__get, 3263 lgrp_t *, lgrp, 3264 int, mnode, 3265 ulong_t, bin, 3266 uint_t, flags); 3267 return (pp); 3268 } 3269 } 3270 3271 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1); 3272 3273 /* 3274 * Try freelists/cachelists that are farther away 3275 * This is our only chance to allocate remote pages for PAGESIZE 3276 * requests. 3277 */ 3278 LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie); 3279 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { 3280 pp = page_get_mnode_freelist(mnode, bin, mtype, 3281 0, flags); 3282 if (pp != NULL) { 3283 VM_STAT_ADD(vmm_vmstats.pgc_allocokdeferred); 3284 DTRACE_PROBE4(page__get, 3285 lgrp_t *, lgrp, 3286 int, mnode, 3287 ulong_t, bin, 3288 uint_t, flags); 3289 return (pp); 3290 } 3291 pp = page_get_mnode_cachelist(bin, flags, mnode, mtype); 3292 if (pp != NULL) { 3293 VM_STAT_ADD(vmm_vmstats.pgc_allocokrem); 3294 DTRACE_PROBE4(page__get, 3295 lgrp_t *, lgrp, 3296 int, mnode, 3297 ulong_t, bin, 3298 uint_t, flags); 3299 return (pp); 3300 } 3301 } 3302 3303 VM_STAT_ADD(vmm_vmstats.pgc_allocfailed); 3304 return (NULL); 3305 } 3306 3307 page_t * 3308 page_get_mnode_cachelist(uint_t bin, uint_t flags, int mnode, int mtype) 3309 { 3310 kmutex_t *pcm; 3311 int i; 3312 page_t *pp; 3313 page_t *first_pp; 3314 uint_t bin_marker; 3315 int nwaybins, nwaycnt; 3316 int cpucolors; 3317 3318 VM_STAT_ADD(vmm_vmstats.pgmc_alloc); 3319 3320 /* LINTED */ 3321 MTYPE_START(mnode, mtype, flags); 3322 if (mtype < 0) { /* mnode does not have memory in mtype range */ 3323 VM_STAT_ADD(vmm_vmstats.pgmc_allocempty); 3324 return (NULL); 3325 } 3326 3327 nwaybins = 0; 3328 cpucolors = cpu_page_colors; 3329 /* 3330 * adjust cpucolors to possibly check additional 'equivalent' bins 3331 * to try to minimize fragmentation of large pages by delaying calls 3332 * to page_freelist_fill. 
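 *
 * Worked example (hypothetical sizes): with page_colors == 64 and
 * colorequiv == 4, equivcolors == 16, so cpucolors can drop to 16 and
 * the color-match logic below then treats 64 / 16 == 4 bins as
 * equivalent before giving up on the requested color.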
3333 */ 3334 if (colorequiv > 1) { 3335 int equivcolors = page_colors / colorequiv; 3336 3337 if (equivcolors && (cpucolors == 0 || equivcolors < cpucolors)) 3338 cpucolors = equivcolors; 3339 } 3340 3341 /* 3342 * Only hold one cachelist lock at a time, that way we 3343 * can start anywhere and not have to worry about lock 3344 * ordering. 3345 */ 3346 3347 big_try_again: 3348 nwaycnt = 0; 3349 for (i = 0; i <= page_colors; i++) { 3350 if (PAGE_CACHELISTS(mnode, bin, mtype)) { 3351 pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST); 3352 mutex_enter(pcm); 3353 pp = PAGE_CACHELISTS(mnode, bin, mtype); 3354 if (pp != NULL) { 3355 first_pp = pp; 3356 ASSERT(pp->p_vnode); 3357 ASSERT(PP_ISAGED(pp) == 0); 3358 ASSERT(pp->p_szc == 0); 3359 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode); 3360 while (!page_trylock(pp, SE_EXCL)) { 3361 pp = pp->p_next; 3362 ASSERT(pp->p_szc == 0); 3363 if (pp == first_pp) { 3364 /* 3365 * We have searched the 3366 * complete list! 3367 * And all of them (might 3368 * only be one) are locked. 3369 * This can happen since 3370 * these pages can also be 3371 * found via the hash list. 3372 * When found via the hash 3373 * list, they are locked 3374 * first, then removed. 3375 * We give up to let the 3376 * other thread run. 3377 */ 3378 pp = NULL; 3379 break; 3380 } 3381 ASSERT(pp->p_vnode); 3382 ASSERT(PP_ISFREE(pp)); 3383 ASSERT(PP_ISAGED(pp) == 0); 3384 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == 3385 mnode); 3386 } 3387 3388 if (pp) { 3389 page_t **ppp; 3390 /* 3391 * Found and locked a page. 3392 * Pull it off the list. 3393 */ 3394 ASSERT(mtype == PP_2_MTYPE(pp)); 3395 ppp = &PAGE_CACHELISTS(mnode, bin, 3396 mtype); 3397 page_sub(ppp, pp); 3398 /* 3399 * Subtract counters before releasing 3400 * pcm mutex to avoid a race with 3401 * page_freelist_coalesce and 3402 * page_freelist_fill. 3403 */ 3404 page_ctr_sub(mnode, mtype, pp, 3405 PG_CACHE_LIST); 3406 mutex_exit(pcm); 3407 ASSERT(pp->p_vnode); 3408 ASSERT(PP_ISAGED(pp) == 0); 3409 #if defined(__sparc) 3410 ASSERT(!kcage_on || 3411 (flags & PG_NORELOC) == 0 || 3412 PP_ISNORELOC(pp)); 3413 if (PP_ISNORELOC(pp)) { 3414 kcage_freemem_sub(1); 3415 } 3416 #endif 3417 VM_STAT_ADD(vmm_vmstats. 3418 pgmc_allocok); 3419 return (pp); 3420 } 3421 } 3422 mutex_exit(pcm); 3423 } 3424 3425 /* 3426 * Wow! The initial bin is empty or no page in the bin could 3427 * be locked. 3428 * 3429 * If specific color is needed, check if page color may be in 3430 * other bins. 
3431 */ 3432 if ((flags & PG_MATCH_COLOR) && (cpucolors != 0)) { 3433 if (!nwaybins) { 3434 if (cpucolors < 0) { 3435 cpucolors = CPUSETSIZE() / MMU_PAGESIZE; 3436 ASSERT(cpucolors > 0); 3437 nwaybins = page_colors / cpucolors; 3438 if (nwaybins < 2) 3439 cpucolors = 0; 3440 } else { 3441 nwaybins = page_colors / cpucolors; 3442 ASSERT(nwaybins > 1); 3443 } 3444 } 3445 3446 if (++nwaycnt >= nwaybins) { 3447 break; 3448 } 3449 bin = (bin + (page_colors / nwaybins)) & 3450 page_colors_mask; 3451 continue; 3452 } 3453 3454 if (i == 0) { 3455 bin = (bin + BIN_STEP) & page_colors_mask; 3456 bin_marker = bin; 3457 } else { 3458 bin = (bin + vac_colors) & page_colors_mask; 3459 if (bin == bin_marker) { 3460 bin = (bin + 1) & page_colors_mask; 3461 bin_marker = bin; 3462 } 3463 } 3464 } 3465 3466 MTYPE_NEXT(mnode, mtype, flags); 3467 if (mtype >= 0) 3468 goto big_try_again; 3469 3470 VM_STAT_ADD(vmm_vmstats.pgmc_allocfailed); 3471 return (NULL); 3472 } 3473 3474 #ifdef DEBUG 3475 #define REPL_PAGE_STATS 3476 #endif /* DEBUG */ 3477 3478 #ifdef REPL_PAGE_STATS 3479 struct repl_page_stats { 3480 uint_t ngets; 3481 uint_t ngets_noreloc; 3482 uint_t npgr_noreloc; 3483 uint_t nnopage_first; 3484 uint_t nnopage; 3485 uint_t nhashout; 3486 uint_t nnofree; 3487 uint_t nnext_pp; 3488 } repl_page_stats; 3489 #define REPL_STAT_INCR(v) atomic_add_32(&repl_page_stats.v, 1) 3490 #else /* REPL_PAGE_STATS */ 3491 #define REPL_STAT_INCR(v) 3492 #endif /* REPL_PAGE_STATS */ 3493 3494 int pgrppgcp; 3495 3496 /* 3497 * The freemem accounting must be done by the caller. 3498 * First we try to get a replacement page of the same size as like_pp, 3499 * if that is not possible, then we just get a set of discontiguous 3500 * PAGESIZE pages. 3501 */ 3502 page_t * 3503 page_get_replacement_page(page_t *orig_like_pp, struct lgrp *lgrp_target, 3504 uint_t pgrflags) 3505 { 3506 page_t *like_pp; 3507 page_t *pp, *pplist; 3508 page_t *pl = NULL; 3509 ulong_t bin; 3510 int mnode, page_mnode; 3511 int szc; 3512 spgcnt_t npgs, pg_cnt; 3513 pfn_t pfnum; 3514 int mtype; 3515 int flags = 0; 3516 lgrp_mnode_cookie_t lgrp_cookie; 3517 lgrp_t *lgrp; 3518 3519 REPL_STAT_INCR(ngets); 3520 like_pp = orig_like_pp; 3521 ASSERT(PAGE_EXCL(like_pp)); 3522 3523 szc = like_pp->p_szc; 3524 npgs = page_get_pagecnt(szc); 3525 /* 3526 * Now we reset like_pp to the base page_t. 3527 * That way, we won't walk past the end of this 'szc' page. 3528 */ 3529 pfnum = PFN_BASE(like_pp->p_pagenum, szc); 3530 like_pp = page_numtopp_nolock(pfnum); 3531 ASSERT(like_pp->p_szc == szc); 3532 3533 if (PP_ISNORELOC(like_pp)) { 3534 ASSERT(kcage_on); 3535 REPL_STAT_INCR(ngets_noreloc); 3536 flags = PGI_RELOCONLY; 3537 } else if (pgrflags & PGR_NORELOC) { 3538 ASSERT(kcage_on); 3539 REPL_STAT_INCR(npgr_noreloc); 3540 flags = PG_NORELOC; 3541 } 3542 3543 /* 3544 * Kernel pages must always be replaced with the same size 3545 * pages, since we cannot properly handle demotion of kernel 3546 * pages. 3547 */ 3548 if (like_pp->p_vnode == &kvp) 3549 pgrflags |= PGR_SAMESZC; 3550 3551 /* LINTED */ 3552 MTYPE_PGR_INIT(mtype, flags, like_pp, page_mnode); 3553 3554 while (npgs) { 3555 pplist = NULL; 3556 for (;;) { 3557 pg_cnt = page_get_pagecnt(szc); 3558 bin = PP_2_BIN(like_pp); 3559 ASSERT(like_pp->p_szc == orig_like_pp->p_szc); 3560 ASSERT(pg_cnt <= npgs); 3561 3562 /* 3563 * If an lgroup was specified, try to get the 3564 * page from that lgroup. 
3565 * NOTE: Must be careful with code below because 3566 * lgroup may disappear and reappear since there 3567 * is no locking for lgroup here. 3568 */ 3569 if (LGRP_EXISTS(lgrp_target)) { 3570 /* 3571 * Keep local variable for lgroup separate 3572 * from lgroup argument since this code should 3573 * only be exercised when lgroup argument 3574 * exists.... 3575 */ 3576 lgrp = lgrp_target; 3577 3578 /* Try the lgroup's freelists first */ 3579 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, 3580 LGRP_SRCH_LOCAL); 3581 while ((pplist == NULL) && 3582 (mnode = lgrp_memnode_choose(&lgrp_cookie)) 3583 != -1) { 3584 pplist = page_get_mnode_freelist( 3585 mnode, bin, mtype, szc, 3586 flags); 3587 } 3588 3589 /* 3590 * Now try it's cachelists if this is a 3591 * small page. Don't need to do it for 3592 * larger ones since page_freelist_coalesce() 3593 * already failed. 3594 */ 3595 if (pplist != NULL || szc != 0) 3596 break; 3597 3598 /* Now try it's cachelists */ 3599 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, 3600 LGRP_SRCH_LOCAL); 3601 3602 while ((pplist == NULL) && 3603 (mnode = lgrp_memnode_choose(&lgrp_cookie)) 3604 != -1) { 3605 pplist = page_get_mnode_cachelist( 3606 bin, flags, mnode, mtype); 3607 } 3608 if (pplist != NULL) { 3609 page_hashout(pplist, NULL); 3610 PP_SETAGED(pplist); 3611 REPL_STAT_INCR(nhashout); 3612 break; 3613 } 3614 /* Done looking in this lgroup. Bail out. */ 3615 break; 3616 } 3617 3618 /* 3619 * No lgroup was specified (or lgroup was removed by 3620 * DR, so just try to get the page as close to 3621 * like_pp's mnode as possible. 3622 * First try the local freelist... 3623 */ 3624 mnode = PP_2_MEM_NODE(like_pp); 3625 pplist = page_get_mnode_freelist(mnode, bin, 3626 mtype, szc, flags); 3627 if (pplist != NULL) 3628 break; 3629 3630 REPL_STAT_INCR(nnofree); 3631 3632 /* 3633 * ...then the local cachelist. Don't need to do it for 3634 * larger pages cause page_freelist_coalesce() already 3635 * failed there anyway. 3636 */ 3637 if (szc == 0) { 3638 pplist = page_get_mnode_cachelist(bin, flags, 3639 mnode, mtype); 3640 if (pplist != NULL) { 3641 page_hashout(pplist, NULL); 3642 PP_SETAGED(pplist); 3643 REPL_STAT_INCR(nhashout); 3644 break; 3645 } 3646 } 3647 3648 /* Now try remote freelists */ 3649 page_mnode = mnode; 3650 lgrp = 3651 lgrp_hand_to_lgrp(MEM_NODE_2_LGRPHAND(page_mnode)); 3652 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, 3653 LGRP_SRCH_HIER); 3654 while (pplist == NULL && 3655 (mnode = lgrp_memnode_choose(&lgrp_cookie)) 3656 != -1) { 3657 /* 3658 * Skip local mnode. 3659 */ 3660 if ((mnode == page_mnode) || 3661 (mem_node_config[mnode].exists == 0)) 3662 continue; 3663 3664 pplist = page_get_mnode_freelist(mnode, 3665 bin, mtype, szc, flags); 3666 } 3667 3668 if (pplist != NULL) 3669 break; 3670 3671 3672 /* Now try remote cachelists */ 3673 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, 3674 LGRP_SRCH_HIER); 3675 while (pplist == NULL && szc == 0) { 3676 mnode = lgrp_memnode_choose(&lgrp_cookie); 3677 if (mnode == -1) 3678 break; 3679 /* 3680 * Skip local mnode. 3681 */ 3682 if ((mnode == page_mnode) || 3683 (mem_node_config[mnode].exists == 0)) 3684 continue; 3685 3686 pplist = page_get_mnode_cachelist(bin, 3687 flags, mnode, mtype); 3688 3689 if (pplist != NULL) { 3690 page_hashout(pplist, NULL); 3691 PP_SETAGED(pplist); 3692 REPL_STAT_INCR(nhashout); 3693 break; 3694 } 3695 } 3696 3697 /* 3698 * Break out of while loop under the following cases: 3699 * - If we successfully got a page. 
3700 * - If pgrflags specified only returning a specific 3701 * page size and we could not find that page size. 3702 * - If we could not satisfy the request with PAGESIZE 3703 * or larger pages. 3704 */ 3705 if (pplist != NULL || szc == 0) 3706 break; 3707 3708 if ((pgrflags & PGR_SAMESZC) || pgrppgcp) { 3709 /* try to find contig page */ 3710 3711 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, 3712 LGRP_SRCH_HIER); 3713 3714 while ((pplist == NULL) && 3715 (mnode = 3716 lgrp_memnode_choose(&lgrp_cookie)) 3717 != -1) { 3718 pplist = page_get_contig_pages( 3719 mnode, bin, mtype, szc, 3720 flags | PGI_PGCPHIPRI); 3721 } 3722 break; 3723 } 3724 3725 /* 3726 * The correct thing to do here is try the next 3727 * page size down using szc--. Due to a bug 3728 * with the processing of HAT_RELOAD_SHARE 3729 * where the sfmmu_ttecnt arrays of all 3730 * hats sharing an ISM segment don't get updated, 3731 * using intermediate size pages for relocation 3732 * can lead to continuous page faults. 3733 */ 3734 szc = 0; 3735 } 3736 3737 if (pplist != NULL) { 3738 DTRACE_PROBE4(page__get, 3739 lgrp_t *, lgrp, 3740 int, mnode, 3741 ulong_t, bin, 3742 uint_t, flags); 3743 3744 while (pplist != NULL && pg_cnt--) { 3745 ASSERT(pplist != NULL); 3746 pp = pplist; 3747 page_sub(&pplist, pp); 3748 PP_CLRFREE(pp); 3749 PP_CLRAGED(pp); 3750 page_list_concat(&pl, &pp); 3751 npgs--; 3752 like_pp = like_pp + 1; 3753 REPL_STAT_INCR(nnext_pp); 3754 } 3755 ASSERT(pg_cnt == 0); 3756 } else { 3757 break; 3758 } 3759 } 3760 3761 if (npgs) { 3762 /* 3763 * We were unable to allocate the necessary number 3764 * of pages. 3765 * We need to free up any pl. 3766 */ 3767 REPL_STAT_INCR(nnopage); 3768 page_free_replacement_page(pl); 3769 return (NULL); 3770 } else { 3771 return (pl); 3772 } 3773 } 3774 3775 /* 3776 * demote a free large page to it's constituent pages 3777 */ 3778 void 3779 page_demote_free_pages(page_t *pp) 3780 { 3781 3782 int mnode; 3783 3784 ASSERT(pp != NULL); 3785 ASSERT(PAGE_LOCKED(pp)); 3786 ASSERT(PP_ISFREE(pp)); 3787 ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes); 3788 3789 mnode = PP_2_MEM_NODE(pp); 3790 page_freelist_lock(mnode); 3791 if (pp->p_szc != 0) { 3792 (void) page_demote(mnode, PFN_BASE(pp->p_pagenum, 3793 pp->p_szc), pp->p_szc, 0, PC_NO_COLOR, PC_FREE); 3794 } 3795 page_freelist_unlock(mnode); 3796 ASSERT(pp->p_szc == 0); 3797 } 3798
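/*
 * Illustrative sketch only (hypothetical caller, not part of this file):
 * a thread that already holds a free large page exclusively locked can
 * break it back down to PAGESIZE pages before handing out constituents.
 */
#if 0
	if (PP_ISFREE(pp) && pp->p_szc != 0) {
		page_demote_free_pages(pp);	/* leaves pp->p_szc == 0 */
		ASSERT(pp->p_szc == 0);
	}
#endif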