1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 27 /* All Rights Reserved */ 28 29 /* 30 * Portions of this source code were derived from Berkeley 4.3 BSD 31 * under license from the Regents of the University of California. 32 */ 33 34 #pragma ident "%Z%%M% %I% %E% SMI" 35 36 /* 37 * This file contains common functions to access and manage the page lists. 38 * Many of these routines originated from platform dependent modules 39 * (sun4/vm/vm_dep.c, i86pc/vm/vm_machdep.c) and modified to function in 40 * a platform independent manner. 41 * 42 * vm/vm_dep.h provides for platform specific support. 43 */ 44 45 #include <sys/types.h> 46 #include <sys/debug.h> 47 #include <sys/cmn_err.h> 48 #include <sys/systm.h> 49 #include <sys/atomic.h> 50 #include <sys/sysmacros.h> 51 #include <vm/as.h> 52 #include <vm/page.h> 53 #include <vm/seg_kmem.h> 54 #include <vm/seg_vn.h> 55 #include <sys/memnode.h> 56 #include <vm/vm_dep.h> 57 #include <sys/lgrp.h> 58 #include <sys/mem_config.h> 59 #include <sys/callb.h> 60 #include <sys/mem_cage.h> 61 #include <sys/sdt.h> 62 63 extern uint_t vac_colors; 64 65 #define MAX_PRAGMA_ALIGN 128 66 67 /* vm_cpu_data0 for the boot cpu before kmem is initialized */ 68 69 #if L2CACHE_ALIGN_MAX <= MAX_PRAGMA_ALIGN 70 #pragma align L2CACHE_ALIGN_MAX(vm_cpu_data0) 71 #else 72 #pragma align MAX_PRAGMA_ALIGN(vm_cpu_data0) 73 #endif 74 char vm_cpu_data0[VM_CPU_DATA_PADSIZE]; 75 76 /* 77 * number of page colors equivalent to reqested color in page_get routines. 78 * If set, keeps large pages intact longer and keeps MPO allocation 79 * from the local mnode in favor of acquiring the 'correct' page color from 80 * a demoted large page or from a remote mnode. 81 */ 82 int colorequiv; 83 84 /* 85 * if set, specifies the percentage of large pages that are free from within 86 * a large page region before attempting to lock those pages for 87 * page_get_contig_pages processing. 88 * 89 * Should be turned on when kpr is available when page_trylock_contig_pages 90 * can be more selective. 91 */ 92 93 int ptcpthreshold; 94 95 /* 96 * Limit page get contig page search based on failure cnts in pgcpfailcnt[]. 97 * Enabled by default via pgcplimitsearch. 98 * 99 * pgcpfailcnt[] is bounded by PGCPFAILMAX (>= 1/2 of installed 100 * memory). When reached, pgcpfailcnt[] is reset to 1/2 of this upper 101 * bound. 
This upper bound range guarantees:
102 * - all large page 'slots' will be searched over time
103 * - a minimum of one large page candidate is considered on each pgcp call
104 * - count doesn't wrap around to 0
105 */
106 pgcnt_t pgcpfailcnt[MMU_PAGE_SIZES];
107 int pgcplimitsearch = 1;
108
109 #define PGCPFAILMAX (1 << (highbit(physinstalled) - 1))
110 #define SETPGCPFAILCNT(szc) \
111 if (++pgcpfailcnt[szc] >= PGCPFAILMAX) \
112 pgcpfailcnt[szc] = PGCPFAILMAX / 2;
113
114 #ifdef VM_STATS
115 struct vmm_vmstats_str vmm_vmstats;
116
117 #endif /* VM_STATS */
118
119 #if defined(__sparc)
120 #define LPGCREATE 0
121 #else
122 /* enable page_get_contig_pages */
123 #define LPGCREATE 1
124 #endif
125
126 int pg_contig_disable;
127 int pg_lpgcreate_nocage = LPGCREATE;
128
129 /*
130 * page_freelist_fill pfn flag to signify no hi pfn requirement.
131 */
132 #define PFNNULL 0
133
134 /* Flags involved in promotion and demotion routines */
135 #define PC_FREE 0x1 /* put page on freelist */
136 #define PC_ALLOC 0x2 /* return page for allocation */
137
138 /*
139 * Flag for page_demote to be used with PC_FREE to denote that we don't care
140 * what the color is as the color parameter to the function is ignored.
141 */
142 #define PC_NO_COLOR (-1)
143
144 /*
145 * page counters candidates info
146 * See page_ctrs_cands comment below for more details.
147 * fields are as follows:
148 * pcc_pages_free: # pages which freelist coalesce can create
149 * pcc_color_free_len: number of elements in pcc_color_free array
150 * pcc_color_free: pointer to page free counts per color
151 */
152 typedef struct pcc_info {
153 pgcnt_t pcc_pages_free;
154 int pcc_color_free_len;
155 pgcnt_t *pcc_color_free;
156 } pcc_info_t;
157
158 /*
159 * On big machines it can take a long time to check page_counters
160 * arrays. page_ctrs_cands is a summary array whose elements are a dynamically
161 * updated sum of all elements of the corresponding page_counters arrays.
162 * page_freelist_coalesce() searches page_counters only if an appropriate
163 * element of page_ctrs_cands array is greater than 0.
164 *
165 * An extra dimension is used for page_ctrs_cands to spread the elements
166 * over a few e$ cache lines to avoid serialization during the array
167 * updates.
168 */
169 #pragma align 64(page_ctrs_cands)
170
171 static pcc_info_t *page_ctrs_cands[NPC_MUTEX][MMU_PAGE_SIZES];
172
173 /*
174 * Return in val the total number of free pages which can be created
175 * for the given mnode (m) and region size (r)
176 */
177 #define PGCTRS_CANDS_GETVALUE(m, r, val) { \
178 int i; \
179 val = 0; \
180 for (i = 0; i < NPC_MUTEX; i++) { \
181 val += page_ctrs_cands[i][(r)][(m)].pcc_pages_free; \
182 } \
183 }
184
185 /*
186 * Return in val the total number of free pages which can be created
187 * for the given mnode (m), region size (r), and color (c)
188 */
189 #define PGCTRS_CANDS_GETVALUECOLOR(m, r, c, val) { \
190 int i; \
191 val = 0; \
192 ASSERT((c) < page_ctrs_cands[0][(r)][(m)].pcc_color_free_len); \
193 for (i = 0; i < NPC_MUTEX; i++) { \
194 val += page_ctrs_cands[i][(r)][(m)].pcc_color_free[(c)]; \
195 } \
196 }
197
198 /*
199 * We can only allow a single thread to update a counter within the physical
200 * range of the largest supported page size. That is the finest granularity
201 * possible since the counter values are dependent on each other
202 * as you move across region sizes. PP_CTR_LOCK_INDX is used to determine the
203 * ctr_mutex lock index for a particular physical range.
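 *
 * Illustrative sketch (hypothetical numbers, not from any particular
 * platform): if the largest supported page size spans 2^9 base pages and
 * NPC_MUTEX is 4, then pfns 0-511 hash to ctr_mutex index 0, pfns 512-1023
 * to index 1, and so on, wrapping modulo NPC_MUTEX.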
204 */ 205 static kmutex_t *ctr_mutex[NPC_MUTEX]; 206 207 #define PP_CTR_LOCK_INDX(pp) \ 208 (((pp)->p_pagenum >> \ 209 (PAGE_BSZS_SHIFT(mmu_page_sizes - 1))) & (NPC_MUTEX - 1)) 210 211 /* 212 * Local functions prototypes. 213 */ 214 215 void page_ctr_add(int, int, page_t *, int); 216 void page_ctr_add_internal(int, int, page_t *, int); 217 void page_ctr_sub(int, int, page_t *, int); 218 uint_t page_convert_color(uchar_t, uchar_t, uint_t); 219 void page_freelist_lock(int); 220 void page_freelist_unlock(int); 221 page_t *page_promote(int, pfn_t, uchar_t, int); 222 page_t *page_demote(int, pfn_t, uchar_t, uchar_t, int, int); 223 page_t *page_freelist_fill(uchar_t, int, int, int, pfn_t); 224 page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int); 225 static int page_trylock_cons(page_t *pp, se_t se); 226 227 #define PNUM_SIZE(szc) \ 228 (hw_page_array[(szc)].hp_size >> hw_page_array[0].hp_shift) 229 #define PNUM_SHIFT(szc) \ 230 (hw_page_array[(szc)].hp_shift - hw_page_array[0].hp_shift) 231 232 /* 233 * The page_counters array below is used to keep track of free contiguous 234 * physical memory. A hw_page_map_t will be allocated per mnode per szc. 235 * This contains an array of counters, the size of the array, a shift value 236 * used to convert a pagenum into a counter array index or vice versa, as 237 * well as a cache of the last successful index to be promoted to a larger 238 * page size. As an optimization, we keep track of the last successful index 239 * to be promoted per page color for the given size region, and this is 240 * allocated dynamically based upon the number of colors for a given 241 * region size. 242 * 243 * Conceptually, the page counters are represented as: 244 * 245 * page_counters[region_size][mnode] 246 * 247 * region_size: size code of a candidate larger page made up 248 * of contiguous free smaller pages. 249 * 250 * page_counters[region_size][mnode].hpm_counters[index]: 251 * represents how many (region_size - 1) pages either 252 * exist or can be created within the given index range. 253 * 254 * Let's look at a sparc example: 255 * If we want to create a free 512k page, we look at region_size 2 256 * for the mnode we want. We calculate the index and look at a specific 257 * hpm_counters location. If we see 8 (FULL_REGION_CNT on sparc) at 258 * this location, it means that 8 64k pages either exist or can be created 259 * from 8K pages in order to make a single free 512k page at the given 260 * index. Note that when a region is full, it will contribute to the 261 * counts in the region above it. Thus we will not know what page 262 * size the free pages will be which can be promoted to this new free 263 * page unless we look at all regions below the current region. 264 */ 265 266 /* 267 * Note: hpmctr_t is defined in platform vm_dep.h 268 * hw_page_map_t contains all the information needed for the page_counters 269 * logic. 
The fields are as follows: 270 * 271 * hpm_counters: dynamically allocated array to hold counter data 272 * hpm_entries: entries in hpm_counters 273 * hpm_shift: shift for pnum/array index conv 274 * hpm_base: PFN mapped to counter index 0 275 * hpm_color_current_len: # of elements in hpm_color_current "array" below 276 * hpm_color_current: last index in counter array for this color at 277 * which we successfully created a large page 278 */ 279 typedef struct hw_page_map { 280 hpmctr_t *hpm_counters; 281 size_t hpm_entries; 282 int hpm_shift; 283 pfn_t hpm_base; 284 size_t hpm_color_current_len; 285 size_t *hpm_color_current; 286 } hw_page_map_t; 287 288 /* 289 * Element zero is not used, but is allocated for convenience. 290 */ 291 static hw_page_map_t *page_counters[MMU_PAGE_SIZES]; 292 293 /* 294 * The following macros are convenient ways to get access to the individual 295 * elements of the page_counters arrays. They can be used on both 296 * the left side and right side of equations. 297 */ 298 #define PAGE_COUNTERS(mnode, rg_szc, idx) \ 299 (page_counters[(rg_szc)][(mnode)].hpm_counters[(idx)]) 300 301 #define PAGE_COUNTERS_COUNTERS(mnode, rg_szc) \ 302 (page_counters[(rg_szc)][(mnode)].hpm_counters) 303 304 #define PAGE_COUNTERS_SHIFT(mnode, rg_szc) \ 305 (page_counters[(rg_szc)][(mnode)].hpm_shift) 306 307 #define PAGE_COUNTERS_ENTRIES(mnode, rg_szc) \ 308 (page_counters[(rg_szc)][(mnode)].hpm_entries) 309 310 #define PAGE_COUNTERS_BASE(mnode, rg_szc) \ 311 (page_counters[(rg_szc)][(mnode)].hpm_base) 312 313 #define PAGE_COUNTERS_CURRENT_COLOR_LEN(mnode, rg_szc) \ 314 (page_counters[(rg_szc)][(mnode)].hpm_color_current_len) 315 316 #define PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, rg_szc) \ 317 (page_counters[(rg_szc)][(mnode)].hpm_color_current) 318 319 #define PAGE_COUNTERS_CURRENT_COLOR(mnode, rg_szc, color) \ 320 (page_counters[(rg_szc)][(mnode)].hpm_color_current[(color)]) 321 322 #define PNUM_TO_IDX(mnode, rg_szc, pnum) \ 323 (((pnum) - PAGE_COUNTERS_BASE((mnode), (rg_szc))) >> \ 324 PAGE_COUNTERS_SHIFT((mnode), (rg_szc))) 325 326 #define IDX_TO_PNUM(mnode, rg_szc, index) \ 327 (PAGE_COUNTERS_BASE((mnode), (rg_szc)) + \ 328 ((index) << PAGE_COUNTERS_SHIFT((mnode), (rg_szc)))) 329 330 /* 331 * Protects the hpm_counters and hpm_color_current memory from changing while 332 * looking at page counters information. 333 * Grab the write lock to modify what these fields point at. 334 * Grab the read lock to prevent any pointers from changing. 335 * The write lock can not be held during memory allocation due to a possible 336 * recursion deadlock with trying to grab the read lock while the 337 * write lock is already held. 338 */ 339 krwlock_t page_ctrs_rwlock[MAX_MEM_NODES]; 340 341 342 /* 343 * initialize cpu_vm_data to point at cache aligned vm_cpu_data_t. 344 */ 345 void 346 cpu_vm_data_init(struct cpu *cp) 347 { 348 if (cp == CPU0) { 349 cp->cpu_vm_data = (void *)&vm_cpu_data0; 350 } else { 351 void *kmptr; 352 int align; 353 size_t sz; 354 355 align = (L2CACHE_ALIGN) ? 
L2CACHE_ALIGN : L2CACHE_ALIGN_MAX; 356 sz = P2ROUNDUP(sizeof (vm_cpu_data_t), align) + align; 357 kmptr = kmem_zalloc(sz, KM_SLEEP); 358 cp->cpu_vm_data = (void *) P2ROUNDUP((uintptr_t)kmptr, align); 359 ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr = kmptr; 360 ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize = sz; 361 } 362 } 363 364 /* 365 * free cpu_vm_data 366 */ 367 void 368 cpu_vm_data_destroy(struct cpu *cp) 369 { 370 if (cp->cpu_seqid && cp->cpu_vm_data) { 371 ASSERT(cp != CPU0); 372 kmem_free(((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr, 373 ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize); 374 } 375 cp->cpu_vm_data = NULL; 376 } 377 378 379 /* 380 * page size to page size code 381 */ 382 int 383 page_szc(size_t pagesize) 384 { 385 int i = 0; 386 387 while (hw_page_array[i].hp_size) { 388 if (pagesize == hw_page_array[i].hp_size) 389 return (i); 390 i++; 391 } 392 return (-1); 393 } 394 395 /* 396 * page size to page size code with the restriction that it be a supported 397 * user page size. If it's not a supported user page size, -1 will be returned. 398 */ 399 int 400 page_szc_user_filtered(size_t pagesize) 401 { 402 int szc = page_szc(pagesize); 403 if ((szc != -1) && (SZC_2_USERSZC(szc) != -1)) { 404 return (szc); 405 } 406 return (-1); 407 } 408 409 /* 410 * Return how many page sizes are available for the user to use. This is 411 * what the hardware supports and not based upon how the OS implements the 412 * support of different page sizes. 413 */ 414 uint_t 415 page_num_user_pagesizes(void) 416 { 417 return (mmu_exported_page_sizes); 418 } 419 420 uint_t 421 page_num_pagesizes(void) 422 { 423 return (mmu_page_sizes); 424 } 425 426 /* 427 * returns the count of the number of base pagesize pages associated with szc 428 */ 429 pgcnt_t 430 page_get_pagecnt(uint_t szc) 431 { 432 if (szc >= mmu_page_sizes) 433 panic("page_get_pagecnt: out of range %d", szc); 434 return (hw_page_array[szc].hp_pgcnt); 435 } 436 437 size_t 438 page_get_pagesize(uint_t szc) 439 { 440 if (szc >= mmu_page_sizes) 441 panic("page_get_pagesize: out of range %d", szc); 442 return (hw_page_array[szc].hp_size); 443 } 444 445 /* 446 * Return the size of a page based upon the index passed in. An index of 447 * zero refers to the smallest page size in the system, and as index increases 448 * it refers to the next larger supported page size in the system. 449 * Note that szc and userszc may not be the same due to unsupported szc's on 450 * some systems. 451 */ 452 size_t 453 page_get_user_pagesize(uint_t userszc) 454 { 455 uint_t szc = USERSZC_2_SZC(userszc); 456 457 if (szc >= mmu_page_sizes) 458 panic("page_get_user_pagesize: out of range %d", szc); 459 return (hw_page_array[szc].hp_size); 460 } 461 462 uint_t 463 page_get_shift(uint_t szc) 464 { 465 if (szc >= mmu_page_sizes) 466 panic("page_get_shift: out of range %d", szc); 467 return (hw_page_array[szc].hp_shift); 468 } 469 470 uint_t 471 page_get_pagecolors(uint_t szc) 472 { 473 ASSERT(page_colors != 0); 474 return (MAX(page_colors >> PAGE_BSZS_SHIFT(szc), 1)); 475 } 476 477 /* 478 * Called by startup(). 479 * Size up the per page size free list counters based on physmax 480 * of each node and max_mem_nodes. 481 */ 482 size_t 483 page_ctrs_sz(void) 484 { 485 int r; /* region size */ 486 int mnode; 487 uint_t ctrs_sz = 0; 488 int i; 489 pgcnt_t colors_per_szc[MMU_PAGE_SIZES]; 490 491 /* 492 * We need to determine how many page colors there are for each 493 * page size in order to allocate memory for any color specific 494 * arrays. 
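 *
 * For illustration (hypothetical values): with page_colors == 32 and a
 * size code whose page shift is 3 bits larger than the base page shift,
 * page_convert_color(0, i, page_colors - 1) below yields 31 >> 3 == 3,
 * so colors_per_szc[] for that size code would be 4.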
495 */ 496 colors_per_szc[0] = page_colors; 497 for (i = 1; i < mmu_page_sizes; i++) { 498 colors_per_szc[i] = 499 page_convert_color(0, i, page_colors - 1) + 1; 500 } 501 502 for (mnode = 0; mnode < max_mem_nodes; mnode++) { 503 504 pgcnt_t r_pgcnt; 505 pfn_t r_base; 506 pgcnt_t r_align; 507 508 if (mem_node_config[mnode].exists == 0) 509 continue; 510 511 /* 512 * determine size needed for page counter arrays with 513 * base aligned to large page size. 514 */ 515 for (r = 1; r < mmu_page_sizes; r++) { 516 /* add in space for hpm_counters */ 517 r_align = page_get_pagecnt(r); 518 r_base = mem_node_config[mnode].physbase; 519 r_base &= ~(r_align - 1); 520 r_pgcnt = howmany(mem_node_config[mnode].physmax - 521 r_base + 1, r_align); 522 /* 523 * Round up to always allocate on pointer sized 524 * boundaries. 525 */ 526 ctrs_sz += P2ROUNDUP((r_pgcnt * sizeof (hpmctr_t)), 527 sizeof (hpmctr_t *)); 528 529 /* add in space for hpm_color_current */ 530 ctrs_sz += (colors_per_szc[r] * 531 sizeof (size_t)); 532 } 533 } 534 535 for (r = 1; r < mmu_page_sizes; r++) { 536 ctrs_sz += (max_mem_nodes * sizeof (hw_page_map_t)); 537 538 /* add in space for page_ctrs_cands */ 539 ctrs_sz += NPC_MUTEX * max_mem_nodes * (sizeof (pcc_info_t)); 540 ctrs_sz += NPC_MUTEX * max_mem_nodes * colors_per_szc[r] * 541 sizeof (pgcnt_t); 542 } 543 544 /* ctr_mutex */ 545 ctrs_sz += (max_mem_nodes * NPC_MUTEX * sizeof (kmutex_t)); 546 547 /* size for page list counts */ 548 PLCNT_SZ(ctrs_sz); 549 550 /* 551 * add some slop for roundups. page_ctrs_alloc will roundup the start 552 * address of the counters to ecache_alignsize boundary for every 553 * memory node. 554 */ 555 return (ctrs_sz + max_mem_nodes * L2CACHE_ALIGN); 556 } 557 558 caddr_t 559 page_ctrs_alloc(caddr_t alloc_base) 560 { 561 int mnode; 562 int r; /* region size */ 563 int i; 564 pgcnt_t colors_per_szc[MMU_PAGE_SIZES]; 565 566 /* 567 * We need to determine how many page colors there are for each 568 * page size in order to allocate memory for any color specific 569 * arrays. 
570 */ 571 colors_per_szc[0] = page_colors; 572 for (i = 1; i < mmu_page_sizes; i++) { 573 colors_per_szc[i] = 574 page_convert_color(0, i, page_colors - 1) + 1; 575 } 576 577 for (r = 1; r < mmu_page_sizes; r++) { 578 page_counters[r] = (hw_page_map_t *)alloc_base; 579 alloc_base += (max_mem_nodes * sizeof (hw_page_map_t)); 580 } 581 582 /* page_ctrs_cands */ 583 for (r = 1; r < mmu_page_sizes; r++) { 584 for (i = 0; i < NPC_MUTEX; i++) { 585 page_ctrs_cands[i][r] = (pcc_info_t *)alloc_base; 586 alloc_base += max_mem_nodes * (sizeof (pcc_info_t)); 587 588 } 589 } 590 591 /* page_ctrs_cands pcc_color_free array */ 592 for (r = 1; r < mmu_page_sizes; r++) { 593 for (i = 0; i < NPC_MUTEX; i++) { 594 for (mnode = 0; mnode < max_mem_nodes; mnode++) { 595 page_ctrs_cands[i][r][mnode].pcc_color_free_len 596 = colors_per_szc[r]; 597 page_ctrs_cands[i][r][mnode].pcc_color_free = 598 (pgcnt_t *)alloc_base; 599 alloc_base += colors_per_szc[r] * 600 sizeof (pgcnt_t); 601 } 602 } 603 } 604 605 /* ctr_mutex */ 606 for (i = 0; i < NPC_MUTEX; i++) { 607 ctr_mutex[i] = (kmutex_t *)alloc_base; 608 alloc_base += (max_mem_nodes * sizeof (kmutex_t)); 609 } 610 611 /* initialize page list counts */ 612 PLCNT_INIT(alloc_base); 613 614 for (mnode = 0; mnode < max_mem_nodes; mnode++) { 615 616 pgcnt_t r_pgcnt; 617 pfn_t r_base; 618 pgcnt_t r_align; 619 int r_shift; 620 621 if (mem_node_config[mnode].exists == 0) 622 continue; 623 624 for (r = 1; r < mmu_page_sizes; r++) { 625 /* 626 * the page_counters base has to be aligned to the 627 * page count of page size code r otherwise the counts 628 * will cross large page boundaries. 629 */ 630 r_align = page_get_pagecnt(r); 631 r_base = mem_node_config[mnode].physbase; 632 /* base needs to be aligned - lower to aligned value */ 633 r_base &= ~(r_align - 1); 634 r_pgcnt = howmany(mem_node_config[mnode].physmax - 635 r_base + 1, r_align); 636 r_shift = PAGE_BSZS_SHIFT(r); 637 638 PAGE_COUNTERS_SHIFT(mnode, r) = r_shift; 639 PAGE_COUNTERS_ENTRIES(mnode, r) = r_pgcnt; 640 PAGE_COUNTERS_BASE(mnode, r) = r_base; 641 PAGE_COUNTERS_CURRENT_COLOR_LEN(mnode, r) = 642 colors_per_szc[r]; 643 PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r) = 644 (size_t *)alloc_base; 645 alloc_base += (sizeof (size_t) * colors_per_szc[r]); 646 for (i = 0; i < colors_per_szc[r]; i++) { 647 PAGE_COUNTERS_CURRENT_COLOR(mnode, r, i) = i; 648 } 649 PAGE_COUNTERS_COUNTERS(mnode, r) = 650 (hpmctr_t *)alloc_base; 651 /* 652 * Round up to make alloc_base always be aligned on 653 * a pointer boundary. 654 */ 655 alloc_base += P2ROUNDUP((sizeof (hpmctr_t) * r_pgcnt), 656 sizeof (hpmctr_t *)); 657 658 /* 659 * Verify that PNUM_TO_IDX and IDX_TO_PNUM 660 * satisfy the identity requirement. 661 * We should be able to go from one to the other 662 * and get consistent values. 663 */ 664 ASSERT(PNUM_TO_IDX(mnode, r, 665 (IDX_TO_PNUM(mnode, r, 0))) == 0); 666 ASSERT(IDX_TO_PNUM(mnode, r, 667 (PNUM_TO_IDX(mnode, r, r_base))) == r_base); 668 } 669 /* 670 * Roundup the start address of the page_counters to 671 * cache aligned boundary for every memory node. 672 * page_ctrs_sz() has added some slop for these roundups. 673 */ 674 alloc_base = (caddr_t)P2ROUNDUP((uintptr_t)alloc_base, 675 L2CACHE_ALIGN); 676 } 677 678 /* Initialize other page counter specific data structures. */ 679 for (mnode = 0; mnode < MAX_MEM_NODES; mnode++) { 680 rw_init(&page_ctrs_rwlock[mnode], NULL, RW_DEFAULT, NULL); 681 } 682 683 return (alloc_base); 684 } 685 686 /* 687 * Functions to adjust region counters for each size free list. 
688 * Caller is responsible for acquiring the ctr_mutex lock if necessary and
689 * thus can be called during startup without locks.
690 */
691 /* ARGSUSED */
692 void
693 page_ctr_add_internal(int mnode, int mtype, page_t *pp, int flags)
694 {
695 ssize_t r; /* region size */
696 ssize_t idx;
697 pfn_t pfnum;
698 int lckidx;
699
700 ASSERT(mnode == PP_2_MEM_NODE(pp));
701 ASSERT(mtype == PP_2_MTYPE(pp));
702
703 ASSERT(pp->p_szc < mmu_page_sizes);
704
705 PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);
706
707 /* no counter update needed for largest page size */
708 if (pp->p_szc >= mmu_page_sizes - 1) {
709 return;
710 }
711
712 r = pp->p_szc + 1;
713 pfnum = pp->p_pagenum;
714 lckidx = PP_CTR_LOCK_INDX(pp);
715
716 /*
717 * Increment the count of free pages for the current
718 * region. Continue looping up in region size incrementing
719 * count if the preceding region is full.
720 */
721 while (r < mmu_page_sizes) {
722 idx = PNUM_TO_IDX(mnode, r, pfnum);
723
724 ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
725 ASSERT(PAGE_COUNTERS(mnode, r, idx) < FULL_REGION_CNT(r));
726
727 if (++PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r))
728 break;
729
730 page_ctrs_cands[lckidx][r][mnode].pcc_pages_free++;
731 page_ctrs_cands[lckidx][r][mnode].
732 pcc_color_free[PP_2_BIN_SZC(pp, r)]++;
733 r++;
734 }
735 }
736
737 void
738 page_ctr_add(int mnode, int mtype, page_t *pp, int flags)
739 {
740 int lckidx = PP_CTR_LOCK_INDX(pp);
741 kmutex_t *lock = &ctr_mutex[lckidx][mnode];
742
743 mutex_enter(lock);
744 page_ctr_add_internal(mnode, mtype, pp, flags);
745 mutex_exit(lock);
746 }
747
748 void
749 page_ctr_sub(int mnode, int mtype, page_t *pp, int flags)
750 {
751 int lckidx;
752 kmutex_t *lock;
753 ssize_t r; /* region size */
754 ssize_t idx;
755 pfn_t pfnum;
756
757 ASSERT(mnode == PP_2_MEM_NODE(pp));
758 ASSERT(mtype == PP_2_MTYPE(pp));
759
760 ASSERT(pp->p_szc < mmu_page_sizes);
761
762 PLCNT_DECR(pp, mnode, mtype, pp->p_szc, flags);
763
764 /* no counter update needed for largest page size */
765 if (pp->p_szc >= mmu_page_sizes - 1) {
766 return;
767 }
768
769 r = pp->p_szc + 1;
770 pfnum = pp->p_pagenum;
771 lckidx = PP_CTR_LOCK_INDX(pp);
772 lock = &ctr_mutex[lckidx][mnode];
773
774 /*
775 * Decrement the count of free pages for the current
776 * region. Continue looping up in region size decrementing
777 * count if the preceding region was full.
778 */
779 mutex_enter(lock);
780 while (r < mmu_page_sizes) {
781 idx = PNUM_TO_IDX(mnode, r, pfnum);
782
783 ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
784 ASSERT(PAGE_COUNTERS(mnode, r, idx) > 0);
785
786 if (--PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r) - 1) {
787 break;
788 }
789 ASSERT(page_ctrs_cands[lckidx][r][mnode].pcc_pages_free != 0);
790 ASSERT(page_ctrs_cands[lckidx][r][mnode].
791 pcc_color_free[PP_2_BIN_SZC(pp, r)] != 0);
792
793 page_ctrs_cands[lckidx][r][mnode].pcc_pages_free--;
794 page_ctrs_cands[lckidx][r][mnode].
795 pcc_color_free[PP_2_BIN_SZC(pp, r)]--;
796 r++;
797 }
798 mutex_exit(lock);
799 }
800
801 /*
802 * Adjust page counters following a memory attach, since typically the
803 * size of the array needs to change, and the PFN to counter index
804 * mapping needs to change.
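 *
 * For illustration (hypothetical numbers): if PC_BASE_ALIGN were 0x1000
 * pages and the node now spanned pfns 0x12345 through 0x23456, newbase
 * below would be 0x12000 and npgs 0x12000, so each per-size counter
 * array is re-sized and its hpm_base re-anchored to newbase.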
805 */ 806 uint_t 807 page_ctrs_adjust(int mnode) 808 { 809 pgcnt_t npgs; 810 int r; /* region size */ 811 int i; 812 size_t pcsz, old_csz; 813 hpmctr_t *new_ctr, *old_ctr; 814 pfn_t oldbase, newbase; 815 size_t old_npgs; 816 hpmctr_t *ctr_cache[MMU_PAGE_SIZES]; 817 size_t size_cache[MMU_PAGE_SIZES]; 818 size_t *color_cache[MMU_PAGE_SIZES]; 819 size_t *old_color_array; 820 pgcnt_t colors_per_szc[MMU_PAGE_SIZES]; 821 822 newbase = mem_node_config[mnode].physbase & ~PC_BASE_ALIGN_MASK; 823 npgs = roundup(mem_node_config[mnode].physmax, 824 PC_BASE_ALIGN) - newbase; 825 826 /* 827 * We need to determine how many page colors there are for each 828 * page size in order to allocate memory for any color specific 829 * arrays. 830 */ 831 colors_per_szc[0] = page_colors; 832 for (r = 1; r < mmu_page_sizes; r++) { 833 colors_per_szc[r] = 834 page_convert_color(0, r, page_colors - 1) + 1; 835 } 836 837 /* 838 * Preallocate all of the new hpm_counters arrays as we can't 839 * hold the page_ctrs_rwlock as a writer and allocate memory. 840 * If we can't allocate all of the arrays, undo our work so far 841 * and return failure. 842 */ 843 for (r = 1; r < mmu_page_sizes; r++) { 844 pcsz = npgs >> PAGE_BSZS_SHIFT(r); 845 846 ctr_cache[r] = kmem_zalloc(pcsz * 847 sizeof (hpmctr_t), KM_NOSLEEP); 848 if (ctr_cache[r] == NULL) { 849 while (--r >= 1) { 850 kmem_free(ctr_cache[r], 851 size_cache[r] * sizeof (hpmctr_t)); 852 } 853 return (ENOMEM); 854 } 855 size_cache[r] = pcsz; 856 } 857 /* 858 * Preallocate all of the new color current arrays as we can't 859 * hold the page_ctrs_rwlock as a writer and allocate memory. 860 * If we can't allocate all of the arrays, undo our work so far 861 * and return failure. 862 */ 863 for (r = 1; r < mmu_page_sizes; r++) { 864 color_cache[r] = kmem_zalloc(sizeof (size_t) * 865 colors_per_szc[r], KM_NOSLEEP); 866 if (color_cache[r] == NULL) { 867 while (--r >= 1) { 868 kmem_free(color_cache[r], 869 colors_per_szc[r] * sizeof (size_t)); 870 } 871 for (r = 1; r < mmu_page_sizes; r++) { 872 kmem_free(ctr_cache[r], 873 size_cache[r] * sizeof (hpmctr_t)); 874 } 875 return (ENOMEM); 876 } 877 } 878 879 /* 880 * Grab the write lock to prevent others from walking these arrays 881 * while we are modifying them. 882 */ 883 rw_enter(&page_ctrs_rwlock[mnode], RW_WRITER); 884 page_freelist_lock(mnode); 885 for (r = 1; r < mmu_page_sizes; r++) { 886 PAGE_COUNTERS_SHIFT(mnode, r) = PAGE_BSZS_SHIFT(r); 887 old_ctr = PAGE_COUNTERS_COUNTERS(mnode, r); 888 old_csz = PAGE_COUNTERS_ENTRIES(mnode, r); 889 oldbase = PAGE_COUNTERS_BASE(mnode, r); 890 old_npgs = old_csz << PAGE_COUNTERS_SHIFT(mnode, r); 891 old_color_array = PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r); 892 893 pcsz = npgs >> PAGE_COUNTERS_SHIFT(mnode, r); 894 new_ctr = ctr_cache[r]; 895 ctr_cache[r] = NULL; 896 if (old_ctr != NULL && 897 (oldbase + old_npgs > newbase) && 898 (newbase + npgs > oldbase)) { 899 /* 900 * Map the intersection of the old and new 901 * counters into the new array. 
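 *
 * Sketch of the two cases below: when newbase > oldbase the copy starts
 * at an offset within the old array; otherwise the old counters are
 * copied into the new array at an offset. Only the overlapping range is
 * preserved; entries outside it stay zero from the kmem_zalloc() above.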
902 */ 903 size_t offset; 904 if (newbase > oldbase) { 905 offset = (newbase - oldbase) >> 906 PAGE_COUNTERS_SHIFT(mnode, r); 907 bcopy(old_ctr + offset, new_ctr, 908 MIN(pcsz, (old_csz - offset)) * 909 sizeof (hpmctr_t)); 910 } else { 911 offset = (oldbase - newbase) >> 912 PAGE_COUNTERS_SHIFT(mnode, r); 913 bcopy(old_ctr, new_ctr + offset, 914 MIN(pcsz - offset, old_csz) * 915 sizeof (hpmctr_t)); 916 } 917 } 918 919 PAGE_COUNTERS_COUNTERS(mnode, r) = new_ctr; 920 PAGE_COUNTERS_ENTRIES(mnode, r) = pcsz; 921 PAGE_COUNTERS_BASE(mnode, r) = newbase; 922 PAGE_COUNTERS_CURRENT_COLOR_LEN(mnode, r) = colors_per_szc[r]; 923 PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r) = color_cache[r]; 924 color_cache[r] = NULL; 925 /* 926 * for now, just reset on these events as it's probably 927 * not worthwhile to try and optimize this. 928 */ 929 for (i = 0; i < colors_per_szc[r]; i++) { 930 PAGE_COUNTERS_CURRENT_COLOR(mnode, r, i) = i; 931 } 932 933 /* cache info for freeing out of the critical path */ 934 if ((caddr_t)old_ctr >= kernelheap && 935 (caddr_t)old_ctr < ekernelheap) { 936 ctr_cache[r] = old_ctr; 937 size_cache[r] = old_csz; 938 } 939 if ((caddr_t)old_color_array >= kernelheap && 940 (caddr_t)old_color_array < ekernelheap) { 941 color_cache[r] = old_color_array; 942 } 943 /* 944 * Verify that PNUM_TO_IDX and IDX_TO_PNUM 945 * satisfy the identity requirement. 946 * We should be able to go from one to the other 947 * and get consistent values. 948 */ 949 ASSERT(PNUM_TO_IDX(mnode, r, 950 (IDX_TO_PNUM(mnode, r, 0))) == 0); 951 ASSERT(IDX_TO_PNUM(mnode, r, 952 (PNUM_TO_IDX(mnode, r, newbase))) == newbase); 953 } 954 page_freelist_unlock(mnode); 955 rw_exit(&page_ctrs_rwlock[mnode]); 956 957 /* 958 * Now that we have dropped the write lock, it is safe to free all 959 * of the memory we have cached above. 960 */ 961 for (r = 1; r < mmu_page_sizes; r++) { 962 if (ctr_cache[r] != NULL) { 963 kmem_free(ctr_cache[r], 964 size_cache[r] * sizeof (hpmctr_t)); 965 } 966 if (color_cache[r] != NULL) { 967 kmem_free(color_cache[r], 968 colors_per_szc[r] * sizeof (size_t)); 969 } 970 } 971 return (0); 972 } 973 974 /* 975 * color contains a valid color index or bin for cur_szc 976 */ 977 uint_t 978 page_convert_color(uchar_t cur_szc, uchar_t new_szc, uint_t color) 979 { 980 uint_t shift; 981 982 if (cur_szc > new_szc) { 983 shift = page_get_shift(cur_szc) - page_get_shift(new_szc); 984 return (color << shift); 985 } else if (cur_szc < new_szc) { 986 shift = page_get_shift(new_szc) - page_get_shift(cur_szc); 987 return (color >> shift); 988 } 989 return (color); 990 } 991 992 #ifdef DEBUG 993 994 /* 995 * confirm pp is a large page corresponding to szc 996 */ 997 void 998 chk_lpg(page_t *pp, uchar_t szc) 999 { 1000 spgcnt_t npgs = page_get_pagecnt(pp->p_szc); 1001 uint_t noreloc; 1002 1003 if (npgs == 1) { 1004 ASSERT(pp->p_szc == 0); 1005 ASSERT(pp->p_next == pp); 1006 ASSERT(pp->p_prev == pp); 1007 return; 1008 } 1009 1010 ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL); 1011 ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL); 1012 1013 ASSERT(IS_P2ALIGNED(pp->p_pagenum, npgs)); 1014 ASSERT(pp->p_pagenum == (pp->p_next->p_pagenum - 1)); 1015 ASSERT(pp->p_prev->p_pagenum == (pp->p_pagenum + (npgs - 1))); 1016 ASSERT(pp->p_prev == (pp + (npgs - 1))); 1017 1018 /* 1019 * Check list of pages. 
1020 */ 1021 noreloc = PP_ISNORELOC(pp); 1022 while (npgs--) { 1023 if (npgs != 0) { 1024 ASSERT(pp->p_pagenum == pp->p_next->p_pagenum - 1); 1025 ASSERT(pp->p_next == (pp + 1)); 1026 } 1027 ASSERT(pp->p_szc == szc); 1028 ASSERT(PP_ISFREE(pp)); 1029 ASSERT(PP_ISAGED(pp)); 1030 ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL); 1031 ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL); 1032 ASSERT(pp->p_vnode == NULL); 1033 ASSERT(PP_ISNORELOC(pp) == noreloc); 1034 1035 pp = pp->p_next; 1036 } 1037 } 1038 #endif /* DEBUG */ 1039 1040 void 1041 page_freelist_lock(int mnode) 1042 { 1043 int i; 1044 for (i = 0; i < NPC_MUTEX; i++) { 1045 mutex_enter(FPC_MUTEX(mnode, i)); 1046 mutex_enter(CPC_MUTEX(mnode, i)); 1047 } 1048 } 1049 1050 void 1051 page_freelist_unlock(int mnode) 1052 { 1053 int i; 1054 for (i = 0; i < NPC_MUTEX; i++) { 1055 mutex_exit(FPC_MUTEX(mnode, i)); 1056 mutex_exit(CPC_MUTEX(mnode, i)); 1057 } 1058 } 1059 1060 /* 1061 * add pp to the specified page list. Defaults to head of the page list 1062 * unless PG_LIST_TAIL is specified. 1063 */ 1064 void 1065 page_list_add(page_t *pp, int flags) 1066 { 1067 page_t **ppp; 1068 kmutex_t *pcm; 1069 uint_t bin, mtype; 1070 int mnode; 1071 1072 ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT)); 1073 ASSERT(PP_ISFREE(pp)); 1074 ASSERT(!hat_page_is_mapped(pp)); 1075 ASSERT(hat_page_getshare(pp) == 0); 1076 1077 /* 1078 * Large pages should be freed via page_list_add_pages(). 1079 */ 1080 ASSERT(pp->p_szc == 0); 1081 1082 /* 1083 * Don't need to lock the freelist first here 1084 * because the page isn't on the freelist yet. 1085 * This means p_szc can't change on us. 1086 */ 1087 1088 bin = PP_2_BIN(pp); 1089 mnode = PP_2_MEM_NODE(pp); 1090 mtype = PP_2_MTYPE(pp); 1091 1092 if (flags & PG_LIST_ISINIT) { 1093 /* 1094 * PG_LIST_ISINIT is set during system startup (ie. single 1095 * threaded), add a page to the free list and add to the 1096 * the free region counters w/o any locking 1097 */ 1098 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype); 1099 1100 /* inline version of page_add() */ 1101 if (*ppp != NULL) { 1102 pp->p_next = *ppp; 1103 pp->p_prev = (*ppp)->p_prev; 1104 (*ppp)->p_prev = pp; 1105 pp->p_prev->p_next = pp; 1106 } else 1107 *ppp = pp; 1108 1109 page_ctr_add_internal(mnode, mtype, pp, flags); 1110 VM_STAT_ADD(vmm_vmstats.pladd_free[0]); 1111 } else { 1112 pcm = PC_BIN_MUTEX(mnode, bin, flags); 1113 1114 if (flags & PG_FREE_LIST) { 1115 VM_STAT_ADD(vmm_vmstats.pladd_free[0]); 1116 ASSERT(PP_ISAGED(pp)); 1117 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype); 1118 1119 } else { 1120 VM_STAT_ADD(vmm_vmstats.pladd_cache); 1121 ASSERT(pp->p_vnode); 1122 ASSERT((pp->p_offset & PAGEOFFSET) == 0); 1123 ppp = &PAGE_CACHELISTS(mnode, bin, mtype); 1124 } 1125 mutex_enter(pcm); 1126 page_add(ppp, pp); 1127 1128 if (flags & PG_LIST_TAIL) 1129 *ppp = (*ppp)->p_next; 1130 /* 1131 * Add counters before releasing pcm mutex to avoid a race with 1132 * page_freelist_coalesce and page_freelist_fill. 1133 */ 1134 page_ctr_add(mnode, mtype, pp, flags); 1135 mutex_exit(pcm); 1136 } 1137 1138 1139 #if defined(__sparc) 1140 if (PP_ISNORELOC(pp)) { 1141 kcage_freemem_add(1); 1142 } 1143 #endif 1144 /* 1145 * It is up to the caller to unlock the page! 1146 */ 1147 ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT)); 1148 } 1149 1150 1151 #ifdef __sparc 1152 /* 1153 * This routine is only used by kcage_init during system startup. 
1154 * It performs the function of page_list_sub/PP_SETNORELOC/page_list_add 1155 * without the overhead of taking locks and updating counters. 1156 */ 1157 void 1158 page_list_noreloc_startup(page_t *pp) 1159 { 1160 page_t **ppp; 1161 uint_t bin; 1162 int mnode; 1163 int mtype; 1164 int flags = 0; 1165 1166 /* 1167 * If this is a large page on the freelist then 1168 * break it up into smaller pages. 1169 */ 1170 if (pp->p_szc != 0) 1171 page_boot_demote(pp); 1172 1173 /* 1174 * Get list page is currently on. 1175 */ 1176 bin = PP_2_BIN(pp); 1177 mnode = PP_2_MEM_NODE(pp); 1178 mtype = PP_2_MTYPE(pp); 1179 ASSERT(mtype == MTYPE_RELOC); 1180 ASSERT(pp->p_szc == 0); 1181 1182 if (PP_ISAGED(pp)) { 1183 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype); 1184 flags |= PG_FREE_LIST; 1185 } else { 1186 ppp = &PAGE_CACHELISTS(mnode, bin, mtype); 1187 flags |= PG_CACHE_LIST; 1188 } 1189 1190 ASSERT(*ppp != NULL); 1191 1192 /* 1193 * Delete page from current list. 1194 */ 1195 if (*ppp == pp) 1196 *ppp = pp->p_next; /* go to next page */ 1197 if (*ppp == pp) { 1198 *ppp = NULL; /* page list is gone */ 1199 } else { 1200 pp->p_prev->p_next = pp->p_next; 1201 pp->p_next->p_prev = pp->p_prev; 1202 } 1203 1204 /* LINTED */ 1205 PLCNT_DECR(pp, mnode, mtype, 0, flags); 1206 1207 /* 1208 * Set no reloc for cage initted pages. 1209 */ 1210 PP_SETNORELOC(pp); 1211 1212 mtype = PP_2_MTYPE(pp); 1213 ASSERT(mtype == MTYPE_NORELOC); 1214 1215 /* 1216 * Get new list for page. 1217 */ 1218 if (PP_ISAGED(pp)) { 1219 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype); 1220 } else { 1221 ppp = &PAGE_CACHELISTS(mnode, bin, mtype); 1222 } 1223 1224 /* 1225 * Insert page on new list. 1226 */ 1227 if (*ppp == NULL) { 1228 *ppp = pp; 1229 pp->p_next = pp->p_prev = pp; 1230 } else { 1231 pp->p_next = *ppp; 1232 pp->p_prev = (*ppp)->p_prev; 1233 (*ppp)->p_prev = pp; 1234 pp->p_prev->p_next = pp; 1235 } 1236 1237 /* LINTED */ 1238 PLCNT_INCR(pp, mnode, mtype, 0, flags); 1239 1240 /* 1241 * Update cage freemem counter 1242 */ 1243 atomic_add_long(&kcage_freemem, 1); 1244 } 1245 #else /* __sparc */ 1246 1247 /* ARGSUSED */ 1248 void 1249 page_list_noreloc_startup(page_t *pp) 1250 { 1251 panic("page_list_noreloc_startup: should be here only for sparc"); 1252 } 1253 #endif 1254 1255 void 1256 page_list_add_pages(page_t *pp, int flags) 1257 { 1258 kmutex_t *pcm; 1259 pgcnt_t pgcnt; 1260 uint_t bin, mtype, i; 1261 int mnode; 1262 1263 /* default to freelist/head */ 1264 ASSERT((flags & (PG_CACHE_LIST | PG_LIST_TAIL)) == 0); 1265 1266 CHK_LPG(pp, pp->p_szc); 1267 VM_STAT_ADD(vmm_vmstats.pladd_free[pp->p_szc]); 1268 1269 bin = PP_2_BIN(pp); 1270 mnode = PP_2_MEM_NODE(pp); 1271 mtype = PP_2_MTYPE(pp); 1272 1273 if (flags & PG_LIST_ISINIT) { 1274 ASSERT(pp->p_szc == mmu_page_sizes - 1); 1275 page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp); 1276 ASSERT(!PP_ISNORELOC(pp)); 1277 PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags); 1278 } else { 1279 1280 ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes); 1281 1282 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST); 1283 1284 mutex_enter(pcm); 1285 page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp); 1286 page_ctr_add(mnode, mtype, pp, PG_FREE_LIST); 1287 mutex_exit(pcm); 1288 1289 pgcnt = page_get_pagecnt(pp->p_szc); 1290 #if defined(__sparc) 1291 if (PP_ISNORELOC(pp)) 1292 kcage_freemem_add(pgcnt); 1293 #endif 1294 for (i = 0; i < pgcnt; i++, pp++) 1295 page_unlock_noretire(pp); 1296 } 1297 } 1298 1299 /* 1300 * During boot, need to demote a large page to base 1301 * pagesize pages 
for seg_kmem for use in boot_alloc() 1302 */ 1303 void 1304 page_boot_demote(page_t *pp) 1305 { 1306 ASSERT(pp->p_szc != 0); 1307 ASSERT(PP_ISFREE(pp)); 1308 ASSERT(PP_ISAGED(pp)); 1309 1310 (void) page_demote(PP_2_MEM_NODE(pp), 1311 PFN_BASE(pp->p_pagenum, pp->p_szc), pp->p_szc, 0, PC_NO_COLOR, 1312 PC_FREE); 1313 1314 ASSERT(PP_ISFREE(pp)); 1315 ASSERT(PP_ISAGED(pp)); 1316 ASSERT(pp->p_szc == 0); 1317 } 1318 1319 /* 1320 * Take a particular page off of whatever freelist the page 1321 * is claimed to be on. 1322 * 1323 * NOTE: Only used for PAGESIZE pages. 1324 */ 1325 void 1326 page_list_sub(page_t *pp, int flags) 1327 { 1328 int bin; 1329 uint_t mtype; 1330 int mnode; 1331 kmutex_t *pcm; 1332 page_t **ppp; 1333 1334 ASSERT(PAGE_EXCL(pp)); 1335 ASSERT(PP_ISFREE(pp)); 1336 1337 /* 1338 * The p_szc field can only be changed by page_promote() 1339 * and page_demote(). Only free pages can be promoted and 1340 * demoted and the free list MUST be locked during these 1341 * operations. So to prevent a race in page_list_sub() 1342 * between computing which bin of the freelist lock to 1343 * grab and actually grabing the lock we check again that 1344 * the bin we locked is still the correct one. Notice that 1345 * the p_szc field could have actually changed on us but 1346 * if the bin happens to still be the same we are safe. 1347 */ 1348 try_again: 1349 bin = PP_2_BIN(pp); 1350 mnode = PP_2_MEM_NODE(pp); 1351 pcm = PC_BIN_MUTEX(mnode, bin, flags); 1352 mutex_enter(pcm); 1353 if (PP_2_BIN(pp) != bin) { 1354 mutex_exit(pcm); 1355 goto try_again; 1356 } 1357 mtype = PP_2_MTYPE(pp); 1358 1359 if (flags & PG_FREE_LIST) { 1360 VM_STAT_ADD(vmm_vmstats.plsub_free[0]); 1361 ASSERT(PP_ISAGED(pp)); 1362 ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype); 1363 } else { 1364 VM_STAT_ADD(vmm_vmstats.plsub_cache); 1365 ASSERT(!PP_ISAGED(pp)); 1366 ppp = &PAGE_CACHELISTS(mnode, bin, mtype); 1367 } 1368 1369 /* 1370 * Common PAGESIZE case. 1371 * 1372 * Note that we locked the freelist. This prevents 1373 * any page promotion/demotion operations. Therefore 1374 * the p_szc will not change until we drop pcm mutex. 1375 */ 1376 if (pp->p_szc == 0) { 1377 page_sub(ppp, pp); 1378 /* 1379 * Subtract counters before releasing pcm mutex 1380 * to avoid race with page_freelist_coalesce. 1381 */ 1382 page_ctr_sub(mnode, mtype, pp, flags); 1383 mutex_exit(pcm); 1384 1385 #if defined(__sparc) 1386 if (PP_ISNORELOC(pp)) { 1387 kcage_freemem_sub(1); 1388 } 1389 #endif 1390 return; 1391 } 1392 1393 /* 1394 * Large pages on the cache list are not supported. 1395 */ 1396 if (flags & PG_CACHE_LIST) 1397 panic("page_list_sub: large page on cachelist"); 1398 1399 /* 1400 * Slow but rare. 1401 * 1402 * Somebody wants this particular page which is part 1403 * of a large page. In this case we just demote the page 1404 * if it's on the freelist. 1405 * 1406 * We have to drop pcm before locking the entire freelist. 1407 * Once we have re-locked the freelist check to make sure 1408 * the page hasn't already been demoted or completely 1409 * freed. 1410 */ 1411 mutex_exit(pcm); 1412 page_freelist_lock(mnode); 1413 if (pp->p_szc != 0) { 1414 /* 1415 * Large page is on freelist. 1416 */ 1417 (void) page_demote(mnode, PFN_BASE(pp->p_pagenum, pp->p_szc), 1418 pp->p_szc, 0, PC_NO_COLOR, PC_FREE); 1419 } 1420 ASSERT(PP_ISFREE(pp)); 1421 ASSERT(PP_ISAGED(pp)); 1422 ASSERT(pp->p_szc == 0); 1423 1424 /* 1425 * Subtract counters before releasing pcm mutex 1426 * to avoid race with page_freelist_coalesce. 
1427 */ 1428 bin = PP_2_BIN(pp); 1429 mtype = PP_2_MTYPE(pp); 1430 ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype); 1431 1432 page_sub(ppp, pp); 1433 page_ctr_sub(mnode, mtype, pp, flags); 1434 page_freelist_unlock(mnode); 1435 1436 #if defined(__sparc) 1437 if (PP_ISNORELOC(pp)) { 1438 kcage_freemem_sub(1); 1439 } 1440 #endif 1441 } 1442 1443 void 1444 page_list_sub_pages(page_t *pp, uint_t szc) 1445 { 1446 kmutex_t *pcm; 1447 uint_t bin, mtype; 1448 int mnode; 1449 1450 ASSERT(PAGE_EXCL(pp)); 1451 ASSERT(PP_ISFREE(pp)); 1452 ASSERT(PP_ISAGED(pp)); 1453 1454 /* 1455 * See comment in page_list_sub(). 1456 */ 1457 try_again: 1458 bin = PP_2_BIN(pp); 1459 mnode = PP_2_MEM_NODE(pp); 1460 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST); 1461 mutex_enter(pcm); 1462 if (PP_2_BIN(pp) != bin) { 1463 mutex_exit(pcm); 1464 goto try_again; 1465 } 1466 1467 /* 1468 * If we're called with a page larger than szc or it got 1469 * promoted above szc before we locked the freelist then 1470 * drop pcm and re-lock entire freelist. If page still larger 1471 * than szc then demote it. 1472 */ 1473 if (pp->p_szc > szc) { 1474 mutex_exit(pcm); 1475 pcm = NULL; 1476 page_freelist_lock(mnode); 1477 if (pp->p_szc > szc) { 1478 VM_STAT_ADD(vmm_vmstats.plsubpages_szcbig); 1479 (void) page_demote(mnode, 1480 PFN_BASE(pp->p_pagenum, pp->p_szc), 1481 pp->p_szc, szc, PC_NO_COLOR, PC_FREE); 1482 } 1483 bin = PP_2_BIN(pp); 1484 } 1485 ASSERT(PP_ISFREE(pp)); 1486 ASSERT(PP_ISAGED(pp)); 1487 ASSERT(pp->p_szc <= szc); 1488 ASSERT(pp == PP_PAGEROOT(pp)); 1489 1490 VM_STAT_ADD(vmm_vmstats.plsub_free[pp->p_szc]); 1491 1492 mtype = PP_2_MTYPE(pp); 1493 if (pp->p_szc != 0) { 1494 page_vpsub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp); 1495 CHK_LPG(pp, pp->p_szc); 1496 } else { 1497 VM_STAT_ADD(vmm_vmstats.plsubpages_szc0); 1498 page_sub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp); 1499 } 1500 page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST); 1501 1502 if (pcm != NULL) { 1503 mutex_exit(pcm); 1504 } else { 1505 page_freelist_unlock(mnode); 1506 } 1507 1508 #if defined(__sparc) 1509 if (PP_ISNORELOC(pp)) { 1510 pgcnt_t pgcnt; 1511 1512 pgcnt = page_get_pagecnt(pp->p_szc); 1513 kcage_freemem_sub(pgcnt); 1514 } 1515 #endif 1516 } 1517 1518 /* 1519 * Add the page to the front of a linked list of pages 1520 * using the p_next & p_prev pointers for the list. 1521 * The caller is responsible for protecting the list pointers. 1522 */ 1523 void 1524 mach_page_add(page_t **ppp, page_t *pp) 1525 { 1526 if (*ppp == NULL) { 1527 pp->p_next = pp->p_prev = pp; 1528 } else { 1529 pp->p_next = *ppp; 1530 pp->p_prev = (*ppp)->p_prev; 1531 (*ppp)->p_prev = pp; 1532 pp->p_prev->p_next = pp; 1533 } 1534 *ppp = pp; 1535 } 1536 1537 /* 1538 * Remove this page from a linked list of pages 1539 * using the p_next & p_prev pointers for the list. 1540 * 1541 * The caller is responsible for protecting the list pointers. 1542 */ 1543 void 1544 mach_page_sub(page_t **ppp, page_t *pp) 1545 { 1546 ASSERT(PP_ISFREE(pp)); 1547 1548 if (*ppp == NULL || pp == NULL) 1549 panic("mach_page_sub"); 1550 1551 if (*ppp == pp) 1552 *ppp = pp->p_next; /* go to next page */ 1553 1554 if (*ppp == pp) 1555 *ppp = NULL; /* page list is gone */ 1556 else { 1557 pp->p_prev->p_next = pp->p_next; 1558 pp->p_next->p_prev = pp->p_prev; 1559 } 1560 pp->p_prev = pp->p_next = pp; /* make pp a list of one */ 1561 } 1562 1563 /* 1564 * Routine fsflush uses to gradually coalesce the free list into larger pages. 
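 *
 * For example, using the sparc numbers from the page_counters comment
 * above: a promotion from 64K to 512K only happens once the counter for
 * the enclosing 512K region has reached 8, i.e. FULL_REGION_CNT for that
 * region size.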
1565 */ 1566 void 1567 page_promote_size(page_t *pp, uint_t cur_szc) 1568 { 1569 pfn_t pfn; 1570 int mnode; 1571 int idx; 1572 int new_szc = cur_szc + 1; 1573 int full = FULL_REGION_CNT(new_szc); 1574 1575 pfn = page_pptonum(pp); 1576 mnode = PFN_2_MEM_NODE(pfn); 1577 1578 page_freelist_lock(mnode); 1579 1580 idx = PNUM_TO_IDX(mnode, new_szc, pfn); 1581 if (PAGE_COUNTERS(mnode, new_szc, idx) == full) 1582 (void) page_promote(mnode, pfn, new_szc, PC_FREE); 1583 1584 page_freelist_unlock(mnode); 1585 } 1586 1587 static uint_t page_promote_err; 1588 static uint_t page_promote_noreloc_err; 1589 1590 /* 1591 * Create a single larger page (of szc new_szc) from smaller contiguous pages 1592 * for the given mnode starting at pfnum. Pages involved are on the freelist 1593 * before the call and may be returned to the caller if requested, otherwise 1594 * they will be placed back on the freelist. 1595 * If flags is PC_ALLOC, then the large page will be returned to the user in 1596 * a state which is consistent with a page being taken off the freelist. If 1597 * we failed to lock the new large page, then we will return NULL to the 1598 * caller and put the large page on the freelist instead. 1599 * If flags is PC_FREE, then the large page will be placed on the freelist, 1600 * and NULL will be returned. 1601 * The caller is responsible for locking the freelist as well as any other 1602 * accounting which needs to be done for a returned page. 1603 * 1604 * RFE: For performance pass in pp instead of pfnum so 1605 * we can avoid excessive calls to page_numtopp_nolock(). 1606 * This would depend on an assumption that all contiguous 1607 * pages are in the same memseg so we can just add/dec 1608 * our pp. 1609 * 1610 * Lock ordering: 1611 * 1612 * There is a potential but rare deadlock situation 1613 * for page promotion and demotion operations. The problem 1614 * is there are two paths into the freelist manager and 1615 * they have different lock orders: 1616 * 1617 * page_create() 1618 * lock freelist 1619 * page_lock(EXCL) 1620 * unlock freelist 1621 * return 1622 * caller drops page_lock 1623 * 1624 * page_free() and page_reclaim() 1625 * caller grabs page_lock(EXCL) 1626 * 1627 * lock freelist 1628 * unlock freelist 1629 * drop page_lock 1630 * 1631 * What prevents a thread in page_create() from deadlocking 1632 * with a thread freeing or reclaiming the same page is the 1633 * page_trylock() in page_get_freelist(). If the trylock fails 1634 * it skips the page. 1635 * 1636 * The lock ordering for promotion and demotion is the same as 1637 * for page_create(). Since the same deadlock could occur during 1638 * page promotion and freeing or reclaiming of a page on the 1639 * cache list we might have to fail the operation and undo what 1640 * have done so far. Again this is rare. 1641 */ 1642 page_t * 1643 page_promote(int mnode, pfn_t pfnum, uchar_t new_szc, int flags) 1644 { 1645 page_t *pp, *pplist, *tpp, *start_pp; 1646 pgcnt_t new_npgs, npgs; 1647 uint_t bin; 1648 pgcnt_t tmpnpgs, pages_left; 1649 uint_t mtype; 1650 uint_t noreloc; 1651 uint_t i; 1652 int which_list; 1653 ulong_t index; 1654 kmutex_t *phm; 1655 1656 /* 1657 * General algorithm: 1658 * Find the starting page 1659 * Walk each page struct removing it from the freelist, 1660 * and linking it to all the other pages removed. 1661 * Once all pages are off the freelist, 1662 * walk the list, modifying p_szc to new_szc and what 1663 * ever other info needs to be done to create a large free page. 
1664 * According to the flags, either return the page or put it 1665 * on the freelist. 1666 */ 1667 1668 start_pp = page_numtopp_nolock(pfnum); 1669 ASSERT(start_pp && (start_pp->p_pagenum == pfnum)); 1670 new_npgs = page_get_pagecnt(new_szc); 1671 ASSERT(IS_P2ALIGNED(pfnum, new_npgs)); 1672 1673 /* 1674 * Loop through smaller pages to confirm that all pages 1675 * give the same result for PP_ISNORELOC(). 1676 * We can check this reliably here as the protocol for setting 1677 * P_NORELOC requires pages to be taken off the free list first. 1678 */ 1679 for (i = 0, pp = start_pp; i < new_npgs; i++, pp++) { 1680 if (pp == start_pp) { 1681 /* First page, set requirement. */ 1682 noreloc = PP_ISNORELOC(pp); 1683 } else if (noreloc != PP_ISNORELOC(pp)) { 1684 page_promote_noreloc_err++; 1685 page_promote_err++; 1686 return (NULL); 1687 } 1688 } 1689 1690 pages_left = new_npgs; 1691 pplist = NULL; 1692 pp = start_pp; 1693 1694 /* Loop around coalescing the smaller pages into a big page. */ 1695 while (pages_left) { 1696 /* 1697 * Remove from the freelist. 1698 */ 1699 ASSERT(PP_ISFREE(pp)); 1700 bin = PP_2_BIN(pp); 1701 ASSERT(mnode == PP_2_MEM_NODE(pp)); 1702 mtype = PP_2_MTYPE(pp); 1703 if (PP_ISAGED(pp)) { 1704 1705 /* 1706 * PG_FREE_LIST 1707 */ 1708 if (pp->p_szc) { 1709 page_vpsub(&PAGE_FREELISTS(mnode, 1710 pp->p_szc, bin, mtype), pp); 1711 } else { 1712 mach_page_sub(&PAGE_FREELISTS(mnode, 0, 1713 bin, mtype), pp); 1714 } 1715 which_list = PG_FREE_LIST; 1716 } else { 1717 ASSERT(pp->p_szc == 0); 1718 1719 /* 1720 * PG_CACHE_LIST 1721 * 1722 * Since this page comes from the 1723 * cachelist, we must destroy the 1724 * vnode association. 1725 */ 1726 if (!page_trylock(pp, SE_EXCL)) { 1727 goto fail_promote; 1728 } 1729 1730 /* 1731 * We need to be careful not to deadlock 1732 * with another thread in page_lookup(). 1733 * The page_lookup() thread could be holding 1734 * the same phm that we need if the two 1735 * pages happen to hash to the same phm lock. 1736 * At this point we have locked the entire 1737 * freelist and page_lookup() could be trying 1738 * to grab a freelist lock. 1739 */ 1740 index = PAGE_HASH_FUNC(pp->p_vnode, pp->p_offset); 1741 phm = PAGE_HASH_MUTEX(index); 1742 if (!mutex_tryenter(phm)) { 1743 page_unlock_noretire(pp); 1744 goto fail_promote; 1745 } 1746 1747 mach_page_sub(&PAGE_CACHELISTS(mnode, bin, mtype), pp); 1748 page_hashout(pp, phm); 1749 mutex_exit(phm); 1750 PP_SETAGED(pp); 1751 page_unlock_noretire(pp); 1752 which_list = PG_CACHE_LIST; 1753 } 1754 page_ctr_sub(mnode, mtype, pp, which_list); 1755 1756 /* 1757 * Concatenate the smaller page(s) onto 1758 * the large page list. 1759 */ 1760 tmpnpgs = npgs = page_get_pagecnt(pp->p_szc); 1761 pages_left -= npgs; 1762 tpp = pp; 1763 while (npgs--) { 1764 tpp->p_szc = new_szc; 1765 tpp = tpp->p_next; 1766 } 1767 page_list_concat(&pplist, &pp); 1768 pp += tmpnpgs; 1769 } 1770 CHK_LPG(pplist, new_szc); 1771 1772 /* 1773 * return the page to the user if requested 1774 * in the properly locked state. 
1775 */ 1776 if (flags == PC_ALLOC && (page_trylock_cons(pplist, SE_EXCL))) { 1777 return (pplist); 1778 } 1779 1780 /* 1781 * Otherwise place the new large page on the freelist 1782 */ 1783 bin = PP_2_BIN(pplist); 1784 mnode = PP_2_MEM_NODE(pplist); 1785 mtype = PP_2_MTYPE(pplist); 1786 page_vpadd(&PAGE_FREELISTS(mnode, new_szc, bin, mtype), pplist); 1787 1788 page_ctr_add(mnode, mtype, pplist, PG_FREE_LIST); 1789 return (NULL); 1790 1791 fail_promote: 1792 /* 1793 * A thread must have still been freeing or 1794 * reclaiming the page on the cachelist. 1795 * To prevent a deadlock undo what we have 1796 * done sofar and return failure. This 1797 * situation can only happen while promoting 1798 * PAGESIZE pages. 1799 */ 1800 page_promote_err++; 1801 while (pplist) { 1802 pp = pplist; 1803 mach_page_sub(&pplist, pp); 1804 pp->p_szc = 0; 1805 bin = PP_2_BIN(pp); 1806 mtype = PP_2_MTYPE(pp); 1807 mach_page_add(&PAGE_FREELISTS(mnode, 0, bin, mtype), pp); 1808 page_ctr_add(mnode, mtype, pp, PG_FREE_LIST); 1809 } 1810 return (NULL); 1811 1812 } 1813 1814 /* 1815 * Break up a large page into smaller size pages. 1816 * Pages involved are on the freelist before the call and may 1817 * be returned to the caller if requested, otherwise they will 1818 * be placed back on the freelist. 1819 * The caller is responsible for locking the freelist as well as any other 1820 * accounting which needs to be done for a returned page. 1821 * If flags is not PC_ALLOC, the color argument is ignored, and thus 1822 * technically, any value may be passed in but PC_NO_COLOR is the standard 1823 * which should be followed for clarity's sake. 1824 */ 1825 page_t * 1826 page_demote(int mnode, pfn_t pfnum, uchar_t cur_szc, uchar_t new_szc, 1827 int color, int flags) 1828 { 1829 page_t *pp, *pplist, *npplist; 1830 pgcnt_t npgs, n; 1831 uint_t bin; 1832 uint_t mtype; 1833 page_t *ret_pp = NULL; 1834 1835 ASSERT(cur_szc != 0); 1836 ASSERT(new_szc < cur_szc); 1837 1838 pplist = page_numtopp_nolock(pfnum); 1839 ASSERT(pplist != NULL); 1840 1841 ASSERT(pplist->p_szc == cur_szc); 1842 1843 bin = PP_2_BIN(pplist); 1844 ASSERT(mnode == PP_2_MEM_NODE(pplist)); 1845 mtype = PP_2_MTYPE(pplist); 1846 page_vpsub(&PAGE_FREELISTS(mnode, cur_szc, bin, mtype), pplist); 1847 1848 CHK_LPG(pplist, cur_szc); 1849 page_ctr_sub(mnode, mtype, pplist, PG_FREE_LIST); 1850 1851 /* 1852 * Number of PAGESIZE pages for smaller new_szc 1853 * page. 1854 */ 1855 npgs = page_get_pagecnt(new_szc); 1856 1857 while (pplist) { 1858 pp = pplist; 1859 1860 ASSERT(pp->p_szc == cur_szc); 1861 1862 /* 1863 * We either break it up into PAGESIZE pages or larger. 1864 */ 1865 if (npgs == 1) { /* PAGESIZE case */ 1866 mach_page_sub(&pplist, pp); 1867 ASSERT(pp->p_szc == cur_szc); 1868 ASSERT(new_szc == 0); 1869 ASSERT(mnode == PP_2_MEM_NODE(pp)); 1870 pp->p_szc = new_szc; 1871 bin = PP_2_BIN(pp); 1872 if ((bin == color) && (flags == PC_ALLOC) && 1873 (ret_pp == NULL) && 1874 page_trylock_cons(pp, SE_EXCL)) { 1875 ret_pp = pp; 1876 } else { 1877 mtype = PP_2_MTYPE(pp); 1878 mach_page_add(&PAGE_FREELISTS(mnode, 0, bin, 1879 mtype), pp); 1880 page_ctr_add(mnode, mtype, pp, PG_FREE_LIST); 1881 } 1882 } else { 1883 1884 /* 1885 * Break down into smaller lists of pages. 
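 *
 * Sketch with the sparc sizes used earlier in this file: demoting a free
 * 512K page to 64K pages peels off page_get_pagecnt(new_szc) == 8
 * constituent 8K page_t's per pass, retags their p_szc, and then either
 * hands that 64K page back to the caller (PC_ALLOC with a matching color)
 * or places it on the new_szc freelist.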
1886 */ 1887 page_list_break(&pplist, &npplist, npgs); 1888 1889 pp = pplist; 1890 n = npgs; 1891 while (n--) { 1892 ASSERT(pp->p_szc == cur_szc); 1893 pp->p_szc = new_szc; 1894 pp = pp->p_next; 1895 } 1896 1897 CHK_LPG(pplist, new_szc); 1898 1899 bin = PP_2_BIN(pplist); 1900 ASSERT(mnode == PP_2_MEM_NODE(pp)); 1901 if ((bin == color) && (flags == PC_ALLOC) && 1902 (ret_pp == NULL) && 1903 page_trylock_cons(pp, SE_EXCL)) { 1904 ret_pp = pp; 1905 } else { 1906 mtype = PP_2_MTYPE(pp); 1907 page_vpadd(&PAGE_FREELISTS(mnode, new_szc, 1908 bin, mtype), pplist); 1909 1910 page_ctr_add(mnode, mtype, pplist, 1911 PG_FREE_LIST); 1912 } 1913 pplist = npplist; 1914 } 1915 } 1916 return (ret_pp); 1917 } 1918 1919 int mpss_coalesce_disable = 0; 1920 1921 /* 1922 * Coalesce free pages into a page of the given szc and color if possible. 1923 * Return the pointer to the page created, otherwise, return NULL. 1924 */ 1925 static page_t * 1926 page_freelist_coalesce(int mnode, uchar_t szc, int color) 1927 { 1928 int r; /* region size */ 1929 int idx, full, i; 1930 pfn_t pfnum; 1931 size_t len; 1932 size_t buckets_to_check; 1933 pgcnt_t cands; 1934 page_t *ret_pp; 1935 int color_stride; 1936 1937 VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce); 1938 1939 if (mpss_coalesce_disable) { 1940 return (NULL); 1941 } 1942 1943 r = szc; 1944 PGCTRS_CANDS_GETVALUECOLOR(mnode, r, color, cands); 1945 if (cands == 0) { 1946 VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip); 1947 return (NULL); 1948 } 1949 full = FULL_REGION_CNT(r); 1950 color_stride = (szc) ? page_convert_color(0, szc, page_colors - 1) + 1 : 1951 page_colors; 1952 1953 /* Prevent page_counters dynamic memory from being freed */ 1954 rw_enter(&page_ctrs_rwlock[mnode], RW_READER); 1955 len = PAGE_COUNTERS_ENTRIES(mnode, r); 1956 buckets_to_check = len / color_stride; 1957 idx = PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color); 1958 ASSERT((idx % color_stride) == color); 1959 idx += color_stride; 1960 if (idx >= len) 1961 idx = color; 1962 for (i = 0; i < buckets_to_check; i++) { 1963 if (PAGE_COUNTERS(mnode, r, idx) == full) { 1964 pfnum = IDX_TO_PNUM(mnode, r, idx); 1965 ASSERT(pfnum >= mem_node_config[mnode].physbase && 1966 pfnum < mem_node_config[mnode].physmax); 1967 /* 1968 * RFE: For performance maybe we can do something less 1969 * brutal than locking the entire freelist. So far 1970 * this doesn't seem to be a performance problem? 1971 */ 1972 page_freelist_lock(mnode); 1973 if (PAGE_COUNTERS(mnode, r, idx) != full) { 1974 VM_STAT_ADD(vmm_vmstats.page_ctrs_changed); 1975 goto skip_this_one; 1976 } 1977 ret_pp = page_promote(mnode, pfnum, r, PC_ALLOC); 1978 if (ret_pp != NULL) { 1979 PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color) = 1980 idx; 1981 page_freelist_unlock(mnode); 1982 rw_exit(&page_ctrs_rwlock[mnode]); 1983 #if defined(__sparc) 1984 if (PP_ISNORELOC(ret_pp)) { 1985 pgcnt_t npgs; 1986 1987 npgs = page_get_pagecnt(ret_pp->p_szc); 1988 kcage_freemem_sub(npgs); 1989 } 1990 #endif 1991 return (ret_pp); 1992 } 1993 skip_this_one: 1994 page_freelist_unlock(mnode); 1995 /* 1996 * No point looking for another page if we've 1997 * already tried all of the ones that 1998 * page_ctr_cands indicated. Stash off where we left 1999 * off. 2000 * Note: this is not exact since we don't hold the 2001 * page_freelist_locks before we initially get the 2002 * value of cands for performance reasons, but should 2003 * be a decent approximation. 
2004 */ 2005 if (--cands == 0) { 2006 PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color) = 2007 idx; 2008 break; 2009 } 2010 } 2011 idx += color_stride; 2012 if (idx >= len) 2013 idx = color; 2014 } 2015 rw_exit(&page_ctrs_rwlock[mnode]); 2016 VM_STAT_ADD(vmm_vmstats.page_ctrs_failed); 2017 return (NULL); 2018 } 2019 2020 /* 2021 * For the given mnode, promote as many small pages to large pages as possible. 2022 */ 2023 void 2024 page_freelist_coalesce_all(int mnode) 2025 { 2026 int r; /* region size */ 2027 int idx, full; 2028 pfn_t pfnum; 2029 size_t len; 2030 2031 VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce_all); 2032 2033 if (mpss_coalesce_disable) { 2034 return; 2035 } 2036 2037 /* 2038 * Lock the entire freelist and coalesce what we can. 2039 * 2040 * Always promote to the largest page possible 2041 * first to reduce the number of page promotions. 2042 */ 2043 rw_enter(&page_ctrs_rwlock[mnode], RW_READER); 2044 page_freelist_lock(mnode); 2045 for (r = mmu_page_sizes - 1; r > 0; r--) { 2046 pgcnt_t cands; 2047 2048 PGCTRS_CANDS_GETVALUE(mnode, r, cands); 2049 if (cands == 0) { 2050 VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip_all); 2051 continue; 2052 } 2053 2054 full = FULL_REGION_CNT(r); 2055 len = PAGE_COUNTERS_ENTRIES(mnode, r); 2056 2057 for (idx = 0; idx < len; idx++) { 2058 if (PAGE_COUNTERS(mnode, r, idx) == full) { 2059 pfnum = IDX_TO_PNUM(mnode, r, idx); 2060 ASSERT(pfnum >= 2061 mem_node_config[mnode].physbase && 2062 pfnum < 2063 mem_node_config[mnode].physmax); 2064 (void) page_promote(mnode, pfnum, r, PC_FREE); 2065 } 2066 } 2067 } 2068 page_freelist_unlock(mnode); 2069 rw_exit(&page_ctrs_rwlock[mnode]); 2070 } 2071 2072 /* 2073 * This is where all polices for moving pages around 2074 * to different page size free lists is implemented. 2075 * Returns 1 on success, 0 on failure. 2076 * 2077 * So far these are the priorities for this algorithm in descending 2078 * order: 2079 * 2080 * 1) When servicing a request try to do so with a free page 2081 * from next size up. Helps defer fragmentation as long 2082 * as possible. 2083 * 2084 * 2) Page coalesce on demand. Only when a freelist 2085 * larger than PAGESIZE is empty and step 1 2086 * will not work since all larger size lists are 2087 * also empty. 2088 * 2089 * If pfnhi is non-zero, search for large page with pfn range less than pfnhi. 2090 */ 2091 page_t * 2092 page_freelist_fill(uchar_t szc, int color, int mnode, int mtype, pfn_t pfnhi) 2093 { 2094 uchar_t nszc = szc + 1; 2095 int bin; 2096 page_t *pp, *firstpp; 2097 page_t *ret_pp = NULL; 2098 2099 ASSERT(szc < mmu_page_sizes); 2100 2101 VM_STAT_ADD(vmm_vmstats.pff_req[szc]); 2102 /* 2103 * First try to break up a larger page to fill 2104 * current size freelist. 2105 */ 2106 while (nszc < mmu_page_sizes) { 2107 /* 2108 * If page found then demote it. 2109 */ 2110 bin = page_convert_color(szc, nszc, color); 2111 if (PAGE_FREELISTS(mnode, nszc, bin, mtype)) { 2112 page_freelist_lock(mnode); 2113 firstpp = pp = PAGE_FREELISTS(mnode, nszc, bin, mtype); 2114 2115 /* 2116 * If pfnhi is not PFNNULL, look for large page below 2117 * pfnhi. PFNNULL signifies no pfn requirement. 
2118 */ 2119 if (pfnhi != PFNNULL && pp->p_pagenum >= pfnhi) { 2120 do { 2121 pp = pp->p_vpnext; 2122 if (pp == firstpp) { 2123 pp = NULL; 2124 break; 2125 } 2126 } while (pp->p_pagenum >= pfnhi); 2127 } 2128 if (pp) { 2129 ASSERT(pp->p_szc == nszc); 2130 VM_STAT_ADD(vmm_vmstats.pff_demote[nszc]); 2131 ret_pp = page_demote(mnode, pp->p_pagenum, 2132 pp->p_szc, szc, color, PC_ALLOC); 2133 if (ret_pp) { 2134 page_freelist_unlock(mnode); 2135 #if defined(__sparc) 2136 if (PP_ISNORELOC(ret_pp)) { 2137 pgcnt_t npgs; 2138 2139 npgs = page_get_pagecnt( 2140 ret_pp->p_szc); 2141 kcage_freemem_sub(npgs); 2142 } 2143 #endif 2144 return (ret_pp); 2145 } 2146 } 2147 page_freelist_unlock(mnode); 2148 } 2149 nszc++; 2150 } 2151 2152 /* 2153 * Ok that didn't work. Time to coalesce. 2154 */ 2155 if (szc != 0) { 2156 ret_pp = page_freelist_coalesce(mnode, szc, color); 2157 VM_STAT_COND_ADD(ret_pp, vmm_vmstats.pff_coalok[szc]); 2158 } 2159 2160 return (ret_pp); 2161 } 2162 2163 /* 2164 * Helper routine used only by the freelist code to lock 2165 * a page. If the page is a large page then it succeeds in 2166 * locking all the constituent pages or none at all. 2167 * Returns 1 on sucess, 0 on failure. 2168 */ 2169 static int 2170 page_trylock_cons(page_t *pp, se_t se) 2171 { 2172 page_t *tpp, *first_pp = pp; 2173 2174 /* 2175 * Fail if can't lock first or only page. 2176 */ 2177 if (!page_trylock(pp, se)) { 2178 return (0); 2179 } 2180 2181 /* 2182 * PAGESIZE: common case. 2183 */ 2184 if (pp->p_szc == 0) { 2185 return (1); 2186 } 2187 2188 /* 2189 * Large page case. 2190 */ 2191 tpp = pp->p_next; 2192 while (tpp != pp) { 2193 if (!page_trylock(tpp, se)) { 2194 /* 2195 * On failure unlock what we 2196 * have locked so far. 2197 */ 2198 while (first_pp != tpp) { 2199 page_unlock_noretire(first_pp); 2200 first_pp = first_pp->p_next; 2201 } 2202 return (0); 2203 } 2204 tpp = tpp->p_next; 2205 } 2206 return (1); 2207 } 2208 2209 page_t * 2210 page_get_mnode_freelist(int mnode, uint_t bin, int mtype, uchar_t szc, 2211 uint_t flags) 2212 { 2213 kmutex_t *pcm; 2214 int i, fill_tried, fill_marker; 2215 page_t *pp, *first_pp; 2216 uint_t bin_marker; 2217 int colors, cpucolors; 2218 uchar_t nszc; 2219 uint_t nszc_color_shift; 2220 int nwaybins = 0, nwaycnt; 2221 2222 ASSERT(szc < mmu_page_sizes); 2223 2224 VM_STAT_ADD(vmm_vmstats.pgmf_alloc[szc]); 2225 2226 MTYPE_START(mnode, mtype, flags); 2227 if (mtype < 0) { /* mnode foes not have memory in mtype range */ 2228 VM_STAT_ADD(vmm_vmstats.pgmf_allocempty[szc]); 2229 return (NULL); 2230 } 2231 2232 /* 2233 * Set how many physical colors for this page size. 2234 */ 2235 colors = (szc) ? page_convert_color(0, szc, page_colors - 1) + 1 : 2236 page_colors; 2237 2238 nszc = MIN(szc + 1, mmu_page_sizes - 1); 2239 nszc_color_shift = page_get_shift(nszc) - page_get_shift(szc); 2240 2241 /* cpu_page_colors is non-zero if a page color may be in > 1 bin */ 2242 cpucolors = cpu_page_colors; 2243 2244 /* 2245 * adjust cpucolors to possibly check additional 'equivalent' bins 2246 * to try to minimize fragmentation of large pages by delaying calls 2247 * to page_freelist_fill. 
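 *
 * Illustrative arithmetic only (the values are hypothetical): with
 * colors == 32 and colorequiv == 4, equivcolors == 8 and cpucolors is
 * lowered to 8, so a later PG_MATCH_COLOR pass computes
 * nwaybins == 32 / 8 == 4 and probes bins bin, bin+8, bin+16 and bin+24
 * as 'equivalent' before resorting to page_freelist_fill.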
2248 */ 2249 if (colorequiv > 1) { 2250 int equivcolors = colors / colorequiv; 2251 2252 if (equivcolors && (cpucolors == 0 || equivcolors < cpucolors)) 2253 cpucolors = equivcolors; 2254 } 2255 2256 ASSERT(colors <= page_colors); 2257 ASSERT(colors); 2258 ASSERT((colors & (colors - 1)) == 0); 2259 2260 ASSERT(bin < colors); 2261 2262 /* 2263 * Only hold one freelist lock at a time, that way we 2264 * can start anywhere and not have to worry about lock 2265 * ordering. 2266 */ 2267 big_try_again: 2268 fill_tried = 0; 2269 nwaycnt = 0; 2270 for (i = 0; i <= colors; i++) { 2271 try_again: 2272 ASSERT(bin < colors); 2273 if (PAGE_FREELISTS(mnode, szc, bin, mtype)) { 2274 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST); 2275 mutex_enter(pcm); 2276 pp = PAGE_FREELISTS(mnode, szc, bin, mtype); 2277 if (pp != NULL) { 2278 /* 2279 * These were set before the page 2280 * was put on the free list, 2281 * they must still be set. 2282 */ 2283 ASSERT(PP_ISFREE(pp)); 2284 ASSERT(PP_ISAGED(pp)); 2285 ASSERT(pp->p_vnode == NULL); 2286 ASSERT(pp->p_hash == NULL); 2287 ASSERT(pp->p_offset == (u_offset_t)-1); 2288 ASSERT(pp->p_szc == szc); 2289 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode); 2290 2291 /* 2292 * Walk down the hash chain. 2293 * 8k pages are linked on p_next 2294 * and p_prev fields. Large pages 2295 * are a contiguous group of 2296 * constituent pages linked together 2297 * on their p_next and p_prev fields. 2298 * The large pages are linked together 2299 * on the hash chain using p_vpnext 2300 * p_vpprev of the base constituent 2301 * page of each large page. 2302 */ 2303 first_pp = pp; 2304 while (!page_trylock_cons(pp, SE_EXCL)) { 2305 if (szc == 0) { 2306 pp = pp->p_next; 2307 } else { 2308 pp = pp->p_vpnext; 2309 } 2310 2311 ASSERT(PP_ISFREE(pp)); 2312 ASSERT(PP_ISAGED(pp)); 2313 ASSERT(pp->p_vnode == NULL); 2314 ASSERT(pp->p_hash == NULL); 2315 ASSERT(pp->p_offset == (u_offset_t)-1); 2316 ASSERT(pp->p_szc == szc); 2317 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == 2318 mnode); 2319 2320 if (pp == first_pp) { 2321 pp = NULL; 2322 break; 2323 } 2324 } 2325 2326 if (pp) { 2327 ASSERT(mtype == PP_2_MTYPE(pp)); 2328 ASSERT(pp->p_szc == szc); 2329 if (szc == 0) { 2330 page_sub(&PAGE_FREELISTS(mnode, 2331 szc, bin, mtype), pp); 2332 } else { 2333 page_vpsub(&PAGE_FREELISTS( 2334 mnode, szc, bin, mtype), 2335 pp); 2336 CHK_LPG(pp, szc); 2337 } 2338 page_ctr_sub(mnode, mtype, pp, 2339 PG_FREE_LIST); 2340 2341 if ((PP_ISFREE(pp) == 0) || 2342 (PP_ISAGED(pp) == 0)) 2343 panic("free page is not. pp %p", 2344 (void *)pp); 2345 mutex_exit(pcm); 2346 2347 #if defined(__sparc) 2348 ASSERT(!kcage_on || PP_ISNORELOC(pp) || 2349 (flags & PG_NORELOC) == 0); 2350 2351 if (PP_ISNORELOC(pp)) { 2352 pgcnt_t npgs; 2353 2354 npgs = page_get_pagecnt(szc); 2355 kcage_freemem_sub(npgs); 2356 } 2357 #endif 2358 VM_STAT_ADD(vmm_vmstats. 2359 pgmf_allocok[szc]); 2360 return (pp); 2361 } 2362 } 2363 mutex_exit(pcm); 2364 } 2365 2366 /* 2367 * Wow! The initial bin is empty. 2368 * If specific color is needed, check if page color may be 2369 * in other bins. cpucolors is: 2370 * 0 if the colors for this cpu is equal to page_colors. 2371 * This means that pages with a particular color are in a 2372 * single bin. 2373 * -1 if colors of cpus (cheetah+) are heterogenous. Need to 2374 * first determine the colors for the current cpu. 
2375 * >0 colors of all cpus are homogenous and < page_colors 2376 */ 2377 2378 if ((flags & PG_MATCH_COLOR) && (cpucolors != 0)) { 2379 if (!nwaybins) { 2380 /* 2381 * cpucolors is negative if ecache setsizes 2382 * are heterogenous. determine colors for this 2383 * particular cpu. 2384 */ 2385 if (cpucolors < 0) { 2386 cpucolors = CPUSETSIZE() / MMU_PAGESIZE; 2387 ASSERT(cpucolors > 0); 2388 nwaybins = colors / cpucolors; 2389 } else { 2390 nwaybins = colors / cpucolors; 2391 ASSERT(szc > 0 || nwaybins > 1); 2392 } 2393 if (nwaybins < 2) 2394 cpucolors = 0; 2395 } 2396 2397 if (cpucolors && (nwaycnt + 1 <= nwaybins)) { 2398 nwaycnt++; 2399 bin = (bin + (colors / nwaybins)) & 2400 (colors - 1); 2401 if (nwaycnt < nwaybins) { 2402 goto try_again; 2403 } 2404 } 2405 /* back to initial color if fall-thru */ 2406 } 2407 2408 /* 2409 * color bins are all empty if color match. Try and satisfy 2410 * the request by breaking up or coalescing pages from 2411 * a different size freelist of the correct color that 2412 * satisfies the ORIGINAL color requested. If that 2413 * fails then try pages of the same size but different 2414 * colors assuming we are not called with 2415 * PG_MATCH_COLOR. 2416 */ 2417 if (!fill_tried) { 2418 fill_tried = 1; 2419 fill_marker = bin >> nszc_color_shift; 2420 pp = page_freelist_fill(szc, bin, mnode, mtype, 2421 PFNNULL); 2422 if (pp != NULL) { 2423 return (pp); 2424 } 2425 } 2426 2427 if (flags & PG_MATCH_COLOR) 2428 break; 2429 2430 /* 2431 * Select next color bin to try. 2432 */ 2433 if (szc == 0) { 2434 /* 2435 * PAGESIZE page case. 2436 */ 2437 if (i == 0) { 2438 bin = (bin + BIN_STEP) & page_colors_mask; 2439 bin_marker = bin; 2440 } else { 2441 bin = (bin + vac_colors) & page_colors_mask; 2442 if (bin == bin_marker) { 2443 bin = (bin + 1) & page_colors_mask; 2444 bin_marker = bin; 2445 } 2446 } 2447 } else { 2448 /* 2449 * Large page case. 2450 */ 2451 bin = (bin + 1) & (colors - 1); 2452 } 2453 /* 2454 * If bin advanced to the next color bin of the 2455 * next larger pagesize, there is a chance the fill 2456 * could succeed. 2457 */ 2458 if (fill_marker != (bin >> nszc_color_shift)) 2459 fill_tried = 0; 2460 } 2461 2462 /* if allowed, cycle through additional mtypes */ 2463 MTYPE_NEXT(mnode, mtype, flags); 2464 if (mtype >= 0) 2465 goto big_try_again; 2466 2467 VM_STAT_ADD(vmm_vmstats.pgmf_allocfailed[szc]); 2468 2469 return (NULL); 2470 } 2471 2472 2473 /* 2474 * Returns the count of free pages for 'pp' with size code 'szc'. 2475 * Note: This function does not return an exact value as the page freelist 2476 * locks are not held and thus the values in the page_counters may be 2477 * changing as we walk through the data. 
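 *
 * A purely illustrative walk of the counters (region sizes are
 * hypothetical): for szc == 2 with FULL_REGION_CNT(2) == 8 and eight
 * base pages per szc 1 region, a level 2 counter of 3 contributes
 * 3 << PNUM_SHIFT(1) == 24 free pages; descending to level 1, the three
 * counters that already read full are skipped and the remaining five,
 * say 4 + 0 + 6 + 0 + 2, add 12 more, giving an approximate total of
 * 36 free pages under 'pp'.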
2478 */ 2479 static int 2480 page_freecnt(int mnode, page_t *pp, uchar_t szc) 2481 { 2482 pgcnt_t pgfree; 2483 pgcnt_t cnt; 2484 ssize_t r = szc; /* region size */ 2485 ssize_t idx; 2486 int i; 2487 int full, range; 2488 2489 /* Make sure pagenum passed in is aligned properly */ 2490 ASSERT((pp->p_pagenum & (PNUM_SIZE(szc) - 1)) == 0); 2491 ASSERT(szc > 0); 2492 2493 /* Prevent page_counters dynamic memory from being freed */ 2494 rw_enter(&page_ctrs_rwlock[mnode], RW_READER); 2495 idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum); 2496 cnt = PAGE_COUNTERS(mnode, r, idx); 2497 pgfree = cnt << PNUM_SHIFT(r - 1); 2498 range = FULL_REGION_CNT(szc); 2499 2500 /* Check for completely full region */ 2501 if (cnt == range) { 2502 rw_exit(&page_ctrs_rwlock[mnode]); 2503 return (pgfree); 2504 } 2505 2506 while (--r > 0) { 2507 idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum); 2508 full = FULL_REGION_CNT(r); 2509 for (i = 0; i < range; i++, idx++) { 2510 cnt = PAGE_COUNTERS(mnode, r, idx); 2511 /* 2512 * If cnt here is full, that means we have already 2513 * accounted for these pages earlier. 2514 */ 2515 if (cnt != full) { 2516 pgfree += (cnt << PNUM_SHIFT(r - 1)); 2517 } 2518 } 2519 range *= full; 2520 } 2521 rw_exit(&page_ctrs_rwlock[mnode]); 2522 return (pgfree); 2523 } 2524 2525 /* 2526 * Called from page_geti_contig_pages to exclusively lock constituent pages 2527 * starting from 'spp' for page size code 'szc'. 2528 * 2529 * If 'ptcpthreshold' is set, the number of free pages needed in the 'szc' 2530 * region needs to be greater than or equal to the threshold. 2531 */ 2532 static int 2533 page_trylock_contig_pages(int mnode, page_t *spp, uchar_t szc, int flags) 2534 { 2535 pgcnt_t pgcnt = PNUM_SIZE(szc); 2536 pgcnt_t pgfree, i; 2537 page_t *pp; 2538 2539 VM_STAT_ADD(vmm_vmstats.ptcp[szc]); 2540 2541 2542 if ((ptcpthreshold == 0) || (flags & PGI_PGCPHIPRI)) 2543 goto skipptcpcheck; 2544 /* 2545 * check if there are sufficient free pages available before attempting 2546 * to trylock. Count is approximate as page counters can change. 2547 */ 2548 pgfree = page_freecnt(mnode, spp, szc); 2549 2550 /* attempt to trylock if there are sufficient already free pages */ 2551 if (pgfree < pgcnt/ptcpthreshold) { 2552 VM_STAT_ADD(vmm_vmstats.ptcpfreethresh[szc]); 2553 return (0); 2554 } 2555 2556 skipptcpcheck: 2557 2558 for (i = 0; i < pgcnt; i++) { 2559 pp = &spp[i]; 2560 if (!page_trylock(pp, SE_EXCL)) { 2561 VM_STAT_ADD(vmm_vmstats.ptcpfailexcl[szc]); 2562 while (--i != (pgcnt_t)-1) { 2563 pp = &spp[i]; 2564 ASSERT(PAGE_EXCL(pp)); 2565 page_unlock_noretire(pp); 2566 } 2567 return (0); 2568 } 2569 ASSERT(spp[i].p_pagenum == spp->p_pagenum + i); 2570 if ((pp->p_szc > szc || (szc && pp->p_szc == szc)) && 2571 !PP_ISFREE(pp)) { 2572 VM_STAT_ADD(vmm_vmstats.ptcpfailszc[szc]); 2573 ASSERT(i == 0); 2574 page_unlock_noretire(pp); 2575 return (0); 2576 } 2577 if (PP_ISNORELOC(pp)) { 2578 VM_STAT_ADD(vmm_vmstats.ptcpfailcage[szc]); 2579 while (i != (pgcnt_t)-1) { 2580 pp = &spp[i]; 2581 ASSERT(PAGE_EXCL(pp)); 2582 page_unlock_noretire(pp); 2583 i--; 2584 } 2585 return (0); 2586 } 2587 } 2588 VM_STAT_ADD(vmm_vmstats.ptcpok[szc]); 2589 return (1); 2590 } 2591 2592 /* 2593 * Claim large page pointed to by 'pp'. 'pp' is the starting set 2594 * of 'szc' constituent pages that had been locked exclusively previously. 2595 * Will attempt to relocate constituent pages in use. 
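 *
 * Summary of the cases handled below: constituent pages already on the
 * free list are pulled off with page_list_sub_pages(), free cachelist
 * pages are hashed out and converted, and in-use pages are given a
 * replacement obtained from page_get_replacement_page() and moved with
 * do_page_relocate(); claimed constituents are concatenated onto a local
 * list until the whole szc region is processed, and any failure unwinds
 * the work done so far.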
2596 */ 2597 static page_t * 2598 page_claim_contig_pages(page_t *pp, uchar_t szc, int flags) 2599 { 2600 spgcnt_t pgcnt, npgs, i; 2601 page_t *targpp, *rpp, *hpp; 2602 page_t *replpp = NULL; 2603 page_t *pplist = NULL; 2604 2605 ASSERT(pp != NULL); 2606 2607 pgcnt = page_get_pagecnt(szc); 2608 while (pgcnt) { 2609 ASSERT(PAGE_EXCL(pp)); 2610 ASSERT(!PP_ISNORELOC(pp)); 2611 if (PP_ISFREE(pp)) { 2612 /* 2613 * If this is a PG_FREE_LIST page then its 2614 * size code can change underneath us due to 2615 * page promotion or demotion. As an optimzation 2616 * use page_list_sub_pages() instead of 2617 * page_list_sub(). 2618 */ 2619 if (PP_ISAGED(pp)) { 2620 page_list_sub_pages(pp, szc); 2621 if (pp->p_szc == szc) { 2622 return (pp); 2623 } 2624 ASSERT(pp->p_szc < szc); 2625 npgs = page_get_pagecnt(pp->p_szc); 2626 hpp = pp; 2627 for (i = 0; i < npgs; i++, pp++) { 2628 pp->p_szc = szc; 2629 } 2630 page_list_concat(&pplist, &hpp); 2631 pgcnt -= npgs; 2632 continue; 2633 } 2634 ASSERT(!PP_ISAGED(pp)); 2635 ASSERT(pp->p_szc == 0); 2636 page_list_sub(pp, PG_CACHE_LIST); 2637 page_hashout(pp, NULL); 2638 PP_SETAGED(pp); 2639 pp->p_szc = szc; 2640 page_list_concat(&pplist, &pp); 2641 pp++; 2642 pgcnt--; 2643 continue; 2644 } 2645 npgs = page_get_pagecnt(pp->p_szc); 2646 2647 /* 2648 * page_create_wait freemem accounting done by caller of 2649 * page_get_freelist and not necessary to call it prior to 2650 * calling page_get_replacement_page. 2651 * 2652 * page_get_replacement_page can call page_get_contig_pages 2653 * to acquire a large page (szc > 0); the replacement must be 2654 * smaller than the contig page size to avoid looping or 2655 * szc == 0 and PGI_PGCPSZC0 is set. 2656 */ 2657 if (pp->p_szc < szc || (szc == 0 && (flags & PGI_PGCPSZC0))) { 2658 replpp = page_get_replacement_page(pp, NULL, 0); 2659 if (replpp) { 2660 npgs = page_get_pagecnt(pp->p_szc); 2661 ASSERT(npgs <= pgcnt); 2662 targpp = pp; 2663 } 2664 } 2665 2666 /* 2667 * If replacement is NULL or do_page_relocate fails, fail 2668 * coalescing of pages. 2669 */ 2670 if (replpp == NULL || (do_page_relocate(&targpp, &replpp, 0, 2671 &npgs, NULL) != 0)) { 2672 /* 2673 * Unlock un-processed target list 2674 */ 2675 while (pgcnt--) { 2676 ASSERT(PAGE_EXCL(pp)); 2677 page_unlock_noretire(pp); 2678 pp++; 2679 } 2680 /* 2681 * Free the processed target list. 
2682 */ 2683 while (pplist) { 2684 pp = pplist; 2685 page_sub(&pplist, pp); 2686 ASSERT(PAGE_EXCL(pp)); 2687 ASSERT(pp->p_szc == szc); 2688 ASSERT(PP_ISFREE(pp)); 2689 ASSERT(PP_ISAGED(pp)); 2690 pp->p_szc = 0; 2691 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL); 2692 page_unlock_noretire(pp); 2693 } 2694 2695 if (replpp != NULL) 2696 page_free_replacement_page(replpp); 2697 2698 return (NULL); 2699 } 2700 ASSERT(pp == targpp); 2701 2702 /* LINTED */ 2703 ASSERT(hpp = pp); /* That's right, it's an assignment */ 2704 2705 pp += npgs; 2706 pgcnt -= npgs; 2707 2708 while (npgs--) { 2709 ASSERT(PAGE_EXCL(targpp)); 2710 ASSERT(!PP_ISFREE(targpp)); 2711 ASSERT(!PP_ISNORELOC(targpp)); 2712 PP_SETFREE(targpp); 2713 ASSERT(PP_ISAGED(targpp)); 2714 ASSERT(targpp->p_szc < szc || (szc == 0 && 2715 (flags & PGI_PGCPSZC0))); 2716 targpp->p_szc = szc; 2717 targpp = targpp->p_next; 2718 2719 rpp = replpp; 2720 ASSERT(rpp != NULL); 2721 page_sub(&replpp, rpp); 2722 ASSERT(PAGE_EXCL(rpp)); 2723 ASSERT(!PP_ISFREE(rpp)); 2724 page_unlock_noretire(rpp); 2725 } 2726 ASSERT(targpp == hpp); 2727 ASSERT(replpp == NULL); 2728 page_list_concat(&pplist, &targpp); 2729 } 2730 CHK_LPG(pplist, szc); 2731 return (pplist); 2732 } 2733 2734 /* 2735 * Trim kernel cage from pfnlo-pfnhi and store result in lo-hi. Return code 2736 * of 0 means nothing left after trim. 2737 */ 2738 2739 int 2740 trimkcage(struct memseg *mseg, pfn_t *lo, pfn_t *hi, pfn_t pfnlo, pfn_t pfnhi) 2741 { 2742 pfn_t kcagepfn; 2743 int decr; 2744 int rc = 0; 2745 2746 if (PP_ISNORELOC(mseg->pages)) { 2747 if (PP_ISNORELOC(mseg->epages - 1) == 0) { 2748 2749 /* lower part of this mseg inside kernel cage */ 2750 decr = kcage_current_pfn(&kcagepfn); 2751 2752 /* kernel cage may have transitioned past mseg */ 2753 if (kcagepfn >= mseg->pages_base && 2754 kcagepfn < mseg->pages_end) { 2755 ASSERT(decr == 0); 2756 *lo = kcagepfn; 2757 *hi = MIN(pfnhi, 2758 (mseg->pages_end - 1)); 2759 rc = 1; 2760 } 2761 } 2762 /* else entire mseg in the cage */ 2763 } else { 2764 if (PP_ISNORELOC(mseg->epages - 1)) { 2765 2766 /* upper part of this mseg inside kernel cage */ 2767 decr = kcage_current_pfn(&kcagepfn); 2768 2769 /* kernel cage may have transitioned past mseg */ 2770 if (kcagepfn >= mseg->pages_base && 2771 kcagepfn < mseg->pages_end) { 2772 ASSERT(decr); 2773 *hi = kcagepfn; 2774 *lo = MAX(pfnlo, mseg->pages_base); 2775 rc = 1; 2776 } 2777 } else { 2778 /* entire mseg outside of kernel cage */ 2779 *lo = MAX(pfnlo, mseg->pages_base); 2780 *hi = MIN(pfnhi, (mseg->pages_end - 1)); 2781 rc = 1; 2782 } 2783 } 2784 return (rc); 2785 } 2786 2787 /* 2788 * Called from page_get_contig_pages to search 'pfnlo' through 'pfnhi' to "claim" a 2789 * page with size code 'szc'. Claiming such a page requires acquiring 2790 * exclusive locks on all constituent pages (page_trylock_contig_pages), 2791 * relocating pages in use and concatenating these constituent pages into a 2792 * large page. 2793 * 2794 * The page lists do not have such a large page and page_freelist_fill has 2795 * already failed to demote larger pages and/or coalesce smaller free pages. 2796 * 2797 * 'flags' may specify PG_MATCH_COLOR which would limit the search to large 2798 * pages with the same color as 'bin'. 2799 * 2800 * 'pfnflag' specifies the subset of the pfn range to search.
2801 */ 2802 2803 2804 static page_t * 2805 page_geti_contig_pages(int mnode, uint_t bin, uchar_t szc, int flags, 2806 pfn_t pfnlo, pfn_t pfnhi, pgcnt_t pfnflag) 2807 { 2808 struct memseg *mseg; 2809 pgcnt_t szcpgcnt = page_get_pagecnt(szc); 2810 pgcnt_t szcpgmask = szcpgcnt - 1; 2811 pfn_t randpfn; 2812 page_t *pp, *randpp, *endpp; 2813 uint_t colors; 2814 pfn_t hi, lo; 2815 uint_t skip; 2816 2817 ASSERT(szc != 0 || (flags & PGI_PGCPSZC0)); 2818 2819 if ((pfnhi - pfnlo) + 1 < szcpgcnt) 2820 return (NULL); 2821 2822 ASSERT(szc < mmu_page_sizes); 2823 2824 colors = (szc) ? page_convert_color(0, szc, page_colors - 1) + 1 : 2825 page_colors; 2826 2827 ASSERT(bin < colors); 2828 2829 /* 2830 * trim the pfn range to search based on pfnflag. pfnflag is set 2831 * when there have been previous page_get_contig_page failures to 2832 * limit the search. 2833 * 2834 * The high bit in pfnflag specifies the number of 'slots' in the 2835 * pfn range and the remainder of pfnflag specifies which slot. 2836 * For example, a value of 1010b would mean the second slot of 2837 * the pfn range that has been divided into 8 slots. 2838 */ 2839 if (pfnflag > 1) { 2840 int slots = 1 << (highbit(pfnflag) - 1); 2841 int slotid = pfnflag & (slots - 1); 2842 pgcnt_t szcpages; 2843 int slotlen; 2844 2845 pfnlo = P2ROUNDUP(pfnlo, szcpgcnt); 2846 pfnhi = pfnhi & ~(szcpgcnt - 1); 2847 2848 szcpages = ((pfnhi - pfnlo) + 1) / szcpgcnt; 2849 slotlen = howmany(szcpages, slots); 2850 pfnlo = pfnlo + (((slotid * slotlen) % szcpages) * szcpgcnt); 2851 ASSERT(pfnlo < pfnhi); 2852 if (pfnhi > pfnlo + (slotlen * szcpgcnt)) 2853 pfnhi = pfnlo + (slotlen * szcpgcnt); 2854 } 2855 2856 memsegs_lock(0); 2857 2858 /* 2859 * loop through memsegs to look for contig page candidates 2860 */ 2861 2862 for (mseg = memsegs; mseg != NULL; mseg = mseg->next) { 2863 if (pfnhi < mseg->pages_base || pfnlo >= mseg->pages_end) { 2864 /* no overlap */ 2865 continue; 2866 } 2867 2868 if (mseg->pages_end - mseg->pages_base < szcpgcnt) 2869 /* mseg too small */ 2870 continue; 2871 2872 /* trim off kernel cage pages from pfn range */ 2873 if (kcage_on) { 2874 if (trimkcage(mseg, &lo, &hi, pfnlo, pfnhi) == 0) 2875 continue; 2876 } else { 2877 lo = MAX(pfnlo, mseg->pages_base); 2878 hi = MIN(pfnhi, (mseg->pages_end - 1)); 2879 } 2880 2881 /* round to szcpgcnt boundaries */ 2882 lo = P2ROUNDUP(lo, szcpgcnt); 2883 hi = hi & ~(szcpgcnt - 1); 2884 2885 if (hi <= lo) 2886 continue; 2887 2888 /* 2889 * set lo to point to the pfn for the desired bin. Large 2890 * page sizes may only have a single page color 2891 */ 2892 if ((colors > 1) && (flags & PG_MATCH_COLOR)) { 2893 uint_t lobin; 2894 2895 /* 2896 * factor in colorequiv to check additional 2897 * 'equivalent' bins. 2898 */ 2899 if (colorequiv > 1 && colors > colorequiv) 2900 colors = colors / colorequiv; 2901 2902 /* determine bin that lo currently points to */ 2903 lobin = (lo & ((szcpgcnt * colors) - 1)) / szcpgcnt; 2904 2905 /* 2906 * set lo to point at appropriate color and set skip 2907 * to arrive at the next szc page of the same color. 
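 *
 * Illustrative numbers (hypothetical configuration): with
 * szcpgcnt == 512, colors == 4, a requested bin of 3 and lobin == 1,
 * lo advances by ((3 - 1) & 3) * 512 == 1024 pfns so that it lands on
 * a large page of bin 3, and skip becomes 4 * 512 == 2048 so every
 * candidate visited afterwards keeps that color.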
2908 */ 2909 lo += ((bin - lobin) & (colors - 1)) * szcpgcnt; 2910 2911 skip = colors * szcpgcnt; 2912 } else { 2913 /* check all pages starting from lo */ 2914 skip = szcpgcnt; 2915 } 2916 if (hi <= lo) 2917 /* mseg cannot satisfy color request */ 2918 continue; 2919 2920 /* randomly choose a point between lo and hi to begin search */ 2921 2922 randpfn = (pfn_t)GETTICK(); 2923 randpfn = ((randpfn % (hi - lo)) + lo) & ~(skip - 1); 2924 randpp = mseg->pages + (randpfn - mseg->pages_base); 2925 2926 ASSERT(randpp->p_pagenum == randpfn); 2927 2928 pp = randpp; 2929 endpp = mseg->pages + (hi - mseg->pages_base); 2930 2931 ASSERT(randpp + szcpgcnt <= endpp); 2932 2933 do { 2934 ASSERT(!(pp->p_pagenum & szcpgmask)); 2935 ASSERT((flags & PG_MATCH_COLOR) == 0 || 2936 colorequiv > 1 || 2937 PP_2_BIN(pp) == bin); 2938 if (page_trylock_contig_pages(mnode, pp, szc, flags)) { 2939 /* pages unlocked by page_claim on failure */ 2940 if (page_claim_contig_pages(pp, szc, flags)) { 2941 memsegs_unlock(0); 2942 return (pp); 2943 } 2944 } 2945 2946 pp += skip; 2947 if (pp >= endpp) { 2948 /* start from the beginning */ 2949 pp = mseg->pages + (lo - mseg->pages_base); 2950 ASSERT(pp->p_pagenum == lo); 2951 ASSERT(pp + szcpgcnt <= endpp); 2952 } 2953 } while (pp != randpp); 2954 } 2955 memsegs_unlock(0); 2956 return (NULL); 2957 } 2958 2959 2960 /* 2961 * Controlling routine that searches through physical memory in an attempt to 2962 * claim a large page based on the input parameters when such a page could not 2963 * be found on the page free lists. 2964 * 2965 * Calls page_geti_contig_pages with an initial pfn range from the mnode 2966 * and mtype. page_geti_contig_pages will trim off the parts of the pfn range 2967 * that overlap with the kernel cage or do not match the requested page 2968 * color if PG_MATCH_COLOR is set. Since this search is very expensive, 2969 * page_geti_contig_pages may further limit the search range based on 2970 * previous failure counts (pgcpfailcnt[]). 2971 * 2972 * For PGI_PGCPSZC0 requests, page_get_contig_pages will relocate a base 2973 * pagesize page that satisfies mtype.
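 *
 * A hedged illustration of the pgcpfailcnt[] feedback (counter values
 * are hypothetical): repeated failures bump pgcpfailcnt[szc] via
 * SETPGCPFAILCNT(); once it reaches, say, 6 (binary 110), the pfnflag
 * passed down makes page_geti_contig_pages split the pfn range into
 * 1 << (highbit(6) - 1) == 4 slots and scan only slot 6 & 3 == 2, while
 * each success halves the counter (pfnflag >> 1), roughly doubling the
 * slice searched on the next call.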
2974 */ 2975 page_t * 2976 page_get_contig_pages(int mnode, uint_t bin, int mtype, uchar_t szc, 2977 uint_t flags) 2978 { 2979 pfn_t pfnlo, pfnhi; /* contig pages pfn range */ 2980 page_t *pp; 2981 pgcnt_t pfnflag = 0; /* no limit on search if 0 */ 2982 2983 VM_STAT_ADD(vmm_vmstats.pgcp_alloc[szc]); 2984 2985 /* no allocations from cage */ 2986 flags |= PGI_NOCAGE; 2987 2988 /* LINTED */ 2989 MTYPE_START(mnode, mtype, flags); 2990 if (mtype < 0) { /* mnode does not have memory in mtype range */ 2991 VM_STAT_ADD(vmm_vmstats.pgcp_allocempty[szc]); 2992 return (NULL); 2993 } 2994 2995 ASSERT(szc > 0 || (flags & PGI_PGCPSZC0)); 2996 2997 /* do not limit search and ignore color if hi pri */ 2998 2999 if (pgcplimitsearch && ((flags & PGI_PGCPHIPRI) == 0)) 3000 pfnflag = pgcpfailcnt[szc]; 3001 3002 /* remove color match to improve chances */ 3003 3004 if (flags & PGI_PGCPHIPRI || pfnflag) 3005 flags &= ~PG_MATCH_COLOR; 3006 3007 do { 3008 /* get pfn range based on mnode and mtype */ 3009 MNODETYPE_2_PFN(mnode, mtype, pfnlo, pfnhi); 3010 3011 ASSERT(pfnhi >= pfnlo); 3012 3013 pp = page_geti_contig_pages(mnode, bin, szc, flags, 3014 pfnlo, pfnhi, pfnflag); 3015 3016 if (pp != NULL) { 3017 pfnflag = pgcpfailcnt[szc]; 3018 if (pfnflag) { 3019 /* double the search size */ 3020 pgcpfailcnt[szc] = pfnflag >> 1; 3021 } 3022 VM_STAT_ADD(vmm_vmstats.pgcp_allocok[szc]); 3023 return (pp); 3024 } 3025 MTYPE_NEXT(mnode, mtype, flags); 3026 } while (mtype >= 0); 3027 3028 VM_STAT_ADD(vmm_vmstats.pgcp_allocfailed[szc]); 3029 return (NULL); 3030 } 3031 3032 3033 /* 3034 * Find the `best' page on the freelist for this (vp,off) (as,vaddr) pair. 3035 * 3036 * Does its own locking and accounting. 3037 * If PG_MATCH_COLOR is set, then NULL will be returned if there are no 3038 * pages of the proper color even if there are pages of a different color. 3039 * 3040 * Finds a page, removes it, THEN locks it. 3041 */ 3042 3043 /*ARGSUSED*/ 3044 page_t * 3045 page_get_freelist(struct vnode *vp, u_offset_t off, struct seg *seg, 3046 caddr_t vaddr, size_t size, uint_t flags, struct lgrp *lgrp) 3047 { 3048 struct as *as = seg->s_as; 3049 page_t *pp = NULL; 3050 ulong_t bin; 3051 uchar_t szc; 3052 int mnode; 3053 int mtype; 3054 page_t *(*page_get_func)(int, uint_t, int, uchar_t, uint_t); 3055 lgrp_mnode_cookie_t lgrp_cookie; 3056 3057 page_get_func = page_get_mnode_freelist; 3058 3059 /* 3060 * If we aren't passed a specific lgroup, or passed a freed lgrp 3061 * assume we wish to allocate near to the current thread's home. 3062 */ 3063 if (!LGRP_EXISTS(lgrp)) 3064 lgrp = lgrp_home_lgrp(); 3065 3066 if (kcage_on) { 3067 if ((flags & (PG_NORELOC | PG_PANIC)) == PG_NORELOC && 3068 kcage_freemem < kcage_throttlefree + btop(size) && 3069 curthread != kcage_cageout_thread) { 3070 /* 3071 * Set a "reserve" of kcage_throttlefree pages for 3072 * PG_PANIC and cageout thread allocations. 3073 * 3074 * Everybody else has to serialize in 3075 * page_create_get_something() to get a cage page, so 3076 * that we don't deadlock cageout! 3077 */ 3078 return (NULL); 3079 } 3080 } else { 3081 flags &= ~PG_NORELOC; 3082 flags |= PGI_NOCAGE; 3083 } 3084 3085 /* LINTED */ 3086 MTYPE_INIT(mtype, vp, vaddr, flags, size); 3087 3088 /* 3089 * Convert size to page size code. 
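 *
 * Hedged example (the szc numbering is platform dependent): with an 8K
 * base page, a 64K request would map to szc 1 and a 4M request to szc 3,
 * while a size that is not a supported page size makes page_szc() return
 * (uchar_t)-1 and trips the panic below.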
3090 */ 3091 if ((szc = page_szc(size)) == (uchar_t)-1) 3092 panic("page_get_freelist: illegal page size request"); 3093 ASSERT(szc < mmu_page_sizes); 3094 3095 VM_STAT_ADD(vmm_vmstats.pgf_alloc[szc]); 3096 3097 /* LINTED */ 3098 AS_2_BIN(as, seg, vp, vaddr, bin); 3099 3100 /* bin is for base pagesize color - convert if larger pagesize. */ 3101 if (szc) 3102 bin = page_convert_color(0, szc, bin); 3103 3104 /* 3105 * Try to get a local page first, but try remote if we can't 3106 * get a page of the right color. 3107 */ 3108 pgretry: 3109 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL); 3110 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { 3111 pp = page_get_func(mnode, bin, mtype, szc, flags); 3112 if (pp != NULL) { 3113 VM_STAT_ADD(vmm_vmstats.pgf_allocok[szc]); 3114 DTRACE_PROBE4(page__get, 3115 lgrp_t *, lgrp, 3116 int, mnode, 3117 ulong_t, bin, 3118 uint_t, flags); 3119 return (pp); 3120 } 3121 } 3122 ASSERT(pp == NULL); 3123 3124 /* 3125 * For PAGESIZE requests without PGI_PGCPSZC0 set, check the cachelist 3126 * before checking remote free lists. The caller is expected to call 3127 * page_get_cachelist, which will check local cache lists and remote free lists. 3128 */ 3129 if (szc == 0 && ((flags & PGI_PGCPSZC0) == 0)) { 3130 VM_STAT_ADD(vmm_vmstats.pgf_allocdeferred); 3131 return (NULL); 3132 } 3133 3134 ASSERT(szc > 0 || (flags & PGI_PGCPSZC0)); 3135 3136 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1); 3137 3138 /* 3139 * Try to get a non-local freelist page. 3140 */ 3141 LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie); 3142 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { 3143 pp = page_get_func(mnode, bin, mtype, szc, flags); 3144 if (pp != NULL) { 3145 DTRACE_PROBE4(page__get, 3146 lgrp_t *, lgrp, 3147 int, mnode, 3148 ulong_t, bin, 3149 uint_t, flags); 3150 VM_STAT_ADD(vmm_vmstats.pgf_allocokrem[szc]); 3151 return (pp); 3152 } 3153 } 3154 3155 ASSERT(pp == NULL); 3156 3157 /* 3158 * When the cage is off, chances are page_get_contig_pages() will fail 3159 * to lock a large page chunk, so in that case it is not called by 3160 * default. This can be changed via /etc/system. 3161 * 3162 * page_get_contig_pages() is also called to acquire a base pagesize 3163 * page for page_create_get_something(). 3164 */ 3165 if (!(flags & PG_NORELOC) && (pg_contig_disable == 0) && 3166 (kcage_on || pg_lpgcreate_nocage || szc == 0) && 3167 (page_get_func != page_get_contig_pages)) { 3168 3169 VM_STAT_ADD(vmm_vmstats.pgf_allocretry[szc]); 3170 page_get_func = page_get_contig_pages; 3171 goto pgretry; 3172 } 3173 3174 if (pgcplimitsearch && page_get_func == page_get_contig_pages) 3175 SETPGCPFAILCNT(szc); 3176 3177 VM_STAT_ADD(vmm_vmstats.pgf_allocfailed[szc]); 3178 return (NULL); 3179 } 3180 3181 /* 3182 * Find the `best' page on the cachelist for this (vp,off) (as,vaddr) pair. 3183 * 3184 * Does its own locking. 3185 * If PG_MATCH_COLOR is set, then NULL will be returned if there are no 3186 * pages of the proper color even if there are pages of a different color. 3187 * Otherwise, scan the bins for ones with pages. For each bin with pages, 3188 * try to lock one of them. If no page can be locked, try the 3189 * next bin. Return NULL if a page can not be found and locked. 3190 * 3191 * Finds a page, tries to lock it, then removes it.
3192 */ 3193 3194 /*ARGSUSED*/ 3195 page_t * 3196 page_get_cachelist(struct vnode *vp, u_offset_t off, struct seg *seg, 3197 caddr_t vaddr, uint_t flags, struct lgrp *lgrp) 3198 { 3199 page_t *pp; 3200 struct as *as = seg->s_as; 3201 ulong_t bin; 3202 /*LINTED*/ 3203 int mnode; 3204 int mtype; 3205 lgrp_mnode_cookie_t lgrp_cookie; 3206 3207 /* 3208 * If we aren't passed a specific lgroup, or pasased a freed lgrp 3209 * assume we wish to allocate near to the current thread's home. 3210 */ 3211 if (!LGRP_EXISTS(lgrp)) 3212 lgrp = lgrp_home_lgrp(); 3213 3214 if (!kcage_on) { 3215 flags &= ~PG_NORELOC; 3216 flags |= PGI_NOCAGE; 3217 } 3218 3219 if ((flags & (PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == PG_NORELOC && 3220 kcage_freemem <= kcage_throttlefree) { 3221 /* 3222 * Reserve kcage_throttlefree pages for critical kernel 3223 * threads. 3224 * 3225 * Everybody else has to go to page_create_get_something() 3226 * to get a cage page, so we don't deadlock cageout. 3227 */ 3228 return (NULL); 3229 } 3230 3231 /* LINTED */ 3232 AS_2_BIN(as, seg, vp, vaddr, bin); 3233 3234 ASSERT(bin <= page_colors_mask); 3235 3236 /* LINTED */ 3237 MTYPE_INIT(mtype, vp, vaddr, flags, MMU_PAGESIZE); 3238 3239 VM_STAT_ADD(vmm_vmstats.pgc_alloc); 3240 3241 /* 3242 * Try local cachelists first 3243 */ 3244 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL); 3245 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { 3246 pp = page_get_mnode_cachelist(bin, flags, mnode, mtype); 3247 if (pp != NULL) { 3248 VM_STAT_ADD(vmm_vmstats.pgc_allocok); 3249 DTRACE_PROBE4(page__get, 3250 lgrp_t *, lgrp, 3251 int, mnode, 3252 ulong_t, bin, 3253 uint_t, flags); 3254 return (pp); 3255 } 3256 } 3257 3258 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1); 3259 3260 /* 3261 * Try freelists/cachelists that are farther away 3262 * This is our only chance to allocate remote pages for PAGESIZE 3263 * requests. 3264 */ 3265 LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie); 3266 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { 3267 pp = page_get_mnode_freelist(mnode, bin, mtype, 3268 0, flags); 3269 if (pp != NULL) { 3270 VM_STAT_ADD(vmm_vmstats.pgc_allocokdeferred); 3271 DTRACE_PROBE4(page__get, 3272 lgrp_t *, lgrp, 3273 int, mnode, 3274 ulong_t, bin, 3275 uint_t, flags); 3276 return (pp); 3277 } 3278 pp = page_get_mnode_cachelist(bin, flags, mnode, mtype); 3279 if (pp != NULL) { 3280 VM_STAT_ADD(vmm_vmstats.pgc_allocokrem); 3281 DTRACE_PROBE4(page__get, 3282 lgrp_t *, lgrp, 3283 int, mnode, 3284 ulong_t, bin, 3285 uint_t, flags); 3286 return (pp); 3287 } 3288 } 3289 3290 VM_STAT_ADD(vmm_vmstats.pgc_allocfailed); 3291 return (NULL); 3292 } 3293 3294 page_t * 3295 page_get_mnode_cachelist(uint_t bin, uint_t flags, int mnode, int mtype) 3296 { 3297 kmutex_t *pcm; 3298 int i; 3299 page_t *pp; 3300 page_t *first_pp; 3301 uint_t bin_marker; 3302 int nwaybins, nwaycnt; 3303 int cpucolors; 3304 3305 VM_STAT_ADD(vmm_vmstats.pgmc_alloc); 3306 3307 /* LINTED */ 3308 MTYPE_START(mnode, mtype, flags); 3309 if (mtype < 0) { /* mnode does not have memory in mtype range */ 3310 VM_STAT_ADD(vmm_vmstats.pgmc_allocempty); 3311 return (NULL); 3312 } 3313 3314 nwaybins = 0; 3315 cpucolors = cpu_page_colors; 3316 /* 3317 * adjust cpucolors to possibly check additional 'equivalent' bins 3318 * to try to minimize fragmentation of large pages by delaying calls 3319 * to page_freelist_fill. 
3320 */ 3321 if (colorequiv > 1) { 3322 int equivcolors = page_colors / colorequiv; 3323 3324 if (equivcolors && (cpucolors == 0 || equivcolors < cpucolors)) 3325 cpucolors = equivcolors; 3326 } 3327 3328 /* 3329 * Only hold one cachelist lock at a time, that way we 3330 * can start anywhere and not have to worry about lock 3331 * ordering. 3332 */ 3333 3334 big_try_again: 3335 nwaycnt = 0; 3336 for (i = 0; i <= page_colors; i++) { 3337 if (PAGE_CACHELISTS(mnode, bin, mtype)) { 3338 pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST); 3339 mutex_enter(pcm); 3340 pp = PAGE_CACHELISTS(mnode, bin, mtype); 3341 if (pp != NULL) { 3342 first_pp = pp; 3343 ASSERT(pp->p_vnode); 3344 ASSERT(PP_ISAGED(pp) == 0); 3345 ASSERT(pp->p_szc == 0); 3346 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode); 3347 while (!page_trylock(pp, SE_EXCL)) { 3348 pp = pp->p_next; 3349 ASSERT(pp->p_szc == 0); 3350 if (pp == first_pp) { 3351 /* 3352 * We have searched the 3353 * complete list! 3354 * And all of them (might 3355 * only be one) are locked. 3356 * This can happen since 3357 * these pages can also be 3358 * found via the hash list. 3359 * When found via the hash 3360 * list, they are locked 3361 * first, then removed. 3362 * We give up to let the 3363 * other thread run. 3364 */ 3365 pp = NULL; 3366 break; 3367 } 3368 ASSERT(pp->p_vnode); 3369 ASSERT(PP_ISFREE(pp)); 3370 ASSERT(PP_ISAGED(pp) == 0); 3371 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == 3372 mnode); 3373 } 3374 3375 if (pp) { 3376 page_t **ppp; 3377 /* 3378 * Found and locked a page. 3379 * Pull it off the list. 3380 */ 3381 ASSERT(mtype == PP_2_MTYPE(pp)); 3382 ppp = &PAGE_CACHELISTS(mnode, bin, 3383 mtype); 3384 page_sub(ppp, pp); 3385 /* 3386 * Subtract counters before releasing 3387 * pcm mutex to avoid a race with 3388 * page_freelist_coalesce and 3389 * page_freelist_fill. 3390 */ 3391 page_ctr_sub(mnode, mtype, pp, 3392 PG_CACHE_LIST); 3393 mutex_exit(pcm); 3394 ASSERT(pp->p_vnode); 3395 ASSERT(PP_ISAGED(pp) == 0); 3396 #if defined(__sparc) 3397 ASSERT(!kcage_on || 3398 (flags & PG_NORELOC) == 0 || 3399 PP_ISNORELOC(pp)); 3400 if (PP_ISNORELOC(pp)) { 3401 kcage_freemem_sub(1); 3402 } 3403 #endif 3404 VM_STAT_ADD(vmm_vmstats. 3405 pgmc_allocok); 3406 return (pp); 3407 } 3408 } 3409 mutex_exit(pcm); 3410 } 3411 3412 /* 3413 * Wow! The initial bin is empty or no page in the bin could 3414 * be locked. 3415 * 3416 * If specific color is needed, check if page color may be in 3417 * other bins. 
3418 */ 3419 if ((flags & PG_MATCH_COLOR) && (cpucolors != 0)) { 3420 if (!nwaybins) { 3421 if (cpucolors < 0) { 3422 cpucolors = CPUSETSIZE() / MMU_PAGESIZE; 3423 ASSERT(cpucolors > 0); 3424 nwaybins = page_colors / cpucolors; 3425 if (nwaybins < 2) 3426 cpucolors = 0; 3427 } else { 3428 nwaybins = page_colors / cpucolors; 3429 ASSERT(nwaybins > 1); 3430 } 3431 } 3432 3433 if (++nwaycnt >= nwaybins) { 3434 break; 3435 } 3436 bin = (bin + (page_colors / nwaybins)) & 3437 page_colors_mask; 3438 continue; 3439 } 3440 3441 if (i == 0) { 3442 bin = (bin + BIN_STEP) & page_colors_mask; 3443 bin_marker = bin; 3444 } else { 3445 bin = (bin + vac_colors) & page_colors_mask; 3446 if (bin == bin_marker) { 3447 bin = (bin + 1) & page_colors_mask; 3448 bin_marker = bin; 3449 } 3450 } 3451 } 3452 3453 MTYPE_NEXT(mnode, mtype, flags); 3454 if (mtype >= 0) 3455 goto big_try_again; 3456 3457 VM_STAT_ADD(vmm_vmstats.pgmc_allocfailed); 3458 return (NULL); 3459 } 3460 3461 #ifdef DEBUG 3462 #define REPL_PAGE_STATS 3463 #endif /* DEBUG */ 3464 3465 #ifdef REPL_PAGE_STATS 3466 struct repl_page_stats { 3467 uint_t ngets; 3468 uint_t ngets_noreloc; 3469 uint_t npgr_noreloc; 3470 uint_t nnopage_first; 3471 uint_t nnopage; 3472 uint_t nhashout; 3473 uint_t nnofree; 3474 uint_t nnext_pp; 3475 } repl_page_stats; 3476 #define REPL_STAT_INCR(v) atomic_add_32(&repl_page_stats.v, 1) 3477 #else /* REPL_PAGE_STATS */ 3478 #define REPL_STAT_INCR(v) 3479 #endif /* REPL_PAGE_STATS */ 3480 3481 int pgrppgcp; 3482 3483 /* 3484 * The freemem accounting must be done by the caller. 3485 * First we try to get a replacement page of the same size as like_pp, 3486 * if that is not possible, then we just get a set of discontiguous 3487 * PAGESIZE pages. 3488 */ 3489 page_t * 3490 page_get_replacement_page(page_t *orig_like_pp, struct lgrp *lgrp_target, 3491 uint_t pgrflags) 3492 { 3493 page_t *like_pp; 3494 page_t *pp, *pplist; 3495 page_t *pl = NULL; 3496 ulong_t bin; 3497 int mnode, page_mnode; 3498 int szc; 3499 spgcnt_t npgs, pg_cnt; 3500 pfn_t pfnum; 3501 int mtype; 3502 int flags = 0; 3503 lgrp_mnode_cookie_t lgrp_cookie; 3504 lgrp_t *lgrp; 3505 3506 REPL_STAT_INCR(ngets); 3507 like_pp = orig_like_pp; 3508 ASSERT(PAGE_EXCL(like_pp)); 3509 3510 szc = like_pp->p_szc; 3511 npgs = page_get_pagecnt(szc); 3512 /* 3513 * Now we reset like_pp to the base page_t. 3514 * That way, we won't walk past the end of this 'szc' page. 3515 */ 3516 pfnum = PFN_BASE(like_pp->p_pagenum, szc); 3517 like_pp = page_numtopp_nolock(pfnum); 3518 ASSERT(like_pp->p_szc == szc); 3519 3520 if (PP_ISNORELOC(like_pp)) { 3521 ASSERT(kcage_on); 3522 REPL_STAT_INCR(ngets_noreloc); 3523 flags = PGI_RELOCONLY; 3524 } else if (pgrflags & PGR_NORELOC) { 3525 ASSERT(kcage_on); 3526 REPL_STAT_INCR(npgr_noreloc); 3527 flags = PG_NORELOC; 3528 } 3529 3530 /* 3531 * Kernel pages must always be replaced with the same size 3532 * pages, since we cannot properly handle demotion of kernel 3533 * pages. 3534 */ 3535 if (like_pp->p_vnode == &kvp) 3536 pgrflags |= PGR_SAMESZC; 3537 3538 /* LINTED */ 3539 MTYPE_PGR_INIT(mtype, flags, like_pp, page_mnode, npgs); 3540 3541 while (npgs) { 3542 pplist = NULL; 3543 for (;;) { 3544 pg_cnt = page_get_pagecnt(szc); 3545 bin = PP_2_BIN(like_pp); 3546 ASSERT(like_pp->p_szc == orig_like_pp->p_szc); 3547 ASSERT(pg_cnt <= npgs); 3548 3549 /* 3550 * If an lgroup was specified, try to get the 3551 * page from that lgroup. 
3552 * NOTE: Must be careful with code below because 3553 * lgroup may disappear and reappear since there 3554 * is no locking for lgroup here. 3555 */ 3556 if (LGRP_EXISTS(lgrp_target)) { 3557 /* 3558 * Keep local variable for lgroup separate 3559 * from lgroup argument since this code should 3560 * only be exercised when lgroup argument 3561 * exists.... 3562 */ 3563 lgrp = lgrp_target; 3564 3565 /* Try the lgroup's freelists first */ 3566 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, 3567 LGRP_SRCH_LOCAL); 3568 while ((pplist == NULL) && 3569 (mnode = lgrp_memnode_choose(&lgrp_cookie)) 3570 != -1) { 3571 pplist = page_get_mnode_freelist( 3572 mnode, bin, mtype, szc, 3573 flags); 3574 } 3575 3576 /* 3577 * Now try it's cachelists if this is a 3578 * small page. Don't need to do it for 3579 * larger ones since page_freelist_coalesce() 3580 * already failed. 3581 */ 3582 if (pplist != NULL || szc != 0) 3583 break; 3584 3585 /* Now try it's cachelists */ 3586 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, 3587 LGRP_SRCH_LOCAL); 3588 3589 while ((pplist == NULL) && 3590 (mnode = lgrp_memnode_choose(&lgrp_cookie)) 3591 != -1) { 3592 pplist = page_get_mnode_cachelist( 3593 bin, flags, mnode, mtype); 3594 } 3595 if (pplist != NULL) { 3596 page_hashout(pplist, NULL); 3597 PP_SETAGED(pplist); 3598 REPL_STAT_INCR(nhashout); 3599 break; 3600 } 3601 /* Done looking in this lgroup. Bail out. */ 3602 break; 3603 } 3604 3605 /* 3606 * No lgroup was specified (or lgroup was removed by 3607 * DR, so just try to get the page as close to 3608 * like_pp's mnode as possible. 3609 * First try the local freelist... 3610 */ 3611 mnode = PP_2_MEM_NODE(like_pp); 3612 pplist = page_get_mnode_freelist(mnode, bin, 3613 mtype, szc, flags); 3614 if (pplist != NULL) 3615 break; 3616 3617 REPL_STAT_INCR(nnofree); 3618 3619 /* 3620 * ...then the local cachelist. Don't need to do it for 3621 * larger pages cause page_freelist_coalesce() already 3622 * failed there anyway. 3623 */ 3624 if (szc == 0) { 3625 pplist = page_get_mnode_cachelist(bin, flags, 3626 mnode, mtype); 3627 if (pplist != NULL) { 3628 page_hashout(pplist, NULL); 3629 PP_SETAGED(pplist); 3630 REPL_STAT_INCR(nhashout); 3631 break; 3632 } 3633 } 3634 3635 /* Now try remote freelists */ 3636 page_mnode = mnode; 3637 lgrp = 3638 lgrp_hand_to_lgrp(MEM_NODE_2_LGRPHAND(page_mnode)); 3639 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, 3640 LGRP_SRCH_HIER); 3641 while (pplist == NULL && 3642 (mnode = lgrp_memnode_choose(&lgrp_cookie)) 3643 != -1) { 3644 /* 3645 * Skip local mnode. 3646 */ 3647 if ((mnode == page_mnode) || 3648 (mem_node_config[mnode].exists == 0)) 3649 continue; 3650 3651 pplist = page_get_mnode_freelist(mnode, 3652 bin, mtype, szc, flags); 3653 } 3654 3655 if (pplist != NULL) 3656 break; 3657 3658 3659 /* Now try remote cachelists */ 3660 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, 3661 LGRP_SRCH_HIER); 3662 while (pplist == NULL && szc == 0) { 3663 mnode = lgrp_memnode_choose(&lgrp_cookie); 3664 if (mnode == -1) 3665 break; 3666 /* 3667 * Skip local mnode. 3668 */ 3669 if ((mnode == page_mnode) || 3670 (mem_node_config[mnode].exists == 0)) 3671 continue; 3672 3673 pplist = page_get_mnode_cachelist(bin, 3674 flags, mnode, mtype); 3675 3676 if (pplist != NULL) { 3677 page_hashout(pplist, NULL); 3678 PP_SETAGED(pplist); 3679 REPL_STAT_INCR(nhashout); 3680 break; 3681 } 3682 } 3683 3684 /* 3685 * Break out of while loop under the following cases: 3686 * - If we successfully got a page. 
3687 * - If pgrflags specified only returning a specific 3688 * page size and we could not find that page size. 3689 * - If we could not satisfy the request with PAGESIZE 3690 * or larger pages. 3691 */ 3692 if (pplist != NULL || szc == 0) 3693 break; 3694 3695 if ((pgrflags & PGR_SAMESZC) || pgrppgcp) { 3696 /* try to find contig page */ 3697 3698 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, 3699 LGRP_SRCH_HIER); 3700 3701 while ((pplist == NULL) && 3702 (mnode = 3703 lgrp_memnode_choose(&lgrp_cookie)) 3704 != -1) { 3705 pplist = page_get_contig_pages( 3706 mnode, bin, mtype, szc, 3707 flags | PGI_PGCPHIPRI); 3708 } 3709 break; 3710 } 3711 3712 /* 3713 * The correct thing to do here is try the next 3714 * page size down using szc--. Due to a bug 3715 * with the processing of HAT_RELOAD_SHARE 3716 * where the sfmmu_ttecnt arrays of all 3717 * hats sharing an ISM segment don't get updated, 3718 * using intermediate size pages for relocation 3719 * can lead to continuous page faults. 3720 */ 3721 szc = 0; 3722 } 3723 3724 if (pplist != NULL) { 3725 DTRACE_PROBE4(page__get, 3726 lgrp_t *, lgrp, 3727 int, mnode, 3728 ulong_t, bin, 3729 uint_t, flags); 3730 3731 while (pplist != NULL && pg_cnt--) { 3732 ASSERT(pplist != NULL); 3733 pp = pplist; 3734 page_sub(&pplist, pp); 3735 PP_CLRFREE(pp); 3736 PP_CLRAGED(pp); 3737 page_list_concat(&pl, &pp); 3738 npgs--; 3739 like_pp = like_pp + 1; 3740 REPL_STAT_INCR(nnext_pp); 3741 } 3742 ASSERT(pg_cnt == 0); 3743 } else { 3744 break; 3745 } 3746 } 3747 3748 if (npgs) { 3749 /* 3750 * We were unable to allocate the necessary number 3751 * of pages. 3752 * We need to free up any pl. 3753 */ 3754 REPL_STAT_INCR(nnopage); 3755 page_free_replacement_page(pl); 3756 return (NULL); 3757 } else { 3758 return (pl); 3759 } 3760 } 3761 3762 /* 3763 * demote a free large page to it's constituent pages 3764 */ 3765 void 3766 page_demote_free_pages(page_t *pp) 3767 { 3768 3769 int mnode; 3770 3771 ASSERT(pp != NULL); 3772 ASSERT(PAGE_LOCKED(pp)); 3773 ASSERT(PP_ISFREE(pp)); 3774 ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes); 3775 3776 mnode = PP_2_MEM_NODE(pp); 3777 page_freelist_lock(mnode); 3778 if (pp->p_szc != 0) { 3779 (void) page_demote(mnode, PFN_BASE(pp->p_pagenum, 3780 pp->p_szc), pp->p_szc, 0, PC_NO_COLOR, PC_FREE); 3781 } 3782 page_freelist_unlock(mnode); 3783 ASSERT(pp->p_szc == 0); 3784 } 3785
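/*
 * Hedged, illustrative sketch (not compiled) of how a caller typically
 * chains the allocation entry points above; the fallback routine named
 * here is only referenced, not defined, in this file:
 *
 *	pp = page_get_freelist(vp, off, seg, vaddr, MMU_PAGESIZE, flags, lgrp);
 *	if (pp == NULL)
 *		pp = page_get_cachelist(vp, off, seg, vaddr, flags, lgrp);
 *	if (pp == NULL)
 *		... fall back to page_create_get_something() ...
 *
 * On success the page comes back exclusively locked with freelist or
 * cachelist accounting already done; the caller clears the PP_ISFREE and
 * PP_ISAGED state as appropriate (page_get_replacement_page() above shows
 * the pattern) before using or freeing it.
 */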