/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License"). You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * Portions of this source code were derived from Berkeley 4.3 BSD
 * under license from the Regents of the University of California.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * This file contains common functions to access and manage the page lists.
 * Many of these routines originated from platform dependent modules
 * (sun4/vm/vm_dep.c, i86pc/vm/vm_machdep.c) and were modified to function
 * in a platform independent manner.
 *
 * vm/vm_dep.h provides for platform specific support.
 */

#include <sys/types.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/systm.h>
#include <sys/atomic.h>
#include <sys/sysmacros.h>
#include <vm/as.h>
#include <vm/page.h>
#include <vm/seg_kmem.h>
#include <vm/seg_vn.h>
#include <sys/memnode.h>
#include <vm/vm_dep.h>
#include <sys/lgrp.h>
#include <sys/mem_config.h>
#include <sys/callb.h>
#include <sys/mem_cage.h>
#include <sys/sdt.h>

extern uint_t	vac_colors;

#define	MAX_PRAGMA_ALIGN	128

/* vm_cpu_data0 for the boot cpu before kmem is initialized */

#if L2CACHE_ALIGN_MAX <= MAX_PRAGMA_ALIGN
#pragma align	L2CACHE_ALIGN_MAX(vm_cpu_data0)
#else
#pragma align	MAX_PRAGMA_ALIGN(vm_cpu_data0)
#endif
char		vm_cpu_data0[VM_CPU_DATA_PADSIZE];

/*
 * number of page colors equivalent to requested color in page_get routines.
 * If set, keeps large pages intact longer and keeps MPO allocation
 * from the local mnode in favor of acquiring the 'correct' page color from
 * a demoted large page or from a remote mnode.
 */
int	colorequiv;

/*
 * if set, specifies the percentage of large pages that are free from within
 * a large page region before attempting to lock those pages for
 * page_get_contig_pages processing.
 *
 * Should be turned on when kpr is available, since page_trylock_contig_pages
 * can then be more selective.
 */

int	ptcpthreshold;

/*
 * Limit page get contig page search based on failure cnts in pgcpfailcnt[].
 * Enabled by default via pgcplimitsearch.
 *
 * pgcpfailcnt[] is bounded by PGCPFAILMAX (>= 1/2 of installed
 * memory). When reached, pgcpfailcnt[] is reset to 1/2 of this upper
 * bound. This upper bound range guarantees:
 *	- all large page 'slots' will be searched over time
 *	- a minimum of one large page candidate is considered on each pgcp call
 *	- the count doesn't wrap around to 0
 */
pgcnt_t	pgcpfailcnt[MMU_PAGE_SIZES];
int	pgcplimitsearch = 1;

#define	PGCPFAILMAX		(1 << (highbit(physinstalled) - 1))
#define	SETPGCPFAILCNT(szc)					\
	if (++pgcpfailcnt[szc] >= PGCPFAILMAX)			\
		pgcpfailcnt[szc] = PGCPFAILMAX / 2;

#ifdef VM_STATS
struct vmm_vmstats_str	vmm_vmstats;

#endif /* VM_STATS */

#if defined(__sparc)
#define	LPGCREATE	0
#else
/* enable page_get_contig_pages */
#define	LPGCREATE	1
#endif

int pg_contig_disable;
int pg_lpgcreate_nocage = LPGCREATE;

/*
 * page_freelist_fill pfn flag to signify no hi pfn requirement.
 */
#define	PFNNULL		0

/* Flags involved in promotion and demotion routines */
#define	PC_FREE		0x1	/* put page on freelist */
#define	PC_ALLOC	0x2	/* return page for allocation */

/*
 * Flag for page_demote to be used with PC_FREE to denote that we don't care
 * what the color is as the color parameter to the function is ignored.
 */
#define	PC_NO_COLOR	(-1)

/*
 * page counters candidates info
 * See page_ctrs_cands comment below for more details.
 * fields are as follows:
 *	pcc_pages_free:		# pages which freelist coalesce can create
 *	pcc_color_free_len:	number of elements in pcc_color_free array
 *	pcc_color_free:		pointer to page free counts per color
 */
typedef struct pcc_info {
	pgcnt_t	pcc_pages_free;
	int	pcc_color_free_len;
	pgcnt_t	*pcc_color_free;
} pcc_info_t;

/*
 * On big machines it can take a long time to check page_counters
 * arrays. page_ctrs_cands is a summary array whose elements are a dynamically
 * updated sum of all elements of the corresponding page_counters arrays.
 * page_freelist_coalesce() searches page_counters only if an appropriate
 * element of page_ctrs_cands array is greater than 0.
 *
 * An extra dimension is used for page_ctrs_cands to spread the elements
 * over a few e$ cache lines to avoid serialization during the array
 * updates.
 */
#pragma	align 64(page_ctrs_cands)

static pcc_info_t *page_ctrs_cands[NPC_MUTEX][MMU_PAGE_SIZES];

/*
 * Return in val the total number of free pages which can be created
 * for the given mnode (m) and region size (r)
 */
#define	PGCTRS_CANDS_GETVALUE(m, r, val) {				\
	int i;								\
	val = 0;							\
	for (i = 0; i < NPC_MUTEX; i++) {				\
		val += page_ctrs_cands[i][(r)][(m)].pcc_pages_free;	\
	}								\
}

/*
 * Return in val the total number of free pages which can be created
 * for the given mnode (m), region size (r), and color (c)
 */
#define	PGCTRS_CANDS_GETVALUECOLOR(m, r, c, val) {			\
	int i;								\
	val = 0;							\
	ASSERT((c) < page_ctrs_cands[0][(r)][(m)].pcc_color_free_len);	\
	for (i = 0; i < NPC_MUTEX; i++) {				\
		val += page_ctrs_cands[i][(r)][(m)].pcc_color_free[(c)]; \
	}								\
}

/*
 * We can only allow a single thread to update a counter within the physical
 * range of the largest supported page size. That is the finest granularity
 * possible since the counter values are dependent on each other
 * as you move across region sizes. PP_CTR_LOCK_INDX is used to determine the
 * ctr_mutex lock index for a particular physical range.
 */
static kmutex_t	*ctr_mutex[NPC_MUTEX];

#define	PP_CTR_LOCK_INDX(pp)						\
	(((pp)->p_pagenum >>						\
	    (PAGE_BSZS_SHIFT(mmu_page_sizes - 1))) & (NPC_MUTEX - 1))

/*
 * Local functions prototypes.
 */

void page_ctr_add(int, int, page_t *, int);
void page_ctr_add_internal(int, int, page_t *, int);
void page_ctr_sub(int, int, page_t *, int);
uint_t page_convert_color(uchar_t, uchar_t, uint_t);
void page_freelist_lock(int);
void page_freelist_unlock(int);
page_t *page_promote(int, pfn_t, uchar_t, int);
page_t *page_demote(int, pfn_t, uchar_t, uchar_t, int, int);
page_t *page_freelist_fill(uchar_t, int, int, int, pfn_t);
page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int);
static int page_trylock_cons(page_t *pp, se_t se);

#define	PNUM_SIZE(szc)							\
	(hw_page_array[(szc)].hp_size >> hw_page_array[0].hp_shift)
#define	PNUM_SHIFT(szc)							\
	(hw_page_array[(szc)].hp_shift - hw_page_array[0].hp_shift)

/*
 * The page_counters array below is used to keep track of free contiguous
 * physical memory. A hw_page_map_t will be allocated per mnode per szc.
 * This contains an array of counters, the size of the array, a shift value
 * used to convert a pagenum into a counter array index or vice versa, as
 * well as a cache of the last successful index to be promoted to a larger
 * page size. As an optimization, we keep track of the last successful index
 * to be promoted per page color for the given size region, and this is
 * allocated dynamically based upon the number of colors for a given
 * region size.
 *
 * Conceptually, the page counters are represented as:
 *
 *	page_counters[region_size][mnode]
 *
 *	region_size:	size code of a candidate larger page made up
 *			of contiguous free smaller pages.
 *
 *	page_counters[region_size][mnode].hpm_counters[index]:
 *		represents how many (region_size - 1) pages either
 *		exist or can be created within the given index range.
 *
 * Let's look at a sparc example:
 *	If we want to create a free 512k page, we look at region_size 2
 *	for the mnode we want. We calculate the index and look at a specific
 *	hpm_counters location. If we see 8 (FULL_REGION_CNT on sparc) at
 *	this location, it means that 8 64k pages either exist or can be created
 *	from 8K pages in order to make a single free 512k page at the given
 *	index. Note that when a region is full, it will contribute to the
 *	counts in the region above it. Thus we will not know what page
 *	size the free pages will be which can be promoted to this new free
 *	page unless we look at all regions below the current region.
 */

/*
 * Note: hpmctr_t is defined in platform vm_dep.h
 * hw_page_map_t contains all the information needed for the page_counters
 * logic. The fields are as follows:
 *
 *	hpm_counters:	dynamically allocated array to hold counter data
 *	hpm_entries:	entries in hpm_counters
 *	hpm_shift:	shift for pnum/array index conv
 *	hpm_base:	PFN mapped to counter index 0
 *	hpm_color_current_len:	# of elements in hpm_color_current "array" below
 *	hpm_color_current:	last index in counter array for this color at
 *				which we successfully created a large page
 */
typedef struct hw_page_map {
	hpmctr_t	*hpm_counters;
	size_t		hpm_entries;
	int		hpm_shift;
	pfn_t		hpm_base;
	size_t		hpm_color_current_len;
	size_t		*hpm_color_current;
} hw_page_map_t;

/*
 * Element zero is not used, but is allocated for convenience.
 */
static hw_page_map_t *page_counters[MMU_PAGE_SIZES];

/*
 * The following macros are convenient ways to get access to the individual
 * elements of the page_counters arrays. They can be used on both
 * the left side and right side of equations.
 */
#define	PAGE_COUNTERS(mnode, rg_szc, idx)			\
	(page_counters[(rg_szc)][(mnode)].hpm_counters[(idx)])

#define	PAGE_COUNTERS_COUNTERS(mnode, rg_szc)			\
	(page_counters[(rg_szc)][(mnode)].hpm_counters)

#define	PAGE_COUNTERS_SHIFT(mnode, rg_szc)			\
	(page_counters[(rg_szc)][(mnode)].hpm_shift)

#define	PAGE_COUNTERS_ENTRIES(mnode, rg_szc)			\
	(page_counters[(rg_szc)][(mnode)].hpm_entries)

#define	PAGE_COUNTERS_BASE(mnode, rg_szc)			\
	(page_counters[(rg_szc)][(mnode)].hpm_base)

#define	PAGE_COUNTERS_CURRENT_COLOR_LEN(mnode, rg_szc)		\
	(page_counters[(rg_szc)][(mnode)].hpm_color_current_len)

#define	PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, rg_szc)	\
	(page_counters[(rg_szc)][(mnode)].hpm_color_current)

#define	PAGE_COUNTERS_CURRENT_COLOR(mnode, rg_szc, color)	\
	(page_counters[(rg_szc)][(mnode)].hpm_color_current[(color)])

#define	PNUM_TO_IDX(mnode, rg_szc, pnum)			\
	(((pnum) - PAGE_COUNTERS_BASE((mnode), (rg_szc))) >>	\
	    PAGE_COUNTERS_SHIFT((mnode), (rg_szc)))

#define	IDX_TO_PNUM(mnode, rg_szc, index)			\
	(PAGE_COUNTERS_BASE((mnode), (rg_szc)) +		\
	    ((index) << PAGE_COUNTERS_SHIFT((mnode), (rg_szc))))

/*
 * Protects the hpm_counters and hpm_color_current memory from changing while
 * looking at page counters information.
 * Grab the write lock to modify what these fields point at.
 * Grab the read lock to prevent any pointers from changing.
 * The write lock can not be held during memory allocation due to a possible
 * recursion deadlock with trying to grab the read lock while the
 * write lock is already held.
 */
krwlock_t page_ctrs_rwlock[MAX_MEM_NODES];


/*
 * initialize cpu_vm_data to point at cache aligned vm_cpu_data_t.
 */
void
cpu_vm_data_init(struct cpu *cp)
{
	if (cp == CPU0) {
		cp->cpu_vm_data = (void *)&vm_cpu_data0;
	} else {
		void	*kmptr;
		int	align;
		size_t	sz;

		align = (L2CACHE_ALIGN) ? L2CACHE_ALIGN : L2CACHE_ALIGN_MAX;
		sz = P2ROUNDUP(sizeof (vm_cpu_data_t), align) + align;
		kmptr = kmem_zalloc(sz, KM_SLEEP);
		cp->cpu_vm_data = (void *) P2ROUNDUP((uintptr_t)kmptr, align);
		((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr = kmptr;
		((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize = sz;
	}
}

/*
 * free cpu_vm_data
 */
void
cpu_vm_data_destroy(struct cpu *cp)
{
	if (cp->cpu_seqid && cp->cpu_vm_data) {
		ASSERT(cp != CPU0);
		kmem_free(((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr,
		    ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize);
	}
	cp->cpu_vm_data = NULL;
}


/*
 * page size to page size code
 */
int
page_szc(size_t pagesize)
{
	int	i = 0;

	while (hw_page_array[i].hp_size) {
		if (pagesize == hw_page_array[i].hp_size)
			return (i);
		i++;
	}
	return (-1);
}

/*
 * page size to page size code with the restriction that it be a supported
 * user page size. If it's not a supported user page size, -1 will be returned.
 */
int
page_szc_user_filtered(size_t pagesize)
{
	int szc = page_szc(pagesize);
	if ((szc != -1) && (SZC_2_USERSZC(szc) != -1)) {
		return (szc);
	}
	return (-1);
}

/*
 * Return how many page sizes are available for the user to use. This is
 * what the hardware supports and not based upon how the OS implements the
 * support of different page sizes.
 */
uint_t
page_num_user_pagesizes(void)
{
	return (mmu_exported_page_sizes);
}

uint_t
page_num_pagesizes(void)
{
	return (mmu_page_sizes);
}

/*
 * returns the count of the number of base pagesize pages associated with szc
 */
pgcnt_t
page_get_pagecnt(uint_t szc)
{
	if (szc >= mmu_page_sizes)
		panic("page_get_pagecnt: out of range %d", szc);
	return (hw_page_array[szc].hp_pgcnt);
}

size_t
page_get_pagesize(uint_t szc)
{
	if (szc >= mmu_page_sizes)
		panic("page_get_pagesize: out of range %d", szc);
	return (hw_page_array[szc].hp_size);
}

/*
 * Return the size of a page based upon the index passed in. An index of
 * zero refers to the smallest page size in the system, and as index increases
 * it refers to the next larger supported page size in the system.
 * Note that szc and userszc may not be the same due to unsupported szc's on
 * some systems.
 */
size_t
page_get_user_pagesize(uint_t userszc)
{
	uint_t szc = USERSZC_2_SZC(userszc);

	if (szc >= mmu_page_sizes)
		panic("page_get_user_pagesize: out of range %d", szc);
	return (hw_page_array[szc].hp_size);
}

uint_t
page_get_shift(uint_t szc)
{
	if (szc >= mmu_page_sizes)
		panic("page_get_shift: out of range %d", szc);
	return (hw_page_array[szc].hp_shift);
}

uint_t
page_get_pagecolors(uint_t szc)
{
	ASSERT(page_colors != 0);
	return (MAX(page_colors >> PAGE_BSZS_SHIFT(szc), 1));
}

/*
 * Called by startup().
 * Size up the per page size free list counters based on physmax
 * of each node and max_mem_nodes.
 */
size_t
page_ctrs_sz(void)
{
	int	r;		/* region size */
	int	mnode;
	uint_t	ctrs_sz = 0;
	int	i;
	pgcnt_t colors_per_szc[MMU_PAGE_SIZES];

	/*
	 * We need to determine how many page colors there are for each
	 * page size in order to allocate memory for any color specific
	 * arrays.
	 */
	colors_per_szc[0] = page_colors;
	for (i = 1; i < mmu_page_sizes; i++) {
		colors_per_szc[i] =
		    page_convert_color(0, i, page_colors - 1) + 1;
	}

	for (mnode = 0; mnode < max_mem_nodes; mnode++) {

		pgcnt_t r_pgcnt;
		pfn_t	r_base;
		pgcnt_t r_align;

		if (mem_node_config[mnode].exists == 0)
			continue;

		/*
		 * determine size needed for page counter arrays with
		 * base aligned to large page size.
		 */
		for (r = 1; r < mmu_page_sizes; r++) {
			/* add in space for hpm_counters */
			r_align = page_get_pagecnt(r);
			r_base = mem_node_config[mnode].physbase;
			r_base &= ~(r_align - 1);
			r_pgcnt = howmany(mem_node_config[mnode].physmax -
			    r_base + 1, r_align);
			/*
			 * Round up to always allocate on pointer sized
			 * boundaries.
			 */
			ctrs_sz += P2ROUNDUP((r_pgcnt * sizeof (hpmctr_t)),
			    sizeof (hpmctr_t *));

			/* add in space for hpm_color_current */
			ctrs_sz += (colors_per_szc[r] *
			    sizeof (size_t));
		}
	}

	for (r = 1; r < mmu_page_sizes; r++) {
		ctrs_sz += (max_mem_nodes * sizeof (hw_page_map_t));

		/* add in space for page_ctrs_cands */
		ctrs_sz += NPC_MUTEX * max_mem_nodes * (sizeof (pcc_info_t));
		ctrs_sz += NPC_MUTEX * max_mem_nodes * colors_per_szc[r] *
		    sizeof (pgcnt_t);
	}

	/* ctr_mutex */
	ctrs_sz += (max_mem_nodes * NPC_MUTEX * sizeof (kmutex_t));

	/* size for page list counts */
	PLCNT_SZ(ctrs_sz);

	/*
	 * add some slop for roundups. page_ctrs_alloc will roundup the start
	 * address of the counters to ecache_alignsize boundary for every
	 * memory node.
	 */
	return (ctrs_sz + max_mem_nodes * L2CACHE_ALIGN);
}

caddr_t
page_ctrs_alloc(caddr_t alloc_base)
{
	int	mnode;
	int	r;		/* region size */
	int	i;
	pgcnt_t colors_per_szc[MMU_PAGE_SIZES];

	/*
	 * We need to determine how many page colors there are for each
	 * page size in order to allocate memory for any color specific
	 * arrays.
	 */
	colors_per_szc[0] = page_colors;
	for (i = 1; i < mmu_page_sizes; i++) {
		colors_per_szc[i] =
		    page_convert_color(0, i, page_colors - 1) + 1;
	}

	for (r = 1; r < mmu_page_sizes; r++) {
		page_counters[r] = (hw_page_map_t *)alloc_base;
		alloc_base += (max_mem_nodes * sizeof (hw_page_map_t));
	}

	/* page_ctrs_cands */
	for (r = 1; r < mmu_page_sizes; r++) {
		for (i = 0; i < NPC_MUTEX; i++) {
			page_ctrs_cands[i][r] = (pcc_info_t *)alloc_base;
			alloc_base += max_mem_nodes * (sizeof (pcc_info_t));

		}
	}

	/* page_ctrs_cands pcc_color_free array */
	for (r = 1; r < mmu_page_sizes; r++) {
		for (i = 0; i < NPC_MUTEX; i++) {
			for (mnode = 0; mnode < max_mem_nodes; mnode++) {
				page_ctrs_cands[i][r][mnode].pcc_color_free_len
				    = colors_per_szc[r];
				page_ctrs_cands[i][r][mnode].pcc_color_free =
				    (pgcnt_t *)alloc_base;
				alloc_base += colors_per_szc[r] *
				    sizeof (pgcnt_t);
			}
		}
	}

	/* ctr_mutex */
	for (i = 0; i < NPC_MUTEX; i++) {
		ctr_mutex[i] = (kmutex_t *)alloc_base;
		alloc_base += (max_mem_nodes * sizeof (kmutex_t));
	}

	/* initialize page list counts */
	PLCNT_INIT(alloc_base);

	for (mnode = 0; mnode < max_mem_nodes; mnode++) {

		pgcnt_t r_pgcnt;
		pfn_t	r_base;
		pgcnt_t r_align;
		int	r_shift;

		if (mem_node_config[mnode].exists == 0)
			continue;

		for (r = 1; r < mmu_page_sizes; r++) {
			/*
			 * the page_counters base has to be aligned to the
			 * page count of page size code r otherwise the counts
			 * will cross large page boundaries.
			 */
			r_align = page_get_pagecnt(r);
			r_base = mem_node_config[mnode].physbase;
			/* base needs to be aligned - lower to aligned value */
			r_base &= ~(r_align - 1);
			r_pgcnt = howmany(mem_node_config[mnode].physmax -
			    r_base + 1, r_align);
			r_shift = PAGE_BSZS_SHIFT(r);

			PAGE_COUNTERS_SHIFT(mnode, r) = r_shift;
			PAGE_COUNTERS_ENTRIES(mnode, r) = r_pgcnt;
			PAGE_COUNTERS_BASE(mnode, r) = r_base;
			PAGE_COUNTERS_CURRENT_COLOR_LEN(mnode, r) =
			    colors_per_szc[r];
			PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r) =
			    (size_t *)alloc_base;
			alloc_base += (sizeof (size_t) * colors_per_szc[r]);
			for (i = 0; i < colors_per_szc[r]; i++) {
				PAGE_COUNTERS_CURRENT_COLOR(mnode, r, i) = i;
			}
			PAGE_COUNTERS_COUNTERS(mnode, r) =
			    (hpmctr_t *)alloc_base;
			/*
			 * Round up to make alloc_base always be aligned on
			 * a pointer boundary.
			 */
			alloc_base += P2ROUNDUP((sizeof (hpmctr_t) * r_pgcnt),
			    sizeof (hpmctr_t *));

			/*
			 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
			 * satisfy the identity requirement.
			 * We should be able to go from one to the other
			 * and get consistent values.
			 */
			ASSERT(PNUM_TO_IDX(mnode, r,
			    (IDX_TO_PNUM(mnode, r, 0))) == 0);
			ASSERT(IDX_TO_PNUM(mnode, r,
			    (PNUM_TO_IDX(mnode, r, r_base))) == r_base);
		}
		/*
		 * Roundup the start address of the page_counters to
		 * cache aligned boundary for every memory node.
		 * page_ctrs_sz() has added some slop for these roundups.
		 */
		alloc_base = (caddr_t)P2ROUNDUP((uintptr_t)alloc_base,
		    L2CACHE_ALIGN);
	}

	/* Initialize other page counter specific data structures. */
	for (mnode = 0; mnode < MAX_MEM_NODES; mnode++) {
		rw_init(&page_ctrs_rwlock[mnode], NULL, RW_DEFAULT, NULL);
	}

	return (alloc_base);
}

/*
 * Functions to adjust region counters for each size free list.
 * Caller is responsible for acquiring the ctr_mutex lock if necessary, and
 * thus these routines can be called during startup without locks.
 */
/* ARGSUSED */
void
page_ctr_add_internal(int mnode, int mtype, page_t *pp, int flags)
{
	ssize_t		r;	/* region size */
	ssize_t		idx;
	pfn_t		pfnum;
	int		lckidx;

	ASSERT(mnode == PP_2_MEM_NODE(pp));
	ASSERT(mtype == PP_2_MTYPE(pp));

	ASSERT(pp->p_szc < mmu_page_sizes);

	PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);

	/* no counter update needed for largest page size */
	if (pp->p_szc >= mmu_page_sizes - 1) {
		return;
	}

	r = pp->p_szc + 1;
	pfnum = pp->p_pagenum;
	lckidx = PP_CTR_LOCK_INDX(pp);

	/*
	 * Increment the count of free pages for the current
	 * region. Continue looping up in region size incrementing
	 * count if the preceding region is full.
	 */
	while (r < mmu_page_sizes) {
		idx = PNUM_TO_IDX(mnode, r, pfnum);

		ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
		ASSERT(PAGE_COUNTERS(mnode, r, idx) < FULL_REGION_CNT(r));

		if (++PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r))
			break;

		page_ctrs_cands[lckidx][r][mnode].pcc_pages_free++;
		page_ctrs_cands[lckidx][r][mnode].
		    pcc_color_free[PP_2_BIN_SZC(pp, r)]++;
		r++;
	}
}

void
page_ctr_add(int mnode, int mtype, page_t *pp, int flags)
{
	int		lckidx = PP_CTR_LOCK_INDX(pp);
	kmutex_t	*lock = &ctr_mutex[lckidx][mnode];

	mutex_enter(lock);
	page_ctr_add_internal(mnode, mtype, pp, flags);
	mutex_exit(lock);
}

void
page_ctr_sub(int mnode, int mtype, page_t *pp, int flags)
{
	int		lckidx;
	kmutex_t	*lock;
	ssize_t		r;	/* region size */
	ssize_t		idx;
	pfn_t		pfnum;

	ASSERT(mnode == PP_2_MEM_NODE(pp));
	ASSERT(mtype == PP_2_MTYPE(pp));

	ASSERT(pp->p_szc < mmu_page_sizes);

	PLCNT_DECR(pp, mnode, mtype, pp->p_szc, flags);

	/* no counter update needed for largest page size */
	if (pp->p_szc >= mmu_page_sizes - 1) {
		return;
	}

	r = pp->p_szc + 1;
	pfnum = pp->p_pagenum;
	lckidx = PP_CTR_LOCK_INDX(pp);
	lock = &ctr_mutex[lckidx][mnode];

	/*
	 * Decrement the count of free pages for the current
	 * region. Continue looping up in region size decrementing
	 * count if the preceding region was full.
	 */
	mutex_enter(lock);
	while (r < mmu_page_sizes) {
		idx = PNUM_TO_IDX(mnode, r, pfnum);

		ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
		ASSERT(PAGE_COUNTERS(mnode, r, idx) > 0);

		if (--PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r) - 1) {
			break;
		}
		ASSERT(page_ctrs_cands[lckidx][r][mnode].pcc_pages_free != 0);
		ASSERT(page_ctrs_cands[lckidx][r][mnode].
		    pcc_color_free[PP_2_BIN_SZC(pp, r)] != 0);

		page_ctrs_cands[lckidx][r][mnode].pcc_pages_free--;
		page_ctrs_cands[lckidx][r][mnode].
		    pcc_color_free[PP_2_BIN_SZC(pp, r)]--;
		r++;
	}
	mutex_exit(lock);
}

/*
 * Adjust page counters following a memory attach, since typically the
 * size of the array needs to change, and the PFN to counter index
 * mapping needs to change.
 */
uint_t
page_ctrs_adjust(int mnode)
{
	pgcnt_t npgs;
	int	r;		/* region size */
	int	i;
	size_t	pcsz, old_csz;
	hpmctr_t *new_ctr, *old_ctr;
	pfn_t	oldbase, newbase;
	size_t	old_npgs;
	hpmctr_t *ctr_cache[MMU_PAGE_SIZES];
	size_t	size_cache[MMU_PAGE_SIZES];
	size_t	*color_cache[MMU_PAGE_SIZES];
	size_t	*old_color_array;
	pgcnt_t	colors_per_szc[MMU_PAGE_SIZES];

	newbase = mem_node_config[mnode].physbase & ~PC_BASE_ALIGN_MASK;
	npgs = roundup(mem_node_config[mnode].physmax,
	    PC_BASE_ALIGN) - newbase;

	/*
	 * We need to determine how many page colors there are for each
	 * page size in order to allocate memory for any color specific
	 * arrays.
	 */
	colors_per_szc[0] = page_colors;
	for (r = 1; r < mmu_page_sizes; r++) {
		colors_per_szc[r] =
		    page_convert_color(0, r, page_colors - 1) + 1;
	}

	/*
	 * Preallocate all of the new hpm_counters arrays as we can't
	 * hold the page_ctrs_rwlock as a writer and allocate memory.
	 * If we can't allocate all of the arrays, undo our work so far
	 * and return failure.
	 */
	for (r = 1; r < mmu_page_sizes; r++) {
		pcsz = npgs >> PAGE_BSZS_SHIFT(r);

		ctr_cache[r] = kmem_zalloc(pcsz *
		    sizeof (hpmctr_t), KM_NOSLEEP);
		if (ctr_cache[r] == NULL) {
			while (--r >= 1) {
				kmem_free(ctr_cache[r],
				    size_cache[r] * sizeof (hpmctr_t));
			}
			return (ENOMEM);
		}
		size_cache[r] = pcsz;
	}
	/*
	 * Preallocate all of the new color current arrays as we can't
	 * hold the page_ctrs_rwlock as a writer and allocate memory.
	 * If we can't allocate all of the arrays, undo our work so far
	 * and return failure.
	 */
	for (r = 1; r < mmu_page_sizes; r++) {
		color_cache[r] = kmem_zalloc(sizeof (size_t) *
		    colors_per_szc[r], KM_NOSLEEP);
		if (color_cache[r] == NULL) {
			while (--r >= 1) {
				kmem_free(color_cache[r],
				    colors_per_szc[r] * sizeof (size_t));
			}
			for (r = 1; r < mmu_page_sizes; r++) {
				kmem_free(ctr_cache[r],
				    size_cache[r] * sizeof (hpmctr_t));
			}
			return (ENOMEM);
		}
	}

	/*
	 * Grab the write lock to prevent others from walking these arrays
	 * while we are modifying them.
	 */
	rw_enter(&page_ctrs_rwlock[mnode], RW_WRITER);
	page_freelist_lock(mnode);
	for (r = 1; r < mmu_page_sizes; r++) {
		PAGE_COUNTERS_SHIFT(mnode, r) = PAGE_BSZS_SHIFT(r);
		old_ctr = PAGE_COUNTERS_COUNTERS(mnode, r);
		old_csz = PAGE_COUNTERS_ENTRIES(mnode, r);
		oldbase = PAGE_COUNTERS_BASE(mnode, r);
		old_npgs = old_csz << PAGE_COUNTERS_SHIFT(mnode, r);
		old_color_array = PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r);

		pcsz = npgs >> PAGE_COUNTERS_SHIFT(mnode, r);
		new_ctr = ctr_cache[r];
		ctr_cache[r] = NULL;
		if (old_ctr != NULL &&
		    (oldbase + old_npgs > newbase) &&
		    (newbase + npgs > oldbase)) {
			/*
			 * Map the intersection of the old and new
			 * counters into the new array.
			 */
			size_t offset;
			if (newbase > oldbase) {
				offset = (newbase - oldbase) >>
				    PAGE_COUNTERS_SHIFT(mnode, r);
				bcopy(old_ctr + offset, new_ctr,
				    MIN(pcsz, (old_csz - offset)) *
				    sizeof (hpmctr_t));
			} else {
				offset = (oldbase - newbase) >>
				    PAGE_COUNTERS_SHIFT(mnode, r);
				bcopy(old_ctr, new_ctr + offset,
				    MIN(pcsz - offset, old_csz) *
				    sizeof (hpmctr_t));
			}
		}

		PAGE_COUNTERS_COUNTERS(mnode, r) = new_ctr;
		PAGE_COUNTERS_ENTRIES(mnode, r) = pcsz;
		PAGE_COUNTERS_BASE(mnode, r) = newbase;
		PAGE_COUNTERS_CURRENT_COLOR_LEN(mnode, r) = colors_per_szc[r];
		PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r) = color_cache[r];
		color_cache[r] = NULL;
		/*
		 * for now, just reset on these events as it's probably
		 * not worthwhile to try and optimize this.
		 */
		for (i = 0; i < colors_per_szc[r]; i++) {
			PAGE_COUNTERS_CURRENT_COLOR(mnode, r, i) = i;
		}

		/* cache info for freeing out of the critical path */
		if ((caddr_t)old_ctr >= kernelheap &&
		    (caddr_t)old_ctr < ekernelheap) {
			ctr_cache[r] = old_ctr;
			size_cache[r] = old_csz;
		}
		if ((caddr_t)old_color_array >= kernelheap &&
		    (caddr_t)old_color_array < ekernelheap) {
			color_cache[r] = old_color_array;
		}
		/*
		 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
		 * satisfy the identity requirement.
		 * We should be able to go from one to the other
		 * and get consistent values.
		 */
		ASSERT(PNUM_TO_IDX(mnode, r,
		    (IDX_TO_PNUM(mnode, r, 0))) == 0);
		ASSERT(IDX_TO_PNUM(mnode, r,
		    (PNUM_TO_IDX(mnode, r, newbase))) == newbase);
	}
	page_freelist_unlock(mnode);
	rw_exit(&page_ctrs_rwlock[mnode]);

	/*
	 * Now that we have dropped the write lock, it is safe to free all
	 * of the memory we have cached above.
	 */
	for (r = 1; r < mmu_page_sizes; r++) {
		if (ctr_cache[r] != NULL) {
			kmem_free(ctr_cache[r],
			    size_cache[r] * sizeof (hpmctr_t));
		}
		if (color_cache[r] != NULL) {
			kmem_free(color_cache[r],
			    colors_per_szc[r] * sizeof (size_t));
		}
	}
	return (0);
}

/*
 * color contains a valid color index or bin for cur_szc
 */
uint_t
page_convert_color(uchar_t cur_szc, uchar_t new_szc, uint_t color)
{
	uint_t shift;

	if (cur_szc > new_szc) {
		shift = page_get_shift(cur_szc) - page_get_shift(new_szc);
		return (color << shift);
	} else if (cur_szc < new_szc) {
		shift = page_get_shift(new_szc) - page_get_shift(cur_szc);
		return (color >> shift);
	}
	return (color);
}

#ifdef DEBUG

/*
 * confirm pp is a large page corresponding to szc
 */
void
chk_lpg(page_t *pp, uchar_t szc)
{
	spgcnt_t npgs = page_get_pagecnt(pp->p_szc);
	uint_t noreloc;

	if (npgs == 1) {
		ASSERT(pp->p_szc == 0);
		ASSERT(pp->p_next == pp);
		ASSERT(pp->p_prev == pp);
		return;
	}

	ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL);
	ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL);

	ASSERT(IS_P2ALIGNED(pp->p_pagenum, npgs));
	ASSERT(pp->p_pagenum == (pp->p_next->p_pagenum - 1));
	ASSERT(pp->p_prev->p_pagenum == (pp->p_pagenum + (npgs - 1)));
	ASSERT(pp->p_prev == (pp + (npgs - 1)));

	/*
	 * Check list of pages.
	 */
	noreloc = PP_ISNORELOC(pp);
	while (npgs--) {
		if (npgs != 0) {
			ASSERT(pp->p_pagenum == pp->p_next->p_pagenum - 1);
			ASSERT(pp->p_next == (pp + 1));
		}
		ASSERT(pp->p_szc == szc);
		ASSERT(PP_ISFREE(pp));
		ASSERT(PP_ISAGED(pp));
		ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL);
		ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL);
		ASSERT(pp->p_vnode == NULL);
		ASSERT(PP_ISNORELOC(pp) == noreloc);

		pp = pp->p_next;
	}
}
#endif /* DEBUG */

void
page_freelist_lock(int mnode)
{
	int i;
	for (i = 0; i < NPC_MUTEX; i++) {
		mutex_enter(FPC_MUTEX(mnode, i));
		mutex_enter(CPC_MUTEX(mnode, i));
	}
}

void
page_freelist_unlock(int mnode)
{
	int i;
	for (i = 0; i < NPC_MUTEX; i++) {
		mutex_exit(FPC_MUTEX(mnode, i));
		mutex_exit(CPC_MUTEX(mnode, i));
	}
}

/*
 * add pp to the specified page list. Defaults to head of the page list
 * unless PG_LIST_TAIL is specified.
 */
void
page_list_add(page_t *pp, int flags)
{
	page_t		**ppp;
	kmutex_t	*pcm;
	uint_t		bin, mtype;
	int		mnode;

	ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
	ASSERT(PP_ISFREE(pp));
	ASSERT(!hat_page_is_mapped(pp));
	ASSERT(hat_page_getshare(pp) == 0);

	/*
	 * Large pages should be freed via page_list_add_pages().
	 */
	ASSERT(pp->p_szc == 0);

	/*
	 * Don't need to lock the freelist first here
	 * because the page isn't on the freelist yet.
	 * This means p_szc can't change on us.
	 */

	bin = PP_2_BIN(pp);
	mnode = PP_2_MEM_NODE(pp);
	mtype = PP_2_MTYPE(pp);

	if (flags & PG_LIST_ISINIT) {
		/*
		 * PG_LIST_ISINIT is set during system startup (i.e. single
		 * threaded), add a page to the free list and add to the
		 * free region counters w/o any locking
		 */
		ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);

		/* inline version of page_add() */
		if (*ppp != NULL) {
			pp->p_next = *ppp;
			pp->p_prev = (*ppp)->p_prev;
			(*ppp)->p_prev = pp;
			pp->p_prev->p_next = pp;
		} else
			*ppp = pp;

		page_ctr_add_internal(mnode, mtype, pp, flags);
		VM_STAT_ADD(vmm_vmstats.pladd_free[0]);
	} else {
		pcm = PC_BIN_MUTEX(mnode, bin, flags);

		if (flags & PG_FREE_LIST) {
			VM_STAT_ADD(vmm_vmstats.pladd_free[0]);
			ASSERT(PP_ISAGED(pp));
			ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);

		} else {
			VM_STAT_ADD(vmm_vmstats.pladd_cache);
			ASSERT(pp->p_vnode);
			ASSERT((pp->p_offset & PAGEOFFSET) == 0);
			ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
		}
		mutex_enter(pcm);
		page_add(ppp, pp);

		if (flags & PG_LIST_TAIL)
			*ppp = (*ppp)->p_next;
		/*
		 * Add counters before releasing pcm mutex to avoid a race with
		 * page_freelist_coalesce and page_freelist_fill.
		 */
		page_ctr_add(mnode, mtype, pp, flags);
		mutex_exit(pcm);
	}


#if defined(__sparc)
	if (PP_ISNORELOC(pp)) {
		kcage_freemem_add(1);
	}
#endif
	/*
	 * It is up to the caller to unlock the page!
	 */
	ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
}


#ifdef __sparc
/*
 * This routine is only used by kcage_init during system startup.
 * It performs the function of page_list_sub/PP_SETNORELOC/page_list_add
 * without the overhead of taking locks and updating counters.
 */
void
page_list_noreloc_startup(page_t *pp)
{
	page_t		**ppp;
	uint_t		bin;
	int		mnode;
	int		mtype;
	int		flags = 0;

	/*
	 * If this is a large page on the freelist then
	 * break it up into smaller pages.
	 */
	if (pp->p_szc != 0)
		page_boot_demote(pp);

	/*
	 * Get list page is currently on.
	 */
	bin = PP_2_BIN(pp);
	mnode = PP_2_MEM_NODE(pp);
	mtype = PP_2_MTYPE(pp);
	ASSERT(mtype == MTYPE_RELOC);
	ASSERT(pp->p_szc == 0);

	if (PP_ISAGED(pp)) {
		ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
		flags |= PG_FREE_LIST;
	} else {
		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
		flags |= PG_CACHE_LIST;
	}

	ASSERT(*ppp != NULL);

	/*
	 * Delete page from current list.
	 */
	if (*ppp == pp)
		*ppp = pp->p_next;		/* go to next page */
	if (*ppp == pp) {
		*ppp = NULL;			/* page list is gone */
	} else {
		pp->p_prev->p_next = pp->p_next;
		pp->p_next->p_prev = pp->p_prev;
	}

	/* LINTED */
	PLCNT_DECR(pp, mnode, mtype, 0, flags);

	/*
	 * Set no reloc for cage initted pages.
	 */
	PP_SETNORELOC(pp);

	mtype = PP_2_MTYPE(pp);
	ASSERT(mtype == MTYPE_NORELOC);

	/*
	 * Get new list for page.
	 */
	if (PP_ISAGED(pp)) {
		ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
	} else {
		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
	}

	/*
	 * Insert page on new list.
	 */
	if (*ppp == NULL) {
		*ppp = pp;
		pp->p_next = pp->p_prev = pp;
	} else {
		pp->p_next = *ppp;
		pp->p_prev = (*ppp)->p_prev;
		(*ppp)->p_prev = pp;
		pp->p_prev->p_next = pp;
	}

	/* LINTED */
	PLCNT_INCR(pp, mnode, mtype, 0, flags);

	/*
	 * Update cage freemem counter
	 */
	atomic_add_long(&kcage_freemem, 1);
}
#else /* __sparc */

/* ARGSUSED */
void
page_list_noreloc_startup(page_t *pp)
{
	panic("page_list_noreloc_startup: should be here only for sparc");
}
#endif

void
page_list_add_pages(page_t *pp, int flags)
{
	kmutex_t *pcm;
	pgcnt_t	pgcnt;
	uint_t	bin, mtype, i;
	int	mnode;

	/* default to freelist/head */
	ASSERT((flags & (PG_CACHE_LIST | PG_LIST_TAIL)) == 0);

	CHK_LPG(pp, pp->p_szc);
	VM_STAT_ADD(vmm_vmstats.pladd_free[pp->p_szc]);

	bin = PP_2_BIN(pp);
	mnode = PP_2_MEM_NODE(pp);
	mtype = PP_2_MTYPE(pp);

	if (flags & PG_LIST_ISINIT) {
		ASSERT(pp->p_szc == mmu_page_sizes - 1);
		page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
		ASSERT(!PP_ISNORELOC(pp));
		PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);
	} else {

		ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);

		pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);

		mutex_enter(pcm);
		page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
		page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
		mutex_exit(pcm);

		pgcnt = page_get_pagecnt(pp->p_szc);
#if defined(__sparc)
		if (PP_ISNORELOC(pp))
			kcage_freemem_add(pgcnt);
#endif
		for (i = 0; i < pgcnt; i++, pp++)
			page_unlock_noretire(pp);
	}
}

/*
 * During boot, need to demote a large page to base
 * pagesize pages for seg_kmem for use in boot_alloc()
 */
void
page_boot_demote(page_t *pp)
{
	ASSERT(pp->p_szc != 0);
	ASSERT(PP_ISFREE(pp));
	ASSERT(PP_ISAGED(pp));

	(void) page_demote(PP_2_MEM_NODE(pp),
	    PFN_BASE(pp->p_pagenum, pp->p_szc), pp->p_szc, 0, PC_NO_COLOR,
	    PC_FREE);

	ASSERT(PP_ISFREE(pp));
	ASSERT(PP_ISAGED(pp));
	ASSERT(pp->p_szc == 0);
}

/*
 * Take a particular page off of whatever freelist the page
 * is claimed to be on.
 *
 * NOTE: Only used for PAGESIZE pages.
 */
void
page_list_sub(page_t *pp, int flags)
{
	int		bin;
	uint_t		mtype;
	int		mnode;
	kmutex_t	*pcm;
	page_t		**ppp;

	ASSERT(PAGE_EXCL(pp));
	ASSERT(PP_ISFREE(pp));

	/*
	 * The p_szc field can only be changed by page_promote()
	 * and page_demote(). Only free pages can be promoted and
	 * demoted and the free list MUST be locked during these
	 * operations. So to prevent a race in page_list_sub()
	 * between computing which bin of the freelist lock to
	 * grab and actually grabbing the lock we check again that
	 * the bin we locked is still the correct one. Notice that
	 * the p_szc field could have actually changed on us but
	 * if the bin happens to still be the same we are safe.
	 */
try_again:
	bin = PP_2_BIN(pp);
	mnode = PP_2_MEM_NODE(pp);
	pcm = PC_BIN_MUTEX(mnode, bin, flags);
	mutex_enter(pcm);
	if (PP_2_BIN(pp) != bin) {
		mutex_exit(pcm);
		goto try_again;
	}
	mtype = PP_2_MTYPE(pp);

	if (flags & PG_FREE_LIST) {
		VM_STAT_ADD(vmm_vmstats.plsub_free[0]);
		ASSERT(PP_ISAGED(pp));
		ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);
	} else {
		VM_STAT_ADD(vmm_vmstats.plsub_cache);
		ASSERT(!PP_ISAGED(pp));
		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
	}

	/*
	 * Common PAGESIZE case.
	 *
	 * Note that we locked the freelist. This prevents
	 * any page promotion/demotion operations. Therefore
	 * the p_szc will not change until we drop pcm mutex.
	 */
	if (pp->p_szc == 0) {
		page_sub(ppp, pp);
		/*
		 * Subtract counters before releasing pcm mutex
		 * to avoid race with page_freelist_coalesce.
		 */
		page_ctr_sub(mnode, mtype, pp, flags);
		mutex_exit(pcm);

#if defined(__sparc)
		if (PP_ISNORELOC(pp)) {
			kcage_freemem_sub(1);
		}
#endif
		return;
	}

	/*
	 * Large pages on the cache list are not supported.
	 */
	if (flags & PG_CACHE_LIST)
		panic("page_list_sub: large page on cachelist");

	/*
	 * Slow but rare.
	 *
	 * Somebody wants this particular page which is part
	 * of a large page. In this case we just demote the page
	 * if it's on the freelist.
	 *
	 * We have to drop pcm before locking the entire freelist.
	 * Once we have re-locked the freelist check to make sure
	 * the page hasn't already been demoted or completely
	 * freed.
	 */
	mutex_exit(pcm);
	page_freelist_lock(mnode);
	if (pp->p_szc != 0) {
		/*
		 * Large page is on freelist.
		 */
		(void) page_demote(mnode, PFN_BASE(pp->p_pagenum, pp->p_szc),
		    pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
	}
	ASSERT(PP_ISFREE(pp));
	ASSERT(PP_ISAGED(pp));
	ASSERT(pp->p_szc == 0);

	/*
	 * Subtract counters before releasing pcm mutex
	 * to avoid race with page_freelist_coalesce.
	 */
	bin = PP_2_BIN(pp);
	mtype = PP_2_MTYPE(pp);
	ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);

	page_sub(ppp, pp);
	page_ctr_sub(mnode, mtype, pp, flags);
	page_freelist_unlock(mnode);

#if defined(__sparc)
	if (PP_ISNORELOC(pp)) {
		kcage_freemem_sub(1);
	}
#endif
}

void
page_list_sub_pages(page_t *pp, uint_t szc)
{
	kmutex_t *pcm;
	uint_t	bin, mtype;
	int	mnode;

	ASSERT(PAGE_EXCL(pp));
	ASSERT(PP_ISFREE(pp));
	ASSERT(PP_ISAGED(pp));

	/*
	 * See comment in page_list_sub().
	 */
try_again:
	bin = PP_2_BIN(pp);
	mnode = PP_2_MEM_NODE(pp);
	pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
	mutex_enter(pcm);
	if (PP_2_BIN(pp) != bin) {
		mutex_exit(pcm);
		goto try_again;
	}

	/*
	 * If we're called with a page larger than szc or it got
	 * promoted above szc before we locked the freelist then
	 * drop pcm and re-lock entire freelist. If page still larger
	 * than szc then demote it.
	 */
	if (pp->p_szc > szc) {
		mutex_exit(pcm);
		pcm = NULL;
		page_freelist_lock(mnode);
		if (pp->p_szc > szc) {
			VM_STAT_ADD(vmm_vmstats.plsubpages_szcbig);
			(void) page_demote(mnode,
			    PFN_BASE(pp->p_pagenum, pp->p_szc),
			    pp->p_szc, szc, PC_NO_COLOR, PC_FREE);
		}
		bin = PP_2_BIN(pp);
	}
	ASSERT(PP_ISFREE(pp));
	ASSERT(PP_ISAGED(pp));
	ASSERT(pp->p_szc <= szc);
	ASSERT(pp == PP_PAGEROOT(pp));

	VM_STAT_ADD(vmm_vmstats.plsub_free[pp->p_szc]);

	mtype = PP_2_MTYPE(pp);
	if (pp->p_szc != 0) {
		page_vpsub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
		CHK_LPG(pp, pp->p_szc);
	} else {
		VM_STAT_ADD(vmm_vmstats.plsubpages_szc0);
		page_sub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
	}
	page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);

	if (pcm != NULL) {
		mutex_exit(pcm);
	} else {
		page_freelist_unlock(mnode);
	}

#if defined(__sparc)
	if (PP_ISNORELOC(pp)) {
		pgcnt_t	pgcnt;

		pgcnt = page_get_pagecnt(pp->p_szc);
		kcage_freemem_sub(pgcnt);
	}
#endif
}

/*
 * Add the page to the front of a linked list of pages
 * using the p_next & p_prev pointers for the list.
 * The caller is responsible for protecting the list pointers.
 */
void
mach_page_add(page_t **ppp, page_t *pp)
{
	if (*ppp == NULL) {
		pp->p_next = pp->p_prev = pp;
	} else {
		pp->p_next = *ppp;
		pp->p_prev = (*ppp)->p_prev;
		(*ppp)->p_prev = pp;
		pp->p_prev->p_next = pp;
	}
	*ppp = pp;
}

/*
 * Remove this page from a linked list of pages
 * using the p_next & p_prev pointers for the list.
 *
 * The caller is responsible for protecting the list pointers.
 */
void
mach_page_sub(page_t **ppp, page_t *pp)
{
	ASSERT(PP_ISFREE(pp));

	if (*ppp == NULL || pp == NULL)
		panic("mach_page_sub");

	if (*ppp == pp)
		*ppp = pp->p_next;		/* go to next page */

	if (*ppp == pp)
		*ppp = NULL;			/* page list is gone */
	else {
		pp->p_prev->p_next = pp->p_next;
		pp->p_next->p_prev = pp->p_prev;
	}
	pp->p_prev = pp->p_next = pp;		/* make pp a list of one */
}

/*
 * Routine fsflush uses to gradually coalesce the free list into larger pages.
 */
void
page_promote_size(page_t *pp, uint_t cur_szc)
{
	pfn_t pfn;
	int mnode;
	int idx;
	int new_szc = cur_szc + 1;
	int full = FULL_REGION_CNT(new_szc);

	pfn = page_pptonum(pp);
	mnode = PFN_2_MEM_NODE(pfn);

	page_freelist_lock(mnode);

	idx = PNUM_TO_IDX(mnode, new_szc, pfn);
	if (PAGE_COUNTERS(mnode, new_szc, idx) == full)
		(void) page_promote(mnode, pfn, new_szc, PC_FREE);

	page_freelist_unlock(mnode);
}

static uint_t page_promote_err;
static uint_t page_promote_noreloc_err;

/*
 * Create a single larger page (of szc new_szc) from smaller contiguous pages
 * for the given mnode starting at pfnum. Pages involved are on the freelist
 * before the call and may be returned to the caller if requested, otherwise
 * they will be placed back on the freelist.
 * If flags is PC_ALLOC, then the large page will be returned to the user in
 * a state which is consistent with a page being taken off the freelist. If
 * we failed to lock the new large page, then we will return NULL to the
 * caller and put the large page on the freelist instead.
 * If flags is PC_FREE, then the large page will be placed on the freelist,
 * and NULL will be returned.
 * The caller is responsible for locking the freelist as well as any other
 * accounting which needs to be done for a returned page.
 *
 * RFE: For performance pass in pp instead of pfnum so
 *	we can avoid excessive calls to page_numtopp_nolock().
 *	This would depend on an assumption that all contiguous
 *	pages are in the same memseg so we can just add/dec
 *	our pp.
 *
 * Lock ordering:
 *
 *	There is a potential but rare deadlock situation
 *	for page promotion and demotion operations. The problem
 *	is there are two paths into the freelist manager and
 *	they have different lock orders:
 *
 *	page_create()
 *		lock freelist
 *		page_lock(EXCL)
 *		unlock freelist
 *		return
 *		caller drops page_lock
 *
 *	page_free() and page_reclaim()
 *		caller grabs page_lock(EXCL)
 *
 *		lock freelist
 *		unlock freelist
 *		drop page_lock
 *
 *	What prevents a thread in page_create() from deadlocking
 *	with a thread freeing or reclaiming the same page is the
 *	page_trylock() in page_get_freelist(). If the trylock fails
 *	it skips the page.
 *
 *	The lock ordering for promotion and demotion is the same as
 *	for page_create(). Since the same deadlock could occur during
 *	page promotion and freeing or reclaiming of a page on the
 *	cache list we might have to fail the operation and undo what
 *	we have done so far. Again this is rare.
 */
page_t *
page_promote(int mnode, pfn_t pfnum, uchar_t new_szc, int flags)
{
	page_t		*pp, *pplist, *tpp, *start_pp;
	pgcnt_t		new_npgs, npgs;
	uint_t		bin;
	pgcnt_t		tmpnpgs, pages_left;
	uint_t		mtype;
	uint_t		noreloc;
	uint_t		i;
	int		which_list;
	ulong_t		index;
	kmutex_t	*phm;

	/*
	 * General algorithm:
	 * Find the starting page
	 * Walk each page struct removing it from the freelist,
	 * and linking it to all the other pages removed.
	 * Once all pages are off the freelist,
	 * walk the list, modifying p_szc to new_szc and whatever
	 * other info needs to be done to create a large free page.
	 * According to the flags, either return the page or put it
	 * on the freelist.
	 */

	start_pp = page_numtopp_nolock(pfnum);
	ASSERT(start_pp && (start_pp->p_pagenum == pfnum));
	new_npgs = page_get_pagecnt(new_szc);
	ASSERT(IS_P2ALIGNED(pfnum, new_npgs));

	/*
	 * Loop through smaller pages to confirm that all pages
	 * give the same result for PP_ISNORELOC().
	 * We can check this reliably here as the protocol for setting
	 * P_NORELOC requires pages to be taken off the free list first.
	 */
	for (i = 0, pp = start_pp; i < new_npgs; i++, pp++) {
		if (pp == start_pp) {
			/* First page, set requirement. */
			noreloc = PP_ISNORELOC(pp);
		} else if (noreloc != PP_ISNORELOC(pp)) {
			page_promote_noreloc_err++;
			page_promote_err++;
			return (NULL);
		}
	}

	pages_left = new_npgs;
	pplist = NULL;
	pp = start_pp;

	/* Loop around coalescing the smaller pages into a big page. */
	while (pages_left) {
		/*
		 * Remove from the freelist.
		 */
		ASSERT(PP_ISFREE(pp));
		bin = PP_2_BIN(pp);
		ASSERT(mnode == PP_2_MEM_NODE(pp));
		mtype = PP_2_MTYPE(pp);
		if (PP_ISAGED(pp)) {

			/*
			 * PG_FREE_LIST
			 */
			if (pp->p_szc) {
				page_vpsub(&PAGE_FREELISTS(mnode,
				    pp->p_szc, bin, mtype), pp);
			} else {
				mach_page_sub(&PAGE_FREELISTS(mnode, 0,
				    bin, mtype), pp);
			}
			which_list = PG_FREE_LIST;
		} else {
			ASSERT(pp->p_szc == 0);

			/*
			 * PG_CACHE_LIST
			 *
			 * Since this page comes from the
			 * cachelist, we must destroy the
			 * vnode association.
			 */
			if (!page_trylock(pp, SE_EXCL)) {
				goto fail_promote;
			}

			/*
			 * We need to be careful not to deadlock
			 * with another thread in page_lookup().
			 * The page_lookup() thread could be holding
			 * the same phm that we need if the two
			 * pages happen to hash to the same phm lock.
			 * At this point we have locked the entire
			 * freelist and page_lookup() could be trying
			 * to grab a freelist lock.
			 */
			index = PAGE_HASH_FUNC(pp->p_vnode, pp->p_offset);
			phm = PAGE_HASH_MUTEX(index);
			if (!mutex_tryenter(phm)) {
				page_unlock_noretire(pp);
				goto fail_promote;
			}

			mach_page_sub(&PAGE_CACHELISTS(mnode, bin, mtype), pp);
			page_hashout(pp, phm);
			mutex_exit(phm);
			PP_SETAGED(pp);
			page_unlock_noretire(pp);
			which_list = PG_CACHE_LIST;
		}
		page_ctr_sub(mnode, mtype, pp, which_list);

		/*
		 * Concatenate the smaller page(s) onto
		 * the large page list.
		 */
		tmpnpgs = npgs = page_get_pagecnt(pp->p_szc);
		pages_left -= npgs;
		tpp = pp;
		while (npgs--) {
			tpp->p_szc = new_szc;
			tpp = tpp->p_next;
		}
		page_list_concat(&pplist, &pp);
		pp += tmpnpgs;
	}
	CHK_LPG(pplist, new_szc);

	/*
	 * return the page to the user if requested
	 * in the properly locked state.
	 */
	if (flags == PC_ALLOC && (page_trylock_cons(pplist, SE_EXCL))) {
		return (pplist);
	}

	/*
	 * Otherwise place the new large page on the freelist
	 */
	bin = PP_2_BIN(pplist);
	mnode = PP_2_MEM_NODE(pplist);
	mtype = PP_2_MTYPE(pplist);
	page_vpadd(&PAGE_FREELISTS(mnode, new_szc, bin, mtype), pplist);

	page_ctr_add(mnode, mtype, pplist, PG_FREE_LIST);
	return (NULL);

fail_promote:
	/*
	 * A thread must have still been freeing or
	 * reclaiming the page on the cachelist.
	 * To prevent a deadlock undo what we have
	 * done so far and return failure. This
	 * situation can only happen while promoting
	 * PAGESIZE pages.
	 */
	page_promote_err++;
	while (pplist) {
		pp = pplist;
		mach_page_sub(&pplist, pp);
		pp->p_szc = 0;
		bin = PP_2_BIN(pp);
		mtype = PP_2_MTYPE(pp);
		mach_page_add(&PAGE_FREELISTS(mnode, 0, bin, mtype), pp);
		page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
	}
	return (NULL);

}

/*
 * Break up a large page into smaller size pages.
 * Pages involved are on the freelist before the call and may
 * be returned to the caller if requested, otherwise they will
 * be placed back on the freelist.
 * The caller is responsible for locking the freelist as well as any other
 * accounting which needs to be done for a returned page.
 * If flags is not PC_ALLOC, the color argument is ignored, and thus
 * technically, any value may be passed in but PC_NO_COLOR is the standard
 * which should be followed for clarity's sake.
 */
page_t *
page_demote(int mnode, pfn_t pfnum, uchar_t cur_szc, uchar_t new_szc,
    int color, int flags)
{
	page_t	*pp, *pplist, *npplist;
	pgcnt_t	npgs, n;
	uint_t	bin;
	uint_t	mtype;
	page_t	*ret_pp = NULL;

	ASSERT(cur_szc != 0);
	ASSERT(new_szc < cur_szc);

	pplist = page_numtopp_nolock(pfnum);
	ASSERT(pplist != NULL);

	ASSERT(pplist->p_szc == cur_szc);

	bin = PP_2_BIN(pplist);
	ASSERT(mnode == PP_2_MEM_NODE(pplist));
	mtype = PP_2_MTYPE(pplist);
	page_vpsub(&PAGE_FREELISTS(mnode, cur_szc, bin, mtype), pplist);

	CHK_LPG(pplist, cur_szc);
	page_ctr_sub(mnode, mtype, pplist, PG_FREE_LIST);

	/*
	 * Number of PAGESIZE pages for smaller new_szc
	 * page.
	 */
	npgs = page_get_pagecnt(new_szc);

	while (pplist) {
		pp = pplist;

		ASSERT(pp->p_szc == cur_szc);

		/*
		 * We either break it up into PAGESIZE pages or larger.
		 */
		if (npgs == 1) {	/* PAGESIZE case */
			mach_page_sub(&pplist, pp);
			ASSERT(pp->p_szc == cur_szc);
			ASSERT(new_szc == 0);
			ASSERT(mnode == PP_2_MEM_NODE(pp));
			pp->p_szc = new_szc;
			bin = PP_2_BIN(pp);
			if ((bin == color) && (flags == PC_ALLOC) &&
			    (ret_pp == NULL) &&
			    page_trylock_cons(pp, SE_EXCL)) {
				ret_pp = pp;
			} else {
				mtype = PP_2_MTYPE(pp);
				mach_page_add(&PAGE_FREELISTS(mnode, 0, bin,
				    mtype), pp);
				page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
			}
		} else {

			/*
			 * Break down into smaller lists of pages.
			 */
			page_list_break(&pplist, &npplist, npgs);

			pp = pplist;
			n = npgs;
			while (n--) {
				ASSERT(pp->p_szc == cur_szc);
				pp->p_szc = new_szc;
				pp = pp->p_next;
			}

			CHK_LPG(pplist, new_szc);

			bin = PP_2_BIN(pplist);
			ASSERT(mnode == PP_2_MEM_NODE(pp));
			if ((bin == color) && (flags == PC_ALLOC) &&
			    (ret_pp == NULL) &&
			    page_trylock_cons(pp, SE_EXCL)) {
				ret_pp = pp;
			} else {
				mtype = PP_2_MTYPE(pp);
				page_vpadd(&PAGE_FREELISTS(mnode, new_szc,
				    bin, mtype), pplist);

				page_ctr_add(mnode, mtype, pplist,
				    PG_FREE_LIST);
			}
			pplist = npplist;
		}
	}
	return (ret_pp);
}

int mpss_coalesce_disable = 0;

/*
 * Coalesce free pages into a page of the given szc and color if possible.
 * Return the pointer to the page created, otherwise, return NULL.
 */
static page_t *
page_freelist_coalesce(int mnode, uchar_t szc, int color)
{
	int	r;		/* region size */
	int	idx, full, i;
	pfn_t	pfnum;
	size_t	len;
	size_t	buckets_to_check;
	pgcnt_t	cands;
	page_t	*ret_pp;
	int	color_stride;

	VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce);

	if (mpss_coalesce_disable) {
		return (NULL);
	}

	r = szc;
	PGCTRS_CANDS_GETVALUECOLOR(mnode, r, color, cands);
	if (cands == 0) {
		VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip);
		return (NULL);
	}
	full = FULL_REGION_CNT(r);
	color_stride = (szc) ? page_convert_color(0, szc, page_colors - 1) + 1 :
	    page_colors;

	/* Prevent page_counters dynamic memory from being freed */
	rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
	len = PAGE_COUNTERS_ENTRIES(mnode, r);
	buckets_to_check = len / color_stride;
	idx = PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color);
	ASSERT((idx % color_stride) == color);
	idx += color_stride;
	if (idx >= len)
		idx = color;
	for (i = 0; i < buckets_to_check; i++) {
		if (PAGE_COUNTERS(mnode, r, idx) == full) {
			pfnum = IDX_TO_PNUM(mnode, r, idx);
			ASSERT(pfnum >= mem_node_config[mnode].physbase &&
			    pfnum < mem_node_config[mnode].physmax);
			/*
			 * RFE: For performance maybe we can do something less
			 * brutal than locking the entire freelist. So far
			 * this doesn't seem to be a performance problem?
			 */
			page_freelist_lock(mnode);
			if (PAGE_COUNTERS(mnode, r, idx) != full) {
				VM_STAT_ADD(vmm_vmstats.page_ctrs_changed);
				goto skip_this_one;
			}
			ret_pp = page_promote(mnode, pfnum, r, PC_ALLOC);
			if (ret_pp != NULL) {
				PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color) =
				    idx;
				page_freelist_unlock(mnode);
				rw_exit(&page_ctrs_rwlock[mnode]);
#if defined(__sparc)
				if (PP_ISNORELOC(ret_pp)) {
					pgcnt_t npgs;

					npgs = page_get_pagecnt(ret_pp->p_szc);
					kcage_freemem_sub(npgs);
				}
#endif
				return (ret_pp);
			}
skip_this_one:
			page_freelist_unlock(mnode);
			/*
			 * No point looking for another page if we've
			 * already tried all of the ones that
			 * page_ctr_cands indicated. Stash off where we left
			 * off.
			 * Note: this is not exact since we don't hold the
			 * page_freelist_locks before we initially get the
			 * value of cands for performance reasons, but should
			 * be a decent approximation.
2005 */ 2006 if (--cands == 0) { 2007 PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color) = 2008 idx; 2009 break; 2010 } 2011 } 2012 idx += color_stride; 2013 if (idx >= len) 2014 idx = color; 2015 } 2016 rw_exit(&page_ctrs_rwlock[mnode]); 2017 VM_STAT_ADD(vmm_vmstats.page_ctrs_failed); 2018 return (NULL); 2019 } 2020 2021 /* 2022 * For the given mnode, promote as many small pages to large pages as possible. 2023 */ 2024 void 2025 page_freelist_coalesce_all(int mnode) 2026 { 2027 int r; /* region size */ 2028 int idx, full; 2029 pfn_t pfnum; 2030 size_t len; 2031 2032 VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce_all); 2033 2034 if (mpss_coalesce_disable) { 2035 return; 2036 } 2037 2038 /* 2039 * Lock the entire freelist and coalesce what we can. 2040 * 2041 * Always promote to the largest page possible 2042 * first to reduce the number of page promotions. 2043 */ 2044 rw_enter(&page_ctrs_rwlock[mnode], RW_READER); 2045 page_freelist_lock(mnode); 2046 for (r = mmu_page_sizes - 1; r > 0; r--) { 2047 pgcnt_t cands; 2048 2049 PGCTRS_CANDS_GETVALUE(mnode, r, cands); 2050 if (cands == 0) { 2051 VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip_all); 2052 continue; 2053 } 2054 2055 full = FULL_REGION_CNT(r); 2056 len = PAGE_COUNTERS_ENTRIES(mnode, r); 2057 2058 for (idx = 0; idx < len; idx++) { 2059 if (PAGE_COUNTERS(mnode, r, idx) == full) { 2060 pfnum = IDX_TO_PNUM(mnode, r, idx); 2061 ASSERT(pfnum >= 2062 mem_node_config[mnode].physbase && 2063 pfnum < 2064 mem_node_config[mnode].physmax); 2065 (void) page_promote(mnode, pfnum, r, PC_FREE); 2066 } 2067 } 2068 } 2069 page_freelist_unlock(mnode); 2070 rw_exit(&page_ctrs_rwlock[mnode]); 2071 } 2072 2073 /* 2074 * This is where all policies for moving pages around 2075 * to different page size free lists are implemented. 2076 * Returns a pointer to the page obtained on success, NULL on failure. 2077 * 2078 * So far these are the priorities for this algorithm in descending 2079 * order: 2080 * 2081 * 1) When servicing a request try to do so with a free page 2082 * from next size up. Helps defer fragmentation as long 2083 * as possible. 2084 * 2085 * 2) Page coalesce on demand. Only when a freelist 2086 * larger than PAGESIZE is empty and step 1 2087 * will not work since all larger size lists are 2088 * also empty. 2089 * 2090 * If pfnhi is non-zero, search for a large page with a pfn range less than pfnhi. 2091 */ 2092 page_t * 2093 page_freelist_fill(uchar_t szc, int color, int mnode, int mtype, pfn_t pfnhi) 2094 { 2095 uchar_t nszc = szc + 1; 2096 int bin; 2097 page_t *pp, *firstpp; 2098 page_t *ret_pp = NULL; 2099 2100 ASSERT(szc < mmu_page_sizes); 2101 2102 VM_STAT_ADD(vmm_vmstats.pff_req[szc]); 2103 /* 2104 * First try to break up a larger page to fill 2105 * current size freelist. 2106 */ 2107 while (nszc < mmu_page_sizes) { 2108 /* 2109 * If page found then demote it. 2110 */ 2111 bin = page_convert_color(szc, nszc, color); 2112 if (PAGE_FREELISTS(mnode, nszc, bin, mtype)) { 2113 page_freelist_lock(mnode); 2114 firstpp = pp = PAGE_FREELISTS(mnode, nszc, bin, mtype); 2115 2116 /* 2117 * If pfnhi is not PFNNULL, look for large page below 2118 * pfnhi. PFNNULL signifies no pfn requirement.
2119 */ 2120 if (pfnhi != PFNNULL && pp->p_pagenum >= pfnhi) { 2121 do { 2122 pp = pp->p_vpnext; 2123 if (pp == firstpp) { 2124 pp = NULL; 2125 break; 2126 } 2127 } while (pp->p_pagenum >= pfnhi); 2128 } 2129 if (pp) { 2130 ASSERT(pp->p_szc == nszc); 2131 VM_STAT_ADD(vmm_vmstats.pff_demote[nszc]); 2132 ret_pp = page_demote(mnode, pp->p_pagenum, 2133 pp->p_szc, szc, color, PC_ALLOC); 2134 if (ret_pp) { 2135 page_freelist_unlock(mnode); 2136 #if defined(__sparc) 2137 if (PP_ISNORELOC(ret_pp)) { 2138 pgcnt_t npgs; 2139 2140 npgs = page_get_pagecnt( 2141 ret_pp->p_szc); 2142 kcage_freemem_sub(npgs); 2143 } 2144 #endif 2145 return (ret_pp); 2146 } 2147 } 2148 page_freelist_unlock(mnode); 2149 } 2150 nszc++; 2151 } 2152 2153 /* 2154 * Ok that didn't work. Time to coalesce. 2155 */ 2156 if (szc != 0) { 2157 ret_pp = page_freelist_coalesce(mnode, szc, color); 2158 VM_STAT_COND_ADD(ret_pp, vmm_vmstats.pff_coalok[szc]); 2159 } 2160 2161 return (ret_pp); 2162 } 2163 2164 /* 2165 * Helper routine used only by the freelist code to lock 2166 * a page. If the page is a large page then it succeeds in 2167 * locking all the constituent pages or none at all. 2168 * Returns 1 on success, 0 on failure. 2169 */ 2170 static int 2171 page_trylock_cons(page_t *pp, se_t se) 2172 { 2173 page_t *tpp, *first_pp = pp; 2174 2175 /* 2176 * Fail if can't lock first or only page. 2177 */ 2178 if (!page_trylock(pp, se)) { 2179 return (0); 2180 } 2181 2182 /* 2183 * PAGESIZE: common case. 2184 */ 2185 if (pp->p_szc == 0) { 2186 return (1); 2187 } 2188 2189 /* 2190 * Large page case. 2191 */ 2192 tpp = pp->p_next; 2193 while (tpp != pp) { 2194 if (!page_trylock(tpp, se)) { 2195 /* 2196 * On failure unlock what we 2197 * have locked so far. 2198 */ 2199 while (first_pp != tpp) { 2200 page_unlock_noretire(first_pp); 2201 first_pp = first_pp->p_next; 2202 } 2203 return (0); 2204 } 2205 tpp = tpp->p_next; 2206 } 2207 return (1); 2208 } 2209 2210 page_t * 2211 page_get_mnode_freelist(int mnode, uint_t bin, int mtype, uchar_t szc, 2212 uint_t flags) 2213 { 2214 kmutex_t *pcm; 2215 int i, fill_tried, fill_marker; 2216 page_t *pp, *first_pp; 2217 uint_t bin_marker; 2218 int colors, cpucolors; 2219 uchar_t nszc; 2220 uint_t nszc_color_shift; 2221 int nwaybins = 0, nwaycnt; 2222 2223 ASSERT(szc < mmu_page_sizes); 2224 2225 VM_STAT_ADD(vmm_vmstats.pgmf_alloc[szc]); 2226 2227 MTYPE_START(mnode, mtype, flags); 2228 if (mtype < 0) { /* mnode does not have memory in mtype range */ 2229 VM_STAT_ADD(vmm_vmstats.pgmf_allocempty[szc]); 2230 return (NULL); 2231 } 2232 2233 /* 2234 * Set how many physical colors for this page size. 2235 */ 2236 colors = (szc) ? page_convert_color(0, szc, page_colors - 1) + 1 : 2237 page_colors; 2238 2239 nszc = MIN(szc + 1, mmu_page_sizes - 1); 2240 nszc_color_shift = page_get_shift(nszc) - page_get_shift(szc); 2241 2242 /* cpu_page_colors is non-zero if a page color may be in > 1 bin */ 2243 cpucolors = cpu_page_colors; 2244 2245 /* 2246 * adjust cpucolors to possibly check additional 'equivalent' bins 2247 * to try to minimize fragmentation of large pages by delaying calls 2248 * to page_freelist_fill.
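 *
 * For example, with colors == 64 and colorequiv == 4, equivcolors is 16 and
 * cpucolors is lowered to 16, so a PG_MATCH_COLOR search below will also
 * accept a page from any of the 4 bins that map to the same equivalent
 * color before falling back to page_freelist_fill.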
2249 */ 2250 if (colorequiv > 1) { 2251 int equivcolors = colors / colorequiv; 2252 2253 if (equivcolors && (cpucolors == 0 || equivcolors < cpucolors)) 2254 cpucolors = equivcolors; 2255 } 2256 2257 ASSERT(colors <= page_colors); 2258 ASSERT(colors); 2259 ASSERT((colors & (colors - 1)) == 0); 2260 2261 ASSERT(bin < colors); 2262 2263 /* 2264 * Only hold one freelist lock at a time, that way we 2265 * can start anywhere and not have to worry about lock 2266 * ordering. 2267 */ 2268 big_try_again: 2269 fill_tried = 0; 2270 nwaycnt = 0; 2271 for (i = 0; i <= colors; i++) { 2272 try_again: 2273 ASSERT(bin < colors); 2274 if (PAGE_FREELISTS(mnode, szc, bin, mtype)) { 2275 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST); 2276 mutex_enter(pcm); 2277 pp = PAGE_FREELISTS(mnode, szc, bin, mtype); 2278 if (pp != NULL) { 2279 /* 2280 * These were set before the page 2281 * was put on the free list, 2282 * they must still be set. 2283 */ 2284 ASSERT(PP_ISFREE(pp)); 2285 ASSERT(PP_ISAGED(pp)); 2286 ASSERT(pp->p_vnode == NULL); 2287 ASSERT(pp->p_hash == NULL); 2288 ASSERT(pp->p_offset == (u_offset_t)-1); 2289 ASSERT(pp->p_szc == szc); 2290 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode); 2291 2292 /* 2293 * Walk down the hash chain. 2294 * 8k pages are linked on p_next 2295 * and p_prev fields. Large pages 2296 * are a contiguous group of 2297 * constituent pages linked together 2298 * on their p_next and p_prev fields. 2299 * The large pages are linked together 2300 * on the hash chain using p_vpnext 2301 * p_vpprev of the base constituent 2302 * page of each large page. 2303 */ 2304 first_pp = pp; 2305 while (!page_trylock_cons(pp, SE_EXCL)) { 2306 if (szc == 0) { 2307 pp = pp->p_next; 2308 } else { 2309 pp = pp->p_vpnext; 2310 } 2311 2312 ASSERT(PP_ISFREE(pp)); 2313 ASSERT(PP_ISAGED(pp)); 2314 ASSERT(pp->p_vnode == NULL); 2315 ASSERT(pp->p_hash == NULL); 2316 ASSERT(pp->p_offset == (u_offset_t)-1); 2317 ASSERT(pp->p_szc == szc); 2318 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == 2319 mnode); 2320 2321 if (pp == first_pp) { 2322 pp = NULL; 2323 break; 2324 } 2325 } 2326 2327 if (pp) { 2328 ASSERT(mtype == PP_2_MTYPE(pp)); 2329 ASSERT(pp->p_szc == szc); 2330 if (szc == 0) { 2331 page_sub(&PAGE_FREELISTS(mnode, 2332 szc, bin, mtype), pp); 2333 } else { 2334 page_vpsub(&PAGE_FREELISTS( 2335 mnode, szc, bin, mtype), 2336 pp); 2337 CHK_LPG(pp, szc); 2338 } 2339 page_ctr_sub(mnode, mtype, pp, 2340 PG_FREE_LIST); 2341 2342 if ((PP_ISFREE(pp) == 0) || 2343 (PP_ISAGED(pp) == 0)) 2344 panic("free page is not. pp %p", 2345 (void *)pp); 2346 mutex_exit(pcm); 2347 2348 #if defined(__sparc) 2349 ASSERT(!kcage_on || PP_ISNORELOC(pp) || 2350 (flags & PG_NORELOC) == 0); 2351 2352 if (PP_ISNORELOC(pp)) { 2353 pgcnt_t npgs; 2354 2355 npgs = page_get_pagecnt(szc); 2356 kcage_freemem_sub(npgs); 2357 } 2358 #endif 2359 VM_STAT_ADD(vmm_vmstats. 2360 pgmf_allocok[szc]); 2361 return (pp); 2362 } 2363 } 2364 mutex_exit(pcm); 2365 } 2366 2367 /* 2368 * Wow! The initial bin is empty. 2369 * If specific color is needed, check if page color may be 2370 * in other bins. cpucolors is: 2371 * 0 if the colors for this cpu is equal to page_colors. 2372 * This means that pages with a particular color are in a 2373 * single bin. 2374 * -1 if colors of cpus (cheetah+) are heterogenous. Need to 2375 * first determine the colors for the current cpu. 
2376 * >0 colors of all cpus are homogenous and < page_colors 2377 */ 2378 2379 if ((flags & PG_MATCH_COLOR) && (cpucolors != 0)) { 2380 if (!nwaybins) { 2381 /* 2382 * cpucolors is negative if ecache setsizes 2383 * are heterogenous. determine colors for this 2384 * particular cpu. 2385 */ 2386 if (cpucolors < 0) { 2387 cpucolors = CPUSETSIZE() / MMU_PAGESIZE; 2388 ASSERT(cpucolors > 0); 2389 nwaybins = colors / cpucolors; 2390 } else { 2391 nwaybins = colors / cpucolors; 2392 ASSERT(szc > 0 || nwaybins > 1); 2393 } 2394 if (nwaybins < 2) 2395 cpucolors = 0; 2396 } 2397 2398 if (cpucolors && (nwaycnt + 1 <= nwaybins)) { 2399 nwaycnt++; 2400 bin = (bin + (colors / nwaybins)) & 2401 (colors - 1); 2402 if (nwaycnt < nwaybins) { 2403 goto try_again; 2404 } 2405 } 2406 /* back to initial color if fall-thru */ 2407 } 2408 2409 /* 2410 * color bins are all empty if color match. Try and satisfy 2411 * the request by breaking up or coalescing pages from 2412 * a different size freelist of the correct color that 2413 * satisfies the ORIGINAL color requested. If that 2414 * fails then try pages of the same size but different 2415 * colors assuming we are not called with 2416 * PG_MATCH_COLOR. 2417 */ 2418 if (!fill_tried) { 2419 fill_tried = 1; 2420 fill_marker = bin >> nszc_color_shift; 2421 pp = page_freelist_fill(szc, bin, mnode, mtype, 2422 PFNNULL); 2423 if (pp != NULL) { 2424 return (pp); 2425 } 2426 } 2427 2428 if (flags & PG_MATCH_COLOR) 2429 break; 2430 2431 /* 2432 * Select next color bin to try. 2433 */ 2434 if (szc == 0) { 2435 /* 2436 * PAGESIZE page case. 2437 */ 2438 if (i == 0) { 2439 bin = (bin + BIN_STEP) & page_colors_mask; 2440 bin_marker = bin; 2441 } else { 2442 bin = (bin + vac_colors) & page_colors_mask; 2443 if (bin == bin_marker) { 2444 bin = (bin + 1) & page_colors_mask; 2445 bin_marker = bin; 2446 } 2447 } 2448 } else { 2449 /* 2450 * Large page case. 2451 */ 2452 bin = (bin + 1) & (colors - 1); 2453 } 2454 /* 2455 * If bin advanced to the next color bin of the 2456 * next larger pagesize, there is a chance the fill 2457 * could succeed. 2458 */ 2459 if (fill_marker != (bin >> nszc_color_shift)) 2460 fill_tried = 0; 2461 } 2462 2463 /* if allowed, cycle through additional mtypes */ 2464 MTYPE_NEXT(mnode, mtype, flags); 2465 if (mtype >= 0) 2466 goto big_try_again; 2467 2468 VM_STAT_ADD(vmm_vmstats.pgmf_allocfailed[szc]); 2469 2470 return (NULL); 2471 } 2472 2473 2474 /* 2475 * Returns the count of free pages for 'pp' with size code 'szc'. 2476 * Note: This function does not return an exact value as the page freelist 2477 * locks are not held and thus the values in the page_counters may be 2478 * changing as we walk through the data. 
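 *
 * For example, at each region level r, PAGE_COUNTERS(mnode, r, idx) counts
 * how many (r - 1) sized pages inside region idx are completely free; the
 * walk below credits the top-level counter first and then, descending one
 * level at a time, only adds the counts of sub-regions that were not already
 * counted as full one level up, so free pages are not counted twice.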
2479 */ 2480 static int 2481 page_freecnt(int mnode, page_t *pp, uchar_t szc) 2482 { 2483 pgcnt_t pgfree; 2484 pgcnt_t cnt; 2485 ssize_t r = szc; /* region size */ 2486 ssize_t idx; 2487 int i; 2488 int full, range; 2489 2490 /* Make sure pagenum passed in is aligned properly */ 2491 ASSERT((pp->p_pagenum & (PNUM_SIZE(szc) - 1)) == 0); 2492 ASSERT(szc > 0); 2493 2494 /* Prevent page_counters dynamic memory from being freed */ 2495 rw_enter(&page_ctrs_rwlock[mnode], RW_READER); 2496 idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum); 2497 cnt = PAGE_COUNTERS(mnode, r, idx); 2498 pgfree = cnt << PNUM_SHIFT(r - 1); 2499 range = FULL_REGION_CNT(szc); 2500 2501 /* Check for completely full region */ 2502 if (cnt == range) { 2503 rw_exit(&page_ctrs_rwlock[mnode]); 2504 return (pgfree); 2505 } 2506 2507 while (--r > 0) { 2508 idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum); 2509 full = FULL_REGION_CNT(r); 2510 for (i = 0; i < range; i++, idx++) { 2511 cnt = PAGE_COUNTERS(mnode, r, idx); 2512 /* 2513 * If cnt here is full, that means we have already 2514 * accounted for these pages earlier. 2515 */ 2516 if (cnt != full) { 2517 pgfree += (cnt << PNUM_SHIFT(r - 1)); 2518 } 2519 } 2520 range *= full; 2521 } 2522 rw_exit(&page_ctrs_rwlock[mnode]); 2523 return (pgfree); 2524 } 2525 2526 /* 2527 * Called from page_geti_contig_pages to exclusively lock constituent pages 2528 * starting from 'spp' for page size code 'szc'. 2529 * 2530 * If 'ptcpthreshold' is set, the number of free pages needed in the 'szc' 2531 * region needs to be greater than or equal to the threshold. 2532 */ 2533 static int 2534 page_trylock_contig_pages(int mnode, page_t *spp, uchar_t szc, int flags) 2535 { 2536 pgcnt_t pgcnt = PNUM_SIZE(szc); 2537 pgcnt_t pgfree, i; 2538 page_t *pp; 2539 2540 VM_STAT_ADD(vmm_vmstats.ptcp[szc]); 2541 2542 2543 if ((ptcpthreshold == 0) || (flags & PGI_PGCPHIPRI)) 2544 goto skipptcpcheck; 2545 /* 2546 * check if there are sufficient free pages available before attempting 2547 * to trylock. Count is approximate as page counters can change. 2548 */ 2549 pgfree = page_freecnt(mnode, spp, szc); 2550 2551 /* attempt to trylock if there are sufficient already free pages */ 2552 if (pgfree < pgcnt/ptcpthreshold) { 2553 VM_STAT_ADD(vmm_vmstats.ptcpfreethresh[szc]); 2554 return (0); 2555 } 2556 2557 skipptcpcheck: 2558 2559 for (i = 0; i < pgcnt; i++) { 2560 pp = &spp[i]; 2561 if (!page_trylock(pp, SE_EXCL)) { 2562 VM_STAT_ADD(vmm_vmstats.ptcpfailexcl[szc]); 2563 while (--i != (pgcnt_t)-1) { 2564 pp = &spp[i]; 2565 ASSERT(PAGE_EXCL(pp)); 2566 page_unlock_noretire(pp); 2567 } 2568 return (0); 2569 } 2570 ASSERT(spp[i].p_pagenum == spp->p_pagenum + i); 2571 if ((pp->p_szc > szc || (szc && pp->p_szc == szc)) && 2572 !PP_ISFREE(pp)) { 2573 VM_STAT_ADD(vmm_vmstats.ptcpfailszc[szc]); 2574 ASSERT(i == 0); 2575 page_unlock_noretire(pp); 2576 return (0); 2577 } 2578 if (PP_ISNORELOC(pp)) { 2579 VM_STAT_ADD(vmm_vmstats.ptcpfailcage[szc]); 2580 while (i != (pgcnt_t)-1) { 2581 pp = &spp[i]; 2582 ASSERT(PAGE_EXCL(pp)); 2583 page_unlock_noretire(pp); 2584 i--; 2585 } 2586 return (0); 2587 } 2588 } 2589 VM_STAT_ADD(vmm_vmstats.ptcpok[szc]); 2590 return (1); 2591 } 2592 2593 /* 2594 * Claim large page pointed to by 'pp'. 'pp' is the starting set 2595 * of 'szc' constituent pages that had been locked exclusively previously. 2596 * Will attempt to relocate constituent pages in use. 
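 *
 * In outline: free PG_FREE_LIST pages are pulled off with
 * page_list_sub_pages(), free cachelist pages are hashed out and retagged,
 * and in-use pages are relocated via page_get_replacement_page() and
 * do_page_relocate(). If a replacement cannot be found or relocation fails,
 * all pages processed so far are freed back and NULL is returned.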
2597 */ 2598 static page_t * 2599 page_claim_contig_pages(page_t *pp, uchar_t szc, int flags) 2600 { 2601 spgcnt_t pgcnt, npgs, i; 2602 page_t *targpp, *rpp, *hpp; 2603 page_t *replpp = NULL; 2604 page_t *pplist = NULL; 2605 2606 ASSERT(pp != NULL); 2607 2608 pgcnt = page_get_pagecnt(szc); 2609 while (pgcnt) { 2610 ASSERT(PAGE_EXCL(pp)); 2611 ASSERT(!PP_ISNORELOC(pp)); 2612 if (PP_ISFREE(pp)) { 2613 /* 2614 * If this is a PG_FREE_LIST page then its 2615 * size code can change underneath us due to 2616 * page promotion or demotion. As an optimization 2617 * use page_list_sub_pages() instead of 2618 * page_list_sub(). 2619 */ 2620 if (PP_ISAGED(pp)) { 2621 page_list_sub_pages(pp, szc); 2622 if (pp->p_szc == szc) { 2623 return (pp); 2624 } 2625 ASSERT(pp->p_szc < szc); 2626 npgs = page_get_pagecnt(pp->p_szc); 2627 hpp = pp; 2628 for (i = 0; i < npgs; i++, pp++) { 2629 pp->p_szc = szc; 2630 } 2631 page_list_concat(&pplist, &hpp); 2632 pgcnt -= npgs; 2633 continue; 2634 } 2635 ASSERT(!PP_ISAGED(pp)); 2636 ASSERT(pp->p_szc == 0); 2637 page_list_sub(pp, PG_CACHE_LIST); 2638 page_hashout(pp, NULL); 2639 PP_SETAGED(pp); 2640 pp->p_szc = szc; 2641 page_list_concat(&pplist, &pp); 2642 pp++; 2643 pgcnt--; 2644 continue; 2645 } 2646 npgs = page_get_pagecnt(pp->p_szc); 2647 2648 /* 2649 * page_create_wait freemem accounting is done by the caller of 2650 * page_get_freelist, so it is not necessary to call it prior to 2651 * calling page_get_replacement_page. 2652 * 2653 * page_get_replacement_page can call page_get_contig_pages 2654 * to acquire a large page (szc > 0); the replacement must be 2655 * smaller than the contig page size to avoid looping or 2656 * szc == 0 and PGI_PGCPSZC0 is set. 2657 */ 2658 if (pp->p_szc < szc || (szc == 0 && (flags & PGI_PGCPSZC0))) { 2659 replpp = page_get_replacement_page(pp, NULL, 0); 2660 if (replpp) { 2661 npgs = page_get_pagecnt(pp->p_szc); 2662 ASSERT(npgs <= pgcnt); 2663 targpp = pp; 2664 } 2665 } 2666 2667 /* 2668 * If replacement is NULL or do_page_relocate fails, fail 2669 * coalescing of pages. 2670 */ 2671 if (replpp == NULL || (do_page_relocate(&targpp, &replpp, 0, 2672 &npgs, NULL) != 0)) { 2673 /* 2674 * Unlock un-processed target list 2675 */ 2676 while (pgcnt--) { 2677 ASSERT(PAGE_EXCL(pp)); 2678 page_unlock_noretire(pp); 2679 pp++; 2680 } 2681 /* 2682 * Free the processed target list.
2683 */ 2684 while (pplist) { 2685 pp = pplist; 2686 page_sub(&pplist, pp); 2687 ASSERT(PAGE_EXCL(pp)); 2688 ASSERT(pp->p_szc == szc); 2689 ASSERT(PP_ISFREE(pp)); 2690 ASSERT(PP_ISAGED(pp)); 2691 pp->p_szc = 0; 2692 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL); 2693 page_unlock_noretire(pp); 2694 } 2695 2696 if (replpp != NULL) 2697 page_free_replacement_page(replpp); 2698 2699 return (NULL); 2700 } 2701 ASSERT(pp == targpp); 2702 2703 /* LINTED */ 2704 ASSERT(hpp = pp); /* That's right, it's an assignment */ 2705 2706 pp += npgs; 2707 pgcnt -= npgs; 2708 2709 while (npgs--) { 2710 ASSERT(PAGE_EXCL(targpp)); 2711 ASSERT(!PP_ISFREE(targpp)); 2712 ASSERT(!PP_ISNORELOC(targpp)); 2713 PP_SETFREE(targpp); 2714 ASSERT(PP_ISAGED(targpp)); 2715 ASSERT(targpp->p_szc < szc || (szc == 0 && 2716 (flags & PGI_PGCPSZC0))); 2717 targpp->p_szc = szc; 2718 targpp = targpp->p_next; 2719 2720 rpp = replpp; 2721 ASSERT(rpp != NULL); 2722 page_sub(&replpp, rpp); 2723 ASSERT(PAGE_EXCL(rpp)); 2724 ASSERT(!PP_ISFREE(rpp)); 2725 page_unlock_noretire(rpp); 2726 } 2727 ASSERT(targpp == hpp); 2728 ASSERT(replpp == NULL); 2729 page_list_concat(&pplist, &targpp); 2730 } 2731 CHK_LPG(pplist, szc); 2732 return (pplist); 2733 } 2734 2735 /* 2736 * Trim kernel cage from pfnlo-pfnhi and store result in lo-hi. Return code 2737 * of 0 means nothing left after trim. 2738 */ 2739 2740 int 2741 trimkcage(struct memseg *mseg, pfn_t *lo, pfn_t *hi, pfn_t pfnlo, pfn_t pfnhi) 2742 { 2743 pfn_t kcagepfn; 2744 int decr; 2745 int rc = 0; 2746 2747 if (PP_ISNORELOC(mseg->pages)) { 2748 if (PP_ISNORELOC(mseg->epages - 1) == 0) { 2749 2750 /* lower part of this mseg inside kernel cage */ 2751 decr = kcage_current_pfn(&kcagepfn); 2752 2753 /* kernel cage may have transitioned past mseg */ 2754 if (kcagepfn >= mseg->pages_base && 2755 kcagepfn < mseg->pages_end) { 2756 ASSERT(decr == 0); 2757 *lo = kcagepfn; 2758 *hi = MIN(pfnhi, 2759 (mseg->pages_end - 1)); 2760 rc = 1; 2761 } 2762 } 2763 /* else entire mseg in the cage */ 2764 } else { 2765 if (PP_ISNORELOC(mseg->epages - 1)) { 2766 2767 /* upper part of this mseg inside kernel cage */ 2768 decr = kcage_current_pfn(&kcagepfn); 2769 2770 /* kernel cage may have transitioned past mseg */ 2771 if (kcagepfn >= mseg->pages_base && 2772 kcagepfn < mseg->pages_end) { 2773 ASSERT(decr); 2774 *hi = kcagepfn; 2775 *lo = MAX(pfnlo, mseg->pages_base); 2776 rc = 1; 2777 } 2778 } else { 2779 /* entire mseg outside of kernel cage */ 2780 *lo = MAX(pfnlo, mseg->pages_base); 2781 *hi = MIN(pfnhi, (mseg->pages_end - 1)); 2782 rc = 1; 2783 } 2784 } 2785 return (rc); 2786 } 2787 2788 /* 2789 * called from page_get_contig_pages to search 'pfnlo' thru 'pfnhi' to "claim" a 2790 * page with size code 'szc'. Claiming such a page requires acquiring 2791 * exclusive locks on all constituent pages (page_trylock_contig_pages), 2792 * relocating pages in use and concatenating these constituent pages into a 2793 * large page. 2794 * 2795 * The page lists do not have such a large page and page_freelist_fill has 2796 * already failed to demote larger pages and/or coalesce smaller free pages. 2797 * 2798 * 'flags' may specify PG_COLOR_MATCH which would limit the search of large 2799 * pages with the same color as 'bin'. 2800 * 2801 * 'pfnflag' specifies the subset of the pfn range to search. 
2802 */ 2803 2804 2805 static page_t * 2806 page_geti_contig_pages(int mnode, uint_t bin, uchar_t szc, int flags, 2807 pfn_t pfnlo, pfn_t pfnhi, pgcnt_t pfnflag) 2808 { 2809 struct memseg *mseg; 2810 pgcnt_t szcpgcnt = page_get_pagecnt(szc); 2811 pgcnt_t szcpgmask = szcpgcnt - 1; 2812 pfn_t randpfn; 2813 page_t *pp, *randpp, *endpp; 2814 uint_t colors; 2815 pfn_t hi, lo; 2816 uint_t skip; 2817 2818 ASSERT(szc != 0 || (flags & PGI_PGCPSZC0)); 2819 2820 if ((pfnhi - pfnlo) + 1 < szcpgcnt) 2821 return (NULL); 2822 2823 ASSERT(szc < mmu_page_sizes); 2824 2825 colors = (szc) ? page_convert_color(0, szc, page_colors - 1) + 1 : 2826 page_colors; 2827 2828 ASSERT(bin < colors); 2829 2830 /* 2831 * trim the pfn range to search based on pfnflag. pfnflag is set 2832 * when there have been previous page_get_contig_page failures to 2833 * limit the search. 2834 * 2835 * The high bit in pfnflag specifies the number of 'slots' in the 2836 * pfn range and the remainder of pfnflag specifies which slot. 2837 * For example, a value of 1010b would mean the second slot of 2838 * the pfn range that has been divided into 8 slots. 2839 */ 2840 if (pfnflag > 1) { 2841 int slots = 1 << (highbit(pfnflag) - 1); 2842 int slotid = pfnflag & (slots - 1); 2843 pgcnt_t szcpages; 2844 int slotlen; 2845 2846 pfnlo = P2ROUNDUP(pfnlo, szcpgcnt); 2847 pfnhi = pfnhi & ~(szcpgcnt - 1); 2848 2849 szcpages = ((pfnhi - pfnlo) + 1) / szcpgcnt; 2850 slotlen = howmany(szcpages, slots); 2851 pfnlo = pfnlo + (((slotid * slotlen) % szcpages) * szcpgcnt); 2852 ASSERT(pfnlo < pfnhi); 2853 if (pfnhi > pfnlo + (slotlen * szcpgcnt)) 2854 pfnhi = pfnlo + (slotlen * szcpgcnt); 2855 } 2856 2857 memsegs_lock(0); 2858 2859 /* 2860 * loop through memsegs to look for contig page candidates 2861 */ 2862 2863 for (mseg = memsegs; mseg != NULL; mseg = mseg->next) { 2864 if (pfnhi < mseg->pages_base || pfnlo >= mseg->pages_end) { 2865 /* no overlap */ 2866 continue; 2867 } 2868 2869 if (mseg->pages_end - mseg->pages_base < szcpgcnt) 2870 /* mseg too small */ 2871 continue; 2872 2873 /* trim off kernel cage pages from pfn range */ 2874 if (kcage_on) { 2875 if (trimkcage(mseg, &lo, &hi, pfnlo, pfnhi) == 0) 2876 continue; 2877 } else { 2878 lo = MAX(pfnlo, mseg->pages_base); 2879 hi = MIN(pfnhi, (mseg->pages_end - 1)); 2880 } 2881 2882 /* round to szcpgcnt boundaries */ 2883 lo = P2ROUNDUP(lo, szcpgcnt); 2884 hi = hi & ~(szcpgcnt - 1); 2885 2886 if (hi <= lo) 2887 continue; 2888 2889 /* 2890 * set lo to point to the pfn for the desired bin. Large 2891 * page sizes may only have a single page color 2892 */ 2893 if ((colors > 1) && (flags & PG_MATCH_COLOR)) { 2894 uint_t lobin; 2895 2896 /* 2897 * factor in colorequiv to check additional 2898 * 'equivalent' bins. 2899 */ 2900 if (colorequiv > 1 && colors > colorequiv) 2901 colors = colors / colorequiv; 2902 2903 /* determine bin that lo currently points to */ 2904 lobin = (lo & ((szcpgcnt * colors) - 1)) / szcpgcnt; 2905 2906 /* 2907 * set lo to point at appropriate color and set skip 2908 * to arrive at the next szc page of the same color. 
2909 */ 2910 lo += ((bin - lobin) & (colors - 1)) * szcpgcnt; 2911 2912 skip = colors * szcpgcnt; 2913 } else { 2914 /* check all pages starting from lo */ 2915 skip = szcpgcnt; 2916 } 2917 if (hi <= lo) 2918 /* mseg cannot satisfy color request */ 2919 continue; 2920 2921 /* randomly choose a point between lo and hi to begin search */ 2922 2923 randpfn = (pfn_t)GETTICK(); 2924 randpfn = ((randpfn % (hi - lo)) + lo) & ~(skip - 1); 2925 randpp = mseg->pages + (randpfn - mseg->pages_base); 2926 2927 ASSERT(randpp->p_pagenum == randpfn); 2928 2929 pp = randpp; 2930 endpp = mseg->pages + (hi - mseg->pages_base); 2931 2932 ASSERT(randpp + szcpgcnt <= endpp); 2933 2934 do { 2935 ASSERT(!(pp->p_pagenum & szcpgmask)); 2936 ASSERT((flags & PG_MATCH_COLOR) == 0 || 2937 colorequiv > 1 || 2938 PP_2_BIN(pp) == bin); 2939 if (page_trylock_contig_pages(mnode, pp, szc, flags)) { 2940 /* pages unlocked by page_claim on failure */ 2941 if (page_claim_contig_pages(pp, szc, flags)) { 2942 memsegs_unlock(0); 2943 return (pp); 2944 } 2945 } 2946 2947 pp += skip; 2948 if (pp >= endpp) { 2949 /* start from the beginning */ 2950 pp = mseg->pages + (lo - mseg->pages_base); 2951 ASSERT(pp->p_pagenum == lo); 2952 ASSERT(pp + szcpgcnt <= endpp); 2953 } 2954 } while (pp != randpp); 2955 } 2956 memsegs_unlock(0); 2957 return (NULL); 2958 } 2959 2960 2961 /* 2962 * controlling routine that searches through physical memory in an attempt to 2963 * claim a large page based on the input parameters when no such page is 2964 * available on the page free lists. 2965 * 2966 * calls page_geti_contig_pages with an initial pfn range from the mnode 2967 * and mtype. page_geti_contig_pages will trim off the parts of the pfn range 2968 * that overlap with the kernel cage or do not match the requested page 2969 * color if PG_MATCH_COLOR is set. Since this search is very expensive, 2970 * page_geti_contig_pages may further limit the search range based on 2971 * previous failure counts (pgcpfailcnt[]). 2972 * 2973 * for PGI_PGCPSZC0 requests, page_get_contig_pages will relocate a base 2974 * pagesize page that satisfies mtype.
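 *
 * On success, pgcpfailcnt[szc] is halved, which doubles the slice of the
 * pfn range that the next search is allowed to cover; repeated failures
 * grow the count again via SETPGCPFAILCNT in page_get_freelist, shrinking
 * the slice.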
2975 */ 2976 page_t * 2977 page_get_contig_pages(int mnode, uint_t bin, int mtype, uchar_t szc, 2978 uint_t flags) 2979 { 2980 pfn_t pfnlo, pfnhi; /* contig pages pfn range */ 2981 page_t *pp; 2982 pgcnt_t pfnflag = 0; /* no limit on search if 0 */ 2983 2984 VM_STAT_ADD(vmm_vmstats.pgcp_alloc[szc]); 2985 2986 /* LINTED */ 2987 MTYPE_START(mnode, mtype, flags); 2988 if (mtype < 0) { /* mnode does not have memory in mtype range */ 2989 VM_STAT_ADD(vmm_vmstats.pgcp_allocempty[szc]); 2990 return (NULL); 2991 } 2992 2993 ASSERT(szc > 0 || (flags & PGI_PGCPSZC0)); 2994 2995 /* no allocations from cage */ 2996 flags |= PGI_NOCAGE; 2997 2998 /* do not limit search and ignore color if hi pri */ 2999 3000 if (pgcplimitsearch && ((flags & PGI_PGCPHIPRI) == 0)) 3001 pfnflag = pgcpfailcnt[szc]; 3002 3003 /* remove color match to improve chances */ 3004 3005 if (flags & PGI_PGCPHIPRI || pfnflag) 3006 flags &= ~PG_MATCH_COLOR; 3007 3008 do { 3009 /* get pfn range based on mnode and mtype */ 3010 MNODETYPE_2_PFN(mnode, mtype, pfnlo, pfnhi); 3011 3012 ASSERT(pfnhi >= pfnlo); 3013 3014 pp = page_geti_contig_pages(mnode, bin, szc, flags, 3015 pfnlo, pfnhi, pfnflag); 3016 3017 if (pp != NULL) { 3018 pfnflag = pgcpfailcnt[szc]; 3019 if (pfnflag) { 3020 /* double the search size */ 3021 pgcpfailcnt[szc] = pfnflag >> 1; 3022 } 3023 VM_STAT_ADD(vmm_vmstats.pgcp_allocok[szc]); 3024 return (pp); 3025 } 3026 MTYPE_NEXT(mnode, mtype, flags); 3027 } while (mtype >= 0); 3028 3029 VM_STAT_ADD(vmm_vmstats.pgcp_allocfailed[szc]); 3030 return (NULL); 3031 } 3032 3033 3034 /* 3035 * Find the `best' page on the freelist for this (vp,off) (as,vaddr) pair. 3036 * 3037 * Does its own locking and accounting. 3038 * If PG_MATCH_COLOR is set, then NULL will be returned if there are no 3039 * pages of the proper color even if there are pages of a different color. 3040 * 3041 * Finds a page, removes it, THEN locks it. 3042 */ 3043 3044 /*ARGSUSED*/ 3045 page_t * 3046 page_get_freelist(struct vnode *vp, u_offset_t off, struct seg *seg, 3047 caddr_t vaddr, size_t size, uint_t flags, struct lgrp *lgrp) 3048 { 3049 struct as *as = seg->s_as; 3050 page_t *pp = NULL; 3051 ulong_t bin; 3052 uchar_t szc; 3053 int mnode; 3054 int mtype; 3055 page_t *(*page_get_func)(int, uint_t, int, uchar_t, uint_t); 3056 lgrp_mnode_cookie_t lgrp_cookie; 3057 3058 page_get_func = page_get_mnode_freelist; 3059 3060 /* 3061 * If we aren't passed a specific lgroup, or passed a freed lgrp 3062 * assume we wish to allocate near to the current thread's home. 3063 */ 3064 if (!LGRP_EXISTS(lgrp)) 3065 lgrp = lgrp_home_lgrp(); 3066 3067 if (kcage_on) { 3068 if ((flags & (PG_NORELOC | PG_PANIC)) == PG_NORELOC && 3069 kcage_freemem < kcage_throttlefree + btop(size) && 3070 curthread != kcage_cageout_thread) { 3071 /* 3072 * Set a "reserve" of kcage_throttlefree pages for 3073 * PG_PANIC and cageout thread allocations. 3074 * 3075 * Everybody else has to serialize in 3076 * page_create_get_something() to get a cage page, so 3077 * that we don't deadlock cageout! 3078 */ 3079 return (NULL); 3080 } 3081 } else { 3082 flags &= ~PG_NORELOC; 3083 flags |= PGI_NOCAGE; 3084 } 3085 3086 /* LINTED */ 3087 MTYPE_INIT(mtype, vp, vaddr, flags, size); 3088 3089 /* 3090 * Convert size to page size code. 
3091 */ 3092 if ((szc = page_szc(size)) == (uchar_t)-1) 3093 panic("page_get_freelist: illegal page size request"); 3094 ASSERT(szc < mmu_page_sizes); 3095 3096 VM_STAT_ADD(vmm_vmstats.pgf_alloc[szc]); 3097 3098 /* LINTED */ 3099 AS_2_BIN(as, seg, vp, vaddr, bin); 3100 3101 /* bin is for base pagesize color - convert if larger pagesize. */ 3102 if (szc) 3103 bin = page_convert_color(0, szc, bin); 3104 3105 /* 3106 * Try to get a local page first, but try remote if we can't 3107 * get a page of the right color. 3108 */ 3109 pgretry: 3110 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL); 3111 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { 3112 pp = page_get_func(mnode, bin, mtype, szc, flags); 3113 if (pp != NULL) { 3114 VM_STAT_ADD(vmm_vmstats.pgf_allocok[szc]); 3115 DTRACE_PROBE4(page__get, 3116 lgrp_t *, lgrp, 3117 int, mnode, 3118 ulong_t, bin, 3119 uint_t, flags); 3120 return (pp); 3121 } 3122 } 3123 ASSERT(pp == NULL); 3124 3125 /* 3126 * for non-SZC0 PAGESIZE requests, check cachelist before checking 3127 * remote free lists. Caller expected to call page_get_cachelist which 3128 * will check local cache lists and remote free lists. 3129 */ 3130 if (szc == 0 && ((flags & PGI_PGCPSZC0) == 0)) { 3131 VM_STAT_ADD(vmm_vmstats.pgf_allocdeferred); 3132 return (NULL); 3133 } 3134 3135 ASSERT(szc > 0 || (flags & PGI_PGCPSZC0)); 3136 3137 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1); 3138 3139 /* 3140 * Try to get a non-local freelist page. 3141 */ 3142 LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie); 3143 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { 3144 pp = page_get_func(mnode, bin, mtype, szc, flags); 3145 if (pp != NULL) { 3146 DTRACE_PROBE4(page__get, 3147 lgrp_t *, lgrp, 3148 int, mnode, 3149 ulong_t, bin, 3150 uint_t, flags); 3151 VM_STAT_ADD(vmm_vmstats.pgf_allocokrem[szc]); 3152 return (pp); 3153 } 3154 } 3155 3156 ASSERT(pp == NULL); 3157 3158 /* 3159 * When the cage is off, chances are page_get_contig_pages() will fail 3160 * to lock a large page chunk, so in that case it is not called by 3161 * default. This can be changed via /etc/system. 3162 * 3163 * page_get_contig_pages() is also called to acquire a base pagesize 3164 * page for page_create_get_something(). 3165 */ 3166 if (!(flags & PG_NORELOC) && (pg_contig_disable == 0) && 3167 (kcage_on || pg_lpgcreate_nocage || szc == 0) && 3168 (page_get_func != page_get_contig_pages)) { 3169 3170 VM_STAT_ADD(vmm_vmstats.pgf_allocretry[szc]); 3171 page_get_func = page_get_contig_pages; 3172 goto pgretry; 3173 } 3174 3175 if (pgcplimitsearch && page_get_func == page_get_contig_pages) 3176 SETPGCPFAILCNT(szc); 3177 3178 VM_STAT_ADD(vmm_vmstats.pgf_allocfailed[szc]); 3179 return (NULL); 3180 } 3181 3182 /* 3183 * Find the `best' page on the cachelist for this (vp,off) (as,vaddr) pair. 3184 * 3185 * Does its own locking. 3186 * If PG_MATCH_COLOR is set, then NULL will be returned if there are no 3187 * pages of the proper color even if there are pages of a different color. 3188 * Otherwise, scan the bins for ones with pages. For each bin with pages, 3189 * try to lock one of them. If no page can be locked, try the 3190 * next bin. Return NULL if a page cannot be found and locked. 3191 * 3192 * Finds a page, tries to lock it, then removes it.
3193 */ 3194 3195 /*ARGSUSED*/ 3196 page_t * 3197 page_get_cachelist(struct vnode *vp, u_offset_t off, struct seg *seg, 3198 caddr_t vaddr, uint_t flags, struct lgrp *lgrp) 3199 { 3200 page_t *pp; 3201 struct as *as = seg->s_as; 3202 ulong_t bin; 3203 /*LINTED*/ 3204 int mnode; 3205 int mtype; 3206 lgrp_mnode_cookie_t lgrp_cookie; 3207 3208 /* 3209 * If we aren't passed a specific lgroup, or passed a freed lgrp, 3210 * assume we wish to allocate near to the current thread's home. 3211 */ 3212 if (!LGRP_EXISTS(lgrp)) 3213 lgrp = lgrp_home_lgrp(); 3214 3215 if (!kcage_on) { 3216 flags &= ~PG_NORELOC; 3217 flags |= PGI_NOCAGE; 3218 } 3219 3220 if ((flags & (PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == PG_NORELOC && 3221 kcage_freemem <= kcage_throttlefree) { 3222 /* 3223 * Reserve kcage_throttlefree pages for critical kernel 3224 * threads. 3225 * 3226 * Everybody else has to go to page_create_get_something() 3227 * to get a cage page, so we don't deadlock cageout. 3228 */ 3229 return (NULL); 3230 } 3231 3232 /* LINTED */ 3233 AS_2_BIN(as, seg, vp, vaddr, bin); 3234 3235 ASSERT(bin <= page_colors_mask); 3236 3237 /* LINTED */ 3238 MTYPE_INIT(mtype, vp, vaddr, flags, MMU_PAGESIZE); 3239 3240 VM_STAT_ADD(vmm_vmstats.pgc_alloc); 3241 3242 /* 3243 * Try local cachelists first 3244 */ 3245 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL); 3246 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { 3247 pp = page_get_mnode_cachelist(bin, flags, mnode, mtype); 3248 if (pp != NULL) { 3249 VM_STAT_ADD(vmm_vmstats.pgc_allocok); 3250 DTRACE_PROBE4(page__get, 3251 lgrp_t *, lgrp, 3252 int, mnode, 3253 ulong_t, bin, 3254 uint_t, flags); 3255 return (pp); 3256 } 3257 } 3258 3259 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1); 3260 3261 /* 3262 * Try freelists/cachelists that are farther away. 3263 * This is our only chance to allocate remote pages for PAGESIZE 3264 * requests. 3265 */ 3266 LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie); 3267 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { 3268 pp = page_get_mnode_freelist(mnode, bin, mtype, 3269 0, flags); 3270 if (pp != NULL) { 3271 VM_STAT_ADD(vmm_vmstats.pgc_allocokdeferred); 3272 DTRACE_PROBE4(page__get, 3273 lgrp_t *, lgrp, 3274 int, mnode, 3275 ulong_t, bin, 3276 uint_t, flags); 3277 return (pp); 3278 } 3279 pp = page_get_mnode_cachelist(bin, flags, mnode, mtype); 3280 if (pp != NULL) { 3281 VM_STAT_ADD(vmm_vmstats.pgc_allocokrem); 3282 DTRACE_PROBE4(page__get, 3283 lgrp_t *, lgrp, 3284 int, mnode, 3285 ulong_t, bin, 3286 uint_t, flags); 3287 return (pp); 3288 } 3289 } 3290 3291 VM_STAT_ADD(vmm_vmstats.pgc_allocfailed); 3292 return (NULL); 3293 } 3294 3295 page_t * 3296 page_get_mnode_cachelist(uint_t bin, uint_t flags, int mnode, int mtype) 3297 { 3298 kmutex_t *pcm; 3299 int i; 3300 page_t *pp; 3301 page_t *first_pp; 3302 uint_t bin_marker; 3303 int nwaybins, nwaycnt; 3304 int cpucolors; 3305 3306 VM_STAT_ADD(vmm_vmstats.pgmc_alloc); 3307 3308 /* LINTED */ 3309 MTYPE_START(mnode, mtype, flags); 3310 if (mtype < 0) { /* mnode does not have memory in mtype range */ 3311 VM_STAT_ADD(vmm_vmstats.pgmc_allocempty); 3312 return (NULL); 3313 } 3314 3315 nwaybins = 0; 3316 cpucolors = cpu_page_colors; 3317 /* 3318 * adjust cpucolors to possibly check additional 'equivalent' bins 3319 * to try to minimize fragmentation of large pages by delaying calls 3320 * to page_freelist_fill.
3321 */ 3322 if (colorequiv > 1) { 3323 int equivcolors = page_colors / colorequiv; 3324 3325 if (equivcolors && (cpucolors == 0 || equivcolors < cpucolors)) 3326 cpucolors = equivcolors; 3327 } 3328 3329 /* 3330 * Only hold one cachelist lock at a time, that way we 3331 * can start anywhere and not have to worry about lock 3332 * ordering. 3333 */ 3334 3335 big_try_again: 3336 nwaycnt = 0; 3337 for (i = 0; i <= page_colors; i++) { 3338 if (PAGE_CACHELISTS(mnode, bin, mtype)) { 3339 pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST); 3340 mutex_enter(pcm); 3341 pp = PAGE_CACHELISTS(mnode, bin, mtype); 3342 if (pp != NULL) { 3343 first_pp = pp; 3344 ASSERT(pp->p_vnode); 3345 ASSERT(PP_ISAGED(pp) == 0); 3346 ASSERT(pp->p_szc == 0); 3347 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode); 3348 while (!page_trylock(pp, SE_EXCL)) { 3349 pp = pp->p_next; 3350 ASSERT(pp->p_szc == 0); 3351 if (pp == first_pp) { 3352 /* 3353 * We have searched the 3354 * complete list! 3355 * And all of them (might 3356 * only be one) are locked. 3357 * This can happen since 3358 * these pages can also be 3359 * found via the hash list. 3360 * When found via the hash 3361 * list, they are locked 3362 * first, then removed. 3363 * We give up to let the 3364 * other thread run. 3365 */ 3366 pp = NULL; 3367 break; 3368 } 3369 ASSERT(pp->p_vnode); 3370 ASSERT(PP_ISFREE(pp)); 3371 ASSERT(PP_ISAGED(pp) == 0); 3372 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == 3373 mnode); 3374 } 3375 3376 if (pp) { 3377 page_t **ppp; 3378 /* 3379 * Found and locked a page. 3380 * Pull it off the list. 3381 */ 3382 ASSERT(mtype == PP_2_MTYPE(pp)); 3383 ppp = &PAGE_CACHELISTS(mnode, bin, 3384 mtype); 3385 page_sub(ppp, pp); 3386 /* 3387 * Subtract counters before releasing 3388 * pcm mutex to avoid a race with 3389 * page_freelist_coalesce and 3390 * page_freelist_fill. 3391 */ 3392 page_ctr_sub(mnode, mtype, pp, 3393 PG_CACHE_LIST); 3394 mutex_exit(pcm); 3395 ASSERT(pp->p_vnode); 3396 ASSERT(PP_ISAGED(pp) == 0); 3397 #if defined(__sparc) 3398 ASSERT(!kcage_on || 3399 (flags & PG_NORELOC) == 0 || 3400 PP_ISNORELOC(pp)); 3401 if (PP_ISNORELOC(pp)) { 3402 kcage_freemem_sub(1); 3403 } 3404 #endif 3405 VM_STAT_ADD(vmm_vmstats. 3406 pgmc_allocok); 3407 return (pp); 3408 } 3409 } 3410 mutex_exit(pcm); 3411 } 3412 3413 /* 3414 * Wow! The initial bin is empty or no page in the bin could 3415 * be locked. 3416 * 3417 * If specific color is needed, check if page color may be in 3418 * other bins. 
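 *
 * For example, with page_colors == 32 and cpucolors == 8, nwaybins is 4 and
 * the equivalent bins are page_colors / nwaybins == 8 apart, so the search
 * visits bin, bin + 8, bin + 16 and bin + 24 before giving up on this color.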
3419 */ 3420 if ((flags & PG_MATCH_COLOR) && (cpucolors != 0)) { 3421 if (!nwaybins) { 3422 if (cpucolors < 0) { 3423 cpucolors = CPUSETSIZE() / MMU_PAGESIZE; 3424 ASSERT(cpucolors > 0); 3425 nwaybins = page_colors / cpucolors; 3426 if (nwaybins < 2) 3427 cpucolors = 0; 3428 } else { 3429 nwaybins = page_colors / cpucolors; 3430 ASSERT(nwaybins > 1); 3431 } 3432 } 3433 3434 if (++nwaycnt >= nwaybins) { 3435 break; 3436 } 3437 bin = (bin + (page_colors / nwaybins)) & 3438 page_colors_mask; 3439 continue; 3440 } 3441 3442 if (i == 0) { 3443 bin = (bin + BIN_STEP) & page_colors_mask; 3444 bin_marker = bin; 3445 } else { 3446 bin = (bin + vac_colors) & page_colors_mask; 3447 if (bin == bin_marker) { 3448 bin = (bin + 1) & page_colors_mask; 3449 bin_marker = bin; 3450 } 3451 } 3452 } 3453 3454 MTYPE_NEXT(mnode, mtype, flags); 3455 if (mtype >= 0) 3456 goto big_try_again; 3457 3458 VM_STAT_ADD(vmm_vmstats.pgmc_allocfailed); 3459 return (NULL); 3460 } 3461 3462 #ifdef DEBUG 3463 #define REPL_PAGE_STATS 3464 #endif /* DEBUG */ 3465 3466 #ifdef REPL_PAGE_STATS 3467 struct repl_page_stats { 3468 uint_t ngets; 3469 uint_t ngets_noreloc; 3470 uint_t npgr_noreloc; 3471 uint_t nnopage_first; 3472 uint_t nnopage; 3473 uint_t nhashout; 3474 uint_t nnofree; 3475 uint_t nnext_pp; 3476 } repl_page_stats; 3477 #define REPL_STAT_INCR(v) atomic_add_32(&repl_page_stats.v, 1) 3478 #else /* REPL_PAGE_STATS */ 3479 #define REPL_STAT_INCR(v) 3480 #endif /* REPL_PAGE_STATS */ 3481 3482 int pgrppgcp; 3483 3484 /* 3485 * The freemem accounting must be done by the caller. 3486 * First we try to get a replacement page of the same size as like_pp, 3487 * if that is not possible, then we just get a set of discontiguous 3488 * PAGESIZE pages. 3489 */ 3490 page_t * 3491 page_get_replacement_page(page_t *orig_like_pp, struct lgrp *lgrp_target, 3492 uint_t pgrflags) 3493 { 3494 page_t *like_pp; 3495 page_t *pp, *pplist; 3496 page_t *pl = NULL; 3497 ulong_t bin; 3498 int mnode, page_mnode; 3499 int szc; 3500 spgcnt_t npgs, pg_cnt; 3501 pfn_t pfnum; 3502 int mtype; 3503 int flags = 0; 3504 lgrp_mnode_cookie_t lgrp_cookie; 3505 lgrp_t *lgrp; 3506 3507 REPL_STAT_INCR(ngets); 3508 like_pp = orig_like_pp; 3509 ASSERT(PAGE_EXCL(like_pp)); 3510 3511 szc = like_pp->p_szc; 3512 npgs = page_get_pagecnt(szc); 3513 /* 3514 * Now we reset like_pp to the base page_t. 3515 * That way, we won't walk past the end of this 'szc' page. 3516 */ 3517 pfnum = PFN_BASE(like_pp->p_pagenum, szc); 3518 like_pp = page_numtopp_nolock(pfnum); 3519 ASSERT(like_pp->p_szc == szc); 3520 3521 if (PP_ISNORELOC(like_pp)) { 3522 ASSERT(kcage_on); 3523 REPL_STAT_INCR(ngets_noreloc); 3524 flags = PGI_RELOCONLY; 3525 } else if (pgrflags & PGR_NORELOC) { 3526 ASSERT(kcage_on); 3527 REPL_STAT_INCR(npgr_noreloc); 3528 flags = PG_NORELOC; 3529 } 3530 3531 /* 3532 * Kernel pages must always be replaced with the same size 3533 * pages, since we cannot properly handle demotion of kernel 3534 * pages. 3535 */ 3536 if (like_pp->p_vnode == &kvp) 3537 pgrflags |= PGR_SAMESZC; 3538 3539 /* LINTED */ 3540 MTYPE_PGR_INIT(mtype, flags, like_pp, page_mnode, npgs); 3541 3542 while (npgs) { 3543 pplist = NULL; 3544 for (;;) { 3545 pg_cnt = page_get_pagecnt(szc); 3546 bin = PP_2_BIN(like_pp); 3547 ASSERT(like_pp->p_szc == orig_like_pp->p_szc); 3548 ASSERT(pg_cnt <= npgs); 3549 3550 /* 3551 * If an lgroup was specified, try to get the 3552 * page from that lgroup. 
3553 * NOTE: Must be careful with code below because 3554 * lgroup may disappear and reappear since there 3555 * is no locking for lgroup here. 3556 */ 3557 if (LGRP_EXISTS(lgrp_target)) { 3558 /* 3559 * Keep local variable for lgroup separate 3560 * from lgroup argument since this code should 3561 * only be exercised when lgroup argument 3562 * exists.... 3563 */ 3564 lgrp = lgrp_target; 3565 3566 /* Try the lgroup's freelists first */ 3567 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, 3568 LGRP_SRCH_LOCAL); 3569 while ((pplist == NULL) && 3570 (mnode = lgrp_memnode_choose(&lgrp_cookie)) 3571 != -1) { 3572 pplist = page_get_mnode_freelist( 3573 mnode, bin, mtype, szc, 3574 flags); 3575 } 3576 3577 /* 3578 * Now try its cachelists if this is a 3579 * small page. Don't need to do it for 3580 * larger ones since page_freelist_coalesce() 3581 * already failed. 3582 */ 3583 if (pplist != NULL || szc != 0) 3584 break; 3585 3586 /* Now try its cachelists */ 3587 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, 3588 LGRP_SRCH_LOCAL); 3589 3590 while ((pplist == NULL) && 3591 (mnode = lgrp_memnode_choose(&lgrp_cookie)) 3592 != -1) { 3593 pplist = page_get_mnode_cachelist( 3594 bin, flags, mnode, mtype); 3595 } 3596 if (pplist != NULL) { 3597 page_hashout(pplist, NULL); 3598 PP_SETAGED(pplist); 3599 REPL_STAT_INCR(nhashout); 3600 break; 3601 } 3602 /* Done looking in this lgroup. Bail out. */ 3603 break; 3604 } 3605 3606 /* 3607 * No lgroup was specified (or lgroup was removed by 3608 * DR), so just try to get the page as close to 3609 * like_pp's mnode as possible. 3610 * First try the local freelist... 3611 */ 3612 mnode = PP_2_MEM_NODE(like_pp); 3613 pplist = page_get_mnode_freelist(mnode, bin, 3614 mtype, szc, flags); 3615 if (pplist != NULL) 3616 break; 3617 3618 REPL_STAT_INCR(nnofree); 3619 3620 /* 3621 * ...then the local cachelist. Don't need to do it for 3622 * larger pages because page_freelist_coalesce() already 3623 * failed there anyway. 3624 */ 3625 if (szc == 0) { 3626 pplist = page_get_mnode_cachelist(bin, flags, 3627 mnode, mtype); 3628 if (pplist != NULL) { 3629 page_hashout(pplist, NULL); 3630 PP_SETAGED(pplist); 3631 REPL_STAT_INCR(nhashout); 3632 break; 3633 } 3634 } 3635 3636 /* Now try remote freelists */ 3637 page_mnode = mnode; 3638 lgrp = 3639 lgrp_hand_to_lgrp(MEM_NODE_2_LGRPHAND(page_mnode)); 3640 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, 3641 LGRP_SRCH_HIER); 3642 while (pplist == NULL && 3643 (mnode = lgrp_memnode_choose(&lgrp_cookie)) 3644 != -1) { 3645 /* 3646 * Skip local mnode. 3647 */ 3648 if ((mnode == page_mnode) || 3649 (mem_node_config[mnode].exists == 0)) 3650 continue; 3651 3652 pplist = page_get_mnode_freelist(mnode, 3653 bin, mtype, szc, flags); 3654 } 3655 3656 if (pplist != NULL) 3657 break; 3658 3659 3660 /* Now try remote cachelists */ 3661 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, 3662 LGRP_SRCH_HIER); 3663 while (pplist == NULL && szc == 0) { 3664 mnode = lgrp_memnode_choose(&lgrp_cookie); 3665 if (mnode == -1) 3666 break; 3667 /* 3668 * Skip local mnode. 3669 */ 3670 if ((mnode == page_mnode) || 3671 (mem_node_config[mnode].exists == 0)) 3672 continue; 3673 3674 pplist = page_get_mnode_cachelist(bin, 3675 flags, mnode, mtype); 3676 3677 if (pplist != NULL) { 3678 page_hashout(pplist, NULL); 3679 PP_SETAGED(pplist); 3680 REPL_STAT_INCR(nhashout); 3681 break; 3682 } 3683 } 3684 3685 /* 3686 * Break out of while loop under the following cases: 3687 * - If we successfully got a page.
3688 * - If pgrflags specified only returning a specific 3689 * page size and we could not find that page size. 3690 * - If we could not satisfy the request with PAGESIZE 3691 * or larger pages. 3692 */ 3693 if (pplist != NULL || szc == 0) 3694 break; 3695 3696 if ((pgrflags & PGR_SAMESZC) || pgrppgcp) { 3697 /* try to find contig page */ 3698 3699 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, 3700 LGRP_SRCH_HIER); 3701 3702 while ((pplist == NULL) && 3703 (mnode = 3704 lgrp_memnode_choose(&lgrp_cookie)) 3705 != -1) { 3706 pplist = page_get_contig_pages( 3707 mnode, bin, mtype, szc, 3708 flags | PGI_PGCPHIPRI); 3709 } 3710 break; 3711 } 3712 3713 /* 3714 * The correct thing to do here is try the next 3715 * page size down using szc--. Due to a bug 3716 * with the processing of HAT_RELOAD_SHARE 3717 * where the sfmmu_ttecnt arrays of all 3718 * hats sharing an ISM segment don't get updated, 3719 * using intermediate size pages for relocation 3720 * can lead to continuous page faults. 3721 */ 3722 szc = 0; 3723 } 3724 3725 if (pplist != NULL) { 3726 DTRACE_PROBE4(page__get, 3727 lgrp_t *, lgrp, 3728 int, mnode, 3729 ulong_t, bin, 3730 uint_t, flags); 3731 3732 while (pplist != NULL && pg_cnt--) { 3733 ASSERT(pplist != NULL); 3734 pp = pplist; 3735 page_sub(&pplist, pp); 3736 PP_CLRFREE(pp); 3737 PP_CLRAGED(pp); 3738 page_list_concat(&pl, &pp); 3739 npgs--; 3740 like_pp = like_pp + 1; 3741 REPL_STAT_INCR(nnext_pp); 3742 } 3743 ASSERT(pg_cnt == 0); 3744 } else { 3745 break; 3746 } 3747 } 3748 3749 if (npgs) { 3750 /* 3751 * We were unable to allocate the necessary number 3752 * of pages. 3753 * We need to free up any pl. 3754 */ 3755 REPL_STAT_INCR(nnopage); 3756 page_free_replacement_page(pl); 3757 return (NULL); 3758 } else { 3759 return (pl); 3760 } 3761 } 3762 3763 /* 3764 * demote a free large page to its constituent pages 3765 */ 3766 void 3767 page_demote_free_pages(page_t *pp) 3768 { 3769 3770 int mnode; 3771 3772 ASSERT(pp != NULL); 3773 ASSERT(PAGE_LOCKED(pp)); 3774 ASSERT(PP_ISFREE(pp)); 3775 ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes); 3776 3777 mnode = PP_2_MEM_NODE(pp); 3778 page_freelist_lock(mnode); 3779 if (pp->p_szc != 0) { 3780 (void) page_demote(mnode, PFN_BASE(pp->p_pagenum, 3781 pp->p_szc), pp->p_szc, 0, PC_NO_COLOR, PC_FREE); 3782 } 3783 page_freelist_unlock(mnode); 3784 ASSERT(pp->p_szc == 0); 3785 } 3786
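/*
 * Illustrative sketch of a hypothetical caller: a thread holding the
 * exclusive page lock on a free large page can break it up in place,
 * e.g. for some pfn of interest:
 *
 *	page_t *pp = page_numtopp_nolock(pfn);
 *	if (pp != NULL && page_trylock(pp, SE_EXCL)) {
 *		if (PP_ISFREE(pp) && pp->p_szc != 0)
 *			page_demote_free_pages(pp);
 *		page_unlock_noretire(pp);
 *	}
 *
 * page_demote_free_pages() takes the freelist lock itself, so the caller
 * only needs to hold the exclusive page lock on pp.
 */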