/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989  AT&T	*/
/*	  All Rights Reserved	*/

/*
 * Portions of this source code were derived from Berkeley 4.3 BSD
 * under license from the Regents of the University of California.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * This file contains common functions to access and manage the page lists.
 * Many of these routines originated from platform dependent modules
 * (sun4/vm/vm_dep.c, i86pc/vm/vm_machdep.c) and were modified to function in
 * a platform independent manner.
 *
 * vm/vm_dep.h provides for platform specific support.
 */

#include <sys/types.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/systm.h>
#include <sys/atomic.h>
#include <sys/sysmacros.h>
#include <vm/as.h>
#include <vm/page.h>
#include <vm/seg_kmem.h>
#include <vm/seg_vn.h>
#include <sys/memnode.h>
#include <vm/vm_dep.h>
#include <sys/lgrp.h>
#include <sys/mem_config.h>
#include <sys/callb.h>
#include <sys/mem_cage.h>
#include <sys/sdt.h>

extern uint_t	vac_colors;

#define	MAX_PRAGMA_ALIGN	128

/* vm_cpu_data0 for the boot cpu before kmem is initialized */

#if L2CACHE_ALIGN_MAX <= MAX_PRAGMA_ALIGN
#pragma align	L2CACHE_ALIGN_MAX(vm_cpu_data0)
#else
#pragma align	MAX_PRAGMA_ALIGN(vm_cpu_data0)
#endif
char		vm_cpu_data0[VM_CPU_DATA_PADSIZE];

/*
 * number of page colors equivalent to requested color in page_get routines.
 * If set, keeps large pages intact longer and keeps MPO allocation
 * from the local mnode in favor of acquiring the 'correct' page color from
 * a demoted large page or from a remote mnode.
 */
int	colorequiv;

/*
 * if set, specifies the percentage of large pages that are free from within
 * a large page region before attempting to lock those pages for
 * page_get_contig_pages processing.
 *
 * Should be turned on when kpr is available when page_trylock_contig_pages
 * can be more selective.
 */

int	ptcpthreshold;

/*
 * Limit page get contig page search based on failure cnts in pgcpfailcnt[].
 * Use slot 0 (base page size unused) to enable or disable limiting search.
 * Enabled by default.
 */
int	pgcpfailcnt[MMU_PAGE_SIZES];
int	pgcplimitsearch = 1;

#ifdef VM_STATS
struct vmm_vmstats_str  vmm_vmstats;

#endif /* VM_STATS */

#if defined(__sparc)
#define	LPGCREATE	0
#else
/* enable page_get_contig_pages */
#define	LPGCREATE	1
#endif

int pg_contig_disable;
int pg_lpgcreate_nocage = LPGCREATE;

/*
 * page_freelist_fill pfn flag to signify no hi pfn requirement.
 */
#define	PFNNULL		0

/* Flags involved in promotion and demotion routines */
#define	PC_FREE		0x1	/* put page on freelist */
#define	PC_ALLOC	0x2	/* return page for allocation */

/*
 * Flag for page_demote to be used with PC_FREE to denote that we don't care
 * what the color is as the color parameter to the function is ignored.
 */
#define	PC_NO_COLOR	(-1)

/*
 * page counters candidates info
 * See page_ctrs_cands comment below for more details.
 * fields are as follows:
 *	pcc_pages_free:		# pages which freelist coalesce can create
 *	pcc_color_free_len:	number of elements in pcc_color_free array
 *	pcc_color_free:		pointer to page free counts per color
 */
typedef struct pcc_info {
	pgcnt_t	pcc_pages_free;
	int	pcc_color_free_len;
	pgcnt_t	*pcc_color_free;
} pcc_info_t;

/*
 * On big machines it can take a long time to check page_counters
 * arrays. page_ctrs_cands is a summary array whose elements are a dynamically
 * updated sum of all elements of the corresponding page_counters arrays.
 * page_freelist_coalesce() searches page_counters only if an appropriate
 * element of page_ctrs_cands array is greater than 0.
 *
 * An extra dimension is used for page_ctrs_cands to spread the elements
 * over a few e$ cache lines to avoid serialization during the array
 * updates.
 */
#pragma	align 64(page_ctrs_cands)

static pcc_info_t *page_ctrs_cands[NPC_MUTEX][MMU_PAGE_SIZES];

/*
 * Return in val the total number of free pages which can be created
 * for the given mnode (m) and region size (r)
 */
#define	PGCTRS_CANDS_GETVALUE(m, r, val) {				\
	int i;								\
	val = 0;							\
	for (i = 0; i < NPC_MUTEX; i++) {				\
	    val += page_ctrs_cands[i][(r)][(m)].pcc_pages_free;		\
	}								\
}

/*
 * Return in val the total number of free pages which can be created
 * for the given mnode (m), region size (r), and color (c)
 */
#define	PGCTRS_CANDS_GETVALUECOLOR(m, r, c, val) {			\
	int i;								\
	val = 0;							\
	ASSERT((c) < page_ctrs_cands[0][(r)][(m)].pcc_color_free_len);	\
	for (i = 0; i < NPC_MUTEX; i++) {				\
	    val += page_ctrs_cands[i][(r)][(m)].pcc_color_free[(c)];	\
	}								\
}

/*
 * We can only allow a single thread to update a counter within the physical
 * range of the largest supported page size. That is the finest granularity
 * possible since the counter values are dependent on each other
 * as you move across region sizes. PP_CTR_LOCK_INDX is used to determine the
 * ctr_mutex lock index for a particular physical range.
 */
static kmutex_t	*ctr_mutex[NPC_MUTEX];

#define	PP_CTR_LOCK_INDX(pp)						\
	(((pp)->p_pagenum >>						\
	    (PAGE_BSZS_SHIFT(mmu_page_sizes - 1))) & (NPC_MUTEX - 1))

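/*
 * Worked example for PP_CTR_LOCK_INDX (illustrative only; the numbers assume
 * a configuration with an 8K base page, a 4M largest page and NPC_MUTEX == 4,
 * none of which are guaranteed here):
 *
 *	PAGE_BSZS_SHIFT(mmu_page_sizes - 1) would then be 9 (4M / 8K == 512
 *	base pages), so pfns are grouped into 4M-sized ranges and the range
 *	number is folded into one of the 4 ctr_mutex locks:
 *
 *	pfn 0x000..0x1ff -> lock 0, pfn 0x200..0x3ff -> lock 1,
 *	pfn 0x400..0x5ff -> lock 2, pfn 0x600..0x7ff -> lock 3,
 *	pfn 0x800..0x9ff -> lock 0 again, and so on.
 */
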
/*
 * Local function prototypes.
 */

void page_ctr_add(int, int, page_t *, int);
void page_ctr_add_internal(int, int, page_t *, int);
void page_ctr_sub(int, int, page_t *, int);
uint_t page_convert_color(uchar_t, uchar_t, uint_t);
void page_freelist_lock(int);
void page_freelist_unlock(int);
page_t *page_promote(int, pfn_t, uchar_t, int);
page_t *page_demote(int, pfn_t, uchar_t, uchar_t, int, int);
page_t *page_freelist_fill(uchar_t, int, int, int, pfn_t);
page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int);
static int page_trylock_cons(page_t *pp, se_t se);

#define	PNUM_SIZE(szc)							\
	(hw_page_array[(szc)].hp_size >> hw_page_array[0].hp_shift)
#define	PNUM_SHIFT(szc)							\
	(hw_page_array[(szc)].hp_shift - hw_page_array[0].hp_shift)

/*
 * The page_counters array below is used to keep track of free contiguous
 * physical memory.  A hw_page_map_t will be allocated per mnode per szc.
 * This contains an array of counters, the size of the array, a shift value
 * used to convert a pagenum into a counter array index or vice versa, as
 * well as a cache of the last successful index to be promoted to a larger
 * page size.  As an optimization, we keep track of the last successful index
 * to be promoted per page color for the given size region, and this is
 * allocated dynamically based upon the number of colors for a given
 * region size.
 *
 * Conceptually, the page counters are represented as:
 *
 *	page_counters[region_size][mnode]
 *
 *	region_size:	size code of a candidate larger page made up
 *			of contiguous free smaller pages.
 *
 *	page_counters[region_size][mnode].hpm_counters[index]:
 *		represents how many (region_size - 1) pages either
 *		exist or can be created within the given index range.
 *
 * Let's look at a sparc example:
 *	If we want to create a free 512k page, we look at region_size 2
 *	for the mnode we want.  We calculate the index and look at a specific
 *	hpm_counters location.  If we see 8 (FULL_REGION_CNT on sparc) at
 *	this location, it means that 8 64k pages either exist or can be
 *	created from 8K pages in order to make a single free 512k page at
 *	the given index.  Note that when a region is full, it will contribute
 *	to the counts in the region above it.  Thus we will not know what page
 *	size the free pages will be which can be promoted to this new free
 *	page unless we look at all regions below the current region.
 */

/*
 * Note: hpmctr_t is defined in platform vm_dep.h
 * hw_page_map_t contains all the information needed for the page_counters
 * logic. The fields are as follows:
 *
 *	hpm_counters:	dynamically allocated array to hold counter data
 *	hpm_entries:	entries in hpm_counters
 *	hpm_shift:	shift for pnum/array index conv
 *	hpm_base:	PFN mapped to counter index 0
 *	hpm_color_current_len:	# of elements in hpm_color_current "array"
 *				below
 *	hpm_color_current:	last index in counter array for this color at
 *				which we successfully created a large page
 */
typedef struct hw_page_map {
	hpmctr_t	*hpm_counters;
	size_t		hpm_entries;
	int		hpm_shift;
	pfn_t		hpm_base;
	size_t		hpm_color_current_len;
	size_t		*hpm_color_current;
} hw_page_map_t;

/*
 * Element zero is not used, but is allocated for convenience.
 */
static hw_page_map_t *page_counters[MMU_PAGE_SIZES];
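
/*
 * Worked example of the PNUM_TO_IDX()/IDX_TO_PNUM() arithmetic defined below
 * (illustrative only; the numbers assume the sparc sizes used in the comment
 * above: 8K base pages and a 512K region, i.e. rg_szc 2 with a shift of 6
 * since 512K / 8K == 64, and a hypothetical hpm_base of 0x1000):
 *
 *	A pfn is turned into a counter index by subtracting hpm_base and
 *	shifting right by hpm_shift:
 *
 *	    PNUM_TO_IDX(mnode, 2, 0x1040) == (0x1040 - 0x1000) >> 6 == 1
 *
 *	and going the other way recovers the 512K-aligned pfn of that region:
 *
 *	    IDX_TO_PNUM(mnode, 2, 1) == 0x1000 + (1 << 6) == 0x1040
 */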

/*
 * The following macros are convenient ways to get access to the individual
 * elements of the page_counters arrays.  They can be used on both
 * the left side and right side of equations.
 */
#define	PAGE_COUNTERS(mnode, rg_szc, idx)			\
	(page_counters[(rg_szc)][(mnode)].hpm_counters[(idx)])

#define	PAGE_COUNTERS_COUNTERS(mnode, rg_szc)			\
	(page_counters[(rg_szc)][(mnode)].hpm_counters)

#define	PAGE_COUNTERS_SHIFT(mnode, rg_szc)			\
	(page_counters[(rg_szc)][(mnode)].hpm_shift)

#define	PAGE_COUNTERS_ENTRIES(mnode, rg_szc)			\
	(page_counters[(rg_szc)][(mnode)].hpm_entries)

#define	PAGE_COUNTERS_BASE(mnode, rg_szc)			\
	(page_counters[(rg_szc)][(mnode)].hpm_base)

#define	PAGE_COUNTERS_CURRENT_COLOR_LEN(mnode, rg_szc)		\
	(page_counters[(rg_szc)][(mnode)].hpm_color_current_len)

#define	PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, rg_szc)	\
	(page_counters[(rg_szc)][(mnode)].hpm_color_current)

#define	PAGE_COUNTERS_CURRENT_COLOR(mnode, rg_szc, color)	\
	(page_counters[(rg_szc)][(mnode)].hpm_color_current[(color)])

#define	PNUM_TO_IDX(mnode, rg_szc, pnum)			\
	(((pnum) - PAGE_COUNTERS_BASE((mnode), (rg_szc))) >>	\
	    PAGE_COUNTERS_SHIFT((mnode), (rg_szc)))

#define	IDX_TO_PNUM(mnode, rg_szc, index)			\
	(PAGE_COUNTERS_BASE((mnode), (rg_szc)) +		\
	    ((index) << PAGE_COUNTERS_SHIFT((mnode), (rg_szc))))

/*
 * Protects the hpm_counters and hpm_color_current memory from changing while
 * looking at page counters information.
 * Grab the write lock to modify what these fields point at.
 * Grab the read lock to prevent any pointers from changing.
 * The write lock can not be held during memory allocation due to a possible
 * recursion deadlock with trying to grab the read lock while the
 * write lock is already held.
 */
krwlock_t page_ctrs_rwlock[MAX_MEM_NODES];


/*
 * initialize cpu_vm_data to point at cache aligned vm_cpu_data_t.
 */
void
cpu_vm_data_init(struct cpu *cp)
{
	if (cp == CPU0) {
		cp->cpu_vm_data = (void *)&vm_cpu_data0;
	} else {
		void	*kmptr;
		int	align;
		size_t	sz;

		align = (L2CACHE_ALIGN) ? L2CACHE_ALIGN : L2CACHE_ALIGN_MAX;
		sz = P2ROUNDUP(sizeof (vm_cpu_data_t), align) + align;
		kmptr = kmem_zalloc(sz, KM_SLEEP);
		cp->cpu_vm_data = (void *) P2ROUNDUP((uintptr_t)kmptr, align);
		((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr = kmptr;
		((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize = sz;
	}
}

/*
 * free cpu_vm_data
 */
void
cpu_vm_data_destroy(struct cpu *cp)
{
	if (cp->cpu_seqid && cp->cpu_vm_data) {
		ASSERT(cp != CPU0);
		kmem_free(((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr,
		    ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize);
	}
	cp->cpu_vm_data = NULL;
}


/*
 * page size to page size code
 */
int
page_szc(size_t pagesize)
{
	int	i = 0;

	while (hw_page_array[i].hp_size) {
		if (pagesize == hw_page_array[i].hp_size)
			return (i);
		i++;
	}
	return (-1);
}

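/*
 * For example (assuming an 8K base page, as on sparc), page_szc(8192)
 * returns 0, page_szc(65536) returns 1 if 64K is the next supported size in
 * hw_page_array, and page_szc() of a size not present in hw_page_array
 * returns -1.
 */
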
/*
 * page size to page size code with the restriction that it be a supported
 * user page size.  If it's not a supported user page size, -1 will be
 * returned.
 */
int
page_szc_user_filtered(size_t pagesize)
{
	int szc = page_szc(pagesize);
	if ((szc != -1) && (SZC_2_USERSZC(szc) != -1)) {
		return (szc);
	}
	return (-1);
}

/*
 * Return how many page sizes are available for the user to use.  This is
 * what the hardware supports and not based upon how the OS implements the
 * support of different page sizes.
 */
uint_t
page_num_user_pagesizes(void)
{
	return (mmu_exported_page_sizes);
}

uint_t
page_num_pagesizes(void)
{
	return (mmu_page_sizes);
}

/*
 * returns the number of base pagesize pages associated with szc
 */
pgcnt_t
page_get_pagecnt(uint_t szc)
{
	if (szc >= mmu_page_sizes)
		panic("page_get_pagecnt: out of range %d", szc);
	return (hw_page_array[szc].hp_pgcnt);
}

size_t
page_get_pagesize(uint_t szc)
{
	if (szc >= mmu_page_sizes)
		panic("page_get_pagesize: out of range %d", szc);
	return (hw_page_array[szc].hp_size);
}

/*
 * Return the size of a page based upon the index passed in.  An index of
 * zero refers to the smallest page size in the system, and as index increases
 * it refers to the next larger supported page size in the system.
 * Note that szc and userszc may not be the same due to unsupported szc's on
 * some systems.
 */
size_t
page_get_user_pagesize(uint_t userszc)
{
	uint_t szc = USERSZC_2_SZC(userszc);

	if (szc >= mmu_page_sizes)
		panic("page_get_user_pagesize: out of range %d", szc);
	return (hw_page_array[szc].hp_size);
}

uint_t
page_get_shift(uint_t szc)
{
	if (szc >= mmu_page_sizes)
		panic("page_get_shift: out of range %d", szc);
	return (hw_page_array[szc].hp_shift);
}

uint_t
page_get_pagecolors(uint_t szc)
{
	ASSERT(page_colors != 0);
	return (MAX(page_colors >> PAGE_BSZS_SHIFT(szc), 1));
}

/*
 * Called by startup().
 * Size up the per page size free list counters based on physmax
 * of each node and max_mem_nodes.
 */
size_t
page_ctrs_sz(void)
{
	int	r;		/* region size */
	int	mnode;
	uint_t	ctrs_sz = 0;
	int	i;
	pgcnt_t colors_per_szc[MMU_PAGE_SIZES];

	/*
	 * We need to determine how many page colors there are for each
	 * page size in order to allocate memory for any color specific
	 * arrays.
	 */
	colors_per_szc[0] = page_colors;
	for (i = 1; i < mmu_page_sizes; i++) {
		colors_per_szc[i] =
		    page_convert_color(0, i, page_colors - 1) + 1;
	}

	for (mnode = 0; mnode < max_mem_nodes; mnode++) {

		pgcnt_t r_pgcnt;
		pfn_t   r_base;
		pgcnt_t r_align;

		if (mem_node_config[mnode].exists == 0)
			continue;

		/*
		 * determine size needed for page counter arrays with
		 * base aligned to large page size.
		 */
		for (r = 1; r < mmu_page_sizes; r++) {
			/* add in space for hpm_counters */
			r_align = page_get_pagecnt(r);
			r_base = mem_node_config[mnode].physbase;
			r_base &= ~(r_align - 1);
			r_pgcnt = howmany(mem_node_config[mnode].physmax -
			    r_base, r_align);
			/*
			 * Round up to always allocate on pointer sized
			 * boundaries.
			 */
			ctrs_sz += P2ROUNDUP((r_pgcnt * sizeof (hpmctr_t)),
			    sizeof (hpmctr_t *));

			/* add in space for hpm_color_current */
			ctrs_sz += (colors_per_szc[r] *
			    sizeof (size_t));
		}
	}

	for (r = 1; r < mmu_page_sizes; r++) {
		ctrs_sz += (max_mem_nodes * sizeof (hw_page_map_t));

		/* add in space for page_ctrs_cands */
		ctrs_sz += NPC_MUTEX * max_mem_nodes * (sizeof (pcc_info_t));
		ctrs_sz += NPC_MUTEX * max_mem_nodes * colors_per_szc[r] *
		    sizeof (pgcnt_t);
	}

	/* ctr_mutex */
	ctrs_sz += (max_mem_nodes * NPC_MUTEX * sizeof (kmutex_t));

	/* size for page list counts */
	PLCNT_SZ(ctrs_sz);

	/*
	 * add some slop for roundups. page_ctrs_alloc will roundup the start
	 * address of the counters to ecache_alignsize boundary for every
	 * memory node.
	 */
	return (ctrs_sz + max_mem_nodes * L2CACHE_ALIGN);
}

caddr_t
page_ctrs_alloc(caddr_t alloc_base)
{
	int	mnode;
	int	r;		/* region size */
	int	i;
	pgcnt_t colors_per_szc[MMU_PAGE_SIZES];

	/*
	 * We need to determine how many page colors there are for each
	 * page size in order to allocate memory for any color specific
	 * arrays.
	 */
	colors_per_szc[0] = page_colors;
	for (i = 1; i < mmu_page_sizes; i++) {
		colors_per_szc[i] =
		    page_convert_color(0, i, page_colors - 1) + 1;
	}

	for (r = 1; r < mmu_page_sizes; r++) {
		page_counters[r] = (hw_page_map_t *)alloc_base;
		alloc_base += (max_mem_nodes * sizeof (hw_page_map_t));
	}

	/* page_ctrs_cands */
	for (r = 1; r < mmu_page_sizes; r++) {
		for (i = 0; i < NPC_MUTEX; i++) {
			page_ctrs_cands[i][r] = (pcc_info_t *)alloc_base;
			alloc_base += max_mem_nodes * (sizeof (pcc_info_t));

		}
	}

	/* page_ctrs_cands pcc_color_free array */
	for (r = 1; r < mmu_page_sizes; r++) {
		for (i = 0; i < NPC_MUTEX; i++) {
			for (mnode = 0; mnode < max_mem_nodes; mnode++) {
				page_ctrs_cands[i][r][mnode].pcc_color_free_len
				    = colors_per_szc[r];
				page_ctrs_cands[i][r][mnode].pcc_color_free =
				    (pgcnt_t *)alloc_base;
				alloc_base += colors_per_szc[r] *
				    sizeof (pgcnt_t);
			}
		}
	}

	/* ctr_mutex */
	for (i = 0; i < NPC_MUTEX; i++) {
		ctr_mutex[i] = (kmutex_t *)alloc_base;
		alloc_base += (max_mem_nodes * sizeof (kmutex_t));
	}

	/* initialize page list counts */
	PLCNT_INIT(alloc_base);

	for (mnode = 0; mnode < max_mem_nodes; mnode++) {

		pgcnt_t r_pgcnt;
		pfn_t	r_base;
		pgcnt_t r_align;
		int	r_shift;

		if (mem_node_config[mnode].exists == 0)
			continue;

		for (r = 1; r < mmu_page_sizes; r++) {
			/*
			 * the page_counters base has to be aligned to the
			 * page count of page size code r, otherwise the
			 * counts will cross large page boundaries.
			 */
			r_align = page_get_pagecnt(r);
			r_base = mem_node_config[mnode].physbase;
			/* base needs to be aligned - lower to aligned value */
			r_base &= ~(r_align - 1);
			r_pgcnt = howmany(mem_node_config[mnode].physmax -
			    r_base, r_align);
			r_shift = PAGE_BSZS_SHIFT(r);

			PAGE_COUNTERS_SHIFT(mnode, r) = r_shift;
			PAGE_COUNTERS_ENTRIES(mnode, r) = r_pgcnt;
			PAGE_COUNTERS_BASE(mnode, r) = r_base;
			PAGE_COUNTERS_CURRENT_COLOR_LEN(mnode, r) =
			    colors_per_szc[r];
			PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r) =
			    (size_t *)alloc_base;
			alloc_base += (sizeof (size_t) * colors_per_szc[r]);
			for (i = 0; i < colors_per_szc[r]; i++) {
				PAGE_COUNTERS_CURRENT_COLOR(mnode, r, i) = i;
			}
			PAGE_COUNTERS_COUNTERS(mnode, r) =
			    (hpmctr_t *)alloc_base;
			/*
			 * Round up to make alloc_base always be aligned on
			 * a pointer boundary.
			 */
			alloc_base += P2ROUNDUP((sizeof (hpmctr_t) * r_pgcnt),
			    sizeof (hpmctr_t *));

			/*
			 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
			 * satisfy the identity requirement.
			 * We should be able to go from one to the other
			 * and get consistent values.
			 */
			ASSERT(PNUM_TO_IDX(mnode, r,
			    (IDX_TO_PNUM(mnode, r, 0))) == 0);
			ASSERT(IDX_TO_PNUM(mnode, r,
			    (PNUM_TO_IDX(mnode, r, r_base))) == r_base);
		}
		/*
		 * Roundup the start address of the page_counters to
		 * cache aligned boundary for every memory node.
		 * page_ctrs_sz() has added some slop for these roundups.
		 */
		alloc_base = (caddr_t)P2ROUNDUP((uintptr_t)alloc_base,
		    L2CACHE_ALIGN);
	}

	/* Initialize other page counter specific data structures. */
	for (mnode = 0; mnode < MAX_MEM_NODES; mnode++) {
		rw_init(&page_ctrs_rwlock[mnode], NULL, RW_DEFAULT, NULL);
	}

	return (alloc_base);
}

/*
 * Functions to adjust region counters for each size free list.
 * Caller is responsible to acquire the ctr_mutex lock if necessary and
 * thus can be called during startup without locks.
 */
/* ARGSUSED */
void
page_ctr_add_internal(int mnode, int mtype, page_t *pp, int flags)
{
	ssize_t		r;	/* region size */
	ssize_t		idx;
	pfn_t		pfnum;
	int		lckidx;

	ASSERT(mnode == PP_2_MEM_NODE(pp));
	ASSERT(mtype == PP_2_MTYPE(pp));

	ASSERT(pp->p_szc < mmu_page_sizes);

	PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);

	/* no counter update needed for largest page size */
	if (pp->p_szc >= mmu_page_sizes - 1) {
		return;
	}

	r = pp->p_szc + 1;
	pfnum = pp->p_pagenum;
	lckidx = PP_CTR_LOCK_INDX(pp);

	/*
	 * Increment the count of free pages for the current
	 * region. Continue looping up in region size incrementing
	 * count if the preceding region is full.
	 */
	while (r < mmu_page_sizes) {
		idx = PNUM_TO_IDX(mnode, r, pfnum);

		ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
		ASSERT(PAGE_COUNTERS(mnode, r, idx) < FULL_REGION_CNT(r));

		if (++PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r))
			break;

		page_ctrs_cands[lckidx][r][mnode].pcc_pages_free++;
		page_ctrs_cands[lckidx][r][mnode].
		    pcc_color_free[PP_2_BIN_SZC(pp, r)]++;
		r++;
	}
}

void
page_ctr_add(int mnode, int mtype, page_t *pp, int flags)
{
	int		lckidx = PP_CTR_LOCK_INDX(pp);
	kmutex_t	*lock = &ctr_mutex[lckidx][mnode];

	mutex_enter(lock);
	page_ctr_add_internal(mnode, mtype, pp, flags);
	mutex_exit(lock);
}

void
page_ctr_sub(int mnode, int mtype, page_t *pp, int flags)
{
	int		lckidx;
	kmutex_t	*lock;
	ssize_t		r;	/* region size */
	ssize_t		idx;
	pfn_t		pfnum;

	ASSERT(mnode == PP_2_MEM_NODE(pp));
	ASSERT(mtype == PP_2_MTYPE(pp));

	ASSERT(pp->p_szc < mmu_page_sizes);

	PLCNT_DECR(pp, mnode, mtype, pp->p_szc, flags);

	/* no counter update needed for largest page size */
	if (pp->p_szc >= mmu_page_sizes - 1) {
		return;
	}

	r = pp->p_szc + 1;
	pfnum = pp->p_pagenum;
	lckidx = PP_CTR_LOCK_INDX(pp);
	lock = &ctr_mutex[lckidx][mnode];

	/*
	 * Decrement the count of free pages for the current
	 * region. Continue looping up in region size decrementing
	 * count if the preceding region was full.
	 */
	mutex_enter(lock);
	while (r < mmu_page_sizes) {
		idx = PNUM_TO_IDX(mnode, r, pfnum);

		ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
		ASSERT(PAGE_COUNTERS(mnode, r, idx) > 0);

		if (--PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r) - 1) {
			break;
		}
		ASSERT(page_ctrs_cands[lckidx][r][mnode].pcc_pages_free != 0);
		ASSERT(page_ctrs_cands[lckidx][r][mnode].
		    pcc_color_free[PP_2_BIN_SZC(pp, r)] != 0);

		page_ctrs_cands[lckidx][r][mnode].pcc_pages_free--;
		page_ctrs_cands[lckidx][r][mnode].
		    pcc_color_free[PP_2_BIN_SZC(pp, r)]--;
		r++;
	}
	mutex_exit(lock);
}

/*
 * Adjust page counters following a memory attach, since typically the
 * size of the array needs to change, and the PFN to counter index
 * mapping needs to change.
 */
uint_t
page_ctrs_adjust(int mnode)
{
	pgcnt_t npgs;
	int	r;		/* region size */
	int	i;
	size_t	pcsz, old_csz;
	hpmctr_t *new_ctr, *old_ctr;
	pfn_t	oldbase, newbase;
	size_t	old_npgs;
	hpmctr_t *ctr_cache[MMU_PAGE_SIZES];
	size_t	size_cache[MMU_PAGE_SIZES];
	size_t	*color_cache[MMU_PAGE_SIZES];
	size_t	*old_color_array;
	pgcnt_t	colors_per_szc[MMU_PAGE_SIZES];

	newbase = mem_node_config[mnode].physbase & ~PC_BASE_ALIGN_MASK;
	npgs = roundup(mem_node_config[mnode].physmax,
	    PC_BASE_ALIGN) - newbase;

	/*
	 * We need to determine how many page colors there are for each
	 * page size in order to allocate memory for any color specific
	 * arrays.
	 */
	colors_per_szc[0] = page_colors;
	for (r = 1; r < mmu_page_sizes; r++) {
		colors_per_szc[r] =
		    page_convert_color(0, r, page_colors - 1) + 1;
	}

	/*
	 * Preallocate all of the new hpm_counters arrays as we can't
	 * hold the page_ctrs_rwlock as a writer and allocate memory.
	 * If we can't allocate all of the arrays, undo our work so far
	 * and return failure.
	 */
	for (r = 1; r < mmu_page_sizes; r++) {
		pcsz = npgs >> PAGE_BSZS_SHIFT(r);

		ctr_cache[r] = kmem_zalloc(pcsz *
		    sizeof (hpmctr_t), KM_NOSLEEP);
		if (ctr_cache[r] == NULL) {
			while (--r >= 1) {
				kmem_free(ctr_cache[r],
				    size_cache[r] * sizeof (hpmctr_t));
			}
			return (ENOMEM);
		}
		size_cache[r] = pcsz;
	}
	/*
	 * Preallocate all of the new color current arrays as we can't
	 * hold the page_ctrs_rwlock as a writer and allocate memory.
	 * If we can't allocate all of the arrays, undo our work so far
	 * and return failure.
	 */
	for (r = 1; r < mmu_page_sizes; r++) {
		color_cache[r] = kmem_zalloc(sizeof (size_t) *
		    colors_per_szc[r], KM_NOSLEEP);
		if (color_cache[r] == NULL) {
			while (--r >= 1) {
				kmem_free(color_cache[r],
				    colors_per_szc[r] * sizeof (size_t));
			}
			for (r = 1; r < mmu_page_sizes; r++) {
				kmem_free(ctr_cache[r],
				    size_cache[r] * sizeof (hpmctr_t));
			}
			return (ENOMEM);
		}
	}

	/*
	 * Grab the write lock to prevent others from walking these arrays
	 * while we are modifying them.
	 */
	rw_enter(&page_ctrs_rwlock[mnode], RW_WRITER);
	page_freelist_lock(mnode);
	for (r = 1; r < mmu_page_sizes; r++) {
		PAGE_COUNTERS_SHIFT(mnode, r) = PAGE_BSZS_SHIFT(r);
		old_ctr = PAGE_COUNTERS_COUNTERS(mnode, r);
		old_csz = PAGE_COUNTERS_ENTRIES(mnode, r);
		oldbase = PAGE_COUNTERS_BASE(mnode, r);
		old_npgs = old_csz << PAGE_COUNTERS_SHIFT(mnode, r);
		old_color_array = PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r);

		pcsz = npgs >> PAGE_COUNTERS_SHIFT(mnode, r);
		new_ctr = ctr_cache[r];
		ctr_cache[r] = NULL;
		if (old_ctr != NULL &&
		    (oldbase + old_npgs > newbase) &&
		    (newbase + npgs > oldbase)) {
			/*
			 * Map the intersection of the old and new
			 * counters into the new array.
			 */
			size_t offset;
			if (newbase > oldbase) {
				offset = (newbase - oldbase) >>
				    PAGE_COUNTERS_SHIFT(mnode, r);
				bcopy(old_ctr + offset, new_ctr,
				    MIN(pcsz, (old_csz - offset)) *
				    sizeof (hpmctr_t));
			} else {
				offset = (oldbase - newbase) >>
				    PAGE_COUNTERS_SHIFT(mnode, r);
				bcopy(old_ctr, new_ctr + offset,
				    MIN(pcsz - offset, old_csz) *
				    sizeof (hpmctr_t));
			}
		}

		PAGE_COUNTERS_COUNTERS(mnode, r) = new_ctr;
		PAGE_COUNTERS_ENTRIES(mnode, r) = pcsz;
		PAGE_COUNTERS_BASE(mnode, r) = newbase;
		PAGE_COUNTERS_CURRENT_COLOR_LEN(mnode, r) = colors_per_szc[r];
		PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r) = color_cache[r];
		color_cache[r] = NULL;
		/*
		 * for now, just reset on these events as it's probably
		 * not worthwhile to try and optimize this.
		 */
		for (i = 0; i < colors_per_szc[r]; i++) {
			PAGE_COUNTERS_CURRENT_COLOR(mnode, r, i) = i;
		}

		/* cache info for freeing out of the critical path */
		if ((caddr_t)old_ctr >= kernelheap &&
		    (caddr_t)old_ctr < ekernelheap) {
			ctr_cache[r] = old_ctr;
			size_cache[r] = old_csz;
		}
		if ((caddr_t)old_color_array >= kernelheap &&
		    (caddr_t)old_color_array < ekernelheap) {
			color_cache[r] = old_color_array;
		}
		/*
		 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
		 * satisfy the identity requirement.
		 * We should be able to go from one to the other
		 * and get consistent values.
		 */
		ASSERT(PNUM_TO_IDX(mnode, r,
		    (IDX_TO_PNUM(mnode, r, 0))) == 0);
		ASSERT(IDX_TO_PNUM(mnode, r,
		    (PNUM_TO_IDX(mnode, r, newbase))) == newbase);
	}
	page_freelist_unlock(mnode);
	rw_exit(&page_ctrs_rwlock[mnode]);

	/*
	 * Now that we have dropped the write lock, it is safe to free all
	 * of the memory we have cached above.
	 */
	for (r = 1; r < mmu_page_sizes; r++) {
		if (ctr_cache[r] != NULL) {
			kmem_free(ctr_cache[r],
			    size_cache[r] * sizeof (hpmctr_t));
		}
		if (color_cache[r] != NULL) {
			kmem_free(color_cache[r],
			    colors_per_szc[r] * sizeof (size_t));
		}
	}
	return (0);
}

/*
 * color contains a valid color index or bin for cur_szc
 */
uint_t
page_convert_color(uchar_t cur_szc, uchar_t new_szc, uint_t color)
{
	uint_t shift;

	if (cur_szc > new_szc) {
		shift = page_get_shift(cur_szc) - page_get_shift(new_szc);
		return (color << shift);
	} else if (cur_szc < new_szc) {
		shift = page_get_shift(new_szc) - page_get_shift(cur_szc);
		return (color >> shift);
	}
	return (color);
}

#ifdef DEBUG

/*
 * confirm pp is a large page corresponding to szc
 */
void
chk_lpg(page_t *pp, uchar_t szc)
{
	spgcnt_t npgs = page_get_pagecnt(pp->p_szc);
	uint_t noreloc;

	if (npgs == 1) {
		ASSERT(pp->p_szc == 0);
		ASSERT(pp->p_next == pp);
		ASSERT(pp->p_prev == pp);
		return;
	}

	ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL);
	ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL);

	ASSERT(IS_P2ALIGNED(pp->p_pagenum, npgs));
	ASSERT(pp->p_pagenum == (pp->p_next->p_pagenum - 1));
	ASSERT(pp->p_prev->p_pagenum == (pp->p_pagenum + (npgs - 1)));
	ASSERT(pp->p_prev == (pp + (npgs - 1)));

	/*
	 * Check list of pages.
	 */
	noreloc = PP_ISNORELOC(pp);
	while (npgs--) {
		if (npgs != 0) {
			ASSERT(pp->p_pagenum == pp->p_next->p_pagenum - 1);
			ASSERT(pp->p_next == (pp + 1));
		}
		ASSERT(pp->p_szc == szc);
		ASSERT(PP_ISFREE(pp));
		ASSERT(PP_ISAGED(pp));
		ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL);
		ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL);
		ASSERT(pp->p_vnode == NULL);
		ASSERT(PP_ISNORELOC(pp) == noreloc);

		pp = pp->p_next;
	}
}
#endif /* DEBUG */

void
page_freelist_lock(int mnode)
{
	int i;
	for (i = 0; i < NPC_MUTEX; i++) {
		mutex_enter(FPC_MUTEX(mnode, i));
		mutex_enter(CPC_MUTEX(mnode, i));
	}
}

void
page_freelist_unlock(int mnode)
{
	int i;
	for (i = 0; i < NPC_MUTEX; i++) {
		mutex_exit(FPC_MUTEX(mnode, i));
		mutex_exit(CPC_MUTEX(mnode, i));
	}
}

/*
 * update the page list max counts for already allocated pages that have
 * xfer'ed (kcage_assimilate_page) between different mtypes.
 */
/* ARGSUSED */
void
page_list_xfer(page_t *pp, int to_mtype, int from_mtype)
{
	PLCNT_MAX_INCR(pp, PP_2_MEM_NODE(pp), to_mtype, pp->p_szc);
	PLCNT_MAX_DECR(pp, PP_2_MEM_NODE(pp), from_mtype, pp->p_szc);
}

/*
 * add pp to the specified page list. Defaults to head of the page list
 * unless PG_LIST_TAIL is specified.
 */
void
page_list_add(page_t *pp, int flags)
{
	page_t		**ppp;
	kmutex_t	*pcm;
	uint_t		bin, mtype;
	int		mnode;

	ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
	ASSERT(PP_ISFREE(pp));
	ASSERT(!hat_page_is_mapped(pp));
	ASSERT(hat_page_getshare(pp) == 0);

	/*
	 * Large pages should be freed via page_list_add_pages().
	 */
	ASSERT(pp->p_szc == 0);

	/*
	 * Don't need to lock the freelist first here
	 * because the page isn't on the freelist yet.
	 * This means p_szc can't change on us.
	 */

	bin = PP_2_BIN(pp);
	mnode = PP_2_MEM_NODE(pp);
	mtype = PP_2_MTYPE(pp);

	if (flags & PG_LIST_ISINIT) {
		/*
		 * PG_LIST_ISINIT is set during system startup (i.e. single
		 * threaded), so add the page to the free list and to the
		 * free region counters w/o any locking
		 */
		ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);

		/* inline version of page_add() */
		if (*ppp != NULL) {
			pp->p_next = *ppp;
			pp->p_prev = (*ppp)->p_prev;
			(*ppp)->p_prev = pp;
			pp->p_prev->p_next = pp;
		} else
			*ppp = pp;

		page_ctr_add_internal(mnode, mtype, pp, flags);
		VM_STAT_ADD(vmm_vmstats.pladd_free[0]);
	} else {
		pcm = PC_BIN_MUTEX(mnode, bin, flags);

		if (flags & PG_FREE_LIST) {
			VM_STAT_ADD(vmm_vmstats.pladd_free[0]);
			ASSERT(PP_ISAGED(pp));
			ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);

		} else {
			VM_STAT_ADD(vmm_vmstats.pladd_cache);
			ASSERT(pp->p_vnode);
			ASSERT((pp->p_offset & PAGEOFFSET) == 0);
			ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
		}
		mutex_enter(pcm);
		page_add(ppp, pp);

		if (flags & PG_LIST_TAIL)
			*ppp = (*ppp)->p_next;
		/*
		 * Add counters before releasing pcm mutex to avoid a race with
		 * page_freelist_coalesce and page_freelist_fill.
		 */
		page_ctr_add(mnode, mtype, pp, flags);
		mutex_exit(pcm);
	}


#if defined(__sparc)
	if (PP_ISNORELOC(pp)) {
		kcage_freemem_add(1);
	}
#endif
	/*
	 * It is up to the caller to unlock the page!
	 */
	ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
}


#ifdef __sparc
/*
 * This routine is only used by kcage_init during system startup.
 * It performs the function of page_list_sub/PP_SETNORELOC/page_list_add
 * without the overhead of taking locks and updating counters.
 */
void
page_list_noreloc_startup(page_t *pp)
{
	page_t		**ppp;
	uint_t		bin;
	int		mnode;
	int		mtype;
	int		flags = PG_LIST_ISCAGE;

	/*
	 * If this is a large page on the freelist then
	 * break it up into smaller pages.
	 */
	if (pp->p_szc != 0)
		page_boot_demote(pp);

	/*
	 * Get list page is currently on.
	 */
	bin = PP_2_BIN(pp);
	mnode = PP_2_MEM_NODE(pp);
	mtype = PP_2_MTYPE(pp);
	ASSERT(mtype == MTYPE_RELOC);
	ASSERT(pp->p_szc == 0);

	if (PP_ISAGED(pp)) {
		ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
		flags |= PG_FREE_LIST;
	} else {
		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
		flags |= PG_CACHE_LIST;
	}

	ASSERT(*ppp != NULL);

	/*
	 * Delete page from current list.
	 */
	if (*ppp == pp)
		*ppp = pp->p_next;		/* go to next page */
	if (*ppp == pp) {
		*ppp = NULL;			/* page list is gone */
	} else {
		pp->p_prev->p_next = pp->p_next;
		pp->p_next->p_prev = pp->p_prev;
	}

	/* LINTED */
	PLCNT_DECR(pp, mnode, mtype, 0, flags);

	/*
	 * Set no reloc for cage initted pages.
	 */
	PP_SETNORELOC(pp);

	mtype = PP_2_MTYPE(pp);
	ASSERT(mtype == MTYPE_NORELOC);

	/*
	 * Get new list for page.
	 */
	if (PP_ISAGED(pp)) {
		ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
	} else {
		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
	}

	/*
	 * Insert page on new list.
	 */
	if (*ppp == NULL) {
		*ppp = pp;
		pp->p_next = pp->p_prev = pp;
	} else {
		pp->p_next = *ppp;
		pp->p_prev = (*ppp)->p_prev;
		(*ppp)->p_prev = pp;
		pp->p_prev->p_next = pp;
	}

	/* LINTED */
	PLCNT_INCR(pp, mnode, mtype, 0, flags);

	/*
	 * Update cage freemem counter
	 */
	atomic_add_long(&kcage_freemem, 1);
}
#else /* __sparc */

/* ARGSUSED */
void
page_list_noreloc_startup(page_t *pp)
{
	panic("page_list_noreloc_startup: should be here only for sparc");
}
#endif

void
page_list_add_pages(page_t *pp, int flags)
{
	kmutex_t *pcm;
	pgcnt_t	pgcnt;
	uint_t	bin, mtype, i;
	int	mnode;

	/* default to freelist/head */
	ASSERT((flags & (PG_CACHE_LIST | PG_LIST_TAIL)) == 0);

	CHK_LPG(pp, pp->p_szc);
	VM_STAT_ADD(vmm_vmstats.pladd_free[pp->p_szc]);

	bin = PP_2_BIN(pp);
	mnode = PP_2_MEM_NODE(pp);
	mtype = PP_2_MTYPE(pp);

	if (flags & PG_LIST_ISINIT) {
		ASSERT(pp->p_szc == mmu_page_sizes - 1);
		page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
		ASSERT(!PP_ISNORELOC(pp));
		PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);
	} else {

		ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);

		pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);

		mutex_enter(pcm);
		page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
		page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
		mutex_exit(pcm);

		pgcnt = page_get_pagecnt(pp->p_szc);
#if defined(__sparc)
		if (PP_ISNORELOC(pp))
			kcage_freemem_add(pgcnt);
#endif
		for (i = 0; i < pgcnt; i++, pp++)
			page_unlock(pp);
	}
}

/*
 * During boot, need to demote a large page to base
 * pagesize pages for seg_kmem for use in boot_alloc()
 */
void
page_boot_demote(page_t *pp)
{
	ASSERT(pp->p_szc != 0);
	ASSERT(PP_ISFREE(pp));
	ASSERT(PP_ISAGED(pp));

	(void) page_demote(PP_2_MEM_NODE(pp),
	    PFN_BASE(pp->p_pagenum, pp->p_szc), pp->p_szc, 0, PC_NO_COLOR,
	    PC_FREE);

	ASSERT(PP_ISFREE(pp));
	ASSERT(PP_ISAGED(pp));
	ASSERT(pp->p_szc == 0);
}

/*
 * Take a particular page off of whatever freelist the page
 * is claimed to be on.
 *
 * NOTE: Only used for PAGESIZE pages.
 */
void
page_list_sub(page_t *pp, int flags)
{
	int		bin;
	uint_t		mtype;
	int		mnode;
	kmutex_t	*pcm;
	page_t		**ppp;

	ASSERT(PAGE_EXCL(pp));
	ASSERT(PP_ISFREE(pp));

	/*
	 * The p_szc field can only be changed by page_promote()
	 * and page_demote(). Only free pages can be promoted and
	 * demoted and the free list MUST be locked during these
	 * operations. So to prevent a race in page_list_sub()
	 * between computing which bin of the freelist lock to
	 * grab and actually grabbing the lock we check again that
	 * the bin we locked is still the correct one. Notice that
	 * the p_szc field could have actually changed on us but
	 * if the bin happens to still be the same we are safe.
	 */
try_again:
	bin = PP_2_BIN(pp);
	mnode = PP_2_MEM_NODE(pp);
	pcm = PC_BIN_MUTEX(mnode, bin, flags);
	mutex_enter(pcm);
	if (PP_2_BIN(pp) != bin) {
		mutex_exit(pcm);
		goto try_again;
	}
	mtype = PP_2_MTYPE(pp);

	if (flags & PG_FREE_LIST) {
		VM_STAT_ADD(vmm_vmstats.plsub_free[0]);
		ASSERT(PP_ISAGED(pp));
		ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);
	} else {
		VM_STAT_ADD(vmm_vmstats.plsub_cache);
		ASSERT(!PP_ISAGED(pp));
		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
	}

	/*
	 * Common PAGESIZE case.
	 *
	 * Note that we locked the freelist. This prevents
	 * any page promotion/demotion operations. Therefore
	 * the p_szc will not change until we drop pcm mutex.
	 */
	if (pp->p_szc == 0) {
		page_sub(ppp, pp);
		/*
		 * Subtract counters before releasing pcm mutex
		 * to avoid race with page_freelist_coalesce.
		 */
		page_ctr_sub(mnode, mtype, pp, flags);
		mutex_exit(pcm);

#if defined(__sparc)
		if (PP_ISNORELOC(pp)) {
			kcage_freemem_sub(1);
		}
#endif
		return;
	}

	/*
	 * Large pages on the cache list are not supported.
	 */
	if (flags & PG_CACHE_LIST)
		panic("page_list_sub: large page on cachelist");

	/*
	 * Slow but rare.
	 *
	 * Somebody wants this particular page which is part
	 * of a large page. In this case we just demote the page
	 * if it's on the freelist.
	 *
	 * We have to drop pcm before locking the entire freelist.
	 * Once we have re-locked the freelist check to make sure
	 * the page hasn't already been demoted or completely
	 * freed.
	 */
	mutex_exit(pcm);
	page_freelist_lock(mnode);
	if (pp->p_szc != 0) {
		/*
		 * Large page is on freelist.
		 */
		(void) page_demote(mnode, PFN_BASE(pp->p_pagenum, pp->p_szc),
		    pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
	}
	ASSERT(PP_ISFREE(pp));
	ASSERT(PP_ISAGED(pp));
	ASSERT(pp->p_szc == 0);

	/*
	 * Subtract counters before releasing pcm mutex
	 * to avoid race with page_freelist_coalesce.
	 */
	bin = PP_2_BIN(pp);
	mtype = PP_2_MTYPE(pp);
	ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);

	page_sub(ppp, pp);
	page_ctr_sub(mnode, mtype, pp, flags);
	page_freelist_unlock(mnode);

#if defined(__sparc)
	if (PP_ISNORELOC(pp)) {
		kcage_freemem_sub(1);
	}
#endif
}

void
page_list_sub_pages(page_t *pp, uint_t szc)
{
	kmutex_t *pcm;
	uint_t	bin, mtype;
	int	mnode;

	ASSERT(PAGE_EXCL(pp));
	ASSERT(PP_ISFREE(pp));
	ASSERT(PP_ISAGED(pp));

	/*
	 * See comment in page_list_sub().
	 */
try_again:
	bin = PP_2_BIN(pp);
	mnode = PP_2_MEM_NODE(pp);
	pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
	mutex_enter(pcm);
	if (PP_2_BIN(pp) != bin) {
		mutex_exit(pcm);
		goto try_again;
	}

	/*
	 * If we're called with a page larger than szc or it got
	 * promoted above szc before we locked the freelist then
	 * drop pcm and re-lock entire freelist. If page still larger
	 * than szc then demote it.
	 */
	if (pp->p_szc > szc) {
		mutex_exit(pcm);
		pcm = NULL;
		page_freelist_lock(mnode);
		if (pp->p_szc > szc) {
			VM_STAT_ADD(vmm_vmstats.plsubpages_szcbig);
			(void) page_demote(mnode,
			    PFN_BASE(pp->p_pagenum, pp->p_szc),
			    pp->p_szc, szc, PC_NO_COLOR, PC_FREE);
		}
		bin = PP_2_BIN(pp);
	}
	ASSERT(PP_ISFREE(pp));
	ASSERT(PP_ISAGED(pp));
	ASSERT(pp->p_szc <= szc);
	ASSERT(pp == PP_PAGEROOT(pp));

	VM_STAT_ADD(vmm_vmstats.plsub_free[pp->p_szc]);

	mtype = PP_2_MTYPE(pp);
	if (pp->p_szc != 0) {
		page_vpsub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
		CHK_LPG(pp, pp->p_szc);
	} else {
		VM_STAT_ADD(vmm_vmstats.plsubpages_szc0);
		page_sub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
	}
	page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);

	if (pcm != NULL) {
		mutex_exit(pcm);
	} else {
		page_freelist_unlock(mnode);
	}

#if defined(__sparc)
	if (PP_ISNORELOC(pp)) {
		pgcnt_t	pgcnt;

		pgcnt = page_get_pagecnt(pp->p_szc);
		kcage_freemem_sub(pgcnt);
	}
#endif
}

/*
 * Add the page to the front of a linked list of pages
 * using the p_next & p_prev pointers for the list.
 * The caller is responsible for protecting the list pointers.
 */
void
mach_page_add(page_t **ppp, page_t *pp)
{
	if (*ppp == NULL) {
		pp->p_next = pp->p_prev = pp;
	} else {
		pp->p_next = *ppp;
		pp->p_prev = (*ppp)->p_prev;
		(*ppp)->p_prev = pp;
		pp->p_prev->p_next = pp;
	}
	*ppp = pp;
}

/*
 * Remove this page from a linked list of pages
 * using the p_next & p_prev pointers for the list.
 *
 * The caller is responsible for protecting the list pointers.
 */
void
mach_page_sub(page_t **ppp, page_t *pp)
{
	ASSERT(PP_ISFREE(pp));

	if (*ppp == NULL || pp == NULL)
		panic("mach_page_sub");

	if (*ppp == pp)
		*ppp = pp->p_next;		/* go to next page */

	if (*ppp == pp)
		*ppp = NULL;			/* page list is gone */
	else {
		pp->p_prev->p_next = pp->p_next;
		pp->p_next->p_prev = pp->p_prev;
	}
	pp->p_prev = pp->p_next = pp;		/* make pp a list of one */
}

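/*
 * Worked example of the list primitives above (illustrative; pages A, B and
 * C are hypothetical):  the lists are circular and doubly linked, with *ppp
 * pointing at the head.  Starting from an empty list, calling mach_page_add()
 * for A, then B, then C leaves
 *
 *	*ppp -> C <-> B <-> A <-> (back to C)
 *
 * i.e. each newly added page becomes the head.  mach_page_sub(ppp, B) then
 * unlinks B and makes it a self-referencing list of one, leaving
 *
 *	*ppp -> C <-> A <-> (back to C)
 */
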
/*
 * Routine fsflush uses to gradually coalesce the free list into larger pages.
 */
void
page_promote_size(page_t *pp, uint_t cur_szc)
{
	pfn_t pfn;
	int mnode;
	int idx;
	int new_szc = cur_szc + 1;
	int full = FULL_REGION_CNT(new_szc);

	pfn = page_pptonum(pp);
	mnode = PFN_2_MEM_NODE(pfn);

	page_freelist_lock(mnode);

	idx = PNUM_TO_IDX(mnode, new_szc, pfn);
	if (PAGE_COUNTERS(mnode, new_szc, idx) == full)
		(void) page_promote(mnode, pfn, new_szc, PC_FREE);

	page_freelist_unlock(mnode);
}

static uint_t page_promote_err;
static uint_t page_promote_noreloc_err;

/*
 * Create a single larger page (of szc new_szc) from smaller contiguous pages
 * for the given mnode starting at pfnum. Pages involved are on the freelist
 * before the call and may be returned to the caller if requested, otherwise
 * they will be placed back on the freelist.
 * If flags is PC_ALLOC, then the large page will be returned to the user in
 * a state which is consistent with a page being taken off the freelist.  If
 * we failed to lock the new large page, then we will return NULL to the
 * caller and put the large page on the freelist instead.
 * If flags is PC_FREE, then the large page will be placed on the freelist,
 * and NULL will be returned.
 * The caller is responsible for locking the freelist as well as any other
 * accounting which needs to be done for a returned page.
 *
 * RFE: For performance pass in pp instead of pfnum so
 *	we can avoid excessive calls to page_numtopp_nolock().
 *	This would depend on an assumption that all contiguous
 *	pages are in the same memseg so we can just add/dec
 *	our pp.
 *
 * Lock ordering:
 *
 *	There is a potential but rare deadlock situation
 *	for page promotion and demotion operations. The problem
 *	is there are two paths into the freelist manager and
 *	they have different lock orders:
 *
 *	page_create()
 *		lock freelist
 *		page_lock(EXCL)
 *		unlock freelist
 *		return
 *		caller drops page_lock
 *
 *	page_free() and page_reclaim()
 *		caller grabs page_lock(EXCL)
 *
 *		lock freelist
 *		unlock freelist
 *		drop page_lock
 *
 *	What prevents a thread in page_create() from deadlocking
 *	with a thread freeing or reclaiming the same page is the
 *	page_trylock() in page_get_freelist(). If the trylock fails
 *	it skips the page.
 *
 *	The lock ordering for promotion and demotion is the same as
 *	for page_create(). Since the same deadlock could occur during
 *	page promotion and freeing or reclaiming of a page on the
 *	cache list we might have to fail the operation and undo what
 *	we have done so far. Again this is rare.
 */
page_t *
page_promote(int mnode, pfn_t pfnum, uchar_t new_szc, int flags)
{
	page_t		*pp, *pplist, *tpp, *start_pp;
	pgcnt_t		new_npgs, npgs;
	uint_t		bin;
	pgcnt_t		tmpnpgs, pages_left;
	uint_t		mtype;
	uint_t		noreloc;
	uint_t		i;
	int		which_list;
	ulong_t		index;
	kmutex_t	*phm;

	/*
	 * General algorithm:
	 * Find the starting page
	 * Walk each page struct removing it from the freelist,
	 * and linking it to all the other pages removed.
	 * Once all pages are off the freelist,
	 * walk the list, modifying p_szc to new_szc and whatever
	 * other info needs to be done to create a large free page.
	 * According to the flags, either return the page or put it
	 * on the freelist.
	 */

	start_pp = page_numtopp_nolock(pfnum);
	ASSERT(start_pp && (start_pp->p_pagenum == pfnum));
	new_npgs = page_get_pagecnt(new_szc);
	ASSERT(IS_P2ALIGNED(pfnum, new_npgs));

	/*
	 * Loop through smaller pages to confirm that all pages
	 * give the same result for PP_ISNORELOC().
	 * We can check this reliably here as the protocol for setting
	 * P_NORELOC requires pages to be taken off the free list first.
	 */
	for (i = 0, pp = start_pp; i < new_npgs; i++, pp++) {
		if (pp == start_pp) {
			/* First page, set requirement. */
			noreloc = PP_ISNORELOC(pp);
		} else if (noreloc != PP_ISNORELOC(pp)) {
			page_promote_noreloc_err++;
			page_promote_err++;
			return (NULL);
		}
	}

	pages_left = new_npgs;
	pplist = NULL;
	pp = start_pp;

	/* Loop around coalescing the smaller pages into a big page. */
	while (pages_left) {
		/*
		 * Remove from the freelist.
		 */
		ASSERT(PP_ISFREE(pp));
		bin = PP_2_BIN(pp);
		ASSERT(mnode == PP_2_MEM_NODE(pp));
		mtype = PP_2_MTYPE(pp);
		if (PP_ISAGED(pp)) {

			/*
			 * PG_FREE_LIST
			 */
			if (pp->p_szc) {
				page_vpsub(&PAGE_FREELISTS(mnode,
				    pp->p_szc, bin, mtype), pp);
			} else {
				mach_page_sub(&PAGE_FREELISTS(mnode, 0,
				    bin, mtype), pp);
			}
			which_list = PG_FREE_LIST;
		} else {
			ASSERT(pp->p_szc == 0);

			/*
			 * PG_CACHE_LIST
			 *
			 * Since this page comes from the
			 * cachelist, we must destroy the
			 * vnode association.
			 */
			if (!page_trylock(pp, SE_EXCL)) {
				goto fail_promote;
			}

			/*
			 * We need to be careful not to deadlock
			 * with another thread in page_lookup().
			 * The page_lookup() thread could be holding
			 * the same phm that we need if the two
			 * pages happen to hash to the same phm lock.
			 * At this point we have locked the entire
			 * freelist and page_lookup() could be trying
			 * to grab a freelist lock.
			 */
			index = PAGE_HASH_FUNC(pp->p_vnode, pp->p_offset);
			phm = PAGE_HASH_MUTEX(index);
			if (!mutex_tryenter(phm)) {
				page_unlock(pp);
				goto fail_promote;
			}

			mach_page_sub(&PAGE_CACHELISTS(mnode, bin, mtype), pp);
			page_hashout(pp, phm);
			mutex_exit(phm);
			PP_SETAGED(pp);
			page_unlock(pp);
			which_list = PG_CACHE_LIST;
		}
		page_ctr_sub(mnode, mtype, pp, which_list);

		/*
		 * Concatenate the smaller page(s) onto
		 * the large page list.
		 */
		tmpnpgs = npgs = page_get_pagecnt(pp->p_szc);
		pages_left -= npgs;
		tpp = pp;
		while (npgs--) {
			tpp->p_szc = new_szc;
			tpp = tpp->p_next;
		}
		page_list_concat(&pplist, &pp);
		pp += tmpnpgs;
	}
	CHK_LPG(pplist, new_szc);

	/*
	 * return the page to the user if requested
	 * in the properly locked state.
	 */
	if (flags == PC_ALLOC && (page_trylock_cons(pplist, SE_EXCL))) {
		return (pplist);
	}

	/*
	 * Otherwise place the new large page on the freelist
	 */
	bin = PP_2_BIN(pplist);
	mnode = PP_2_MEM_NODE(pplist);
	mtype = PP_2_MTYPE(pplist);
	page_vpadd(&PAGE_FREELISTS(mnode, new_szc, bin, mtype), pplist);

	page_ctr_add(mnode, mtype, pplist, PG_FREE_LIST);
	return (NULL);

fail_promote:
	/*
	 * A thread must have still been freeing or
	 * reclaiming the page on the cachelist.
	 * To prevent a deadlock undo what we have
	 * done so far and return failure. This
	 * situation can only happen while promoting
	 * PAGESIZE pages.
	 */
	page_promote_err++;
	while (pplist) {
		pp = pplist;
		mach_page_sub(&pplist, pp);
		pp->p_szc = 0;
		bin = PP_2_BIN(pp);
		mtype = PP_2_MTYPE(pp);
		mach_page_add(&PAGE_FREELISTS(mnode, 0, bin, mtype), pp);
		page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
	}
	return (NULL);

}

/*
 * Break up a large page into smaller size pages.
 * Pages involved are on the freelist before the call and may
 * be returned to the caller if requested, otherwise they will
 * be placed back on the freelist.
 * The caller is responsible for locking the freelist as well as any other
 * accounting which needs to be done for a returned page.
 * If flags is not PC_ALLOC, the color argument is ignored, and thus
 * technically, any value may be passed in but PC_NO_COLOR is the standard
 * which should be followed for clarity's sake.
 */
page_t *
page_demote(int mnode, pfn_t pfnum, uchar_t cur_szc, uchar_t new_szc,
    int color, int flags)
{
	page_t	*pp, *pplist, *npplist;
	pgcnt_t	npgs, n;
	uint_t	bin;
	uint_t	mtype;
	page_t	*ret_pp = NULL;

	ASSERT(cur_szc != 0);
	ASSERT(new_szc < cur_szc);

	pplist = page_numtopp_nolock(pfnum);
	ASSERT(pplist != NULL);

	ASSERT(pplist->p_szc == cur_szc);

	bin = PP_2_BIN(pplist);
	ASSERT(mnode == PP_2_MEM_NODE(pplist));
	mtype = PP_2_MTYPE(pplist);
	page_vpsub(&PAGE_FREELISTS(mnode, cur_szc, bin, mtype), pplist);

	CHK_LPG(pplist, cur_szc);
	page_ctr_sub(mnode, mtype, pplist, PG_FREE_LIST);

	/*
	 * Number of PAGESIZE pages for smaller new_szc
	 * page.
	 */
	npgs = page_get_pagecnt(new_szc);

	while (pplist) {
		pp = pplist;

		ASSERT(pp->p_szc == cur_szc);

		/*
		 * We either break it up into PAGESIZE pages or larger.
		 */
		if (npgs == 1) {	/* PAGESIZE case */
			mach_page_sub(&pplist, pp);
			ASSERT(pp->p_szc == cur_szc);
			ASSERT(new_szc == 0);
			ASSERT(mnode == PP_2_MEM_NODE(pp));
			pp->p_szc = new_szc;
			bin = PP_2_BIN(pp);
			if ((bin == color) && (flags == PC_ALLOC) &&
			    (ret_pp == NULL) &&
			    page_trylock_cons(pp, SE_EXCL)) {
				ret_pp = pp;
			} else {
				mtype = PP_2_MTYPE(pp);
				mach_page_add(&PAGE_FREELISTS(mnode, 0, bin,
				    mtype), pp);
				page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
			}
		} else {

			/*
			 * Break down into smaller lists of pages.
			 */
			page_list_break(&pplist, &npplist, npgs);

			pp = pplist;
			n = npgs;
			while (n--) {
				ASSERT(pp->p_szc == cur_szc);
				pp->p_szc = new_szc;
				pp = pp->p_next;
			}

			CHK_LPG(pplist, new_szc);

			bin = PP_2_BIN(pplist);
			ASSERT(mnode == PP_2_MEM_NODE(pp));
			if ((bin == color) && (flags == PC_ALLOC) &&
			    (ret_pp == NULL) &&
			    page_trylock_cons(pp, SE_EXCL)) {
				ret_pp = pp;
			} else {
				mtype = PP_2_MTYPE(pp);
				page_vpadd(&PAGE_FREELISTS(mnode, new_szc,
				    bin, mtype), pplist);

				page_ctr_add(mnode, mtype, pplist,
				    PG_FREE_LIST);
			}
			pplist = npplist;
		}
	}
	return (ret_pp);
}

int mpss_coalesce_disable = 0;

/*
 * Coalesce free pages into a page of the given szc and color if possible.
 * Return the pointer to the page created, otherwise, return NULL.
 */
static page_t *
page_freelist_coalesce(int mnode, uchar_t szc, int color)
{
	int	r;		/* region size */
	int	idx, full, i;
	pfn_t	pfnum;
	size_t	len;
	size_t	buckets_to_check;
	pgcnt_t	cands;
	page_t	*ret_pp;
	int	color_stride;

	VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce);

	if (mpss_coalesce_disable) {
		return (NULL);
	}

	r = szc;
	PGCTRS_CANDS_GETVALUECOLOR(mnode, r, color, cands);
	if (cands == 0) {
		VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip);
		return (NULL);
	}
	full = FULL_REGION_CNT(r);
	color_stride = (szc) ? page_convert_color(0, szc, page_colors - 1) + 1 :
	    page_colors;

	/* Prevent page_counters dynamic memory from being freed */
	rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
	len = PAGE_COUNTERS_ENTRIES(mnode, r);
	buckets_to_check = len / color_stride;
	idx = PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color);
	ASSERT((idx % color_stride) == color);
	idx += color_stride;
	if (idx >= len)
		idx = color;
	for (i = 0; i < buckets_to_check; i++) {
		if (PAGE_COUNTERS(mnode, r, idx) == full) {
			pfnum = IDX_TO_PNUM(mnode, r, idx);
			ASSERT(pfnum >= mem_node_config[mnode].physbase &&
			    pfnum < mem_node_config[mnode].physmax);
			/*
			 * RFE: For performance maybe we can do something less
			 *	brutal than locking the entire freelist. So far
			 *	this doesn't seem to be a performance problem?
			 */
			page_freelist_lock(mnode);
			if (PAGE_COUNTERS(mnode, r, idx) != full) {
				VM_STAT_ADD(vmm_vmstats.page_ctrs_changed);
				goto skip_this_one;
			}
			ret_pp = page_promote(mnode, pfnum, r, PC_ALLOC);
			if (ret_pp != NULL) {
				PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color) =
				    idx;
				page_freelist_unlock(mnode);
				rw_exit(&page_ctrs_rwlock[mnode]);
#if defined(__sparc)
				if (PP_ISNORELOC(ret_pp)) {
					pgcnt_t npgs;

					npgs = page_get_pagecnt(ret_pp->p_szc);
					kcage_freemem_sub(npgs);
				}
#endif
				return (ret_pp);
			}
skip_this_one:
			page_freelist_unlock(mnode);
			/*
			 * No point looking for another page if we've
			 * already tried all of the ones that
			 * page_ctrs_cands indicated. Stash off where we left
			 * off.
			 * Note: this is not exact since we don't hold the
			 * page_freelist_locks before we initially get the
			 * value of cands for performance reasons, but should
			 * be a decent approximation.
2006 */ 2007 if (--cands == 0) { 2008 PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color) = 2009 idx; 2010 break; 2011 } 2012 } 2013 idx += color_stride; 2014 if (idx >= len) 2015 idx = color; 2016 } 2017 rw_exit(&page_ctrs_rwlock[mnode]); 2018 VM_STAT_ADD(vmm_vmstats.page_ctrs_failed); 2019 return (NULL); 2020 } 2021 2022 /* 2023 * For the given mnode, promote as many small pages to large pages as possible. 2024 */ 2025 void 2026 page_freelist_coalesce_all(int mnode) 2027 { 2028 int r; /* region size */ 2029 int idx, full; 2030 pfn_t pfnum; 2031 size_t len; 2032 2033 VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce_all); 2034 2035 if (mpss_coalesce_disable) { 2036 return; 2037 } 2038 2039 /* 2040 * Lock the entire freelist and coalesce what we can. 2041 * 2042 * Always promote to the largest page possible 2043 * first to reduce the number of page promotions. 2044 */ 2045 rw_enter(&page_ctrs_rwlock[mnode], RW_READER); 2046 page_freelist_lock(mnode); 2047 for (r = mmu_page_sizes - 1; r > 0; r--) { 2048 pgcnt_t cands; 2049 2050 PGCTRS_CANDS_GETVALUE(mnode, r, cands); 2051 if (cands == 0) { 2052 VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip_all); 2053 continue; 2054 } 2055 2056 full = FULL_REGION_CNT(r); 2057 len = PAGE_COUNTERS_ENTRIES(mnode, r); 2058 2059 for (idx = 0; idx < len; idx++) { 2060 if (PAGE_COUNTERS(mnode, r, idx) == full) { 2061 pfnum = IDX_TO_PNUM(mnode, r, idx); 2062 ASSERT(pfnum >= 2063 mem_node_config[mnode].physbase && 2064 pfnum < 2065 mem_node_config[mnode].physmax); 2066 (void) page_promote(mnode, pfnum, r, PC_FREE); 2067 } 2068 } 2069 } 2070 page_freelist_unlock(mnode); 2071 rw_exit(&page_ctrs_rwlock[mnode]); 2072 } 2073 2074 /* 2075 * This is where all polices for moving pages around 2076 * to different page size free lists is implemented. 2077 * Returns 1 on success, 0 on failure. 2078 * 2079 * So far these are the priorities for this algorithm in descending 2080 * order: 2081 * 2082 * 1) When servicing a request try to do so with a free page 2083 * from next size up. Helps defer fragmentation as long 2084 * as possible. 2085 * 2086 * 2) Page coalesce on demand. Only when a freelist 2087 * larger than PAGESIZE is empty and step 1 2088 * will not work since all larger size lists are 2089 * also empty. 2090 * 2091 * If pfnhi is non-zero, search for large page with pfn range less than pfnhi. 2092 */ 2093 page_t * 2094 page_freelist_fill(uchar_t szc, int color, int mnode, int mtype, pfn_t pfnhi) 2095 { 2096 uchar_t nszc = szc + 1; 2097 int bin; 2098 page_t *pp, *firstpp; 2099 page_t *ret_pp = NULL; 2100 2101 ASSERT(szc < mmu_page_sizes); 2102 2103 VM_STAT_ADD(vmm_vmstats.pff_req[szc]); 2104 /* 2105 * First try to break up a larger page to fill 2106 * current size freelist. 2107 */ 2108 while (nszc < mmu_page_sizes) { 2109 /* 2110 * If page found then demote it. 2111 */ 2112 bin = page_convert_color(szc, nszc, color); 2113 if (PAGE_FREELISTS(mnode, nszc, bin, mtype)) { 2114 page_freelist_lock(mnode); 2115 firstpp = pp = PAGE_FREELISTS(mnode, nszc, bin, mtype); 2116 2117 /* 2118 * If pfnhi is not PFNNULL, look for large page below 2119 * pfnhi. PFNNULL signifies no pfn requirement. 
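 * The large pages in this bin are threaded on p_vpnext/p_vpprev,
 * so the loop below just walks that circular list from the head
 * until it finds a page whose base pfn is below pfnhi, giving up
 * (pp = NULL) if it wraps back around to firstpp.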
2120 */ 2121 if (pfnhi != PFNNULL && pp->p_pagenum >= pfnhi) { 2122 do { 2123 pp = pp->p_vpnext; 2124 if (pp == firstpp) { 2125 pp = NULL; 2126 break; 2127 } 2128 } while (pp->p_pagenum >= pfnhi); 2129 } 2130 if (pp) { 2131 ASSERT(pp->p_szc == nszc); 2132 VM_STAT_ADD(vmm_vmstats.pff_demote[nszc]); 2133 ret_pp = page_demote(mnode, pp->p_pagenum, 2134 pp->p_szc, szc, color, PC_ALLOC); 2135 if (ret_pp) { 2136 page_freelist_unlock(mnode); 2137 #if defined(__sparc) 2138 if (PP_ISNORELOC(ret_pp)) { 2139 pgcnt_t npgs; 2140 2141 npgs = page_get_pagecnt( 2142 ret_pp->p_szc); 2143 kcage_freemem_sub(npgs); 2144 } 2145 #endif 2146 return (ret_pp); 2147 } 2148 } 2149 page_freelist_unlock(mnode); 2150 } 2151 nszc++; 2152 } 2153 2154 /* 2155 * Ok that didn't work. Time to coalesce. 2156 */ 2157 if (szc != 0) { 2158 ret_pp = page_freelist_coalesce(mnode, szc, color); 2159 VM_STAT_COND_ADD(ret_pp, vmm_vmstats.pff_coalok[szc]); 2160 } 2161 2162 return (ret_pp); 2163 } 2164 2165 /* 2166 * Helper routine used only by the freelist code to lock 2167 * a page. If the page is a large page then it succeeds in 2168 * locking all the constituent pages or none at all. 2169 * Returns 1 on success, 0 on failure. 2170 */ 2171 static int 2172 page_trylock_cons(page_t *pp, se_t se) 2173 { 2174 page_t *tpp, *first_pp = pp; 2175 2176 /* 2177 * Fail if can't lock first or only page. 2178 */ 2179 if (!page_trylock(pp, se)) { 2180 return (0); 2181 } 2182 2183 /* 2184 * PAGESIZE: common case. 2185 */ 2186 if (pp->p_szc == 0) { 2187 return (1); 2188 } 2189 2190 /* 2191 * Large page case. 2192 */ 2193 tpp = pp->p_next; 2194 while (tpp != pp) { 2195 if (!page_trylock(tpp, se)) { 2196 /* 2197 * On failure unlock what we 2198 * have locked so far. 2199 */ 2200 while (first_pp != tpp) { 2201 page_unlock(first_pp); 2202 first_pp = first_pp->p_next; 2203 } 2204 return (0); 2205 } 2206 tpp = tpp->p_next; 2207 } 2208 return (1); 2209 } 2210 2211 page_t * 2212 page_get_mnode_freelist(int mnode, uint_t bin, int mtype, uchar_t szc, 2213 uint_t flags) 2214 { 2215 kmutex_t *pcm; 2216 int i, fill_tried, fill_marker; 2217 page_t *pp, *first_pp; 2218 uint_t bin_marker; 2219 int colors, cpucolors; 2220 uchar_t nszc; 2221 uint_t nszc_color_shift; 2222 int nwaybins = 0, nwaycnt; 2223 2224 ASSERT(szc < mmu_page_sizes); 2225 2226 VM_STAT_ADD(vmm_vmstats.pgmf_alloc[szc]); 2227 2228 MTYPE_START(mnode, mtype, flags); 2229 if (mtype < 0) { /* mnode does not have memory in mtype range */ 2230 VM_STAT_ADD(vmm_vmstats.pgmf_allocempty[szc]); 2231 return (NULL); 2232 } 2233 2234 /* 2235 * Set how many physical colors for this page size. 2236 */ 2237 colors = (szc) ? page_convert_color(0, szc, page_colors - 1) + 1 : 2238 page_colors; 2239 2240 nszc = MIN(szc + 1, mmu_page_sizes - 1); 2241 nszc_color_shift = page_get_shift(nszc) - page_get_shift(szc); 2242 2243 /* cpu_page_colors is non-zero if a page color may be in > 1 bin */ 2244 cpucolors = cpu_page_colors; 2245 2246 /* 2247 * adjust cpucolors to possibly check additional 'equivalent' bins 2248 * to try to minimize fragmentation of large pages by delaying calls 2249 * to page_freelist_fill.
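 * As a hedged illustration only (actual values are platform and
 * tunable dependent): with colors == 32 and colorequiv == 4,
 * equivcolors is 8, so cpucolors is lowered to 8 and the search
 * below treats 32/8 == 4 bins as interchangeable for one color
 * before resorting to page_freelist_fill.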
2250 */ 2251 if (colorequiv > 1) { 2252 int equivcolors = colors / colorequiv; 2253 2254 if (equivcolors && (cpucolors == 0 || equivcolors < cpucolors)) 2255 cpucolors = equivcolors; 2256 } 2257 2258 ASSERT(colors <= page_colors); 2259 ASSERT(colors); 2260 ASSERT((colors & (colors - 1)) == 0); 2261 2262 ASSERT(bin < colors); 2263 2264 /* 2265 * Only hold one freelist lock at a time, that way we 2266 * can start anywhere and not have to worry about lock 2267 * ordering. 2268 */ 2269 big_try_again: 2270 fill_tried = 0; 2271 nwaycnt = 0; 2272 for (i = 0; i <= colors; i++) { 2273 try_again: 2274 ASSERT(bin < colors); 2275 if (PAGE_FREELISTS(mnode, szc, bin, mtype)) { 2276 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST); 2277 mutex_enter(pcm); 2278 pp = PAGE_FREELISTS(mnode, szc, bin, mtype); 2279 if (pp != NULL) { 2280 /* 2281 * These were set before the page 2282 * was put on the free list, 2283 * they must still be set. 2284 */ 2285 ASSERT(PP_ISFREE(pp)); 2286 ASSERT(PP_ISAGED(pp)); 2287 ASSERT(pp->p_vnode == NULL); 2288 ASSERT(pp->p_hash == NULL); 2289 ASSERT(pp->p_offset == (u_offset_t)-1); 2290 ASSERT(pp->p_szc == szc); 2291 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode); 2292 2293 /* 2294 * Walk down the hash chain. 2295 * 8k pages are linked on p_next 2296 * and p_prev fields. Large pages 2297 * are a contiguous group of 2298 * constituent pages linked together 2299 * on their p_next and p_prev fields. 2300 * The large pages are linked together 2301 * on the hash chain using p_vpnext 2302 * p_vpprev of the base constituent 2303 * page of each large page. 2304 */ 2305 first_pp = pp; 2306 while (!page_trylock_cons(pp, SE_EXCL)) { 2307 if (szc == 0) { 2308 pp = pp->p_next; 2309 } else { 2310 pp = pp->p_vpnext; 2311 } 2312 2313 ASSERT(PP_ISFREE(pp)); 2314 ASSERT(PP_ISAGED(pp)); 2315 ASSERT(pp->p_vnode == NULL); 2316 ASSERT(pp->p_hash == NULL); 2317 ASSERT(pp->p_offset == (u_offset_t)-1); 2318 ASSERT(pp->p_szc == szc); 2319 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == 2320 mnode); 2321 2322 if (pp == first_pp) { 2323 pp = NULL; 2324 break; 2325 } 2326 } 2327 2328 if (pp) { 2329 ASSERT(mtype == PP_2_MTYPE(pp)); 2330 ASSERT(pp->p_szc == szc); 2331 if (szc == 0) { 2332 page_sub(&PAGE_FREELISTS(mnode, 2333 szc, bin, mtype), pp); 2334 } else { 2335 page_vpsub(&PAGE_FREELISTS( 2336 mnode, szc, bin, mtype), 2337 pp); 2338 CHK_LPG(pp, szc); 2339 } 2340 page_ctr_sub(mnode, mtype, pp, 2341 PG_FREE_LIST); 2342 2343 if ((PP_ISFREE(pp) == 0) || 2344 (PP_ISAGED(pp) == 0)) 2345 panic("free page is not. pp %p", 2346 (void *)pp); 2347 mutex_exit(pcm); 2348 2349 #if defined(__sparc) 2350 ASSERT(!kcage_on || PP_ISNORELOC(pp) || 2351 (flags & PG_NORELOC) == 0); 2352 2353 if (PP_ISNORELOC(pp)) { 2354 pgcnt_t npgs; 2355 2356 npgs = page_get_pagecnt(szc); 2357 kcage_freemem_sub(npgs); 2358 } 2359 #endif 2360 VM_STAT_ADD(vmm_vmstats. 2361 pgmf_allocok[szc]); 2362 return (pp); 2363 } 2364 } 2365 mutex_exit(pcm); 2366 } 2367 2368 /* 2369 * Wow! The initial bin is empty. 2370 * If specific color is needed, check if page color may be 2371 * in other bins. cpucolors is: 2372 * 0 if the colors for this cpu is equal to page_colors. 2373 * This means that pages with a particular color are in a 2374 * single bin. 2375 * -1 if colors of cpus (cheetah+) are heterogenous. Need to 2376 * first determine the colors for the current cpu. 
2377 * >0 colors of all cpus are homogenous and < page_colors 2378 */ 2379 2380 if ((flags & PG_MATCH_COLOR) && (cpucolors != 0)) { 2381 if (!nwaybins) { 2382 /* 2383 * cpucolors is negative if ecache setsizes 2384 * are heterogenous. determine colors for this 2385 * particular cpu. 2386 */ 2387 if (cpucolors < 0) { 2388 cpucolors = CPUSETSIZE() / MMU_PAGESIZE; 2389 ASSERT(cpucolors > 0); 2390 nwaybins = colors / cpucolors; 2391 } else { 2392 nwaybins = colors / cpucolors; 2393 ASSERT(szc > 0 || nwaybins > 1); 2394 } 2395 if (nwaybins < 2) 2396 cpucolors = 0; 2397 } 2398 2399 if (cpucolors && (nwaycnt + 1 <= nwaybins)) { 2400 nwaycnt++; 2401 bin = (bin + (colors / nwaybins)) & 2402 (colors - 1); 2403 if (nwaycnt < nwaybins) { 2404 goto try_again; 2405 } 2406 } 2407 /* back to initial color if fall-thru */ 2408 } 2409 2410 /* 2411 * color bins are all empty if color match. Try and satisfy 2412 * the request by breaking up or coalescing pages from 2413 * a different size freelist of the correct color that 2414 * satisfies the ORIGINAL color requested. If that 2415 * fails then try pages of the same size but different 2416 * colors assuming we are not called with 2417 * PG_MATCH_COLOR. 2418 */ 2419 if (!fill_tried) { 2420 fill_tried = 1; 2421 fill_marker = bin >> nszc_color_shift; 2422 pp = page_freelist_fill(szc, bin, mnode, mtype, 2423 PFNNULL); 2424 if (pp != NULL) { 2425 return (pp); 2426 } 2427 } 2428 2429 if (flags & PG_MATCH_COLOR) 2430 break; 2431 2432 /* 2433 * Select next color bin to try. 2434 */ 2435 if (szc == 0) { 2436 /* 2437 * PAGESIZE page case. 2438 */ 2439 if (i == 0) { 2440 bin = (bin + BIN_STEP) & page_colors_mask; 2441 bin_marker = bin; 2442 } else { 2443 bin = (bin + vac_colors) & page_colors_mask; 2444 if (bin == bin_marker) { 2445 bin = (bin + 1) & page_colors_mask; 2446 bin_marker = bin; 2447 } 2448 } 2449 } else { 2450 /* 2451 * Large page case. 2452 */ 2453 bin = (bin + 1) & (colors - 1); 2454 } 2455 /* 2456 * If bin advanced to the next color bin of the 2457 * next larger pagesize, there is a chance the fill 2458 * could succeed. 2459 */ 2460 if (fill_marker != (bin >> nszc_color_shift)) 2461 fill_tried = 0; 2462 } 2463 2464 /* if allowed, cycle through additional mtypes */ 2465 MTYPE_NEXT(mnode, mtype, flags); 2466 if (mtype >= 0) 2467 goto big_try_again; 2468 2469 VM_STAT_ADD(vmm_vmstats.pgmf_allocfailed[szc]); 2470 2471 return (NULL); 2472 } 2473 2474 2475 /* 2476 * Returns the count of free pages for 'pp' with size code 'szc'. 2477 * Note: This function does not return an exact value as the page freelist 2478 * locks are not held and thus the values in the page_counters may be 2479 * changing as we walk through the data. 
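 * The walk below starts with the counter for the szc region itself
 * (each unit represents one fully free region of the next smaller
 * size, hence the shift by PNUM_SHIFT(r - 1)) and then descends
 * through the smaller region sizes, adding in partially free
 * subregions and skipping counters that are already at their full
 * value since those pages were accounted for at the level above.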
2480 */ 2481 static int 2482 page_freecnt(int mnode, page_t *pp, uchar_t szc) 2483 { 2484 pgcnt_t pgfree; 2485 pgcnt_t cnt; 2486 ssize_t r = szc; /* region size */ 2487 ssize_t idx; 2488 int i; 2489 int full, range; 2490 2491 /* Make sure pagenum passed in is aligned properly */ 2492 ASSERT((pp->p_pagenum & (PNUM_SIZE(szc) - 1)) == 0); 2493 ASSERT(szc > 0); 2494 2495 /* Prevent page_counters dynamic memory from being freed */ 2496 rw_enter(&page_ctrs_rwlock[mnode], RW_READER); 2497 idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum); 2498 cnt = PAGE_COUNTERS(mnode, r, idx); 2499 pgfree = cnt << PNUM_SHIFT(r - 1); 2500 range = FULL_REGION_CNT(szc); 2501 2502 /* Check for completely full region */ 2503 if (cnt == range) { 2504 rw_exit(&page_ctrs_rwlock[mnode]); 2505 return (pgfree); 2506 } 2507 2508 while (--r > 0) { 2509 idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum); 2510 full = FULL_REGION_CNT(r); 2511 for (i = 0; i < range; i++, idx++) { 2512 cnt = PAGE_COUNTERS(mnode, r, idx); 2513 /* 2514 * If cnt here is full, that means we have already 2515 * accounted for these pages earlier. 2516 */ 2517 if (cnt != full) { 2518 pgfree += (cnt << PNUM_SHIFT(r - 1)); 2519 } 2520 } 2521 range *= full; 2522 } 2523 rw_exit(&page_ctrs_rwlock[mnode]); 2524 return (pgfree); 2525 } 2526 2527 /* 2528 * Called from page_geti_contig_pages to exclusively lock constituent pages 2529 * starting from 'spp' for page size code 'szc'. 2530 * 2531 * If 'ptcpthreshold' is set, the number of free pages needed in the 'szc' 2532 * region needs to be greater than or equal to the threshold. 2533 */ 2534 static int 2535 page_trylock_contig_pages(int mnode, page_t *spp, uchar_t szc, int flags) 2536 { 2537 pgcnt_t pgcnt = PNUM_SIZE(szc); 2538 pgcnt_t pgfree, i; 2539 page_t *pp; 2540 2541 VM_STAT_ADD(vmm_vmstats.ptcp[szc]); 2542 2543 2544 if ((ptcpthreshold == 0) || (flags & PGI_PGCPHIPRI)) 2545 goto skipptcpcheck; 2546 /* 2547 * check if there are sufficient free pages available before attempting 2548 * to trylock. Count is approximate as page counters can change. 2549 */ 2550 pgfree = page_freecnt(mnode, spp, szc); 2551 2552 /* attempt to trylock if there are sufficient already free pages */ 2553 if (pgfree < pgcnt/ptcpthreshold) { 2554 VM_STAT_ADD(vmm_vmstats.ptcpfreethresh[szc]); 2555 return (0); 2556 } 2557 2558 skipptcpcheck: 2559 2560 for (i = 0; i < pgcnt; i++) { 2561 pp = &spp[i]; 2562 if (!page_trylock(pp, SE_EXCL)) { 2563 VM_STAT_ADD(vmm_vmstats.ptcpfailexcl[szc]); 2564 while (--i != (pgcnt_t)-1) { 2565 pp = &spp[i]; 2566 ASSERT(PAGE_EXCL(pp)); 2567 page_unlock(pp); 2568 } 2569 return (0); 2570 } 2571 ASSERT(spp[i].p_pagenum == spp->p_pagenum + i); 2572 if ((pp->p_szc > szc || (szc && pp->p_szc == szc)) && 2573 !PP_ISFREE(pp)) { 2574 VM_STAT_ADD(vmm_vmstats.ptcpfailszc[szc]); 2575 ASSERT(i == 0); 2576 page_unlock(pp); 2577 return (0); 2578 } 2579 if (PP_ISNORELOC(pp)) { 2580 VM_STAT_ADD(vmm_vmstats.ptcpfailcage[szc]); 2581 while (i != (pgcnt_t)-1) { 2582 pp = &spp[i]; 2583 ASSERT(PAGE_EXCL(pp)); 2584 page_unlock(pp); 2585 i--; 2586 } 2587 return (0); 2588 } 2589 } 2590 VM_STAT_ADD(vmm_vmstats.ptcpok[szc]); 2591 return (1); 2592 } 2593 2594 /* 2595 * Claim large page pointed to by 'pp'. 'pp' is the starting set 2596 * of 'szc' constituent pages that had been locked exclusively previously. 2597 * Will attempt to relocate constituent pages in use. 
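 * Three cases are handled for each run of constituent pages: free
 * pages on the freelist (PP_ISAGED) are pulled via
 * page_list_sub_pages(), free cachelist pages are hashed out and
 * retagged, and in-use pages are relocated through
 * page_get_replacement_page()/do_page_relocate().  Any failure
 * unwinds by unlocking the unprocessed tail and freeing whatever
 * has already been collected on pplist.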
2598 */ 2599 static page_t * 2600 page_claim_contig_pages(page_t *pp, uchar_t szc, int flags) 2601 { 2602 spgcnt_t pgcnt, npgs, i; 2603 page_t *targpp, *rpp, *hpp; 2604 page_t *replpp = NULL; 2605 page_t *pplist = NULL; 2606 2607 ASSERT(pp != NULL); 2608 2609 pgcnt = page_get_pagecnt(szc); 2610 while (pgcnt) { 2611 ASSERT(PAGE_EXCL(pp)); 2612 ASSERT(!PP_ISNORELOC(pp)); 2613 if (PP_ISFREE(pp)) { 2614 /* 2615 * If this is a PG_FREE_LIST page then its 2616 * size code can change underneath us due to 2617 * page promotion or demotion. As an optimzation 2618 * use page_list_sub_pages() instead of 2619 * page_list_sub(). 2620 */ 2621 if (PP_ISAGED(pp)) { 2622 page_list_sub_pages(pp, szc); 2623 if (pp->p_szc == szc) { 2624 return (pp); 2625 } 2626 ASSERT(pp->p_szc < szc); 2627 npgs = page_get_pagecnt(pp->p_szc); 2628 hpp = pp; 2629 for (i = 0; i < npgs; i++, pp++) { 2630 pp->p_szc = szc; 2631 } 2632 page_list_concat(&pplist, &hpp); 2633 pgcnt -= npgs; 2634 continue; 2635 } 2636 ASSERT(!PP_ISAGED(pp)); 2637 ASSERT(pp->p_szc == 0); 2638 page_list_sub(pp, PG_CACHE_LIST); 2639 page_hashout(pp, NULL); 2640 PP_SETAGED(pp); 2641 pp->p_szc = szc; 2642 page_list_concat(&pplist, &pp); 2643 pp++; 2644 pgcnt--; 2645 continue; 2646 } 2647 npgs = page_get_pagecnt(pp->p_szc); 2648 2649 /* 2650 * page_create_wait freemem accounting done by caller of 2651 * page_get_freelist and not necessary to call it prior to 2652 * calling page_get_replacement_page. 2653 * 2654 * page_get_replacement_page can call page_get_contig_pages 2655 * to acquire a large page (szc > 0); the replacement must be 2656 * smaller than the contig page size to avoid looping or 2657 * szc == 0 and PGI_PGCPSZC0 is set. 2658 */ 2659 if (pp->p_szc < szc || (szc == 0 && (flags & PGI_PGCPSZC0))) { 2660 replpp = page_get_replacement_page(pp, NULL, 0); 2661 if (replpp) { 2662 npgs = page_get_pagecnt(pp->p_szc); 2663 ASSERT(npgs <= pgcnt); 2664 targpp = pp; 2665 } 2666 } 2667 2668 /* 2669 * If replacement is NULL or do_page_relocate fails, fail 2670 * coalescing of pages. 2671 */ 2672 if (replpp == NULL || (do_page_relocate(&targpp, &replpp, 0, 2673 &npgs, NULL) != 0)) { 2674 /* 2675 * Unlock un-processed target list 2676 */ 2677 while (pgcnt--) { 2678 ASSERT(PAGE_EXCL(pp)); 2679 page_unlock(pp); 2680 pp++; 2681 } 2682 /* 2683 * Free the processed target list. 
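 * These pages were already retagged with p_szc = szc and strung on
 * pplist, so each one is reset to p_szc = 0, returned to the tail
 * of the PAGESIZE freelist and unlocked before we bail out.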
2684 */ 2685 while (pplist) { 2686 pp = pplist; 2687 page_sub(&pplist, pp); 2688 ASSERT(PAGE_EXCL(pp)); 2689 ASSERT(pp->p_szc == szc); 2690 ASSERT(PP_ISFREE(pp)); 2691 ASSERT(PP_ISAGED(pp)); 2692 pp->p_szc = 0; 2693 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL); 2694 page_unlock(pp); 2695 } 2696 2697 if (replpp != NULL) 2698 page_free_replacement_page(replpp); 2699 2700 return (NULL); 2701 } 2702 ASSERT(pp == targpp); 2703 2704 /* LINTED */ 2705 ASSERT(hpp = pp); /* That's right, it's an assignment */ 2706 2707 pp += npgs; 2708 pgcnt -= npgs; 2709 2710 while (npgs--) { 2711 ASSERT(PAGE_EXCL(targpp)); 2712 ASSERT(!PP_ISFREE(targpp)); 2713 ASSERT(!PP_ISNORELOC(targpp)); 2714 PP_SETFREE(targpp); 2715 ASSERT(PP_ISAGED(targpp)); 2716 ASSERT(targpp->p_szc < szc || (szc == 0 && 2717 (flags & PGI_PGCPSZC0))); 2718 targpp->p_szc = szc; 2719 targpp = targpp->p_next; 2720 2721 rpp = replpp; 2722 ASSERT(rpp != NULL); 2723 page_sub(&replpp, rpp); 2724 ASSERT(PAGE_EXCL(rpp)); 2725 ASSERT(!PP_ISFREE(rpp)); 2726 page_unlock(rpp); 2727 } 2728 ASSERT(targpp == hpp); 2729 ASSERT(replpp == NULL); 2730 page_list_concat(&pplist, &targpp); 2731 } 2732 CHK_LPG(pplist, szc); 2733 return (pplist); 2734 } 2735 2736 /* 2737 * Trim kernel cage from pfnlo-pfnhi and store result in lo-hi. Return code 2738 * of 0 means nothing left after trim. 2739 */ 2740 2741 int 2742 trimkcage(struct memseg *mseg, pfn_t *lo, pfn_t *hi, pfn_t pfnlo, pfn_t pfnhi) 2743 { 2744 pfn_t kcagepfn; 2745 int decr; 2746 int rc = 0; 2747 2748 if (PP_ISNORELOC(mseg->pages)) { 2749 if (PP_ISNORELOC(mseg->epages - 1) == 0) { 2750 2751 /* lower part of this mseg inside kernel cage */ 2752 decr = kcage_current_pfn(&kcagepfn); 2753 2754 /* kernel cage may have transitioned past mseg */ 2755 if (kcagepfn >= mseg->pages_base && 2756 kcagepfn < mseg->pages_end) { 2757 ASSERT(decr == 0); 2758 *lo = kcagepfn; 2759 *hi = MIN(pfnhi, 2760 (mseg->pages_end - 1)); 2761 rc = 1; 2762 } 2763 } 2764 /* else entire mseg in the cage */ 2765 } else { 2766 if (PP_ISNORELOC(mseg->epages - 1)) { 2767 2768 /* upper part of this mseg inside kernel cage */ 2769 decr = kcage_current_pfn(&kcagepfn); 2770 2771 /* kernel cage may have transitioned past mseg */ 2772 if (kcagepfn >= mseg->pages_base && 2773 kcagepfn < mseg->pages_end) { 2774 ASSERT(decr); 2775 *hi = kcagepfn; 2776 *lo = MAX(pfnlo, mseg->pages_base); 2777 rc = 1; 2778 } 2779 } else { 2780 /* entire mseg outside of kernel cage */ 2781 *lo = MAX(pfnlo, mseg->pages_base); 2782 *hi = MIN(pfnhi, (mseg->pages_end - 1)); 2783 rc = 1; 2784 } 2785 } 2786 return (rc); 2787 } 2788 2789 /* 2790 * called from page_get_contig_pages to search 'pfnlo' thru 'pfnhi' to "claim" a 2791 * page with size code 'szc'. Claiming such a page requires acquiring 2792 * exclusive locks on all constituent pages (page_trylock_contig_pages), 2793 * relocating pages in use and concatenating these constituent pages into a 2794 * large page. 2795 * 2796 * The page lists do not have such a large page and page_freelist_fill has 2797 * already failed to demote larger pages and/or coalesce smaller free pages. 2798 * 2799 * 'flags' may specify PG_COLOR_MATCH which would limit the search of large 2800 * pages with the same color as 'bin'. 2801 * 2802 * 'pfnflag' specifies the subset of the pfn range to search. 
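 * For each memseg that overlaps the (possibly trimmed) pfn range,
 * the loop below picks a random szc-aligned starting point and
 * then steps through the memseg in increments of 'skip' (one large
 * page, or one large page of the requested color), wrapping at the
 * high end and giving up once it arrives back at the start.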
2803 */ 2804 2805 2806 static page_t * 2807 page_geti_contig_pages(int mnode, uint_t bin, uchar_t szc, int flags, 2808 pfn_t pfnlo, pfn_t pfnhi, int pfnflag) 2809 { 2810 struct memseg *mseg; 2811 pgcnt_t szcpgcnt = page_get_pagecnt(szc); 2812 pgcnt_t szcpgmask = szcpgcnt - 1; 2813 pfn_t randpfn; 2814 page_t *pp, *randpp, *endpp; 2815 uint_t colors; 2816 pfn_t hi, lo; 2817 uint_t skip; 2818 2819 ASSERT(szc != 0 || (flags & PGI_PGCPSZC0)); 2820 2821 if ((pfnhi - pfnlo) + 1 < szcpgcnt) 2822 return (NULL); 2823 2824 ASSERT(szc < mmu_page_sizes); 2825 2826 colors = (szc) ? page_convert_color(0, szc, page_colors - 1) + 1 : 2827 page_colors; 2828 2829 ASSERT(bin < colors); 2830 2831 /* 2832 * trim the pfn range to search based on pfnflag. pfnflag is set 2833 * when there have been previous page_get_contig_page failures to 2834 * limit the search. 2835 * 2836 * The high bit in pfnflag specifies the number of 'slots' in the 2837 * pfn range and the remainder of pfnflag specifies which slot. 2838 * For example, a value of 1010b would mean the second slot of 2839 * the pfn range that has been divided into 8 slots. 2840 */ 2841 if (pfnflag > 1) { 2842 int slots = 1 << (highbit(pfnflag) - 1); 2843 int slotid = pfnflag & (slots - 1); 2844 pgcnt_t szcpages; 2845 int slotlen; 2846 2847 pfnlo = P2ROUNDUP(pfnlo, szcpgcnt); 2848 pfnhi = pfnhi & ~(szcpgcnt - 1); 2849 2850 szcpages = ((pfnhi - pfnlo) + 1) / szcpgcnt; 2851 slotlen = howmany(szcpages, slots); 2852 pfnlo = pfnlo + (((slotid * slotlen) % szcpages) * szcpgcnt); 2853 ASSERT(pfnlo < pfnhi); 2854 if (pfnhi > pfnlo + (slotlen * szcpgcnt)) 2855 pfnhi = pfnlo + (slotlen * szcpgcnt); 2856 } 2857 2858 memsegs_lock(0); 2859 2860 /* 2861 * loop through memsegs to look for contig page candidates 2862 */ 2863 2864 for (mseg = memsegs; mseg != NULL; mseg = mseg->next) { 2865 if (pfnhi < mseg->pages_base || pfnlo >= mseg->pages_end) { 2866 /* no overlap */ 2867 continue; 2868 } 2869 2870 if (mseg->pages_end - mseg->pages_base < szcpgcnt) 2871 /* mseg too small */ 2872 continue; 2873 2874 /* trim off kernel cage pages from pfn range */ 2875 if (kcage_on) { 2876 if (trimkcage(mseg, &lo, &hi, pfnlo, pfnhi) == 0) 2877 continue; 2878 } else { 2879 lo = MAX(pfnlo, mseg->pages_base); 2880 hi = MIN(pfnhi, (mseg->pages_end - 1)); 2881 } 2882 2883 /* round to szcpgcnt boundaries */ 2884 lo = P2ROUNDUP(lo, szcpgcnt); 2885 hi = hi & ~(szcpgcnt - 1); 2886 2887 if (hi <= lo) 2888 continue; 2889 2890 /* 2891 * set lo to point to the pfn for the desired bin. Large 2892 * page sizes may only have a single page color 2893 */ 2894 if ((colors > 1) && (flags & PG_MATCH_COLOR)) { 2895 uint_t lobin; 2896 2897 /* 2898 * factor in colorequiv to check additional 2899 * 'equivalent' bins. 2900 */ 2901 if (colorequiv > 1 && colors > colorequiv) 2902 colors = colors / colorequiv; 2903 2904 /* determine bin that lo currently points to */ 2905 lobin = (lo & ((szcpgcnt * colors) - 1)) / szcpgcnt; 2906 2907 /* 2908 * set lo to point at appropriate color and set skip 2909 * to arrive at the next szc page of the same color. 
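 * For example (numbers assumed purely to show the arithmetic):
 * with szcpgcnt == 8 and colors == 4, pages of a given color recur
 * every colors * szcpgcnt == 32 pfns, so lo is advanced by
 * ((bin - lobin) & 3) * 8 pfns to land on the requested bin and
 * skip is set to 32.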
2910 */ 2911 lo += ((bin - lobin) & (colors - 1)) * szcpgcnt; 2912 2913 skip = colors * szcpgcnt; 2914 } else { 2915 /* check all pages starting from lo */ 2916 skip = szcpgcnt; 2917 } 2918 if (hi <= lo) 2919 /* mseg cannot satisfy color request */ 2920 continue; 2921 2922 /* randomly choose a point between lo and hi to begin search */ 2923 2924 randpfn = (pfn_t)GETTICK(); 2925 randpfn = ((randpfn % (hi - lo)) + lo) & ~(skip - 1); 2926 randpp = mseg->pages + (randpfn - mseg->pages_base); 2927 2928 ASSERT(randpp->p_pagenum == randpfn); 2929 2930 pp = randpp; 2931 endpp = mseg->pages + (hi - mseg->pages_base); 2932 2933 ASSERT(randpp + szcpgcnt <= endpp); 2934 2935 do { 2936 ASSERT(!(pp->p_pagenum & szcpgmask)); 2937 ASSERT((flags & PG_MATCH_COLOR) == 0 || 2938 colorequiv > 1 || 2939 PP_2_BIN(pp) == bin); 2940 if (page_trylock_contig_pages(mnode, pp, szc, flags)) { 2941 /* pages unlocked by page_claim on failure */ 2942 if (page_claim_contig_pages(pp, szc, flags)) { 2943 memsegs_unlock(0); 2944 return (pp); 2945 } 2946 } 2947 2948 pp += skip; 2949 if (pp >= endpp) { 2950 /* start from the beginning */ 2951 pp = mseg->pages + (lo - mseg->pages_base); 2952 ASSERT(pp->p_pagenum == lo); 2953 ASSERT(pp + szcpgcnt <= endpp); 2954 } 2955 } while (pp != randpp); 2956 } 2957 memsegs_unlock(0); 2958 return (NULL); 2959 } 2960 2961 2962 /* 2963 * controlling routine that searches through physical memory in an attempt to 2964 * claim a large page, based on the input parameters, that could not be found 2965 * on the page free lists. 2966 * 2967 * calls page_geti_contig_pages with an initial pfn range from the mnode 2968 * and mtype. page_geti_contig_pages will trim off the parts of the pfn range 2969 * that overlap with the kernel cage or do not match the requested page 2970 * color if PG_MATCH_COLOR is set. Since this search is very expensive, 2971 * page_geti_contig_pages may further limit the search range based on 2972 * previous failure counts (pgcpfailcnt[]). 2973 * 2974 * for PGI_PGCPSZC0 requests, page_get_contig_pages will relocate a base 2975 * pagesize page that satisfies mtype.
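 * Note that page_get_freelist() bumps pgcpfailcnt[szc] each time
 * this routine comes back empty handed, which shrinks the pfn
 * window page_geti_contig_pages will consider on the next attempt,
 * while a successful claim below halves the stored failure count.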
2976 */ 2977 page_t * 2978 page_get_contig_pages(int mnode, uint_t bin, int mtype, uchar_t szc, 2979 uint_t flags) 2980 { 2981 pfn_t pfnlo, pfnhi; /* contig pages pfn range */ 2982 page_t *pp; 2983 int pfnflag = 0; /* no limit on search if 0 */ 2984 2985 VM_STAT_ADD(vmm_vmstats.pgcp_alloc[szc]); 2986 2987 /* LINTED */ 2988 MTYPE_START(mnode, mtype, flags); 2989 if (mtype < 0) { /* mnode does not have memory in mtype range */ 2990 VM_STAT_ADD(vmm_vmstats.pgcp_allocempty[szc]); 2991 return (NULL); 2992 } 2993 2994 ASSERT(szc > 0 || (flags & PGI_PGCPSZC0)); 2995 2996 /* no allocations from cage */ 2997 flags |= PGI_NOCAGE; 2998 2999 /* do not limit search and ignore color if hi pri */ 3000 3001 if (pgcplimitsearch && ((flags & PGI_PGCPHIPRI) == 0)) 3002 pfnflag = pgcpfailcnt[szc]; 3003 3004 /* remove color match to improve chances */ 3005 3006 if (flags & PGI_PGCPHIPRI || pfnflag) 3007 flags &= ~PG_MATCH_COLOR; 3008 3009 do { 3010 /* get pfn range based on mnode and mtype */ 3011 MNODETYPE_2_PFN(mnode, mtype, pfnlo, pfnhi); 3012 3013 ASSERT(pfnhi >= pfnlo); 3014 3015 pp = page_geti_contig_pages(mnode, bin, szc, flags, 3016 pfnlo, pfnhi, pfnflag); 3017 3018 if (pp != NULL) { 3019 pfnflag = pgcpfailcnt[szc]; 3020 if (pfnflag) { 3021 /* double the search size */ 3022 pgcpfailcnt[szc] = pfnflag >> 1; 3023 } 3024 VM_STAT_ADD(vmm_vmstats.pgcp_allocok[szc]); 3025 return (pp); 3026 } 3027 MTYPE_NEXT(mnode, mtype, flags); 3028 } while (mtype >= 0); 3029 3030 VM_STAT_ADD(vmm_vmstats.pgcp_allocfailed[szc]); 3031 return (NULL); 3032 } 3033 3034 3035 /* 3036 * Find the `best' page on the freelist for this (vp,off) (as,vaddr) pair. 3037 * 3038 * Does its own locking and accounting. 3039 * If PG_MATCH_COLOR is set, then NULL will be returned if there are no 3040 * pages of the proper color even if there are pages of a different color. 3041 * 3042 * Finds a page, removes it, THEN locks it. 3043 */ 3044 3045 /*ARGSUSED*/ 3046 page_t * 3047 page_get_freelist(struct vnode *vp, u_offset_t off, struct seg *seg, 3048 caddr_t vaddr, size_t size, uint_t flags, struct lgrp *lgrp) 3049 { 3050 struct as *as = seg->s_as; 3051 page_t *pp = NULL; 3052 ulong_t bin; 3053 uchar_t szc; 3054 int mnode; 3055 int mtype; 3056 page_t *(*page_get_func)(int, uint_t, int, uchar_t, uint_t); 3057 lgrp_mnode_cookie_t lgrp_cookie; 3058 3059 page_get_func = page_get_mnode_freelist; 3060 3061 /* 3062 * If we aren't passed a specific lgroup, or passed a freed lgrp 3063 * assume we wish to allocate near to the current thread's home. 3064 */ 3065 if (!LGRP_EXISTS(lgrp)) 3066 lgrp = lgrp_home_lgrp(); 3067 3068 if (kcage_on) { 3069 if ((flags & (PG_NORELOC | PG_PANIC)) == PG_NORELOC && 3070 kcage_freemem < kcage_throttlefree + btop(size) && 3071 curthread != kcage_cageout_thread) { 3072 /* 3073 * Set a "reserve" of kcage_throttlefree pages for 3074 * PG_PANIC and cageout thread allocations. 3075 * 3076 * Everybody else has to serialize in 3077 * page_create_get_something() to get a cage page, so 3078 * that we don't deadlock cageout! 3079 */ 3080 return (NULL); 3081 } 3082 } else { 3083 flags &= ~PG_NORELOC; 3084 flags |= PGI_NOCAGE; 3085 } 3086 3087 /* LINTED */ 3088 MTYPE_INIT(mtype, vp, vaddr, flags); 3089 3090 /* 3091 * Convert size to page size code. 
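 * page_szc() maps a supported page size to its size code and
 * returns -1 for anything else, which the panic below treats as a
 * programming error.  (Mapping shown for illustration only, the
 * real table is MMU specific: e.g. 8K -> 0, 64K -> 1, 512K -> 2,
 * 4M -> 3 on a sun4u style MMU.)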
3092 */ 3093 if ((szc = page_szc(size)) == (uchar_t)-1) 3094 panic("page_get_freelist: illegal page size request"); 3095 ASSERT(szc < mmu_page_sizes); 3096 3097 VM_STAT_ADD(vmm_vmstats.pgf_alloc[szc]); 3098 3099 /* LINTED */ 3100 AS_2_BIN(as, seg, vp, vaddr, bin); 3101 3102 /* bin is for base pagesize color - convert if larger pagesize. */ 3103 if (szc) 3104 bin = page_convert_color(0, szc, bin); 3105 3106 /* 3107 * Try to get a local page first, but try remote if we can't 3108 * get a page of the right color. 3109 */ 3110 pgretry: 3111 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL); 3112 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { 3113 pp = page_get_func(mnode, bin, mtype, szc, flags); 3114 if (pp != NULL) { 3115 VM_STAT_ADD(vmm_vmstats.pgf_allocok[szc]); 3116 DTRACE_PROBE4(page__get, 3117 lgrp_t *, lgrp, 3118 int, mnode, 3119 ulong_t, bin, 3120 uint_t, flags); 3121 return (pp); 3122 } 3123 } 3124 ASSERT(pp == NULL); 3125 3126 /* 3127 * for non-SZC0 PAGESIZE requests, check cachelist before checking 3128 * remote free lists. Caller expected to call page_get_cachelist which 3129 * will check local cache lists and remote free lists. 3130 */ 3131 if (szc == 0 && ((flags & PGI_PGCPSZC0) == 0)) { 3132 VM_STAT_ADD(vmm_vmstats.pgf_allocdeferred); 3133 return (NULL); 3134 } 3135 3136 ASSERT(szc > 0 || (flags & PGI_PGCPSZC0)); 3137 3138 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1); 3139 3140 /* 3141 * Try to get a non-local freelist page. 3142 */ 3143 LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie); 3144 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { 3145 pp = page_get_func(mnode, bin, mtype, szc, flags); 3146 if (pp != NULL) { 3147 DTRACE_PROBE4(page__get, 3148 lgrp_t *, lgrp, 3149 int, mnode, 3150 ulong_t, bin, 3151 uint_t, flags); 3152 VM_STAT_ADD(vmm_vmstats.pgf_allocokrem[szc]); 3153 return (pp); 3154 } 3155 } 3156 3157 ASSERT(pp == NULL); 3158 3159 /* 3160 * when the cage is off chances are page_get_contig_pages() will fail 3161 * to lock a large page chunk therefore when the cage is off it's not 3162 * called by default. this can be changed via /etc/system. 3163 * 3164 * page_get_contig_pages() also called to acquire a base pagesize page 3165 * for page_create_get_something(). 3166 */ 3167 if (!(flags & PG_NORELOC) && (pg_contig_disable == 0) && 3168 (kcage_on || pg_lpgcreate_nocage || szc == 0) && 3169 (page_get_func != page_get_contig_pages)) { 3170 3171 VM_STAT_ADD(vmm_vmstats.pgf_allocretry[szc]); 3172 page_get_func = page_get_contig_pages; 3173 goto pgretry; 3174 } 3175 3176 if (pgcplimitsearch && page_get_func == page_get_contig_pages) 3177 pgcpfailcnt[szc]++; 3178 3179 VM_STAT_ADD(vmm_vmstats.pgf_allocfailed[szc]); 3180 return (NULL); 3181 } 3182 3183 /* 3184 * Find the `best' page on the cachelist for this (vp,off) (as,vaddr) pair. 3185 * 3186 * Does its own locking. 3187 * If PG_MATCH_COLOR is set, then NULL will be returned if there are no 3188 * pages of the proper color even if there are pages of a different color. 3189 * Otherwise, scan the bins for ones with pages. For each bin with pages, 3190 * try to lock one of them. If no page can be locked, try the 3191 * next bin. Return NULL if a page can not be found and locked. 3192 * 3193 * Finds a pages, trys to lock it, then removes it. 
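 * The search order below is: local cachelists (walking memnodes of
 * the lgroup via lgrp_memnode_choose), then, after upgrading the
 * lgroup cookie, remote freelists and remote cachelists.  A
 * PG_NORELOC request may be throttled up front when cage freemem
 * is at or below kcage_throttlefree.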
3194 */ 3195 3196 /*ARGSUSED*/ 3197 page_t * 3198 page_get_cachelist(struct vnode *vp, u_offset_t off, struct seg *seg, 3199 caddr_t vaddr, uint_t flags, struct lgrp *lgrp) 3200 { 3201 page_t *pp; 3202 struct as *as = seg->s_as; 3203 ulong_t bin; 3204 /*LINTED*/ 3205 int mnode; 3206 int mtype; 3207 lgrp_mnode_cookie_t lgrp_cookie; 3208 3209 /* 3210 * If we aren't passed a specific lgroup, or pasased a freed lgrp 3211 * assume we wish to allocate near to the current thread's home. 3212 */ 3213 if (!LGRP_EXISTS(lgrp)) 3214 lgrp = lgrp_home_lgrp(); 3215 3216 if (!kcage_on) { 3217 flags &= ~PG_NORELOC; 3218 flags |= PGI_NOCAGE; 3219 } 3220 3221 if ((flags & (PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == PG_NORELOC && 3222 kcage_freemem <= kcage_throttlefree) { 3223 /* 3224 * Reserve kcage_throttlefree pages for critical kernel 3225 * threads. 3226 * 3227 * Everybody else has to go to page_create_get_something() 3228 * to get a cage page, so we don't deadlock cageout. 3229 */ 3230 return (NULL); 3231 } 3232 3233 /* LINTED */ 3234 AS_2_BIN(as, seg, vp, vaddr, bin); 3235 3236 ASSERT(bin <= page_colors_mask); 3237 3238 /* LINTED */ 3239 MTYPE_INIT(mtype, vp, vaddr, flags); 3240 3241 VM_STAT_ADD(vmm_vmstats.pgc_alloc); 3242 3243 /* 3244 * Try local cachelists first 3245 */ 3246 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL); 3247 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { 3248 pp = page_get_mnode_cachelist(bin, flags, mnode, mtype); 3249 if (pp != NULL) { 3250 VM_STAT_ADD(vmm_vmstats.pgc_allocok); 3251 DTRACE_PROBE4(page__get, 3252 lgrp_t *, lgrp, 3253 int, mnode, 3254 ulong_t, bin, 3255 uint_t, flags); 3256 return (pp); 3257 } 3258 } 3259 3260 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1); 3261 3262 /* 3263 * Try freelists/cachelists that are farther away 3264 * This is our only chance to allocate remote pages for PAGESIZE 3265 * requests. 3266 */ 3267 LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie); 3268 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { 3269 pp = page_get_mnode_freelist(mnode, bin, mtype, 3270 0, flags); 3271 if (pp != NULL) { 3272 VM_STAT_ADD(vmm_vmstats.pgc_allocokdeferred); 3273 DTRACE_PROBE4(page__get, 3274 lgrp_t *, lgrp, 3275 int, mnode, 3276 ulong_t, bin, 3277 uint_t, flags); 3278 return (pp); 3279 } 3280 pp = page_get_mnode_cachelist(bin, flags, mnode, mtype); 3281 if (pp != NULL) { 3282 VM_STAT_ADD(vmm_vmstats.pgc_allocokrem); 3283 DTRACE_PROBE4(page__get, 3284 lgrp_t *, lgrp, 3285 int, mnode, 3286 ulong_t, bin, 3287 uint_t, flags); 3288 return (pp); 3289 } 3290 } 3291 3292 VM_STAT_ADD(vmm_vmstats.pgc_allocfailed); 3293 return (NULL); 3294 } 3295 3296 page_t * 3297 page_get_mnode_cachelist(uint_t bin, uint_t flags, int mnode, int mtype) 3298 { 3299 kmutex_t *pcm; 3300 int i; 3301 page_t *pp; 3302 page_t *first_pp; 3303 uint_t bin_marker; 3304 int nwaybins, nwaycnt; 3305 int cpucolors; 3306 3307 VM_STAT_ADD(vmm_vmstats.pgmc_alloc); 3308 3309 /* LINTED */ 3310 MTYPE_START(mnode, mtype, flags); 3311 if (mtype < 0) { /* mnode does not have memory in mtype range */ 3312 VM_STAT_ADD(vmm_vmstats.pgmc_allocempty); 3313 return (NULL); 3314 } 3315 3316 nwaybins = 0; 3317 cpucolors = cpu_page_colors; 3318 /* 3319 * adjust cpucolors to possibly check additional 'equivalent' bins 3320 * to try to minimize fragmentation of large pages by delaying calls 3321 * to page_freelist_fill. 
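 * This mirrors the adjustment made in page_get_mnode_freelist();
 * the difference here is that cachelist pages are always PAGESIZE
 * (p_szc == 0 is asserted below), so there is no
 * page_freelist_fill() fallback, only bin and mtype cycling.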
3322 */ 3323 if (colorequiv > 1) { 3324 int equivcolors = page_colors / colorequiv; 3325 3326 if (equivcolors && (cpucolors == 0 || equivcolors < cpucolors)) 3327 cpucolors = equivcolors; 3328 } 3329 3330 /* 3331 * Only hold one cachelist lock at a time, that way we 3332 * can start anywhere and not have to worry about lock 3333 * ordering. 3334 */ 3335 3336 big_try_again: 3337 nwaycnt = 0; 3338 for (i = 0; i <= page_colors; i++) { 3339 if (PAGE_CACHELISTS(mnode, bin, mtype)) { 3340 pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST); 3341 mutex_enter(pcm); 3342 pp = PAGE_CACHELISTS(mnode, bin, mtype); 3343 if (pp != NULL) { 3344 first_pp = pp; 3345 ASSERT(pp->p_vnode); 3346 ASSERT(PP_ISAGED(pp) == 0); 3347 ASSERT(pp->p_szc == 0); 3348 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode); 3349 while (!page_trylock(pp, SE_EXCL)) { 3350 pp = pp->p_next; 3351 ASSERT(pp->p_szc == 0); 3352 if (pp == first_pp) { 3353 /* 3354 * We have searched the 3355 * complete list! 3356 * And all of them (might 3357 * only be one) are locked. 3358 * This can happen since 3359 * these pages can also be 3360 * found via the hash list. 3361 * When found via the hash 3362 * list, they are locked 3363 * first, then removed. 3364 * We give up to let the 3365 * other thread run. 3366 */ 3367 pp = NULL; 3368 break; 3369 } 3370 ASSERT(pp->p_vnode); 3371 ASSERT(PP_ISFREE(pp)); 3372 ASSERT(PP_ISAGED(pp) == 0); 3373 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == 3374 mnode); 3375 } 3376 3377 if (pp) { 3378 page_t **ppp; 3379 /* 3380 * Found and locked a page. 3381 * Pull it off the list. 3382 */ 3383 ASSERT(mtype == PP_2_MTYPE(pp)); 3384 ppp = &PAGE_CACHELISTS(mnode, bin, 3385 mtype); 3386 page_sub(ppp, pp); 3387 /* 3388 * Subtract counters before releasing 3389 * pcm mutex to avoid a race with 3390 * page_freelist_coalesce and 3391 * page_freelist_fill. 3392 */ 3393 page_ctr_sub(mnode, mtype, pp, 3394 PG_CACHE_LIST); 3395 mutex_exit(pcm); 3396 ASSERT(pp->p_vnode); 3397 ASSERT(PP_ISAGED(pp) == 0); 3398 #if defined(__sparc) 3399 ASSERT(!kcage_on || 3400 (flags & PG_NORELOC) == 0 || 3401 PP_ISNORELOC(pp)); 3402 if (PP_ISNORELOC(pp)) { 3403 kcage_freemem_sub(1); 3404 } 3405 #endif 3406 VM_STAT_ADD(vmm_vmstats. 3407 pgmc_allocok); 3408 return (pp); 3409 } 3410 } 3411 mutex_exit(pcm); 3412 } 3413 3414 /* 3415 * Wow! The initial bin is empty or no page in the bin could 3416 * be locked. 3417 * 3418 * If specific color is needed, check if page color may be in 3419 * other bins. 
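 * As in the freelist path, nwaybins is derived from the ratio of
 * page_colors to cpucolors (computing cpucolors from CPUSETSIZE()
 * on heterogeneous e$ configurations), and the bin is advanced by
 * page_colors / nwaybins until all equivalent bins have been tried.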
3420 */ 3421 if ((flags & PG_MATCH_COLOR) && (cpucolors != 0)) { 3422 if (!nwaybins) { 3423 if (cpucolors < 0) { 3424 cpucolors = CPUSETSIZE() / MMU_PAGESIZE; 3425 ASSERT(cpucolors > 0); 3426 nwaybins = page_colors / cpucolors; 3427 if (nwaybins < 2) 3428 cpucolors = 0; 3429 } else { 3430 nwaybins = page_colors / cpucolors; 3431 ASSERT(nwaybins > 1); 3432 } 3433 } 3434 3435 if (++nwaycnt >= nwaybins) { 3436 break; 3437 } 3438 bin = (bin + (page_colors / nwaybins)) & 3439 page_colors_mask; 3440 continue; 3441 } 3442 3443 if (i == 0) { 3444 bin = (bin + BIN_STEP) & page_colors_mask; 3445 bin_marker = bin; 3446 } else { 3447 bin = (bin + vac_colors) & page_colors_mask; 3448 if (bin == bin_marker) { 3449 bin = (bin + 1) & page_colors_mask; 3450 bin_marker = bin; 3451 } 3452 } 3453 } 3454 3455 MTYPE_NEXT(mnode, mtype, flags); 3456 if (mtype >= 0) 3457 goto big_try_again; 3458 3459 VM_STAT_ADD(vmm_vmstats.pgmc_allocfailed); 3460 return (NULL); 3461 } 3462 3463 #ifdef DEBUG 3464 #define REPL_PAGE_STATS 3465 #endif /* DEBUG */ 3466 3467 #ifdef REPL_PAGE_STATS 3468 struct repl_page_stats { 3469 uint_t ngets; 3470 uint_t ngets_noreloc; 3471 uint_t npgr_noreloc; 3472 uint_t nnopage_first; 3473 uint_t nnopage; 3474 uint_t nhashout; 3475 uint_t nnofree; 3476 uint_t nnext_pp; 3477 } repl_page_stats; 3478 #define REPL_STAT_INCR(v) atomic_add_32(&repl_page_stats.v, 1) 3479 #else /* REPL_PAGE_STATS */ 3480 #define REPL_STAT_INCR(v) 3481 #endif /* REPL_PAGE_STATS */ 3482 3483 int pgrppgcp; 3484 3485 /* 3486 * The freemem accounting must be done by the caller. 3487 * First we try to get a replacement page of the same size as like_pp, 3488 * if that is not possible, then we just get a set of discontiguous 3489 * PAGESIZE pages. 3490 */ 3491 page_t * 3492 page_get_replacement_page(page_t *orig_like_pp, struct lgrp *lgrp_target, 3493 uint_t pgrflags) 3494 { 3495 page_t *like_pp; 3496 page_t *pp, *pplist; 3497 page_t *pl = NULL; 3498 ulong_t bin; 3499 int mnode, page_mnode; 3500 int szc; 3501 spgcnt_t npgs, pg_cnt; 3502 pfn_t pfnum; 3503 int mtype; 3504 int flags = 0; 3505 lgrp_mnode_cookie_t lgrp_cookie; 3506 lgrp_t *lgrp; 3507 3508 REPL_STAT_INCR(ngets); 3509 like_pp = orig_like_pp; 3510 ASSERT(PAGE_EXCL(like_pp)); 3511 3512 szc = like_pp->p_szc; 3513 npgs = page_get_pagecnt(szc); 3514 /* 3515 * Now we reset like_pp to the base page_t. 3516 * That way, we won't walk past the end of this 'szc' page. 3517 */ 3518 pfnum = PFN_BASE(like_pp->p_pagenum, szc); 3519 like_pp = page_numtopp_nolock(pfnum); 3520 ASSERT(like_pp->p_szc == szc); 3521 3522 if (PP_ISNORELOC(like_pp)) { 3523 ASSERT(kcage_on); 3524 REPL_STAT_INCR(ngets_noreloc); 3525 flags = PGI_RELOCONLY; 3526 } else if (pgrflags & PGR_NORELOC) { 3527 ASSERT(kcage_on); 3528 REPL_STAT_INCR(npgr_noreloc); 3529 flags = PG_NORELOC; 3530 } 3531 3532 /* 3533 * Kernel pages must always be replaced with the same size 3534 * pages, since we cannot properly handle demotion of kernel 3535 * pages. 3536 */ 3537 if (like_pp->p_vnode == &kvp) 3538 pgrflags |= PGR_SAMESZC; 3539 3540 /* LINTED */ 3541 MTYPE_PGR_INIT(mtype, flags, like_pp, page_mnode); 3542 3543 while (npgs) { 3544 pplist = NULL; 3545 for (;;) { 3546 pg_cnt = page_get_pagecnt(szc); 3547 bin = PP_2_BIN(like_pp); 3548 ASSERT(like_pp->p_szc == orig_like_pp->p_szc); 3549 ASSERT(pg_cnt <= npgs); 3550 3551 /* 3552 * If an lgroup was specified, try to get the 3553 * page from that lgroup. 
3554 * NOTE: Must be careful with code below because 3555 * lgroup may disappear and reappear since there 3556 * is no locking for lgroup here. 3557 */ 3558 if (LGRP_EXISTS(lgrp_target)) { 3559 /* 3560 * Keep local variable for lgroup separate 3561 * from lgroup argument since this code should 3562 * only be exercised when lgroup argument 3563 * exists.... 3564 */ 3565 lgrp = lgrp_target; 3566 3567 /* Try the lgroup's freelists first */ 3568 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, 3569 LGRP_SRCH_LOCAL); 3570 while ((pplist == NULL) && 3571 (mnode = lgrp_memnode_choose(&lgrp_cookie)) 3572 != -1) { 3573 pplist = page_get_mnode_freelist( 3574 mnode, bin, mtype, szc, 3575 flags); 3576 } 3577 3578 /* 3579 * Now try it's cachelists if this is a 3580 * small page. Don't need to do it for 3581 * larger ones since page_freelist_coalesce() 3582 * already failed. 3583 */ 3584 if (pplist != NULL || szc != 0) 3585 break; 3586 3587 /* Now try it's cachelists */ 3588 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, 3589 LGRP_SRCH_LOCAL); 3590 3591 while ((pplist == NULL) && 3592 (mnode = lgrp_memnode_choose(&lgrp_cookie)) 3593 != -1) { 3594 pplist = page_get_mnode_cachelist( 3595 bin, flags, mnode, mtype); 3596 } 3597 if (pplist != NULL) { 3598 page_hashout(pplist, NULL); 3599 PP_SETAGED(pplist); 3600 REPL_STAT_INCR(nhashout); 3601 break; 3602 } 3603 /* Done looking in this lgroup. Bail out. */ 3604 break; 3605 } 3606 3607 /* 3608 * No lgroup was specified (or lgroup was removed by 3609 * DR, so just try to get the page as close to 3610 * like_pp's mnode as possible. 3611 * First try the local freelist... 3612 */ 3613 mnode = PP_2_MEM_NODE(like_pp); 3614 pplist = page_get_mnode_freelist(mnode, bin, 3615 mtype, szc, flags); 3616 if (pplist != NULL) 3617 break; 3618 3619 REPL_STAT_INCR(nnofree); 3620 3621 /* 3622 * ...then the local cachelist. Don't need to do it for 3623 * larger pages cause page_freelist_coalesce() already 3624 * failed there anyway. 3625 */ 3626 if (szc == 0) { 3627 pplist = page_get_mnode_cachelist(bin, flags, 3628 mnode, mtype); 3629 if (pplist != NULL) { 3630 page_hashout(pplist, NULL); 3631 PP_SETAGED(pplist); 3632 REPL_STAT_INCR(nhashout); 3633 break; 3634 } 3635 } 3636 3637 /* Now try remote freelists */ 3638 page_mnode = mnode; 3639 lgrp = 3640 lgrp_hand_to_lgrp(MEM_NODE_2_LGRPHAND(page_mnode)); 3641 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, 3642 LGRP_SRCH_HIER); 3643 while (pplist == NULL && 3644 (mnode = lgrp_memnode_choose(&lgrp_cookie)) 3645 != -1) { 3646 /* 3647 * Skip local mnode. 3648 */ 3649 if ((mnode == page_mnode) || 3650 (mem_node_config[mnode].exists == 0)) 3651 continue; 3652 3653 pplist = page_get_mnode_freelist(mnode, 3654 bin, mtype, szc, flags); 3655 } 3656 3657 if (pplist != NULL) 3658 break; 3659 3660 3661 /* Now try remote cachelists */ 3662 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, 3663 LGRP_SRCH_HIER); 3664 while (pplist == NULL && szc == 0) { 3665 mnode = lgrp_memnode_choose(&lgrp_cookie); 3666 if (mnode == -1) 3667 break; 3668 /* 3669 * Skip local mnode. 3670 */ 3671 if ((mnode == page_mnode) || 3672 (mem_node_config[mnode].exists == 0)) 3673 continue; 3674 3675 pplist = page_get_mnode_cachelist(bin, 3676 flags, mnode, mtype); 3677 3678 if (pplist != NULL) { 3679 page_hashout(pplist, NULL); 3680 PP_SETAGED(pplist); 3681 REPL_STAT_INCR(nhashout); 3682 break; 3683 } 3684 } 3685 3686 /* 3687 * Break out of while loop under the following cases: 3688 * - If we successfully got a page. 
3689 * - If pgrflags specified only returning a specific 3690 * page size and we could not find that page size. 3691 * - If we could not satisfy the request with PAGESIZE 3692 * or larger pages. 3693 */ 3694 if (pplist != NULL || szc == 0) 3695 break; 3696 3697 if ((pgrflags & PGR_SAMESZC) || pgrppgcp) { 3698 /* try to find contig page */ 3699 3700 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, 3701 LGRP_SRCH_HIER); 3702 3703 while ((pplist == NULL) && 3704 (mnode = 3705 lgrp_memnode_choose(&lgrp_cookie)) 3706 != -1) { 3707 pplist = page_get_contig_pages( 3708 mnode, bin, mtype, szc, 3709 flags | PGI_PGCPHIPRI); 3710 } 3711 break; 3712 } 3713 3714 /* 3715 * The correct thing to do here is try the next 3716 * page size down using szc--. Due to a bug 3717 * with the processing of HAT_RELOAD_SHARE 3718 * where the sfmmu_ttecnt arrays of all 3719 * hats sharing an ISM segment don't get updated, 3720 * using intermediate size pages for relocation 3721 * can lead to continuous page faults. 3722 */ 3723 szc = 0; 3724 } 3725 3726 if (pplist != NULL) { 3727 DTRACE_PROBE4(page__get, 3728 lgrp_t *, lgrp, 3729 int, mnode, 3730 ulong_t, bin, 3731 uint_t, flags); 3732 3733 while (pplist != NULL && pg_cnt--) { 3734 ASSERT(pplist != NULL); 3735 pp = pplist; 3736 page_sub(&pplist, pp); 3737 PP_CLRFREE(pp); 3738 PP_CLRAGED(pp); 3739 page_list_concat(&pl, &pp); 3740 npgs--; 3741 like_pp = like_pp + 1; 3742 REPL_STAT_INCR(nnext_pp); 3743 } 3744 ASSERT(pg_cnt == 0); 3745 } else { 3746 break; 3747 } 3748 } 3749 3750 if (npgs) { 3751 /* 3752 * We were unable to allocate the necessary number 3753 * of pages. 3754 * We need to free up any pages already gathered on pl. 3755 */ 3756 REPL_STAT_INCR(nnopage); 3757 page_free_replacement_page(pl); 3758 return (NULL); 3759 } else { 3760 return (pl); 3761 } 3762 } 3763 3764 /* 3765 * demote a free large page to its constituent pages 3766 */ 3767 void 3768 page_demote_free_pages(page_t *pp) 3769 { 3770 3771 int mnode; 3772 3773 ASSERT(pp != NULL); 3774 ASSERT(PAGE_LOCKED(pp)); 3775 ASSERT(PP_ISFREE(pp)); 3776 ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes); 3777 3778 mnode = PP_2_MEM_NODE(pp); 3779 page_freelist_lock(mnode); 3780 if (pp->p_szc != 0) { 3781 (void) page_demote(mnode, PFN_BASE(pp->p_pagenum, 3782 pp->p_szc), pp->p_szc, 0, PC_NO_COLOR, PC_FREE); 3783 } 3784 page_freelist_unlock(mnode); 3785 ASSERT(pp->p_szc == 0); 3786 } 3787
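/*
 * Illustrative usage sketch only (not part of the interfaces above):
 * a caller holding an exclusively locked, free large page that wants
 * its constituent PAGESIZE pages back on the freelists would do
 *
 *	ASSERT(PAGE_LOCKED(pp) && PP_ISFREE(pp) && pp->p_szc != 0);
 *	page_demote_free_pages(pp);
 *	ASSERT(pp->p_szc == 0);
 *
 * page_demote_free_pages() acquires the mnode freelist lock itself,
 * so the caller supplies only the page lock.
 */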