1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 28 /* All Rights Reserved */ 29 30 /* 31 * Portions of this source code were derived from Berkeley 4.3 BSD 32 * under license from the Regents of the University of California. 33 */ 34 35 #pragma ident "%Z%%M% %I% %E% SMI" 36 37 /* 38 * This file contains common functions to access and manage the page lists. 39 * Many of these routines originated from platform dependent modules 40 * (sun4/vm/vm_dep.c, i86pc/vm/vm_machdep.c) and modified to function in 41 * a platform independent manner. 42 * 43 * vm/vm_dep.h provides for platform specific support. 44 */ 45 46 #include <sys/types.h> 47 #include <sys/debug.h> 48 #include <sys/cmn_err.h> 49 #include <sys/systm.h> 50 #include <sys/atomic.h> 51 #include <sys/sysmacros.h> 52 #include <vm/as.h> 53 #include <vm/page.h> 54 #include <vm/seg_kmem.h> 55 #include <vm/seg_vn.h> 56 #include <sys/memnode.h> 57 #include <vm/vm_dep.h> 58 #include <sys/lgrp.h> 59 #include <sys/mem_config.h> 60 #include <sys/callb.h> 61 #include <sys/mem_cage.h> 62 #include <sys/sdt.h> 63 64 extern uint_t vac_colors; 65 66 /* 67 * number of page colors equivalent to reqested color in page_get routines. 68 * If set, keeps large pages intact longer and keeps MPO allocation 69 * from the local mnode in favor of acquiring the 'correct' page color from 70 * a demoted large page or from a remote mnode. 71 */ 72 int colorequiv; 73 74 /* 75 * if set, specifies the percentage of large pages that are free from within 76 * a large page region before attempting to lock those pages for 77 * page_get_contig_pages processing. 78 * 79 * Should be turned on when kpr is available when page_trylock_contig_pages 80 * can be more selective. 81 */ 82 83 int ptcpthreshold; 84 85 /* 86 * Limit page get contig page search based on failure cnts in pgcpfailcnt[]. 87 * use slot 0 (base page size unused) to enable or disable limiting search. 88 * Enabled by default. 89 */ 90 int pgcpfailcnt[MMU_PAGE_SIZES]; 91 int pgcplimitsearch = 1; 92 93 #ifdef VM_STATS 94 struct vmm_vmstats_str vmm_vmstats; 95 96 #endif /* VM_STATS */ 97 98 #if defined(__sparc) 99 #define LPGCREATE 0 100 #else 101 /* enable page_get_contig_pages */ 102 #define LPGCREATE 1 103 #endif 104 105 int pg_contig_disable; 106 int pg_lpgcreate_nocage = LPGCREATE; 107 108 /* 109 * page_freelist_fill pfn flag to signify no hi pfn requirement. 110 */ 111 #define PFNNULL 0 112 113 /* Flags involved in promotion and demotion routines */ 114 #define PC_FREE 0x1 /* put page on freelist */ 115 #define PC_ALLOC 0x2 /* return page for allocation */ 116 117 /* 118 * Flag for page_demote to be used with PC_FREE to denote that we don't care 119 * what the color is as the color parameter to the function is ignored. 120 */ 121 #define PC_NO_COLOR (-1) 122 123 /* 124 * page counters candidates info 125 * See page_ctrs_cands comment below for more details. 126 * fields are as follows: 127 * pcc_pages_free: # pages which freelist coalesce can create 128 * pcc_color_free_len: number of elements in pcc_color_free array 129 * pcc_color_free: pointer to page free counts per color 130 */ 131 typedef struct pcc_info { 132 pgcnt_t pcc_pages_free; 133 int pcc_color_free_len; 134 pgcnt_t *pcc_color_free; 135 } pcc_info_t; 136 137 /* 138 * On big machines it can take a long time to check page_counters 139 * arrays. page_ctrs_cands is a summary array whose elements are a dynamically 140 * updated sum of all elements of the corresponding page_counters arrays. 141 * page_freelist_coalesce() searches page_counters only if an appropriate 142 * element of page_ctrs_cands array is greater than 0. 143 * 144 * An extra dimension is used for page_ctrs_cands to spread the elements 145 * over a few e$ cache lines to avoid serialization during the array 146 * updates. 147 */ 148 #pragma align 64(page_ctrs_cands) 149 150 static pcc_info_t *page_ctrs_cands[NPC_MUTEX][MMU_PAGE_SIZES]; 151 152 /* 153 * Return in val the total number of free pages which can be created 154 * for the given mnode (m) and region size (r) 155 */ 156 #define PGCTRS_CANDS_GETVALUE(m, r, val) { \ 157 int i; \ 158 val = 0; \ 159 for (i = 0; i < NPC_MUTEX; i++) { \ 160 val += page_ctrs_cands[i][(r)][(m)].pcc_pages_free; \ 161 } \ 162 } 163 164 /* 165 * Return in val the total number of free pages which can be created 166 * for the given mnode (m), region size (r), and color (c) 167 */ 168 #define PGCTRS_CANDS_GETVALUECOLOR(m, r, c, val) { \ 169 int i; \ 170 val = 0; \ 171 ASSERT((c) < page_ctrs_cands[0][(r)][(m)].pcc_color_free_len); \ 172 for (i = 0; i < NPC_MUTEX; i++) { \ 173 val += page_ctrs_cands[i][(r)][(m)].pcc_color_free[(c)]; \ 174 } \ 175 } 176 177 /* 178 * We can only allow a single thread to update a counter within the physical 179 * range of the largest supported page size. That is the finest granularity 180 * possible since the counter values are dependent on each other 181 * as you move accross region sizes. PP_CTR_LOCK_INDX is used to determine the 182 * ctr_mutex lock index for a particular physical range. 183 */ 184 static kmutex_t *ctr_mutex[NPC_MUTEX]; 185 186 #define PP_CTR_LOCK_INDX(pp) \ 187 (((pp)->p_pagenum >> \ 188 (PAGE_BSZS_SHIFT(mmu_page_sizes - 1))) & (NPC_MUTEX - 1)) 189 190 /* 191 * Local functions prototypes. 192 */ 193 194 void page_ctr_add(page_t *, int); 195 void page_ctr_add_internal(int, page_t *, int); 196 void page_ctr_sub(page_t *, int); 197 uint_t page_convert_color(uchar_t, uchar_t, uint_t); 198 void page_freelist_lock(int); 199 void page_freelist_unlock(int); 200 page_t *page_promote(int, pfn_t, uchar_t, int); 201 page_t *page_demote(int, pfn_t, uchar_t, uchar_t, int, int); 202 page_t *page_freelist_fill(uchar_t, int, int, int, pfn_t); 203 page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int); 204 static int page_trylock_cons(page_t *pp, se_t se); 205 206 #define PNUM_SIZE(szc) \ 207 (hw_page_array[(szc)].hp_size >> hw_page_array[0].hp_shift) 208 #define PNUM_SHIFT(szc) \ 209 (hw_page_array[(szc)].hp_shift - hw_page_array[0].hp_shift) 210 211 /* 212 * The page_counters array below is used to keep track of free contiguous 213 * physical memory. A hw_page_map_t will be allocated per mnode per szc. 214 * This contains an array of counters, the size of the array, a shift value 215 * used to convert a pagenum into a counter array index or vice versa, as 216 * well as a cache of the last successful index to be promoted to a larger 217 * page size. As an optimization, we keep track of the last successful index 218 * to be promoted per page color for the given size region, and this is 219 * allocated dynamically based upon the number of colors for a given 220 * region size. 221 * 222 * Conceptually, the page counters are represented as: 223 * 224 * page_counters[region_size][mnode] 225 * 226 * region_size: size code of a candidate larger page made up 227 * of contiguous free smaller pages. 228 * 229 * page_counters[region_size][mnode].hpm_counters[index]: 230 * represents how many (region_size - 1) pages either 231 * exist or can be created within the given index range. 232 * 233 * Let's look at a sparc example: 234 * If we want to create a free 512k page, we look at region_size 2 235 * for the mnode we want. We calculate the index and look at a specific 236 * hpm_counters location. If we see 8 (FULL_REGION_CNT on sparc) at 237 * this location, it means that 8 64k pages either exist or can be created 238 * from 8K pages in order to make a single free 512k page at the given 239 * index. Note that when a region is full, it will contribute to the 240 * counts in the region above it. Thus we will not know what page 241 * size the free pages will be which can be promoted to this new free 242 * page unless we look at all regions below the current region. 243 */ 244 245 /* 246 * Note: hpmctr_t is defined in platform vm_dep.h 247 * hw_page_map_t contains all the information needed for the page_counters 248 * logic. The fields are as follows: 249 * 250 * hpm_counters: dynamically allocated array to hold counter data 251 * hpm_entries: entries in hpm_counters 252 * hpm_shift: shift for pnum/array index conv 253 * hpm_base: PFN mapped to counter index 0 254 * hpm_color_current_len: # of elements in hpm_color_current "array" below 255 * hpm_color_current: last index in counter array for this color at 256 * which we successfully created a large page 257 */ 258 typedef struct hw_page_map { 259 hpmctr_t *hpm_counters; 260 size_t hpm_entries; 261 int hpm_shift; 262 pfn_t hpm_base; 263 size_t hpm_color_current_len; 264 size_t *hpm_color_current; 265 } hw_page_map_t; 266 267 /* 268 * Element zero is not used, but is allocated for convenience. 269 */ 270 static hw_page_map_t *page_counters[MMU_PAGE_SIZES]; 271 272 /* 273 * The following macros are convenient ways to get access to the individual 274 * elements of the page_counters arrays. They can be used on both 275 * the left side and right side of equations. 276 */ 277 #define PAGE_COUNTERS(mnode, rg_szc, idx) \ 278 (page_counters[(rg_szc)][(mnode)].hpm_counters[(idx)]) 279 280 #define PAGE_COUNTERS_COUNTERS(mnode, rg_szc) \ 281 (page_counters[(rg_szc)][(mnode)].hpm_counters) 282 283 #define PAGE_COUNTERS_SHIFT(mnode, rg_szc) \ 284 (page_counters[(rg_szc)][(mnode)].hpm_shift) 285 286 #define PAGE_COUNTERS_ENTRIES(mnode, rg_szc) \ 287 (page_counters[(rg_szc)][(mnode)].hpm_entries) 288 289 #define PAGE_COUNTERS_BASE(mnode, rg_szc) \ 290 (page_counters[(rg_szc)][(mnode)].hpm_base) 291 292 #define PAGE_COUNTERS_CURRENT_COLOR_LEN(mnode, rg_szc) \ 293 (page_counters[(rg_szc)][(mnode)].hpm_color_current_len) 294 295 #define PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, rg_szc) \ 296 (page_counters[(rg_szc)][(mnode)].hpm_color_current) 297 298 #define PAGE_COUNTERS_CURRENT_COLOR(mnode, rg_szc, color) \ 299 (page_counters[(rg_szc)][(mnode)].hpm_color_current[(color)]) 300 301 #define PNUM_TO_IDX(mnode, rg_szc, pnum) \ 302 (((pnum) - PAGE_COUNTERS_BASE((mnode), (rg_szc))) >> \ 303 PAGE_COUNTERS_SHIFT((mnode), (rg_szc))) 304 305 #define IDX_TO_PNUM(mnode, rg_szc, index) \ 306 (PAGE_COUNTERS_BASE((mnode), (rg_szc)) + \ 307 ((index) << PAGE_COUNTERS_SHIFT((mnode), (rg_szc)))) 308 309 /* 310 * Protects the hpm_counters and hpm_color_current memory from changing while 311 * looking at page counters information. 312 * Grab the write lock to modify what these fields point at. 313 * Grab the read lock to prevent any pointers from changing. 314 * The write lock can not be held during memory allocation due to a possible 315 * recursion deadlock with trying to grab the read lock while the 316 * write lock is already held. 317 */ 318 krwlock_t page_ctrs_rwlock[MAX_MEM_NODES]; 319 320 /* 321 * page size to page size code 322 */ 323 int 324 page_szc(size_t pagesize) 325 { 326 int i = 0; 327 328 while (hw_page_array[i].hp_size) { 329 if (pagesize == hw_page_array[i].hp_size) 330 return (i); 331 i++; 332 } 333 return (-1); 334 } 335 336 /* 337 * page size to page size code with the restriction that it be a supported 338 * user page size. If it's not a supported user page size, -1 will be returned. 339 */ 340 int 341 page_szc_user_filtered(size_t pagesize) 342 { 343 int szc = page_szc(pagesize); 344 if ((szc != -1) && (SZC_2_USERSZC(szc) != -1)) { 345 return (szc); 346 } 347 return (-1); 348 } 349 350 /* 351 * Return how many page sizes are available for the user to use. This is 352 * what the hardware supports and not based upon how the OS implements the 353 * support of different page sizes. 354 */ 355 uint_t 356 page_num_user_pagesizes(void) 357 { 358 return (mmu_exported_page_sizes); 359 } 360 361 uint_t 362 page_num_pagesizes(void) 363 { 364 return (mmu_page_sizes); 365 } 366 367 /* 368 * returns the count of the number of base pagesize pages associated with szc 369 */ 370 pgcnt_t 371 page_get_pagecnt(uint_t szc) 372 { 373 if (szc >= mmu_page_sizes) 374 panic("page_get_pagecnt: out of range %d", szc); 375 return (hw_page_array[szc].hp_pgcnt); 376 } 377 378 size_t 379 page_get_pagesize(uint_t szc) 380 { 381 if (szc >= mmu_page_sizes) 382 panic("page_get_pagesize: out of range %d", szc); 383 return (hw_page_array[szc].hp_size); 384 } 385 386 /* 387 * Return the size of a page based upon the index passed in. An index of 388 * zero refers to the smallest page size in the system, and as index increases 389 * it refers to the next larger supported page size in the system. 390 * Note that szc and userszc may not be the same due to unsupported szc's on 391 * some systems. 392 */ 393 size_t 394 page_get_user_pagesize(uint_t userszc) 395 { 396 uint_t szc = USERSZC_2_SZC(userszc); 397 398 if (szc >= mmu_page_sizes) 399 panic("page_get_user_pagesize: out of range %d", szc); 400 return (hw_page_array[szc].hp_size); 401 } 402 403 uint_t 404 page_get_shift(uint_t szc) 405 { 406 if (szc >= mmu_page_sizes) 407 panic("page_get_shift: out of range %d", szc); 408 return (hw_page_array[szc].hp_shift); 409 } 410 411 uint_t 412 page_get_pagecolors(uint_t szc) 413 { 414 ASSERT(page_colors != 0); 415 return (MAX(page_colors >> PAGE_BSZS_SHIFT(szc), 1)); 416 } 417 418 /* 419 * Called by startup(). 420 * Size up the per page size free list counters based on physmax 421 * of each node and max_mem_nodes. 422 */ 423 size_t 424 page_ctrs_sz(void) 425 { 426 int r; /* region size */ 427 int mnode; 428 uint_t ctrs_sz = 0; 429 int i; 430 pgcnt_t colors_per_szc[MMU_PAGE_SIZES]; 431 432 /* 433 * We need to determine how many page colors there are for each 434 * page size in order to allocate memory for any color specific 435 * arrays. 436 */ 437 colors_per_szc[0] = page_colors; 438 for (i = 1; i < mmu_page_sizes; i++) { 439 colors_per_szc[i] = 440 page_convert_color(0, i, page_colors - 1) + 1; 441 } 442 443 for (mnode = 0; mnode < max_mem_nodes; mnode++) { 444 445 pgcnt_t r_pgcnt; 446 pfn_t r_base; 447 pgcnt_t r_align; 448 449 if (mem_node_config[mnode].exists == 0) 450 continue; 451 452 /* 453 * determine size needed for page counter arrays with 454 * base aligned to large page size. 455 */ 456 for (r = 1; r < mmu_page_sizes; r++) { 457 /* add in space for hpm_counters */ 458 r_align = page_get_pagecnt(r); 459 r_base = mem_node_config[mnode].physbase; 460 r_base &= ~(r_align - 1); 461 r_pgcnt = howmany(mem_node_config[mnode].physmax - 462 r_base, r_align); 463 /* 464 * Round up to always allocate on pointer sized 465 * boundaries. 466 */ 467 ctrs_sz += P2ROUNDUP((r_pgcnt * sizeof (hpmctr_t)), 468 sizeof (hpmctr_t *)); 469 470 /* add in space for hpm_color_current */ 471 ctrs_sz += (colors_per_szc[r] * 472 sizeof (size_t)); 473 } 474 } 475 476 for (r = 1; r < mmu_page_sizes; r++) { 477 ctrs_sz += (max_mem_nodes * sizeof (hw_page_map_t)); 478 479 /* add in space for page_ctrs_cands */ 480 ctrs_sz += NPC_MUTEX * max_mem_nodes * (sizeof (pcc_info_t)); 481 ctrs_sz += NPC_MUTEX * max_mem_nodes * colors_per_szc[r] * 482 sizeof (pgcnt_t); 483 } 484 485 /* ctr_mutex */ 486 ctrs_sz += (max_mem_nodes * NPC_MUTEX * sizeof (kmutex_t)); 487 488 /* size for page list counts */ 489 PLCNT_SZ(ctrs_sz); 490 491 /* 492 * add some slop for roundups. page_ctrs_alloc will roundup the start 493 * address of the counters to ecache_alignsize boundary for every 494 * memory node. 495 */ 496 return (ctrs_sz + max_mem_nodes * L2CACHE_ALIGN); 497 } 498 499 caddr_t 500 page_ctrs_alloc(caddr_t alloc_base) 501 { 502 int mnode; 503 int r; /* region size */ 504 int i; 505 pgcnt_t colors_per_szc[MMU_PAGE_SIZES]; 506 507 /* 508 * We need to determine how many page colors there are for each 509 * page size in order to allocate memory for any color specific 510 * arrays. 511 */ 512 colors_per_szc[0] = page_colors; 513 for (i = 1; i < mmu_page_sizes; i++) { 514 colors_per_szc[i] = 515 page_convert_color(0, i, page_colors - 1) + 1; 516 } 517 518 for (r = 1; r < mmu_page_sizes; r++) { 519 page_counters[r] = (hw_page_map_t *)alloc_base; 520 alloc_base += (max_mem_nodes * sizeof (hw_page_map_t)); 521 } 522 523 /* page_ctrs_cands */ 524 for (r = 1; r < mmu_page_sizes; r++) { 525 for (i = 0; i < NPC_MUTEX; i++) { 526 page_ctrs_cands[i][r] = (pcc_info_t *)alloc_base; 527 alloc_base += max_mem_nodes * (sizeof (pcc_info_t)); 528 529 } 530 } 531 532 /* page_ctrs_cands pcc_color_free array */ 533 for (r = 1; r < mmu_page_sizes; r++) { 534 for (i = 0; i < NPC_MUTEX; i++) { 535 for (mnode = 0; mnode < max_mem_nodes; mnode++) { 536 page_ctrs_cands[i][r][mnode].pcc_color_free_len 537 = colors_per_szc[r]; 538 page_ctrs_cands[i][r][mnode].pcc_color_free = 539 (pgcnt_t *)alloc_base; 540 alloc_base += colors_per_szc[r] * 541 sizeof (pgcnt_t); 542 } 543 } 544 } 545 546 /* ctr_mutex */ 547 for (i = 0; i < NPC_MUTEX; i++) { 548 ctr_mutex[i] = (kmutex_t *)alloc_base; 549 alloc_base += (max_mem_nodes * sizeof (kmutex_t)); 550 } 551 552 /* initialize page list counts */ 553 PLCNT_INIT(alloc_base); 554 555 for (mnode = 0; mnode < max_mem_nodes; mnode++) { 556 557 pgcnt_t r_pgcnt; 558 pfn_t r_base; 559 pgcnt_t r_align; 560 int r_shift; 561 562 if (mem_node_config[mnode].exists == 0) 563 continue; 564 565 for (r = 1; r < mmu_page_sizes; r++) { 566 /* 567 * the page_counters base has to be aligned to the 568 * page count of page size code r otherwise the counts 569 * will cross large page boundaries. 570 */ 571 r_align = page_get_pagecnt(r); 572 r_base = mem_node_config[mnode].physbase; 573 /* base needs to be aligned - lower to aligned value */ 574 r_base &= ~(r_align - 1); 575 r_pgcnt = howmany(mem_node_config[mnode].physmax - 576 r_base, r_align); 577 r_shift = PAGE_BSZS_SHIFT(r); 578 579 PAGE_COUNTERS_SHIFT(mnode, r) = r_shift; 580 PAGE_COUNTERS_ENTRIES(mnode, r) = r_pgcnt; 581 PAGE_COUNTERS_BASE(mnode, r) = r_base; 582 PAGE_COUNTERS_CURRENT_COLOR_LEN(mnode, r) = 583 colors_per_szc[r]; 584 PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r) = 585 (size_t *)alloc_base; 586 alloc_base += (sizeof (size_t) * colors_per_szc[r]); 587 for (i = 0; i < colors_per_szc[r]; i++) { 588 PAGE_COUNTERS_CURRENT_COLOR(mnode, r, i) = i; 589 } 590 PAGE_COUNTERS_COUNTERS(mnode, r) = 591 (hpmctr_t *)alloc_base; 592 /* 593 * Round up to make alloc_base always be aligned on 594 * a pointer boundary. 595 */ 596 alloc_base += P2ROUNDUP((sizeof (hpmctr_t) * r_pgcnt), 597 sizeof (hpmctr_t *)); 598 599 /* 600 * Verify that PNUM_TO_IDX and IDX_TO_PNUM 601 * satisfy the identity requirement. 602 * We should be able to go from one to the other 603 * and get consistent values. 604 */ 605 ASSERT(PNUM_TO_IDX(mnode, r, 606 (IDX_TO_PNUM(mnode, r, 0))) == 0); 607 ASSERT(IDX_TO_PNUM(mnode, r, 608 (PNUM_TO_IDX(mnode, r, r_base))) == r_base); 609 } 610 /* 611 * Roundup the start address of the page_counters to 612 * cache aligned boundary for every memory node. 613 * page_ctrs_sz() has added some slop for these roundups. 614 */ 615 alloc_base = (caddr_t)P2ROUNDUP((uintptr_t)alloc_base, 616 L2CACHE_ALIGN); 617 } 618 619 /* Initialize other page counter specific data structures. */ 620 for (mnode = 0; mnode < MAX_MEM_NODES; mnode++) { 621 rw_init(&page_ctrs_rwlock[mnode], NULL, RW_DEFAULT, NULL); 622 } 623 624 return (alloc_base); 625 } 626 627 /* 628 * Functions to adjust region counters for each size free list. 629 * Caller is responsible to acquire the ctr_mutex lock if necessary and 630 * thus can be called during startup without locks. 631 */ 632 /* ARGSUSED */ 633 void 634 page_ctr_add_internal(int mnode, page_t *pp, int flags) 635 { 636 ssize_t r; /* region size */ 637 ssize_t idx; 638 pfn_t pfnum; 639 int lckidx; 640 641 ASSERT(pp->p_szc < mmu_page_sizes); 642 643 PLCNT_INCR(pp, mnode, pp->p_szc, flags); 644 645 /* no counter update needed for largest page size */ 646 if (pp->p_szc >= mmu_page_sizes - 1) { 647 return; 648 } 649 650 r = pp->p_szc + 1; 651 pfnum = pp->p_pagenum; 652 lckidx = PP_CTR_LOCK_INDX(pp); 653 654 /* 655 * Increment the count of free pages for the current 656 * region. Continue looping up in region size incrementing 657 * count if the preceeding region is full. 658 */ 659 while (r < mmu_page_sizes) { 660 idx = PNUM_TO_IDX(mnode, r, pfnum); 661 662 ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r)); 663 ASSERT(PAGE_COUNTERS(mnode, r, idx) < FULL_REGION_CNT(r)); 664 665 if (++PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r)) 666 break; 667 668 page_ctrs_cands[lckidx][r][mnode].pcc_pages_free++; 669 page_ctrs_cands[lckidx][r][mnode]. 670 pcc_color_free[PP_2_BIN_SZC(pp, r)]++; 671 r++; 672 } 673 } 674 675 void 676 page_ctr_add(page_t *pp, int flags) 677 { 678 int lckidx = PP_CTR_LOCK_INDX(pp); 679 int mnode = PP_2_MEM_NODE(pp); 680 kmutex_t *lock = &ctr_mutex[lckidx][mnode]; 681 682 mutex_enter(lock); 683 page_ctr_add_internal(mnode, pp, flags); 684 mutex_exit(lock); 685 } 686 687 void 688 page_ctr_sub(page_t *pp, int flags) 689 { 690 int lckidx; 691 int mnode = PP_2_MEM_NODE(pp); 692 kmutex_t *lock; 693 ssize_t r; /* region size */ 694 ssize_t idx; 695 pfn_t pfnum; 696 697 ASSERT(pp->p_szc < mmu_page_sizes); 698 699 PLCNT_DECR(pp, mnode, pp->p_szc, flags); 700 701 /* no counter update needed for largest page size */ 702 if (pp->p_szc >= mmu_page_sizes - 1) { 703 return; 704 } 705 706 r = pp->p_szc + 1; 707 pfnum = pp->p_pagenum; 708 lckidx = PP_CTR_LOCK_INDX(pp); 709 lock = &ctr_mutex[lckidx][mnode]; 710 711 /* 712 * Decrement the count of free pages for the current 713 * region. Continue looping up in region size decrementing 714 * count if the preceeding region was full. 715 */ 716 mutex_enter(lock); 717 while (r < mmu_page_sizes) { 718 idx = PNUM_TO_IDX(mnode, r, pfnum); 719 720 ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r)); 721 ASSERT(PAGE_COUNTERS(mnode, r, idx) > 0); 722 723 if (--PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r) - 1) { 724 break; 725 } 726 ASSERT(page_ctrs_cands[lckidx][r][mnode].pcc_pages_free != 0); 727 ASSERT(page_ctrs_cands[lckidx][r][mnode]. 728 pcc_color_free[PP_2_BIN_SZC(pp, r)] != 0); 729 730 page_ctrs_cands[lckidx][r][mnode].pcc_pages_free--; 731 page_ctrs_cands[lckidx][r][mnode]. 732 pcc_color_free[PP_2_BIN_SZC(pp, r)]--; 733 r++; 734 } 735 mutex_exit(lock); 736 } 737 738 /* 739 * Adjust page counters following a memory attach, since typically the 740 * size of the array needs to change, and the PFN to counter index 741 * mapping needs to change. 742 */ 743 uint_t 744 page_ctrs_adjust(int mnode) 745 { 746 pgcnt_t npgs; 747 int r; /* region size */ 748 int i; 749 size_t pcsz, old_csz; 750 hpmctr_t *new_ctr, *old_ctr; 751 pfn_t oldbase, newbase; 752 size_t old_npgs; 753 hpmctr_t *ctr_cache[MMU_PAGE_SIZES]; 754 size_t size_cache[MMU_PAGE_SIZES]; 755 size_t *color_cache[MMU_PAGE_SIZES]; 756 size_t *old_color_array; 757 pgcnt_t colors_per_szc[MMU_PAGE_SIZES]; 758 759 newbase = mem_node_config[mnode].physbase & ~PC_BASE_ALIGN_MASK; 760 npgs = roundup(mem_node_config[mnode].physmax, 761 PC_BASE_ALIGN) - newbase; 762 763 /* 764 * We need to determine how many page colors there are for each 765 * page size in order to allocate memory for any color specific 766 * arrays. 767 */ 768 colors_per_szc[0] = page_colors; 769 for (r = 1; r < mmu_page_sizes; r++) { 770 colors_per_szc[r] = 771 page_convert_color(0, r, page_colors - 1) + 1; 772 } 773 774 /* 775 * Preallocate all of the new hpm_counters arrays as we can't 776 * hold the page_ctrs_rwlock as a writer and allocate memory. 777 * If we can't allocate all of the arrays, undo our work so far 778 * and return failure. 779 */ 780 for (r = 1; r < mmu_page_sizes; r++) { 781 pcsz = npgs >> PAGE_BSZS_SHIFT(r); 782 783 ctr_cache[r] = kmem_zalloc(pcsz * 784 sizeof (hpmctr_t), KM_NOSLEEP); 785 if (ctr_cache[r] == NULL) { 786 while (--r >= 1) { 787 kmem_free(ctr_cache[r], 788 size_cache[r] * sizeof (hpmctr_t)); 789 } 790 return (ENOMEM); 791 } 792 size_cache[r] = pcsz; 793 } 794 /* 795 * Preallocate all of the new color current arrays as we can't 796 * hold the page_ctrs_rwlock as a writer and allocate memory. 797 * If we can't allocate all of the arrays, undo our work so far 798 * and return failure. 799 */ 800 for (r = 1; r < mmu_page_sizes; r++) { 801 color_cache[r] = kmem_zalloc(sizeof (size_t) * 802 colors_per_szc[r], KM_NOSLEEP); 803 if (color_cache[r] == NULL) { 804 while (--r >= 1) { 805 kmem_free(color_cache[r], 806 colors_per_szc[r] * sizeof (size_t)); 807 } 808 for (r = 1; r < mmu_page_sizes; r++) { 809 kmem_free(ctr_cache[r], 810 size_cache[r] * sizeof (hpmctr_t)); 811 } 812 return (ENOMEM); 813 } 814 } 815 816 /* 817 * Grab the write lock to prevent others from walking these arrays 818 * while we are modifying them. 819 */ 820 rw_enter(&page_ctrs_rwlock[mnode], RW_WRITER); 821 page_freelist_lock(mnode); 822 for (r = 1; r < mmu_page_sizes; r++) { 823 PAGE_COUNTERS_SHIFT(mnode, r) = PAGE_BSZS_SHIFT(r); 824 old_ctr = PAGE_COUNTERS_COUNTERS(mnode, r); 825 old_csz = PAGE_COUNTERS_ENTRIES(mnode, r); 826 oldbase = PAGE_COUNTERS_BASE(mnode, r); 827 old_npgs = old_csz << PAGE_COUNTERS_SHIFT(mnode, r); 828 old_color_array = PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r); 829 830 pcsz = npgs >> PAGE_COUNTERS_SHIFT(mnode, r); 831 new_ctr = ctr_cache[r]; 832 ctr_cache[r] = NULL; 833 if (old_ctr != NULL && 834 (oldbase + old_npgs > newbase) && 835 (newbase + npgs > oldbase)) { 836 /* 837 * Map the intersection of the old and new 838 * counters into the new array. 839 */ 840 size_t offset; 841 if (newbase > oldbase) { 842 offset = (newbase - oldbase) >> 843 PAGE_COUNTERS_SHIFT(mnode, r); 844 bcopy(old_ctr + offset, new_ctr, 845 MIN(pcsz, (old_csz - offset)) * 846 sizeof (hpmctr_t)); 847 } else { 848 offset = (oldbase - newbase) >> 849 PAGE_COUNTERS_SHIFT(mnode, r); 850 bcopy(old_ctr, new_ctr + offset, 851 MIN(pcsz - offset, old_csz) * 852 sizeof (hpmctr_t)); 853 } 854 } 855 856 PAGE_COUNTERS_COUNTERS(mnode, r) = new_ctr; 857 PAGE_COUNTERS_ENTRIES(mnode, r) = pcsz; 858 PAGE_COUNTERS_BASE(mnode, r) = newbase; 859 PAGE_COUNTERS_CURRENT_COLOR_LEN(mnode, r) = colors_per_szc[r]; 860 PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r) = color_cache[r]; 861 color_cache[r] = NULL; 862 /* 863 * for now, just reset on these events as it's probably 864 * not worthwhile to try and optimize this. 865 */ 866 for (i = 0; i < colors_per_szc[r]; i++) { 867 PAGE_COUNTERS_CURRENT_COLOR(mnode, r, i) = i; 868 } 869 870 /* cache info for freeing out of the critical path */ 871 if ((caddr_t)old_ctr >= kernelheap && 872 (caddr_t)old_ctr < ekernelheap) { 873 ctr_cache[r] = old_ctr; 874 size_cache[r] = old_csz; 875 } 876 if ((caddr_t)old_color_array >= kernelheap && 877 (caddr_t)old_color_array < ekernelheap) { 878 color_cache[r] = old_color_array; 879 } 880 /* 881 * Verify that PNUM_TO_IDX and IDX_TO_PNUM 882 * satisfy the identity requirement. 883 * We should be able to go from one to the other 884 * and get consistent values. 885 */ 886 ASSERT(PNUM_TO_IDX(mnode, r, 887 (IDX_TO_PNUM(mnode, r, 0))) == 0); 888 ASSERT(IDX_TO_PNUM(mnode, r, 889 (PNUM_TO_IDX(mnode, r, newbase))) == newbase); 890 } 891 page_freelist_unlock(mnode); 892 rw_exit(&page_ctrs_rwlock[mnode]); 893 894 /* 895 * Now that we have dropped the write lock, it is safe to free all 896 * of the memory we have cached above. 897 */ 898 for (r = 1; r < mmu_page_sizes; r++) { 899 if (ctr_cache[r] != NULL) { 900 kmem_free(ctr_cache[r], 901 size_cache[r] * sizeof (hpmctr_t)); 902 } 903 if (color_cache[r] != NULL) { 904 kmem_free(color_cache[r], 905 colors_per_szc[r] * sizeof (size_t)); 906 } 907 } 908 return (0); 909 } 910 911 /* 912 * color contains a valid color index or bin for cur_szc 913 */ 914 uint_t 915 page_convert_color(uchar_t cur_szc, uchar_t new_szc, uint_t color) 916 { 917 uint_t shift; 918 919 if (cur_szc > new_szc) { 920 shift = page_get_shift(cur_szc) - page_get_shift(new_szc); 921 return (color << shift); 922 } else if (cur_szc < new_szc) { 923 shift = page_get_shift(new_szc) - page_get_shift(cur_szc); 924 return (color >> shift); 925 } 926 return (color); 927 } 928 929 #ifdef DEBUG 930 931 /* 932 * confirm pp is a large page corresponding to szc 933 */ 934 void 935 chk_lpg(page_t *pp, uchar_t szc) 936 { 937 spgcnt_t npgs = page_get_pagecnt(pp->p_szc); 938 uint_t noreloc; 939 940 if (npgs == 1) { 941 ASSERT(pp->p_szc == 0); 942 ASSERT(pp->p_next == pp); 943 ASSERT(pp->p_prev == pp); 944 return; 945 } 946 947 ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL); 948 ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL); 949 950 ASSERT(IS_P2ALIGNED(pp->p_pagenum, npgs)); 951 ASSERT(pp->p_pagenum == (pp->p_next->p_pagenum - 1)); 952 ASSERT(pp->p_prev->p_pagenum == (pp->p_pagenum + (npgs - 1))); 953 ASSERT(pp->p_prev == (pp + (npgs - 1))); 954 955 /* 956 * Check list of pages. 957 */ 958 noreloc = PP_ISNORELOC(pp); 959 while (npgs--) { 960 if (npgs != 0) { 961 ASSERT(pp->p_pagenum == pp->p_next->p_pagenum - 1); 962 ASSERT(pp->p_next == (pp + 1)); 963 } 964 ASSERT(pp->p_szc == szc); 965 ASSERT(PP_ISFREE(pp)); 966 ASSERT(PP_ISAGED(pp)); 967 ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL); 968 ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL); 969 ASSERT(pp->p_vnode == NULL); 970 ASSERT(PP_ISNORELOC(pp) == noreloc); 971 972 pp = pp->p_next; 973 } 974 } 975 #endif /* DEBUG */ 976 977 void 978 page_freelist_lock(int mnode) 979 { 980 int i; 981 for (i = 0; i < NPC_MUTEX; i++) { 982 mutex_enter(FPC_MUTEX(mnode, i)); 983 mutex_enter(CPC_MUTEX(mnode, i)); 984 } 985 } 986 987 void 988 page_freelist_unlock(int mnode) 989 { 990 int i; 991 for (i = 0; i < NPC_MUTEX; i++) { 992 mutex_exit(FPC_MUTEX(mnode, i)); 993 mutex_exit(CPC_MUTEX(mnode, i)); 994 } 995 } 996 997 /* 998 * add pp to the specified page list. Defaults to head of the page list 999 * unless PG_LIST_TAIL is specified. 1000 */ 1001 void 1002 page_list_add(page_t *pp, int flags) 1003 { 1004 page_t **ppp; 1005 kmutex_t *pcm; 1006 uint_t bin, mtype; 1007 int mnode; 1008 1009 ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT)); 1010 ASSERT(PP_ISFREE(pp)); 1011 ASSERT(!hat_page_is_mapped(pp)); 1012 ASSERT(hat_page_getshare(pp) == 0); 1013 1014 /* 1015 * Large pages should be freed via page_list_add_pages(). 1016 */ 1017 ASSERT(pp->p_szc == 0); 1018 1019 /* 1020 * Don't need to lock the freelist first here 1021 * because the page isn't on the freelist yet. 1022 * This means p_szc can't change on us. 1023 */ 1024 1025 bin = PP_2_BIN(pp); 1026 mnode = PP_2_MEM_NODE(pp); 1027 mtype = PP_2_MTYPE(pp); 1028 1029 if (flags & PG_LIST_ISINIT) { 1030 /* 1031 * PG_LIST_ISINIT is set during system startup (ie. single 1032 * threaded), add a page to the free list and add to the 1033 * the free region counters w/o any locking 1034 */ 1035 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype); 1036 1037 /* inline version of page_add() */ 1038 if (*ppp != NULL) { 1039 pp->p_next = *ppp; 1040 pp->p_prev = (*ppp)->p_prev; 1041 (*ppp)->p_prev = pp; 1042 pp->p_prev->p_next = pp; 1043 } else 1044 *ppp = pp; 1045 1046 page_ctr_add_internal(mnode, pp, flags); 1047 } else { 1048 pcm = PC_BIN_MUTEX(mnode, bin, flags); 1049 1050 if (flags & PG_FREE_LIST) { 1051 ASSERT(PP_ISAGED(pp)); 1052 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype); 1053 1054 } else { 1055 ASSERT(pp->p_vnode); 1056 ASSERT((pp->p_offset & PAGEOFFSET) == 0); 1057 ppp = &PAGE_CACHELISTS(mnode, bin, mtype); 1058 } 1059 mutex_enter(pcm); 1060 page_add(ppp, pp); 1061 1062 if (flags & PG_LIST_TAIL) 1063 *ppp = (*ppp)->p_next; 1064 /* 1065 * Add counters before releasing pcm mutex to avoid a race with 1066 * page_freelist_coalesce and page_freelist_fill. 1067 */ 1068 page_ctr_add(pp, flags); 1069 mutex_exit(pcm); 1070 } 1071 1072 1073 #if defined(__sparc) 1074 if (PP_ISNORELOC(pp)) { 1075 kcage_freemem_add(1); 1076 } 1077 #endif 1078 /* 1079 * It is up to the caller to unlock the page! 1080 */ 1081 ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT)); 1082 } 1083 1084 1085 #ifdef __sparc 1086 /* 1087 * This routine is only used by kcage_init during system startup. 1088 * It performs the function of page_list_sub/PP_SETNORELOC/page_list_add 1089 * without the overhead of taking locks and updating counters. 1090 */ 1091 void 1092 page_list_noreloc_startup(page_t *pp) 1093 { 1094 page_t **ppp; 1095 uint_t bin; 1096 int mnode; 1097 int mtype; 1098 int flags = PG_LIST_ISCAGE; 1099 1100 /* 1101 * If this is a large page on the freelist then 1102 * break it up into smaller pages. 1103 */ 1104 if (pp->p_szc != 0) 1105 page_boot_demote(pp); 1106 1107 /* 1108 * Get list page is currently on. 1109 */ 1110 bin = PP_2_BIN(pp); 1111 mnode = PP_2_MEM_NODE(pp); 1112 mtype = PP_2_MTYPE(pp); 1113 ASSERT(mtype == MTYPE_RELOC); 1114 ASSERT(pp->p_szc == 0); 1115 1116 if (PP_ISAGED(pp)) { 1117 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype); 1118 flags |= PG_FREE_LIST; 1119 } else { 1120 ppp = &PAGE_CACHELISTS(mnode, bin, mtype); 1121 flags |= PG_CACHE_LIST; 1122 } 1123 1124 ASSERT(*ppp != NULL); 1125 1126 /* 1127 * Delete page from current list. 1128 */ 1129 if (*ppp == pp) 1130 *ppp = pp->p_next; /* go to next page */ 1131 if (*ppp == pp) { 1132 *ppp = NULL; /* page list is gone */ 1133 } else { 1134 pp->p_prev->p_next = pp->p_next; 1135 pp->p_next->p_prev = pp->p_prev; 1136 } 1137 1138 /* LINTED */ 1139 PLCNT_DECR(pp, mnode, 0, flags); 1140 1141 /* 1142 * Set no reloc for cage initted pages. 1143 */ 1144 PP_SETNORELOC(pp); 1145 1146 mtype = PP_2_MTYPE(pp); 1147 ASSERT(mtype == MTYPE_NORELOC); 1148 1149 /* 1150 * Get new list for page. 1151 */ 1152 if (PP_ISAGED(pp)) { 1153 ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype); 1154 } else { 1155 ppp = &PAGE_CACHELISTS(mnode, bin, mtype); 1156 } 1157 1158 /* 1159 * Insert page on new list. 1160 */ 1161 if (*ppp == NULL) { 1162 *ppp = pp; 1163 pp->p_next = pp->p_prev = pp; 1164 } else { 1165 pp->p_next = *ppp; 1166 pp->p_prev = (*ppp)->p_prev; 1167 (*ppp)->p_prev = pp; 1168 pp->p_prev->p_next = pp; 1169 } 1170 1171 /* LINTED */ 1172 PLCNT_INCR(pp, mnode, 0, flags); 1173 1174 /* 1175 * Update cage freemem counter 1176 */ 1177 atomic_add_long(&kcage_freemem, 1); 1178 } 1179 #else /* __sparc */ 1180 1181 /* ARGSUSED */ 1182 void 1183 page_list_noreloc_startup(page_t *pp) 1184 { 1185 panic("page_list_noreloc_startup: should be here only for sparc"); 1186 } 1187 #endif 1188 1189 void 1190 page_list_add_pages(page_t *pp, int flags) 1191 { 1192 kmutex_t *pcm; 1193 pgcnt_t pgcnt; 1194 uint_t bin, mtype, i; 1195 int mnode; 1196 1197 /* default to freelist/head */ 1198 ASSERT((flags & (PG_CACHE_LIST | PG_LIST_TAIL)) == 0); 1199 1200 CHK_LPG(pp, pp->p_szc); 1201 VM_STAT_ADD(vmm_vmstats.pc_list_add_pages[pp->p_szc]); 1202 1203 bin = PP_2_BIN(pp); 1204 mnode = PP_2_MEM_NODE(pp); 1205 mtype = PP_2_MTYPE(pp); 1206 1207 if (flags & PG_LIST_ISINIT) { 1208 ASSERT(pp->p_szc == mmu_page_sizes - 1); 1209 page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp); 1210 ASSERT(!PP_ISNORELOC(pp)); 1211 PLCNT_INCR(pp, mnode, pp->p_szc, flags); 1212 } else { 1213 1214 ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes); 1215 1216 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST); 1217 1218 mutex_enter(pcm); 1219 page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp); 1220 page_ctr_add(pp, PG_FREE_LIST); 1221 mutex_exit(pcm); 1222 1223 pgcnt = page_get_pagecnt(pp->p_szc); 1224 #if defined(__sparc) 1225 if (PP_ISNORELOC(pp)) 1226 kcage_freemem_add(pgcnt); 1227 #endif 1228 for (i = 0; i < pgcnt; i++, pp++) 1229 page_unlock(pp); 1230 } 1231 } 1232 1233 /* 1234 * During boot, need to demote a large page to base 1235 * pagesize pages for seg_kmem for use in boot_alloc() 1236 */ 1237 void 1238 page_boot_demote(page_t *pp) 1239 { 1240 ASSERT(pp->p_szc != 0); 1241 ASSERT(PP_ISFREE(pp)); 1242 ASSERT(PP_ISAGED(pp)); 1243 1244 (void) page_demote(PP_2_MEM_NODE(pp), 1245 PFN_BASE(pp->p_pagenum, pp->p_szc), pp->p_szc, 0, PC_NO_COLOR, 1246 PC_FREE); 1247 1248 ASSERT(PP_ISFREE(pp)); 1249 ASSERT(PP_ISAGED(pp)); 1250 ASSERT(pp->p_szc == 0); 1251 } 1252 1253 /* 1254 * Take a particular page off of whatever freelist the page 1255 * is claimed to be on. 1256 * 1257 * NOTE: Only used for PAGESIZE pages. 1258 */ 1259 void 1260 page_list_sub(page_t *pp, int flags) 1261 { 1262 int bin; 1263 uint_t mtype; 1264 int mnode; 1265 kmutex_t *pcm; 1266 page_t **ppp; 1267 1268 ASSERT(PAGE_EXCL(pp)); 1269 ASSERT(PP_ISFREE(pp)); 1270 1271 /* 1272 * The p_szc field can only be changed by page_promote() 1273 * and page_demote(). Only free pages can be promoted and 1274 * demoted and the free list MUST be locked during these 1275 * operations. So to prevent a race in page_list_sub() 1276 * between computing which bin of the freelist lock to 1277 * grab and actually grabing the lock we check again that 1278 * the bin we locked is still the correct one. Notice that 1279 * the p_szc field could have actually changed on us but 1280 * if the bin happens to still be the same we are safe. 1281 */ 1282 try_again: 1283 bin = PP_2_BIN(pp); 1284 mnode = PP_2_MEM_NODE(pp); 1285 pcm = PC_BIN_MUTEX(mnode, bin, flags); 1286 mutex_enter(pcm); 1287 if (PP_2_BIN(pp) != bin) { 1288 mutex_exit(pcm); 1289 goto try_again; 1290 } 1291 mtype = PP_2_MTYPE(pp); 1292 1293 if (flags & PG_FREE_LIST) { 1294 ASSERT(PP_ISAGED(pp)); 1295 ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype); 1296 } else { 1297 ASSERT(!PP_ISAGED(pp)); 1298 ppp = &PAGE_CACHELISTS(mnode, bin, mtype); 1299 } 1300 1301 /* 1302 * Common PAGESIZE case. 1303 * 1304 * Note that we locked the freelist. This prevents 1305 * any page promotion/demotion operations. Therefore 1306 * the p_szc will not change until we drop pcm mutex. 1307 */ 1308 if (pp->p_szc == 0) { 1309 page_sub(ppp, pp); 1310 /* 1311 * Subtract counters before releasing pcm mutex 1312 * to avoid race with page_freelist_coalesce. 1313 */ 1314 page_ctr_sub(pp, flags); 1315 mutex_exit(pcm); 1316 1317 #if defined(__sparc) 1318 if (PP_ISNORELOC(pp)) { 1319 kcage_freemem_sub(1); 1320 } 1321 #endif 1322 return; 1323 } 1324 1325 /* 1326 * Large pages on the cache list are not supported. 1327 */ 1328 if (flags & PG_CACHE_LIST) 1329 panic("page_list_sub: large page on cachelist"); 1330 1331 /* 1332 * Slow but rare. 1333 * 1334 * Somebody wants this particular page which is part 1335 * of a large page. In this case we just demote the page 1336 * if it's on the freelist. 1337 * 1338 * We have to drop pcm before locking the entire freelist. 1339 * Once we have re-locked the freelist check to make sure 1340 * the page hasn't already been demoted or completely 1341 * freed. 1342 */ 1343 mutex_exit(pcm); 1344 page_freelist_lock(mnode); 1345 if (pp->p_szc != 0) { 1346 /* 1347 * Large page is on freelist. 1348 */ 1349 (void) page_demote(mnode, PFN_BASE(pp->p_pagenum, pp->p_szc), 1350 pp->p_szc, 0, PC_NO_COLOR, PC_FREE); 1351 } 1352 ASSERT(PP_ISFREE(pp)); 1353 ASSERT(PP_ISAGED(pp)); 1354 ASSERT(pp->p_szc == 0); 1355 1356 /* 1357 * Subtract counters before releasing pcm mutex 1358 * to avoid race with page_freelist_coalesce. 1359 */ 1360 bin = PP_2_BIN(pp); 1361 mtype = PP_2_MTYPE(pp); 1362 ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype); 1363 1364 page_sub(ppp, pp); 1365 page_ctr_sub(pp, flags); 1366 page_freelist_unlock(mnode); 1367 1368 #if defined(__sparc) 1369 if (PP_ISNORELOC(pp)) { 1370 kcage_freemem_sub(1); 1371 } 1372 #endif 1373 } 1374 1375 void 1376 page_list_sub_pages(page_t *pp, uint_t szc) 1377 { 1378 kmutex_t *pcm; 1379 uint_t bin, mtype; 1380 int mnode; 1381 1382 ASSERT(PAGE_EXCL(pp)); 1383 ASSERT(PP_ISFREE(pp)); 1384 ASSERT(PP_ISAGED(pp)); 1385 1386 /* 1387 * See comment in page_list_sub(). 1388 */ 1389 try_again: 1390 bin = PP_2_BIN(pp); 1391 mnode = PP_2_MEM_NODE(pp); 1392 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST); 1393 mutex_enter(pcm); 1394 if (PP_2_BIN(pp) != bin) { 1395 mutex_exit(pcm); 1396 goto try_again; 1397 } 1398 1399 VM_STAT_ADD(vmm_vmstats.pc_list_sub_pages1[pp->p_szc]); 1400 1401 /* 1402 * If we're called with a page larger than szc or it got 1403 * promoted above szc before we locked the freelist then 1404 * drop pcm and re-lock entire freelist. If page still larger 1405 * than szc then demote it. 1406 */ 1407 if (pp->p_szc > szc) { 1408 VM_STAT_ADD(vmm_vmstats.pc_list_sub_pages2[pp->p_szc]); 1409 mutex_exit(pcm); 1410 pcm = NULL; 1411 page_freelist_lock(mnode); 1412 if (pp->p_szc > szc) { 1413 VM_STAT_ADD(vmm_vmstats.pc_list_sub_pages3[pp->p_szc]); 1414 (void) page_demote(mnode, 1415 PFN_BASE(pp->p_pagenum, pp->p_szc), 1416 pp->p_szc, szc, PC_NO_COLOR, PC_FREE); 1417 } 1418 bin = PP_2_BIN(pp); 1419 } 1420 ASSERT(PP_ISFREE(pp)); 1421 ASSERT(PP_ISAGED(pp)); 1422 ASSERT(pp->p_szc <= szc); 1423 ASSERT(pp == PP_PAGEROOT(pp)); 1424 1425 mtype = PP_2_MTYPE(pp); 1426 if (pp->p_szc != 0) { 1427 page_vpsub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp); 1428 CHK_LPG(pp, pp->p_szc); 1429 } else { 1430 page_sub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp); 1431 } 1432 page_ctr_sub(pp, PG_FREE_LIST); 1433 1434 if (pcm != NULL) { 1435 mutex_exit(pcm); 1436 } else { 1437 page_freelist_unlock(mnode); 1438 } 1439 1440 #if defined(__sparc) 1441 if (PP_ISNORELOC(pp)) { 1442 pgcnt_t pgcnt; 1443 1444 pgcnt = page_get_pagecnt(pp->p_szc); 1445 kcage_freemem_sub(pgcnt); 1446 } 1447 #endif 1448 } 1449 1450 /* 1451 * Add the page to the front of a linked list of pages 1452 * using the p_next & p_prev pointers for the list. 1453 * The caller is responsible for protecting the list pointers. 1454 */ 1455 void 1456 mach_page_add(page_t **ppp, page_t *pp) 1457 { 1458 if (*ppp == NULL) { 1459 pp->p_next = pp->p_prev = pp; 1460 } else { 1461 pp->p_next = *ppp; 1462 pp->p_prev = (*ppp)->p_prev; 1463 (*ppp)->p_prev = pp; 1464 pp->p_prev->p_next = pp; 1465 } 1466 *ppp = pp; 1467 } 1468 1469 /* 1470 * Remove this page from a linked list of pages 1471 * using the p_next & p_prev pointers for the list. 1472 * 1473 * The caller is responsible for protecting the list pointers. 1474 */ 1475 void 1476 mach_page_sub(page_t **ppp, page_t *pp) 1477 { 1478 ASSERT(PP_ISFREE(pp)); 1479 1480 if (*ppp == NULL || pp == NULL) 1481 panic("mach_page_sub"); 1482 1483 if (*ppp == pp) 1484 *ppp = pp->p_next; /* go to next page */ 1485 1486 if (*ppp == pp) 1487 *ppp = NULL; /* page list is gone */ 1488 else { 1489 pp->p_prev->p_next = pp->p_next; 1490 pp->p_next->p_prev = pp->p_prev; 1491 } 1492 pp->p_prev = pp->p_next = pp; /* make pp a list of one */ 1493 } 1494 1495 /* 1496 * Routine fsflush uses to gradually coalesce the free list into larger pages. 1497 */ 1498 void 1499 page_promote_size(page_t *pp, uint_t cur_szc) 1500 { 1501 pfn_t pfn; 1502 int mnode; 1503 int idx; 1504 int new_szc = cur_szc + 1; 1505 int full = FULL_REGION_CNT(new_szc); 1506 1507 pfn = page_pptonum(pp); 1508 mnode = PFN_2_MEM_NODE(pfn); 1509 1510 page_freelist_lock(mnode); 1511 1512 idx = PNUM_TO_IDX(mnode, new_szc, pfn); 1513 if (PAGE_COUNTERS(mnode, new_szc, idx) == full) 1514 (void) page_promote(mnode, pfn, new_szc, PC_FREE); 1515 1516 page_freelist_unlock(mnode); 1517 } 1518 1519 static uint_t page_promote_err; 1520 static uint_t page_promote_noreloc_err; 1521 1522 /* 1523 * Create a single larger page (of szc new_szc) from smaller contiguous pages 1524 * for the given mnode starting at pfnum. Pages involved are on the freelist 1525 * before the call and may be returned to the caller if requested, otherwise 1526 * they will be placed back on the freelist. 1527 * If flags is PC_ALLOC, then the large page will be returned to the user in 1528 * a state which is consistent with a page being taken off the freelist. If 1529 * we failed to lock the new large page, then we will return NULL to the 1530 * caller and put the large page on the freelist instead. 1531 * If flags is PC_FREE, then the large page will be placed on the freelist, 1532 * and NULL will be returned. 1533 * The caller is responsible for locking the freelist as well as any other 1534 * accounting which needs to be done for a returned page. 1535 * 1536 * RFE: For performance pass in pp instead of pfnum so 1537 * we can avoid excessive calls to page_numtopp_nolock(). 1538 * This would depend on an assumption that all contiguous 1539 * pages are in the same memseg so we can just add/dec 1540 * our pp. 1541 * 1542 * Lock ordering: 1543 * 1544 * There is a potential but rare deadlock situation 1545 * for page promotion and demotion operations. The problem 1546 * is there are two paths into the freelist manager and 1547 * they have different lock orders: 1548 * 1549 * page_create() 1550 * lock freelist 1551 * page_lock(EXCL) 1552 * unlock freelist 1553 * return 1554 * caller drops page_lock 1555 * 1556 * page_free() and page_reclaim() 1557 * caller grabs page_lock(EXCL) 1558 * 1559 * lock freelist 1560 * unlock freelist 1561 * drop page_lock 1562 * 1563 * What prevents a thread in page_create() from deadlocking 1564 * with a thread freeing or reclaiming the same page is the 1565 * page_trylock() in page_get_freelist(). If the trylock fails 1566 * it skips the page. 1567 * 1568 * The lock ordering for promotion and demotion is the same as 1569 * for page_create(). Since the same deadlock could occur during 1570 * page promotion and freeing or reclaiming of a page on the 1571 * cache list we might have to fail the operation and undo what 1572 * have done so far. Again this is rare. 1573 */ 1574 page_t * 1575 page_promote(int mnode, pfn_t pfnum, uchar_t new_szc, int flags) 1576 { 1577 page_t *pp, *pplist, *tpp, *start_pp; 1578 pgcnt_t new_npgs, npgs; 1579 uint_t bin; 1580 pgcnt_t tmpnpgs, pages_left; 1581 uint_t mtype; 1582 uint_t noreloc; 1583 uint_t i; 1584 int which_list; 1585 ulong_t index; 1586 kmutex_t *phm; 1587 1588 /* 1589 * General algorithm: 1590 * Find the starting page 1591 * Walk each page struct removing it from the freelist, 1592 * and linking it to all the other pages removed. 1593 * Once all pages are off the freelist, 1594 * walk the list, modifying p_szc to new_szc and what 1595 * ever other info needs to be done to create a large free page. 1596 * According to the flags, either return the page or put it 1597 * on the freelist. 1598 */ 1599 1600 start_pp = page_numtopp_nolock(pfnum); 1601 ASSERT(start_pp && (start_pp->p_pagenum == pfnum)); 1602 new_npgs = page_get_pagecnt(new_szc); 1603 ASSERT(IS_P2ALIGNED(pfnum, new_npgs)); 1604 1605 /* 1606 * Loop through smaller pages to confirm that all pages 1607 * give the same result for PP_ISNORELOC(). 1608 * We can check this reliably here as the protocol for setting 1609 * P_NORELOC requires pages to be taken off the free list first. 1610 */ 1611 for (i = 0, pp = start_pp; i < new_npgs; i++, pp++) { 1612 if (pp == start_pp) { 1613 /* First page, set requirement. */ 1614 noreloc = PP_ISNORELOC(pp); 1615 } else if (noreloc != PP_ISNORELOC(pp)) { 1616 page_promote_noreloc_err++; 1617 page_promote_err++; 1618 return (NULL); 1619 } 1620 } 1621 1622 pages_left = new_npgs; 1623 pplist = NULL; 1624 pp = start_pp; 1625 1626 /* Loop around coalescing the smaller pages into a big page. */ 1627 while (pages_left) { 1628 /* 1629 * Remove from the freelist. 1630 */ 1631 ASSERT(PP_ISFREE(pp)); 1632 bin = PP_2_BIN(pp); 1633 ASSERT(mnode == PP_2_MEM_NODE(pp)); 1634 mtype = PP_2_MTYPE(pp); 1635 if (PP_ISAGED(pp)) { 1636 1637 /* 1638 * PG_FREE_LIST 1639 */ 1640 if (pp->p_szc) { 1641 page_vpsub(&PAGE_FREELISTS(mnode, 1642 pp->p_szc, bin, mtype), pp); 1643 } else { 1644 mach_page_sub(&PAGE_FREELISTS(mnode, 0, 1645 bin, mtype), pp); 1646 } 1647 which_list = PG_FREE_LIST; 1648 } else { 1649 ASSERT(pp->p_szc == 0); 1650 1651 /* 1652 * PG_CACHE_LIST 1653 * 1654 * Since this page comes from the 1655 * cachelist, we must destroy the 1656 * vnode association. 1657 */ 1658 if (!page_trylock(pp, SE_EXCL)) { 1659 goto fail_promote; 1660 } 1661 1662 /* 1663 * We need to be careful not to deadlock 1664 * with another thread in page_lookup(). 1665 * The page_lookup() thread could be holding 1666 * the same phm that we need if the two 1667 * pages happen to hash to the same phm lock. 1668 * At this point we have locked the entire 1669 * freelist and page_lookup() could be trying 1670 * to grab a freelist lock. 1671 */ 1672 index = PAGE_HASH_FUNC(pp->p_vnode, pp->p_offset); 1673 phm = PAGE_HASH_MUTEX(index); 1674 if (!mutex_tryenter(phm)) { 1675 page_unlock(pp); 1676 goto fail_promote; 1677 } 1678 1679 mach_page_sub(&PAGE_CACHELISTS(mnode, bin, mtype), pp); 1680 page_hashout(pp, phm); 1681 mutex_exit(phm); 1682 PP_SETAGED(pp); 1683 page_unlock(pp); 1684 which_list = PG_CACHE_LIST; 1685 } 1686 page_ctr_sub(pp, which_list); 1687 1688 /* 1689 * Concatenate the smaller page(s) onto 1690 * the large page list. 1691 */ 1692 tmpnpgs = npgs = page_get_pagecnt(pp->p_szc); 1693 pages_left -= npgs; 1694 tpp = pp; 1695 while (npgs--) { 1696 tpp->p_szc = new_szc; 1697 tpp = tpp->p_next; 1698 } 1699 page_list_concat(&pplist, &pp); 1700 pp += tmpnpgs; 1701 } 1702 CHK_LPG(pplist, new_szc); 1703 1704 /* 1705 * return the page to the user if requested 1706 * in the properly locked state. 1707 */ 1708 if (flags == PC_ALLOC && (page_trylock_cons(pplist, SE_EXCL))) { 1709 return (pplist); 1710 } 1711 1712 /* 1713 * Otherwise place the new large page on the freelist 1714 */ 1715 bin = PP_2_BIN(pplist); 1716 mnode = PP_2_MEM_NODE(pplist); 1717 mtype = PP_2_MTYPE(pplist); 1718 page_vpadd(&PAGE_FREELISTS(mnode, new_szc, bin, mtype), pplist); 1719 1720 page_ctr_add(pplist, PG_FREE_LIST); 1721 return (NULL); 1722 1723 fail_promote: 1724 /* 1725 * A thread must have still been freeing or 1726 * reclaiming the page on the cachelist. 1727 * To prevent a deadlock undo what we have 1728 * done sofar and return failure. This 1729 * situation can only happen while promoting 1730 * PAGESIZE pages. 1731 */ 1732 page_promote_err++; 1733 while (pplist) { 1734 pp = pplist; 1735 mach_page_sub(&pplist, pp); 1736 pp->p_szc = 0; 1737 bin = PP_2_BIN(pp); 1738 mtype = PP_2_MTYPE(pp); 1739 mach_page_add(&PAGE_FREELISTS(mnode, 0, bin, mtype), pp); 1740 page_ctr_add(pp, PG_FREE_LIST); 1741 } 1742 return (NULL); 1743 1744 } 1745 1746 /* 1747 * Break up a large page into smaller size pages. 1748 * Pages involved are on the freelist before the call and may 1749 * be returned to the caller if requested, otherwise they will 1750 * be placed back on the freelist. 1751 * The caller is responsible for locking the freelist as well as any other 1752 * accounting which needs to be done for a returned page. 1753 * If flags is not PC_ALLOC, the color argument is ignored, and thus 1754 * technically, any value may be passed in but PC_NO_COLOR is the standard 1755 * which should be followed for clarity's sake. 1756 */ 1757 page_t * 1758 page_demote(int mnode, pfn_t pfnum, uchar_t cur_szc, uchar_t new_szc, 1759 int color, int flags) 1760 { 1761 page_t *pp, *pplist, *npplist; 1762 pgcnt_t npgs, n; 1763 uint_t bin; 1764 uint_t mtype; 1765 page_t *ret_pp = NULL; 1766 1767 ASSERT(cur_szc != 0); 1768 ASSERT(new_szc < cur_szc); 1769 1770 pplist = page_numtopp_nolock(pfnum); 1771 ASSERT(pplist != NULL); 1772 1773 ASSERT(pplist->p_szc == cur_szc); 1774 1775 bin = PP_2_BIN(pplist); 1776 ASSERT(mnode == PP_2_MEM_NODE(pplist)); 1777 mtype = PP_2_MTYPE(pplist); 1778 page_vpsub(&PAGE_FREELISTS(mnode, cur_szc, bin, mtype), pplist); 1779 1780 CHK_LPG(pplist, cur_szc); 1781 page_ctr_sub(pplist, PG_FREE_LIST); 1782 1783 /* 1784 * Number of PAGESIZE pages for smaller new_szc 1785 * page. 1786 */ 1787 npgs = page_get_pagecnt(new_szc); 1788 1789 while (pplist) { 1790 pp = pplist; 1791 1792 ASSERT(pp->p_szc == cur_szc); 1793 1794 /* 1795 * We either break it up into PAGESIZE pages or larger. 1796 */ 1797 if (npgs == 1) { /* PAGESIZE case */ 1798 mach_page_sub(&pplist, pp); 1799 ASSERT(pp->p_szc == cur_szc); 1800 ASSERT(new_szc == 0); 1801 ASSERT(mnode == PP_2_MEM_NODE(pp)); 1802 pp->p_szc = new_szc; 1803 bin = PP_2_BIN(pp); 1804 if ((bin == color) && (flags == PC_ALLOC) && 1805 (ret_pp == NULL) && 1806 page_trylock_cons(pp, SE_EXCL)) { 1807 ret_pp = pp; 1808 } else { 1809 mtype = PP_2_MTYPE(pp); 1810 mach_page_add(&PAGE_FREELISTS(mnode, 0, bin, 1811 mtype), pp); 1812 page_ctr_add(pp, PG_FREE_LIST); 1813 } 1814 } else { 1815 1816 /* 1817 * Break down into smaller lists of pages. 1818 */ 1819 page_list_break(&pplist, &npplist, npgs); 1820 1821 pp = pplist; 1822 n = npgs; 1823 while (n--) { 1824 ASSERT(pp->p_szc == cur_szc); 1825 pp->p_szc = new_szc; 1826 pp = pp->p_next; 1827 } 1828 1829 CHK_LPG(pplist, new_szc); 1830 1831 bin = PP_2_BIN(pplist); 1832 ASSERT(mnode == PP_2_MEM_NODE(pp)); 1833 if ((bin == color) && (flags == PC_ALLOC) && 1834 (ret_pp == NULL) && 1835 page_trylock_cons(pp, SE_EXCL)) { 1836 ret_pp = pp; 1837 } else { 1838 mtype = PP_2_MTYPE(pp); 1839 page_vpadd(&PAGE_FREELISTS(mnode, new_szc, 1840 bin, mtype), pplist); 1841 1842 page_ctr_add(pplist, PG_FREE_LIST); 1843 } 1844 pplist = npplist; 1845 } 1846 } 1847 return (ret_pp); 1848 } 1849 1850 int mpss_coalesce_disable = 0; 1851 1852 /* 1853 * Coalesce free pages into a page of the given szc and color if possible. 1854 * Return the pointer to the page created, otherwise, return NULL. 1855 */ 1856 static page_t * 1857 page_freelist_coalesce(int mnode, uchar_t szc, int color) 1858 { 1859 int r; /* region size */ 1860 int idx, full, i; 1861 pfn_t pfnum; 1862 size_t len; 1863 size_t buckets_to_check; 1864 pgcnt_t cands; 1865 page_t *ret_pp; 1866 int color_stride; 1867 1868 VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce); 1869 1870 if (mpss_coalesce_disable) { 1871 return (NULL); 1872 } 1873 1874 r = szc; 1875 PGCTRS_CANDS_GETVALUECOLOR(mnode, r, color, cands); 1876 if (cands == 0) { 1877 VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip); 1878 return (NULL); 1879 } 1880 full = FULL_REGION_CNT(r); 1881 color_stride = (szc) ? page_convert_color(0, szc, page_colors - 1) + 1 : 1882 page_colors; 1883 1884 /* Prevent page_counters dynamic memory from being freed */ 1885 rw_enter(&page_ctrs_rwlock[mnode], RW_READER); 1886 len = PAGE_COUNTERS_ENTRIES(mnode, r); 1887 buckets_to_check = len / color_stride; 1888 idx = PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color); 1889 ASSERT((idx % color_stride) == color); 1890 idx += color_stride; 1891 if (idx >= len) 1892 idx = color; 1893 for (i = 0; i < buckets_to_check; i++) { 1894 if (PAGE_COUNTERS(mnode, r, idx) == full) { 1895 pfnum = IDX_TO_PNUM(mnode, r, idx); 1896 ASSERT(pfnum >= mem_node_config[mnode].physbase && 1897 pfnum < mem_node_config[mnode].physmax); 1898 /* 1899 * RFE: For performance maybe we can do something less 1900 * brutal than locking the entire freelist. So far 1901 * this doesn't seem to be a performance problem? 1902 */ 1903 page_freelist_lock(mnode); 1904 if (PAGE_COUNTERS(mnode, r, idx) != full) { 1905 VM_STAT_ADD(vmm_vmstats.page_ctrs_changed); 1906 goto skip_this_one; 1907 } 1908 ret_pp = page_promote(mnode, pfnum, r, PC_ALLOC); 1909 if (ret_pp != NULL) { 1910 PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color) = 1911 idx; 1912 page_freelist_unlock(mnode); 1913 rw_exit(&page_ctrs_rwlock[mnode]); 1914 #if defined(__sparc) 1915 if (PP_ISNORELOC(ret_pp)) { 1916 pgcnt_t npgs; 1917 1918 npgs = page_get_pagecnt(ret_pp->p_szc); 1919 kcage_freemem_sub(npgs); 1920 } 1921 #endif 1922 return (ret_pp); 1923 } 1924 skip_this_one: 1925 page_freelist_unlock(mnode); 1926 /* 1927 * No point looking for another page if we've 1928 * already tried all of the ones that 1929 * page_ctr_cands indicated. Stash off where we left 1930 * off. 1931 * Note: this is not exact since we don't hold the 1932 * page_freelist_locks before we initially get the 1933 * value of cands for performance reasons, but should 1934 * be a decent approximation. 1935 */ 1936 if (--cands == 0) { 1937 PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color) = 1938 idx; 1939 break; 1940 } 1941 } 1942 idx += color_stride; 1943 if (idx >= len) 1944 idx = color; 1945 } 1946 rw_exit(&page_ctrs_rwlock[mnode]); 1947 VM_STAT_ADD(vmm_vmstats.page_ctrs_failed); 1948 return (NULL); 1949 } 1950 1951 /* 1952 * For the given mnode, promote as many small pages to large pages as possible. 1953 */ 1954 void 1955 page_freelist_coalesce_all(int mnode) 1956 { 1957 int r; /* region size */ 1958 int idx, full; 1959 pfn_t pfnum; 1960 size_t len; 1961 1962 VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce_all); 1963 1964 if (mpss_coalesce_disable) { 1965 return; 1966 } 1967 1968 /* 1969 * Lock the entire freelist and coalesce what we can. 1970 * 1971 * Always promote to the largest page possible 1972 * first to reduce the number of page promotions. 1973 */ 1974 rw_enter(&page_ctrs_rwlock[mnode], RW_READER); 1975 page_freelist_lock(mnode); 1976 for (r = mmu_page_sizes - 1; r > 0; r--) { 1977 pgcnt_t cands; 1978 1979 PGCTRS_CANDS_GETVALUE(mnode, r, cands); 1980 if (cands == 0) { 1981 VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip_all); 1982 continue; 1983 } 1984 1985 full = FULL_REGION_CNT(r); 1986 len = PAGE_COUNTERS_ENTRIES(mnode, r); 1987 1988 for (idx = 0; idx < len; idx++) { 1989 if (PAGE_COUNTERS(mnode, r, idx) == full) { 1990 pfnum = IDX_TO_PNUM(mnode, r, idx); 1991 ASSERT(pfnum >= 1992 mem_node_config[mnode].physbase && 1993 pfnum < 1994 mem_node_config[mnode].physmax); 1995 (void) page_promote(mnode, pfnum, r, PC_FREE); 1996 } 1997 } 1998 } 1999 page_freelist_unlock(mnode); 2000 rw_exit(&page_ctrs_rwlock[mnode]); 2001 } 2002 2003 /* 2004 * This is where all polices for moving pages around 2005 * to different page size free lists is implemented. 2006 * Returns 1 on success, 0 on failure. 2007 * 2008 * So far these are the priorities for this algorithm in descending 2009 * order: 2010 * 2011 * 1) When servicing a request try to do so with a free page 2012 * from next size up. Helps defer fragmentation as long 2013 * as possible. 2014 * 2015 * 2) Page coalesce on demand. Only when a freelist 2016 * larger than PAGESIZE is empty and step 1 2017 * will not work since all larger size lists are 2018 * also empty. 2019 * 2020 * If pfnhi is non-zero, search for large page with pfn range less than pfnhi. 2021 */ 2022 page_t * 2023 page_freelist_fill(uchar_t szc, int color, int mnode, int mtype, pfn_t pfnhi) 2024 { 2025 uchar_t nszc = szc + 1; 2026 int bin; 2027 page_t *pp, *firstpp; 2028 page_t *ret_pp = NULL; 2029 2030 ASSERT(szc < mmu_page_sizes); 2031 2032 /* 2033 * First try to break up a larger page to fill 2034 * current size freelist. 2035 */ 2036 while (nszc < mmu_page_sizes) { 2037 /* 2038 * If page found then demote it. 2039 */ 2040 bin = page_convert_color(szc, nszc, color); 2041 if (PAGE_FREELISTS(mnode, nszc, bin, mtype)) { 2042 page_freelist_lock(mnode); 2043 firstpp = pp = PAGE_FREELISTS(mnode, nszc, bin, mtype); 2044 2045 /* 2046 * If pfnhi is not PFNNULL, look for large page below 2047 * pfnhi. PFNNULL signifies no pfn requirement. 2048 */ 2049 if (pfnhi != PFNNULL && pp->p_pagenum >= pfnhi) { 2050 do { 2051 pp = pp->p_vpnext; 2052 if (pp == firstpp) { 2053 pp = NULL; 2054 break; 2055 } 2056 } while (pp->p_pagenum >= pfnhi); 2057 } 2058 if (pp) { 2059 ASSERT(pp->p_szc == nszc); 2060 ret_pp = page_demote(mnode, pp->p_pagenum, 2061 pp->p_szc, szc, color, PC_ALLOC); 2062 if (ret_pp) { 2063 page_freelist_unlock(mnode); 2064 #if defined(__sparc) 2065 if (PP_ISNORELOC(ret_pp)) { 2066 pgcnt_t npgs; 2067 2068 npgs = page_get_pagecnt( 2069 ret_pp->p_szc); 2070 kcage_freemem_sub(npgs); 2071 } 2072 #endif 2073 return (ret_pp); 2074 } 2075 } 2076 page_freelist_unlock(mnode); 2077 } 2078 nszc++; 2079 } 2080 2081 /* 2082 * Ok that didn't work. Time to coalesce. 2083 */ 2084 if (szc != 0) { 2085 ret_pp = page_freelist_coalesce(mnode, szc, color); 2086 } 2087 2088 return (ret_pp); 2089 } 2090 2091 /* 2092 * Helper routine used only by the freelist code to lock 2093 * a page. If the page is a large page then it succeeds in 2094 * locking all the constituent pages or none at all. 2095 * Returns 1 on sucess, 0 on failure. 2096 */ 2097 static int 2098 page_trylock_cons(page_t *pp, se_t se) 2099 { 2100 page_t *tpp, *first_pp = pp; 2101 2102 /* 2103 * Fail if can't lock first or only page. 2104 */ 2105 if (!page_trylock(pp, se)) { 2106 return (0); 2107 } 2108 2109 /* 2110 * PAGESIZE: common case. 2111 */ 2112 if (pp->p_szc == 0) { 2113 return (1); 2114 } 2115 2116 /* 2117 * Large page case. 2118 */ 2119 tpp = pp->p_next; 2120 while (tpp != pp) { 2121 if (!page_trylock(tpp, se)) { 2122 /* 2123 * On failure unlock what we 2124 * have locked so far. 2125 */ 2126 while (first_pp != tpp) { 2127 page_unlock(first_pp); 2128 first_pp = first_pp->p_next; 2129 } 2130 return (0); 2131 } 2132 tpp = tpp->p_next; 2133 } 2134 return (1); 2135 } 2136 2137 page_t * 2138 page_get_mnode_freelist(int mnode, uint_t bin, int mtype, uchar_t szc, 2139 uint_t flags) 2140 { 2141 kmutex_t *pcm; 2142 int i, fill_tried, fill_marker; 2143 page_t *pp, *first_pp; 2144 uint_t bin_marker; 2145 int colors, cpucolors; 2146 uchar_t nszc; 2147 uint_t nszc_color_shift; 2148 int nwaybins = 0, nwaycnt; 2149 2150 ASSERT(szc < mmu_page_sizes); 2151 2152 VM_STAT_ADD(vmm_vmstats.pgmf_alloc[szc]); 2153 2154 /* LINTED */ 2155 MTYPE_START(mnode, mtype, flags); 2156 if (mtype < 0) { /* mnode foes not have memory in mtype range */ 2157 VM_STAT_ADD(vmm_vmstats.pgmf_allocempty[szc]); 2158 return (NULL); 2159 } 2160 2161 /* 2162 * Set how many physical colors for this page size. 2163 */ 2164 colors = (szc) ? page_convert_color(0, szc, page_colors - 1) + 1 : 2165 page_colors; 2166 2167 nszc = MIN(szc + 1, mmu_page_sizes - 1); 2168 nszc_color_shift = page_get_shift(nszc) - page_get_shift(szc); 2169 2170 /* cpu_page_colors is non-zero if a page color may be in > 1 bin */ 2171 cpucolors = cpu_page_colors; 2172 2173 /* 2174 * adjust cpucolors to possibly check additional 'equivalent' bins 2175 * to try to minimize fragmentation of large pages by delaying calls 2176 * to page_freelist_fill. 2177 */ 2178 if (colorequiv > 1) { 2179 int equivcolors = colors / colorequiv; 2180 2181 if (equivcolors && (cpucolors == 0 || equivcolors < cpucolors)) 2182 cpucolors = equivcolors; 2183 } 2184 2185 ASSERT(colors <= page_colors); 2186 ASSERT(colors); 2187 ASSERT((colors & (colors - 1)) == 0); 2188 2189 ASSERT(bin < colors); 2190 2191 /* 2192 * Only hold one freelist lock at a time, that way we 2193 * can start anywhere and not have to worry about lock 2194 * ordering. 2195 */ 2196 big_try_again: 2197 fill_tried = 0; 2198 nwaycnt = 0; 2199 for (i = 0; i <= colors; i++) { 2200 try_again: 2201 ASSERT(bin < colors); 2202 if (PAGE_FREELISTS(mnode, szc, bin, mtype)) { 2203 pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST); 2204 mutex_enter(pcm); 2205 pp = PAGE_FREELISTS(mnode, szc, bin, mtype); 2206 if (pp != NULL) { 2207 /* 2208 * These were set before the page 2209 * was put on the free list, 2210 * they must still be set. 2211 */ 2212 ASSERT(PP_ISFREE(pp)); 2213 ASSERT(PP_ISAGED(pp)); 2214 ASSERT(pp->p_vnode == NULL); 2215 ASSERT(pp->p_hash == NULL); 2216 ASSERT(pp->p_offset == (u_offset_t)-1); 2217 ASSERT(pp->p_szc == szc); 2218 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode); 2219 2220 /* 2221 * Walk down the hash chain. 2222 * 8k pages are linked on p_next 2223 * and p_prev fields. Large pages 2224 * are a contiguous group of 2225 * constituent pages linked together 2226 * on their p_next and p_prev fields. 2227 * The large pages are linked together 2228 * on the hash chain using p_vpnext 2229 * p_vpprev of the base constituent 2230 * page of each large page. 2231 */ 2232 first_pp = pp; 2233 while (!page_trylock_cons(pp, SE_EXCL)) { 2234 if (szc == 0) { 2235 pp = pp->p_next; 2236 } else { 2237 pp = pp->p_vpnext; 2238 } 2239 2240 ASSERT(PP_ISFREE(pp)); 2241 ASSERT(PP_ISAGED(pp)); 2242 ASSERT(pp->p_vnode == NULL); 2243 ASSERT(pp->p_hash == NULL); 2244 ASSERT(pp->p_offset == (u_offset_t)-1); 2245 ASSERT(pp->p_szc == szc); 2246 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == 2247 mnode); 2248 2249 if (pp == first_pp) { 2250 pp = NULL; 2251 break; 2252 } 2253 } 2254 2255 if (pp) { 2256 ASSERT(mtype == PP_2_MTYPE(pp)); 2257 ASSERT(pp->p_szc == szc); 2258 if (szc == 0) { 2259 page_sub(&PAGE_FREELISTS(mnode, 2260 szc, bin, mtype), pp); 2261 } else { 2262 page_vpsub(&PAGE_FREELISTS( 2263 mnode, szc, bin, mtype), 2264 pp); 2265 CHK_LPG(pp, szc); 2266 } 2267 page_ctr_sub(pp, PG_FREE_LIST); 2268 2269 if ((PP_ISFREE(pp) == 0) || 2270 (PP_ISAGED(pp) == 0)) 2271 panic("free page is not. pp %p", 2272 (void *)pp); 2273 mutex_exit(pcm); 2274 2275 #if defined(__sparc) 2276 ASSERT(!kcage_on || PP_ISNORELOC(pp) || 2277 (flags & PG_NORELOC) == 0); 2278 2279 if (PP_ISNORELOC(pp)) { 2280 pgcnt_t npgs; 2281 2282 npgs = page_get_pagecnt(szc); 2283 kcage_freemem_sub(npgs); 2284 } 2285 #endif 2286 VM_STAT_ADD(vmm_vmstats. 2287 pgmf_allocok[szc]); 2288 return (pp); 2289 } 2290 } 2291 mutex_exit(pcm); 2292 } 2293 2294 /* 2295 * Wow! The initial bin is empty. 2296 * If specific color is needed, check if page color may be 2297 * in other bins. cpucolors is: 2298 * 0 if the colors for this cpu is equal to page_colors. 2299 * This means that pages with a particular color are in a 2300 * single bin. 2301 * -1 if colors of cpus (cheetah+) are heterogenous. Need to 2302 * first determine the colors for the current cpu. 2303 * >0 colors of all cpus are homogenous and < page_colors 2304 */ 2305 2306 if ((flags & PG_MATCH_COLOR) && (cpucolors != 0)) { 2307 if (!nwaybins) { 2308 /* 2309 * cpucolors is negative if ecache setsizes 2310 * are heterogenous. determine colors for this 2311 * particular cpu. 2312 */ 2313 if (cpucolors < 0) { 2314 cpucolors = CPUSETSIZE() / MMU_PAGESIZE; 2315 ASSERT(cpucolors > 0); 2316 nwaybins = colors / cpucolors; 2317 } else { 2318 nwaybins = colors / cpucolors; 2319 ASSERT(szc > 0 || nwaybins > 1); 2320 } 2321 if (nwaybins < 2) 2322 cpucolors = 0; 2323 } 2324 2325 if (cpucolors && (nwaycnt + 1 <= nwaybins)) { 2326 nwaycnt++; 2327 bin = (bin + (colors / nwaybins)) & 2328 (colors - 1); 2329 if (nwaycnt < nwaybins) { 2330 goto try_again; 2331 } 2332 } 2333 /* back to initial color if fall-thru */ 2334 } 2335 2336 /* 2337 * color bins are all empty if color match. Try and satisfy 2338 * the request by breaking up or coalescing pages from 2339 * a different size freelist of the correct color that 2340 * satisfies the ORIGINAL color requested. If that 2341 * fails then try pages of the same size but different 2342 * colors assuming we are not called with 2343 * PG_MATCH_COLOR. 2344 */ 2345 if (!fill_tried) { 2346 fill_tried = 1; 2347 fill_marker = bin >> nszc_color_shift; 2348 pp = page_freelist_fill(szc, bin, mnode, mtype, 2349 PFNNULL); 2350 if (pp != NULL) { 2351 return (pp); 2352 } 2353 } 2354 2355 if (flags & PG_MATCH_COLOR) 2356 break; 2357 2358 /* 2359 * Select next color bin to try. 2360 */ 2361 if (szc == 0) { 2362 /* 2363 * PAGESIZE page case. 2364 */ 2365 if (i == 0) { 2366 bin = (bin + BIN_STEP) & page_colors_mask; 2367 bin_marker = bin; 2368 } else { 2369 bin = (bin + vac_colors) & page_colors_mask; 2370 if (bin == bin_marker) { 2371 bin = (bin + 1) & page_colors_mask; 2372 bin_marker = bin; 2373 } 2374 } 2375 } else { 2376 /* 2377 * Large page case. 2378 */ 2379 bin = (bin + 1) & (colors - 1); 2380 } 2381 /* 2382 * If bin advanced to the next color bin of the 2383 * next larger pagesize, there is a chance the fill 2384 * could succeed. 2385 */ 2386 if (fill_marker != (bin >> nszc_color_shift)) 2387 fill_tried = 0; 2388 } 2389 2390 #if defined(__sparc) 2391 if (!(flags & (PG_NORELOC | PGI_NOCAGE | PGI_RELOCONLY)) && 2392 (kcage_freemem >= kcage_lotsfree)) { 2393 /* 2394 * The Cage is ON and with plenty of free mem, and 2395 * we're willing to check for a NORELOC page if we 2396 * couldn't find a RELOC page, so spin again. 2397 */ 2398 flags |= PG_NORELOC; 2399 mtype = MTYPE_NORELOC; 2400 goto big_try_again; 2401 } 2402 #else 2403 if (flags & PGI_MT_RANGE) { 2404 /* cycle through range of mtypes */ 2405 MTYPE_NEXT(mnode, mtype, flags); 2406 if (mtype >= 0) 2407 goto big_try_again; 2408 } 2409 #endif 2410 VM_STAT_ADD(vmm_vmstats.pgmf_allocfailed[szc]); 2411 2412 return (NULL); 2413 } 2414 2415 2416 /* 2417 * Returns the count of free pages for 'pp' with size code 'szc'. 2418 * Note: This function does not return an exact value as the page freelist 2419 * locks are not held and thus the values in the page_counters may be 2420 * changing as we walk through the data. 2421 */ 2422 static int 2423 page_freecnt(int mnode, page_t *pp, uchar_t szc) 2424 { 2425 pgcnt_t pgfree; 2426 pgcnt_t cnt; 2427 ssize_t r = szc; /* region size */ 2428 ssize_t idx; 2429 int i; 2430 int full, range; 2431 2432 /* Make sure pagenum passed in is aligned properly */ 2433 ASSERT((pp->p_pagenum & (PNUM_SIZE(szc) - 1)) == 0); 2434 ASSERT(szc > 0); 2435 2436 /* Prevent page_counters dynamic memory from being freed */ 2437 rw_enter(&page_ctrs_rwlock[mnode], RW_READER); 2438 idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum); 2439 cnt = PAGE_COUNTERS(mnode, r, idx); 2440 pgfree = cnt << PNUM_SHIFT(r - 1); 2441 range = FULL_REGION_CNT(szc); 2442 2443 /* Check for completely full region */ 2444 if (cnt == range) { 2445 rw_exit(&page_ctrs_rwlock[mnode]); 2446 return (pgfree); 2447 } 2448 2449 while (--r > 0) { 2450 idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum); 2451 full = FULL_REGION_CNT(r); 2452 for (i = 0; i < range; i++, idx++) { 2453 cnt = PAGE_COUNTERS(mnode, r, idx); 2454 /* 2455 * If cnt here is full, that means we have already 2456 * accounted for these pages earlier. 2457 */ 2458 if (cnt != full) { 2459 pgfree += (cnt << PNUM_SHIFT(r - 1)); 2460 } 2461 } 2462 range *= full; 2463 } 2464 rw_exit(&page_ctrs_rwlock[mnode]); 2465 return (pgfree); 2466 } 2467 2468 /* 2469 * Called from page_geti_contig_pages to exclusively lock constituent pages 2470 * starting from 'spp' for page size code 'szc'. 2471 * 2472 * If 'ptcpthreshold' is set, the number of free pages needed in the 'szc' 2473 * region needs to be greater than or equal to the threshold. 2474 */ 2475 static int 2476 page_trylock_contig_pages(int mnode, page_t *spp, uchar_t szc, int flags) 2477 { 2478 pgcnt_t pgcnt = PNUM_SIZE(szc); 2479 pgcnt_t pgfree, i; 2480 page_t *pp; 2481 2482 VM_STAT_ADD(vmm_vmstats.ptcp[szc]); 2483 2484 2485 if ((ptcpthreshold == 0) || (flags & PGI_PGCPHIPRI)) 2486 goto skipptcpcheck; 2487 /* 2488 * check if there are sufficient free pages available before attempting 2489 * to trylock. Count is approximate as page counters can change. 2490 */ 2491 pgfree = page_freecnt(mnode, spp, szc); 2492 2493 /* attempt to trylock if there are sufficient already free pages */ 2494 if (pgfree < pgcnt/ptcpthreshold) { 2495 VM_STAT_ADD(vmm_vmstats.ptcpfreethresh[szc]); 2496 return (0); 2497 } 2498 2499 skipptcpcheck: 2500 2501 for (i = 0; i < pgcnt; i++) { 2502 pp = &spp[i]; 2503 if (!page_trylock(pp, SE_EXCL)) { 2504 VM_STAT_ADD(vmm_vmstats.ptcpfailexcl[szc]); 2505 while (--i != (pgcnt_t)-1) { 2506 pp = &spp[i]; 2507 ASSERT(PAGE_EXCL(pp)); 2508 page_unlock(pp); 2509 } 2510 return (0); 2511 } 2512 ASSERT(spp[i].p_pagenum == spp->p_pagenum + i); 2513 if ((pp->p_szc > szc || (szc && pp->p_szc == szc)) && 2514 !PP_ISFREE(pp)) { 2515 VM_STAT_ADD(vmm_vmstats.ptcpfailszc[szc]); 2516 ASSERT(i == 0); 2517 page_unlock(pp); 2518 return (0); 2519 } 2520 if (PP_ISNORELOC(pp)) { 2521 VM_STAT_ADD(vmm_vmstats.ptcpfailcage[szc]); 2522 while (i != (pgcnt_t)-1) { 2523 pp = &spp[i]; 2524 ASSERT(PAGE_EXCL(pp)); 2525 page_unlock(pp); 2526 i--; 2527 } 2528 return (0); 2529 } 2530 } 2531 VM_STAT_ADD(vmm_vmstats.ptcpok[szc]); 2532 return (1); 2533 } 2534 2535 /* 2536 * Claim large page pointed to by 'pp'. 'pp' is the starting set 2537 * of 'szc' constituent pages that had been locked exclusively previously. 2538 * Will attempt to relocate constituent pages in use. 2539 */ 2540 static page_t * 2541 page_claim_contig_pages(page_t *pp, uchar_t szc, int flags) 2542 { 2543 spgcnt_t pgcnt, npgs, i; 2544 page_t *targpp, *rpp, *hpp; 2545 page_t *replpp = NULL; 2546 page_t *pplist = NULL; 2547 2548 ASSERT(pp != NULL); 2549 2550 pgcnt = page_get_pagecnt(szc); 2551 while (pgcnt) { 2552 ASSERT(PAGE_EXCL(pp)); 2553 ASSERT(!PP_ISNORELOC(pp)); 2554 if (PP_ISFREE(pp)) { 2555 /* 2556 * If this is a PG_FREE_LIST page then its 2557 * size code can change underneath us due to 2558 * page promotion or demotion. As an optimzation 2559 * use page_list_sub_pages() instead of 2560 * page_list_sub(). 2561 */ 2562 if (PP_ISAGED(pp)) { 2563 page_list_sub_pages(pp, szc); 2564 if (pp->p_szc == szc) { 2565 return (pp); 2566 } 2567 ASSERT(pp->p_szc < szc); 2568 npgs = page_get_pagecnt(pp->p_szc); 2569 hpp = pp; 2570 for (i = 0; i < npgs; i++, pp++) { 2571 pp->p_szc = szc; 2572 } 2573 page_list_concat(&pplist, &hpp); 2574 pgcnt -= npgs; 2575 continue; 2576 } 2577 ASSERT(!PP_ISAGED(pp)); 2578 ASSERT(pp->p_szc == 0); 2579 page_list_sub(pp, PG_CACHE_LIST); 2580 page_hashout(pp, NULL); 2581 PP_SETAGED(pp); 2582 pp->p_szc = szc; 2583 page_list_concat(&pplist, &pp); 2584 pp++; 2585 pgcnt--; 2586 continue; 2587 } 2588 npgs = page_get_pagecnt(pp->p_szc); 2589 2590 /* 2591 * page_create_wait freemem accounting done by caller of 2592 * page_get_freelist and not necessary to call it prior to 2593 * calling page_get_replacement_page. 2594 * 2595 * page_get_replacement_page can call page_get_contig_pages 2596 * to acquire a large page (szc > 0); the replacement must be 2597 * smaller than the contig page size to avoid looping or 2598 * szc == 0 and PGI_PGCPSZC0 is set. 2599 */ 2600 if (pp->p_szc < szc || (szc == 0 && (flags & PGI_PGCPSZC0))) { 2601 replpp = page_get_replacement_page(pp, NULL, 0); 2602 if (replpp) { 2603 npgs = page_get_pagecnt(pp->p_szc); 2604 ASSERT(npgs <= pgcnt); 2605 targpp = pp; 2606 } 2607 } 2608 2609 /* 2610 * If replacement is NULL or do_page_relocate fails, fail 2611 * coalescing of pages. 2612 */ 2613 if (replpp == NULL || (do_page_relocate(&targpp, &replpp, 0, 2614 &npgs, NULL) != 0)) { 2615 /* 2616 * Unlock un-processed target list 2617 */ 2618 while (pgcnt--) { 2619 ASSERT(PAGE_EXCL(pp)); 2620 page_unlock(pp); 2621 pp++; 2622 } 2623 /* 2624 * Free the processed target list. 2625 */ 2626 while (pplist) { 2627 pp = pplist; 2628 page_sub(&pplist, pp); 2629 ASSERT(PAGE_EXCL(pp)); 2630 ASSERT(pp->p_szc == szc); 2631 ASSERT(PP_ISFREE(pp)); 2632 ASSERT(PP_ISAGED(pp)); 2633 pp->p_szc = 0; 2634 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL); 2635 page_unlock(pp); 2636 } 2637 2638 if (replpp != NULL) 2639 page_free_replacement_page(replpp); 2640 2641 return (NULL); 2642 } 2643 ASSERT(pp == targpp); 2644 2645 /* LINTED */ 2646 ASSERT(hpp = pp); /* That's right, it's an assignment */ 2647 2648 pp += npgs; 2649 pgcnt -= npgs; 2650 2651 while (npgs--) { 2652 ASSERT(PAGE_EXCL(targpp)); 2653 ASSERT(!PP_ISFREE(targpp)); 2654 ASSERT(!PP_ISNORELOC(targpp)); 2655 PP_SETFREE(targpp); 2656 ASSERT(PP_ISAGED(targpp)); 2657 ASSERT(targpp->p_szc < szc || (szc == 0 && 2658 (flags & PGI_PGCPSZC0))); 2659 targpp->p_szc = szc; 2660 targpp = targpp->p_next; 2661 2662 rpp = replpp; 2663 ASSERT(rpp != NULL); 2664 page_sub(&replpp, rpp); 2665 ASSERT(PAGE_EXCL(rpp)); 2666 ASSERT(!PP_ISFREE(rpp)); 2667 page_unlock(rpp); 2668 } 2669 ASSERT(targpp == hpp); 2670 ASSERT(replpp == NULL); 2671 page_list_concat(&pplist, &targpp); 2672 } 2673 CHK_LPG(pplist, szc); 2674 return (pplist); 2675 } 2676 2677 /* 2678 * Trim kernel cage from pfnlo-pfnhi and store result in lo-hi. Return code 2679 * of 0 means nothing left after trim. 2680 */ 2681 2682 int 2683 trimkcage(struct memseg *mseg, pfn_t *lo, pfn_t *hi, pfn_t pfnlo, pfn_t pfnhi) 2684 { 2685 pfn_t kcagepfn; 2686 int decr; 2687 int rc = 0; 2688 2689 if (PP_ISNORELOC(mseg->pages)) { 2690 if (PP_ISNORELOC(mseg->epages - 1) == 0) { 2691 2692 /* lower part of this mseg inside kernel cage */ 2693 decr = kcage_current_pfn(&kcagepfn); 2694 2695 /* kernel cage may have transitioned past mseg */ 2696 if (kcagepfn >= mseg->pages_base && 2697 kcagepfn < mseg->pages_end) { 2698 ASSERT(decr == 0); 2699 *lo = kcagepfn; 2700 *hi = MIN(pfnhi, 2701 (mseg->pages_end - 1)); 2702 rc = 1; 2703 } 2704 } 2705 /* else entire mseg in the cage */ 2706 } else { 2707 if (PP_ISNORELOC(mseg->epages - 1)) { 2708 2709 /* upper part of this mseg inside kernel cage */ 2710 decr = kcage_current_pfn(&kcagepfn); 2711 2712 /* kernel cage may have transitioned past mseg */ 2713 if (kcagepfn >= mseg->pages_base && 2714 kcagepfn < mseg->pages_end) { 2715 ASSERT(decr); 2716 *hi = kcagepfn; 2717 *lo = MAX(pfnlo, mseg->pages_base); 2718 rc = 1; 2719 } 2720 } else { 2721 /* entire mseg outside of kernel cage */ 2722 *lo = MAX(pfnlo, mseg->pages_base); 2723 *hi = MIN(pfnhi, (mseg->pages_end - 1)); 2724 rc = 1; 2725 } 2726 } 2727 return (rc); 2728 } 2729 2730 /* 2731 * called from page_get_contig_pages to search 'pfnlo' thru 'pfnhi' to "claim" a 2732 * page with size code 'szc'. Claiming such a page requires acquiring 2733 * exclusive locks on all constituent pages (page_trylock_contig_pages), 2734 * relocating pages in use and concatenating these constituent pages into a 2735 * large page. 2736 * 2737 * The page lists do not have such a large page and page_freelist_fill has 2738 * already failed to demote larger pages and/or coalesce smaller free pages. 2739 * 2740 * 'flags' may specify PG_COLOR_MATCH which would limit the search of large 2741 * pages with the same color as 'bin'. 2742 * 2743 * 'pfnflag' specifies the subset of the pfn range to search. 2744 */ 2745 2746 2747 static page_t * 2748 page_geti_contig_pages(int mnode, uint_t bin, uchar_t szc, int flags, 2749 pfn_t pfnlo, pfn_t pfnhi, int pfnflag) 2750 { 2751 struct memseg *mseg; 2752 pgcnt_t szcpgcnt = page_get_pagecnt(szc); 2753 pgcnt_t szcpgmask = szcpgcnt - 1; 2754 pfn_t randpfn; 2755 page_t *pp, *randpp, *endpp; 2756 uint_t colors; 2757 pfn_t hi, lo; 2758 uint_t skip; 2759 2760 ASSERT(szc != 0 || (flags & PGI_PGCPSZC0)); 2761 2762 if ((pfnhi - pfnlo) + 1 < szcpgcnt) 2763 return (NULL); 2764 2765 ASSERT(szc < mmu_page_sizes); 2766 2767 colors = (szc) ? page_convert_color(0, szc, page_colors - 1) + 1 : 2768 page_colors; 2769 2770 ASSERT(bin < colors); 2771 2772 /* 2773 * trim the pfn range to search based on pfnflag. pfnflag is set 2774 * when there have been previous page_get_contig_page failures to 2775 * limit the search. 2776 * 2777 * The high bit in pfnflag specifies the number of 'slots' in the 2778 * pfn range and the remainder of pfnflag specifies which slot. 2779 * For example, a value of 1010b would mean the second slot of 2780 * the pfn range that has been divided into 8 slots. 2781 */ 2782 if (pfnflag > 1) { 2783 int slots = 1 << (highbit(pfnflag) - 1); 2784 int slotid = pfnflag & (slots - 1); 2785 pgcnt_t szcpages; 2786 int slotlen; 2787 2788 pfnlo = P2ROUNDUP(pfnlo, szcpgcnt); 2789 pfnhi = pfnhi & ~(szcpgcnt - 1); 2790 2791 szcpages = ((pfnhi - pfnlo) + 1) / szcpgcnt; 2792 slotlen = howmany(szcpages, slots); 2793 pfnlo = pfnlo + (((slotid * slotlen) % szcpages) * szcpgcnt); 2794 ASSERT(pfnlo < pfnhi); 2795 if (pfnhi > pfnlo + (slotlen * szcpgcnt)) 2796 pfnhi = pfnlo + (slotlen * szcpgcnt); 2797 } 2798 2799 memsegs_lock(0); 2800 2801 /* 2802 * loop through memsegs to look for contig page candidates 2803 */ 2804 2805 for (mseg = memsegs; mseg != NULL; mseg = mseg->next) { 2806 if (pfnhi < mseg->pages_base || pfnlo >= mseg->pages_end) { 2807 /* no overlap */ 2808 continue; 2809 } 2810 2811 if (mseg->pages_end - mseg->pages_base < szcpgcnt) 2812 /* mseg too small */ 2813 continue; 2814 2815 /* trim off kernel cage pages from pfn range */ 2816 if (kcage_on) { 2817 if (trimkcage(mseg, &lo, &hi, pfnlo, pfnhi) == 0) 2818 continue; 2819 } else { 2820 lo = MAX(pfnlo, mseg->pages_base); 2821 hi = MIN(pfnhi, (mseg->pages_end - 1)); 2822 } 2823 2824 /* round to szcpgcnt boundaries */ 2825 lo = P2ROUNDUP(lo, szcpgcnt); 2826 hi = hi & ~(szcpgcnt - 1); 2827 2828 if (hi <= lo) 2829 continue; 2830 2831 /* 2832 * set lo to point to the pfn for the desired bin. Large 2833 * page sizes may only have a single page color 2834 */ 2835 if ((colors > 1) && (flags & PG_MATCH_COLOR)) { 2836 uint_t lobin; 2837 2838 /* 2839 * factor in colorequiv to check additional 2840 * 'equivalent' bins. 2841 */ 2842 if (colorequiv > 1 && colors > colorequiv) 2843 colors = colors / colorequiv; 2844 2845 /* determine bin that lo currently points to */ 2846 lobin = (lo & ((szcpgcnt * colors) - 1)) / szcpgcnt; 2847 2848 /* 2849 * set lo to point at appropriate color and set skip 2850 * to arrive at the next szc page of the same color. 2851 */ 2852 lo += ((bin - lobin) & (colors - 1)) * szcpgcnt; 2853 2854 skip = colors * szcpgcnt; 2855 } else { 2856 /* check all pages starting from lo */ 2857 skip = szcpgcnt; 2858 } 2859 if (hi <= lo) 2860 /* mseg cannot satisfy color request */ 2861 continue; 2862 2863 /* randomly choose a point between lo and hi to begin search */ 2864 2865 randpfn = (pfn_t)GETTICK(); 2866 randpfn = ((randpfn % (hi - lo)) + lo) & ~(skip - 1); 2867 randpp = mseg->pages + (randpfn - mseg->pages_base); 2868 2869 ASSERT(randpp->p_pagenum == randpfn); 2870 2871 pp = randpp; 2872 endpp = mseg->pages + (hi - mseg->pages_base); 2873 2874 ASSERT(randpp + szcpgcnt <= endpp); 2875 2876 do { 2877 ASSERT(!(pp->p_pagenum & szcpgmask)); 2878 ASSERT((flags & PG_MATCH_COLOR) == 0 || 2879 colorequiv > 1 || 2880 PP_2_BIN(pp) == bin); 2881 if (page_trylock_contig_pages(mnode, pp, szc, flags)) { 2882 /* pages unlocked by page_claim on failure */ 2883 if (page_claim_contig_pages(pp, szc, flags)) { 2884 memsegs_unlock(0); 2885 return (pp); 2886 } 2887 } 2888 2889 pp += skip; 2890 if (pp >= endpp) { 2891 /* start from the beginning */ 2892 pp = mseg->pages + (lo - mseg->pages_base); 2893 ASSERT(pp->p_pagenum == lo); 2894 ASSERT(pp + szcpgcnt <= endpp); 2895 } 2896 } while (pp != randpp); 2897 } 2898 memsegs_unlock(0); 2899 return (NULL); 2900 } 2901 2902 2903 /* 2904 * controlling routine that searches through physical memory in an attempt to 2905 * claim a large page based on the input parameters. 2906 * on the page free lists. 2907 * 2908 * calls page_geti_contig_pages with an initial pfn range from the mnode 2909 * and mtype. page_geti_contig_pages will trim off the parts of the pfn range 2910 * that overlaps with the kernel cage or does not match the requested page 2911 * color if PG_MATCH_COLOR is set. Since this search is very expensive, 2912 * page_geti_contig_pages may further limit the search range based on 2913 * previous failure counts (pgcpfailcnt[]). 2914 * 2915 * for PGI_PGCPSZC0 requests, page_get_contig_pages will relocate a base 2916 * pagesize page that satisfies mtype. 2917 */ 2918 page_t * 2919 page_get_contig_pages(int mnode, uint_t bin, int mtype, uchar_t szc, 2920 uint_t flags) 2921 { 2922 pfn_t pfnlo, pfnhi; /* contig pages pfn range */ 2923 page_t *pp; 2924 int pfnflag = 0; /* no limit on search if 0 */ 2925 2926 VM_STAT_ADD(vmm_vmstats.pgcp_alloc[szc]); 2927 2928 /* LINTED */ 2929 MTYPE_START(mnode, mtype, flags); 2930 if (mtype < 0) { /* mnode does not have memory in mtype range */ 2931 VM_STAT_ADD(vmm_vmstats.pgcp_allocempty[szc]); 2932 return (NULL); 2933 } 2934 2935 ASSERT(szc > 0 || (flags & PGI_PGCPSZC0)); 2936 2937 /* do not limit search and ignore color if hi pri */ 2938 2939 if (pgcplimitsearch && ((flags & PGI_PGCPHIPRI) == 0)) 2940 pfnflag = pgcpfailcnt[szc]; 2941 2942 /* remove color match to improve chances */ 2943 2944 if (flags & PGI_PGCPHIPRI || pfnflag) 2945 flags &= ~PG_MATCH_COLOR; 2946 2947 do { 2948 /* get pfn range based on mnode and mtype */ 2949 MNODETYPE_2_PFN(mnode, mtype, pfnlo, pfnhi); 2950 2951 ASSERT(pfnhi >= pfnlo); 2952 2953 pp = page_geti_contig_pages(mnode, bin, szc, flags, 2954 pfnlo, pfnhi, pfnflag); 2955 2956 if (pp != NULL) { 2957 pfnflag = pgcpfailcnt[szc]; 2958 if (pfnflag) { 2959 /* double the search size */ 2960 pgcpfailcnt[szc] = pfnflag >> 1; 2961 } 2962 VM_STAT_ADD(vmm_vmstats.pgcp_allocok[szc]); 2963 return (pp); 2964 } 2965 /* LINTED */ 2966 } while ((flags & PGI_MT_RANGE) && 2967 (MTYPE_NEXT(mnode, mtype, flags) >= 0)); 2968 2969 VM_STAT_ADD(vmm_vmstats.pgcp_allocfailed[szc]); 2970 return (NULL); 2971 } 2972 2973 2974 /* 2975 * Find the `best' page on the freelist for this (vp,off) (as,vaddr) pair. 2976 * 2977 * Does its own locking and accounting. 2978 * If PG_MATCH_COLOR is set, then NULL will be returned if there are no 2979 * pages of the proper color even if there are pages of a different color. 2980 * 2981 * Finds a page, removes it, THEN locks it. 2982 */ 2983 2984 /*ARGSUSED*/ 2985 page_t * 2986 page_get_freelist(struct vnode *vp, u_offset_t off, struct seg *seg, 2987 caddr_t vaddr, size_t size, uint_t flags, struct lgrp *lgrp) 2988 { 2989 struct as *as = seg->s_as; 2990 page_t *pp = NULL; 2991 ulong_t bin; 2992 uchar_t szc; 2993 int mnode; 2994 int mtype; 2995 page_t *(*page_get_func)(int, uint_t, int, uchar_t, uint_t); 2996 lgrp_mnode_cookie_t lgrp_cookie; 2997 2998 page_get_func = page_get_mnode_freelist; 2999 3000 /* 3001 * If we aren't passed a specific lgroup, or passed a freed lgrp 3002 * assume we wish to allocate near to the current thread's home. 3003 */ 3004 if (!LGRP_EXISTS(lgrp)) 3005 lgrp = lgrp_home_lgrp(); 3006 3007 if (kcage_on) { 3008 if ((flags & (PG_NORELOC | PG_PANIC)) == PG_NORELOC && 3009 kcage_freemem < kcage_throttlefree + btop(size) && 3010 curthread != kcage_cageout_thread) { 3011 /* 3012 * Set a "reserve" of kcage_throttlefree pages for 3013 * PG_PANIC and cageout thread allocations. 3014 * 3015 * Everybody else has to serialize in 3016 * page_create_get_something() to get a cage page, so 3017 * that we don't deadlock cageout! 3018 */ 3019 return (NULL); 3020 } 3021 } else { 3022 flags &= ~PG_NORELOC; 3023 flags |= PGI_NOCAGE; 3024 } 3025 3026 /* LINTED */ 3027 MTYPE_INIT(mtype, vp, vaddr, flags); 3028 3029 /* 3030 * Convert size to page size code. 3031 */ 3032 if ((szc = page_szc(size)) == (uchar_t)-1) 3033 panic("page_get_freelist: illegal page size request"); 3034 ASSERT(szc < mmu_page_sizes); 3035 3036 VM_STAT_ADD(vmm_vmstats.pgf_alloc[szc]); 3037 3038 /* LINTED */ 3039 AS_2_BIN(as, seg, vp, vaddr, bin); 3040 3041 /* bin is for base pagesize color - convert if larger pagesize. */ 3042 if (szc) 3043 bin = page_convert_color(0, szc, bin); 3044 3045 /* 3046 * Try to get a local page first, but try remote if we can't 3047 * get a page of the right color. 3048 */ 3049 pgretry: 3050 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL); 3051 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { 3052 pp = page_get_func(mnode, bin, mtype, szc, flags); 3053 if (pp != NULL) { 3054 VM_STAT_ADD(vmm_vmstats.pgf_allocok[szc]); 3055 DTRACE_PROBE4(page__get, 3056 lgrp_t *, lgrp, 3057 int, mnode, 3058 ulong_t, bin, 3059 uint_t, flags); 3060 return (pp); 3061 } 3062 } 3063 ASSERT(pp == NULL); 3064 3065 /* 3066 * for non-SZC0 PAGESIZE requests, check cachelist before checking 3067 * remote free lists. Caller expected to call page_get_cachelist which 3068 * will check local cache lists and remote free lists. 3069 */ 3070 if (szc == 0 && ((flags & PGI_PGCPSZC0) == 0)) { 3071 VM_STAT_ADD(vmm_vmstats.pgf_allocdeferred); 3072 return (NULL); 3073 } 3074 3075 ASSERT(szc > 0 || (flags & PGI_PGCPSZC0)); 3076 3077 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1); 3078 3079 /* 3080 * Try to get a non-local freelist page. 3081 */ 3082 LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie); 3083 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { 3084 pp = page_get_func(mnode, bin, mtype, szc, flags); 3085 if (pp != NULL) { 3086 DTRACE_PROBE4(page__get, 3087 lgrp_t *, lgrp, 3088 int, mnode, 3089 ulong_t, bin, 3090 uint_t, flags); 3091 VM_STAT_ADD(vmm_vmstats.pgf_allocokrem[szc]); 3092 return (pp); 3093 } 3094 } 3095 3096 ASSERT(pp == NULL); 3097 3098 /* 3099 * when the cage is off chances are page_get_contig_pages() will fail 3100 * to lock a large page chunk therefore when the cage is off it's not 3101 * called by default. this can be changed via /etc/system. 3102 * 3103 * page_get_contig_pages() also called to acquire a base pagesize page 3104 * for page_create_get_something(). 3105 */ 3106 if (!(flags & PG_NORELOC) && (pg_contig_disable == 0) && 3107 (kcage_on || pg_lpgcreate_nocage || szc == 0) && 3108 (page_get_func != page_get_contig_pages)) { 3109 3110 VM_STAT_ADD(vmm_vmstats.pgf_allocretry[szc]); 3111 page_get_func = page_get_contig_pages; 3112 goto pgretry; 3113 } 3114 3115 if (pgcplimitsearch && page_get_func == page_get_contig_pages) 3116 pgcpfailcnt[szc]++; 3117 3118 VM_STAT_ADD(vmm_vmstats.pgf_allocfailed[szc]); 3119 return (NULL); 3120 } 3121 3122 /* 3123 * Find the `best' page on the cachelist for this (vp,off) (as,vaddr) pair. 3124 * 3125 * Does its own locking. 3126 * If PG_MATCH_COLOR is set, then NULL will be returned if there are no 3127 * pages of the proper color even if there are pages of a different color. 3128 * Otherwise, scan the bins for ones with pages. For each bin with pages, 3129 * try to lock one of them. If no page can be locked, try the 3130 * next bin. Return NULL if a page can not be found and locked. 3131 * 3132 * Finds a pages, trys to lock it, then removes it. 3133 */ 3134 3135 /*ARGSUSED*/ 3136 page_t * 3137 page_get_cachelist(struct vnode *vp, u_offset_t off, struct seg *seg, 3138 caddr_t vaddr, uint_t flags, struct lgrp *lgrp) 3139 { 3140 page_t *pp; 3141 struct as *as = seg->s_as; 3142 ulong_t bin; 3143 /*LINTED*/ 3144 int mnode; 3145 int mtype; 3146 lgrp_mnode_cookie_t lgrp_cookie; 3147 3148 /* 3149 * If we aren't passed a specific lgroup, or pasased a freed lgrp 3150 * assume we wish to allocate near to the current thread's home. 3151 */ 3152 if (!LGRP_EXISTS(lgrp)) 3153 lgrp = lgrp_home_lgrp(); 3154 3155 if (!kcage_on) { 3156 flags &= ~PG_NORELOC; 3157 flags |= PGI_NOCAGE; 3158 } 3159 3160 if ((flags & (PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == PG_NORELOC && 3161 kcage_freemem <= kcage_throttlefree) { 3162 /* 3163 * Reserve kcage_throttlefree pages for critical kernel 3164 * threads. 3165 * 3166 * Everybody else has to go to page_create_get_something() 3167 * to get a cage page, so we don't deadlock cageout. 3168 */ 3169 return (NULL); 3170 } 3171 3172 /* LINTED */ 3173 AS_2_BIN(as, seg, vp, vaddr, bin); 3174 3175 ASSERT(bin <= page_colors_mask); 3176 3177 /* LINTED */ 3178 MTYPE_INIT(mtype, vp, vaddr, flags); 3179 3180 VM_STAT_ADD(vmm_vmstats.pgc_alloc); 3181 3182 /* 3183 * Try local cachelists first 3184 */ 3185 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL); 3186 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { 3187 pp = page_get_mnode_cachelist(bin, flags, mnode, mtype); 3188 if (pp != NULL) { 3189 VM_STAT_ADD(vmm_vmstats.pgc_allocok); 3190 DTRACE_PROBE4(page__get, 3191 lgrp_t *, lgrp, 3192 int, mnode, 3193 ulong_t, bin, 3194 uint_t, flags); 3195 return (pp); 3196 } 3197 } 3198 3199 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1); 3200 3201 /* 3202 * Try freelists/cachelists that are farther away 3203 * This is our only chance to allocate remote pages for PAGESIZE 3204 * requests. 3205 */ 3206 LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie); 3207 while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) { 3208 pp = page_get_mnode_freelist(mnode, bin, mtype, 3209 0, flags); 3210 if (pp != NULL) { 3211 VM_STAT_ADD(vmm_vmstats.pgc_allocokdeferred); 3212 DTRACE_PROBE4(page__get, 3213 lgrp_t *, lgrp, 3214 int, mnode, 3215 ulong_t, bin, 3216 uint_t, flags); 3217 return (pp); 3218 } 3219 pp = page_get_mnode_cachelist(bin, flags, mnode, mtype); 3220 if (pp != NULL) { 3221 VM_STAT_ADD(vmm_vmstats.pgc_allocokrem); 3222 DTRACE_PROBE4(page__get, 3223 lgrp_t *, lgrp, 3224 int, mnode, 3225 ulong_t, bin, 3226 uint_t, flags); 3227 return (pp); 3228 } 3229 } 3230 3231 VM_STAT_ADD(vmm_vmstats.pgc_allocfailed); 3232 return (NULL); 3233 } 3234 3235 page_t * 3236 page_get_mnode_cachelist(uint_t bin, uint_t flags, int mnode, int mtype) 3237 { 3238 kmutex_t *pcm; 3239 int i; 3240 page_t *pp; 3241 page_t *first_pp; 3242 uint_t bin_marker; 3243 int nwaybins, nwaycnt; 3244 int cpucolors; 3245 3246 VM_STAT_ADD(vmm_vmstats.pgmc_alloc); 3247 3248 /* LINTED */ 3249 MTYPE_START(mnode, mtype, flags); 3250 if (mtype < 0) { /* mnode does not have memory in mtype range */ 3251 VM_STAT_ADD(vmm_vmstats.pgmc_allocempty); 3252 return (NULL); 3253 } 3254 3255 nwaybins = 0; 3256 cpucolors = cpu_page_colors; 3257 /* 3258 * adjust cpucolors to possibly check additional 'equivalent' bins 3259 * to try to minimize fragmentation of large pages by delaying calls 3260 * to page_freelist_fill. 3261 */ 3262 if (colorequiv > 1) { 3263 int equivcolors = page_colors / colorequiv; 3264 3265 if (equivcolors && (cpucolors == 0 || equivcolors < cpucolors)) 3266 cpucolors = equivcolors; 3267 } 3268 3269 /* 3270 * Only hold one cachelist lock at a time, that way we 3271 * can start anywhere and not have to worry about lock 3272 * ordering. 3273 */ 3274 3275 big_try_again: 3276 nwaycnt = 0; 3277 for (i = 0; i <= page_colors; i++) { 3278 if (PAGE_CACHELISTS(mnode, bin, mtype)) { 3279 pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST); 3280 mutex_enter(pcm); 3281 pp = PAGE_CACHELISTS(mnode, bin, mtype); 3282 if (pp != NULL) { 3283 first_pp = pp; 3284 ASSERT(pp->p_vnode); 3285 ASSERT(PP_ISAGED(pp) == 0); 3286 ASSERT(pp->p_szc == 0); 3287 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode); 3288 while (!page_trylock(pp, SE_EXCL)) { 3289 pp = pp->p_next; 3290 ASSERT(pp->p_szc == 0); 3291 if (pp == first_pp) { 3292 /* 3293 * We have searched the 3294 * complete list! 3295 * And all of them (might 3296 * only be one) are locked. 3297 * This can happen since 3298 * these pages can also be 3299 * found via the hash list. 3300 * When found via the hash 3301 * list, they are locked 3302 * first, then removed. 3303 * We give up to let the 3304 * other thread run. 3305 */ 3306 pp = NULL; 3307 break; 3308 } 3309 ASSERT(pp->p_vnode); 3310 ASSERT(PP_ISFREE(pp)); 3311 ASSERT(PP_ISAGED(pp) == 0); 3312 ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == 3313 mnode); 3314 } 3315 3316 if (pp) { 3317 page_t **ppp; 3318 /* 3319 * Found and locked a page. 3320 * Pull it off the list. 3321 */ 3322 ASSERT(mtype == PP_2_MTYPE(pp)); 3323 ppp = &PAGE_CACHELISTS(mnode, bin, 3324 mtype); 3325 page_sub(ppp, pp); 3326 /* 3327 * Subtract counters before releasing 3328 * pcm mutex to avoid a race with 3329 * page_freelist_coalesce and 3330 * page_freelist_fill. 3331 */ 3332 page_ctr_sub(pp, PG_CACHE_LIST); 3333 mutex_exit(pcm); 3334 ASSERT(pp->p_vnode); 3335 ASSERT(PP_ISAGED(pp) == 0); 3336 #if defined(__sparc) 3337 ASSERT(!kcage_on || 3338 (flags & PG_NORELOC) == 0 || 3339 PP_ISNORELOC(pp)); 3340 if (PP_ISNORELOC(pp)) { 3341 kcage_freemem_sub(1); 3342 } 3343 #endif 3344 VM_STAT_ADD(vmm_vmstats. 3345 pgmc_allocok); 3346 return (pp); 3347 } 3348 } 3349 mutex_exit(pcm); 3350 } 3351 3352 /* 3353 * Wow! The initial bin is empty or no page in the bin could 3354 * be locked. 3355 * 3356 * If specific color is needed, check if page color may be in 3357 * other bins. 3358 */ 3359 if ((flags & PG_MATCH_COLOR) && (cpucolors != 0)) { 3360 if (!nwaybins) { 3361 if (cpucolors < 0) { 3362 cpucolors = CPUSETSIZE() / MMU_PAGESIZE; 3363 ASSERT(cpucolors > 0); 3364 nwaybins = page_colors / cpucolors; 3365 if (nwaybins < 2) 3366 cpucolors = 0; 3367 } else { 3368 nwaybins = page_colors / cpucolors; 3369 ASSERT(nwaybins > 1); 3370 } 3371 } 3372 3373 if (++nwaycnt >= nwaybins) { 3374 break; 3375 } 3376 bin = (bin + (page_colors / nwaybins)) & 3377 page_colors_mask; 3378 continue; 3379 } 3380 3381 if (i == 0) { 3382 bin = (bin + BIN_STEP) & page_colors_mask; 3383 bin_marker = bin; 3384 } else { 3385 bin = (bin + vac_colors) & page_colors_mask; 3386 if (bin == bin_marker) { 3387 bin = (bin + 1) & page_colors_mask; 3388 bin_marker = bin; 3389 } 3390 } 3391 } 3392 3393 #if defined(__sparc) 3394 if (!(flags & (PG_NORELOC | PGI_NOCAGE | PGI_RELOCONLY)) && 3395 (kcage_freemem >= kcage_lotsfree)) { 3396 /* 3397 * The Cage is ON and with plenty of free mem, and 3398 * we're willing to check for a NORELOC page if we 3399 * couldn't find a RELOC page, so spin again. 3400 */ 3401 flags |= PG_NORELOC; 3402 mtype = MTYPE_NORELOC; 3403 goto big_try_again; 3404 } 3405 #else 3406 if (flags & PGI_MT_RANGE) { 3407 MTYPE_NEXT(mnode, mtype, flags); 3408 if (mtype >= 0) 3409 goto big_try_again; 3410 } 3411 #endif 3412 VM_STAT_ADD(vmm_vmstats.pgmc_allocfailed); 3413 return (NULL); 3414 } 3415 3416 #ifdef DEBUG 3417 #define REPL_PAGE_STATS 3418 #endif /* DEBUG */ 3419 3420 #ifdef REPL_PAGE_STATS 3421 struct repl_page_stats { 3422 uint_t ngets; 3423 uint_t ngets_noreloc; 3424 uint_t npgr_noreloc; 3425 uint_t nnopage_first; 3426 uint_t nnopage; 3427 uint_t nhashout; 3428 uint_t nnofree; 3429 uint_t nnext_pp; 3430 } repl_page_stats; 3431 #define REPL_STAT_INCR(v) atomic_add_32(&repl_page_stats.v, 1) 3432 #else /* REPL_PAGE_STATS */ 3433 #define REPL_STAT_INCR(v) 3434 #endif /* REPL_PAGE_STATS */ 3435 3436 int pgrppgcp; 3437 3438 /* 3439 * The freemem accounting must be done by the caller. 3440 * First we try to get a replacement page of the same size as like_pp, 3441 * if that is not possible, then we just get a set of discontiguous 3442 * PAGESIZE pages. 3443 */ 3444 page_t * 3445 page_get_replacement_page(page_t *orig_like_pp, struct lgrp *lgrp_target, 3446 uint_t pgrflags) 3447 { 3448 page_t *like_pp; 3449 page_t *pp, *pplist; 3450 page_t *pl = NULL; 3451 ulong_t bin; 3452 int mnode, page_mnode; 3453 int szc; 3454 spgcnt_t npgs, pg_cnt; 3455 pfn_t pfnum; 3456 int mtype; 3457 int flags = 0; 3458 lgrp_mnode_cookie_t lgrp_cookie; 3459 lgrp_t *lgrp; 3460 3461 REPL_STAT_INCR(ngets); 3462 like_pp = orig_like_pp; 3463 ASSERT(PAGE_EXCL(like_pp)); 3464 3465 szc = like_pp->p_szc; 3466 npgs = page_get_pagecnt(szc); 3467 /* 3468 * Now we reset like_pp to the base page_t. 3469 * That way, we won't walk past the end of this 'szc' page. 3470 */ 3471 pfnum = PFN_BASE(like_pp->p_pagenum, szc); 3472 like_pp = page_numtopp_nolock(pfnum); 3473 ASSERT(like_pp->p_szc == szc); 3474 3475 if (PP_ISNORELOC(like_pp)) { 3476 ASSERT(kcage_on); 3477 REPL_STAT_INCR(ngets_noreloc); 3478 flags = PGI_RELOCONLY; 3479 } else if (pgrflags & PGR_NORELOC) { 3480 ASSERT(kcage_on); 3481 REPL_STAT_INCR(npgr_noreloc); 3482 flags = PG_NORELOC; 3483 } 3484 3485 /* 3486 * Kernel pages must always be replaced with the same size 3487 * pages, since we cannot properly handle demotion of kernel 3488 * pages. 3489 */ 3490 if (like_pp->p_vnode == &kvp) 3491 pgrflags |= PGR_SAMESZC; 3492 3493 /* LINTED */ 3494 MTYPE_PGR_INIT(mtype, flags, like_pp, page_mnode); 3495 3496 while (npgs) { 3497 pplist = NULL; 3498 for (;;) { 3499 pg_cnt = page_get_pagecnt(szc); 3500 bin = PP_2_BIN(like_pp); 3501 ASSERT(like_pp->p_szc == orig_like_pp->p_szc); 3502 ASSERT(pg_cnt <= npgs); 3503 3504 /* 3505 * If an lgroup was specified, try to get the 3506 * page from that lgroup. 3507 * NOTE: Must be careful with code below because 3508 * lgroup may disappear and reappear since there 3509 * is no locking for lgroup here. 3510 */ 3511 if (LGRP_EXISTS(lgrp_target)) { 3512 /* 3513 * Keep local variable for lgroup separate 3514 * from lgroup argument since this code should 3515 * only be exercised when lgroup argument 3516 * exists.... 3517 */ 3518 lgrp = lgrp_target; 3519 3520 /* Try the lgroup's freelists first */ 3521 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, 3522 LGRP_SRCH_LOCAL); 3523 while ((pplist == NULL) && 3524 (mnode = lgrp_memnode_choose(&lgrp_cookie)) 3525 != -1) { 3526 pplist = page_get_mnode_freelist( 3527 mnode, bin, mtype, szc, 3528 flags); 3529 } 3530 3531 /* 3532 * Now try it's cachelists if this is a 3533 * small page. Don't need to do it for 3534 * larger ones since page_freelist_coalesce() 3535 * already failed. 3536 */ 3537 if (pplist != NULL || szc != 0) 3538 break; 3539 3540 /* Now try it's cachelists */ 3541 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, 3542 LGRP_SRCH_LOCAL); 3543 3544 while ((pplist == NULL) && 3545 (mnode = lgrp_memnode_choose(&lgrp_cookie)) 3546 != -1) { 3547 pplist = page_get_mnode_cachelist( 3548 bin, flags, mnode, mtype); 3549 } 3550 if (pplist != NULL) { 3551 page_hashout(pplist, NULL); 3552 PP_SETAGED(pplist); 3553 REPL_STAT_INCR(nhashout); 3554 break; 3555 } 3556 /* Done looking in this lgroup. Bail out. */ 3557 break; 3558 } 3559 3560 /* 3561 * No lgroup was specified (or lgroup was removed by 3562 * DR, so just try to get the page as close to 3563 * like_pp's mnode as possible. 3564 * First try the local freelist... 3565 */ 3566 mnode = PP_2_MEM_NODE(like_pp); 3567 pplist = page_get_mnode_freelist(mnode, bin, 3568 mtype, szc, flags); 3569 if (pplist != NULL) 3570 break; 3571 3572 REPL_STAT_INCR(nnofree); 3573 3574 /* 3575 * ...then the local cachelist. Don't need to do it for 3576 * larger pages cause page_freelist_coalesce() already 3577 * failed there anyway. 3578 */ 3579 if (szc == 0) { 3580 pplist = page_get_mnode_cachelist(bin, flags, 3581 mnode, mtype); 3582 if (pplist != NULL) { 3583 page_hashout(pplist, NULL); 3584 PP_SETAGED(pplist); 3585 REPL_STAT_INCR(nhashout); 3586 break; 3587 } 3588 } 3589 3590 /* Now try remote freelists */ 3591 page_mnode = mnode; 3592 lgrp = 3593 lgrp_hand_to_lgrp(MEM_NODE_2_LGRPHAND(page_mnode)); 3594 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, 3595 LGRP_SRCH_HIER); 3596 while (pplist == NULL && 3597 (mnode = lgrp_memnode_choose(&lgrp_cookie)) 3598 != -1) { 3599 /* 3600 * Skip local mnode. 3601 */ 3602 if ((mnode == page_mnode) || 3603 (mem_node_config[mnode].exists == 0)) 3604 continue; 3605 3606 pplist = page_get_mnode_freelist(mnode, 3607 bin, mtype, szc, flags); 3608 } 3609 3610 if (pplist != NULL) 3611 break; 3612 3613 3614 /* Now try remote cachelists */ 3615 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, 3616 LGRP_SRCH_HIER); 3617 while (pplist == NULL && szc == 0) { 3618 mnode = lgrp_memnode_choose(&lgrp_cookie); 3619 if (mnode == -1) 3620 break; 3621 /* 3622 * Skip local mnode. 3623 */ 3624 if ((mnode == page_mnode) || 3625 (mem_node_config[mnode].exists == 0)) 3626 continue; 3627 3628 pplist = page_get_mnode_cachelist(bin, 3629 flags, mnode, mtype); 3630 3631 if (pplist != NULL) { 3632 page_hashout(pplist, NULL); 3633 PP_SETAGED(pplist); 3634 REPL_STAT_INCR(nhashout); 3635 break; 3636 } 3637 } 3638 3639 /* 3640 * Break out of while loop under the following cases: 3641 * - If we successfully got a page. 3642 * - If pgrflags specified only returning a specific 3643 * page size and we could not find that page size. 3644 * - If we could not satisfy the request with PAGESIZE 3645 * or larger pages. 3646 */ 3647 if (pplist != NULL || szc == 0) 3648 break; 3649 3650 if ((pgrflags & PGR_SAMESZC) || pgrppgcp) { 3651 /* try to find contig page */ 3652 3653 LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, 3654 LGRP_SRCH_HIER); 3655 3656 while ((pplist == NULL) && 3657 (mnode = 3658 lgrp_memnode_choose(&lgrp_cookie)) 3659 != -1) { 3660 pplist = page_get_contig_pages( 3661 mnode, bin, mtype, szc, 3662 flags | PGI_PGCPHIPRI); 3663 } 3664 break; 3665 } 3666 3667 /* 3668 * The correct thing to do here is try the next 3669 * page size down using szc--. Due to a bug 3670 * with the processing of HAT_RELOAD_SHARE 3671 * where the sfmmu_ttecnt arrays of all 3672 * hats sharing an ISM segment don't get updated, 3673 * using intermediate size pages for relocation 3674 * can lead to continuous page faults. 3675 */ 3676 szc = 0; 3677 } 3678 3679 if (pplist != NULL) { 3680 DTRACE_PROBE4(page__get, 3681 lgrp_t *, lgrp, 3682 int, mnode, 3683 ulong_t, bin, 3684 uint_t, flags); 3685 3686 while (pplist != NULL && pg_cnt--) { 3687 ASSERT(pplist != NULL); 3688 pp = pplist; 3689 page_sub(&pplist, pp); 3690 PP_CLRFREE(pp); 3691 PP_CLRAGED(pp); 3692 page_list_concat(&pl, &pp); 3693 npgs--; 3694 like_pp = like_pp + 1; 3695 REPL_STAT_INCR(nnext_pp); 3696 } 3697 ASSERT(pg_cnt == 0); 3698 } else { 3699 break; 3700 } 3701 } 3702 3703 if (npgs) { 3704 /* 3705 * We were unable to allocate the necessary number 3706 * of pages. 3707 * We need to free up any pl. 3708 */ 3709 REPL_STAT_INCR(nnopage); 3710 page_free_replacement_page(pl); 3711 return (NULL); 3712 } else { 3713 return (pl); 3714 } 3715 } 3716 3717 /* 3718 * demote a free large page to it's constituent pages 3719 */ 3720 void 3721 page_demote_free_pages(page_t *pp) 3722 { 3723 3724 int mnode; 3725 3726 ASSERT(pp != NULL); 3727 ASSERT(PAGE_LOCKED(pp)); 3728 ASSERT(PP_ISFREE(pp)); 3729 ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes); 3730 3731 mnode = PP_2_MEM_NODE(pp); 3732 page_freelist_lock(mnode); 3733 if (pp->p_szc != 0) { 3734 (void) page_demote(mnode, PFN_BASE(pp->p_pagenum, 3735 pp->p_szc), pp->p_szc, 0, PC_NO_COLOR, PC_FREE); 3736 } 3737 page_freelist_unlock(mnode); 3738 ASSERT(pp->p_szc == 0); 3739 } 3740