/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License"). You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * Portions of this source code were derived from Berkeley 4.3 BSD
 * under license from the Regents of the University of California.
 */

#pragma ident	"%Z%%M% %I% %E% SMI"

/*
 * This file contains common functions to access and manage the page lists.
 * Many of these routines originated from platform dependent modules
 * (sun4/vm/vm_dep.c, i86pc/vm/vm_machdep.c) and were modified to function in
 * a platform independent manner.
 *
 * vm/vm_dep.h provides for platform specific support.
 */

#include <sys/types.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/systm.h>
#include <sys/atomic.h>
#include <sys/sysmacros.h>
#include <vm/as.h>
#include <vm/page.h>
#include <vm/seg_kmem.h>
#include <vm/seg_vn.h>
#include <sys/memnode.h>
#include <vm/vm_dep.h>
#include <sys/lgrp.h>
#include <sys/mem_config.h>
#include <sys/callb.h>
#include <sys/mem_cage.h>
#include <sys/sdt.h>

extern uint_t	vac_colors;

/*
 * Number of page colors considered equivalent to the requested color in the
 * page_get routines. If set, keeps large pages intact longer and keeps MPO
 * allocation in the local mnode rather than acquiring the 'correct' page
 * color from a demoted large page or from a remote mnode.
 */
int	colorequiv;

/*
 * If set, specifies the percentage of large pages that are free from within
 * a large page region before attempting to lock those pages for
 * page_get_contig_pages processing.
 *
 * Should be turned on when kpr is available, so that
 * page_trylock_contig_pages can be more selective.
 */

int	ptcpthreshold;

/*
 * Limit page_get_contig_pages searches based on failure counts in
 * pgcpfailcnt[]. Slot 0 (base page size, otherwise unused) enables or
 * disables limiting the search. Enabled by default.
 */
int	pgcpfailcnt[MMU_PAGE_SIZES];
int	pgcplimitsearch = 1;

#ifdef VM_STATS
struct vmm_vmstats_str	vmm_vmstats;

#endif /* VM_STATS */

#if defined(__sparc)
#define	LPGCREATE	0
#else
/* enable page_get_contig_pages */
#define	LPGCREATE	1
#endif

int pg_contig_disable;
int pg_lpgcreate_nocage = LPGCREATE;

/*
 * page_freelist_fill pfn flag to signify no hi pfn requirement.
 */
#define	PFNNULL		0

/* Flags involved in promotion and demotion routines */
#define	PC_FREE		0x1	/* put page on freelist */
#define	PC_ALLOC	0x2	/* return page for allocation */

/*
 * Flag for page_demote to be used with PC_FREE to denote that we don't care
 * what the color is, as the color parameter to the function is ignored.
 */
#define	PC_NO_COLOR	(-1)

/*
 * Page counters candidates info.
 * See the page_ctrs_cands comment below for more details.
 * Fields are as follows:
 *	pcc_pages_free:		# of pages which freelist coalesce can create
 *	pcc_color_free_len:	number of elements in pcc_color_free array
 *	pcc_color_free:		pointer to page free counts per color
 */
typedef struct pcc_info {
	pgcnt_t	pcc_pages_free;
	int	pcc_color_free_len;
	pgcnt_t	*pcc_color_free;
} pcc_info_t;

/*
 * On big machines it can take a long time to check page_counters
 * arrays. page_ctrs_cands is a summary array whose elements are a dynamically
 * updated sum of all elements of the corresponding page_counters arrays.
 * page_freelist_coalesce() searches page_counters only if an appropriate
 * element of the page_ctrs_cands array is greater than 0.
 *
 * An extra dimension is used for page_ctrs_cands to spread the elements
 * over a few e$ cache lines to avoid serialization during the array
 * updates.
 */
#pragma	align 64(page_ctrs_cands)

static pcc_info_t *page_ctrs_cands[NPC_MUTEX][MMU_PAGE_SIZES];

/*
 * Return in val the total number of free pages which can be created
 * for the given mnode (m) and region size (r).
 */
#define	PGCTRS_CANDS_GETVALUE(m, r, val) {				\
	int i;								\
	val = 0;							\
	for (i = 0; i < NPC_MUTEX; i++) {				\
		val += page_ctrs_cands[i][(r)][(m)].pcc_pages_free;	\
	}								\
}

/*
 * Return in val the total number of free pages which can be created
 * for the given mnode (m), region size (r), and color (c).
 */
#define	PGCTRS_CANDS_GETVALUECOLOR(m, r, c, val) {			\
	int i;								\
	val = 0;							\
	ASSERT((c) < page_ctrs_cands[0][(r)][(m)].pcc_color_free_len);	\
	for (i = 0; i < NPC_MUTEX; i++) {				\
		val += page_ctrs_cands[i][(r)][(m)].pcc_color_free[(c)]; \
	}								\
}

/*
 * We can only allow a single thread to update a counter within the physical
 * range of the largest supported page size. That is the finest granularity
 * possible since the counter values are dependent on each other
 * as you move across region sizes. PP_CTR_LOCK_INDX is used to determine the
 * ctr_mutex lock index for a particular physical range.
 */
static kmutex_t	*ctr_mutex[NPC_MUTEX];

#define	PP_CTR_LOCK_INDX(pp)						\
	(((pp)->p_pagenum >>						\
	    (PAGE_BSZS_SHIFT(mmu_page_sizes - 1))) & (NPC_MUTEX - 1))
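
/*
 * Worked example of PP_CTR_LOCK_INDX (illustrative only; the specific
 * values below are assumptions, not taken from any particular platform):
 * with PAGE_BSZS_SHIFT(mmu_page_sizes - 1) == 6 and NPC_MUTEX == 4, a page
 * with p_pagenum 0x12345 yields (0x12345 >> 6) & 3 == 0x48d & 3 == 1, so
 * every page inside that largest-size region selects the same lock,
 * ctr_mutex[1][mnode], which serializes counter updates for the region.
 */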

/*
 * Local function prototypes.
 */

void page_ctr_add(page_t *, int);
void page_ctr_add_internal(int, page_t *, int);
void page_ctr_sub(page_t *, int);
uint_t page_convert_color(uchar_t, uchar_t, uint_t);
void page_freelist_lock(int);
void page_freelist_unlock(int);
page_t *page_promote(int, pfn_t, uchar_t, int);
page_t *page_demote(int, pfn_t, uchar_t, uchar_t, int, int);
page_t *page_freelist_fill(uchar_t, int, int, int, pfn_t);
page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int);
static int page_trylock_cons(page_t *pp, se_t se);

#define	PNUM_SIZE(szc)							\
	(hw_page_array[(szc)].hp_size >> hw_page_array[0].hp_shift)
#define	PNUM_SHIFT(szc)							\
	(hw_page_array[(szc)].hp_shift - hw_page_array[0].hp_shift)

/*
 * The page_counters array below is used to keep track of free contiguous
 * physical memory. A hw_page_map_t will be allocated per mnode per szc.
 * This contains an array of counters, the size of the array, a shift value
 * used to convert a pagenum into a counter array index or vice versa, as
 * well as a cache of the last successful index to be promoted to a larger
 * page size. As an optimization, we keep track of the last successful index
 * to be promoted per page color for the given size region, and this is
 * allocated dynamically based upon the number of colors for a given
 * region size.
 *
 * Conceptually, the page counters are represented as:
 *
 *	page_counters[region_size][mnode]
 *
 *	region_size:	size code of a candidate larger page made up
 *			of contiguous free smaller pages.
 *
 *	page_counters[region_size][mnode].hpm_counters[index]:
 *		represents how many (region_size - 1) pages either
 *		exist or can be created within the given index range.
 *
 * Let's look at a sparc example:
 *	If we want to create a free 512k page, we look at region_size 2
 *	for the mnode we want. We calculate the index and look at a specific
 *	hpm_counters location. If we see 8 (FULL_REGION_CNT on sparc) at
 *	this location, it means that 8 64k pages either exist or can be
 *	created from 8K pages in order to make a single free 512k page at
 *	the given index. Note that when a region is full, it will contribute
 *	to the counts in the region above it. Thus we will not know what page
 *	size the free pages will be which can be promoted to this new free
 *	page unless we look at all regions below the current region.
 */

/*
 * Note: hpmctr_t is defined in platform vm_dep.h
 * hw_page_map_t contains all the information needed for the page_counters
 * logic. The fields are as follows:
 *
 *	hpm_counters:	dynamically allocated array to hold counter data
 *	hpm_entries:	entries in hpm_counters
 *	hpm_shift:	shift for pnum/array index conversion
 *	hpm_base:	PFN mapped to counter index 0
 *	hpm_color_current_len:	# of elements in hpm_color_current "array"
 *				below
 *	hpm_color_current:	last index in counter array for this color at
 *				which we successfully created a large page
 */
typedef struct hw_page_map {
	hpmctr_t	*hpm_counters;
	size_t		hpm_entries;
	int		hpm_shift;
	pfn_t		hpm_base;
	size_t		hpm_color_current_len;
	size_t		*hpm_color_current;
} hw_page_map_t;

/*
 * Element zero is not used, but is allocated for convenience.
 */
static hw_page_map_t *page_counters[MMU_PAGE_SIZES];

/*
 * The following macros are convenient ways to get access to the individual
 * elements of the page_counters arrays. They can be used on both
 * the left side and right side of equations.
 */
#define	PAGE_COUNTERS(mnode, rg_szc, idx)			\
	(page_counters[(rg_szc)][(mnode)].hpm_counters[(idx)])

#define	PAGE_COUNTERS_COUNTERS(mnode, rg_szc)			\
	(page_counters[(rg_szc)][(mnode)].hpm_counters)

#define	PAGE_COUNTERS_SHIFT(mnode, rg_szc)			\
	(page_counters[(rg_szc)][(mnode)].hpm_shift)

#define	PAGE_COUNTERS_ENTRIES(mnode, rg_szc)			\
	(page_counters[(rg_szc)][(mnode)].hpm_entries)

#define	PAGE_COUNTERS_BASE(mnode, rg_szc)			\
	(page_counters[(rg_szc)][(mnode)].hpm_base)

#define	PAGE_COUNTERS_CURRENT_COLOR_LEN(mnode, rg_szc)		\
	(page_counters[(rg_szc)][(mnode)].hpm_color_current_len)

#define	PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, rg_szc)	\
	(page_counters[(rg_szc)][(mnode)].hpm_color_current)

#define	PAGE_COUNTERS_CURRENT_COLOR(mnode, rg_szc, color)	\
	(page_counters[(rg_szc)][(mnode)].hpm_color_current[(color)])

#define	PNUM_TO_IDX(mnode, rg_szc, pnum)			\
	(((pnum) - PAGE_COUNTERS_BASE((mnode), (rg_szc))) >>	\
	    PAGE_COUNTERS_SHIFT((mnode), (rg_szc)))

#define	IDX_TO_PNUM(mnode, rg_szc, index)			\
	(PAGE_COUNTERS_BASE((mnode), (rg_szc)) +		\
	    ((index) << PAGE_COUNTERS_SHIFT((mnode), (rg_szc))))
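
/*
 * Worked example of the pnum/index conversion (illustrative only; the
 * base and shift values are assumptions): if hpm_base is 0x80000 and
 * hpm_shift is 3 (regions of 8 base pages), then pfn 0x80028 maps to
 * counter index (0x80028 - 0x80000) >> 3 == 5, and IDX_TO_PNUM maps
 * index 5 back to 0x80000 + (5 << 3) == 0x80028. The round trip is exact
 * only for region-aligned pfns, which is why page_ctrs_alloc() below
 * asserts the identity using r_base rather than an arbitrary pfn.
 */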

/*
 * Protects the hpm_counters and hpm_color_current memory from changing while
 * looking at page counters information.
 * Grab the write lock to modify what these fields point at.
 * Grab the read lock to prevent any pointers from changing.
 * The write lock can not be held during memory allocation due to a possible
 * recursion deadlock with trying to grab the read lock while the
 * write lock is already held.
 */
krwlock_t page_ctrs_rwlock[MAX_MEM_NODES];

/*
 * page size to page size code
 */
int
page_szc(size_t pagesize)
{
	int	i = 0;

	while (hw_page_array[i].hp_size) {
		if (pagesize == hw_page_array[i].hp_size)
			return (i);
		i++;
	}
	return (-1);
}

/*
 * page size to page size code for user supported page sizes
 */
int
page_user_szc(size_t pagesize)
{
	int szc = page_szc(pagesize);
	if (szc != -1)
		return (SZC_2_USERSZC(szc));
	return (-1);
}

/*
 * Return how many page sizes are available for the user to use. This is
 * what the hardware supports and not based upon how the OS implements the
 * support of different page sizes.
 */
uint_t
page_num_user_pagesizes(void)
{
	return (mmu_exported_page_sizes);
}

uint_t
page_num_pagesizes(void)
{
	return (mmu_page_sizes);
}

/*
 * Returns the count of the number of base pagesize pages associated with szc.
 */
pgcnt_t
page_get_pagecnt(uint_t szc)
{
	if (szc >= mmu_page_sizes)
		panic("page_get_pagecnt: out of range %d", szc);
	return (hw_page_array[szc].hp_pgcnt);
}

size_t
page_get_pagesize(uint_t szc)
{
	if (szc >= mmu_page_sizes)
		panic("page_get_pagesize: out of range %d", szc);
	return (hw_page_array[szc].hp_size);
}

/*
 * Return the size of a page based upon the index passed in. An index of
 * zero refers to the smallest page size in the system, and as the index
 * increases it refers to the next larger supported page size in the system.
 * Note that szc and userszc may not be the same due to unsupported szc's on
 * some systems.
 */
size_t
page_get_user_pagesize(uint_t userszc)
{
	uint_t szc = USERSZC_2_SZC(userszc);

	if (szc >= mmu_page_sizes)
		panic("page_get_user_pagesize: out of range %d", szc);
	return (hw_page_array[szc].hp_size);
}

uint_t
page_get_shift(uint_t szc)
{
	if (szc >= mmu_page_sizes)
		panic("page_get_shift: out of range %d", szc);
	return (hw_page_array[szc].hp_shift);
}

uint_t
page_get_pagecolors(uint_t szc)
{
	ASSERT(page_colors != 0);
	return (MAX(page_colors >> PAGE_BSZS_SHIFT(szc), 1));
}

/*
 * Called by startup().
 * Size up the per page size free list counters based on physmax
 * of each node and max_mem_nodes.
 */
size_t
page_ctrs_sz(void)
{
	int	r;		/* region size */
	int	mnode;
	uint_t	ctrs_sz = 0;
	int	i;
	pgcnt_t colors_per_szc[MMU_PAGE_SIZES];

	/*
	 * We need to determine how many page colors there are for each
	 * page size in order to allocate memory for any color specific
	 * arrays.
	 */
	colors_per_szc[0] = page_colors;
	for (i = 1; i < mmu_page_sizes; i++) {
		colors_per_szc[i] =
		    page_convert_color(0, i, page_colors - 1) + 1;
	}

	for (mnode = 0; mnode < max_mem_nodes; mnode++) {

		pgcnt_t r_pgcnt;
		pfn_t	r_base;
		pgcnt_t r_align;

		if (mem_node_config[mnode].exists == 0)
			continue;

		/*
		 * determine size needed for page counter arrays with
		 * base aligned to large page size.
		 */
		for (r = 1; r < mmu_page_sizes; r++) {
			/* add in space for hpm_counters */
			r_align = page_get_pagecnt(r);
			r_base = mem_node_config[mnode].physbase;
			r_base &= ~(r_align - 1);
			r_pgcnt = howmany(mem_node_config[mnode].physmax -
			    r_base, r_align);
			/*
			 * Round up to always allocate on pointer sized
			 * boundaries.
			 */
			ctrs_sz += P2ROUNDUP((r_pgcnt * sizeof (hpmctr_t)),
			    sizeof (hpmctr_t *));

			/* add in space for hpm_color_current */
			ctrs_sz += (colors_per_szc[r] *
			    sizeof (size_t));
		}
	}

	for (r = 1; r < mmu_page_sizes; r++) {
		ctrs_sz += (max_mem_nodes * sizeof (hw_page_map_t));

		/* add in space for page_ctrs_cands */
		ctrs_sz += NPC_MUTEX * max_mem_nodes * (sizeof (pcc_info_t));
		ctrs_sz += NPC_MUTEX * max_mem_nodes * colors_per_szc[r] *
		    sizeof (pgcnt_t);
	}

	/* ctr_mutex */
	ctrs_sz += (max_mem_nodes * NPC_MUTEX * sizeof (kmutex_t));

	/* size for page list counts */
	PLCNT_SZ(ctrs_sz);

	/*
	 * add some slop for roundups. page_ctrs_alloc will roundup the start
	 * address of the counters to ecache_alignsize boundary for every
	 * memory node.
	 */
	return (ctrs_sz + max_mem_nodes * L2CACHE_ALIGN);
}
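
/*
 * Sizing sketch (illustrative only; every number here is an assumption):
 * for a single mnode spanning pfns 0 through 0x1ffff with two non-base
 * page sizes of 8 and 64 base pages per region, the hpm_counters space
 * alone comes to roughly (0x20000 / 8 + 0x20000 / 64) counters, each of
 * sizeof (hpmctr_t), plus the per-color, cands, and mutex space accounted
 * above. The actual total depends on page_colors and NPC_MUTEX for the
 * platform.
 */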

caddr_t
page_ctrs_alloc(caddr_t alloc_base)
{
	int	mnode;
	int	r;		/* region size */
	int	i;
	pgcnt_t colors_per_szc[MMU_PAGE_SIZES];

	/*
	 * We need to determine how many page colors there are for each
	 * page size in order to allocate memory for any color specific
	 * arrays.
	 */
	colors_per_szc[0] = page_colors;
	for (i = 1; i < mmu_page_sizes; i++) {
		colors_per_szc[i] =
		    page_convert_color(0, i, page_colors - 1) + 1;
	}

	for (r = 1; r < mmu_page_sizes; r++) {
		page_counters[r] = (hw_page_map_t *)alloc_base;
		alloc_base += (max_mem_nodes * sizeof (hw_page_map_t));
	}

	/* page_ctrs_cands */
	for (r = 1; r < mmu_page_sizes; r++) {
		for (i = 0; i < NPC_MUTEX; i++) {
			page_ctrs_cands[i][r] = (pcc_info_t *)alloc_base;
			alloc_base += max_mem_nodes * (sizeof (pcc_info_t));
		}
	}

	/* page_ctrs_cands pcc_color_free array */
	for (r = 1; r < mmu_page_sizes; r++) {
		for (i = 0; i < NPC_MUTEX; i++) {
			for (mnode = 0; mnode < max_mem_nodes; mnode++) {
				page_ctrs_cands[i][r][mnode].pcc_color_free_len
				    = colors_per_szc[r];
				page_ctrs_cands[i][r][mnode].pcc_color_free =
				    (pgcnt_t *)alloc_base;
				alloc_base += colors_per_szc[r] *
				    sizeof (pgcnt_t);
			}
		}
	}

	/* ctr_mutex */
	for (i = 0; i < NPC_MUTEX; i++) {
		ctr_mutex[i] = (kmutex_t *)alloc_base;
		alloc_base += (max_mem_nodes * sizeof (kmutex_t));
	}

	/* initialize page list counts */
	PLCNT_INIT(alloc_base);

	for (mnode = 0; mnode < max_mem_nodes; mnode++) {

		pgcnt_t r_pgcnt;
		pfn_t	r_base;
		pgcnt_t r_align;
		int	r_shift;

		if (mem_node_config[mnode].exists == 0)
			continue;

		for (r = 1; r < mmu_page_sizes; r++) {
			/*
			 * the page_counters base has to be aligned to the
			 * page count of page size code r otherwise the counts
			 * will cross large page boundaries.
			 */
			r_align = page_get_pagecnt(r);
			r_base = mem_node_config[mnode].physbase;
			/* base needs to be aligned - lower to aligned value */
			r_base &= ~(r_align - 1);
			r_pgcnt = howmany(mem_node_config[mnode].physmax -
			    r_base, r_align);
			r_shift = PAGE_BSZS_SHIFT(r);

			PAGE_COUNTERS_SHIFT(mnode, r) = r_shift;
			PAGE_COUNTERS_ENTRIES(mnode, r) = r_pgcnt;
			PAGE_COUNTERS_BASE(mnode, r) = r_base;
			PAGE_COUNTERS_CURRENT_COLOR_LEN(mnode, r) =
			    colors_per_szc[r];
			PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r) =
			    (size_t *)alloc_base;
			alloc_base += (sizeof (size_t) * colors_per_szc[r]);
			for (i = 0; i < colors_per_szc[r]; i++) {
				PAGE_COUNTERS_CURRENT_COLOR(mnode, r, i) = i;
			}
			PAGE_COUNTERS_COUNTERS(mnode, r) =
			    (hpmctr_t *)alloc_base;
			/*
			 * Round up to make alloc_base always be aligned on
			 * a pointer boundary.
			 */
			alloc_base += P2ROUNDUP((sizeof (hpmctr_t) * r_pgcnt),
			    sizeof (hpmctr_t *));

			/*
			 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
			 * satisfy the identity requirement.
			 * We should be able to go from one to the other
			 * and get consistent values.
			 */
			ASSERT(PNUM_TO_IDX(mnode, r,
			    (IDX_TO_PNUM(mnode, r, 0))) == 0);
			ASSERT(IDX_TO_PNUM(mnode, r,
			    (PNUM_TO_IDX(mnode, r, r_base))) == r_base);
		}
		/*
		 * Round up the start address of the page_counters to
		 * a cache aligned boundary for every memory node.
		 * page_ctrs_sz() has added some slop for these roundups.
		 */
		alloc_base = (caddr_t)P2ROUNDUP((uintptr_t)alloc_base,
		    L2CACHE_ALIGN);
	}

	/* Initialize other page counter specific data structures. */
	for (mnode = 0; mnode < MAX_MEM_NODES; mnode++) {
		rw_init(&page_ctrs_rwlock[mnode], NULL, RW_DEFAULT, NULL);
	}

	return (alloc_base);
}

/*
 * Functions to adjust region counters for each size free list.
 * The caller is responsible for acquiring the ctr_mutex lock if necessary;
 * these functions can thus be called during startup without locks.
 */
/* ARGSUSED */
void
page_ctr_add_internal(int mnode, page_t *pp, int flags)
{
	ssize_t		r;	/* region size */
	ssize_t		idx;
	pfn_t		pfnum;
	int		lckidx;

	ASSERT(pp->p_szc < mmu_page_sizes);

	PLCNT_INCR(pp, mnode, pp->p_szc, flags);

	/* no counter update needed for largest page size */
	if (pp->p_szc >= mmu_page_sizes - 1) {
		return;
	}

	r = pp->p_szc + 1;
	pfnum = pp->p_pagenum;
	lckidx = PP_CTR_LOCK_INDX(pp);

	/*
	 * Increment the count of free pages for the current
	 * region. Continue looping up in region size incrementing
	 * count if the preceding region is full.
	 */
	while (r < mmu_page_sizes) {
		idx = PNUM_TO_IDX(mnode, r, pfnum);

		ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
		ASSERT(PAGE_COUNTERS(mnode, r, idx) < FULL_REGION_CNT(r));

		if (++PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r))
			break;

		page_ctrs_cands[lckidx][r][mnode].pcc_pages_free++;
		page_ctrs_cands[lckidx][r][mnode].
		    pcc_color_free[PP_2_BIN_SZC(pp, r)]++;
		r++;
	}
}

void
page_ctr_add(page_t *pp, int flags)
{
	int		lckidx = PP_CTR_LOCK_INDX(pp);
	int		mnode = PP_2_MEM_NODE(pp);
	kmutex_t	*lock = &ctr_mutex[lckidx][mnode];

	mutex_enter(lock);
	page_ctr_add_internal(mnode, pp, flags);
	mutex_exit(lock);
}

void
page_ctr_sub(page_t *pp, int flags)
{
	int		lckidx;
	int		mnode = PP_2_MEM_NODE(pp);
	kmutex_t	*lock;
	ssize_t		r;	/* region size */
	ssize_t		idx;
	pfn_t		pfnum;

	ASSERT(pp->p_szc < mmu_page_sizes);

	PLCNT_DECR(pp, mnode, pp->p_szc, flags);

	/* no counter update needed for largest page size */
	if (pp->p_szc >= mmu_page_sizes - 1) {
		return;
	}

	r = pp->p_szc + 1;
	pfnum = pp->p_pagenum;
	lckidx = PP_CTR_LOCK_INDX(pp);
	lock = &ctr_mutex[lckidx][mnode];

	/*
	 * Decrement the count of free pages for the current
	 * region. Continue looping up in region size decrementing
	 * count if the preceding region was full.
	 */
	mutex_enter(lock);
	while (r < mmu_page_sizes) {
		idx = PNUM_TO_IDX(mnode, r, pfnum);

		ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
		ASSERT(PAGE_COUNTERS(mnode, r, idx) > 0);

		if (--PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r) - 1) {
			break;
		}
		ASSERT(page_ctrs_cands[lckidx][r][mnode].pcc_pages_free != 0);
		ASSERT(page_ctrs_cands[lckidx][r][mnode].
		    pcc_color_free[PP_2_BIN_SZC(pp, r)] != 0);

		page_ctrs_cands[lckidx][r][mnode].pcc_pages_free--;
		page_ctrs_cands[lckidx][r][mnode].
		    pcc_color_free[PP_2_BIN_SZC(pp, r)]--;
		r++;
	}
	mutex_exit(lock);
}
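
/*
 * Worked example of the counter cascade (illustrative; assumes
 * FULL_REGION_CNT(1) == 8): freeing the 8th base page of a size-1 region
 * drives PAGE_COUNTERS(mnode, 1, idx) from 7 to 8, so the region becomes a
 * promotion candidate and pcc_pages_free is bumped for size 1; the loop
 * then moves up to size 2 and increments that region's counter as well,
 * which is how a full region "contributes to the counts in the region
 * above it" as described near the page_counters declaration.
 */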

/*
 * Adjust page counters following a memory attach, since typically the
 * size of the array needs to change, and the PFN to counter index
 * mapping needs to change.
 */
uint_t
page_ctrs_adjust(int mnode)
{
	pgcnt_t npgs;
	int	r;		/* region size */
	int	i;
	size_t	pcsz, old_csz;
	hpmctr_t *new_ctr, *old_ctr;
	pfn_t	oldbase, newbase;
	size_t	old_npgs;
	hpmctr_t *ctr_cache[MMU_PAGE_SIZES];
	size_t	size_cache[MMU_PAGE_SIZES];
	size_t	*color_cache[MMU_PAGE_SIZES];
	size_t	*old_color_array;
	pgcnt_t	colors_per_szc[MMU_PAGE_SIZES];

	newbase = mem_node_config[mnode].physbase & ~PC_BASE_ALIGN_MASK;
	npgs = roundup(mem_node_config[mnode].physmax,
	    PC_BASE_ALIGN) - newbase;

	/*
	 * We need to determine how many page colors there are for each
	 * page size in order to allocate memory for any color specific
	 * arrays.
	 */
	colors_per_szc[0] = page_colors;
	for (r = 1; r < mmu_page_sizes; r++) {
		colors_per_szc[r] =
		    page_convert_color(0, r, page_colors - 1) + 1;
	}

	/*
	 * Preallocate all of the new hpm_counters arrays as we can't
	 * hold the page_ctrs_rwlock as a writer and allocate memory.
	 * If we can't allocate all of the arrays, undo our work so far
	 * and return failure.
	 */
	for (r = 1; r < mmu_page_sizes; r++) {
		pcsz = npgs >> PAGE_BSZS_SHIFT(r);

		ctr_cache[r] = kmem_zalloc(pcsz *
		    sizeof (hpmctr_t), KM_NOSLEEP);
		if (ctr_cache[r] == NULL) {
			while (--r >= 1) {
				kmem_free(ctr_cache[r],
				    size_cache[r] * sizeof (hpmctr_t));
			}
			return (ENOMEM);
		}
		size_cache[r] = pcsz;
	}
	/*
	 * Preallocate all of the new color current arrays as we can't
	 * hold the page_ctrs_rwlock as a writer and allocate memory.
	 * If we can't allocate all of the arrays, undo our work so far
	 * and return failure.
	 */
	for (r = 1; r < mmu_page_sizes; r++) {
		color_cache[r] = kmem_zalloc(sizeof (size_t) *
		    colors_per_szc[r], KM_NOSLEEP);
		if (color_cache[r] == NULL) {
			while (--r >= 1) {
				kmem_free(color_cache[r],
				    colors_per_szc[r] * sizeof (size_t));
			}
			for (r = 1; r < mmu_page_sizes; r++) {
				kmem_free(ctr_cache[r],
				    size_cache[r] * sizeof (hpmctr_t));
			}
			return (ENOMEM);
		}
	}

	/*
	 * Grab the write lock to prevent others from walking these arrays
	 * while we are modifying them.
	 */
	rw_enter(&page_ctrs_rwlock[mnode], RW_WRITER);
	page_freelist_lock(mnode);
	for (r = 1; r < mmu_page_sizes; r++) {
		PAGE_COUNTERS_SHIFT(mnode, r) = PAGE_BSZS_SHIFT(r);
		old_ctr = PAGE_COUNTERS_COUNTERS(mnode, r);
		old_csz = PAGE_COUNTERS_ENTRIES(mnode, r);
		oldbase = PAGE_COUNTERS_BASE(mnode, r);
		old_npgs = old_csz << PAGE_COUNTERS_SHIFT(mnode, r);
		old_color_array = PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r);

		pcsz = npgs >> PAGE_COUNTERS_SHIFT(mnode, r);
		new_ctr = ctr_cache[r];
		ctr_cache[r] = NULL;
		if (old_ctr != NULL &&
		    (oldbase + old_npgs > newbase) &&
		    (newbase + npgs > oldbase)) {
			/*
			 * Map the intersection of the old and new
			 * counters into the new array.
			 */
			size_t offset;
			if (newbase > oldbase) {
				offset = (newbase - oldbase) >>
				    PAGE_COUNTERS_SHIFT(mnode, r);
				bcopy(old_ctr + offset, new_ctr,
				    MIN(pcsz, (old_csz - offset)) *
				    sizeof (hpmctr_t));
			} else {
				offset = (oldbase - newbase) >>
				    PAGE_COUNTERS_SHIFT(mnode, r);
				bcopy(old_ctr, new_ctr + offset,
				    MIN(pcsz - offset, old_csz) *
				    sizeof (hpmctr_t));
			}
		}

		PAGE_COUNTERS_COUNTERS(mnode, r) = new_ctr;
		PAGE_COUNTERS_ENTRIES(mnode, r) = pcsz;
		PAGE_COUNTERS_BASE(mnode, r) = newbase;
		PAGE_COUNTERS_CURRENT_COLOR_LEN(mnode, r) = colors_per_szc[r];
		PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r) = color_cache[r];
		color_cache[r] = NULL;
		/*
		 * for now, just reset on these events as it's probably
		 * not worthwhile to try and optimize this.
		 */
		for (i = 0; i < colors_per_szc[r]; i++) {
			PAGE_COUNTERS_CURRENT_COLOR(mnode, r, i) = i;
		}

		/* cache info for freeing out of the critical path */
		if ((caddr_t)old_ctr >= kernelheap &&
		    (caddr_t)old_ctr < ekernelheap) {
			ctr_cache[r] = old_ctr;
			size_cache[r] = old_csz;
		}
		if ((caddr_t)old_color_array >= kernelheap &&
		    (caddr_t)old_color_array < ekernelheap) {
			color_cache[r] = old_color_array;
		}
		/*
		 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
		 * satisfy the identity requirement.
		 * We should be able to go from one to the other
		 * and get consistent values.
		 */
		ASSERT(PNUM_TO_IDX(mnode, r,
		    (IDX_TO_PNUM(mnode, r, 0))) == 0);
		ASSERT(IDX_TO_PNUM(mnode, r,
		    (PNUM_TO_IDX(mnode, r, newbase))) == newbase);
	}
	page_freelist_unlock(mnode);
	rw_exit(&page_ctrs_rwlock[mnode]);

	/*
	 * Now that we have dropped the write lock, it is safe to free all
	 * of the memory we have cached above.
	 */
	for (r = 1; r < mmu_page_sizes; r++) {
		if (ctr_cache[r] != NULL) {
			kmem_free(ctr_cache[r],
			    size_cache[r] * sizeof (hpmctr_t));
		}
		if (color_cache[r] != NULL) {
			kmem_free(color_cache[r],
			    colors_per_szc[r] * sizeof (size_t));
		}
	}
	return (0);
}

/*
 * color contains a valid color index or bin for cur_szc.
 */
uint_t
page_convert_color(uchar_t cur_szc, uchar_t new_szc, uint_t color)
{
	uint_t shift;

	if (cur_szc > new_szc) {
		shift = page_get_shift(cur_szc) - page_get_shift(new_szc);
		return (color << shift);
	} else if (cur_szc < new_szc) {
		shift = page_get_shift(new_szc) - page_get_shift(cur_szc);
		return (color >> shift);
	}
	return (color);
}
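
/*
 * Conversion example (illustrative; the shift values are assumptions):
 * if page_get_shift(0) == 13 and page_get_shift(1) == 16, then converting
 * color 5 of szc 1 down to szc 0 gives 5 << 3 == 40, while converting
 * color 40 of szc 0 up to szc 1 gives 40 >> 3 == 5; larger page sizes
 * simply have fewer distinct colors, as page_get_pagecolors() reflects.
 */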

#ifdef DEBUG

/*
 * confirm pp is a large page corresponding to szc
 */
void
chk_lpg(page_t *pp, uchar_t szc)
{
	spgcnt_t npgs = page_get_pagecnt(pp->p_szc);
	uint_t noreloc;

	if (npgs == 1) {
		ASSERT(pp->p_szc == 0);
		ASSERT(pp->p_next == pp);
		ASSERT(pp->p_prev == pp);
		return;
	}

	ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL);
	ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL);

	ASSERT(IS_P2ALIGNED(pp->p_pagenum, npgs));
	ASSERT(pp->p_pagenum == (pp->p_next->p_pagenum - 1));
	ASSERT(pp->p_prev->p_pagenum == (pp->p_pagenum + (npgs - 1)));
	ASSERT(pp->p_prev == (pp + (npgs - 1)));

	/*
	 * Check list of pages.
	 */
	noreloc = PP_ISNORELOC(pp);
	while (npgs--) {
		if (npgs != 0) {
			ASSERT(pp->p_pagenum == pp->p_next->p_pagenum - 1);
			ASSERT(pp->p_next == (pp + 1));
		}
		ASSERT(pp->p_szc == szc);
		ASSERT(PP_ISFREE(pp));
		ASSERT(PP_ISAGED(pp));
		ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL);
		ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL);
		ASSERT(pp->p_vnode == NULL);
		ASSERT(PP_ISNORELOC(pp) == noreloc);

		pp = pp->p_next;
	}
}
#endif /* DEBUG */

void
page_freelist_lock(int mnode)
{
	int i;
	for (i = 0; i < NPC_MUTEX; i++) {
		mutex_enter(FPC_MUTEX(mnode, i));
		mutex_enter(CPC_MUTEX(mnode, i));
	}
}

void
page_freelist_unlock(int mnode)
{
	int i;
	for (i = 0; i < NPC_MUTEX; i++) {
		mutex_exit(FPC_MUTEX(mnode, i));
		mutex_exit(CPC_MUTEX(mnode, i));
	}
}

/*
 * add pp to the specified page list. Defaults to head of the page list
 * unless PG_LIST_TAIL is specified.
 */
void
page_list_add(page_t *pp, int flags)
{
	page_t		**ppp;
	kmutex_t	*pcm;
	uint_t		bin, mtype;
	int		mnode;

	ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
	ASSERT(PP_ISFREE(pp));
	ASSERT(!hat_page_is_mapped(pp));
	ASSERT(hat_page_getshare(pp) == 0);

	/*
	 * Large pages should be freed via page_list_add_pages().
	 */
	ASSERT(pp->p_szc == 0);

	/*
	 * Don't need to lock the freelist first here
	 * because the page isn't on the freelist yet.
	 * This means p_szc can't change on us.
	 */

	bin = PP_2_BIN(pp);
	mnode = PP_2_MEM_NODE(pp);
	mtype = PP_2_MTYPE(pp);

	if (flags & PG_LIST_ISINIT) {
		/*
		 * PG_LIST_ISINIT is set during system startup (i.e. single
		 * threaded), add a page to the free list and add to the
		 * free region counters w/o any locking
		 */
		ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);

		/* inline version of page_add() */
		if (*ppp != NULL) {
			pp->p_next = *ppp;
			pp->p_prev = (*ppp)->p_prev;
			(*ppp)->p_prev = pp;
			pp->p_prev->p_next = pp;
		} else
			*ppp = pp;

		page_ctr_add_internal(mnode, pp, flags);
	} else {
		pcm = PC_BIN_MUTEX(mnode, bin, flags);

		if (flags & PG_FREE_LIST) {
			ASSERT(PP_ISAGED(pp));
			ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);

		} else {
			ASSERT(pp->p_vnode);
			ASSERT((pp->p_offset & PAGEOFFSET) == 0);
			ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
		}
		mutex_enter(pcm);
		page_add(ppp, pp);

		if (flags & PG_LIST_TAIL)
			*ppp = (*ppp)->p_next;
		/*
		 * Add counters before releasing pcm mutex to avoid a race with
		 * page_freelist_coalesce and page_freelist_fill.
		 */
		page_ctr_add(pp, flags);
		mutex_exit(pcm);
	}


#if defined(__sparc)
	if (PP_ISNORELOC(pp)) {
		kcage_freemem_add(1);
	}
#endif
	/*
	 * It is up to the caller to unlock the page!
	 */
	ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
}
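
/*
 * Usage sketch (illustrative, not an actual call site in this file):
 * a caller that holds an exclusively locked, free PAGESIZE page and wants
 * it at the tail of its freelist bin would do
 *
 *	page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
 *
 * The PG_LIST_TAIL handling above relies on the list being circular:
 * inserting at the head and then advancing the head pointer one element
 * leaves the new page at the tail.
 */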

#ifdef __sparc
/*
 * This routine is only used by kcage_init during system startup.
 * It performs the function of page_list_sub/PP_SETNORELOC/page_list_add
 * without the overhead of taking locks and updating counters.
 */
void
page_list_noreloc_startup(page_t *pp)
{
	page_t		**ppp;
	uint_t		bin;
	int		mnode;
	int		mtype;
	int		flags = PG_LIST_ISCAGE;

	/*
	 * If this is a large page on the freelist then
	 * break it up into smaller pages.
	 */
	if (pp->p_szc != 0)
		page_boot_demote(pp);

	/*
	 * Get list page is currently on.
	 */
	bin = PP_2_BIN(pp);
	mnode = PP_2_MEM_NODE(pp);
	mtype = PP_2_MTYPE(pp);
	ASSERT(mtype == MTYPE_RELOC);
	ASSERT(pp->p_szc == 0);

	if (PP_ISAGED(pp)) {
		ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
		flags |= PG_FREE_LIST;
	} else {
		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
		flags |= PG_CACHE_LIST;
	}

	ASSERT(*ppp != NULL);

	/*
	 * Delete page from current list.
	 */
	if (*ppp == pp)
		*ppp = pp->p_next;		/* go to next page */
	if (*ppp == pp) {
		*ppp = NULL;			/* page list is gone */
	} else {
		pp->p_prev->p_next = pp->p_next;
		pp->p_next->p_prev = pp->p_prev;
	}

	/* LINTED */
	PLCNT_DECR(pp, mnode, 0, flags);

	/*
	 * Set no reloc for cage initted pages.
	 */
	PP_SETNORELOC(pp);

	mtype = PP_2_MTYPE(pp);
	ASSERT(mtype == MTYPE_NORELOC);

	/*
	 * Get new list for page.
	 */
	if (PP_ISAGED(pp)) {
		ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
	} else {
		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
	}

	/*
	 * Insert page on new list.
	 */
	if (*ppp == NULL) {
		*ppp = pp;
		pp->p_next = pp->p_prev = pp;
	} else {
		pp->p_next = *ppp;
		pp->p_prev = (*ppp)->p_prev;
		(*ppp)->p_prev = pp;
		pp->p_prev->p_next = pp;
	}

	/* LINTED */
	PLCNT_INCR(pp, mnode, 0, flags);

	/*
	 * Update cage freemem counter
	 */
	atomic_add_long(&kcage_freemem, 1);
}
#else /* __sparc */

/* ARGSUSED */
void
page_list_noreloc_startup(page_t *pp)
{
	panic("page_list_noreloc_startup: should be here only for sparc");
}
#endif

void
page_list_add_pages(page_t *pp, int flags)
{
	kmutex_t *pcm;
	pgcnt_t	pgcnt;
	uint_t	bin, mtype, i;
	int	mnode;

	/* default to freelist/head */
	ASSERT((flags & (PG_CACHE_LIST | PG_LIST_TAIL)) == 0);

	CHK_LPG(pp, pp->p_szc);
	VM_STAT_ADD(vmm_vmstats.pc_list_add_pages[pp->p_szc]);

	bin = PP_2_BIN(pp);
	mnode = PP_2_MEM_NODE(pp);
	mtype = PP_2_MTYPE(pp);

	if (flags & PG_LIST_ISINIT) {
		ASSERT(pp->p_szc == mmu_page_sizes - 1);
		page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
		ASSERT(!PP_ISNORELOC(pp));
		PLCNT_INCR(pp, mnode, pp->p_szc, flags);
	} else {

		ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);

		pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);

		mutex_enter(pcm);
		page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
		page_ctr_add(pp, PG_FREE_LIST);
		mutex_exit(pcm);

		pgcnt = page_get_pagecnt(pp->p_szc);
#if defined(__sparc)
		if (PP_ISNORELOC(pp))
			kcage_freemem_add(pgcnt);
#endif
		for (i = 0; i < pgcnt; i++, pp++)
			page_unlock(pp);
	}
}

/*
 * During boot, need to demote a large page to base
 * pagesize pages for seg_kmem for use in boot_alloc()
 */
void
page_boot_demote(page_t *pp)
{
	ASSERT(pp->p_szc != 0);
	ASSERT(PP_ISFREE(pp));
	ASSERT(PP_ISAGED(pp));

	(void) page_demote(PP_2_MEM_NODE(pp),
	    PFN_BASE(pp->p_pagenum, pp->p_szc), pp->p_szc, 0, PC_NO_COLOR,
	    PC_FREE);

	ASSERT(PP_ISFREE(pp));
	ASSERT(PP_ISAGED(pp));
	ASSERT(pp->p_szc == 0);
}

/*
 * Take a particular page off of whatever freelist the page
 * is claimed to be on.
 *
 * NOTE: Only used for PAGESIZE pages.
 */
void
page_list_sub(page_t *pp, int flags)
{
	int		bin;
	uint_t		mtype;
	int		mnode;
	kmutex_t	*pcm;
	page_t		**ppp;

	ASSERT(PAGE_EXCL(pp));
	ASSERT(PP_ISFREE(pp));

	/*
	 * The p_szc field can only be changed by page_promote()
	 * and page_demote(). Only free pages can be promoted and
	 * demoted and the free list MUST be locked during these
	 * operations. So to prevent a race in page_list_sub()
	 * between computing which bin of the freelist lock to
	 * grab and actually grabbing the lock we check again that
	 * the bin we locked is still the correct one. Notice that
	 * the p_szc field could have actually changed on us but
	 * if the bin happens to still be the same we are safe.
	 */
try_again:
	bin = PP_2_BIN(pp);
	mnode = PP_2_MEM_NODE(pp);
	pcm = PC_BIN_MUTEX(mnode, bin, flags);
	mutex_enter(pcm);
	if (PP_2_BIN(pp) != bin) {
		mutex_exit(pcm);
		goto try_again;
	}
	mtype = PP_2_MTYPE(pp);

	if (flags & PG_FREE_LIST) {
		ASSERT(PP_ISAGED(pp));
		ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);
	} else {
		ASSERT(!PP_ISAGED(pp));
		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
	}

	/*
	 * Common PAGESIZE case.
	 *
	 * Note that we locked the freelist. This prevents
	 * any page promotion/demotion operations. Therefore
	 * the p_szc will not change until we drop pcm mutex.
	 */
	if (pp->p_szc == 0) {
		page_sub(ppp, pp);
		/*
		 * Subtract counters before releasing pcm mutex
		 * to avoid race with page_freelist_coalesce.
		 */
		page_ctr_sub(pp, flags);
		mutex_exit(pcm);

#if defined(__sparc)
		if (PP_ISNORELOC(pp)) {
			kcage_freemem_sub(1);
		}
#endif
		return;
	}

	/*
	 * Large pages on the cache list are not supported.
	 */
	if (flags & PG_CACHE_LIST)
		panic("page_list_sub: large page on cachelist");

	/*
	 * Slow but rare.
	 *
	 * Somebody wants this particular page which is part
	 * of a large page. In this case we just demote the page
	 * if it's on the freelist.
	 *
	 * We have to drop pcm before locking the entire freelist.
	 * Once we have re-locked the freelist check to make sure
	 * the page hasn't already been demoted or completely
	 * freed.
	 */
	mutex_exit(pcm);
	page_freelist_lock(mnode);
	if (pp->p_szc != 0) {
		/*
		 * Large page is on freelist.
		 */
		(void) page_demote(mnode, PFN_BASE(pp->p_pagenum, pp->p_szc),
		    pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
	}
	ASSERT(PP_ISFREE(pp));
	ASSERT(PP_ISAGED(pp));
	ASSERT(pp->p_szc == 0);

	/*
	 * Subtract counters before releasing pcm mutex
	 * to avoid race with page_freelist_coalesce.
	 */
	bin = PP_2_BIN(pp);
	mtype = PP_2_MTYPE(pp);
	ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);

	page_sub(ppp, pp);
	page_ctr_sub(pp, flags);
	page_freelist_unlock(mnode);

#if defined(__sparc)
	if (PP_ISNORELOC(pp)) {
		kcage_freemem_sub(1);
	}
#endif
}

void
page_list_sub_pages(page_t *pp, uint_t szc)
{
	kmutex_t *pcm;
	uint_t	bin, mtype;
	int	mnode;

	ASSERT(PAGE_EXCL(pp));
	ASSERT(PP_ISFREE(pp));
	ASSERT(PP_ISAGED(pp));

	/*
	 * See comment in page_list_sub().
	 */
try_again:
	bin = PP_2_BIN(pp);
	mnode = PP_2_MEM_NODE(pp);
	pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
	mutex_enter(pcm);
	if (PP_2_BIN(pp) != bin) {
		mutex_exit(pcm);
		goto try_again;
	}

	VM_STAT_ADD(vmm_vmstats.pc_list_sub_pages1[pp->p_szc]);

	/*
	 * If we're called with a page larger than szc or it got
	 * promoted above szc before we locked the freelist then
	 * drop pcm and re-lock entire freelist. If page still larger
	 * than szc then demote it.
	 */
	if (pp->p_szc > szc) {
		VM_STAT_ADD(vmm_vmstats.pc_list_sub_pages2[pp->p_szc]);
		mutex_exit(pcm);
		pcm = NULL;
		page_freelist_lock(mnode);
		if (pp->p_szc > szc) {
			VM_STAT_ADD(vmm_vmstats.pc_list_sub_pages3[pp->p_szc]);
			(void) page_demote(mnode,
			    PFN_BASE(pp->p_pagenum, pp->p_szc),
			    pp->p_szc, szc, PC_NO_COLOR, PC_FREE);
		}
		bin = PP_2_BIN(pp);
	}
	ASSERT(PP_ISFREE(pp));
	ASSERT(PP_ISAGED(pp));
	ASSERT(pp->p_szc <= szc);
	ASSERT(pp == PP_PAGEROOT(pp));

	mtype = PP_2_MTYPE(pp);
	if (pp->p_szc != 0) {
		page_vpsub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
		CHK_LPG(pp, pp->p_szc);
	} else {
		page_sub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
	}
	page_ctr_sub(pp, PG_FREE_LIST);

	if (pcm != NULL) {
		mutex_exit(pcm);
	} else {
		page_freelist_unlock(mnode);
	}

#if defined(__sparc)
	if (PP_ISNORELOC(pp)) {
		pgcnt_t	pgcnt;

		pgcnt = page_get_pagecnt(pp->p_szc);
		kcage_freemem_sub(pgcnt);
	}
#endif
}

/*
 * Add the page to the front of a linked list of pages
 * using the p_next & p_prev pointers for the list.
 * The caller is responsible for protecting the list pointers.
 */
void
mach_page_add(page_t **ppp, page_t *pp)
{
	if (*ppp == NULL) {
		pp->p_next = pp->p_prev = pp;
	} else {
		pp->p_next = *ppp;
		pp->p_prev = (*ppp)->p_prev;
		(*ppp)->p_prev = pp;
		pp->p_prev->p_next = pp;
	}
	*ppp = pp;
}

/*
 * Remove this page from a linked list of pages
 * using the p_next & p_prev pointers for the list.
 *
 * The caller is responsible for protecting the list pointers.
 */
void
mach_page_sub(page_t **ppp, page_t *pp)
{
	ASSERT(PP_ISFREE(pp));

	if (*ppp == NULL || pp == NULL)
		panic("mach_page_sub");

	if (*ppp == pp)
		*ppp = pp->p_next;		/* go to next page */

	if (*ppp == pp)
		*ppp = NULL;			/* page list is gone */
	else {
		pp->p_prev->p_next = pp->p_next;
		pp->p_next->p_prev = pp->p_prev;
	}
	pp->p_prev = pp->p_next = pp;		/* make pp a list of one */
}

/*
 * Routine fsflush uses to gradually coalesce the free list into larger pages.
 */
void
page_promote_size(page_t *pp, uint_t cur_szc)
{
	pfn_t pfn;
	int mnode;
	int idx;
	int new_szc = cur_szc + 1;
	int full = FULL_REGION_CNT(new_szc);

	pfn = page_pptonum(pp);
	mnode = PFN_2_MEM_NODE(pfn);

	page_freelist_lock(mnode);

	idx = PNUM_TO_IDX(mnode, new_szc, pfn);
	if (PAGE_COUNTERS(mnode, new_szc, idx) == full)
		(void) page_promote(mnode, pfn, new_szc, PC_FREE);

	page_freelist_unlock(mnode);
}

static uint_t page_promote_err;
static uint_t page_promote_noreloc_err;

/*
 * Create a single larger page (of szc new_szc) from smaller contiguous pages
 * for the given mnode starting at pfnum. Pages involved are on the freelist
 * before the call and may be returned to the caller if requested, otherwise
 * they will be placed back on the freelist.
 * If flags is PC_ALLOC, then the large page will be returned to the user in
 * a state which is consistent with a page being taken off the freelist. If
 * we failed to lock the new large page, then we will return NULL to the
 * caller and put the large page on the freelist instead.
 * If flags is PC_FREE, then the large page will be placed on the freelist,
 * and NULL will be returned.
 * The caller is responsible for locking the freelist as well as any other
 * accounting which needs to be done for a returned page.
 *
 * RFE: For performance pass in pp instead of pfnum so
 *	we can avoid excessive calls to page_numtopp_nolock().
 *	This would depend on an assumption that all contiguous
 *	pages are in the same memseg so we can just add/dec
 *	our pp.
 *
 * Lock ordering:
 *
 *	There is a potential but rare deadlock situation
 *	for page promotion and demotion operations. The problem
 *	is there are two paths into the freelist manager and
 *	they have different lock orders:
 *
 *	page_create()
 *		lock freelist
 *		page_lock(EXCL)
 *		unlock freelist
 *		return
 *		caller drops page_lock
 *
 *	page_free() and page_reclaim()
 *		caller grabs page_lock(EXCL)
 *
 *		lock freelist
 *		unlock freelist
 *		drop page_lock
 *
 *	What prevents a thread in page_create() from deadlocking
 *	with a thread freeing or reclaiming the same page is the
 *	page_trylock() in page_get_freelist(). If the trylock fails
 *	it skips the page.
 *
 *	The lock ordering for promotion and demotion is the same as
 *	for page_create(). Since the same deadlock could occur during
 *	page promotion and freeing or reclaiming of a page on the
 *	cache list we might have to fail the operation and undo what
 *	we have done so far. Again this is rare.
 */
page_t *
page_promote(int mnode, pfn_t pfnum, uchar_t new_szc, int flags)
{
	page_t		*pp, *pplist, *tpp, *start_pp;
	pgcnt_t		new_npgs, npgs;
	uint_t		bin;
	pgcnt_t		tmpnpgs, pages_left;
	uint_t		mtype;
	uint_t		noreloc;
	uint_t		i;
	int		which_list;
	ulong_t		index;
	kmutex_t	*phm;

	/*
	 * General algorithm:
	 * Find the starting page
	 * Walk each page struct removing it from the freelist,
	 * and linking it to all the other pages removed.
	 * Once all pages are off the freelist,
	 * walk the list, modifying p_szc to new_szc and whatever
	 * other info needs to be done to create a large free page.
	 * According to the flags, either return the page or put it
	 * on the freelist.
	 */

	start_pp = page_numtopp_nolock(pfnum);
	ASSERT(start_pp && (start_pp->p_pagenum == pfnum));
	new_npgs = page_get_pagecnt(new_szc);
	ASSERT(IS_P2ALIGNED(pfnum, new_npgs));

	/*
	 * Loop through smaller pages to confirm that all pages
	 * give the same result for PP_ISNORELOC().
	 * We can check this reliably here as the protocol for setting
	 * P_NORELOC requires pages to be taken off the free list first.
	 */
	for (i = 0, pp = start_pp; i < new_npgs; i++, pp++) {
		if (pp == start_pp) {
			/* First page, set requirement. */
			noreloc = PP_ISNORELOC(pp);
		} else if (noreloc != PP_ISNORELOC(pp)) {
			page_promote_noreloc_err++;
			page_promote_err++;
			return (NULL);
		}
	}

	pages_left = new_npgs;
	pplist = NULL;
	pp = start_pp;

	/* Loop around coalescing the smaller pages into a big page. */
	while (pages_left) {
		/*
		 * Remove from the freelist.
		 */
		ASSERT(PP_ISFREE(pp));
		bin = PP_2_BIN(pp);
		ASSERT(mnode == PP_2_MEM_NODE(pp));
		mtype = PP_2_MTYPE(pp);
		if (PP_ISAGED(pp)) {

			/*
			 * PG_FREE_LIST
			 */
			if (pp->p_szc) {
				page_vpsub(&PAGE_FREELISTS(mnode,
				    pp->p_szc, bin, mtype), pp);
			} else {
				mach_page_sub(&PAGE_FREELISTS(mnode, 0,
				    bin, mtype), pp);
			}
			which_list = PG_FREE_LIST;
		} else {
			ASSERT(pp->p_szc == 0);

			/*
			 * PG_CACHE_LIST
			 *
			 * Since this page comes from the
			 * cachelist, we must destroy the
			 * vnode association.
			 */
			if (!page_trylock(pp, SE_EXCL)) {
				goto fail_promote;
			}

			/*
			 * We need to be careful not to deadlock
			 * with another thread in page_lookup().
			 * The page_lookup() thread could be holding
			 * the same phm that we need if the two
			 * pages happen to hash to the same phm lock.
			 * At this point we have locked the entire
			 * freelist and page_lookup() could be trying
			 * to grab a freelist lock.
			 */
			index = PAGE_HASH_FUNC(pp->p_vnode, pp->p_offset);
			phm = PAGE_HASH_MUTEX(index);
			if (!mutex_tryenter(phm)) {
				page_unlock(pp);
				goto fail_promote;
			}

			mach_page_sub(&PAGE_CACHELISTS(mnode, bin, mtype), pp);
			page_hashout(pp, phm);
			mutex_exit(phm);
			PP_SETAGED(pp);
			page_unlock(pp);
			which_list = PG_CACHE_LIST;
		}
		page_ctr_sub(pp, which_list);

		/*
		 * Concatenate the smaller page(s) onto
		 * the large page list.
		 */
		tmpnpgs = npgs = page_get_pagecnt(pp->p_szc);
		pages_left -= npgs;
		tpp = pp;
		while (npgs--) {
			tpp->p_szc = new_szc;
			tpp = tpp->p_next;
		}
		page_list_concat(&pplist, &pp);
		pp += tmpnpgs;
	}
	CHK_LPG(pplist, new_szc);

	/*
	 * return the page to the user if requested
	 * in the properly locked state.
	 */
	if (flags == PC_ALLOC && (page_trylock_cons(pplist, SE_EXCL))) {
		return (pplist);
	}

	/*
	 * Otherwise place the new large page on the freelist
	 */
	bin = PP_2_BIN(pplist);
	mnode = PP_2_MEM_NODE(pplist);
	mtype = PP_2_MTYPE(pplist);
	page_vpadd(&PAGE_FREELISTS(mnode, new_szc, bin, mtype), pplist);

	page_ctr_add(pplist, PG_FREE_LIST);
	return (NULL);

fail_promote:
	/*
	 * A thread must have still been freeing or
	 * reclaiming the page on the cachelist.
	 * To prevent a deadlock undo what we have
	 * done so far and return failure. This
	 * situation can only happen while promoting
	 * PAGESIZE pages.
	 */
	page_promote_err++;
	while (pplist) {
		pp = pplist;
		mach_page_sub(&pplist, pp);
		pp->p_szc = 0;
		bin = PP_2_BIN(pp);
		mtype = PP_2_MTYPE(pp);
		mach_page_add(&PAGE_FREELISTS(mnode, 0, bin, mtype), pp);
		page_ctr_add(pp, PG_FREE_LIST);
	}
	return (NULL);

}

/*
 * Break up a large page into smaller size pages.
 * Pages involved are on the freelist before the call and may
 * be returned to the caller if requested, otherwise they will
 * be placed back on the freelist.
 * The caller is responsible for locking the freelist as well as any other
 * accounting which needs to be done for a returned page.
 * If flags is not PC_ALLOC, the color argument is ignored, and thus
 * technically, any value may be passed in but PC_NO_COLOR is the standard
 * which should be followed for clarity's sake.
 */
page_t *
page_demote(int mnode, pfn_t pfnum, uchar_t cur_szc, uchar_t new_szc,
    int color, int flags)
{
	page_t	*pp, *pplist, *npplist;
	pgcnt_t	npgs, n;
	uint_t	bin;
	uint_t	mtype;
	page_t	*ret_pp = NULL;

	ASSERT(cur_szc != 0);
	ASSERT(new_szc < cur_szc);

	pplist = page_numtopp_nolock(pfnum);
	ASSERT(pplist != NULL);

	ASSERT(pplist->p_szc == cur_szc);

	bin = PP_2_BIN(pplist);
	ASSERT(mnode == PP_2_MEM_NODE(pplist));
	mtype = PP_2_MTYPE(pplist);
	page_vpsub(&PAGE_FREELISTS(mnode, cur_szc, bin, mtype), pplist);

	CHK_LPG(pplist, cur_szc);
	page_ctr_sub(pplist, PG_FREE_LIST);

	/*
	 * Number of PAGESIZE pages for smaller new_szc
	 * page.
	 */
	npgs = page_get_pagecnt(new_szc);

	while (pplist) {
		pp = pplist;

		ASSERT(pp->p_szc == cur_szc);

		/*
		 * We either break it up into PAGESIZE pages or larger.
		 */
		if (npgs == 1) {	/* PAGESIZE case */
			mach_page_sub(&pplist, pp);
			ASSERT(pp->p_szc == cur_szc);
			ASSERT(new_szc == 0);
			ASSERT(mnode == PP_2_MEM_NODE(pp));
			pp->p_szc = new_szc;
			bin = PP_2_BIN(pp);
			if ((bin == color) && (flags == PC_ALLOC) &&
			    (ret_pp == NULL) &&
			    page_trylock_cons(pp, SE_EXCL)) {
				ret_pp = pp;
			} else {
				mtype = PP_2_MTYPE(pp);
				mach_page_add(&PAGE_FREELISTS(mnode, 0, bin,
				    mtype), pp);
				page_ctr_add(pp, PG_FREE_LIST);
			}
		} else {

			/*
			 * Break down into smaller lists of pages.
			 */
			page_list_break(&pplist, &npplist, npgs);

			pp = pplist;
			n = npgs;
			while (n--) {
				ASSERT(pp->p_szc == cur_szc);
				pp->p_szc = new_szc;
				pp = pp->p_next;
			}

			CHK_LPG(pplist, new_szc);

			bin = PP_2_BIN(pplist);
			ASSERT(mnode == PP_2_MEM_NODE(pp));
			if ((bin == color) && (flags == PC_ALLOC) &&
			    (ret_pp == NULL) &&
			    page_trylock_cons(pp, SE_EXCL)) {
				ret_pp = pp;
			} else {
				mtype = PP_2_MTYPE(pp);
				page_vpadd(&PAGE_FREELISTS(mnode, new_szc,
				    bin, mtype), pplist);

				page_ctr_add(pplist, PG_FREE_LIST);
			}
			pplist = npplist;
		}
	}
	return (ret_pp);
}

int mpss_coalesce_disable = 0;
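
/*
 * Example of a typical page_demote() call (following the pattern used by
 * page_boot_demote() and page_list_sub() above): to break a large free
 * page containing pfn all the way down to PAGESIZE pages and leave them
 * on the freelist, a caller holding the mnode's freelist locks does
 *
 *	(void) page_demote(mnode, PFN_BASE(pfn, szc), szc, 0,
 *	    PC_NO_COLOR, PC_FREE);
 *
 * With PC_FREE the color argument is ignored, so PC_NO_COLOR simply
 * documents that no particular color is wanted back.
 */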

/*
 * Coalesce free pages into a page of the given szc and color if possible.
 * Return the pointer to the page created, otherwise, return NULL.
 */
static page_t *
page_freelist_coalesce(int mnode, uchar_t szc, int color)
{
	int	r;		/* region size */
	int	idx, full, i;
	pfn_t	pfnum;
	size_t	len;
	size_t	buckets_to_check;
	pgcnt_t	cands;
	page_t	*ret_pp;
	int	color_stride;

	VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce);

	if (mpss_coalesce_disable) {
		return (NULL);
	}

	r = szc;
	PGCTRS_CANDS_GETVALUECOLOR(mnode, r, color, cands);
	if (cands == 0) {
		VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip);
		return (NULL);
	}
	full = FULL_REGION_CNT(r);
	color_stride = (szc) ? page_convert_color(0, szc, page_colors - 1) + 1 :
	    page_colors;

	/* Prevent page_counters dynamic memory from being freed */
	rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
	len = PAGE_COUNTERS_ENTRIES(mnode, r);
	buckets_to_check = len / color_stride;
	idx = PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color);
	ASSERT((idx % color_stride) == color);
	idx += color_stride;
	if (idx >= len)
		idx = color;
	for (i = 0; i < buckets_to_check; i++) {
		if (PAGE_COUNTERS(mnode, r, idx) == full) {
			pfnum = IDX_TO_PNUM(mnode, r, idx);
			ASSERT(pfnum >= mem_node_config[mnode].physbase &&
			    pfnum < mem_node_config[mnode].physmax);
			/*
			 * RFE: For performance maybe we can do something less
			 * brutal than locking the entire freelist. So far
			 * this doesn't seem to be a performance problem?
			 */
			page_freelist_lock(mnode);
			if (PAGE_COUNTERS(mnode, r, idx) != full) {
				VM_STAT_ADD(vmm_vmstats.page_ctrs_changed);
				goto skip_this_one;
			}
			ret_pp = page_promote(mnode, pfnum, r, PC_ALLOC);
			if (ret_pp != NULL) {
				PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color) =
				    idx;
				page_freelist_unlock(mnode);
				rw_exit(&page_ctrs_rwlock[mnode]);
#if defined(__sparc)
				if (PP_ISNORELOC(ret_pp)) {
					pgcnt_t npgs;

					npgs = page_get_pagecnt(ret_pp->p_szc);
					kcage_freemem_sub(npgs);
				}
#endif
				return (ret_pp);
			}
skip_this_one:
			page_freelist_unlock(mnode);
			/*
			 * No point looking for another page if we've
			 * already tried all of the ones that
			 * page_ctrs_cands indicated. Stash off where we left
			 * off.
			 * Note: this is not exact since we don't hold the
			 * page_freelist_locks before we initially get the
			 * value of cands for performance reasons, but should
			 * be a decent approximation.
			 */
			if (--cands == 0) {
				PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color) =
				    idx;
				break;
			}
		}
		idx += color_stride;
		if (idx >= len)
			idx = color;
	}
	rw_exit(&page_ctrs_rwlock[mnode]);
	VM_STAT_ADD(vmm_vmstats.page_ctrs_failed);
	return (NULL);
}

/*
 * For the given mnode, promote as many small pages to large pages as possible.
 */
void
page_freelist_coalesce_all(int mnode)
{
	int	r;		/* region size */
	int	idx, full;
	pfn_t	pfnum;
	size_t	len;

	VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce_all);

	if (mpss_coalesce_disable) {
		return;
	}

	/*
	 * Lock the entire freelist and coalesce what we can.
	 *
	 * Always promote to the largest page possible
	 * first to reduce the number of page promotions.

/*
 * For the given mnode, promote as many small pages to large pages as possible.
 */
void
page_freelist_coalesce_all(int mnode)
{
	int	r;		/* region size */
	int	idx, full;
	pfn_t	pfnum;
	size_t	len;

	VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce_all);

	if (mpss_coalesce_disable) {
		return;
	}

	/*
	 * Lock the entire freelist and coalesce what we can.
	 *
	 * Always promote to the largest page possible
	 * first to reduce the number of page promotions.
	 */
	rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
	page_freelist_lock(mnode);
	for (r = mmu_page_sizes - 1; r > 0; r--) {
		pgcnt_t cands;

		PGCTRS_CANDS_GETVALUE(mnode, r, cands);
		if (cands == 0) {
			VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip_all);
			continue;
		}

		full = FULL_REGION_CNT(r);
		len = PAGE_COUNTERS_ENTRIES(mnode, r);

		for (idx = 0; idx < len; idx++) {
			if (PAGE_COUNTERS(mnode, r, idx) == full) {
				pfnum = IDX_TO_PNUM(mnode, r, idx);
				ASSERT(pfnum >=
				    mem_node_config[mnode].physbase &&
				    pfnum <
				    mem_node_config[mnode].physmax);
				(void) page_promote(mnode, pfnum, r, PC_FREE);
			}
		}
	}
	page_freelist_unlock(mnode);
	rw_exit(&page_ctrs_rwlock[mnode]);
}

/*
 * This is where all policies for moving pages around
 * to different page size free lists are implemented.
 * Returns a page on success, NULL on failure.
 *
 * So far these are the priorities for this algorithm in descending
 * order:
 *
 *	1) When servicing a request try to do so with a free page
 *	   from next size up. Helps defer fragmentation as long
 *	   as possible.
 *
 *	2) Page coalesce on demand. Only when a freelist
 *	   larger than PAGESIZE is empty and step 1
 *	   will not work since all larger size lists are
 *	   also empty.
 *
 * If pfnhi is non-zero, search for large page with pfn range less than pfnhi.
 */
page_t *
page_freelist_fill(uchar_t szc, int color, int mnode, int mtype, pfn_t pfnhi)
{
	uchar_t nszc = szc + 1;
	int	bin;
	page_t	*pp, *firstpp;
	page_t	*ret_pp = NULL;

	ASSERT(szc < mmu_page_sizes);

	/*
	 * First try to break up a larger page to fill
	 * current size freelist.
	 */
	while (nszc < mmu_page_sizes) {
		/*
		 * If page found then demote it.
		 */
		bin = page_convert_color(szc, nszc, color);
		if (PAGE_FREELISTS(mnode, nszc, bin, mtype)) {
			page_freelist_lock(mnode);
			firstpp = pp = PAGE_FREELISTS(mnode, nszc, bin, mtype);

			/*
			 * If pfnhi is not PFNNULL, look for large page below
			 * pfnhi. PFNNULL signifies no pfn requirement.
			 */
			if (pfnhi != PFNNULL && pp->p_pagenum >= pfnhi) {
				do {
					pp = pp->p_vpnext;
					if (pp == firstpp) {
						pp = NULL;
						break;
					}
				} while (pp->p_pagenum >= pfnhi);
			}
			if (pp) {
				ASSERT(pp->p_szc == nszc);
				ret_pp = page_demote(mnode, pp->p_pagenum,
				    pp->p_szc, szc, color, PC_ALLOC);
				if (ret_pp) {
					page_freelist_unlock(mnode);
#if defined(__sparc)
					if (PP_ISNORELOC(ret_pp)) {
						pgcnt_t npgs;

						npgs = page_get_pagecnt(
						    ret_pp->p_szc);
						kcage_freemem_sub(npgs);
					}
#endif
					return (ret_pp);
				}
			}
			page_freelist_unlock(mnode);
		}
		nszc++;
	}

	/*
	 * Ok that didn't work. Time to coalesce.
	 */
	if (szc != 0) {
		ret_pp = page_freelist_coalesce(mnode, szc, color);
	}

	return (ret_pp);
}
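
/*
 * Hypothetical caller sketch, not in the original source: asking
 * page_freelist_fill() for a page of size code 1 (an assumption; the
 * codes are platform dependent) of a given color, with no upper pfn
 * bound. A non-NULL return is an exclusively locked page that has
 * already been pulled off the free lists.
 */
#ifdef	PAGELIST_EXAMPLES
static page_t *
fill_demo(int mnode, int mtype, int color)
{
	return (page_freelist_fill(1, color, mnode, mtype, PFNNULL));
}
#endif	/* PAGELIST_EXAMPLES */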

/*
 * Helper routine used only by the freelist code to lock
 * a page. If the page is a large page then it succeeds in
 * locking all the constituent pages or none at all.
 * Returns 1 on success, 0 on failure.
 */
static int
page_trylock_cons(page_t *pp, se_t se)
{
	page_t	*tpp, *first_pp = pp;

	/*
	 * Fail if can't lock first or only page.
	 */
	if (!page_trylock(pp, se)) {
		return (0);
	}

	/*
	 * PAGESIZE: common case.
	 */
	if (pp->p_szc == 0) {
		return (1);
	}

	/*
	 * Large page case.
	 */
	tpp = pp->p_next;
	while (tpp != pp) {
		if (!page_trylock(tpp, se)) {
			/*
			 * On failure unlock what we
			 * have locked so far.
			 */
			while (first_pp != tpp) {
				page_unlock(first_pp);
				first_pp = first_pp->p_next;
			}
			return (0);
		}
		tpp = tpp->p_next;
	}
	return (1);
}
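
/*
 * Illustrative sketch only, not in the original source: the
 * all-or-nothing trylock pattern used above, shown over a plain array
 * rather than the circular p_next list. Hypothetical helper name.
 */
#ifdef	PAGELIST_EXAMPLES
static int
trylock_all_or_none(page_t *pages[], int n, se_t se)
{
	int i;

	for (i = 0; i < n; i++) {
		if (!page_trylock(pages[i], se)) {
			while (--i >= 0)	/* roll back prior locks */
				page_unlock(pages[i]);
			return (0);
		}
	}
	return (1);
}
#endif	/* PAGELIST_EXAMPLES */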

page_t *
page_get_mnode_freelist(int mnode, uint_t bin, int mtype, uchar_t szc,
    uint_t flags)
{
	kmutex_t	*pcm;
	int		i, fill_tried, fill_marker;
	page_t		*pp, *first_pp;
	uint_t		bin_marker;
	int		colors, cpucolors;
	uchar_t		nszc;
	uint_t		nszc_color_shift;
	int		nwaybins = 0, nwaycnt;

	ASSERT(szc < mmu_page_sizes);

	VM_STAT_ADD(vmm_vmstats.pgmf_alloc[szc]);

	/* LINTED */
	MTYPE_START(mnode, mtype, flags);
	if (mtype < 0) {	/* mnode does not have memory in mtype range */
		VM_STAT_ADD(vmm_vmstats.pgmf_allocempty[szc]);
		return (NULL);
	}

	/*
	 * Set how many physical colors for this page size.
	 */
	colors = (szc) ? page_convert_color(0, szc, page_colors - 1) + 1 :
	    page_colors;

	nszc = MIN(szc + 1, mmu_page_sizes - 1);
	nszc_color_shift = page_get_shift(nszc) - page_get_shift(szc);

	/* cpu_page_colors is non-zero if a page color may be in > 1 bin */
	cpucolors = cpu_page_colors;

	/*
	 * adjust cpucolors to possibly check additional 'equivalent' bins
	 * to try to minimize fragmentation of large pages by delaying calls
	 * to page_freelist_fill.
	 */
	if (colorequiv > 1) {
		int equivcolors = colors / colorequiv;

		if (equivcolors && (cpucolors == 0 || equivcolors < cpucolors))
			cpucolors = equivcolors;
	}

	ASSERT(colors <= page_colors);
	ASSERT(colors);
	ASSERT((colors & (colors - 1)) == 0);

	ASSERT(bin < colors);

	/*
	 * Only hold one freelist lock at a time, that way we
	 * can start anywhere and not have to worry about lock
	 * ordering.
	 */
big_try_again:
	fill_tried = 0;
	nwaycnt = 0;
	for (i = 0; i <= colors; i++) {
try_again:
		ASSERT(bin < colors);
		if (PAGE_FREELISTS(mnode, szc, bin, mtype)) {
			pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
			mutex_enter(pcm);
			pp = PAGE_FREELISTS(mnode, szc, bin, mtype);
			if (pp != NULL) {
				/*
				 * These were set before the page
				 * was put on the free list,
				 * they must still be set.
				 */
				ASSERT(PP_ISFREE(pp));
				ASSERT(PP_ISAGED(pp));
				ASSERT(pp->p_vnode == NULL);
				ASSERT(pp->p_hash == NULL);
				ASSERT(pp->p_offset == (u_offset_t)-1);
				ASSERT(pp->p_szc == szc);
				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);

				/*
				 * Walk down the hash chain.
				 * 8k pages are linked on p_next
				 * and p_prev fields. Large pages
				 * are a contiguous group of
				 * constituent pages linked together
				 * on their p_next and p_prev fields.
				 * The large pages are linked together
				 * on the hash chain using p_vpnext
				 * p_vpprev of the base constituent
				 * page of each large page.
				 */
				first_pp = pp;
				while (!page_trylock_cons(pp, SE_EXCL)) {
					if (szc == 0) {
						pp = pp->p_next;
					} else {
						pp = pp->p_vpnext;
					}

					ASSERT(PP_ISFREE(pp));
					ASSERT(PP_ISAGED(pp));
					ASSERT(pp->p_vnode == NULL);
					ASSERT(pp->p_hash == NULL);
					ASSERT(pp->p_offset == (u_offset_t)-1);
					ASSERT(pp->p_szc == szc);
					ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) ==
					    mnode);

					if (pp == first_pp) {
						pp = NULL;
						break;
					}
				}

				if (pp) {
					ASSERT(mtype == PP_2_MTYPE(pp));
					ASSERT(pp->p_szc == szc);
					if (szc == 0) {
						page_sub(&PAGE_FREELISTS(mnode,
						    szc, bin, mtype), pp);
					} else {
						page_vpsub(&PAGE_FREELISTS(
						    mnode, szc, bin, mtype),
						    pp);
						CHK_LPG(pp, szc);
					}
					page_ctr_sub(pp, PG_FREE_LIST);

					if ((PP_ISFREE(pp) == 0) ||
					    (PP_ISAGED(pp) == 0))
						panic("free page is not. pp %p",
						    (void *)pp);
					mutex_exit(pcm);

#if defined(__sparc)
					ASSERT(!kcage_on || PP_ISNORELOC(pp) ||
					    (flags & PG_NORELOC) == 0);

					if (PP_ISNORELOC(pp)) {
						pgcnt_t	npgs;

						npgs = page_get_pagecnt(szc);
						kcage_freemem_sub(npgs);
					}
#endif
					VM_STAT_ADD(vmm_vmstats.
					    pgmf_allocok[szc]);
					return (pp);
				}
			}
			mutex_exit(pcm);
		}

		/*
		 * Wow! The initial bin is empty.
		 * If specific color is needed, check if page color may be
		 * in other bins. cpucolors is:
		 *  0	if the colors for this cpu are equal to page_colors.
		 *	This means that pages with a particular color are in a
		 *	single bin.
		 * -1	if colors of cpus (cheetah+) are heterogeneous. Need to
		 *	first determine the colors for the current cpu.
		 * >0	colors of all cpus are homogeneous and < page_colors
		 */

		if ((flags & PG_MATCH_COLOR) && (cpucolors != 0)) {
			if (!nwaybins) {
				/*
				 * cpucolors is negative if ecache setsizes
				 * are heterogeneous. determine colors for this
				 * particular cpu.
				 */
				if (cpucolors < 0) {
					cpucolors = CPUSETSIZE() / MMU_PAGESIZE;
					ASSERT(cpucolors > 0);
					nwaybins = colors / cpucolors;
				} else {
					nwaybins = colors / cpucolors;
					ASSERT(szc > 0 || nwaybins > 1);
				}
				if (nwaybins < 2)
					cpucolors = 0;
			}

			if (cpucolors && (nwaycnt + 1 <= nwaybins)) {
				nwaycnt++;
				bin = (bin + (colors / nwaybins)) &
				    (colors - 1);
				if (nwaycnt < nwaybins) {
					goto try_again;
				}
			}
			/* back to initial color if fall-thru */
		}

		/*
		 * color bins are all empty if color match. Try and satisfy
		 * the request by breaking up or coalescing pages from
		 * a different size freelist of the correct color that
		 * satisfies the ORIGINAL color requested. If that
		 * fails then try pages of the same size but different
		 * colors assuming we are not called with
		 * PG_MATCH_COLOR.
		 */
		if (!fill_tried) {
			fill_tried = 1;
			fill_marker = bin >> nszc_color_shift;
			pp = page_freelist_fill(szc, bin, mnode, mtype,
			    PFNNULL);
			if (pp != NULL) {
				return (pp);
			}
		}

		if (flags & PG_MATCH_COLOR)
			break;

		/*
		 * Select next color bin to try.
		 */
		if (szc == 0) {
			/*
			 * PAGESIZE page case.
			 */
			if (i == 0) {
				bin = (bin + BIN_STEP) & page_colors_mask;
				bin_marker = bin;
			} else {
				bin = (bin + vac_colors) & page_colors_mask;
				if (bin == bin_marker) {
					bin = (bin + 1) & page_colors_mask;
					bin_marker = bin;
				}
			}
		} else {
			/*
			 * Large page case.
			 */
			bin = (bin + 1) & (colors - 1);
		}
		/*
		 * If bin advanced to the next color bin of the
		 * next larger pagesize, there is a chance the fill
		 * could succeed.
		 */
		if (fill_marker != (bin >> nszc_color_shift))
			fill_tried = 0;
	}

#if defined(__sparc)
	if (!(flags & (PG_NORELOC | PGI_NOCAGE | PGI_RELOCONLY)) &&
	    (kcage_freemem >= kcage_lotsfree)) {
		/*
		 * The Cage is ON and with plenty of free mem, and
		 * we're willing to check for a NORELOC page if we
		 * couldn't find a RELOC page, so spin again.
		 */
		flags |= PG_NORELOC;
		mtype = MTYPE_NORELOC;
		goto big_try_again;
	}
#else
	if (flags & PGI_MT_RANGE) {
		/* cycle through range of mtypes */
		MTYPE_NEXT(mnode, mtype, flags);
		if (mtype >= 0)
			goto big_try_again;
	}
#endif
	VM_STAT_ADD(vmm_vmstats.pgmf_allocfailed[szc]);

	return (NULL);
}


/*
 * Returns the count of free pages for 'pp' with size code 'szc'.
 * Note: This function does not return an exact value as the page freelist
 * locks are not held and thus the values in the page_counters may be
 * changing as we walk through the data.
 */
static int
page_freecnt(int mnode, page_t *pp, uchar_t szc)
{
	pgcnt_t	pgfree;
	pgcnt_t cnt;
	ssize_t	r = szc;	/* region size */
	ssize_t	idx;
	int	i;
	int	full, range;

	/* Make sure pagenum passed in is aligned properly */
	ASSERT((pp->p_pagenum & (PNUM_SIZE(szc) - 1)) == 0);
	ASSERT(szc > 0);

	/* Prevent page_counters dynamic memory from being freed */
	rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
	idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum);
	cnt = PAGE_COUNTERS(mnode, r, idx);
	pgfree = cnt << PNUM_SHIFT(r - 1);
	range = FULL_REGION_CNT(szc);

	/* Check for completely full region */
	if (cnt == range) {
		rw_exit(&page_ctrs_rwlock[mnode]);
		return (pgfree);
	}

	while (--r > 0) {
		idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum);
		full = FULL_REGION_CNT(r);
		for (i = 0; i < range; i++, idx++) {
			cnt = PAGE_COUNTERS(mnode, r, idx);
			/*
			 * If cnt here is full, that means we have already
			 * accounted for these pages earlier.
			 */
			if (cnt != full) {
				pgfree += (cnt << PNUM_SHIFT(r - 1));
			}
		}
		range *= full;
	}
	rw_exit(&page_ctrs_rwlock[mnode]);
	return (pgfree);
}
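
/*
 * Worked example of the counter walk above, with assumed numbers that
 * are not from the original source: an 8K PAGESIZE, 64K regions at
 * level 1 (8 constituents each) and a 512K region at level 2 (8
 * sub-regions). A level-2 counter of 3 contributes 3 << PNUM_SHIFT(1)
 * == 24 pages; non-full level-1 counters are then added individually.
 */
#ifdef	PAGELIST_EXAMPLES
static pgcnt_t
page_freecnt_demo(void)
{
	pgcnt_t ctr1[8] = { 8, 8, 8, 5, 0, 2, 7, 1 };
	pgcnt_t pgfree = 3 << 3;	/* 3 full 64K regions == 24 pages */
	int i;

	for (i = 0; i < 8; i++) {
		if (ctr1[i] != 8)	/* full ones already accounted */
			pgfree += ctr1[i];
	}
	return (pgfree);		/* 24 + 5 + 0 + 2 + 7 + 1 == 39 */
}
#endif	/* PAGELIST_EXAMPLES */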

/*
 * Called from page_geti_contig_pages to exclusively lock constituent pages
 * starting from 'spp' for page size code 'szc'.
 *
 * If 'ptcpthreshold' is set, the number of free pages needed in the 'szc'
 * region needs to be greater than or equal to the threshold.
 */
static int
page_trylock_contig_pages(int mnode, page_t *spp, uchar_t szc, int flags)
{
	pgcnt_t	pgcnt = PNUM_SIZE(szc);
	pgcnt_t pgfree, i;
	page_t *pp;

	VM_STAT_ADD(vmm_vmstats.ptcp[szc]);

	if ((ptcpthreshold == 0) || (flags & PGI_PGCPHIPRI))
		goto skipptcpcheck;
	/*
	 * check if there are sufficient free pages available before attempting
	 * to trylock. Count is approximate as page counters can change.
	 */
	pgfree = page_freecnt(mnode, spp, szc);

	/* bail out if there are not enough free pages to warrant a trylock */
	if (pgfree < pgcnt / ptcpthreshold) {
		VM_STAT_ADD(vmm_vmstats.ptcpfreethresh[szc]);
		return (0);
	}

skipptcpcheck:

	for (i = 0; i < pgcnt; i++) {
		pp = &spp[i];
		if (!page_trylock(pp, SE_EXCL)) {
			VM_STAT_ADD(vmm_vmstats.ptcpfailexcl[szc]);
			while (--i != (pgcnt_t)-1) {
				pp = &spp[i];
				ASSERT(PAGE_EXCL(pp));
				page_unlock(pp);
			}
			return (0);
		}
		ASSERT(spp[i].p_pagenum == spp->p_pagenum + i);
		if ((pp->p_szc > szc || (szc && pp->p_szc == szc)) &&
		    !PP_ISFREE(pp)) {
			VM_STAT_ADD(vmm_vmstats.ptcpfailszc[szc]);
			ASSERT(i == 0);
			page_unlock(pp);
			return (0);
		}
		if (PP_ISNORELOC(pp)) {
			VM_STAT_ADD(vmm_vmstats.ptcpfailcage[szc]);
			while (i != (pgcnt_t)-1) {
				pp = &spp[i];
				ASSERT(PAGE_EXCL(pp));
				page_unlock(pp);
				i--;
			}
			return (0);
		}
	}
	VM_STAT_ADD(vmm_vmstats.ptcpok[szc]);
	return (1);
}
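
/*
 * Worked example of the threshold test above, with assumed numbers that
 * are not from the original source: for a large page built from 512
 * constituents and ptcpthreshold == 2, the trylock sweep is only worth
 * attempting when at least 512 / 2 == 256 pages are already free.
 * Hypothetical helper name.
 */
#ifdef	PAGELIST_EXAMPLES
static int
ptcp_worth_trying(pgcnt_t pgcnt, pgcnt_t pgfree)
{
	/* e.g. pgcnt == 512, ptcpthreshold == 2: need pgfree >= 256 */
	return (ptcpthreshold == 0 || pgfree >= pgcnt / ptcpthreshold);
}
#endif	/* PAGELIST_EXAMPLES */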

/*
 * Claim large page pointed to by 'pp'. 'pp' is the starting set
 * of 'szc' constituent pages that had been locked exclusively previously.
 * Will attempt to relocate constituent pages in use.
 */
static page_t *
page_claim_contig_pages(page_t *pp, uchar_t szc, int flags)
{
	spgcnt_t	pgcnt, npgs, i;
	page_t		*targpp, *rpp, *hpp;
	page_t		*replpp = NULL;
	page_t		*pplist = NULL;

	ASSERT(pp != NULL);

	pgcnt = page_get_pagecnt(szc);
	while (pgcnt) {
		ASSERT(PAGE_EXCL(pp));
		ASSERT(!PP_ISNORELOC(pp));
		if (PP_ISFREE(pp)) {
			/*
			 * If this is a PG_FREE_LIST page then its
			 * size code can change underneath us due to
			 * page promotion or demotion. As an optimization
			 * use page_list_sub_pages() instead of
			 * page_list_sub().
			 */
			if (PP_ISAGED(pp)) {
				page_list_sub_pages(pp, szc);
				if (pp->p_szc == szc) {
					return (pp);
				}
				ASSERT(pp->p_szc < szc);
				npgs = page_get_pagecnt(pp->p_szc);
				hpp = pp;
				for (i = 0; i < npgs; i++, pp++) {
					pp->p_szc = szc;
				}
				page_list_concat(&pplist, &hpp);
				pgcnt -= npgs;
				continue;
			}
			ASSERT(!PP_ISAGED(pp));
			ASSERT(pp->p_szc == 0);
			page_list_sub(pp, PG_CACHE_LIST);
			page_hashout(pp, NULL);
			PP_SETAGED(pp);
			pp->p_szc = szc;
			page_list_concat(&pplist, &pp);
			pp++;
			pgcnt--;
			continue;
		}
		npgs = page_get_pagecnt(pp->p_szc);

		/*
		 * page_create_wait freemem accounting done by caller of
		 * page_get_freelist and not necessary to call it prior to
		 * calling page_get_replacement_page.
		 *
		 * page_get_replacement_page can call page_get_contig_pages
		 * to acquire a large page (szc > 0); the replacement must be
		 * smaller than the contig page size to avoid looping or
		 * szc == 0 and PGI_PGCPSZC0 is set.
		 */
		if (pp->p_szc < szc || (szc == 0 && (flags & PGI_PGCPSZC0))) {
			replpp = page_get_replacement_page(pp, NULL, 0);
			if (replpp) {
				npgs = page_get_pagecnt(pp->p_szc);
				ASSERT(npgs <= pgcnt);
				targpp = pp;
			}
		}

		/*
		 * If replacement is NULL or do_page_relocate fails, fail
		 * coalescing of pages.
		 */
		if (replpp == NULL || (do_page_relocate(&targpp, &replpp, 0,
		    &npgs, NULL) != 0)) {
			/*
			 * Unlock un-processed target list
			 */
			while (pgcnt--) {
				ASSERT(PAGE_EXCL(pp));
				page_unlock(pp);
				pp++;
			}
			/*
			 * Free the processed target list.
			 */
			while (pplist) {
				pp = pplist;
				page_sub(&pplist, pp);
				ASSERT(PAGE_EXCL(pp));
				ASSERT(pp->p_szc == szc);
				ASSERT(PP_ISFREE(pp));
				ASSERT(PP_ISAGED(pp));
				pp->p_szc = 0;
				page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
				page_unlock(pp);
			}

			if (replpp != NULL)
				page_free_replacement_page(replpp);

			return (NULL);
		}
		ASSERT(pp == targpp);

		/* LINTED */
		ASSERT(hpp = pp);	/* That's right, it's an assignment */

		pp += npgs;
		pgcnt -= npgs;

		while (npgs--) {
			ASSERT(PAGE_EXCL(targpp));
			ASSERT(!PP_ISFREE(targpp));
			ASSERT(!PP_ISNORELOC(targpp));
			PP_SETFREE(targpp);
			ASSERT(PP_ISAGED(targpp));
			ASSERT(targpp->p_szc < szc || (szc == 0 &&
			    (flags & PGI_PGCPSZC0)));
			targpp->p_szc = szc;
			targpp = targpp->p_next;

			rpp = replpp;
			ASSERT(rpp != NULL);
			page_sub(&replpp, rpp);
			ASSERT(PAGE_EXCL(rpp));
			ASSERT(!PP_ISFREE(rpp));
			page_unlock(rpp);
		}
		ASSERT(targpp == hpp);
		ASSERT(replpp == NULL);
		page_list_concat(&pplist, &targpp);
	}
	CHK_LPG(pplist, szc);
	return (pplist);
}

/*
 * Trim kernel cage from pfnlo-pfnhi and store result in lo-hi. Return code
 * of 0 means nothing left after trim.
 */
int
trimkcage(struct memseg *mseg, pfn_t *lo, pfn_t *hi, pfn_t pfnlo, pfn_t pfnhi)
{
	pfn_t	kcagepfn;
	int	decr;
	int	rc = 0;

	if (PP_ISNORELOC(mseg->pages)) {
		if (PP_ISNORELOC(mseg->epages - 1) == 0) {

			/* lower part of this mseg inside kernel cage */
			decr = kcage_current_pfn(&kcagepfn);

			/* kernel cage may have transitioned past mseg */
			if (kcagepfn >= mseg->pages_base &&
			    kcagepfn < mseg->pages_end) {
				ASSERT(decr == 0);
				*lo = kcagepfn;
				*hi = MIN(pfnhi,
				    (mseg->pages_end - 1));
				rc = 1;
			}
		}
		/* else entire mseg in the cage */
	} else {
		if (PP_ISNORELOC(mseg->epages - 1)) {

			/* upper part of this mseg inside kernel cage */
			decr = kcage_current_pfn(&kcagepfn);

			/* kernel cage may have transitioned past mseg */
			if (kcagepfn >= mseg->pages_base &&
			    kcagepfn < mseg->pages_end) {
				ASSERT(decr);
				*hi = kcagepfn;
				*lo = MAX(pfnlo, mseg->pages_base);
				rc = 1;
			}
		} else {
			/* entire mseg outside of kernel cage */
			*lo = MAX(pfnlo, mseg->pages_base);
			*hi = MIN(pfnhi, (mseg->pages_end - 1));
			rc = 1;
		}
	}
	return (rc);
}
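
/*
 * Illustrative summary of trimkcage()'s cases, keyed off the NORELOC
 * state of the memseg's first and last pages (an editorial aid, not in
 * the original source):
 *
 *	first	last	meaning				searchable span
 *	cage	!cage	cage covers lower part		[kcagepfn, hi]
 *	cage	cage	whole mseg in the cage		none (rc == 0)
 *	!cage	cage	cage covers upper part		[lo, kcagepfn]
 *	!cage	!cage	mseg entirely outside cage	[lo, hi]
 *
 * where hi is clamped to pfnhi and mseg->pages_end - 1, and lo to
 * pfnlo and mseg->pages_base, as the code above shows.
 */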

/*
 * called from page_get_contig_pages to search 'pfnlo' thru 'pfnhi' to "claim" a
 * page with size code 'szc'. Claiming such a page requires acquiring
 * exclusive locks on all constituent pages (page_trylock_contig_pages),
 * relocating pages in use and concatenating these constituent pages into a
 * large page.
 *
 * The page lists do not have such a large page and page_freelist_fill has
 * already failed to demote larger pages and/or coalesce smaller free pages.
 *
 * 'flags' may specify PG_MATCH_COLOR, which limits the search to large
 * pages with the same color as 'bin'.
 *
 * 'pfnflag' specifies the subset of the pfn range to search.
 */
static page_t *
page_geti_contig_pages(int mnode, uint_t bin, uchar_t szc, int flags,
    pfn_t pfnlo, pfn_t pfnhi, int pfnflag)
{
	struct memseg *mseg;
	pgcnt_t	szcpgcnt = page_get_pagecnt(szc);
	pgcnt_t szcpgmask = szcpgcnt - 1;
	pfn_t	randpfn;
	page_t *pp, *randpp, *endpp;
	uint_t colors;
	pfn_t hi, lo;
	uint_t skip;

	ASSERT(szc != 0 || (flags & PGI_PGCPSZC0));

	if ((pfnhi - pfnlo) + 1 < szcpgcnt)
		return (NULL);

	ASSERT(szc < mmu_page_sizes);

	colors = (szc) ? page_convert_color(0, szc, page_colors - 1) + 1 :
	    page_colors;

	ASSERT(bin < colors);

	/*
	 * trim the pfn range to search based on pfnflag. pfnflag is set
	 * when there have been previous page_get_contig_pages failures to
	 * limit the search.
	 *
	 * The high bit in pfnflag specifies the number of 'slots' in the
	 * pfn range and the remainder of pfnflag specifies which slot.
	 * For example, a value of 1010b selects zero-based slot 2 of
	 * a pfn range that has been divided into 8 slots.
	 */
	if (pfnflag > 1) {
		int slots = 1 << (highbit(pfnflag) - 1);
		int slotid = pfnflag & (slots - 1);
		pgcnt_t szcpages;
		int slotlen;

		pfnlo = P2ROUNDUP(pfnlo, szcpgcnt);
		pfnhi = pfnhi & ~(szcpgcnt - 1);

		szcpages = ((pfnhi - pfnlo) + 1) / szcpgcnt;
		slotlen = howmany(szcpages, slots);
		pfnlo = pfnlo + (((slotid * slotlen) % szcpages) * szcpgcnt);
		ASSERT(pfnlo < pfnhi);
		if (pfnhi > pfnlo + (slotlen * szcpgcnt))
			pfnhi = pfnlo + (slotlen * szcpgcnt);
	}

	memsegs_lock(0);

	/*
	 * loop through memsegs to look for contig page candidates
	 */
	for (mseg = memsegs; mseg != NULL; mseg = mseg->next) {
		if (pfnhi < mseg->pages_base || pfnlo >= mseg->pages_end) {
			/* no overlap */
			continue;
		}

		if (mseg->pages_end - mseg->pages_base < szcpgcnt)
			/* mseg too small */
			continue;

		/* trim off kernel cage pages from pfn range */
		if (kcage_on) {
			if (trimkcage(mseg, &lo, &hi, pfnlo, pfnhi) == 0)
				continue;
		} else {
			lo = MAX(pfnlo, mseg->pages_base);
			hi = MIN(pfnhi, (mseg->pages_end - 1));
		}

		/* round to szcpgcnt boundaries */
		lo = P2ROUNDUP(lo, szcpgcnt);
		hi = hi & ~(szcpgcnt - 1);

		if (hi <= lo)
			continue;

		/*
		 * set lo to point to the pfn for the desired bin. Large
		 * page sizes may only have a single page color
		 */
		if ((colors > 1) && (flags & PG_MATCH_COLOR)) {
			uint_t	lobin;

			/*
			 * factor in colorequiv to check additional
			 * 'equivalent' bins.
			 */
			if (colorequiv > 1 && colors > colorequiv)
				colors = colors / colorequiv;

			/* determine bin that lo currently points to */
			lobin = (lo & ((szcpgcnt * colors) - 1)) / szcpgcnt;

			/*
			 * set lo to point at appropriate color and set skip
			 * to arrive at the next szc page of the same color.
			 */
			lo += ((bin - lobin) & (colors - 1)) * szcpgcnt;

			skip = colors * szcpgcnt;
		} else {
			/* check all pages starting from lo */
			skip = szcpgcnt;
		}
		if (hi <= lo)
			/* mseg cannot satisfy color request */
			continue;

		/* randomly choose a point between lo and hi to begin search */

		randpfn = (pfn_t)GETTICK();
		randpfn = ((randpfn % (hi - lo)) + lo) & ~(skip - 1);
		randpp = mseg->pages + (randpfn - mseg->pages_base);

		ASSERT(randpp->p_pagenum == randpfn);

		pp = randpp;
		endpp = mseg->pages + (hi - mseg->pages_base);

		ASSERT(randpp + szcpgcnt <= endpp);

		do {
			ASSERT(!(pp->p_pagenum & szcpgmask));
			ASSERT((flags & PG_MATCH_COLOR) == 0 ||
			    colorequiv > 1 ||
			    PP_2_BIN(pp) == bin);
			if (page_trylock_contig_pages(mnode, pp, szc, flags)) {
				/* pages unlocked by page_claim on failure */
				if (page_claim_contig_pages(pp, szc, flags)) {
					memsegs_unlock(0);
					return (pp);
				}
			}

			pp += skip;
			if (pp >= endpp) {
				/* start from the beginning */
				pp = mseg->pages + (lo - mseg->pages_base);
				ASSERT(pp->p_pagenum == lo);
				ASSERT(pp + szcpgcnt <= endpp);
			}
		} while (pp != randpp);
	}
	memsegs_unlock(0);
	return (NULL);
}
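
/*
 * Worked example of the pfnflag slot arithmetic above, with assumed
 * numbers that are not from the original source: pfnflag == 0xA
 * (1010b) gives highbit() == 4, so slots == 1 << 3 == 8 and slotid ==
 * 0xA & 7 == 2. For an aligned range of 4096 szc pages, slotlen ==
 * howmany(4096, 8) == 512, so the search is confined to szc pages
 * [1024, 1536) of the range, i.e. one eighth of it.
 */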

/*
 * controlling routine that searches through physical memory in an attempt to
 * claim a large page based on the input parameters when no such page is
 * available on the page free lists.
 *
 * calls page_geti_contig_pages with an initial pfn range from the mnode
 * and mtype. page_geti_contig_pages will trim off the parts of the pfn range
 * that overlaps with the kernel cage or does not match the requested page
 * color if PG_MATCH_COLOR is set. Since this search is very expensive,
 * page_geti_contig_pages may further limit the search range based on
 * previous failure counts (pgcpfailcnt[]).
 *
 * for PGI_PGCPSZC0 requests, page_get_contig_pages will relocate a base
 * pagesize page that satisfies mtype.
 */
page_t *
page_get_contig_pages(int mnode, uint_t bin, int mtype, uchar_t szc,
    uint_t flags)
{
	pfn_t		pfnlo, pfnhi;	/* contig pages pfn range */
	page_t		*pp;
	int		pfnflag = 0;	/* no limit on search if 0 */

	VM_STAT_ADD(vmm_vmstats.pgcp_alloc[szc]);

	/* LINTED */
	MTYPE_START(mnode, mtype, flags);
	if (mtype < 0) {	/* mnode does not have memory in mtype range */
		VM_STAT_ADD(vmm_vmstats.pgcp_allocempty[szc]);
		return (NULL);
	}

	ASSERT(szc > 0 || (flags & PGI_PGCPSZC0));

	/* do not limit search and ignore color if hi pri */
	if (pgcplimitsearch && ((flags & PGI_PGCPHIPRI) == 0))
		pfnflag = pgcpfailcnt[szc];

	/* remove color match to improve chances */
	if (flags & PGI_PGCPHIPRI || pfnflag)
		flags &= ~PG_MATCH_COLOR;

	do {
		/* get pfn range based on mnode and mtype */
		MNODETYPE_2_PFN(mnode, mtype, pfnlo, pfnhi);

		ASSERT(pfnhi >= pfnlo);

		pp = page_geti_contig_pages(mnode, bin, szc, flags,
		    pfnlo, pfnhi, pfnflag);

		if (pp != NULL) {
			pfnflag = pgcpfailcnt[szc];
			if (pfnflag) {
				/* double the search size */
				pgcpfailcnt[szc] = pfnflag >> 1;
			}
			VM_STAT_ADD(vmm_vmstats.pgcp_allocok[szc]);
			return (pp);
		}
		/* LINTED */
	} while ((flags & PGI_MT_RANGE) &&
	    (MTYPE_NEXT(mnode, mtype, flags) >= 0));

	VM_STAT_ADD(vmm_vmstats.pgcp_allocfailed[szc]);
	return (NULL);
}
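
/*
 * Worked example of the adaptive search limiting above (editorial, with
 * assumed counts): each failure makes the caller increment
 * pgcpfailcnt[szc] (see page_get_freelist() below), and that count is
 * passed down as pfnflag, whose high bit selects the slot count. A
 * count that has grown into 4..7 encodes 8 slots... no: it encodes 4
 * slots, so only a quarter of the pfn range is searched; on the next
 * success pgcpfailcnt[szc] is halved (pfnflag >> 1), doubling the
 * fraction searched the following time.
 */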

/*
 * Find the `best' page on the freelist for this (vp,off) (as,vaddr) pair.
 *
 * Does its own locking and accounting.
 * If PG_MATCH_COLOR is set, then NULL will be returned if there are no
 * pages of the proper color even if there are pages of a different color.
 *
 * Finds a page, removes it, THEN locks it.
 */

/*ARGSUSED*/
page_t *
page_get_freelist(struct vnode *vp, u_offset_t off, struct seg *seg,
    caddr_t vaddr, size_t size, uint_t flags, struct lgrp *lgrp)
{
	struct as	*as = seg->s_as;
	page_t		*pp = NULL;
	ulong_t		bin;
	uchar_t		szc;
	int		mnode;
	int		mtype;
	page_t		*(*page_get_func)(int, uint_t, int, uchar_t, uint_t);
	lgrp_mnode_cookie_t	lgrp_cookie;

	page_get_func = page_get_mnode_freelist;

	/*
	 * If we aren't passed a specific lgroup, or are passed a freed
	 * lgroup, assume we wish to allocate near the current thread's home.
	 */
	if (!LGRP_EXISTS(lgrp))
		lgrp = lgrp_home_lgrp();

	if (kcage_on) {
		if ((flags & (PG_NORELOC | PG_PANIC)) == PG_NORELOC &&
		    kcage_freemem < kcage_throttlefree + btop(size) &&
		    curthread != kcage_cageout_thread) {
			/*
			 * Set a "reserve" of kcage_throttlefree pages for
			 * PG_PANIC and cageout thread allocations.
			 *
			 * Everybody else has to serialize in
			 * page_create_get_something() to get a cage page, so
			 * that we don't deadlock cageout!
			 */
			return (NULL);
		}
	} else {
		flags &= ~PG_NORELOC;
		flags |= PGI_NOCAGE;
	}

	/* LINTED */
	MTYPE_INIT(mtype, vp, vaddr, flags);

	/*
	 * Convert size to page size code.
	 */
	if ((szc = page_szc(size)) == (uchar_t)-1)
		panic("page_get_freelist: illegal page size request");
	ASSERT(szc < mmu_page_sizes);

	VM_STAT_ADD(vmm_vmstats.pgf_alloc[szc]);

	/* LINTED */
	AS_2_BIN(as, seg, vp, vaddr, bin);

	/* bin is for base pagesize color - convert if larger pagesize. */
	if (szc)
		bin = page_convert_color(0, szc, bin);

	/*
	 * Try to get a local page first, but try remote if we can't
	 * get a page of the right color.
	 */
pgretry:
	LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL);
	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
		pp = page_get_func(mnode, bin, mtype, szc, flags);
		if (pp != NULL) {
			VM_STAT_ADD(vmm_vmstats.pgf_allocok[szc]);
			DTRACE_PROBE4(page__get,
			    lgrp_t *, lgrp,
			    int, mnode,
			    ulong_t, bin,
			    uint_t, flags);
			return (pp);
		}
	}
	ASSERT(pp == NULL);

	/*
	 * for non-SZC0 PAGESIZE requests, check cachelist before checking
	 * remote free lists. Caller expected to call page_get_cachelist which
	 * will check local cache lists and remote free lists.
	 */
	if (szc == 0 && ((flags & PGI_PGCPSZC0) == 0)) {
		VM_STAT_ADD(vmm_vmstats.pgf_allocdeferred);
		return (NULL);
	}

	ASSERT(szc > 0 || (flags & PGI_PGCPSZC0));

	lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);

	/*
	 * Try to get a non-local freelist page.
	 */
	LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie);
	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
		pp = page_get_func(mnode, bin, mtype, szc, flags);
		if (pp != NULL) {
			DTRACE_PROBE4(page__get,
			    lgrp_t *, lgrp,
			    int, mnode,
			    ulong_t, bin,
			    uint_t, flags);
			VM_STAT_ADD(vmm_vmstats.pgf_allocokrem[szc]);
			return (pp);
		}
	}

	ASSERT(pp == NULL);

	/*
	 * when the cage is off chances are page_get_contig_pages() will fail
	 * to lock a large page chunk therefore when the cage is off it's not
	 * called by default. this can be changed via /etc/system.
	 *
	 * page_get_contig_pages() also called to acquire a base pagesize page
	 * for page_create_get_something().
	 */
	if (!(flags & PG_NORELOC) && (pg_contig_disable == 0) &&
	    (kcage_on || pg_lpgcreate_nocage || szc == 0) &&
	    (page_get_func != page_get_contig_pages)) {

		VM_STAT_ADD(vmm_vmstats.pgf_allocretry[szc]);
		page_get_func = page_get_contig_pages;
		goto pgretry;
	}

	if (pgcplimitsearch && page_get_func == page_get_contig_pages)
		pgcpfailcnt[szc]++;

	VM_STAT_ADD(vmm_vmstats.pgf_allocfailed[szc]);
	return (NULL);
}
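
/*
 * Hypothetical caller sketch, not in the original source: how a client
 * might request a 64K page near 'seg'/'vaddr' and fall back to
 * PAGESIZE. The 64K size is an assumption; page_szc() panics for sizes
 * the platform does not support. A NULL lgrp means "use the current
 * thread's home lgroup", per the comment above.
 */
#ifdef	PAGELIST_EXAMPLES
static page_t *
alloc_near_demo(struct seg *seg, caddr_t vaddr, struct vnode *vp,
    u_offset_t off)
{
	page_t *pp;

	pp = page_get_freelist(vp, off, seg, vaddr, 0x10000,
	    PG_MATCH_COLOR, NULL);
	if (pp == NULL)
		pp = page_get_freelist(vp, off, seg, vaddr, PAGESIZE,
		    0, NULL);
	return (pp);
}
#endif	/* PAGELIST_EXAMPLES */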

/*
 * Find the `best' page on the cachelist for this (vp,off) (as,vaddr) pair.
 *
 * Does its own locking.
 * If PG_MATCH_COLOR is set, then NULL will be returned if there are no
 * pages of the proper color even if there are pages of a different color.
 * Otherwise, scan the bins for ones with pages. For each bin with pages,
 * try to lock one of them. If no page can be locked, try the
 * next bin. Return NULL if a page can not be found and locked.
 *
 * Finds a page, tries to lock it, then removes it.
 */

/*ARGSUSED*/
page_t *
page_get_cachelist(struct vnode *vp, u_offset_t off, struct seg *seg,
    caddr_t vaddr, uint_t flags, struct lgrp *lgrp)
{
	page_t		*pp;
	struct as	*as = seg->s_as;
	ulong_t		bin;
	/*LINTED*/
	int		mnode;
	int		mtype;
	lgrp_mnode_cookie_t	lgrp_cookie;

	/*
	 * If we aren't passed a specific lgroup, or are passed a freed
	 * lgroup, assume we wish to allocate near the current thread's home.
	 */
	if (!LGRP_EXISTS(lgrp))
		lgrp = lgrp_home_lgrp();

	if (!kcage_on) {
		flags &= ~PG_NORELOC;
		flags |= PGI_NOCAGE;
	}

	if ((flags & (PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == PG_NORELOC &&
	    kcage_freemem <= kcage_throttlefree) {
		/*
		 * Reserve kcage_throttlefree pages for critical kernel
		 * threads.
		 *
		 * Everybody else has to go to page_create_get_something()
		 * to get a cage page, so we don't deadlock cageout.
		 */
		return (NULL);
	}

	/* LINTED */
	AS_2_BIN(as, seg, vp, vaddr, bin);

	ASSERT(bin <= page_colors_mask);

	/* LINTED */
	MTYPE_INIT(mtype, vp, vaddr, flags);

	VM_STAT_ADD(vmm_vmstats.pgc_alloc);

	/*
	 * Try local cachelists first
	 */
	LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL);
	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
		pp = page_get_mnode_cachelist(bin, flags, mnode, mtype);
		if (pp != NULL) {
			VM_STAT_ADD(vmm_vmstats.pgc_allocok);
			DTRACE_PROBE4(page__get,
			    lgrp_t *, lgrp,
			    int, mnode,
			    ulong_t, bin,
			    uint_t, flags);
			return (pp);
		}
	}

	lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);

	/*
	 * Try freelists/cachelists that are farther away
	 * This is our only chance to allocate remote pages for PAGESIZE
	 * requests.
	 */
	LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie);
	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
		pp = page_get_mnode_freelist(mnode, bin, mtype,
		    0, flags);
		if (pp != NULL) {
			VM_STAT_ADD(vmm_vmstats.pgc_allocokdeferred);
			DTRACE_PROBE4(page__get,
			    lgrp_t *, lgrp,
			    int, mnode,
			    ulong_t, bin,
			    uint_t, flags);
			return (pp);
		}
		pp = page_get_mnode_cachelist(bin, flags, mnode, mtype);
		if (pp != NULL) {
			VM_STAT_ADD(vmm_vmstats.pgc_allocokrem);
			DTRACE_PROBE4(page__get,
			    lgrp_t *, lgrp,
			    int, mnode,
			    ulong_t, bin,
			    uint_t, flags);
			return (pp);
		}
	}

	VM_STAT_ADD(vmm_vmstats.pgc_allocfailed);
	return (NULL);
}
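
/*
 * Illustrative sketch only, not in the original source: cachelist
 * pages still carry vnode identity, so code that repurposes one as an
 * anonymous free page must strip that identity first, exactly as
 * page_get_replacement_page() below does. Hypothetical helper name.
 */
#ifdef	PAGELIST_EXAMPLES
static void
cachepage_to_aged_demo(page_t *pp)
{
	ASSERT(pp->p_vnode != NULL);	/* came from a cachelist */
	page_hashout(pp, NULL);		/* drop the (vnode, offset) naming */
	PP_SETAGED(pp);			/* now a free, anonymous page */
}
#endif	/* PAGELIST_EXAMPLES */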

page_t *
page_get_mnode_cachelist(uint_t bin, uint_t flags, int mnode, int mtype)
{
	kmutex_t	*pcm;
	int		i;
	page_t		*pp;
	page_t		*first_pp;
	uint_t		bin_marker;
	int		nwaybins, nwaycnt;
	int		cpucolors;

	VM_STAT_ADD(vmm_vmstats.pgmc_alloc);

	/* LINTED */
	MTYPE_START(mnode, mtype, flags);
	if (mtype < 0) {	/* mnode does not have memory in mtype range */
		VM_STAT_ADD(vmm_vmstats.pgmc_allocempty);
		return (NULL);
	}

	nwaybins = 0;
	cpucolors = cpu_page_colors;
	/*
	 * adjust cpucolors to possibly check additional 'equivalent' bins
	 * to try to minimize fragmentation of large pages by delaying calls
	 * to page_freelist_fill.
	 */
	if (colorequiv > 1) {
		int equivcolors = page_colors / colorequiv;

		if (equivcolors && (cpucolors == 0 || equivcolors < cpucolors))
			cpucolors = equivcolors;
	}

	/*
	 * Only hold one cachelist lock at a time, that way we
	 * can start anywhere and not have to worry about lock
	 * ordering.
	 */
big_try_again:
	nwaycnt = 0;
	for (i = 0; i <= page_colors; i++) {
		if (PAGE_CACHELISTS(mnode, bin, mtype)) {
			pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST);
			mutex_enter(pcm);
			pp = PAGE_CACHELISTS(mnode, bin, mtype);
			if (pp != NULL) {
				first_pp = pp;
				ASSERT(pp->p_vnode);
				ASSERT(PP_ISAGED(pp) == 0);
				ASSERT(pp->p_szc == 0);
				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
				while (!page_trylock(pp, SE_EXCL)) {
					pp = pp->p_next;
					ASSERT(pp->p_szc == 0);
					if (pp == first_pp) {
						/*
						 * We have searched the
						 * complete list!
						 * And all of them (might
						 * only be one) are locked.
						 * This can happen since
						 * these pages can also be
						 * found via the hash list.
						 * When found via the hash
						 * list, they are locked
						 * first, then removed.
						 * We give up to let the
						 * other thread run.
						 */
						pp = NULL;
						break;
					}
					ASSERT(pp->p_vnode);
					ASSERT(PP_ISFREE(pp));
					ASSERT(PP_ISAGED(pp) == 0);
					ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) ==
					    mnode);
				}

				if (pp) {
					page_t	**ppp;
					/*
					 * Found and locked a page.
					 * Pull it off the list.
					 */
					ASSERT(mtype == PP_2_MTYPE(pp));
					ppp = &PAGE_CACHELISTS(mnode, bin,
					    mtype);
					page_sub(ppp, pp);
					/*
					 * Subtract counters before releasing
					 * pcm mutex to avoid a race with
					 * page_freelist_coalesce and
					 * page_freelist_fill.
					 */
					page_ctr_sub(pp, PG_CACHE_LIST);
					mutex_exit(pcm);
					ASSERT(pp->p_vnode);
					ASSERT(PP_ISAGED(pp) == 0);
#if defined(__sparc)
					ASSERT(!kcage_on ||
					    (flags & PG_NORELOC) == 0 ||
					    PP_ISNORELOC(pp));
					if (PP_ISNORELOC(pp)) {
						kcage_freemem_sub(1);
					}
#endif
					VM_STAT_ADD(vmm_vmstats.
					    pgmc_allocok);
					return (pp);
				}
			}
			mutex_exit(pcm);
		}

		/*
		 * Wow! The initial bin is empty or no page in the bin could
		 * be locked.
		 *
		 * If specific color is needed, check if page color may be in
		 * other bins.
		 */
		if ((flags & PG_MATCH_COLOR) && (cpucolors != 0)) {
			if (!nwaybins) {
				if (cpucolors < 0) {
					cpucolors = CPUSETSIZE() / MMU_PAGESIZE;
					ASSERT(cpucolors > 0);
					nwaybins = page_colors / cpucolors;
					if (nwaybins < 2)
						cpucolors = 0;
				} else {
					nwaybins = page_colors / cpucolors;
					ASSERT(nwaybins > 1);
				}
			}

			if (++nwaycnt >= nwaybins) {
				break;
			}
			bin = (bin + (page_colors / nwaybins)) &
			    page_colors_mask;
			continue;
		}

		if (i == 0) {
			bin = (bin + BIN_STEP) & page_colors_mask;
			bin_marker = bin;
		} else {
			bin = (bin + vac_colors) & page_colors_mask;
			if (bin == bin_marker) {
				bin = (bin + 1) & page_colors_mask;
				bin_marker = bin;
			}
		}
	}

#if defined(__sparc)
	if (!(flags & (PG_NORELOC | PGI_NOCAGE | PGI_RELOCONLY)) &&
	    (kcage_freemem >= kcage_lotsfree)) {
		/*
		 * The Cage is ON and with plenty of free mem, and
		 * we're willing to check for a NORELOC page if we
		 * couldn't find a RELOC page, so spin again.
		 */
		flags |= PG_NORELOC;
		mtype = MTYPE_NORELOC;
		goto big_try_again;
	}
#else
	if (flags & PGI_MT_RANGE) {
		MTYPE_NEXT(mnode, mtype, flags);
		if (mtype >= 0)
			goto big_try_again;
	}
#endif
	VM_STAT_ADD(vmm_vmstats.pgmc_allocfailed);
	return (NULL);
}

#ifdef DEBUG
#define	REPL_PAGE_STATS
#endif /* DEBUG */

#ifdef REPL_PAGE_STATS
struct repl_page_stats {
	uint_t	ngets;
	uint_t	ngets_noreloc;
	uint_t	npgr_noreloc;
	uint_t	nnopage_first;
	uint_t	nnopage;
	uint_t	nhashout;
	uint_t	nnofree;
	uint_t	nnext_pp;
} repl_page_stats;
#define	REPL_STAT_INCR(v)	atomic_add_32(&repl_page_stats.v, 1)
#else /* REPL_PAGE_STATS */
#define	REPL_STAT_INCR(v)
#endif /* REPL_PAGE_STATS */

int	pgrppgcp;
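
/*
 * Usage note: REPL_STAT_INCR() compiles to an atomic_add_32() only when
 * REPL_PAGE_STATS is defined, i.e. on DEBUG kernels; on non-DEBUG
 * kernels the macro expands to nothing, so the counter updates below
 * cost nothing in production builds.
 */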

/*
 * The freemem accounting must be done by the caller.
 * First we try to get a replacement page of the same size as like_pp,
 * if that is not possible, then we just get a set of discontiguous
 * PAGESIZE pages.
 */
page_t *
page_get_replacement_page(page_t *orig_like_pp, struct lgrp *lgrp_target,
    uint_t pgrflags)
{
	page_t		*like_pp;
	page_t		*pp, *pplist;
	page_t		*pl = NULL;
	ulong_t		bin;
	int		mnode, page_mnode;
	int		szc;
	spgcnt_t	npgs, pg_cnt;
	pfn_t		pfnum;
	int		mtype;
	int		flags = 0;
	lgrp_mnode_cookie_t	lgrp_cookie;
	lgrp_t		*lgrp;

	REPL_STAT_INCR(ngets);
	like_pp = orig_like_pp;
	ASSERT(PAGE_EXCL(like_pp));

	szc = like_pp->p_szc;
	npgs = page_get_pagecnt(szc);
	/*
	 * Now we reset like_pp to the base page_t.
	 * That way, we won't walk past the end of this 'szc' page.
	 */
	pfnum = PFN_BASE(like_pp->p_pagenum, szc);
	like_pp = page_numtopp_nolock(pfnum);
	ASSERT(like_pp->p_szc == szc);

	if (PP_ISNORELOC(like_pp)) {
		ASSERT(kcage_on);
		REPL_STAT_INCR(ngets_noreloc);
		flags = PGI_RELOCONLY;
	} else if (pgrflags & PGR_NORELOC) {
		ASSERT(kcage_on);
		REPL_STAT_INCR(npgr_noreloc);
		flags = PG_NORELOC;
	}

	/*
	 * Kernel pages must always be replaced with the same size
	 * pages, since we cannot properly handle demotion of kernel
	 * pages.
	 */
	if (like_pp->p_vnode == &kvp)
		pgrflags |= PGR_SAMESZC;

	/* LINTED */
	MTYPE_PGR_INIT(mtype, flags, like_pp, page_mnode);

	while (npgs) {
		pplist = NULL;
		for (;;) {
			pg_cnt = page_get_pagecnt(szc);
			bin = PP_2_BIN(like_pp);
			ASSERT(like_pp->p_szc == orig_like_pp->p_szc);
			ASSERT(pg_cnt <= npgs);

			/*
			 * If an lgroup was specified, try to get the
			 * page from that lgroup.
			 * NOTE: Must be careful with code below because
			 * lgroup may disappear and reappear since there
			 * is no locking for lgroup here.
			 */
			if (LGRP_EXISTS(lgrp_target)) {
				/*
				 * Keep local variable for lgroup separate
				 * from lgroup argument since this code should
				 * only be exercised when lgroup argument
				 * exists....
				 */
				lgrp = lgrp_target;

				/* Try the lgroup's freelists first */
				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
				    LGRP_SRCH_LOCAL);
				while ((pplist == NULL) &&
				    (mnode = lgrp_memnode_choose(&lgrp_cookie))
				    != -1) {
					pplist = page_get_mnode_freelist(
					    mnode, bin, mtype, szc,
					    flags);
				}

				/*
				 * Now try its cachelists if this is a
				 * small page. Don't need to do it for
				 * larger ones since page_freelist_coalesce()
				 * already failed.
				 */
				if (pplist != NULL || szc != 0)
					break;

				/* Now try its cachelists */
				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
				    LGRP_SRCH_LOCAL);

				while ((pplist == NULL) &&
				    (mnode = lgrp_memnode_choose(&lgrp_cookie))
				    != -1) {
					pplist = page_get_mnode_cachelist(
					    bin, flags, mnode, mtype);
				}
				if (pplist != NULL) {
					page_hashout(pplist, NULL);
					PP_SETAGED(pplist);
					REPL_STAT_INCR(nhashout);
					break;
				}
				/* Done looking in this lgroup. Bail out. */
				break;
			}

			/*
			 * No lgroup was specified (or the lgroup was removed
			 * by DR), so just try to get the page as close to
			 * like_pp's mnode as possible.
			 * First try the local freelist...
			 */
			mnode = PP_2_MEM_NODE(like_pp);
			pplist = page_get_mnode_freelist(mnode, bin,
			    mtype, szc, flags);
			if (pplist != NULL)
				break;

			REPL_STAT_INCR(nnofree);

			/*
			 * ...then the local cachelist. Don't need to do it
			 * for larger pages because page_freelist_coalesce()
			 * already failed there anyway.
			 */
			if (szc == 0) {
				pplist = page_get_mnode_cachelist(bin, flags,
				    mnode, mtype);
				if (pplist != NULL) {
					page_hashout(pplist, NULL);
					PP_SETAGED(pplist);
					REPL_STAT_INCR(nhashout);
					break;
				}
			}

			/* Now try remote freelists */
			page_mnode = mnode;
			lgrp =
			    lgrp_hand_to_lgrp(MEM_NODE_2_LGRPHAND(page_mnode));
			LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
			    LGRP_SRCH_HIER);
			while (pplist == NULL &&
			    (mnode = lgrp_memnode_choose(&lgrp_cookie))
			    != -1) {
				/*
				 * Skip local mnode.
				 */
				if ((mnode == page_mnode) ||
				    (mem_node_config[mnode].exists == 0))
					continue;

				pplist = page_get_mnode_freelist(mnode,
				    bin, mtype, szc, flags);
			}

			if (pplist != NULL)
				break;


			/* Now try remote cachelists */
			LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
			    LGRP_SRCH_HIER);
			while (pplist == NULL && szc == 0) {
				mnode = lgrp_memnode_choose(&lgrp_cookie);
				if (mnode == -1)
					break;
				/*
				 * Skip local mnode.
				 */
				if ((mnode == page_mnode) ||
				    (mem_node_config[mnode].exists == 0))
					continue;

				pplist = page_get_mnode_cachelist(bin,
				    flags, mnode, mtype);

				if (pplist != NULL) {
					page_hashout(pplist, NULL);
					PP_SETAGED(pplist);
					REPL_STAT_INCR(nhashout);
					break;
				}
			}

			/*
			 * Break out of while loop under the following cases:
			 * - If we successfully got a page.
			 * - If pgrflags specified only returning a specific
			 *   page size and we could not find that page size.
			 * - If we could not satisfy the request with PAGESIZE
			 *   or larger pages.
			 */
			if (pplist != NULL || szc == 0)
				break;

			if ((pgrflags & PGR_SAMESZC) || pgrppgcp) {
				/* try to find contig page */

				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
				    LGRP_SRCH_HIER);

				while ((pplist == NULL) &&
				    (mnode =
				    lgrp_memnode_choose(&lgrp_cookie))
				    != -1) {
					pplist = page_get_contig_pages(
					    mnode, bin, mtype, szc,
					    flags | PGI_PGCPHIPRI);
				}
				break;
			}

			/*
			 * The correct thing to do here is try the next
			 * page size down using szc--. Due to a bug
			 * with the processing of HAT_RELOAD_SHARE
			 * where the sfmmu_ttecnt arrays of all
			 * hats sharing an ISM segment don't get updated,
			 * using intermediate size pages for relocation
			 * can lead to continuous page faults.
			 */
			szc = 0;
		}

		if (pplist != NULL) {
			DTRACE_PROBE4(page__get,
			    lgrp_t *, lgrp,
			    int, mnode,
			    ulong_t, bin,
			    uint_t, flags);

			while (pplist != NULL && pg_cnt--) {
				ASSERT(pplist != NULL);
				pp = pplist;
				page_sub(&pplist, pp);
				PP_CLRFREE(pp);
				PP_CLRAGED(pp);
				page_list_concat(&pl, &pp);
				npgs--;
				like_pp = like_pp + 1;
				REPL_STAT_INCR(nnext_pp);
			}
			ASSERT(pg_cnt == 0);
		} else {
			break;
		}
	}

	if (npgs) {
		/*
		 * We were unable to allocate the necessary number
		 * of pages.
		 * We need to free up any pl.
		 */
		REPL_STAT_INCR(nnopage);
		page_free_replacement_page(pl);
		return (NULL);
	} else {
		return (pl);
	}
}

/*
 * Demote a free large page to its constituent pages.
 */
void
page_demote_free_pages(page_t *pp)
{
	int mnode;

	ASSERT(pp != NULL);
	ASSERT(PAGE_LOCKED(pp));
	ASSERT(PP_ISFREE(pp));
	ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);

	mnode = PP_2_MEM_NODE(pp);
	page_freelist_lock(mnode);
	if (pp->p_szc != 0) {
		(void) page_demote(mnode, PFN_BASE(pp->p_pagenum,
		    pp->p_szc), pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
	}
	page_freelist_unlock(mnode);
	ASSERT(pp->p_szc == 0);
}
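
/*
 * Worked example with assumed numbers, not from the original source:
 * PFN_BASE() masks a constituent pfn down to the start of its large
 * page. On a system with 8K base pages and a 4M page of 512
 * constituents, a constituent pfn of 0x12345 yields base pfn
 * 0x12345 & ~511 == 0x12200, which is the starting point handed to
 * page_demote() above.
 */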