1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ 28 /* All Rights Reserved */ 29 30 /* 31 * University Copyright- Copyright (c) 1982, 1986, 1988 32 * The Regents of the University of California 33 * All Rights Reserved 34 * 35 * University Acknowledgment- Portions of this document are derived from 36 * software developed by the University of California, Berkeley, and its 37 * contributors. 38 */ 39 40 #pragma ident "%Z%%M% %I% %E% SMI" 41 42 /* 43 * VM - physical page management. 44 */ 45 46 #include <sys/types.h> 47 #include <sys/t_lock.h> 48 #include <sys/param.h> 49 #include <sys/systm.h> 50 #include <sys/errno.h> 51 #include <sys/time.h> 52 #include <sys/vnode.h> 53 #include <sys/vm.h> 54 #include <sys/vtrace.h> 55 #include <sys/swap.h> 56 #include <sys/cmn_err.h> 57 #include <sys/tuneable.h> 58 #include <sys/sysmacros.h> 59 #include <sys/cpuvar.h> 60 #include <sys/callb.h> 61 #include <sys/debug.h> 62 #include <sys/tnf_probe.h> 63 #include <sys/condvar_impl.h> 64 #include <sys/mem_config.h> 65 #include <sys/mem_cage.h> 66 #include <sys/kmem.h> 67 #include <sys/atomic.h> 68 #include <sys/strlog.h> 69 #include <sys/mman.h> 70 #include <sys/ontrap.h> 71 #include <sys/lgrp.h> 72 #include <sys/vfs.h> 73 74 #include <vm/hat.h> 75 #include <vm/anon.h> 76 #include <vm/page.h> 77 #include <vm/seg.h> 78 #include <vm/pvn.h> 79 #include <vm/seg_kmem.h> 80 #include <vm/vm_dep.h> 81 82 #include <fs/fs_subr.h> 83 84 static int nopageage = 0; 85 86 static pgcnt_t max_page_get; /* max page_get request size in pages */ 87 pgcnt_t total_pages = 0; /* total number of pages (used by /proc) */ 88 89 /* 90 * vnode for all pages which are retired from the VM system; 91 * such as pages with Uncorrectable Errors. 92 */ 93 struct vnode retired_ppages; 94 95 static void page_retired_init(void); 96 static void retired_dispose(vnode_t *vp, page_t *pp, int flag, 97 int dn, cred_t *cr); 98 static void retired_inactive(vnode_t *vp, cred_t *cr); 99 static void page_retired(page_t *pp); 100 static void retired_page_removed(page_t *pp); 101 void page_unretire_pages(void); 102 103 /* 104 * The maximum number of pages that will be unretired in one iteration. 105 * This number is totally arbitrary. 106 */ 107 #define UNRETIRE_PAGES 256 108 109 /* 110 * We limit the number of pages that may be retired to 111 * a percentage of the total physical memory. Note that 112 * the percentage values are stored as 'basis points', 113 * ie, 100 basis points is 1%. 
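 *
 * As a concrete illustration of the basis-point arithmetic (a
 * standalone sketch, not part of this file; the helper name and its
 * parameters are hypothetical):
 *
 *	// With the default of 10 basis points, a system with
 *	// 1,000,000 physical pages could retire at most
 *	// (1000000 * 10) / 10000 = 1000 pages.
 *	static pgcnt_t
 *	example_retire_limit(pgcnt_t phys_pages, uint64_t bps)
 *	{
 *		return ((pgcnt_t)((phys_pages * bps) / 10000));
 *	}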
 */
#define	MAX_PAGES_RETIRED_BPS_DEFAULT	10	/* .1% */

uint64_t max_pages_retired_bps = MAX_PAGES_RETIRED_BPS_DEFAULT;

static int pages_retired_limit_exceeded(void);

/*
 * operations vector for vnode with retired pages. Only VOP_DISPOSE
 * and VOP_INACTIVE are intercepted.
 */
struct vnodeops retired_vnodeops = {
	"retired_vnodeops",
	fs_nosys,	/* open */
	fs_nosys,	/* close */
	fs_nosys,	/* read */
	fs_nosys,	/* write */
	fs_nosys,	/* ioctl */
	fs_nosys,	/* setfl */
	fs_nosys,	/* getattr */
	fs_nosys,	/* setattr */
	fs_nosys,	/* access */
	fs_nosys,	/* lookup */
	fs_nosys,	/* create */
	fs_nosys,	/* remove */
	fs_nosys,	/* link */
	fs_nosys,	/* rename */
	fs_nosys,	/* mkdir */
	fs_nosys,	/* rmdir */
	fs_nosys,	/* readdir */
	fs_nosys,	/* symlink */
	fs_nosys,	/* readlink */
	fs_nosys,	/* fsync */
	retired_inactive,
	fs_nosys,	/* fid */
	fs_rwlock,	/* rwlock */
	fs_rwunlock,	/* rwunlock */
	fs_nosys,	/* seek */
	fs_nosys,	/* cmp */
	fs_nosys,	/* frlock */
	fs_nosys,	/* space */
	fs_nosys,	/* realvp */
	fs_nosys,	/* getpage */
	fs_nosys,	/* putpage */
	fs_nosys_map,
	fs_nosys_addmap,
	fs_nosys,	/* delmap */
	fs_nosys_poll,
	fs_nosys,	/* dump */
	fs_nosys,	/* l_pathconf */
	fs_nosys,	/* pageio */
	fs_nosys,	/* dumpctl */
	retired_dispose,
	fs_nosys,	/* setsecattr */
	fs_nosys,	/* getsecattr */
	fs_nosys,	/* shrlock */
	fs_vnevent_nosupport	/* vnevent */
};

/*
 * freemem_lock protects all freemem variables:
 * availrmem. Also this lock protects the globals which track the
 * availrmem changes for accurate kernel footprint calculation.
 * See below for an explanation of these
 * globals.
 */
kmutex_t freemem_lock;
pgcnt_t availrmem;
pgcnt_t availrmem_initial;

/*
 * These globals track availrmem changes to get a more accurate
 * estimate of the kernel size. Historically pp_kernel is used for
 * kernel size and is based on availrmem. But availrmem is adjusted for
 * locked pages in the system not just for kernel locked pages.
 * These new counters will track the pages locked through segvn and
 * by explicit user locking.
 *
 * segvn_pages_locked : This keeps track on a global basis how many pages
 * are currently locked because of I/O.
 *
 * pages_locked : How many pages are locked because of user specified
 * locking through mlock or plock.
 *
 * pages_useclaim,pages_claimed : These two variables track the
 * claim adjustments because of the protection changes on a segvn segment.
 *
 * All these globals are protected by the same lock which protects availrmem.
 */
pgcnt_t segvn_pages_locked;
pgcnt_t pages_locked;
pgcnt_t pages_useclaim;
pgcnt_t pages_claimed;


/*
 * new_freemem_lock protects freemem, freemem_wait & freemem_cv.
 */
static kmutex_t	new_freemem_lock;
static uint_t	freemem_wait;	/* someone waiting for freemem */
static kcondvar_t freemem_cv;

/*
 * The logical page free list is maintained as two lists, the 'free'
 * and the 'cache' lists.
 * The free list contains those pages that should be reused first.
 *
 * The implementation of the lists is machine dependent.
 * page_get_freelist(), page_get_cachelist(),
 * page_list_sub(), and page_list_add()
 * form the interface to the machine dependent implementation.
 *
 * Pages with p_free set are on the cache list.
 * Pages with p_free and p_age set are on the free list.
 *
 * A page may be locked while on either list.
 */

/*
 * free list accounting stuff.
 *
 *
 * Spread out the value for the number of pages on the
 * page free and page cache lists.  If there is just one
 * value, then it must be under just one lock.
 * The lock contention and cache traffic are a real bother.
 *
 * When we acquire and then drop a single pcf lock
 * we can start in the middle of the array of pcf structures.
 * If we acquire more than one pcf lock at a time, we need to
 * start at the front to avoid deadlocking.
 *
 * pcf_count holds the number of pages in each pool.
 *
 * pcf_block is set when page_create_get_something() has asked the
 * PSM page freelist and page cachelist routines without specifying
 * a color and nothing came back.  This is used to block anything
 * else from moving pages from one list to the other while the
 * lists are searched again.  If a page is freed while pcf_block is
 * set, then pcf_reserve is incremented.  pcgs_unblock() takes care
 * of clearing pcf_block, doing the wakeups, etc.
 */

#if NCPU <= 4
#define	PAD	1
#define	PCF_FANOUT	4
static	uint_t	pcf_mask = PCF_FANOUT - 1;
#else
#define	PAD	9
#ifdef	sun4v
#define	PCF_FANOUT	32
#else
#define	PCF_FANOUT	128
#endif
static	uint_t	pcf_mask = PCF_FANOUT - 1;
#endif

struct pcf {
	uint_t		pcf_touch;	/* just to help the cache */
	uint_t		pcf_count;	/* page count */
	kmutex_t	pcf_lock;	/* protects the structure */
	uint_t		pcf_wait;	/* number of waiters */
	uint_t		pcf_block;	/* pcgs flag to page_free() */
	uint_t		pcf_reserve;	/* pages freed after pcf_block set */
	uint_t		pcf_fill[PAD];	/* to line up on the caches */
};

static struct	pcf	pcf[PCF_FANOUT];
#define	PCF_INDEX()	((CPU->cpu_id) & (pcf_mask))

kmutex_t	pcgs_lock;		/* serializes page_create_get_ */
kmutex_t	pcgs_cagelock;		/* serializes NOSLEEP cage allocs */
kmutex_t	pcgs_wait_lock;		/* used for delay in pcgs */
static kcondvar_t	pcgs_cv;	/* cv for delay in pcgs */

#define	PAGE_LOCK_MAXIMUM \
	((1 << (sizeof (((page_t *)0)->p_lckcnt) * NBBY)) - 1)

/*
 * Control over the verbosity of page retirement. When set to zero, no messages
 * will be printed. A value of one will trigger messages for retirement
 * operations, and is intended for processors which don't yet support FMA
 * (spitfire). Two will cause verbose messages to be printed when retirements
 * complete, and is intended only for debugging purposes.
 */
int page_retire_messages = 0;

#ifdef VM_STATS

/*
 * No locks, but so what, they are only statistics.
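 *
 * (These counters are updated without any synchronization, so a
 * concurrent increment can occasionally be lost; that only skews the
 * numbers slightly, which is tolerable for debug statistics.  Compare
 * MEMSEG_STAT_INCR further down, which bumps its counters with
 * atomic_add_32() instead.)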
305 */ 306 307 static struct page_tcnt { 308 int pc_free_cache; /* free's into cache list */ 309 int pc_free_dontneed; /* free's with dontneed */ 310 int pc_free_pageout; /* free's from pageout */ 311 int pc_free_free; /* free's into free list */ 312 int pc_free_pages; /* free's into large page free list */ 313 int pc_destroy_pages; /* large page destroy's */ 314 int pc_get_cache; /* get's from cache list */ 315 int pc_get_free; /* get's from free list */ 316 int pc_reclaim; /* reclaim's */ 317 int pc_abortfree; /* abort's of free pages */ 318 int pc_find_hit; /* find's that find page */ 319 int pc_find_miss; /* find's that don't find page */ 320 int pc_destroy_free; /* # of free pages destroyed */ 321 #define PC_HASH_CNT (4*PAGE_HASHAVELEN) 322 int pc_find_hashlen[PC_HASH_CNT+1]; 323 int pc_addclaim_pages; 324 int pc_subclaim_pages; 325 int pc_free_replacement_page[2]; 326 int pc_try_demote_pages[6]; 327 int pc_demote_pages[2]; 328 } pagecnt; 329 330 uint_t hashin_count; 331 uint_t hashin_not_held; 332 uint_t hashin_already; 333 334 uint_t hashout_count; 335 uint_t hashout_not_held; 336 337 uint_t page_create_count; 338 uint_t page_create_not_enough; 339 uint_t page_create_not_enough_again; 340 uint_t page_create_zero; 341 uint_t page_create_hashout; 342 uint_t page_create_page_lock_failed; 343 uint_t page_create_trylock_failed; 344 uint_t page_create_found_one; 345 uint_t page_create_hashin_failed; 346 uint_t page_create_dropped_phm; 347 348 uint_t page_create_new; 349 uint_t page_create_exists; 350 uint_t page_create_putbacks; 351 uint_t page_create_overshoot; 352 353 uint_t page_reclaim_zero; 354 uint_t page_reclaim_zero_locked; 355 356 uint_t page_rename_exists; 357 uint_t page_rename_count; 358 359 uint_t page_lookup_cnt[20]; 360 uint_t page_lookup_nowait_cnt[10]; 361 uint_t page_find_cnt; 362 uint_t page_exists_cnt; 363 uint_t page_exists_forreal_cnt; 364 uint_t page_lookup_dev_cnt; 365 uint_t get_cachelist_cnt; 366 uint_t page_create_cnt[10]; 367 uint_t alloc_pages[8]; 368 uint_t page_exphcontg[19]; 369 uint_t page_create_large_cnt[10]; 370 371 /* 372 * Collects statistics. 
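 *
 * The macro below is the statistics-gathering variant of a plain
 * chained hash walk.  Reduced to a standalone sketch (illustrative
 * stand-ins for page_t, p_hash and page_hash[]; not the real types):
 *
 *	struct xvnode;
 *	struct xpage {
 *		struct xvnode	*x_vnode;
 *		unsigned long long x_offset;
 *		struct xpage	*x_hash;	// next page on this chain
 *	};
 *
 *	struct xpage *
 *	hash_search(struct xpage *chain, struct xvnode *vp,
 *	    unsigned long long off)
 *	{
 *		struct xpage *pp;
 *
 *		for (pp = chain; pp != NULL; pp = pp->x_hash)
 *			if (pp->x_vnode == vp && pp->x_offset == off)
 *				break;
 *		return (pp);			// NULL on a miss
 *	}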
373 */ 374 #define PAGE_HASH_SEARCH(index, pp, vp, off) { \ 375 uint_t mylen = 0; \ 376 \ 377 for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash, mylen++) { \ 378 if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \ 379 break; \ 380 } \ 381 if ((pp) != NULL) \ 382 pagecnt.pc_find_hit++; \ 383 else \ 384 pagecnt.pc_find_miss++; \ 385 if (mylen > PC_HASH_CNT) \ 386 mylen = PC_HASH_CNT; \ 387 pagecnt.pc_find_hashlen[mylen]++; \ 388 } 389 390 #else /* VM_STATS */ 391 392 /* 393 * Don't collect statistics 394 */ 395 #define PAGE_HASH_SEARCH(index, pp, vp, off) { \ 396 for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash) { \ 397 if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \ 398 break; \ 399 } \ 400 } 401 402 #endif /* VM_STATS */ 403 404 405 406 #ifdef DEBUG 407 #define MEMSEG_SEARCH_STATS 408 #endif 409 410 #ifdef MEMSEG_SEARCH_STATS 411 struct memseg_stats { 412 uint_t nsearch; 413 uint_t nlastwon; 414 uint_t nhashwon; 415 uint_t nnotfound; 416 } memseg_stats; 417 418 #define MEMSEG_STAT_INCR(v) \ 419 atomic_add_32(&memseg_stats.v, 1) 420 #else 421 #define MEMSEG_STAT_INCR(x) 422 #endif 423 424 struct memseg *memsegs; /* list of memory segments */ 425 426 427 static void page_init_mem_config(void); 428 static int page_do_hashin(page_t *, vnode_t *, u_offset_t); 429 static void page_do_hashout(page_t *); 430 431 static void page_demote_vp_pages(page_t *); 432 433 /* 434 * vm subsystem related initialization 435 */ 436 void 437 vm_init(void) 438 { 439 boolean_t callb_vm_cpr(void *, int); 440 441 (void) callb_add(callb_vm_cpr, 0, CB_CL_CPR_VM, "vm"); 442 page_init_mem_config(); 443 444 /* 445 * initialise the vnode for retired pages 446 */ 447 page_retired_init(); 448 } 449 450 /* 451 * This function is called at startup and when memory is added or deleted. 452 */ 453 void 454 init_pages_pp_maximum() 455 { 456 static pgcnt_t p_min; 457 static pgcnt_t pages_pp_maximum_startup; 458 static pgcnt_t avrmem_delta; 459 static int init_done; 460 static int user_set; /* true if set in /etc/system */ 461 462 if (init_done == 0) { 463 464 /* If the user specified a value, save it */ 465 if (pages_pp_maximum != 0) { 466 user_set = 1; 467 pages_pp_maximum_startup = pages_pp_maximum; 468 } 469 470 /* 471 * Setting of pages_pp_maximum is based first time 472 * on the value of availrmem just after the start-up 473 * allocations. To preserve this relationship at run 474 * time, use a delta from availrmem_initial. 475 */ 476 ASSERT(availrmem_initial >= availrmem); 477 avrmem_delta = availrmem_initial - availrmem; 478 479 /* The allowable floor of pages_pp_maximum */ 480 p_min = tune.t_minarmem + 100; 481 482 /* Make sure we don't come through here again. */ 483 init_done = 1; 484 } 485 /* 486 * Determine pages_pp_maximum, the number of currently available 487 * pages (availrmem) that can't be `locked'. If not set by 488 * the user, we set it to 4% of the currently available memory 489 * plus 4MB. 490 * But we also insist that it be greater than tune.t_minarmem; 491 * otherwise a process could lock down a lot of memory, get swapped 492 * out, and never have enough to get swapped back in. 
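 *
 * As a worked example (illustrative numbers only): with 2,097,152
 * pages of 4K each (8GB) available and no startup delta, the default
 * works out to 2097152 / 25 + btop(4MB) = 83886 + 1024 = 84910 pages,
 * i.e. roughly 4% of memory plus 4MB can never be locked down.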
493 */ 494 if (user_set) 495 pages_pp_maximum = pages_pp_maximum_startup; 496 else 497 pages_pp_maximum = ((availrmem_initial - avrmem_delta) / 25) 498 + btop(4 * 1024 * 1024); 499 500 if (pages_pp_maximum <= p_min) { 501 pages_pp_maximum = p_min; 502 } 503 } 504 505 void 506 set_max_page_get(pgcnt_t target_total_pages) 507 { 508 max_page_get = target_total_pages / 2; 509 } 510 511 static pgcnt_t pending_delete; 512 513 /*ARGSUSED*/ 514 static void 515 page_mem_config_post_add( 516 void *arg, 517 pgcnt_t delta_pages) 518 { 519 set_max_page_get(total_pages - pending_delete); 520 init_pages_pp_maximum(); 521 } 522 523 /*ARGSUSED*/ 524 static int 525 page_mem_config_pre_del( 526 void *arg, 527 pgcnt_t delta_pages) 528 { 529 pgcnt_t nv; 530 531 nv = atomic_add_long_nv(&pending_delete, (spgcnt_t)delta_pages); 532 set_max_page_get(total_pages - nv); 533 return (0); 534 } 535 536 /*ARGSUSED*/ 537 static void 538 page_mem_config_post_del( 539 void *arg, 540 pgcnt_t delta_pages, 541 int cancelled) 542 { 543 pgcnt_t nv; 544 545 nv = atomic_add_long_nv(&pending_delete, -(spgcnt_t)delta_pages); 546 set_max_page_get(total_pages - nv); 547 if (!cancelled) 548 init_pages_pp_maximum(); 549 } 550 551 static kphysm_setup_vector_t page_mem_config_vec = { 552 KPHYSM_SETUP_VECTOR_VERSION, 553 page_mem_config_post_add, 554 page_mem_config_pre_del, 555 page_mem_config_post_del, 556 }; 557 558 static void 559 page_init_mem_config(void) 560 { 561 int ret; 562 563 ret = kphysm_setup_func_register(&page_mem_config_vec, (void *)NULL); 564 ASSERT(ret == 0); 565 } 566 567 /* 568 * Evenly spread out the PCF counters for large free pages 569 */ 570 static void 571 page_free_large_ctr(pgcnt_t npages) 572 { 573 static struct pcf *p = pcf; 574 pgcnt_t lump; 575 576 freemem += npages; 577 578 lump = roundup(npages, PCF_FANOUT) / PCF_FANOUT; 579 580 while (npages > 0) { 581 582 ASSERT(!p->pcf_block); 583 584 if (lump < npages) { 585 p->pcf_count += (uint_t)lump; 586 npages -= lump; 587 } else { 588 p->pcf_count += (uint_t)npages; 589 npages = 0; 590 } 591 592 ASSERT(!p->pcf_wait); 593 594 if (++p > &pcf[PCF_FANOUT - 1]) 595 p = pcf; 596 } 597 598 ASSERT(npages == 0); 599 } 600 601 /* 602 * Add a physical chunk of memory to the system freee lists during startup. 603 * Platform specific startup() allocates the memory for the page structs. 604 * 605 * num - number of page structures 606 * base - page number (pfn) to be associated with the first page. 607 * 608 * Since we are doing this during startup (ie. single threaded), we will 609 * use shortcut routines to avoid any locking overhead while putting all 610 * these pages on the freelists. 611 * 612 * NOTE: Any changes performed to page_free(), must also be performed to 613 * add_physmem() since this is how we initialize all page_t's at 614 * boot time. 615 */ 616 void 617 add_physmem( 618 page_t *pp, 619 pgcnt_t num, 620 pfn_t pnum) 621 { 622 page_t *root = NULL; 623 uint_t szc = page_num_pagesizes() - 1; 624 pgcnt_t large = page_get_pagecnt(szc); 625 pgcnt_t cnt = 0; 626 627 TRACE_2(TR_FAC_VM, TR_PAGE_INIT, 628 "add_physmem:pp %p num %lu", pp, num); 629 630 /* 631 * Arbitrarily limit the max page_get request 632 * to 1/2 of the page structs we have. 633 */ 634 total_pages += num; 635 set_max_page_get(total_pages); 636 637 /* 638 * The physical space for the pages array 639 * representing ram pages has already been 640 * allocated. 
Here we initialize each lock 641 * in the page structure, and put each on 642 * the free list 643 */ 644 for (; num; pp = page_next_raw(pp), pnum++, num--) { 645 646 /* 647 * this needs to fill in the page number 648 * and do any other arch specific initialization 649 */ 650 add_physmem_cb(pp, pnum); 651 652 /* 653 * Initialize the page lock as unlocked, since nobody 654 * can see or access this page yet. 655 */ 656 pp->p_selock = 0; 657 658 /* 659 * Initialize IO lock 660 */ 661 page_iolock_init(pp); 662 663 /* 664 * initialize other fields in the page_t 665 */ 666 PP_SETFREE(pp); 667 page_clr_all_props(pp); 668 PP_SETAGED(pp); 669 pp->p_offset = (u_offset_t)-1; 670 pp->p_next = pp; 671 pp->p_prev = pp; 672 673 /* 674 * Simple case: System doesn't support large pages. 675 */ 676 if (szc == 0) { 677 pp->p_szc = 0; 678 page_free_at_startup(pp); 679 continue; 680 } 681 682 /* 683 * Handle unaligned pages, we collect them up onto 684 * the root page until we have a full large page. 685 */ 686 if (!IS_P2ALIGNED(pnum, large)) { 687 688 /* 689 * If not in a large page, 690 * just free as small page. 691 */ 692 if (root == NULL) { 693 pp->p_szc = 0; 694 page_free_at_startup(pp); 695 continue; 696 } 697 698 /* 699 * Link a constituent page into the large page. 700 */ 701 pp->p_szc = szc; 702 page_list_concat(&root, &pp); 703 704 /* 705 * When large page is fully formed, free it. 706 */ 707 if (++cnt == large) { 708 page_free_large_ctr(cnt); 709 page_list_add_pages(root, PG_LIST_ISINIT); 710 root = NULL; 711 cnt = 0; 712 } 713 continue; 714 } 715 716 /* 717 * At this point we have a page number which 718 * is aligned. We assert that we aren't already 719 * in a different large page. 720 */ 721 ASSERT(IS_P2ALIGNED(pnum, large)); 722 ASSERT(root == NULL && cnt == 0); 723 724 /* 725 * If insufficient number of pages left to form 726 * a large page, just free the small page. 727 */ 728 if (num < large) { 729 pp->p_szc = 0; 730 page_free_at_startup(pp); 731 continue; 732 } 733 734 /* 735 * Otherwise start a new large page. 736 */ 737 pp->p_szc = szc; 738 cnt++; 739 root = pp; 740 } 741 ASSERT(root == NULL && cnt == 0); 742 } 743 744 /* 745 * Find a page representing the specified [vp, offset]. 746 * If we find the page but it is intransit coming in, 747 * it will have an "exclusive" lock and we wait for 748 * the i/o to complete. A page found on the free list 749 * is always reclaimed and then locked. On success, the page 750 * is locked, its data is valid and it isn't on the free 751 * list, while a NULL is returned if the page doesn't exist. 752 */ 753 page_t * 754 page_lookup(vnode_t *vp, u_offset_t off, se_t se) 755 { 756 return (page_lookup_create(vp, off, se, NULL, NULL, 0)); 757 } 758 759 /* 760 * Find a page representing the specified [vp, offset]. 761 * We either return the one we found or, if passed in, 762 * create one with identity of [vp, offset] of the 763 * pre-allocated page. If we find exsisting page but it is 764 * intransit coming in, it will have an "exclusive" lock 765 * and we wait for the i/o to complete. A page found on 766 * the free list is always reclaimed and then locked. 
767 * On success, the page is locked, its data is valid and 768 * it isn't on the free list, while a NULL is returned 769 * if the page doesn't exist and newpp is NULL; 770 */ 771 page_t * 772 page_lookup_create( 773 vnode_t *vp, 774 u_offset_t off, 775 se_t se, 776 page_t *newpp, 777 spgcnt_t *nrelocp, 778 int flags) 779 { 780 page_t *pp; 781 kmutex_t *phm; 782 ulong_t index; 783 uint_t hash_locked; 784 uint_t es; 785 786 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); 787 VM_STAT_ADD(page_lookup_cnt[0]); 788 ASSERT(newpp ? PAGE_EXCL(newpp) : 1); 789 790 /* 791 * Acquire the appropriate page hash lock since 792 * we have to search the hash list. Pages that 793 * hash to this list can't change identity while 794 * this lock is held. 795 */ 796 hash_locked = 0; 797 index = PAGE_HASH_FUNC(vp, off); 798 phm = NULL; 799 top: 800 PAGE_HASH_SEARCH(index, pp, vp, off); 801 if (pp != NULL) { 802 VM_STAT_ADD(page_lookup_cnt[1]); 803 es = (newpp != NULL) ? 1 : 0; 804 es |= flags; 805 if (!hash_locked) { 806 VM_STAT_ADD(page_lookup_cnt[2]); 807 if (!page_try_reclaim_lock(pp, se, es)) { 808 /* 809 * On a miss, acquire the phm. Then 810 * next time, page_lock() will be called, 811 * causing a wait if the page is busy. 812 * just looping with page_trylock() would 813 * get pretty boring. 814 */ 815 VM_STAT_ADD(page_lookup_cnt[3]); 816 phm = PAGE_HASH_MUTEX(index); 817 mutex_enter(phm); 818 hash_locked = 1; 819 goto top; 820 } 821 } else { 822 VM_STAT_ADD(page_lookup_cnt[4]); 823 if (!page_lock_es(pp, se, phm, P_RECLAIM, es)) { 824 VM_STAT_ADD(page_lookup_cnt[5]); 825 goto top; 826 } 827 } 828 829 /* 830 * Since `pp' is locked it can not change identity now. 831 * Reconfirm we locked the correct page. 832 * 833 * Both the p_vnode and p_offset *must* be cast volatile 834 * to force a reload of their values: The PAGE_HASH_SEARCH 835 * macro will have stuffed p_vnode and p_offset into 836 * registers before calling page_trylock(); another thread, 837 * actually holding the hash lock, could have changed the 838 * page's identity in memory, but our registers would not 839 * be changed, fooling the reconfirmation. If the hash 840 * lock was held during the search, the casting would 841 * not be needed. 842 */ 843 VM_STAT_ADD(page_lookup_cnt[6]); 844 if (((volatile struct vnode *)(pp->p_vnode) != vp) || 845 ((volatile u_offset_t)(pp->p_offset) != off)) { 846 VM_STAT_ADD(page_lookup_cnt[7]); 847 if (hash_locked) { 848 panic("page_lookup_create: lost page %p", 849 (void *)pp); 850 /*NOTREACHED*/ 851 } 852 page_unlock(pp); 853 phm = PAGE_HASH_MUTEX(index); 854 mutex_enter(phm); 855 hash_locked = 1; 856 goto top; 857 } 858 859 /* 860 * If page_trylock() was called, then pp may still be on 861 * the cachelist (can't be on the free list, it would not 862 * have been found in the search). If it is on the 863 * cachelist it must be pulled now. To pull the page from 864 * the cachelist, it must be exclusively locked. 865 * 866 * The other big difference between page_trylock() and 867 * page_lock(), is that page_lock() will pull the 868 * page from whatever free list (the cache list in this 869 * case) the page is on. If page_trylock() was used 870 * above, then we have to do the reclaim ourselves. 871 */ 872 if ((!hash_locked) && (PP_ISFREE(pp))) { 873 ASSERT(PP_ISAGED(pp) == 0); 874 VM_STAT_ADD(page_lookup_cnt[8]); 875 876 /* 877 * page_relcaim will insure that we 878 * have this page exclusively 879 */ 880 881 if (!page_reclaim(pp, NULL)) { 882 /* 883 * Page_reclaim dropped whatever lock 884 * we held. 
885 */ 886 VM_STAT_ADD(page_lookup_cnt[9]); 887 phm = PAGE_HASH_MUTEX(index); 888 mutex_enter(phm); 889 hash_locked = 1; 890 goto top; 891 } else if (se == SE_SHARED && newpp == NULL) { 892 VM_STAT_ADD(page_lookup_cnt[10]); 893 page_downgrade(pp); 894 } 895 } 896 897 if (hash_locked) { 898 mutex_exit(phm); 899 } 900 901 if (newpp != NULL && pp->p_szc < newpp->p_szc && 902 PAGE_EXCL(pp) && nrelocp != NULL) { 903 ASSERT(nrelocp != NULL); 904 (void) page_relocate(&pp, &newpp, 1, 1, nrelocp, 905 NULL); 906 if (*nrelocp > 0) { 907 VM_STAT_COND_ADD(*nrelocp == 1, 908 page_lookup_cnt[11]); 909 VM_STAT_COND_ADD(*nrelocp > 1, 910 page_lookup_cnt[12]); 911 pp = newpp; 912 se = SE_EXCL; 913 } else { 914 if (se == SE_SHARED) { 915 page_downgrade(pp); 916 } 917 VM_STAT_ADD(page_lookup_cnt[13]); 918 } 919 } else if (newpp != NULL && nrelocp != NULL) { 920 if (PAGE_EXCL(pp) && se == SE_SHARED) { 921 page_downgrade(pp); 922 } 923 VM_STAT_COND_ADD(pp->p_szc < newpp->p_szc, 924 page_lookup_cnt[14]); 925 VM_STAT_COND_ADD(pp->p_szc == newpp->p_szc, 926 page_lookup_cnt[15]); 927 VM_STAT_COND_ADD(pp->p_szc > newpp->p_szc, 928 page_lookup_cnt[16]); 929 } else if (newpp != NULL && PAGE_EXCL(pp)) { 930 se = SE_EXCL; 931 } 932 } else if (!hash_locked) { 933 VM_STAT_ADD(page_lookup_cnt[17]); 934 phm = PAGE_HASH_MUTEX(index); 935 mutex_enter(phm); 936 hash_locked = 1; 937 goto top; 938 } else if (newpp != NULL) { 939 /* 940 * If we have a preallocated page then 941 * insert it now and basically behave like 942 * page_create. 943 */ 944 VM_STAT_ADD(page_lookup_cnt[18]); 945 /* 946 * Since we hold the page hash mutex and 947 * just searched for this page, page_hashin 948 * had better not fail. If it does, that 949 * means some thread did not follow the 950 * page hash mutex rules. Panic now and 951 * get it over with. As usual, go down 952 * holding all the locks. 953 */ 954 ASSERT(MUTEX_HELD(phm)); 955 if (!page_hashin(newpp, vp, off, phm)) { 956 ASSERT(MUTEX_HELD(phm)); 957 panic("page_lookup_create: hashin failed %p %p %llx %p", 958 (void *)newpp, (void *)vp, off, (void *)phm); 959 /*NOTREACHED*/ 960 } 961 ASSERT(MUTEX_HELD(phm)); 962 mutex_exit(phm); 963 phm = NULL; 964 page_set_props(newpp, P_REF); 965 page_io_lock(newpp); 966 pp = newpp; 967 se = SE_EXCL; 968 } else { 969 VM_STAT_ADD(page_lookup_cnt[19]); 970 mutex_exit(phm); 971 } 972 973 ASSERT(pp ? PAGE_LOCKED_SE(pp, se) : 1); 974 975 ASSERT(pp ? ((PP_ISFREE(pp) == 0) && (PP_ISAGED(pp) == 0)) : 1); 976 977 return (pp); 978 } 979 980 /* 981 * Search the hash list for the page representing the 982 * specified [vp, offset] and return it locked. Skip 983 * free pages and pages that cannot be locked as requested. 984 * Used while attempting to kluster pages. 
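 *
 * A typical caller pattern, in outline (a sketch only, not lifted from
 * a real caller; the offset stepping and the klustering work are made
 * up for illustration):
 *
 *	page_t *pp;
 *
 *	// opportunistically pick up the neighbouring page; give up
 *	// silently if it is free, busy, or simply not there
 *	pp = page_lookup_nowait(vp, off + PAGESIZE, SE_SHARED);
 *	if (pp != NULL) {
 *		// ... add pp to the kluster ...
 *		page_unlock(pp);
 *	}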
985 */ 986 page_t * 987 page_lookup_nowait(vnode_t *vp, u_offset_t off, se_t se) 988 { 989 page_t *pp; 990 kmutex_t *phm; 991 ulong_t index; 992 uint_t locked; 993 994 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); 995 VM_STAT_ADD(page_lookup_nowait_cnt[0]); 996 997 index = PAGE_HASH_FUNC(vp, off); 998 PAGE_HASH_SEARCH(index, pp, vp, off); 999 locked = 0; 1000 if (pp == NULL) { 1001 top: 1002 VM_STAT_ADD(page_lookup_nowait_cnt[1]); 1003 locked = 1; 1004 phm = PAGE_HASH_MUTEX(index); 1005 mutex_enter(phm); 1006 PAGE_HASH_SEARCH(index, pp, vp, off); 1007 } 1008 1009 if (pp == NULL || PP_ISFREE(pp)) { 1010 VM_STAT_ADD(page_lookup_nowait_cnt[2]); 1011 pp = NULL; 1012 } else { 1013 if (!page_trylock(pp, se)) { 1014 VM_STAT_ADD(page_lookup_nowait_cnt[3]); 1015 pp = NULL; 1016 } else { 1017 VM_STAT_ADD(page_lookup_nowait_cnt[4]); 1018 /* 1019 * See the comment in page_lookup() 1020 */ 1021 if (((volatile struct vnode *)(pp->p_vnode) != vp) || 1022 ((u_offset_t)(pp->p_offset) != off)) { 1023 VM_STAT_ADD(page_lookup_nowait_cnt[5]); 1024 if (locked) { 1025 panic("page_lookup_nowait %p", 1026 (void *)pp); 1027 /*NOTREACHED*/ 1028 } 1029 page_unlock(pp); 1030 goto top; 1031 } 1032 if (PP_ISFREE(pp)) { 1033 VM_STAT_ADD(page_lookup_nowait_cnt[6]); 1034 page_unlock(pp); 1035 pp = NULL; 1036 } 1037 } 1038 } 1039 if (locked) { 1040 VM_STAT_ADD(page_lookup_nowait_cnt[7]); 1041 mutex_exit(phm); 1042 } 1043 1044 ASSERT(pp ? PAGE_LOCKED_SE(pp, se) : 1); 1045 1046 return (pp); 1047 } 1048 1049 /* 1050 * Search the hash list for a page with the specified [vp, off] 1051 * that is known to exist and is already locked. This routine 1052 * is typically used by segment SOFTUNLOCK routines. 1053 */ 1054 page_t * 1055 page_find(vnode_t *vp, u_offset_t off) 1056 { 1057 page_t *pp; 1058 kmutex_t *phm; 1059 ulong_t index; 1060 1061 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); 1062 VM_STAT_ADD(page_find_cnt); 1063 1064 index = PAGE_HASH_FUNC(vp, off); 1065 phm = PAGE_HASH_MUTEX(index); 1066 1067 mutex_enter(phm); 1068 PAGE_HASH_SEARCH(index, pp, vp, off); 1069 mutex_exit(phm); 1070 1071 ASSERT(pp != NULL); 1072 ASSERT(PAGE_LOCKED(pp) || panicstr); 1073 return (pp); 1074 } 1075 1076 /* 1077 * Determine whether a page with the specified [vp, off] 1078 * currently exists in the system. Obviously this should 1079 * only be considered as a hint since nothing prevents the 1080 * page from disappearing or appearing immediately after 1081 * the return from this routine. Subsequently, we don't 1082 * even bother to lock the list. 1083 */ 1084 page_t * 1085 page_exists(vnode_t *vp, u_offset_t off) 1086 { 1087 page_t *pp; 1088 ulong_t index; 1089 1090 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); 1091 VM_STAT_ADD(page_exists_cnt); 1092 1093 index = PAGE_HASH_FUNC(vp, off); 1094 PAGE_HASH_SEARCH(index, pp, vp, off); 1095 1096 return (pp); 1097 } 1098 1099 /* 1100 * Determine if physically contiguous pages exist for [vp, off] - [vp, off + 1101 * page_size(szc)) range. if they exist and ppa is not NULL fill ppa array 1102 * with these pages locked SHARED. If necessary reclaim pages from 1103 * freelist. Return 1 if contiguous pages exist and 0 otherwise. 1104 * 1105 * If we fail to lock pages still return 1 if pages exist and contiguous. 1106 * But in this case return value is just a hint. ppa array won't be filled. 1107 * Caller should initialize ppa[0] as NULL to distinguish return value. 1108 * 1109 * Returns 0 if pages don't exist or not physically contiguous. 1110 * 1111 * This routine doesn't work for anonymous(swapfs) pages. 
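 *
 * Expected calling convention, in outline (a sketch based on the rules
 * above; the array size assumes page_get_pagecnt(szc) == 8 and is
 * illustrative only):
 *
 *	page_t *ppa[8 + 1];	// constituents plus NULL terminator
 *
 *	ppa[0] = NULL;
 *	if (page_exists_physcontig(vp, off, szc, ppa) && ppa[0] != NULL) {
 *		// all constituent pages are filled in, locked SE_SHARED,
 *		// and the list is NULL terminated
 *	} else {
 *		// either no contiguous pages, or they exist but could
 *		// not be locked (hint only)
 *	}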
1112 */ 1113 int 1114 page_exists_physcontig(vnode_t *vp, u_offset_t off, uint_t szc, page_t *ppa[]) 1115 { 1116 pgcnt_t pages; 1117 pfn_t pfn; 1118 page_t *rootpp; 1119 pgcnt_t i; 1120 pgcnt_t j; 1121 u_offset_t save_off = off; 1122 ulong_t index; 1123 kmutex_t *phm; 1124 page_t *pp; 1125 uint_t pszc; 1126 int loopcnt = 0; 1127 1128 ASSERT(szc != 0); 1129 ASSERT(vp != NULL); 1130 ASSERT(!IS_SWAPFSVP(vp)); 1131 ASSERT(vp != &kvp); 1132 1133 again: 1134 if (++loopcnt > 3) { 1135 VM_STAT_ADD(page_exphcontg[0]); 1136 return (0); 1137 } 1138 1139 index = PAGE_HASH_FUNC(vp, off); 1140 phm = PAGE_HASH_MUTEX(index); 1141 1142 mutex_enter(phm); 1143 PAGE_HASH_SEARCH(index, pp, vp, off); 1144 mutex_exit(phm); 1145 1146 VM_STAT_ADD(page_exphcontg[1]); 1147 1148 if (pp == NULL) { 1149 VM_STAT_ADD(page_exphcontg[2]); 1150 return (0); 1151 } 1152 1153 pages = page_get_pagecnt(szc); 1154 rootpp = pp; 1155 pfn = rootpp->p_pagenum; 1156 1157 if ((pszc = pp->p_szc) >= szc && ppa != NULL) { 1158 VM_STAT_ADD(page_exphcontg[3]); 1159 if (!page_trylock(pp, SE_SHARED)) { 1160 VM_STAT_ADD(page_exphcontg[4]); 1161 return (1); 1162 } 1163 if (pp->p_szc != pszc || pp->p_vnode != vp || 1164 pp->p_offset != off) { 1165 VM_STAT_ADD(page_exphcontg[5]); 1166 page_unlock(pp); 1167 off = save_off; 1168 goto again; 1169 } 1170 /* 1171 * szc was non zero and vnode and offset matched after we 1172 * locked the page it means it can't become free on us. 1173 */ 1174 ASSERT(!PP_ISFREE(pp)); 1175 if (!IS_P2ALIGNED(pfn, pages)) { 1176 page_unlock(pp); 1177 return (0); 1178 } 1179 ppa[0] = pp; 1180 pp++; 1181 off += PAGESIZE; 1182 pfn++; 1183 for (i = 1; i < pages; i++, pp++, off += PAGESIZE, pfn++) { 1184 if (!page_trylock(pp, SE_SHARED)) { 1185 VM_STAT_ADD(page_exphcontg[6]); 1186 pp--; 1187 while (i-- > 0) { 1188 page_unlock(pp); 1189 pp--; 1190 } 1191 ppa[0] = NULL; 1192 return (1); 1193 } 1194 if (pp->p_szc != pszc) { 1195 VM_STAT_ADD(page_exphcontg[7]); 1196 page_unlock(pp); 1197 pp--; 1198 while (i-- > 0) { 1199 page_unlock(pp); 1200 pp--; 1201 } 1202 ppa[0] = NULL; 1203 off = save_off; 1204 goto again; 1205 } 1206 /* 1207 * szc the same as for previous already locked pages 1208 * with right identity. Since this page had correct 1209 * szc after we locked it can't get freed or destroyed 1210 * and therefore must have the expected identity. 1211 */ 1212 ASSERT(!PP_ISFREE(pp)); 1213 if (pp->p_vnode != vp || 1214 pp->p_offset != off) { 1215 panic("page_exists_physcontig: " 1216 "large page identity doesn't match"); 1217 } 1218 ppa[i] = pp; 1219 ASSERT(pp->p_pagenum == pfn); 1220 } 1221 VM_STAT_ADD(page_exphcontg[8]); 1222 ppa[pages] = NULL; 1223 return (1); 1224 } else if (pszc >= szc) { 1225 VM_STAT_ADD(page_exphcontg[9]); 1226 if (!IS_P2ALIGNED(pfn, pages)) { 1227 return (0); 1228 } 1229 return (1); 1230 } 1231 1232 if (!IS_P2ALIGNED(pfn, pages)) { 1233 VM_STAT_ADD(page_exphcontg[10]); 1234 return (0); 1235 } 1236 1237 if (page_numtomemseg_nolock(pfn) != 1238 page_numtomemseg_nolock(pfn + pages - 1)) { 1239 VM_STAT_ADD(page_exphcontg[11]); 1240 return (0); 1241 } 1242 1243 /* 1244 * We loop up 4 times across pages to promote page size. 1245 * We're extra cautious to promote page size atomically with respect 1246 * to everybody else. But we can probably optimize into 1 loop if 1247 * this becomes an issue. 
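 *
 * For reference, the separate passes below are, in order:
 *	1. page_trylock() every constituent SE_EXCL and recheck its
 *	   identity and size, bailing out or retrying if anything moved;
 *	2. page_reclaim() the free constituents and hat_pageunload()
 *	   the mapped ones;
 *	3. set p_szc on every constituent;
 *	4. drop the locks, or fill in ppa[] and downgrade to SE_SHARED
 *	   for the caller.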
1248 */ 1249 1250 for (i = 0; i < pages; i++, pp++, off += PAGESIZE, pfn++) { 1251 ASSERT(pp->p_pagenum == pfn); 1252 if (!page_trylock(pp, SE_EXCL)) { 1253 VM_STAT_ADD(page_exphcontg[12]); 1254 break; 1255 } 1256 if (pp->p_vnode != vp || 1257 pp->p_offset != off) { 1258 VM_STAT_ADD(page_exphcontg[13]); 1259 page_unlock(pp); 1260 break; 1261 } 1262 if (pp->p_szc >= szc) { 1263 ASSERT(i == 0); 1264 page_unlock(pp); 1265 off = save_off; 1266 goto again; 1267 } 1268 } 1269 1270 if (i != pages) { 1271 VM_STAT_ADD(page_exphcontg[14]); 1272 --pp; 1273 while (i-- > 0) { 1274 page_unlock(pp); 1275 --pp; 1276 } 1277 return (0); 1278 } 1279 1280 pp = rootpp; 1281 for (i = 0; i < pages; i++, pp++) { 1282 if (PP_ISFREE(pp)) { 1283 VM_STAT_ADD(page_exphcontg[15]); 1284 ASSERT(!PP_ISAGED(pp)); 1285 ASSERT(pp->p_szc == 0); 1286 if (!page_reclaim(pp, NULL)) { 1287 break; 1288 } 1289 } else { 1290 ASSERT(pp->p_szc < szc); 1291 VM_STAT_ADD(page_exphcontg[16]); 1292 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 1293 } 1294 } 1295 if (i < pages) { 1296 VM_STAT_ADD(page_exphcontg[17]); 1297 /* 1298 * page_reclaim failed because we were out of memory. 1299 * drop the rest of the locks and return because this page 1300 * must be already reallocated anyway. 1301 */ 1302 pp = rootpp; 1303 for (j = 0; j < pages; j++, pp++) { 1304 if (j != i) { 1305 page_unlock(pp); 1306 } 1307 } 1308 return (0); 1309 } 1310 1311 off = save_off; 1312 pp = rootpp; 1313 for (i = 0; i < pages; i++, pp++, off += PAGESIZE) { 1314 ASSERT(PAGE_EXCL(pp)); 1315 ASSERT(!PP_ISFREE(pp)); 1316 ASSERT(!hat_page_is_mapped(pp)); 1317 ASSERT(pp->p_vnode == vp); 1318 ASSERT(pp->p_offset == off); 1319 pp->p_szc = szc; 1320 } 1321 pp = rootpp; 1322 for (i = 0; i < pages; i++, pp++) { 1323 if (ppa == NULL) { 1324 page_unlock(pp); 1325 } else { 1326 ppa[i] = pp; 1327 page_downgrade(ppa[i]); 1328 } 1329 } 1330 if (ppa != NULL) { 1331 ppa[pages] = NULL; 1332 } 1333 VM_STAT_ADD(page_exphcontg[18]); 1334 ASSERT(vp->v_pages != NULL); 1335 return (1); 1336 } 1337 1338 /* 1339 * Determine whether a page with the specified [vp, off] 1340 * currently exists in the system and if so return its 1341 * size code. Obviously this should only be considered as 1342 * a hint since nothing prevents the page from disappearing 1343 * or appearing immediately after the return from this routine. 1344 */ 1345 int 1346 page_exists_forreal(vnode_t *vp, u_offset_t off, uint_t *szc) 1347 { 1348 page_t *pp; 1349 kmutex_t *phm; 1350 ulong_t index; 1351 int rc = 0; 1352 1353 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); 1354 ASSERT(szc != NULL); 1355 VM_STAT_ADD(page_exists_forreal_cnt); 1356 1357 index = PAGE_HASH_FUNC(vp, off); 1358 phm = PAGE_HASH_MUTEX(index); 1359 1360 mutex_enter(phm); 1361 PAGE_HASH_SEARCH(index, pp, vp, off); 1362 if (pp != NULL) { 1363 *szc = pp->p_szc; 1364 rc = 1; 1365 } 1366 mutex_exit(phm); 1367 return (rc); 1368 } 1369 1370 /* wakeup threads waiting for pages in page_create_get_something() */ 1371 void 1372 wakeup_pcgs(void) 1373 { 1374 if (!CV_HAS_WAITERS(&pcgs_cv)) 1375 return; 1376 cv_broadcast(&pcgs_cv); 1377 } 1378 1379 /* 1380 * 'freemem' is used all over the kernel as an indication of how many 1381 * pages are free (either on the cache list or on the free page list) 1382 * in the system. In very few places is a really accurate 'freemem' 1383 * needed. To avoid contention of the lock protecting a the 1384 * single freemem, it was spread out into NCPU buckets. Set_freemem 1385 * sets freemem to the total of all NCPU buckets. 
It is called from 1386 * clock() on each TICK. 1387 */ 1388 void 1389 set_freemem() 1390 { 1391 struct pcf *p; 1392 ulong_t t; 1393 uint_t i; 1394 1395 t = 0; 1396 p = pcf; 1397 for (i = 0; i < PCF_FANOUT; i++) { 1398 t += p->pcf_count; 1399 p++; 1400 } 1401 freemem = t; 1402 1403 /* 1404 * Don't worry about grabbing mutex. It's not that 1405 * critical if we miss a tick or two. This is 1406 * where we wakeup possible delayers in 1407 * page_create_get_something(). 1408 */ 1409 wakeup_pcgs(); 1410 } 1411 1412 ulong_t 1413 get_freemem() 1414 { 1415 struct pcf *p; 1416 ulong_t t; 1417 uint_t i; 1418 1419 t = 0; 1420 p = pcf; 1421 for (i = 0; i < PCF_FANOUT; i++) { 1422 t += p->pcf_count; 1423 p++; 1424 } 1425 /* 1426 * We just calculated it, might as well set it. 1427 */ 1428 freemem = t; 1429 return (t); 1430 } 1431 1432 /* 1433 * Acquire all of the page cache & free (pcf) locks. 1434 */ 1435 void 1436 pcf_acquire_all() 1437 { 1438 struct pcf *p; 1439 uint_t i; 1440 1441 p = pcf; 1442 for (i = 0; i < PCF_FANOUT; i++) { 1443 p->pcf_touch = 1; 1444 mutex_enter(&p->pcf_lock); 1445 p++; 1446 } 1447 } 1448 1449 /* 1450 * Release all the pcf_locks. 1451 */ 1452 void 1453 pcf_release_all() 1454 { 1455 struct pcf *p; 1456 uint_t i; 1457 1458 p = pcf; 1459 for (i = 0; i < PCF_FANOUT; i++) { 1460 mutex_exit(&p->pcf_lock); 1461 p++; 1462 } 1463 } 1464 1465 /* 1466 * Inform the VM system that we need some pages freed up. 1467 * Calls must be symmetric, e.g.: 1468 * 1469 * page_needfree(100); 1470 * wait a bit; 1471 * page_needfree(-100); 1472 */ 1473 void 1474 page_needfree(spgcnt_t npages) 1475 { 1476 mutex_enter(&new_freemem_lock); 1477 needfree += npages; 1478 mutex_exit(&new_freemem_lock); 1479 } 1480 1481 /* 1482 * Throttle for page_create(): try to prevent freemem from dropping 1483 * below throttlefree. We can't provide a 100% guarantee because 1484 * KM_NOSLEEP allocations, page_reclaim(), and various other things 1485 * nibble away at the freelist. However, we can block all PG_WAIT 1486 * allocations until memory becomes available. The motivation is 1487 * that several things can fall apart when there's no free memory: 1488 * 1489 * (1) If pageout() needs memory to push a page, the system deadlocks. 1490 * 1491 * (2) By (broken) specification, timeout(9F) can neither fail nor 1492 * block, so it has no choice but to panic the system if it 1493 * cannot allocate a callout structure. 1494 * 1495 * (3) Like timeout(), ddi_set_callback() cannot fail and cannot block; 1496 * it panics if it cannot allocate a callback structure. 1497 * 1498 * (4) Untold numbers of third-party drivers have not yet been hardened 1499 * against KM_NOSLEEP and/or allocb() failures; they simply assume 1500 * success and panic the system with a data fault on failure. 1501 * (The long-term solution to this particular problem is to ship 1502 * hostile fault-injecting DEBUG kernels with the DDK.) 1503 * 1504 * It is theoretically impossible to guarantee success of non-blocking 1505 * allocations, but in practice, this throttle is very hard to break. 
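 *
 * Condensed into a standalone sketch (illustration only; wait_for()
 * stands in for the pcf accounting and cv_wait() loop that the real
 * code below performs):
 *
 *	int
 *	throttle_policy(pgcnt_t npages, int flags)
 *	{
 *		if (NOMEMWAIT() ||
 *		    (flags & (PG_WAIT | PG_PANIC)) == PG_PANIC ||
 *		    (flags & (PG_WAIT | PG_PUSHPAGE)) == PG_PUSHPAGE)
 *			return (1);	// never denied
 *		if ((flags & PG_WAIT) == 0)
 *			return (freemem >= npages + pageout_reserve);
 *		// PG_WAIT: sleep until enough memory shows up
 *		return (wait_for(npages + throttlefree -
 *		    ((flags & PG_PUSHPAGE) ? pageout_reserve : 0)));
 *	}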
1506 */ 1507 static int 1508 page_create_throttle(pgcnt_t npages, int flags) 1509 { 1510 ulong_t fm; 1511 uint_t i; 1512 pgcnt_t tf; /* effective value of throttlefree */ 1513 1514 /* 1515 * Never deny pages when: 1516 * - it's a thread that cannot block [NOMEMWAIT()] 1517 * - the allocation cannot block and must not fail 1518 * - the allocation cannot block and is pageout dispensated 1519 */ 1520 if (NOMEMWAIT() || 1521 ((flags & (PG_WAIT | PG_PANIC)) == PG_PANIC) || 1522 ((flags & (PG_WAIT | PG_PUSHPAGE)) == PG_PUSHPAGE)) 1523 return (1); 1524 1525 /* 1526 * If the allocation can't block, we look favorably upon it 1527 * unless we're below pageout_reserve. In that case we fail 1528 * the allocation because we want to make sure there are a few 1529 * pages available for pageout. 1530 */ 1531 if ((flags & PG_WAIT) == 0) 1532 return (freemem >= npages + pageout_reserve); 1533 1534 /* Calculate the effective throttlefree value */ 1535 tf = throttlefree - 1536 ((flags & PG_PUSHPAGE) ? pageout_reserve : 0); 1537 1538 cv_signal(&proc_pageout->p_cv); 1539 1540 while (freemem < npages + tf) { 1541 pcf_acquire_all(); 1542 mutex_enter(&new_freemem_lock); 1543 fm = 0; 1544 for (i = 0; i < PCF_FANOUT; i++) { 1545 fm += pcf[i].pcf_count; 1546 pcf[i].pcf_wait++; 1547 mutex_exit(&pcf[i].pcf_lock); 1548 } 1549 freemem = fm; 1550 needfree += npages; 1551 freemem_wait++; 1552 cv_wait(&freemem_cv, &new_freemem_lock); 1553 freemem_wait--; 1554 needfree -= npages; 1555 mutex_exit(&new_freemem_lock); 1556 } 1557 return (1); 1558 } 1559 1560 /* 1561 * page_create_wait() is called to either coalecse pages from the 1562 * different pcf buckets or to wait because there simply are not 1563 * enough pages to satisfy the caller's request. 1564 * 1565 * Sadly, this is called from platform/vm/vm_machdep.c 1566 */ 1567 int 1568 page_create_wait(size_t npages, uint_t flags) 1569 { 1570 pgcnt_t total; 1571 uint_t i; 1572 struct pcf *p; 1573 1574 /* 1575 * Wait until there are enough free pages to satisfy our 1576 * entire request. 1577 * We set needfree += npages before prodding pageout, to make sure 1578 * it does real work when npages > lotsfree > freemem. 1579 */ 1580 VM_STAT_ADD(page_create_not_enough); 1581 1582 ASSERT(!kcage_on ? !(flags & PG_NORELOC) : 1); 1583 checkagain: 1584 if ((flags & PG_NORELOC) && 1585 kcage_freemem < kcage_throttlefree + npages) 1586 (void) kcage_create_throttle(npages, flags); 1587 1588 if (freemem < npages + throttlefree) 1589 if (!page_create_throttle(npages, flags)) 1590 return (0); 1591 1592 /* 1593 * Since page_create_va() looked at every 1594 * bucket, assume we are going to have to wait. 1595 * Get all of the pcf locks. 1596 */ 1597 total = 0; 1598 p = pcf; 1599 for (i = 0; i < PCF_FANOUT; i++) { 1600 p->pcf_touch = 1; 1601 mutex_enter(&p->pcf_lock); 1602 total += p->pcf_count; 1603 if (total >= npages) { 1604 /* 1605 * Wow! There are enough pages laying around 1606 * to satisfy the request. Do the accounting, 1607 * drop the locks we acquired, and go back. 1608 * 1609 * freemem is not protected by any lock. So, 1610 * we cannot have any assertion containing 1611 * freemem. 
1612 */ 1613 freemem -= npages; 1614 1615 while (p >= pcf) { 1616 if (p->pcf_count <= npages) { 1617 npages -= p->pcf_count; 1618 p->pcf_count = 0; 1619 } else { 1620 p->pcf_count -= (uint_t)npages; 1621 npages = 0; 1622 } 1623 mutex_exit(&p->pcf_lock); 1624 p--; 1625 } 1626 ASSERT(npages == 0); 1627 return (1); 1628 } 1629 p++; 1630 } 1631 1632 /* 1633 * All of the pcf locks are held, there are not enough pages 1634 * to satisfy the request (npages < total). 1635 * Be sure to acquire the new_freemem_lock before dropping 1636 * the pcf locks. This prevents dropping wakeups in page_free(). 1637 * The order is always pcf_lock then new_freemem_lock. 1638 * 1639 * Since we hold all the pcf locks, it is a good time to set freemem. 1640 * 1641 * If the caller does not want to wait, return now. 1642 * Else turn the pageout daemon loose to find something 1643 * and wait till it does. 1644 * 1645 */ 1646 freemem = total; 1647 1648 if ((flags & PG_WAIT) == 0) { 1649 pcf_release_all(); 1650 1651 TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_NOMEM, 1652 "page_create_nomem:npages %ld freemem %ld", npages, freemem); 1653 return (0); 1654 } 1655 1656 ASSERT(proc_pageout != NULL); 1657 cv_signal(&proc_pageout->p_cv); 1658 1659 TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SLEEP_START, 1660 "page_create_sleep_start: freemem %ld needfree %ld", 1661 freemem, needfree); 1662 1663 /* 1664 * We are going to wait. 1665 * We currently hold all of the pcf_locks, 1666 * get the new_freemem_lock (it protects freemem_wait), 1667 * before dropping the pcf_locks. 1668 */ 1669 mutex_enter(&new_freemem_lock); 1670 1671 p = pcf; 1672 for (i = 0; i < PCF_FANOUT; i++) { 1673 p->pcf_wait++; 1674 mutex_exit(&p->pcf_lock); 1675 p++; 1676 } 1677 1678 needfree += npages; 1679 freemem_wait++; 1680 1681 cv_wait(&freemem_cv, &new_freemem_lock); 1682 1683 freemem_wait--; 1684 needfree -= npages; 1685 1686 mutex_exit(&new_freemem_lock); 1687 1688 TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SLEEP_END, 1689 "page_create_sleep_end: freemem %ld needfree %ld", 1690 freemem, needfree); 1691 1692 VM_STAT_ADD(page_create_not_enough_again); 1693 goto checkagain; 1694 } 1695 1696 /* 1697 * A routine to do the opposite of page_create_wait(). 1698 */ 1699 void 1700 page_create_putback(spgcnt_t npages) 1701 { 1702 struct pcf *p; 1703 pgcnt_t lump; 1704 uint_t *which; 1705 1706 /* 1707 * When a contiguous lump is broken up, we have to 1708 * deal with lots of pages (min 64) so lets spread 1709 * the wealth around. 1710 */ 1711 lump = roundup(npages, PCF_FANOUT) / PCF_FANOUT; 1712 freemem += npages; 1713 1714 for (p = pcf; (npages > 0) && (p < &pcf[PCF_FANOUT]); p++) { 1715 which = &p->pcf_count; 1716 1717 mutex_enter(&p->pcf_lock); 1718 1719 if (p->pcf_block) { 1720 which = &p->pcf_reserve; 1721 } 1722 1723 if (lump < npages) { 1724 *which += (uint_t)lump; 1725 npages -= lump; 1726 } else { 1727 *which += (uint_t)npages; 1728 npages = 0; 1729 } 1730 1731 if (p->pcf_wait) { 1732 mutex_enter(&new_freemem_lock); 1733 /* 1734 * Check to see if some other thread 1735 * is actually waiting. Another bucket 1736 * may have woken it up by now. If there 1737 * are no waiters, then set our pcf_wait 1738 * count to zero to avoid coming in here 1739 * next time. 
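 *
 * (When more than one page is being put back we cv_broadcast() so that
 * every sleeper in page_create_wait()/page_create_throttle() gets a
 * chance to re-check freemem; for a single page a cv_signal() is
 * enough.)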
1740 */ 1741 if (freemem_wait) { 1742 if (npages > 1) { 1743 cv_broadcast(&freemem_cv); 1744 } else { 1745 cv_signal(&freemem_cv); 1746 } 1747 p->pcf_wait--; 1748 } else { 1749 p->pcf_wait = 0; 1750 } 1751 mutex_exit(&new_freemem_lock); 1752 } 1753 mutex_exit(&p->pcf_lock); 1754 } 1755 ASSERT(npages == 0); 1756 } 1757 1758 /* 1759 * A helper routine for page_create_get_something. 1760 * The indenting got to deep down there. 1761 * Unblock the pcf counters. Any pages freed after 1762 * pcf_block got set are moved to pcf_count and 1763 * wakeups (cv_broadcast() or cv_signal()) are done as needed. 1764 */ 1765 static void 1766 pcgs_unblock(void) 1767 { 1768 int i; 1769 struct pcf *p; 1770 1771 /* Update freemem while we're here. */ 1772 freemem = 0; 1773 p = pcf; 1774 for (i = 0; i < PCF_FANOUT; i++) { 1775 mutex_enter(&p->pcf_lock); 1776 ASSERT(p->pcf_count == 0); 1777 p->pcf_count = p->pcf_reserve; 1778 p->pcf_block = 0; 1779 freemem += p->pcf_count; 1780 if (p->pcf_wait) { 1781 mutex_enter(&new_freemem_lock); 1782 if (freemem_wait) { 1783 if (p->pcf_reserve > 1) { 1784 cv_broadcast(&freemem_cv); 1785 p->pcf_wait = 0; 1786 } else { 1787 cv_signal(&freemem_cv); 1788 p->pcf_wait--; 1789 } 1790 } else { 1791 p->pcf_wait = 0; 1792 } 1793 mutex_exit(&new_freemem_lock); 1794 } 1795 p->pcf_reserve = 0; 1796 mutex_exit(&p->pcf_lock); 1797 p++; 1798 } 1799 } 1800 1801 /* 1802 * Called from page_create_va() when both the cache and free lists 1803 * have been checked once. 1804 * 1805 * Either returns a page or panics since the accounting was done 1806 * way before we got here. 1807 * 1808 * We don't come here often, so leave the accounting on permanently. 1809 */ 1810 1811 #define MAX_PCGS 100 1812 1813 #ifdef DEBUG 1814 #define PCGS_TRIES 100 1815 #else /* DEBUG */ 1816 #define PCGS_TRIES 10 1817 #endif /* DEBUG */ 1818 1819 #ifdef VM_STATS 1820 uint_t pcgs_counts[PCGS_TRIES]; 1821 uint_t pcgs_too_many; 1822 uint_t pcgs_entered; 1823 uint_t pcgs_entered_noreloc; 1824 uint_t pcgs_locked; 1825 uint_t pcgs_cagelocked; 1826 #endif /* VM_STATS */ 1827 1828 static page_t * 1829 page_create_get_something(vnode_t *vp, u_offset_t off, struct seg *seg, 1830 caddr_t vaddr, uint_t flags) 1831 { 1832 uint_t count; 1833 page_t *pp; 1834 uint_t locked, i; 1835 struct pcf *p; 1836 lgrp_t *lgrp; 1837 int cagelocked = 0; 1838 1839 VM_STAT_ADD(pcgs_entered); 1840 1841 /* 1842 * Tap any reserve freelists: if we fail now, we'll die 1843 * since the page(s) we're looking for have already been 1844 * accounted for. 1845 */ 1846 flags |= PG_PANIC; 1847 1848 if ((flags & PG_NORELOC) != 0) { 1849 VM_STAT_ADD(pcgs_entered_noreloc); 1850 /* 1851 * Requests for free pages from critical threads 1852 * such as pageout still won't throttle here, but 1853 * we must try again, to give the cageout thread 1854 * another chance to catch up. Since we already 1855 * accounted for the pages, we had better get them 1856 * this time. 1857 * 1858 * N.B. All non-critical threads acquire the pcgs_cagelock 1859 * to serialize access to the freelists. This implements a 1860 * turnstile-type synchornization to avoid starvation of 1861 * critical requests for PG_NORELOC memory by non-critical 1862 * threads: all non-critical threads must acquire a 'ticket' 1863 * before passing through, which entails making sure 1864 * kcage_freemem won't fall below minfree prior to grabbing 1865 * pages from the freelists. 
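 *
 * Reduced to a sketch (critical_request() and the headroom wait are
 * illustrative stand-ins; the real ticket check is the
 * kcage_create_throttle() call just below):
 *
 *	// non-critical callers first wait for cage headroom (their
 *	// "ticket"), then serialize on pcgs_cagelock
 *	if (!critical_request(flags)) {
 *		// ... wait until kcage_freemem has headroom ...
 *		mutex_enter(&pcgs_cagelock);
 *	}
 *	// ... take pages from the freelists ...
 *	if (!critical_request(flags))
 *		mutex_exit(&pcgs_cagelock);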
1866 */ 1867 if (kcage_create_throttle(1, flags) == KCT_NONCRIT) { 1868 mutex_enter(&pcgs_cagelock); 1869 cagelocked = 1; 1870 VM_STAT_ADD(pcgs_cagelocked); 1871 } 1872 } 1873 1874 /* 1875 * Time to get serious. 1876 * We failed to get a `correctly colored' page from both the 1877 * free and cache lists. 1878 * We escalate in stage. 1879 * 1880 * First try both lists without worring about color. 1881 * 1882 * Then, grab all page accounting locks (ie. pcf[]) and 1883 * steal any pages that they have and set the pcf_block flag to 1884 * stop deletions from the lists. This will help because 1885 * a page can get added to the free list while we are looking 1886 * at the cache list, then another page could be added to the cache 1887 * list allowing the page on the free list to be removed as we 1888 * move from looking at the cache list to the free list. This 1889 * could happen over and over. We would never find the page 1890 * we have accounted for. 1891 * 1892 * Noreloc pages are a subset of the global (relocatable) page pool. 1893 * They are not tracked separately in the pcf bins, so it is 1894 * impossible to know when doing pcf accounting if the available 1895 * page(s) are noreloc pages or not. When looking for a noreloc page 1896 * it is quite easy to end up here even if the global (relocatable) 1897 * page pool has plenty of free pages but the noreloc pool is empty. 1898 * 1899 * When the noreloc pool is empty (or low), additional noreloc pages 1900 * are created by converting pages from the global page pool. This 1901 * process will stall during pcf accounting if the pcf bins are 1902 * already locked. Such is the case when a noreloc allocation is 1903 * looping here in page_create_get_something waiting for more noreloc 1904 * pages to appear. 1905 * 1906 * Short of adding a new field to the pcf bins to accurately track 1907 * the number of free noreloc pages, we instead do not grab the 1908 * pcgs_lock, do not set the pcf blocks and do not timeout when 1909 * allocating a noreloc page. This allows noreloc allocations to 1910 * loop without blocking global page pool allocations. 1911 * 1912 * NOTE: the behaviour of page_create_get_something has not changed 1913 * for the case of global page pool allocations. 1914 */ 1915 1916 flags &= ~PG_MATCH_COLOR; 1917 locked = 0; 1918 #ifndef __sparc 1919 /* 1920 * page_create_get_something may be called because 4g memory may be 1921 * depleted. Set flags to allow for relocation of base page below 1922 * 4g if necessary. 1923 */ 1924 if (physmax4g) 1925 flags |= (PGI_PGCPSZC0 | PGI_PGCPHIPRI); 1926 #endif 1927 1928 lgrp = lgrp_mem_choose(seg, vaddr, PAGESIZE); 1929 1930 for (count = 0; kcage_on || count < MAX_PCGS; count++) { 1931 pp = page_get_freelist(vp, off, seg, vaddr, PAGESIZE, 1932 flags, lgrp); 1933 if (pp == NULL) { 1934 pp = page_get_cachelist(vp, off, seg, vaddr, 1935 flags, lgrp); 1936 } 1937 if (pp == NULL) { 1938 /* 1939 * Serialize. Don't fight with other pcgs(). 
1940 */ 1941 if (!locked && (!kcage_on || !(flags & PG_NORELOC))) { 1942 mutex_enter(&pcgs_lock); 1943 VM_STAT_ADD(pcgs_locked); 1944 locked = 1; 1945 p = pcf; 1946 for (i = 0; i < PCF_FANOUT; i++) { 1947 mutex_enter(&p->pcf_lock); 1948 ASSERT(p->pcf_block == 0); 1949 p->pcf_block = 1; 1950 p->pcf_reserve = p->pcf_count; 1951 p->pcf_count = 0; 1952 mutex_exit(&p->pcf_lock); 1953 p++; 1954 } 1955 freemem = 0; 1956 } 1957 1958 if (count) { 1959 /* 1960 * Since page_free() puts pages on 1961 * a list then accounts for it, we 1962 * just have to wait for page_free() 1963 * to unlock any page it was working 1964 * with. The page_lock()-page_reclaim() 1965 * path falls in the same boat. 1966 * 1967 * We don't need to check on the 1968 * PG_WAIT flag, we have already 1969 * accounted for the page we are 1970 * looking for in page_create_va(). 1971 * 1972 * We just wait a moment to let any 1973 * locked pages on the lists free up, 1974 * then continue around and try again. 1975 * 1976 * Will be awakened by set_freemem(). 1977 */ 1978 mutex_enter(&pcgs_wait_lock); 1979 cv_wait(&pcgs_cv, &pcgs_wait_lock); 1980 mutex_exit(&pcgs_wait_lock); 1981 } 1982 } else { 1983 #ifdef VM_STATS 1984 if (count >= PCGS_TRIES) { 1985 VM_STAT_ADD(pcgs_too_many); 1986 } else { 1987 VM_STAT_ADD(pcgs_counts[count]); 1988 } 1989 #endif 1990 if (locked) { 1991 pcgs_unblock(); 1992 mutex_exit(&pcgs_lock); 1993 } 1994 if (cagelocked) 1995 mutex_exit(&pcgs_cagelock); 1996 return (pp); 1997 } 1998 } 1999 /* 2000 * we go down holding the pcf locks. 2001 */ 2002 panic("no %spage found %d", 2003 ((flags & PG_NORELOC) ? "non-reloc " : ""), count); 2004 /*NOTREACHED*/ 2005 } 2006 2007 /* 2008 * Create enough pages for "bytes" worth of data starting at 2009 * "off" in "vp". 2010 * 2011 * Where flag must be one of: 2012 * 2013 * PG_EXCL: Exclusive create (fail if any page already 2014 * exists in the page cache) which does not 2015 * wait for memory to become available. 2016 * 2017 * PG_WAIT: Non-exclusive create which can wait for 2018 * memory to become available. 2019 * 2020 * PG_PHYSCONTIG: Allocate physically contiguous pages. 2021 * (Not Supported) 2022 * 2023 * A doubly linked list of pages is returned to the caller. Each page 2024 * on the list has the "exclusive" (p_selock) lock and "iolock" (p_iolock) 2025 * lock. 2026 * 2027 * Unable to change the parameters to page_create() in a minor release, 2028 * we renamed page_create() to page_create_va(), changed all known calls 2029 * from page_create() to page_create_va(), and created this wrapper. 2030 * 2031 * Upon a major release, we should break compatibility by deleting this 2032 * wrapper, and replacing all the strings "page_create_va", with "page_create". 2033 * 2034 * NOTE: There is a copy of this interface as page_create_io() in 2035 * i86/vm/vm_machdep.c. Any bugs fixed here should be applied 2036 * there. 2037 */ 2038 page_t * 2039 page_create(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags) 2040 { 2041 caddr_t random_vaddr; 2042 struct seg kseg; 2043 2044 #ifdef DEBUG 2045 cmn_err(CE_WARN, "Using deprecated interface page_create: caller %p", 2046 (void *)caller()); 2047 #endif 2048 2049 random_vaddr = (caddr_t)(((uintptr_t)vp >> 7) ^ 2050 (uintptr_t)(off >> PAGESHIFT)); 2051 kseg.s_as = &kas; 2052 2053 return (page_create_va(vp, off, bytes, flags, &kseg, random_vaddr)); 2054 } 2055 2056 #ifdef DEBUG 2057 uint32_t pg_alloc_pgs_mtbf = 0; 2058 #endif 2059 2060 /* 2061 * Used for large page support. 
It will attempt to allocate 2062 * a large page(s) off the freelist. 2063 * 2064 * Returns non zero on failure. 2065 */ 2066 int 2067 page_alloc_pages(struct seg *seg, caddr_t addr, page_t **basepp, 2068 page_t *ppa[], uint_t szc, int anypgsz) 2069 { 2070 pgcnt_t npgs, curnpgs, totpgs; 2071 size_t pgsz; 2072 page_t *pplist = NULL, *pp; 2073 int err = 0; 2074 lgrp_t *lgrp; 2075 2076 ASSERT(szc != 0 && szc <= (page_num_pagesizes() - 1)); 2077 2078 VM_STAT_ADD(alloc_pages[0]); 2079 2080 #ifdef DEBUG 2081 if (pg_alloc_pgs_mtbf && !(gethrtime() % pg_alloc_pgs_mtbf)) { 2082 return (ENOMEM); 2083 } 2084 #endif 2085 2086 pgsz = page_get_pagesize(szc); 2087 totpgs = curnpgs = npgs = pgsz >> PAGESHIFT; 2088 2089 ASSERT(((uintptr_t)addr & (pgsz - 1)) == 0); 2090 /* 2091 * One must be NULL but not both. 2092 * And one must be non NULL but not both. 2093 */ 2094 ASSERT(basepp != NULL || ppa != NULL); 2095 ASSERT(basepp == NULL || ppa == NULL); 2096 2097 (void) page_create_wait(npgs, PG_WAIT); 2098 2099 while (npgs && szc) { 2100 lgrp = lgrp_mem_choose(seg, addr, pgsz); 2101 pp = page_get_freelist(NULL, 0, seg, addr, pgsz, 0, lgrp); 2102 if (pp != NULL) { 2103 VM_STAT_ADD(alloc_pages[1]); 2104 page_list_concat(&pplist, &pp); 2105 ASSERT(npgs >= curnpgs); 2106 npgs -= curnpgs; 2107 } else if (anypgsz) { 2108 VM_STAT_ADD(alloc_pages[2]); 2109 szc--; 2110 pgsz = page_get_pagesize(szc); 2111 curnpgs = pgsz >> PAGESHIFT; 2112 } else { 2113 VM_STAT_ADD(alloc_pages[3]); 2114 ASSERT(npgs == totpgs); 2115 page_create_putback(npgs); 2116 return (ENOMEM); 2117 } 2118 } 2119 if (szc == 0) { 2120 VM_STAT_ADD(alloc_pages[4]); 2121 ASSERT(npgs != 0); 2122 page_create_putback(npgs); 2123 err = ENOMEM; 2124 } else if (basepp != NULL) { 2125 ASSERT(npgs == 0); 2126 ASSERT(ppa == NULL); 2127 *basepp = pplist; 2128 } 2129 2130 npgs = totpgs - npgs; 2131 pp = pplist; 2132 2133 /* 2134 * Clear the free and age bits. Also if we were passed in a ppa then 2135 * fill it in with all the constituent pages from the large page. But 2136 * if we failed to allocate all the pages just free what we got. 2137 */ 2138 while (npgs != 0) { 2139 ASSERT(PP_ISFREE(pp)); 2140 ASSERT(PP_ISAGED(pp)); 2141 if (ppa != NULL || err != 0) { 2142 if (err == 0) { 2143 VM_STAT_ADD(alloc_pages[5]); 2144 PP_CLRFREE(pp); 2145 PP_CLRAGED(pp); 2146 page_sub(&pplist, pp); 2147 *ppa++ = pp; 2148 npgs--; 2149 } else { 2150 VM_STAT_ADD(alloc_pages[6]); 2151 ASSERT(pp->p_szc != 0); 2152 curnpgs = page_get_pagecnt(pp->p_szc); 2153 page_list_break(&pp, &pplist, curnpgs); 2154 page_list_add_pages(pp, 0); 2155 page_create_putback(curnpgs); 2156 ASSERT(npgs >= curnpgs); 2157 npgs -= curnpgs; 2158 } 2159 pp = pplist; 2160 } else { 2161 VM_STAT_ADD(alloc_pages[7]); 2162 PP_CLRFREE(pp); 2163 PP_CLRAGED(pp); 2164 pp = pp->p_next; 2165 npgs--; 2166 } 2167 } 2168 return (err); 2169 } 2170 2171 /* 2172 * Get a single large page off of the freelists, and set it up for use. 2173 * Number of bytes requested must be a supported page size. 2174 * 2175 * Note that this call may fail even if there is sufficient 2176 * memory available or PG_WAIT is set, so the caller must 2177 * be willing to fallback on page_create_va(), block and retry, 2178 * or fail the requester. 
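 *
 * A minimal sketch of the fallback pattern described above. The
 * wrapper and its name (get_contig_page) are hypothetical; only
 * page_create_va_large() and page_create_va() come from this file,
 * `bytes' is assumed to be a supported large page size, and the
 * flag combination shown is just one plausible choice:
 *
 *	page_t *
 *	get_contig_page(vnode_t *vp, u_offset_t off, size_t bytes,
 *	    struct seg *seg, caddr_t vaddr)
 *	{
 *		page_t *pp;
 *
 *		pp = page_create_va_large(vp, off, bytes,
 *		    PG_EXCL | PG_WAIT, seg, vaddr, NULL);
 *		if (pp == NULL) {
 *			pp = page_create_va(vp, off, bytes,
 *			    PG_EXCL | PG_WAIT, seg, vaddr);
 *		}
 *		return (pp);
 *	}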
2179 */ 2180 page_t * 2181 page_create_va_large(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags, 2182 struct seg *seg, caddr_t vaddr, void *arg) 2183 { 2184 pgcnt_t npages, pcftotal; 2185 page_t *pp; 2186 page_t *rootpp; 2187 lgrp_t *lgrp; 2188 uint_t enough; 2189 uint_t pcf_index; 2190 uint_t i; 2191 struct pcf *p; 2192 struct pcf *q; 2193 lgrp_id_t *lgrpid = (lgrp_id_t *)arg; 2194 2195 ASSERT(vp != NULL); 2196 2197 ASSERT((flags & ~(PG_EXCL | PG_WAIT | 2198 PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == 0); 2199 /* but no others */ 2200 2201 ASSERT((flags & PG_EXCL) == PG_EXCL); 2202 2203 npages = btop(bytes); 2204 2205 if (!kcage_on || panicstr) { 2206 /* 2207 * Cage is OFF, or we are single threaded in 2208 * panic, so make everything a RELOC request. 2209 */ 2210 flags &= ~PG_NORELOC; 2211 } 2212 2213 /* 2214 * Make sure there's adequate physical memory available. 2215 * Note: PG_WAIT is ignored here. 2216 */ 2217 if (freemem <= throttlefree + npages) { 2218 VM_STAT_ADD(page_create_large_cnt[1]); 2219 return (NULL); 2220 } 2221 2222 /* 2223 * If cage is on, dampen draw from cage when available 2224 * cage space is low. 2225 */ 2226 if ((flags & (PG_NORELOC | PG_WAIT)) == (PG_NORELOC | PG_WAIT) && 2227 kcage_freemem < kcage_throttlefree + npages) { 2228 2229 /* 2230 * The cage is on, the caller wants PG_NORELOC 2231 * pages and available cage memory is very low. 2232 * Call kcage_create_throttle() to attempt to 2233 * control demand on the cage. 2234 */ 2235 if (kcage_create_throttle(npages, flags) == KCT_FAILURE) { 2236 VM_STAT_ADD(page_create_large_cnt[2]); 2237 return (NULL); 2238 } 2239 } 2240 2241 enough = 0; 2242 pcf_index = PCF_INDEX(); 2243 p = &pcf[pcf_index]; 2244 p->pcf_touch = 1; 2245 q = &pcf[PCF_FANOUT]; 2246 for (pcftotal = 0, i = 0; i < PCF_FANOUT; i++) { 2247 if (p->pcf_count > npages) { 2248 /* 2249 * a good one to try. 2250 */ 2251 mutex_enter(&p->pcf_lock); 2252 if (p->pcf_count > npages) { 2253 p->pcf_count -= (uint_t)npages; 2254 /* 2255 * freemem is not protected by any lock. 2256 * Thus, we cannot have any assertion 2257 * containing freemem here. 2258 */ 2259 freemem -= npages; 2260 enough = 1; 2261 mutex_exit(&p->pcf_lock); 2262 break; 2263 } 2264 mutex_exit(&p->pcf_lock); 2265 } 2266 pcftotal += p->pcf_count; 2267 p++; 2268 if (p >= q) { 2269 p = pcf; 2270 } 2271 p->pcf_touch = 1; 2272 } 2273 2274 if (!enough) { 2275 /* If there isn't enough memory available, give up. */ 2276 if (pcftotal < npages) { 2277 VM_STAT_ADD(page_create_large_cnt[3]); 2278 return (NULL); 2279 } 2280 2281 /* try to collect pages from several pcf bins */ 2282 for (p = pcf, pcftotal = 0, i = 0; i < PCF_FANOUT; i++) { 2283 p->pcf_touch = 1; 2284 mutex_enter(&p->pcf_lock); 2285 pcftotal += p->pcf_count; 2286 if (pcftotal >= npages) { 2287 /* 2288 * Wow! There are enough pages laying around 2289 * to satisfy the request. Do the accounting, 2290 * drop the locks we acquired, and go back. 2291 * 2292 * freemem is not protected by any lock. So, 2293 * we cannot have any assertion containing 2294 * freemem. 
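 *
 * A worked example of the accounting below (the bin contents are made
 * up): with npages == 5 and the first three pcf bins holding
 * {3, 0, 4} free pages, the forward scan locks bins until the running
 * total reaches 5 at the third bin. The backward walk then takes 4
 * pages from the third bin, 0 from the second and 1 from the first,
 * leaving {2, 0, 0} with tpages == 0, and every pcf lock taken on the
 * way in is dropped on the way back out.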
2295 */ 2296 pgcnt_t tpages = npages; 2297 freemem -= npages; 2298 while (p >= pcf) { 2299 if (p->pcf_count <= tpages) { 2300 tpages -= p->pcf_count; 2301 p->pcf_count = 0; 2302 } else { 2303 p->pcf_count -= (uint_t)tpages; 2304 tpages = 0; 2305 } 2306 mutex_exit(&p->pcf_lock); 2307 p--; 2308 } 2309 ASSERT(tpages == 0); 2310 break; 2311 } 2312 p++; 2313 } 2314 if (i == PCF_FANOUT) { 2315 /* failed to collect pages - release the locks */ 2316 while (--p >= pcf) { 2317 mutex_exit(&p->pcf_lock); 2318 } 2319 VM_STAT_ADD(page_create_large_cnt[4]); 2320 return (NULL); 2321 } 2322 } 2323 2324 /* 2325 * This is where this function behaves fundamentally differently 2326 * than page_create_va(); since we're intending to map the page 2327 * with a single TTE, we have to get it as a physically contiguous 2328 * hardware pagesize chunk. If we can't, we fail. 2329 */ 2330 if (lgrpid != NULL && *lgrpid >= 0 && *lgrpid <= lgrp_alloc_max && 2331 LGRP_EXISTS(lgrp_table[*lgrpid])) 2332 lgrp = lgrp_table[*lgrpid]; 2333 else 2334 lgrp = lgrp_mem_choose(seg, vaddr, bytes); 2335 2336 if ((rootpp = page_get_freelist(&kvp, off, seg, vaddr, 2337 bytes, flags & ~PG_MATCH_COLOR, lgrp)) == NULL) { 2338 page_create_putback(npages); 2339 VM_STAT_ADD(page_create_large_cnt[5]); 2340 return (NULL); 2341 } 2342 2343 /* 2344 * If we got the page with the wrong mtype, give it back; this is a 2345 * workaround for CR 6249718. Once CR 6249718 is fixed we should never 2346 * get inside this "if", and the workaround becomes just a nop. 2347 */ 2348 if (kcage_on && (flags & PG_NORELOC) && !PP_ISNORELOC(rootpp)) { 2349 page_list_add_pages(rootpp, 0); 2350 page_create_putback(npages); 2351 VM_STAT_ADD(page_create_large_cnt[6]); 2352 return (NULL); 2353 } 2354 2355 /* 2356 * If satisfying this request has left us with too little 2357 * memory, start the wheels turning to get some back. The 2358 * first clause of the test prevents waking up the pageout 2359 * daemon in situations where it would decide that there's 2360 * nothing to do.
2361 */ 2362 if (nscan < desscan && freemem < minfree) { 2363 TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL, 2364 "pageout_cv_signal:freemem %ld", freemem); 2365 cv_signal(&proc_pageout->p_cv); 2366 } 2367 2368 pp = rootpp; 2369 while (npages--) { 2370 ASSERT(PAGE_EXCL(pp)); 2371 ASSERT(pp->p_vnode == NULL); 2372 ASSERT(!hat_page_is_mapped(pp)); 2373 PP_CLRFREE(pp); 2374 PP_CLRAGED(pp); 2375 if (!page_hashin(pp, vp, off, NULL)) 2376 panic("page_create_large: hashin failed: page %p", 2377 (void *)pp); 2378 page_io_lock(pp); 2379 off += PAGESIZE; 2380 pp = pp->p_next; 2381 } 2382 2383 VM_STAT_ADD(page_create_large_cnt[0]); 2384 return (rootpp); 2385 } 2386 2387 page_t * 2388 page_create_va(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags, 2389 struct seg *seg, caddr_t vaddr) 2390 { 2391 page_t *plist = NULL; 2392 pgcnt_t npages; 2393 pgcnt_t found_on_free = 0; 2394 pgcnt_t pages_req; 2395 page_t *npp = NULL; 2396 uint_t enough; 2397 uint_t i; 2398 uint_t pcf_index; 2399 struct pcf *p; 2400 struct pcf *q; 2401 lgrp_t *lgrp; 2402 2403 TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_START, 2404 "page_create_start:vp %p off %llx bytes %lu flags %x", 2405 vp, off, bytes, flags); 2406 2407 ASSERT(bytes != 0 && vp != NULL); 2408 2409 if ((flags & PG_EXCL) == 0 && (flags & PG_WAIT) == 0) { 2410 panic("page_create: invalid flags"); 2411 /*NOTREACHED*/ 2412 } 2413 ASSERT((flags & ~(PG_EXCL | PG_WAIT | 2414 PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == 0); 2415 /* but no others */ 2416 2417 pages_req = npages = btopr(bytes); 2418 /* 2419 * Try to see whether request is too large to *ever* be 2420 * satisfied, in order to prevent deadlock. We arbitrarily 2421 * decide to limit maximum size requests to max_page_get. 2422 */ 2423 if (npages >= max_page_get) { 2424 if ((flags & PG_WAIT) == 0) { 2425 TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_TOOBIG, 2426 "page_create_toobig:vp %p off %llx npages " 2427 "%lu max_page_get %lu", 2428 vp, off, npages, max_page_get); 2429 return (NULL); 2430 } else { 2431 cmn_err(CE_WARN, 2432 "Request for too much kernel memory " 2433 "(%lu bytes), will hang forever", bytes); 2434 for (;;) 2435 delay(1000000000); 2436 } 2437 } 2438 2439 if (!kcage_on || panicstr) { 2440 /* 2441 * Cage is OFF, or we are single threaded in 2442 * panic, so make everything a RELOC request. 2443 */ 2444 flags &= ~PG_NORELOC; 2445 } 2446 2447 if (freemem <= throttlefree + npages) 2448 if (!page_create_throttle(npages, flags)) 2449 return (NULL); 2450 2451 /* 2452 * If cage is on, dampen draw from cage when available 2453 * cage space is low. 2454 */ 2455 if ((flags & PG_NORELOC) && 2456 kcage_freemem < kcage_throttlefree + npages) { 2457 2458 /* 2459 * The cage is on, the caller wants PG_NORELOC 2460 * pages and available cage memory is very low. 2461 * Call kcage_create_throttle() to attempt to 2462 * control demand on the cage. 2463 */ 2464 if (kcage_create_throttle(npages, flags) == KCT_FAILURE) 2465 return (NULL); 2466 } 2467 2468 VM_STAT_ADD(page_create_cnt[0]); 2469 2470 enough = 0; 2471 pcf_index = PCF_INDEX(); 2472 2473 p = &pcf[pcf_index]; 2474 p->pcf_touch = 1; 2475 q = &pcf[PCF_FANOUT]; 2476 for (i = 0; i < PCF_FANOUT; i++) { 2477 if (p->pcf_count > npages) { 2478 /* 2479 * a good one to try. 2480 */ 2481 mutex_enter(&p->pcf_lock); 2482 if (p->pcf_count > npages) { 2483 p->pcf_count -= (uint_t)npages; 2484 /* 2485 * freemem is not protected by any lock. 2486 * Thus, we cannot have any assertion 2487 * containing freemem here. 
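 *
 * The loop above is the lightly contended fast path: each thread
 * starts at its own pcf bin (PCF_INDEX()) and only takes pages from a
 * single bin, so the common case touches one mutex. In rough
 * pseudocode (illustrative only; the real loop is above):
 *
 *	p = &pcf[PCF_INDEX()];
 *	for each bin, starting at p and wrapping around once:
 *		if (p->pcf_count > npages) {
 *			lock, re-check, p->pcf_count -= npages;
 *			freemem -= npages;  enough = 1;  break;
 *		}
 *
 * Only when no single bin can cover the request does the code fall
 * through to page_create_wait(), which may coalesce several bins.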
2488 */ 2489 freemem -= npages; 2490 enough = 1; 2491 mutex_exit(&p->pcf_lock); 2492 break; 2493 } 2494 mutex_exit(&p->pcf_lock); 2495 } 2496 p++; 2497 if (p >= q) { 2498 p = pcf; 2499 } 2500 p->pcf_touch = 1; 2501 } 2502 2503 if (!enough) { 2504 /* 2505 * Have to look harder. If npages is greater than 2506 * one, then we might have to coalecse the counters. 2507 * 2508 * Go wait. We come back having accounted 2509 * for the memory. 2510 */ 2511 VM_STAT_ADD(page_create_cnt[1]); 2512 if (!page_create_wait(npages, flags)) { 2513 VM_STAT_ADD(page_create_cnt[2]); 2514 return (NULL); 2515 } 2516 } 2517 2518 TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SUCCESS, 2519 "page_create_success:vp %p off %llx", vp, off); 2520 2521 /* 2522 * If satisfying this request has left us with too little 2523 * memory, start the wheels turning to get some back. The 2524 * first clause of the test prevents waking up the pageout 2525 * daemon in situations where it would decide that there's 2526 * nothing to do. 2527 */ 2528 if (nscan < desscan && freemem < minfree) { 2529 TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL, 2530 "pageout_cv_signal:freemem %ld", freemem); 2531 cv_signal(&proc_pageout->p_cv); 2532 } 2533 2534 /* 2535 * Loop around collecting the requested number of pages. 2536 * Most of the time, we have to `create' a new page. With 2537 * this in mind, pull the page off the free list before 2538 * getting the hash lock. This will minimize the hash 2539 * lock hold time, nesting, and the like. If it turns 2540 * out we don't need the page, we put it back at the end. 2541 */ 2542 while (npages--) { 2543 page_t *pp; 2544 kmutex_t *phm = NULL; 2545 ulong_t index; 2546 2547 index = PAGE_HASH_FUNC(vp, off); 2548 top: 2549 ASSERT(phm == NULL); 2550 ASSERT(index == PAGE_HASH_FUNC(vp, off)); 2551 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); 2552 2553 if (npp == NULL) { 2554 /* 2555 * Try to get a page from the freelist (ie, 2556 * a page with no [vp, off] tag). If that 2557 * fails, use the cachelist. 2558 * 2559 * During the first attempt at both the free 2560 * and cache lists we try for the correct color. 2561 */ 2562 /* 2563 * XXXX-how do we deal with virtual indexed 2564 * caches and and colors? 2565 */ 2566 VM_STAT_ADD(page_create_cnt[4]); 2567 /* 2568 * Get lgroup to allocate next page of shared memory 2569 * from and use it to specify where to allocate 2570 * the physical memory 2571 */ 2572 lgrp = lgrp_mem_choose(seg, vaddr, PAGESIZE); 2573 npp = page_get_freelist(vp, off, seg, vaddr, PAGESIZE, 2574 flags | PG_MATCH_COLOR, lgrp); 2575 if (npp == NULL) { 2576 npp = page_get_cachelist(vp, off, seg, 2577 vaddr, flags | PG_MATCH_COLOR, lgrp); 2578 if (npp == NULL) { 2579 npp = page_create_get_something(vp, 2580 off, seg, vaddr, 2581 flags & ~PG_MATCH_COLOR); 2582 } 2583 2584 if (PP_ISAGED(npp) == 0) { 2585 /* 2586 * Since this page came from the 2587 * cachelist, we must destroy the 2588 * old vnode association. 2589 */ 2590 page_hashout(npp, NULL); 2591 } 2592 } 2593 } 2594 2595 /* 2596 * We own this page! 2597 */ 2598 ASSERT(PAGE_EXCL(npp)); 2599 ASSERT(npp->p_vnode == NULL); 2600 ASSERT(!hat_page_is_mapped(npp)); 2601 PP_CLRFREE(npp); 2602 PP_CLRAGED(npp); 2603 2604 /* 2605 * Here we have a page in our hot little mits and are 2606 * just waiting to stuff it on the appropriate lists. 2607 * Get the mutex and check to see if it really does 2608 * not exist. 
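 *
 * What follows is the classic lookup-or-insert pattern, compressed
 * here for orientation (error paths, the PG_EXCL handling and the
 * stats are omitted; the authoritative version is the code below):
 *
 *	phm = PAGE_HASH_MUTEX(index);
 *	mutex_enter(phm);
 *	PAGE_HASH_SEARCH(index, pp, vp, off);
 *	if (pp == NULL) {
 *		pp = npp;		our new page wins the race
 *		(void) page_hashin(pp, vp, off, phm);
 *	} else if (!page_lock(pp, SE_EXCL, phm, P_NO_RECLAIM)) {
 *		goto top;		lost a race; start over
 *	}
 *	mutex_exit(phm);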
2609 */ 2610 phm = PAGE_HASH_MUTEX(index); 2611 mutex_enter(phm); 2612 PAGE_HASH_SEARCH(index, pp, vp, off); 2613 if (pp == NULL) { 2614 VM_STAT_ADD(page_create_new); 2615 pp = npp; 2616 npp = NULL; 2617 if (!page_hashin(pp, vp, off, phm)) { 2618 /* 2619 * Since we hold the page hash mutex and 2620 * just searched for this page, page_hashin 2621 * had better not fail. If it does, that 2622 * means somethread did not follow the 2623 * page hash mutex rules. Panic now and 2624 * get it over with. As usual, go down 2625 * holding all the locks. 2626 */ 2627 ASSERT(MUTEX_HELD(phm)); 2628 panic("page_create: " 2629 "hashin failed %p %p %llx %p", 2630 (void *)pp, (void *)vp, off, (void *)phm); 2631 /*NOTREACHED*/ 2632 } 2633 ASSERT(MUTEX_HELD(phm)); 2634 mutex_exit(phm); 2635 phm = NULL; 2636 2637 /* 2638 * Hat layer locking need not be done to set 2639 * the following bits since the page is not hashed 2640 * and was on the free list (i.e., had no mappings). 2641 * 2642 * Set the reference bit to protect 2643 * against immediate pageout 2644 * 2645 * XXXmh modify freelist code to set reference 2646 * bit so we don't have to do it here. 2647 */ 2648 page_set_props(pp, P_REF); 2649 found_on_free++; 2650 } else { 2651 VM_STAT_ADD(page_create_exists); 2652 if (flags & PG_EXCL) { 2653 /* 2654 * Found an existing page, and the caller 2655 * wanted all new pages. Undo all of the work 2656 * we have done. 2657 */ 2658 mutex_exit(phm); 2659 phm = NULL; 2660 while (plist != NULL) { 2661 pp = plist; 2662 page_sub(&plist, pp); 2663 page_io_unlock(pp); 2664 /* large pages should not end up here */ 2665 ASSERT(pp->p_szc == 0); 2666 /*LINTED: constant in conditional ctx*/ 2667 VN_DISPOSE(pp, B_INVAL, 0, kcred); 2668 } 2669 VM_STAT_ADD(page_create_found_one); 2670 goto fail; 2671 } 2672 ASSERT(flags & PG_WAIT); 2673 if (!page_lock(pp, SE_EXCL, phm, P_NO_RECLAIM)) { 2674 /* 2675 * Start all over again if we blocked trying 2676 * to lock the page. 2677 */ 2678 mutex_exit(phm); 2679 VM_STAT_ADD(page_create_page_lock_failed); 2680 phm = NULL; 2681 goto top; 2682 } 2683 mutex_exit(phm); 2684 phm = NULL; 2685 2686 if (PP_ISFREE(pp)) { 2687 ASSERT(PP_ISAGED(pp) == 0); 2688 VM_STAT_ADD(pagecnt.pc_get_cache); 2689 page_list_sub(pp, PG_CACHE_LIST); 2690 PP_CLRFREE(pp); 2691 found_on_free++; 2692 } 2693 } 2694 2695 /* 2696 * Got a page! It is locked. Acquire the i/o 2697 * lock since we are going to use the p_next and 2698 * p_prev fields to link the requested pages together. 2699 */ 2700 page_io_lock(pp); 2701 page_add(&plist, pp); 2702 plist = plist->p_next; 2703 off += PAGESIZE; 2704 vaddr += PAGESIZE; 2705 } 2706 2707 ASSERT((flags & PG_EXCL) ? (found_on_free == pages_req) : 1); 2708 fail: 2709 if (npp != NULL) { 2710 /* 2711 * Did not need this page after all. 2712 * Put it back on the free list. 
2713 */ 2714 VM_STAT_ADD(page_create_putbacks); 2715 PP_SETFREE(npp); 2716 PP_SETAGED(npp); 2717 npp->p_offset = (u_offset_t)-1; 2718 page_list_add(npp, PG_FREE_LIST | PG_LIST_TAIL); 2719 page_unlock(npp); 2720 2721 } 2722 2723 ASSERT(pages_req >= found_on_free); 2724 2725 { 2726 uint_t overshoot = (uint_t)(pages_req - found_on_free); 2727 2728 if (overshoot) { 2729 VM_STAT_ADD(page_create_overshoot); 2730 p = &pcf[pcf_index]; 2731 p->pcf_touch = 1; 2732 mutex_enter(&p->pcf_lock); 2733 if (p->pcf_block) { 2734 p->pcf_reserve += overshoot; 2735 } else { 2736 p->pcf_count += overshoot; 2737 if (p->pcf_wait) { 2738 mutex_enter(&new_freemem_lock); 2739 if (freemem_wait) { 2740 cv_signal(&freemem_cv); 2741 p->pcf_wait--; 2742 } else { 2743 p->pcf_wait = 0; 2744 } 2745 mutex_exit(&new_freemem_lock); 2746 } 2747 } 2748 mutex_exit(&p->pcf_lock); 2749 /* freemem is approximate, so this test OK */ 2750 if (!p->pcf_block) 2751 freemem += overshoot; 2752 } 2753 } 2754 2755 return (plist); 2756 } 2757 2758 /* 2759 * One or more constituent pages of this large page has been marked 2760 * toxic. Simply demote the large page to PAGESIZE pages and let 2761 * page_free() handle it. This routine should only be called by 2762 * large page free routines (page_free_pages() and page_destroy_pages(). 2763 * All pages are locked SE_EXCL and have already been marked free. 2764 */ 2765 static void 2766 page_free_toxic_pages(page_t *rootpp) 2767 { 2768 page_t *tpp; 2769 pgcnt_t i, pgcnt = page_get_pagecnt(rootpp->p_szc); 2770 uint_t szc = rootpp->p_szc; 2771 2772 for (i = 0, tpp = rootpp; i < pgcnt; i++, tpp = tpp->p_next) { 2773 ASSERT(tpp->p_szc == szc); 2774 ASSERT((PAGE_EXCL(tpp) && 2775 !page_iolock_assert(tpp)) || panicstr); 2776 tpp->p_szc = 0; 2777 } 2778 2779 while (rootpp != NULL) { 2780 tpp = rootpp; 2781 page_sub(&rootpp, tpp); 2782 ASSERT(PP_ISFREE(tpp)); 2783 PP_CLRFREE(tpp); 2784 page_free(tpp, 1); 2785 } 2786 } 2787 2788 /* 2789 * Put page on the "free" list. 2790 * The free list is really two lists maintained by 2791 * the PSM of whatever machine we happen to be on. 2792 */ 2793 void 2794 page_free(page_t *pp, int dontneed) 2795 { 2796 struct pcf *p; 2797 uint_t pcf_index; 2798 2799 ASSERT((PAGE_EXCL(pp) && 2800 !page_iolock_assert(pp)) || panicstr); 2801 2802 if (page_deteriorating(pp)) { 2803 volatile int i = 0; 2804 char *kaddr; 2805 volatile int rb, wb; 2806 uint64_t pa; 2807 volatile int ue = 0; 2808 on_trap_data_t otd; 2809 2810 if (pp->p_vnode != NULL) { 2811 /* 2812 * Let page_destroy() do its bean counting and 2813 * hash out the page; it will then call back 2814 * into page_free() with pp->p_vnode == NULL. 2815 */ 2816 page_destroy(pp, 0); 2817 return; 2818 } 2819 2820 if (page_isfailing(pp)) { 2821 /* 2822 * If we have already exceeded the limit for 2823 * pages retired, we will treat this page as 2824 * 'toxic' rather than failing. That will ensure 2825 * that the page is at least cleaned, and if 2826 * a UE is detected, the page will be retired 2827 * anyway. 
2828 */ 2829 if (pages_retired_limit_exceeded()) { 2830 /* 2831 * clear the flag and reset to toxic 2832 */ 2833 page_clrtoxic(pp); 2834 page_settoxic(pp, PAGE_IS_TOXIC); 2835 } else { 2836 pa = ptob((uint64_t)page_pptonum(pp)); 2837 if (page_retire_messages) { 2838 cmn_err(CE_NOTE, "Page 0x%08x.%08x " 2839 "removed from service", 2840 (uint32_t)(pa >> 32), (uint32_t)pa); 2841 } 2842 goto page_failed; 2843 } 2844 } 2845 2846 pagescrub(pp, 0, PAGESIZE); 2847 2848 /* 2849 * We want to determine whether the error that occurred on 2850 * this page is transient or persistent, so we get a mapping 2851 * to the page and try every possible bit pattern to compare 2852 * what we write with what we read back. A smaller number 2853 * of bit patterns might suffice, but there's no point in 2854 * getting fancy. If this is the hot path on your system, 2855 * you've got bigger problems. 2856 */ 2857 kaddr = ppmapin(pp, PROT_READ | PROT_WRITE, (caddr_t)-1); 2858 for (wb = 0xff; wb >= 0; wb--) { 2859 if (on_trap(&otd, OT_DATA_EC)) { 2860 pa = ptob((uint64_t)page_pptonum(pp)) + i; 2861 page_settoxic(pp, PAGE_IS_FAILING); 2862 2863 if (page_retire_messages) { 2864 cmn_err(CE_WARN, "Uncorrectable Error " 2865 "occurred at PA 0x%08x.%08x while " 2866 "attempting to clear previously " 2867 "reported error; page removed from " 2868 "service", (uint32_t)(pa >> 32), 2869 (uint32_t)pa); 2870 } 2871 2872 ue++; 2873 break; 2874 } 2875 2876 /* 2877 * Write out the bit pattern, flush it to memory, and 2878 * read it back while under on_trap() protection. 2879 */ 2880 for (i = 0; i < PAGESIZE; i++) 2881 kaddr[i] = wb; 2882 2883 sync_data_memory(kaddr, PAGESIZE); 2884 2885 for (i = 0; i < PAGESIZE; i++) { 2886 if ((rb = (uchar_t)kaddr[i]) != wb) { 2887 page_settoxic(pp, PAGE_IS_FAILING); 2888 goto out; 2889 } 2890 } 2891 } 2892 out: 2893 no_trap(); 2894 ppmapout(kaddr); 2895 2896 if (wb >= 0 && !ue) { 2897 pa = ptob((uint64_t)page_pptonum(pp)) + i; 2898 if (page_retire_messages) { 2899 cmn_err(CE_WARN, "Data Mismatch occurred at PA " 2900 "0x%08x.%08x [ 0x%x != 0x%x ] while " 2901 "attempting to clear previously reported " 2902 "error; page removed from service", 2903 (uint32_t)(pa >> 32), (uint32_t)pa, rb, wb); 2904 } 2905 } 2906 page_failed: 2907 /* 2908 * DR operations change the association between a page_t 2909 * and the physical page it represents. Check if the 2910 * page is still bad. If it is, then retire it. 2911 */ 2912 if (page_isfaulty(pp) && page_isfailing(pp)) { 2913 /* 2914 * In the future, it might be useful to have a platform 2915 * callback here to tell the hardware to fence off this 2916 * page during the next reboot. 2917 * 2918 * We move the page to the retired_vnode here 2919 */ 2920 (void) page_hashin(pp, &retired_ppages, 2921 (u_offset_t)ptob((uint64_t)page_pptonum(pp)), NULL); 2922 mutex_enter(&freemem_lock); 2923 availrmem--; 2924 mutex_exit(&freemem_lock); 2925 page_retired(pp); 2926 page_downgrade(pp); 2927 2928 /* 2929 * If DR raced with the above page retirement code, 2930 * we might have retired a good page. If so, unretire 2931 * the page. 
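 *
 * For orientation, the retirement path just taken can be summarized
 * by the following sketch (simplified; the code above is
 * authoritative):
 *
 *	scrub the page, then write, flush and read back each byte
 *	    pattern under on_trap() protection;
 *	if (a UE or a data mismatch was seen)
 *		mark the page PAGE_IS_FAILING;
 *	if (page_isfaulty(pp) && page_isfailing(pp)) {
 *		hash the page onto retired_ppages;
 *		availrmem--;		page leaves the usable pool
 *		page_retired(pp);
 *		page_downgrade(pp);
 *	}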
2932 */ 2933 if (!page_isfaulty(pp)) 2934 page_unretire_pages(); 2935 return; 2936 } 2937 2938 pa = ptob((uint64_t)page_pptonum(pp)); 2939 2940 if (page_retire_messages) { 2941 cmn_err(CE_NOTE, "Previously reported error on page " 2942 "0x%08x.%08x cleared", (uint32_t)(pa >> 32), 2943 (uint32_t)pa); 2944 } 2945 2946 page_clrtoxic(pp); 2947 } 2948 2949 if (PP_ISFREE(pp)) { 2950 panic("page_free: page %p is free", (void *)pp); 2951 } 2952 2953 if (pp->p_szc != 0) { 2954 if (pp->p_vnode == NULL || IS_SWAPFSVP(pp->p_vnode) || 2955 pp->p_vnode == &kvp) { 2956 panic("page_free: anon or kernel " 2957 "or no vnode large page %p", (void *)pp); 2958 } 2959 page_demote_vp_pages(pp); 2960 ASSERT(pp->p_szc == 0); 2961 } 2962 2963 /* 2964 * The page_struct_lock need not be acquired to examine these 2965 * fields since the page has an "exclusive" lock. 2966 */ 2967 if (hat_page_is_mapped(pp) || pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { 2968 panic("page_free pp=%p, pfn=%lx, lckcnt=%d, cowcnt=%d", 2969 pp, page_pptonum(pp), pp->p_lckcnt, pp->p_cowcnt); 2970 /*NOTREACHED*/ 2971 } 2972 2973 ASSERT(!hat_page_getshare(pp)); 2974 2975 PP_SETFREE(pp); 2976 ASSERT(pp->p_vnode == NULL || !IS_VMODSORT(pp->p_vnode) || 2977 !hat_ismod(pp)); 2978 page_clr_all_props(pp); 2979 ASSERT(!hat_page_getshare(pp)); 2980 2981 /* 2982 * Now we add the page to the head of the free list. 2983 * But if this page is associated with a paged vnode 2984 * then we adjust the head forward so that the page is 2985 * effectively at the end of the list. 2986 */ 2987 if (pp->p_vnode == NULL) { 2988 /* 2989 * Page has no identity, put it on the free list. 2990 */ 2991 PP_SETAGED(pp); 2992 pp->p_offset = (u_offset_t)-1; 2993 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL); 2994 VM_STAT_ADD(pagecnt.pc_free_free); 2995 TRACE_1(TR_FAC_VM, TR_PAGE_FREE_FREE, 2996 "page_free_free:pp %p", pp); 2997 } else { 2998 PP_CLRAGED(pp); 2999 3000 if (!dontneed || nopageage) { 3001 /* move it to the tail of the list */ 3002 page_list_add(pp, PG_CACHE_LIST | PG_LIST_TAIL); 3003 3004 VM_STAT_ADD(pagecnt.pc_free_cache); 3005 TRACE_1(TR_FAC_VM, TR_PAGE_FREE_CACHE_TAIL, 3006 "page_free_cache_tail:pp %p", pp); 3007 } else { 3008 page_list_add(pp, PG_CACHE_LIST | PG_LIST_HEAD); 3009 3010 VM_STAT_ADD(pagecnt.pc_free_dontneed); 3011 TRACE_1(TR_FAC_VM, TR_PAGE_FREE_CACHE_HEAD, 3012 "page_free_cache_head:pp %p", pp); 3013 } 3014 } 3015 page_unlock(pp); 3016 3017 /* 3018 * Now do the `freemem' accounting. 3019 */ 3020 pcf_index = PCF_INDEX(); 3021 p = &pcf[pcf_index]; 3022 p->pcf_touch = 1; 3023 3024 mutex_enter(&p->pcf_lock); 3025 if (p->pcf_block) { 3026 p->pcf_reserve += 1; 3027 } else { 3028 p->pcf_count += 1; 3029 if (p->pcf_wait) { 3030 mutex_enter(&new_freemem_lock); 3031 /* 3032 * Check to see if some other thread 3033 * is actually waiting. Another bucket 3034 * may have woken it up by now. If there 3035 * are no waiters, then set our pcf_wait 3036 * count to zero to avoid coming in here 3037 * next time. Also, since only one page 3038 * was put on the free list, just wake 3039 * up one waiter. 3040 */ 3041 if (freemem_wait) { 3042 cv_signal(&freemem_cv); 3043 p->pcf_wait--; 3044 } else { 3045 p->pcf_wait = 0; 3046 } 3047 mutex_exit(&new_freemem_lock); 3048 } 3049 } 3050 mutex_exit(&p->pcf_lock); 3051 3052 /* freemem is approximate, so this test OK */ 3053 if (!p->pcf_block) 3054 freemem += 1; 3055 } 3056 3057 /* 3058 * Put page on the "free" list during intial startup. 3059 * This happens during initial single threaded execution. 
3060 */ 3061 void 3062 page_free_at_startup(page_t *pp) 3063 { 3064 struct pcf *p; 3065 uint_t pcf_index; 3066 3067 page_list_add(pp, PG_FREE_LIST | PG_LIST_HEAD | PG_LIST_ISINIT); 3068 VM_STAT_ADD(pagecnt.pc_free_free); 3069 3070 /* 3071 * Now do the `freemem' accounting. 3072 */ 3073 pcf_index = PCF_INDEX(); 3074 p = &pcf[pcf_index]; 3075 p->pcf_touch = 1; 3076 3077 ASSERT(p->pcf_block == 0); 3078 ASSERT(p->pcf_wait == 0); 3079 p->pcf_count += 1; 3080 3081 /* freemem is approximate, so this is OK */ 3082 freemem += 1; 3083 } 3084 3085 void 3086 page_free_pages(page_t *pp) 3087 { 3088 page_t *tpp, *rootpp = NULL; 3089 pgcnt_t pgcnt = page_get_pagecnt(pp->p_szc); 3090 pgcnt_t i; 3091 uint_t szc = pp->p_szc; 3092 int toxic = 0; 3093 3094 VM_STAT_ADD(pagecnt.pc_free_pages); 3095 TRACE_1(TR_FAC_VM, TR_PAGE_FREE_FREE, 3096 "page_free_free:pp %p", pp); 3097 3098 ASSERT(pp->p_szc != 0 && pp->p_szc < page_num_pagesizes()); 3099 if ((page_pptonum(pp) & (pgcnt - 1)) != 0) { 3100 panic("page_free_pages: not root page %p", (void *)pp); 3101 /*NOTREACHED*/ 3102 } 3103 3104 for (i = 0, tpp = pp; i < pgcnt; i++, tpp = page_next(tpp)) { 3105 ASSERT((PAGE_EXCL(tpp) && 3106 !page_iolock_assert(tpp)) || panicstr); 3107 if (PP_ISFREE(tpp)) { 3108 panic("page_free_pages: page %p is free", (void *)tpp); 3109 /*NOTREACHED*/ 3110 } 3111 if (hat_page_is_mapped(tpp) || tpp->p_lckcnt != 0 || 3112 tpp->p_cowcnt != 0) { 3113 panic("page_free_pages %p", (void *)tpp); 3114 /*NOTREACHED*/ 3115 } 3116 3117 ASSERT(!hat_page_getshare(tpp)); 3118 ASSERT(tpp->p_vnode == NULL); 3119 ASSERT(tpp->p_szc == szc); 3120 3121 if (page_deteriorating(tpp)) 3122 toxic = 1; 3123 3124 PP_SETFREE(tpp); 3125 page_clr_all_props(tpp); 3126 PP_SETAGED(tpp); 3127 tpp->p_offset = (u_offset_t)-1; 3128 ASSERT(tpp->p_next == tpp); 3129 ASSERT(tpp->p_prev == tpp); 3130 page_list_concat(&rootpp, &tpp); 3131 } 3132 ASSERT(rootpp == pp); 3133 3134 if (toxic) { 3135 page_free_toxic_pages(rootpp); 3136 return; 3137 } 3138 page_list_add_pages(rootpp, 0); 3139 page_create_putback(pgcnt); 3140 } 3141 3142 int free_pages = 1; 3143 3144 /* 3145 * This routine attempts to return pages to the cachelist via page_release(). 3146 * It does not *have* to be successful in all cases, since the pageout scanner 3147 * will catch any pages it misses. It does need to be fast and not introduce 3148 * too much overhead. 3149 * 3150 * If a page isn't found on the unlocked sweep of the page_hash bucket, we 3151 * don't lock and retry. This is ok, since the page scanner will eventually 3152 * find any page we miss in free_vp_pages(). 3153 */ 3154 void 3155 free_vp_pages(vnode_t *vp, u_offset_t off, size_t len) 3156 { 3157 page_t *pp; 3158 u_offset_t eoff; 3159 extern int swap_in_range(vnode_t *, u_offset_t, size_t); 3160 3161 eoff = off + len; 3162 3163 if (free_pages == 0) 3164 return; 3165 if (swap_in_range(vp, off, len)) 3166 return; 3167 3168 for (; off < eoff; off += PAGESIZE) { 3169 3170 /* 3171 * find the page using a fast, but inexact search. It'll be OK 3172 * if a few pages slip through the cracks here. 3173 */ 3174 pp = page_exists(vp, off); 3175 3176 /* 3177 * If we didn't find the page (it may not exist), the page 3178 * is free, looks still in use (shared), or we can't lock it, 3179 * just give up. 
3180 */ 3181 if (pp == NULL || 3182 PP_ISFREE(pp) || 3183 page_share_cnt(pp) > 0 || 3184 !page_trylock(pp, SE_EXCL)) 3185 continue; 3186 3187 /* 3188 * Once we have locked pp, verify that it's still the 3189 * correct page and not already free 3190 */ 3191 ASSERT(PAGE_LOCKED_SE(pp, SE_EXCL)); 3192 if (pp->p_vnode != vp || pp->p_offset != off || PP_ISFREE(pp)) { 3193 page_unlock(pp); 3194 continue; 3195 } 3196 3197 /* 3198 * try to release the page... 3199 */ 3200 (void) page_release(pp, 1); 3201 } 3202 } 3203 3204 /* 3205 * Reclaim the given page from the free list. 3206 * Returns 1 on success or 0 on failure. 3207 * 3208 * The page is unlocked if it can't be reclaimed (when freemem == 0). 3209 * If `lock' is non-null, it will be dropped and re-acquired if 3210 * the routine must wait while freemem is 0. 3211 * 3212 * As it turns out, boot_getpages() does this. It picks a page, 3213 * based on where OBP mapped in some address, gets its pfn, searches 3214 * the memsegs, locks the page, then pulls it off the free list! 3215 */ 3216 int 3217 page_reclaim(page_t *pp, kmutex_t *lock) 3218 { 3219 struct pcf *p; 3220 uint_t pcf_index; 3221 struct cpu *cpup; 3222 int enough; 3223 uint_t i; 3224 3225 ASSERT(lock != NULL ? MUTEX_HELD(lock) : 1); 3226 ASSERT(PAGE_EXCL(pp) && PP_ISFREE(pp)); 3227 ASSERT(pp->p_szc == 0); 3228 3229 /* 3230 * If `freemem' is 0, we cannot reclaim this page from the 3231 * freelist, so release every lock we might hold: the page, 3232 * and the `lock' before blocking. 3233 * 3234 * The only way `freemem' can become 0 while there are pages 3235 * marked free (have their p->p_free bit set) is when the 3236 * system is low on memory and doing a page_create(). In 3237 * order to guarantee that once page_create() starts acquiring 3238 * pages it will be able to get all that it needs since `freemem' 3239 * was decreased by the requested amount. So, we need to release 3240 * this page, and let page_create() have it. 3241 * 3242 * Since `freemem' being zero is not supposed to happen, just 3243 * use the usual hash stuff as a starting point. If that bucket 3244 * is empty, then assume the worst, and start at the beginning 3245 * of the pcf array. If we always start at the beginning 3246 * when acquiring more than one pcf lock, there won't be any 3247 * deadlock problems. 3248 */ 3249 3250 /* TODO: Do we need to test kcage_freemem if PG_NORELOC(pp)? */ 3251 3252 if (freemem <= throttlefree && !page_create_throttle(1l, 0)) { 3253 pcf_acquire_all(); 3254 goto page_reclaim_nomem; 3255 } 3256 3257 enough = 0; 3258 pcf_index = PCF_INDEX(); 3259 p = &pcf[pcf_index]; 3260 p->pcf_touch = 1; 3261 mutex_enter(&p->pcf_lock); 3262 if (p->pcf_count >= 1) { 3263 enough = 1; 3264 p->pcf_count--; 3265 } 3266 mutex_exit(&p->pcf_lock); 3267 3268 if (!enough) { 3269 VM_STAT_ADD(page_reclaim_zero); 3270 /* 3271 * Check again. Its possible that some other thread 3272 * could have been right behind us, and added one 3273 * to a list somewhere. Acquire each of the pcf locks 3274 * until we find a page. 3275 */ 3276 p = pcf; 3277 for (i = 0; i < PCF_FANOUT; i++) { 3278 p->pcf_touch = 1; 3279 mutex_enter(&p->pcf_lock); 3280 if (p->pcf_count >= 1) { 3281 p->pcf_count -= 1; 3282 enough = 1; 3283 break; 3284 } 3285 p++; 3286 } 3287 3288 if (!enough) { 3289 page_reclaim_nomem: 3290 /* 3291 * We really can't have page `pp'. 3292 * Time for the no-memory dance with 3293 * page_free(). This is just like 3294 * page_create_wait(). 
Plus the added 3295 * attraction of releasing whatever mutex 3296 * we held when we were called with in `lock'. 3297 * Page_unlock() will wakeup any thread 3298 * waiting around for this page. 3299 */ 3300 if (lock) { 3301 VM_STAT_ADD(page_reclaim_zero_locked); 3302 mutex_exit(lock); 3303 } 3304 page_unlock(pp); 3305 3306 /* 3307 * get this before we drop all the pcf locks. 3308 */ 3309 mutex_enter(&new_freemem_lock); 3310 3311 p = pcf; 3312 for (i = 0; i < PCF_FANOUT; i++) { 3313 p->pcf_wait++; 3314 mutex_exit(&p->pcf_lock); 3315 p++; 3316 } 3317 3318 freemem_wait++; 3319 cv_wait(&freemem_cv, &new_freemem_lock); 3320 freemem_wait--; 3321 3322 mutex_exit(&new_freemem_lock); 3323 3324 if (lock) { 3325 mutex_enter(lock); 3326 } 3327 return (0); 3328 } 3329 3330 /* 3331 * There was a page to be found. 3332 * The pcf accounting has been done, 3333 * though none of the pcf_wait flags have been set, 3334 * drop the locks and continue on. 3335 */ 3336 while (p >= pcf) { 3337 mutex_exit(&p->pcf_lock); 3338 p--; 3339 } 3340 } 3341 3342 /* 3343 * freemem is not protected by any lock. Thus, we cannot 3344 * have any assertion containing freemem here. 3345 */ 3346 freemem -= 1; 3347 3348 VM_STAT_ADD(pagecnt.pc_reclaim); 3349 if (PP_ISAGED(pp)) { 3350 page_list_sub(pp, PG_FREE_LIST); 3351 TRACE_1(TR_FAC_VM, TR_PAGE_UNFREE_FREE, 3352 "page_reclaim_free:pp %p", pp); 3353 } else { 3354 page_list_sub(pp, PG_CACHE_LIST); 3355 TRACE_1(TR_FAC_VM, TR_PAGE_UNFREE_CACHE, 3356 "page_reclaim_cache:pp %p", pp); 3357 } 3358 3359 /* 3360 * clear the p_free & p_age bits since this page is no longer 3361 * on the free list. Notice that there was a brief time where 3362 * a page is marked as free, but is not on the list. 3363 * 3364 * Set the reference bit to protect against immediate pageout. 3365 */ 3366 PP_CLRFREE(pp); 3367 PP_CLRAGED(pp); 3368 page_set_props(pp, P_REF); 3369 3370 CPU_STATS_ENTER_K(); 3371 cpup = CPU; /* get cpup now that CPU cannot change */ 3372 CPU_STATS_ADDQ(cpup, vm, pgrec, 1); 3373 CPU_STATS_ADDQ(cpup, vm, pgfrec, 1); 3374 CPU_STATS_EXIT_K(); 3375 3376 return (1); 3377 } 3378 3379 3380 3381 /* 3382 * Destroy identity of the page and put it back on 3383 * the page free list. Assumes that the caller has 3384 * acquired the "exclusive" lock on the page. 3385 */ 3386 void 3387 page_destroy(page_t *pp, int dontfree) 3388 { 3389 ASSERT((PAGE_EXCL(pp) && 3390 !page_iolock_assert(pp)) || panicstr); 3391 3392 if (pp->p_szc != 0) { 3393 if (pp->p_vnode == NULL || IS_SWAPFSVP(pp->p_vnode) || 3394 pp->p_vnode == &kvp) { 3395 panic("page_destroy: anon or kernel or no vnode " 3396 "large page %p", (void *)pp); 3397 } 3398 page_demote_vp_pages(pp); 3399 ASSERT(pp->p_szc == 0); 3400 } 3401 3402 TRACE_1(TR_FAC_VM, TR_PAGE_DESTROY, "page_destroy:pp %p", pp); 3403 3404 /* 3405 * Unload translations, if any, then hash out the 3406 * page to erase its identity. 3407 */ 3408 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 3409 page_hashout(pp, NULL); 3410 3411 if (!dontfree) { 3412 /* 3413 * Acquire the "freemem_lock" for availrmem. 3414 * The page_struct_lock need not be acquired for lckcnt 3415 * and cowcnt since the page has an "exclusive" lock. 3416 */ 3417 if ((pp->p_lckcnt != 0) || (pp->p_cowcnt != 0)) { 3418 mutex_enter(&freemem_lock); 3419 if (pp->p_lckcnt != 0) { 3420 availrmem++; 3421 pp->p_lckcnt = 0; 3422 } 3423 if (pp->p_cowcnt != 0) { 3424 availrmem += pp->p_cowcnt; 3425 pp->p_cowcnt = 0; 3426 } 3427 mutex_exit(&freemem_lock); 3428 } 3429 /* 3430 * Put the page on the "free" list. 
3431 */ 3432 page_free(pp, 0); 3433 } 3434 } 3435 3436 void 3437 page_destroy_pages(page_t *pp) 3438 { 3439 3440 page_t *tpp, *rootpp = NULL; 3441 pgcnt_t pgcnt = page_get_pagecnt(pp->p_szc); 3442 pgcnt_t i, pglcks = 0; 3443 uint_t szc = pp->p_szc; 3444 int toxic = 0; 3445 3446 ASSERT(pp->p_szc != 0 && pp->p_szc < page_num_pagesizes()); 3447 3448 VM_STAT_ADD(pagecnt.pc_destroy_pages); 3449 3450 TRACE_1(TR_FAC_VM, TR_PAGE_DESTROY, "page_destroy_pages:pp %p", pp); 3451 3452 if ((page_pptonum(pp) & (pgcnt - 1)) != 0) { 3453 panic("page_destroy_pages: not root page %p", (void *)pp); 3454 /*NOTREACHED*/ 3455 } 3456 3457 for (i = 0, tpp = pp; i < pgcnt; i++, tpp = page_next(tpp)) { 3458 ASSERT((PAGE_EXCL(tpp) && 3459 !page_iolock_assert(tpp)) || panicstr); 3460 (void) hat_pageunload(tpp, HAT_FORCE_PGUNLOAD); 3461 page_hashout(tpp, NULL); 3462 ASSERT(tpp->p_offset == (u_offset_t)-1); 3463 if (tpp->p_lckcnt != 0) { 3464 pglcks++; 3465 tpp->p_lckcnt = 0; 3466 } else if (tpp->p_cowcnt != 0) { 3467 pglcks += tpp->p_cowcnt; 3468 tpp->p_cowcnt = 0; 3469 } 3470 ASSERT(!hat_page_getshare(tpp)); 3471 ASSERT(tpp->p_vnode == NULL); 3472 ASSERT(tpp->p_szc == szc); 3473 3474 if (page_deteriorating(tpp)) 3475 toxic = 1; 3476 3477 PP_SETFREE(tpp); 3478 page_clr_all_props(tpp); 3479 PP_SETAGED(tpp); 3480 ASSERT(tpp->p_next == tpp); 3481 ASSERT(tpp->p_prev == tpp); 3482 page_list_concat(&rootpp, &tpp); 3483 } 3484 3485 ASSERT(rootpp == pp); 3486 if (pglcks != 0) { 3487 mutex_enter(&freemem_lock); 3488 availrmem += pglcks; 3489 mutex_exit(&freemem_lock); 3490 } 3491 3492 if (toxic) { 3493 page_free_toxic_pages(rootpp); 3494 return; 3495 } 3496 page_list_add_pages(rootpp, 0); 3497 page_create_putback(pgcnt); 3498 } 3499 3500 /* 3501 * Similar to page_destroy(), but destroys pages which are 3502 * locked and known to be on the page free list. Since 3503 * the page is known to be free and locked, no one can access 3504 * it. 3505 * 3506 * Also, the number of free pages does not change. 3507 */ 3508 void 3509 page_destroy_free(page_t *pp) 3510 { 3511 ASSERT(PAGE_EXCL(pp)); 3512 ASSERT(PP_ISFREE(pp)); 3513 ASSERT(pp->p_vnode); 3514 ASSERT(hat_page_getattr(pp, P_MOD | P_REF | P_RO) == 0); 3515 ASSERT(!hat_page_is_mapped(pp)); 3516 ASSERT(PP_ISAGED(pp) == 0); 3517 ASSERT(pp->p_szc == 0); 3518 3519 VM_STAT_ADD(pagecnt.pc_destroy_free); 3520 page_list_sub(pp, PG_CACHE_LIST); 3521 3522 page_hashout(pp, NULL); 3523 ASSERT(pp->p_vnode == NULL); 3524 ASSERT(pp->p_offset == (u_offset_t)-1); 3525 ASSERT(pp->p_hash == NULL); 3526 3527 PP_SETAGED(pp); 3528 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL); 3529 page_unlock(pp); 3530 3531 mutex_enter(&new_freemem_lock); 3532 if (freemem_wait) { 3533 cv_signal(&freemem_cv); 3534 } 3535 mutex_exit(&new_freemem_lock); 3536 } 3537 3538 /* 3539 * Rename the page "opp" to have an identity specified 3540 * by [vp, off]. If a page already exists with this name 3541 * it is locked and destroyed. Note that the page's 3542 * translations are not unloaded during the rename. 3543 * 3544 * This routine is used by the anon layer to "steal" the 3545 * original page and is not unlike destroying a page and 3546 * creating a new page using the same page frame. 3547 * 3548 * XXX -- Could deadlock if caller 1 tries to rename A to B while 3549 * caller 2 tries to rename B to A. 
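 *
 * A concrete instance of the deadlock warned about above (the thread
 * labels are illustrative):
 *
 *	thread 1				thread 2
 *	--------				--------
 *	holds A SE_EXCL				holds B SE_EXCL
 *	page_rename(A, vpB, offB)		page_rename(B, vpA, offA)
 *	  finds B at [vpB, offB],		  finds A at [vpA, offA],
 *	  page_lock(B, SE_EXCL) blocks		  page_lock(A, SE_EXCL) blocks
 *
 * Each page_lock() waits for a lock the other thread will never drop,
 * so callers must not issue opposing renames concurrently.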
3550 */ 3551 void 3552 page_rename(page_t *opp, vnode_t *vp, u_offset_t off) 3553 { 3554 page_t *pp; 3555 int olckcnt = 0; 3556 int ocowcnt = 0; 3557 kmutex_t *phm; 3558 ulong_t index; 3559 3560 ASSERT(PAGE_EXCL(opp) && !page_iolock_assert(opp)); 3561 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); 3562 ASSERT(PP_ISFREE(opp) == 0); 3563 3564 VM_STAT_ADD(page_rename_count); 3565 3566 TRACE_3(TR_FAC_VM, TR_PAGE_RENAME, 3567 "page rename:pp %p vp %p off %llx", opp, vp, off); 3568 3569 page_hashout(opp, NULL); 3570 PP_CLRAGED(opp); 3571 3572 /* 3573 * Acquire the appropriate page hash lock, since 3574 * we're going to rename the page. 3575 */ 3576 index = PAGE_HASH_FUNC(vp, off); 3577 phm = PAGE_HASH_MUTEX(index); 3578 mutex_enter(phm); 3579 top: 3580 /* 3581 * Look for an existing page with this name and destroy it if found. 3582 * By holding the page hash lock all the way to the page_hashin() 3583 * call, we are assured that no page can be created with this 3584 * identity. In the case when the phm lock is dropped to undo any 3585 * hat layer mappings, the existing page is held with an "exclusive" 3586 * lock, again preventing another page from being created with 3587 * this identity. 3588 */ 3589 PAGE_HASH_SEARCH(index, pp, vp, off); 3590 if (pp != NULL) { 3591 VM_STAT_ADD(page_rename_exists); 3592 3593 /* 3594 * As it turns out, this is one of only two places where 3595 * page_lock() needs to hold the passed in lock in the 3596 * successful case. In all of the others, the lock could 3597 * be dropped as soon as the attempt is made to lock 3598 * the page. It is tempting to add yet another arguement, 3599 * PL_KEEP or PL_DROP, to let page_lock know what to do. 3600 */ 3601 if (!page_lock(pp, SE_EXCL, phm, P_RECLAIM)) { 3602 /* 3603 * Went to sleep because the page could not 3604 * be locked. We were woken up when the page 3605 * was unlocked, or when the page was destroyed. 3606 * In either case, `phm' was dropped while we 3607 * slept. Hence we should not just roar through 3608 * this loop. 3609 */ 3610 goto top; 3611 } 3612 3613 if (hat_page_is_mapped(pp)) { 3614 /* 3615 * Unload translations. Since we hold the 3616 * exclusive lock on this page, the page 3617 * can not be changed while we drop phm. 3618 * This is also not a lock protocol violation, 3619 * but rather the proper way to do things. 3620 */ 3621 mutex_exit(phm); 3622 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 3623 mutex_enter(phm); 3624 } 3625 page_hashout(pp, phm); 3626 } 3627 /* 3628 * Hash in the page with the new identity. 3629 */ 3630 if (!page_hashin(opp, vp, off, phm)) { 3631 /* 3632 * We were holding phm while we searched for [vp, off] 3633 * and only dropped phm if we found and locked a page. 3634 * If we can't create this page now, then some thing 3635 * is really broken. 3636 */ 3637 panic("page_rename: Can't hash in page: %p", (void *)pp); 3638 /*NOTREACHED*/ 3639 } 3640 3641 ASSERT(MUTEX_HELD(phm)); 3642 mutex_exit(phm); 3643 3644 /* 3645 * Now that we have dropped phm, lets get around to finishing up 3646 * with pp. 3647 */ 3648 if (pp != NULL) { 3649 ASSERT(!hat_page_is_mapped(pp)); 3650 /* for now large pages should not end up here */ 3651 ASSERT(pp->p_szc == 0); 3652 /* 3653 * Save the locks for transfer to the new page and then 3654 * clear them so page_free doesn't think they're important. 3655 * The page_struct_lock need not be acquired for lckcnt and 3656 * cowcnt since the page has an "exclusive" lock. 
3657 */ 3658 olckcnt = pp->p_lckcnt; 3659 ocowcnt = pp->p_cowcnt; 3660 pp->p_lckcnt = pp->p_cowcnt = 0; 3661 3662 /* 3663 * Put the page on the "free" list after we drop 3664 * the lock. The less work under the lock the better. 3665 */ 3666 /*LINTED: constant in conditional context*/ 3667 VN_DISPOSE(pp, B_FREE, 0, kcred); 3668 } 3669 3670 /* 3671 * Transfer the lock count from the old page (if any). 3672 * The page_struct_lock need not be acquired for lckcnt and 3673 * cowcnt since the page has an "exclusive" lock. 3674 */ 3675 opp->p_lckcnt += olckcnt; 3676 opp->p_cowcnt += ocowcnt; 3677 } 3678 3679 /* 3680 * low level routine to add page `pp' to the hash and vp chains for [vp, offset] 3681 * 3682 * Pages are normally inserted at the start of a vnode's v_pages list. 3683 * If the vnode is VMODSORT and the page is modified, it goes at the end. 3684 * This can happen when a modified page is relocated for DR. 3685 * 3686 * Returns 1 on success and 0 on failure. 3687 */ 3688 static int 3689 page_do_hashin(page_t *pp, vnode_t *vp, u_offset_t offset) 3690 { 3691 page_t **listp; 3692 page_t *tp; 3693 ulong_t index; 3694 3695 ASSERT(PAGE_EXCL(pp)); 3696 ASSERT(vp != NULL); 3697 ASSERT(MUTEX_HELD(page_vnode_mutex(vp))); 3698 3699 /* 3700 * Be sure to set these up before the page is inserted on the hash 3701 * list. As soon as the page is placed on the list some other 3702 * thread might get confused and wonder how this page could 3703 * possibly hash to this list. 3704 */ 3705 pp->p_vnode = vp; 3706 pp->p_offset = offset; 3707 3708 /* 3709 * record if this page is on a swap vnode 3710 */ 3711 if ((vp->v_flag & VISSWAP) != 0) 3712 PP_SETSWAP(pp); 3713 3714 index = PAGE_HASH_FUNC(vp, offset); 3715 ASSERT(MUTEX_HELD(PAGE_HASH_MUTEX(index))); 3716 listp = &page_hash[index]; 3717 3718 /* 3719 * If this page is already hashed in, fail this attempt to add it. 3720 */ 3721 for (tp = *listp; tp != NULL; tp = tp->p_hash) { 3722 if (tp->p_vnode == vp && tp->p_offset == offset) { 3723 pp->p_vnode = NULL; 3724 pp->p_offset = (u_offset_t)(-1); 3725 return (0); 3726 } 3727 } 3728 pp->p_hash = *listp; 3729 *listp = pp; 3730 3731 /* 3732 * Add the page to the vnode's list of pages 3733 */ 3734 if (vp->v_pages != NULL && IS_VMODSORT(vp) && hat_ismod(pp)) 3735 listp = &vp->v_pages->p_vpprev->p_vpnext; 3736 else 3737 listp = &vp->v_pages; 3738 3739 page_vpadd(listp, pp); 3740 3741 return (1); 3742 } 3743 3744 /* 3745 * Add page `pp' to both the hash and vp chains for [vp, offset]. 3746 * 3747 * Returns 1 on success and 0 on failure. 3748 * If hold is passed in, it is not dropped. 3749 */ 3750 int 3751 page_hashin(page_t *pp, vnode_t *vp, u_offset_t offset, kmutex_t *hold) 3752 { 3753 kmutex_t *phm = NULL; 3754 kmutex_t *vphm; 3755 int rc; 3756 3757 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); 3758 3759 TRACE_3(TR_FAC_VM, TR_PAGE_HASHIN, 3760 "page_hashin:pp %p vp %p offset %llx", 3761 pp, vp, offset); 3762 3763 VM_STAT_ADD(hashin_count); 3764 3765 if (hold != NULL) 3766 phm = hold; 3767 else { 3768 VM_STAT_ADD(hashin_not_held); 3769 phm = PAGE_HASH_MUTEX(PAGE_HASH_FUNC(vp, offset)); 3770 mutex_enter(phm); 3771 } 3772 3773 vphm = page_vnode_mutex(vp); 3774 mutex_enter(vphm); 3775 rc = page_do_hashin(pp, vp, offset); 3776 mutex_exit(vphm); 3777 if (hold == NULL) 3778 mutex_exit(phm); 3779 if (rc == 0) 3780 VM_STAT_ADD(hashin_already); 3781 return (rc); 3782 } 3783 3784 /* 3785 * Remove page ``pp'' from the hash and vp chains and remove vp association. 
3786 * All mutexes must be held 3787 */ 3788 static void 3789 page_do_hashout(page_t *pp) 3790 { 3791 page_t **hpp; 3792 page_t *hp; 3793 vnode_t *vp = pp->p_vnode; 3794 3795 ASSERT(vp != NULL); 3796 ASSERT(MUTEX_HELD(page_vnode_mutex(vp))); 3797 3798 /* 3799 * First, take pp off of its hash chain. 3800 */ 3801 hpp = &page_hash[PAGE_HASH_FUNC(vp, pp->p_offset)]; 3802 3803 for (;;) { 3804 hp = *hpp; 3805 if (hp == pp) 3806 break; 3807 if (hp == NULL) { 3808 panic("page_do_hashout"); 3809 /*NOTREACHED*/ 3810 } 3811 hpp = &hp->p_hash; 3812 } 3813 *hpp = pp->p_hash; 3814 3815 /* 3816 * Now remove it from its associated vnode. 3817 */ 3818 if (vp->v_pages) 3819 page_vpsub(&vp->v_pages, pp); 3820 3821 pp->p_hash = NULL; 3822 page_clr_all_props(pp); 3823 PP_CLRSWAP(pp); 3824 pp->p_vnode = NULL; 3825 pp->p_offset = (u_offset_t)-1; 3826 } 3827 3828 /* 3829 * Remove page ``pp'' from the hash and vp chains and remove vp association. 3830 * 3831 * When `phm' is non-NULL it contains the address of the mutex protecting the 3832 * hash list pp is on. It is not dropped. 3833 */ 3834 void 3835 page_hashout(page_t *pp, kmutex_t *phm) 3836 { 3837 vnode_t *vp; 3838 ulong_t index; 3839 kmutex_t *nphm; 3840 kmutex_t *vphm; 3841 kmutex_t *sep; 3842 3843 ASSERT(phm != NULL ? MUTEX_HELD(phm) : 1); 3844 ASSERT(pp->p_vnode != NULL); 3845 ASSERT((PAGE_EXCL(pp) && !page_iolock_assert(pp)) || panicstr); 3846 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(pp->p_vnode))); 3847 3848 vp = pp->p_vnode; 3849 3850 TRACE_2(TR_FAC_VM, TR_PAGE_HASHOUT, 3851 "page_hashout:pp %p vp %p", pp, vp); 3852 3853 /* Kernel probe */ 3854 TNF_PROBE_2(page_unmap, "vm pagefault", /* CSTYLED */, 3855 tnf_opaque, vnode, vp, 3856 tnf_offset, offset, pp->p_offset); 3857 3858 /* 3859 * 3860 */ 3861 VM_STAT_ADD(hashout_count); 3862 index = PAGE_HASH_FUNC(vp, pp->p_offset); 3863 if (phm == NULL) { 3864 VM_STAT_ADD(hashout_not_held); 3865 nphm = PAGE_HASH_MUTEX(index); 3866 mutex_enter(nphm); 3867 } 3868 ASSERT(phm ? phm == PAGE_HASH_MUTEX(index) : 1); 3869 3870 3871 /* 3872 * grab page vnode mutex and remove it... 3873 */ 3874 vphm = page_vnode_mutex(vp); 3875 mutex_enter(vphm); 3876 3877 page_do_hashout(pp); 3878 3879 mutex_exit(vphm); 3880 if (phm == NULL) 3881 mutex_exit(nphm); 3882 3883 /* 3884 * If the page was retired, update the pages_retired 3885 * total and clear the page flag 3886 */ 3887 if (page_isretired(pp)) { 3888 retired_page_removed(pp); 3889 } 3890 3891 /* 3892 * Wake up processes waiting for this page. The page's 3893 * identity has been changed, and is probably not the 3894 * desired page any longer. 3895 */ 3896 sep = page_se_mutex(pp); 3897 mutex_enter(sep); 3898 if (CV_HAS_WAITERS(&pp->p_cv)) 3899 cv_broadcast(&pp->p_cv); 3900 mutex_exit(sep); 3901 } 3902 3903 /* 3904 * Add the page to the front of a linked list of pages 3905 * using the p_next & p_prev pointers for the list. 3906 * The caller is responsible for protecting the list pointers. 
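 *
 * For reference, the list is circular and `*ppp' always points at the
 * head. A worked example of adding `pp' to a two-page list (arrows
 * are p_next; the p_prev links mirror them in the other direction):
 *
 *	before:	*ppp -> A -> B -> back to A
 *	after:	*ppp -> pp -> A -> B -> back to pp
 *
 * An empty list is *ppp == NULL, and a single page points to itself
 * in both directions.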
3907 */ 3908 void 3909 page_add(page_t **ppp, page_t *pp) 3910 { 3911 ASSERT(PAGE_EXCL(pp) || (PAGE_SHARED(pp) && page_iolock_assert(pp))); 3912 3913 page_add_common(ppp, pp); 3914 } 3915 3916 3917 3918 /* 3919 * Common code for page_add() and mach_page_add() 3920 */ 3921 void 3922 page_add_common(page_t **ppp, page_t *pp) 3923 { 3924 if (*ppp == NULL) { 3925 pp->p_next = pp->p_prev = pp; 3926 } else { 3927 pp->p_next = *ppp; 3928 pp->p_prev = (*ppp)->p_prev; 3929 (*ppp)->p_prev = pp; 3930 pp->p_prev->p_next = pp; 3931 } 3932 *ppp = pp; 3933 } 3934 3935 3936 /* 3937 * Remove this page from a linked list of pages 3938 * using the p_next & p_prev pointers for the list. 3939 * 3940 * The caller is responsible for protecting the list pointers. 3941 */ 3942 void 3943 page_sub(page_t **ppp, page_t *pp) 3944 { 3945 ASSERT((PP_ISFREE(pp)) ? 1 : 3946 (PAGE_EXCL(pp)) || (PAGE_SHARED(pp) && page_iolock_assert(pp))); 3947 3948 if (*ppp == NULL || pp == NULL) { 3949 panic("page_sub: bad arg(s): pp %p, *ppp %p", 3950 (void *)pp, (void *)(*ppp)); 3951 /*NOTREACHED*/ 3952 } 3953 3954 page_sub_common(ppp, pp); 3955 } 3956 3957 3958 /* 3959 * Common code for page_sub() and mach_page_sub() 3960 */ 3961 void 3962 page_sub_common(page_t **ppp, page_t *pp) 3963 { 3964 if (*ppp == pp) 3965 *ppp = pp->p_next; /* go to next page */ 3966 3967 if (*ppp == pp) 3968 *ppp = NULL; /* page list is gone */ 3969 else { 3970 pp->p_prev->p_next = pp->p_next; 3971 pp->p_next->p_prev = pp->p_prev; 3972 } 3973 pp->p_prev = pp->p_next = pp; /* make pp a list of one */ 3974 } 3975 3976 3977 /* 3978 * Break page list cppp into two lists with npages in the first list. 3979 * The tail is returned in nppp. 3980 */ 3981 void 3982 page_list_break(page_t **oppp, page_t **nppp, pgcnt_t npages) 3983 { 3984 page_t *s1pp = *oppp; 3985 page_t *s2pp; 3986 page_t *e1pp, *e2pp; 3987 long n = 0; 3988 3989 if (s1pp == NULL) { 3990 *nppp = NULL; 3991 return; 3992 } 3993 if (npages == 0) { 3994 *nppp = s1pp; 3995 *oppp = NULL; 3996 return; 3997 } 3998 for (n = 0, s2pp = *oppp; n < npages; n++) { 3999 s2pp = s2pp->p_next; 4000 } 4001 /* Fix head and tail of new lists */ 4002 e1pp = s2pp->p_prev; 4003 e2pp = s1pp->p_prev; 4004 s1pp->p_prev = e1pp; 4005 e1pp->p_next = s1pp; 4006 s2pp->p_prev = e2pp; 4007 e2pp->p_next = s2pp; 4008 4009 /* second list empty */ 4010 if (s2pp == s1pp) { 4011 *oppp = s1pp; 4012 *nppp = NULL; 4013 } else { 4014 *oppp = s1pp; 4015 *nppp = s2pp; 4016 } 4017 } 4018 4019 /* 4020 * Concatenate page list nppp onto the end of list ppp. 4021 */ 4022 void 4023 page_list_concat(page_t **ppp, page_t **nppp) 4024 { 4025 page_t *s1pp, *s2pp, *e1pp, *e2pp; 4026 4027 if (*nppp == NULL) { 4028 return; 4029 } 4030 if (*ppp == NULL) { 4031 *ppp = *nppp; 4032 return; 4033 } 4034 s1pp = *ppp; 4035 e1pp = s1pp->p_prev; 4036 s2pp = *nppp; 4037 e2pp = s2pp->p_prev; 4038 s1pp->p_prev = e2pp; 4039 e2pp->p_next = s1pp; 4040 e1pp->p_next = s2pp; 4041 s2pp->p_prev = e1pp; 4042 } 4043 4044 /* 4045 * return the next page in the page list 4046 */ 4047 page_t * 4048 page_list_next(page_t *pp) 4049 { 4050 return (pp->p_next); 4051 } 4052 4053 4054 /* 4055 * Add the page to the front of the linked list of pages 4056 * using p_vpnext/p_vpprev pointers for the list. 4057 * 4058 * The caller is responsible for protecting the lists. 
4059 */ 4060 void 4061 page_vpadd(page_t **ppp, page_t *pp) 4062 { 4063 if (*ppp == NULL) { 4064 pp->p_vpnext = pp->p_vpprev = pp; 4065 } else { 4066 pp->p_vpnext = *ppp; 4067 pp->p_vpprev = (*ppp)->p_vpprev; 4068 (*ppp)->p_vpprev = pp; 4069 pp->p_vpprev->p_vpnext = pp; 4070 } 4071 *ppp = pp; 4072 } 4073 4074 /* 4075 * Remove this page from the linked list of pages 4076 * using p_vpnext/p_vpprev pointers for the list. 4077 * 4078 * The caller is responsible for protecting the lists. 4079 */ 4080 void 4081 page_vpsub(page_t **ppp, page_t *pp) 4082 { 4083 if (*ppp == NULL || pp == NULL) { 4084 panic("page_vpsub: bad arg(s): pp %p, *ppp %p", 4085 (void *)pp, (void *)(*ppp)); 4086 /*NOTREACHED*/ 4087 } 4088 4089 if (*ppp == pp) 4090 *ppp = pp->p_vpnext; /* go to next page */ 4091 4092 if (*ppp == pp) 4093 *ppp = NULL; /* page list is gone */ 4094 else { 4095 pp->p_vpprev->p_vpnext = pp->p_vpnext; 4096 pp->p_vpnext->p_vpprev = pp->p_vpprev; 4097 } 4098 pp->p_vpprev = pp->p_vpnext = pp; /* make pp a list of one */ 4099 } 4100 4101 /* 4102 * Lock a physical page into memory "long term". Used to support "lock 4103 * in memory" functions. Accepts the page to be locked, and a cow variable 4104 * to indicate whether a the lock will travel to the new page during 4105 * a potential copy-on-write. 4106 */ 4107 int 4108 page_pp_lock( 4109 page_t *pp, /* page to be locked */ 4110 int cow, /* cow lock */ 4111 int kernel) /* must succeed -- ignore checking */ 4112 { 4113 int r = 0; /* result -- assume failure */ 4114 4115 ASSERT(PAGE_LOCKED(pp)); 4116 4117 page_struct_lock(pp); 4118 /* 4119 * Acquire the "freemem_lock" for availrmem. 4120 */ 4121 if (cow) { 4122 mutex_enter(&freemem_lock); 4123 if ((availrmem > pages_pp_maximum) && 4124 (pp->p_cowcnt < (ushort_t)PAGE_LOCK_MAXIMUM)) { 4125 availrmem--; 4126 pages_locked++; 4127 mutex_exit(&freemem_lock); 4128 r = 1; 4129 if (++pp->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) { 4130 cmn_err(CE_WARN, 4131 "COW lock limit reached on pfn 0x%lx", 4132 page_pptonum(pp)); 4133 } 4134 } else 4135 mutex_exit(&freemem_lock); 4136 } else { 4137 if (pp->p_lckcnt) { 4138 if (pp->p_lckcnt < (ushort_t)PAGE_LOCK_MAXIMUM) { 4139 r = 1; 4140 if (++pp->p_lckcnt == 4141 (ushort_t)PAGE_LOCK_MAXIMUM) { 4142 cmn_err(CE_WARN, "Page lock limit " 4143 "reached on pfn 0x%lx", 4144 page_pptonum(pp)); 4145 } 4146 } 4147 } else { 4148 if (kernel) { 4149 /* availrmem accounting done by caller */ 4150 ++pp->p_lckcnt; 4151 r = 1; 4152 } else { 4153 mutex_enter(&freemem_lock); 4154 if (availrmem > pages_pp_maximum) { 4155 availrmem--; 4156 pages_locked++; 4157 ++pp->p_lckcnt; 4158 r = 1; 4159 } 4160 mutex_exit(&freemem_lock); 4161 } 4162 } 4163 } 4164 page_struct_unlock(pp); 4165 return (r); 4166 } 4167 4168 /* 4169 * Decommit a lock on a physical page frame. Account for cow locks if 4170 * appropriate. 4171 */ 4172 void 4173 page_pp_unlock( 4174 page_t *pp, /* page to be unlocked */ 4175 int cow, /* expect cow lock */ 4176 int kernel) /* this was a kernel lock */ 4177 { 4178 ASSERT(PAGE_LOCKED(pp)); 4179 4180 page_struct_lock(pp); 4181 /* 4182 * Acquire the "freemem_lock" for availrmem. 4183 * If cowcnt or lcknt is already 0 do nothing; i.e., we 4184 * could be called to unlock even if nothing is locked. This could 4185 * happen if locked file pages were truncated (removing the lock) 4186 * and the file was grown again and new pages faulted in; the new 4187 * pages are unlocked but the segment still thinks they're locked. 
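 *
 * A minimal sketch of the intended lock/unlock pairing as seen from a
 * caller (the caller shown is hypothetical; segment drivers are the
 * real users, and the page must already be locked):
 *
 *	if (page_pp_lock(pp, 0, 0) == 0)
 *		return (EAGAIN);	availrmem reservation failed
 *	... the page is now held in memory ...
 *	page_pp_unlock(pp, 0, 0);
 *
 * The truncate-and-regrow case described above is the reason the code
 * below silently tolerates p_cowcnt/p_lckcnt already being zero.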
4188 */ 4189 if (cow) { 4190 if (pp->p_cowcnt) { 4191 mutex_enter(&freemem_lock); 4192 pp->p_cowcnt--; 4193 availrmem++; 4194 pages_locked--; 4195 mutex_exit(&freemem_lock); 4196 } 4197 } else { 4198 if (pp->p_lckcnt && --pp->p_lckcnt == 0) { 4199 if (!kernel) { 4200 mutex_enter(&freemem_lock); 4201 availrmem++; 4202 pages_locked--; 4203 mutex_exit(&freemem_lock); 4204 } 4205 } 4206 } 4207 page_struct_unlock(pp); 4208 } 4209 4210 /* 4211 * This routine reserves availrmem for npages; 4212 * flags: KM_NOSLEEP or KM_SLEEP 4213 * returns 1 on success or 0 on failure 4214 */ 4215 int 4216 page_resv(pgcnt_t npages, uint_t flags) 4217 { 4218 mutex_enter(&freemem_lock); 4219 while (availrmem < tune.t_minarmem + npages) { 4220 if (flags & KM_NOSLEEP) { 4221 mutex_exit(&freemem_lock); 4222 return (0); 4223 } 4224 mutex_exit(&freemem_lock); 4225 page_needfree(npages); 4226 kmem_reap(); 4227 delay(hz >> 2); 4228 page_needfree(-(spgcnt_t)npages); 4229 mutex_enter(&freemem_lock); 4230 } 4231 availrmem -= npages; 4232 mutex_exit(&freemem_lock); 4233 return (1); 4234 } 4235 4236 /* 4237 * This routine unreserves availrmem for npages; 4238 */ 4239 void 4240 page_unresv(pgcnt_t npages) 4241 { 4242 mutex_enter(&freemem_lock); 4243 availrmem += npages; 4244 mutex_exit(&freemem_lock); 4245 } 4246 4247 /* 4248 * See Statement at the beginning of segvn_lockop() regarding 4249 * the way we handle cowcnts and lckcnts. 4250 * 4251 * Transfer cowcnt on 'opp' to cowcnt on 'npp' if the vpage 4252 * that breaks COW has PROT_WRITE. 4253 * 4254 * Note that, we may also break COW in case we are softlocking 4255 * on read access during physio; 4256 * in this softlock case, the vpage may not have PROT_WRITE. 4257 * So, we need to transfer lckcnt on 'opp' to lckcnt on 'npp' 4258 * if the vpage doesn't have PROT_WRITE. 4259 * 4260 * This routine is never called if we are stealing a page 4261 * in anon_private. 4262 * 4263 * The caller subtracted from availrmem for read only mapping. 4264 * if lckcnt is 1 increment availrmem. 4265 */ 4266 void 4267 page_pp_useclaim( 4268 page_t *opp, /* original page frame losing lock */ 4269 page_t *npp, /* new page frame gaining lock */ 4270 uint_t write_perm) /* set if vpage has PROT_WRITE */ 4271 { 4272 int payback = 0; 4273 4274 ASSERT(PAGE_LOCKED(opp)); 4275 ASSERT(PAGE_LOCKED(npp)); 4276 4277 page_struct_lock(opp); 4278 4279 ASSERT(npp->p_cowcnt == 0); 4280 ASSERT(npp->p_lckcnt == 0); 4281 4282 /* Don't use claim if nothing is locked (see page_pp_unlock above) */ 4283 if ((write_perm && opp->p_cowcnt != 0) || 4284 (!write_perm && opp->p_lckcnt != 0)) { 4285 4286 if (write_perm) { 4287 npp->p_cowcnt++; 4288 ASSERT(opp->p_cowcnt != 0); 4289 opp->p_cowcnt--; 4290 } else { 4291 4292 ASSERT(opp->p_lckcnt != 0); 4293 4294 /* 4295 * We didn't need availrmem decremented if p_lckcnt on 4296 * original page is 1. Here, we are unlocking 4297 * read-only copy belonging to original page and 4298 * are locking a copy belonging to new page. 4299 */ 4300 if (opp->p_lckcnt == 1) 4301 payback = 1; 4302 4303 npp->p_lckcnt++; 4304 opp->p_lckcnt--; 4305 } 4306 } 4307 if (payback) { 4308 mutex_enter(&freemem_lock); 4309 availrmem++; 4310 pages_useclaim--; 4311 mutex_exit(&freemem_lock); 4312 } 4313 page_struct_unlock(opp); 4314 } 4315 4316 /* 4317 * Simple claim adjust functions -- used to support changes in 4318 * claims due to changes in access permissions. Used by segvn_setprot(). 
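 *
 * For illustration: page_addclaim() converts one lock claim on the page into
 * a cow claim.  With p_lckcnt == 1 it simply transfers the claim (p_lckcnt
 * drops to 0, p_cowcnt gains one) with no availrmem accounting; with
 * p_lckcnt > 1 it must also charge availrmem for the new cow claim
 * (availrmem--, pages_claimed++) and fails rather than push availrmem down
 * to pages_pp_maximum.  page_subclaim() performs the reverse transfer.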
4319 */ 4320 int 4321 page_addclaim(page_t *pp) 4322 { 4323 int r = 0; /* result */ 4324 4325 ASSERT(PAGE_LOCKED(pp)); 4326 4327 page_struct_lock(pp); 4328 ASSERT(pp->p_lckcnt != 0); 4329 4330 if (pp->p_lckcnt == 1) { 4331 if (pp->p_cowcnt < (ushort_t)PAGE_LOCK_MAXIMUM) { 4332 --pp->p_lckcnt; 4333 r = 1; 4334 if (++pp->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) { 4335 cmn_err(CE_WARN, 4336 "COW lock limit reached on pfn 0x%lx", 4337 page_pptonum(pp)); 4338 } 4339 } 4340 } else { 4341 mutex_enter(&freemem_lock); 4342 if ((availrmem > pages_pp_maximum) && 4343 (pp->p_cowcnt < (ushort_t)PAGE_LOCK_MAXIMUM)) { 4344 --availrmem; 4345 ++pages_claimed; 4346 mutex_exit(&freemem_lock); 4347 --pp->p_lckcnt; 4348 r = 1; 4349 if (++pp->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) { 4350 cmn_err(CE_WARN, 4351 "COW lock limit reached on pfn 0x%lx", 4352 page_pptonum(pp)); 4353 } 4354 } else 4355 mutex_exit(&freemem_lock); 4356 } 4357 page_struct_unlock(pp); 4358 return (r); 4359 } 4360 4361 int 4362 page_subclaim(page_t *pp) 4363 { 4364 int r = 0; 4365 4366 ASSERT(PAGE_LOCKED(pp)); 4367 4368 page_struct_lock(pp); 4369 ASSERT(pp->p_cowcnt != 0); 4370 4371 if (pp->p_lckcnt) { 4372 if (pp->p_lckcnt < (ushort_t)PAGE_LOCK_MAXIMUM) { 4373 r = 1; 4374 /* 4375 * for availrmem 4376 */ 4377 mutex_enter(&freemem_lock); 4378 availrmem++; 4379 pages_claimed--; 4380 mutex_exit(&freemem_lock); 4381 4382 pp->p_cowcnt--; 4383 4384 if (++pp->p_lckcnt == (ushort_t)PAGE_LOCK_MAXIMUM) { 4385 cmn_err(CE_WARN, 4386 "Page lock limit reached on pfn 0x%lx", 4387 page_pptonum(pp)); 4388 } 4389 } 4390 } else { 4391 r = 1; 4392 pp->p_cowcnt--; 4393 pp->p_lckcnt++; 4394 } 4395 page_struct_unlock(pp); 4396 return (r); 4397 } 4398 4399 int 4400 page_addclaim_pages(page_t **ppa) 4401 { 4402 4403 pgcnt_t lckpgs = 0, pg_idx; 4404 4405 VM_STAT_ADD(pagecnt.pc_addclaim_pages); 4406 4407 mutex_enter(&page_llock); 4408 for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) { 4409 4410 ASSERT(PAGE_LOCKED(ppa[pg_idx])); 4411 ASSERT(ppa[pg_idx]->p_lckcnt != 0); 4412 if (ppa[pg_idx]->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) { 4413 mutex_exit(&page_llock); 4414 return (0); 4415 } 4416 if (ppa[pg_idx]->p_lckcnt > 1) 4417 lckpgs++; 4418 } 4419 4420 if (lckpgs != 0) { 4421 mutex_enter(&freemem_lock); 4422 if (availrmem >= pages_pp_maximum + lckpgs) { 4423 availrmem -= lckpgs; 4424 pages_claimed += lckpgs; 4425 } else { 4426 mutex_exit(&freemem_lock); 4427 mutex_exit(&page_llock); 4428 return (0); 4429 } 4430 mutex_exit(&freemem_lock); 4431 } 4432 4433 for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) { 4434 ppa[pg_idx]->p_lckcnt--; 4435 ppa[pg_idx]->p_cowcnt++; 4436 } 4437 mutex_exit(&page_llock); 4438 return (1); 4439 } 4440 4441 int 4442 page_subclaim_pages(page_t **ppa) 4443 { 4444 pgcnt_t ulckpgs = 0, pg_idx; 4445 4446 VM_STAT_ADD(pagecnt.pc_subclaim_pages); 4447 4448 mutex_enter(&page_llock); 4449 for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) { 4450 4451 ASSERT(PAGE_LOCKED(ppa[pg_idx])); 4452 ASSERT(ppa[pg_idx]->p_cowcnt != 0); 4453 if (ppa[pg_idx]->p_lckcnt == (ushort_t)PAGE_LOCK_MAXIMUM) { 4454 mutex_exit(&page_llock); 4455 return (0); 4456 } 4457 if (ppa[pg_idx]->p_lckcnt != 0) 4458 ulckpgs++; 4459 } 4460 4461 if (ulckpgs != 0) { 4462 mutex_enter(&freemem_lock); 4463 availrmem += ulckpgs; 4464 pages_claimed -= ulckpgs; 4465 mutex_exit(&freemem_lock); 4466 } 4467 4468 for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) { 4469 ppa[pg_idx]->p_cowcnt--; 4470 ppa[pg_idx]->p_lckcnt++; 4471 4472 } 4473 mutex_exit(&page_llock); 4474 return (1); 4475 } 4476 4477 page_t 
* 4478 page_numtopp(pfn_t pfnum, se_t se) 4479 { 4480 page_t *pp; 4481 4482 retry: 4483 pp = page_numtopp_nolock(pfnum); 4484 if (pp == NULL) { 4485 return ((page_t *)NULL); 4486 } 4487 4488 /* 4489 * Acquire the appropriate lock on the page. 4490 */ 4491 while (!page_lock(pp, se, (kmutex_t *)NULL, P_RECLAIM)) { 4492 if (page_pptonum(pp) != pfnum) 4493 goto retry; 4494 continue; 4495 } 4496 4497 if (page_pptonum(pp) != pfnum) { 4498 page_unlock(pp); 4499 goto retry; 4500 } 4501 4502 return (pp); 4503 } 4504 4505 page_t * 4506 page_numtopp_noreclaim(pfn_t pfnum, se_t se) 4507 { 4508 page_t *pp; 4509 4510 retry: 4511 pp = page_numtopp_nolock(pfnum); 4512 if (pp == NULL) { 4513 return ((page_t *)NULL); 4514 } 4515 4516 /* 4517 * Acquire the appropriate lock on the page. 4518 */ 4519 while (!page_lock(pp, se, (kmutex_t *)NULL, P_NO_RECLAIM)) { 4520 if (page_pptonum(pp) != pfnum) 4521 goto retry; 4522 continue; 4523 } 4524 4525 if (page_pptonum(pp) != pfnum) { 4526 page_unlock(pp); 4527 goto retry; 4528 } 4529 4530 return (pp); 4531 } 4532 4533 /* 4534 * This routine is like page_numtopp, but will only return page structs 4535 * for pages which are ok for loading into hardware using the page struct. 4536 */ 4537 page_t * 4538 page_numtopp_nowait(pfn_t pfnum, se_t se) 4539 { 4540 page_t *pp; 4541 4542 retry: 4543 pp = page_numtopp_nolock(pfnum); 4544 if (pp == NULL) { 4545 return ((page_t *)NULL); 4546 } 4547 4548 /* 4549 * Try to acquire the appropriate lock on the page. 4550 */ 4551 if (PP_ISFREE(pp)) 4552 pp = NULL; 4553 else { 4554 if (!page_trylock(pp, se)) 4555 pp = NULL; 4556 else { 4557 if (page_pptonum(pp) != pfnum) { 4558 page_unlock(pp); 4559 goto retry; 4560 } 4561 if (PP_ISFREE(pp)) { 4562 page_unlock(pp); 4563 pp = NULL; 4564 } 4565 } 4566 } 4567 return (pp); 4568 } 4569 4570 /* 4571 * Returns a count of dirty pages that are in the process 4572 * of being written out. If 'cleanit' is set, try to push the page. 4573 */ 4574 pgcnt_t 4575 page_busy(int cleanit) 4576 { 4577 page_t *page0 = page_first(); 4578 page_t *pp = page0; 4579 pgcnt_t nppbusy = 0; 4580 u_offset_t off; 4581 4582 do { 4583 vnode_t *vp = pp->p_vnode; 4584 4585 /* 4586 * A page is a candidate for syncing if it is: 4587 * 4588 * (a) On neither the freelist nor the cachelist 4589 * (b) Hashed onto a vnode 4590 * (c) Not a kernel page 4591 * (d) Dirty 4592 * (e) Not part of a swapfile 4593 * (f) a page which belongs to a real vnode; eg has a non-null 4594 * v_vfsp pointer. 4595 * (g) Backed by a filesystem which doesn't have a 4596 * stubbed-out sync operation 4597 */ 4598 if (!PP_ISFREE(pp) && vp != NULL && vp != &kvp && 4599 hat_ismod(pp) && !IS_SWAPVP(vp) && vp->v_vfsp != NULL && 4600 vfs_can_sync(vp->v_vfsp)) { 4601 nppbusy++; 4602 vfs_syncprogress(); 4603 4604 if (!cleanit) 4605 continue; 4606 if (!page_trylock(pp, SE_EXCL)) 4607 continue; 4608 4609 if (PP_ISFREE(pp) || vp == NULL || IS_SWAPVP(vp) || 4610 pp->p_lckcnt != 0 || pp->p_cowcnt != 0 || 4611 !(hat_pagesync(pp, 4612 HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD) & P_MOD)) { 4613 page_unlock(pp); 4614 continue; 4615 } 4616 off = pp->p_offset; 4617 VN_HOLD(vp); 4618 page_unlock(pp); 4619 (void) VOP_PUTPAGE(vp, off, PAGESIZE, 4620 B_ASYNC | B_FREE, kcred); 4621 VN_RELE(vp); 4622 } 4623 } while ((pp = page_next(pp)) != page0); 4624 4625 return (nppbusy); 4626 } 4627 4628 void page_invalidate_pages(void); 4629 4630 /* 4631 * callback handler to vm sub-system 4632 * 4633 * callers make sure no recursive entries to this func. 
4634 */ 4635 /*ARGSUSED*/ 4636 boolean_t 4637 callb_vm_cpr(void *arg, int code) 4638 { 4639 if (code == CB_CODE_CPR_CHKPT) 4640 page_invalidate_pages(); 4641 return (B_TRUE); 4642 } 4643 4644 /* 4645 * Invalidate all pages of the system. 4646 * It shouldn't be called until all user page activities are all stopped. 4647 */ 4648 void 4649 page_invalidate_pages() 4650 { 4651 page_t *pp; 4652 page_t *page0; 4653 pgcnt_t nbusypages; 4654 int retry = 0; 4655 const int MAXRETRIES = 4; 4656 #if defined(__sparc) 4657 extern struct vnode prom_ppages; 4658 #endif /* __sparc */ 4659 4660 top: 4661 /* 4662 * Flush dirty pages and destory the clean ones. 4663 */ 4664 nbusypages = 0; 4665 4666 pp = page0 = page_first(); 4667 do { 4668 struct vnode *vp; 4669 u_offset_t offset; 4670 int mod; 4671 4672 /* 4673 * skip the page if it has no vnode or the page associated 4674 * with the kernel vnode or prom allocated kernel mem. 4675 */ 4676 #if defined(__sparc) 4677 if ((vp = pp->p_vnode) == NULL || vp == &kvp || 4678 vp == &prom_ppages) 4679 #else /* x86 doesn't have prom or prom_ppage */ 4680 if ((vp = pp->p_vnode) == NULL || vp == &kvp) 4681 #endif /* __sparc */ 4682 continue; 4683 4684 /* 4685 * skip the page which is already free invalidated. 4686 */ 4687 if (PP_ISFREE(pp) && PP_ISAGED(pp)) 4688 continue; 4689 4690 /* 4691 * skip pages that are already locked or can't be "exclusively" 4692 * locked or are already free. After we lock the page, check 4693 * the free and age bits again to be sure it's not destroied 4694 * yet. 4695 * To achieve max. parallelization, we use page_trylock instead 4696 * of page_lock so that we don't get block on individual pages 4697 * while we have thousands of other pages to process. 4698 */ 4699 if (!page_trylock(pp, SE_EXCL)) { 4700 nbusypages++; 4701 continue; 4702 } else if (PP_ISFREE(pp)) { 4703 if (!PP_ISAGED(pp)) { 4704 page_destroy_free(pp); 4705 } else { 4706 page_unlock(pp); 4707 } 4708 continue; 4709 } 4710 /* 4711 * Is this page involved in some I/O? shared? 4712 * 4713 * The page_struct_lock need not be acquired to 4714 * examine these fields since the page has an 4715 * "exclusive" lock. 4716 */ 4717 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { 4718 page_unlock(pp); 4719 continue; 4720 } 4721 4722 if (vp->v_type == VCHR) { 4723 panic("vp->v_type == VCHR"); 4724 /*NOTREACHED*/ 4725 } 4726 4727 if (!page_try_demote_pages(pp)) { 4728 page_unlock(pp); 4729 continue; 4730 } 4731 4732 /* 4733 * Check the modified bit. Leave the bits alone in hardware 4734 * (they will be modified if we do the putpage). 4735 */ 4736 mod = (hat_pagesync(pp, HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD) 4737 & P_MOD); 4738 if (mod) { 4739 offset = pp->p_offset; 4740 /* 4741 * Hold the vnode before releasing the page lock 4742 * to prevent it from being freed and re-used by 4743 * some other thread. 4744 */ 4745 VN_HOLD(vp); 4746 page_unlock(pp); 4747 /* 4748 * No error return is checked here. Callers such as 4749 * cpr deals with the dirty pages at the dump time 4750 * if this putpage fails. 
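 *
 * In sketch form, each page that survives the checks above takes one of two
 * paths (consolidating the surrounding code; illustration only):
 *
 *	if (hat_pagesync(pp, HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD) & P_MOD) {
 *		VN_HOLD(vp);
 *		page_unlock(pp);
 *		(void) VOP_PUTPAGE(vp, offset, PAGESIZE, B_INVAL, kcred);
 *		VN_RELE(vp);
 *	} else {
 *		page_destroy(pp, 0);
 *	}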
4751 */ 4752 (void) VOP_PUTPAGE(vp, offset, PAGESIZE, B_INVAL, 4753 kcred); 4754 VN_RELE(vp); 4755 } else { 4756 page_destroy(pp, 0); 4757 } 4758 } while ((pp = page_next(pp)) != page0); 4759 if (nbusypages && retry++ < MAXRETRIES) { 4760 delay(1); 4761 goto top; 4762 } 4763 } 4764 4765 /* 4766 * Replace the page "old" with the page "new" on the page hash and vnode lists 4767 * 4768 * the replacemnt must be done in place, ie the equivalent sequence: 4769 * 4770 * vp = old->p_vnode; 4771 * off = old->p_offset; 4772 * page_do_hashout(old) 4773 * page_do_hashin(new, vp, off) 4774 * 4775 * doesn't work, since 4776 * 1) if old is the only page on the vnode, the v_pages list has a window 4777 * where it looks empty. This will break file system assumptions. 4778 * and 4779 * 2) pvn_vplist_dirty() can't deal with pages moving on the v_pages list. 4780 */ 4781 static void 4782 page_do_relocate_hash(page_t *new, page_t *old) 4783 { 4784 page_t **hash_list; 4785 vnode_t *vp = old->p_vnode; 4786 kmutex_t *sep; 4787 4788 ASSERT(PAGE_EXCL(old)); 4789 ASSERT(PAGE_EXCL(new)); 4790 ASSERT(vp != NULL); 4791 ASSERT(MUTEX_HELD(page_vnode_mutex(vp))); 4792 ASSERT(MUTEX_HELD(PAGE_HASH_MUTEX(PAGE_HASH_FUNC(vp, old->p_offset)))); 4793 4794 /* 4795 * First find old page on the page hash list 4796 */ 4797 hash_list = &page_hash[PAGE_HASH_FUNC(vp, old->p_offset)]; 4798 4799 for (;;) { 4800 if (*hash_list == old) 4801 break; 4802 if (*hash_list == NULL) { 4803 panic("page_do_hashout"); 4804 /*NOTREACHED*/ 4805 } 4806 hash_list = &(*hash_list)->p_hash; 4807 } 4808 4809 /* 4810 * update new and replace old with new on the page hash list 4811 */ 4812 new->p_vnode = old->p_vnode; 4813 new->p_offset = old->p_offset; 4814 new->p_hash = old->p_hash; 4815 *hash_list = new; 4816 4817 if ((new->p_vnode->v_flag & VISSWAP) != 0) 4818 PP_SETSWAP(new); 4819 4820 /* 4821 * replace old with new on the vnode's page list 4822 */ 4823 if (old->p_vpnext == old) { 4824 new->p_vpnext = new; 4825 new->p_vpprev = new; 4826 } else { 4827 new->p_vpnext = old->p_vpnext; 4828 new->p_vpprev = old->p_vpprev; 4829 new->p_vpnext->p_vpprev = new; 4830 new->p_vpprev->p_vpnext = new; 4831 } 4832 if (vp->v_pages == old) 4833 vp->v_pages = new; 4834 4835 /* 4836 * clear out the old page 4837 */ 4838 old->p_hash = NULL; 4839 old->p_vpnext = NULL; 4840 old->p_vpprev = NULL; 4841 old->p_vnode = NULL; 4842 PP_CLRSWAP(old); 4843 old->p_offset = (u_offset_t)-1; 4844 page_clr_all_props(old); 4845 4846 /* 4847 * Wake up processes waiting for this page. The page's 4848 * identity has been changed, and is probably not the 4849 * desired page any longer. 4850 */ 4851 sep = page_se_mutex(old); 4852 mutex_enter(sep); 4853 if (CV_HAS_WAITERS(&old->p_cv)) 4854 cv_broadcast(&old->p_cv); 4855 mutex_exit(sep); 4856 } 4857 4858 /* 4859 * This function moves the identity of page "pp_old" to page "pp_new". 4860 * Both pages must be locked on entry. "pp_new" is free, has no identity, 4861 * and need not be hashed out from anywhere. 
4862 */ 4863 void 4864 page_relocate_hash(page_t *pp_new, page_t *pp_old) 4865 { 4866 vnode_t *vp = pp_old->p_vnode; 4867 u_offset_t off = pp_old->p_offset; 4868 kmutex_t *phm, *vphm; 4869 4870 /* 4871 * Rehash two pages 4872 */ 4873 ASSERT(PAGE_EXCL(pp_old)); 4874 ASSERT(PAGE_EXCL(pp_new)); 4875 ASSERT(vp != NULL); 4876 ASSERT(pp_new->p_vnode == NULL); 4877 4878 /* 4879 * hashout then hashin while holding the mutexes 4880 */ 4881 phm = PAGE_HASH_MUTEX(PAGE_HASH_FUNC(vp, off)); 4882 mutex_enter(phm); 4883 vphm = page_vnode_mutex(vp); 4884 mutex_enter(vphm); 4885 4886 page_do_relocate_hash(pp_new, pp_old); 4887 4888 mutex_exit(vphm); 4889 mutex_exit(phm); 4890 4891 /* 4892 * The page_struct_lock need not be acquired for lckcnt and 4893 * cowcnt since the page has an "exclusive" lock. 4894 */ 4895 ASSERT(pp_new->p_lckcnt == 0); 4896 ASSERT(pp_new->p_cowcnt == 0); 4897 pp_new->p_lckcnt = pp_old->p_lckcnt; 4898 pp_new->p_cowcnt = pp_old->p_cowcnt; 4899 pp_old->p_lckcnt = pp_old->p_cowcnt = 0; 4900 4901 /* The following comment preserved from page_flip(). */ 4902 /* XXX - Do we need to protect fsdata? */ 4903 pp_new->p_fsdata = pp_old->p_fsdata; 4904 } 4905 4906 /* 4907 * Helper routine used to lock all remaining members of a 4908 * large page. The caller is responsible for passing in a locked 4909 * pp. If pp is a large page, then it succeeds in locking all the 4910 * remaining constituent pages or it returns with only the 4911 * original page locked. 4912 * 4913 * Returns 1 on success, 0 on failure. 4914 * 4915 * If success is returned, this routine guarantees that p_szc cannot change 4916 * for any constituent page of the large page that pp belongs to. To achieve this we 4917 * recheck the szc of pp after locking all constituent pages and retry if the szc 4918 * changed (it can only decrease). Since hat_page_demote() needs an EXCL 4919 * lock on one of the constituent pages it can't be running after all constituent 4920 * pages are locked. hat_page_demote() with a lock on a constituent page 4921 * outside of this large page (i.e. pp belonged to a larger large page) is 4922 * already done with all constituent pages of pp since the root's p_szc is 4923 * changed last. Therefore there is no need to synchronize with hat_page_demote() that 4924 * locked a constituent page outside of pp's current large page.
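 *
 * For illustration of the retry: if pp had p_szc == 2 when we started and a
 * racing hat_page_demote() dropped it to 1 before all constituent pages were
 * locked, the constituent locks taken for the szc 2 area are dropped and the
 * loop is retried with pszc == 1.  Since p_szc can only decrease while we
 * hold the lock on pp, the retry terminates.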
4925 */ 4926 #ifdef DEBUG 4927 uint32_t gpg_trylock_mtbf = 0; 4928 #endif 4929 4930 int 4931 group_page_trylock(page_t *pp, se_t se) 4932 { 4933 page_t *tpp; 4934 pgcnt_t npgs, i, j; 4935 uint_t pszc = pp->p_szc; 4936 4937 #ifdef DEBUG 4938 if (gpg_trylock_mtbf && !(gethrtime() % gpg_trylock_mtbf)) { 4939 return (0); 4940 } 4941 #endif 4942 4943 if (pp != PP_GROUPLEADER(pp, pszc)) { 4944 return (0); 4945 } 4946 4947 retry: 4948 ASSERT(PAGE_LOCKED_SE(pp, se)); 4949 ASSERT(!PP_ISFREE(pp)); 4950 if (pszc == 0) { 4951 return (1); 4952 } 4953 npgs = page_get_pagecnt(pszc); 4954 tpp = pp + 1; 4955 for (i = 1; i < npgs; i++, tpp++) { 4956 if (!page_trylock(tpp, se)) { 4957 tpp = pp + 1; 4958 for (j = 1; j < i; j++, tpp++) { 4959 page_unlock(tpp); 4960 } 4961 return (0); 4962 } 4963 } 4964 if (pp->p_szc != pszc) { 4965 ASSERT(pp->p_szc < pszc); 4966 ASSERT(pp->p_vnode != NULL && pp->p_vnode != &kvp && 4967 !IS_SWAPFSVP(pp->p_vnode)); 4968 tpp = pp + 1; 4969 for (i = 1; i < npgs; i++, tpp++) { 4970 page_unlock(tpp); 4971 } 4972 pszc = pp->p_szc; 4973 goto retry; 4974 } 4975 return (1); 4976 } 4977 4978 void 4979 group_page_unlock(page_t *pp) 4980 { 4981 page_t *tpp; 4982 pgcnt_t npgs, i; 4983 4984 ASSERT(PAGE_LOCKED(pp)); 4985 ASSERT(!PP_ISFREE(pp)); 4986 ASSERT(pp == PP_PAGEROOT(pp)); 4987 npgs = page_get_pagecnt(pp->p_szc); 4988 for (i = 1, tpp = pp + 1; i < npgs; i++, tpp++) { 4989 page_unlock(tpp); 4990 } 4991 } 4992 4993 /* 4994 * returns 4995 * 0 : on success and *nrelocp is number of relocated PAGESIZE pages 4996 * ERANGE : this is not a base page 4997 * EBUSY : failure to get locks on the page/pages 4998 * ENOMEM : failure to obtain replacement pages 4999 * EAGAIN : OBP has not yet completed its boot-time handoff to the kernel 5000 * 5001 * Return with all constituent members of target and replacement 5002 * SE_EXCL locked. It is the callers responsibility to drop the 5003 * locks. 5004 */ 5005 int 5006 do_page_relocate( 5007 page_t **target, 5008 page_t **replacement, 5009 int grouplock, 5010 spgcnt_t *nrelocp, 5011 lgrp_t *lgrp) 5012 { 5013 #ifdef DEBUG 5014 page_t *first_repl; 5015 #endif /* DEBUG */ 5016 page_t *repl; 5017 page_t *targ; 5018 page_t *pl = NULL; 5019 uint_t ppattr; 5020 pfn_t pfn, repl_pfn; 5021 uint_t szc; 5022 spgcnt_t npgs, i; 5023 int repl_contig = 0; 5024 uint_t flags = 0; 5025 spgcnt_t dofree = 0; 5026 5027 *nrelocp = 0; 5028 5029 #if defined(__sparc) 5030 /* 5031 * We need to wait till OBP has completed 5032 * its boot-time handoff of its resources to the kernel 5033 * before we allow page relocation 5034 */ 5035 if (page_relocate_ready == 0) { 5036 return (EAGAIN); 5037 } 5038 #endif 5039 5040 /* 5041 * If this is not a base page, 5042 * just return with 0x0 pages relocated. 5043 */ 5044 targ = *target; 5045 ASSERT(PAGE_EXCL(targ)); 5046 ASSERT(!PP_ISFREE(targ)); 5047 szc = targ->p_szc; 5048 ASSERT(szc < mmu_page_sizes); 5049 VM_STAT_ADD(vmm_vmstats.ppr_reloc[szc]); 5050 pfn = targ->p_pagenum; 5051 if (pfn != PFN_BASE(pfn, szc)) { 5052 VM_STAT_ADD(vmm_vmstats.ppr_relocnoroot[szc]); 5053 return (ERANGE); 5054 } 5055 5056 if ((repl = *replacement) != NULL && repl->p_szc >= szc) { 5057 repl_pfn = repl->p_pagenum; 5058 if (repl_pfn != PFN_BASE(repl_pfn, szc)) { 5059 VM_STAT_ADD(vmm_vmstats.ppr_reloc_replnoroot[szc]); 5060 return (ERANGE); 5061 } 5062 repl_contig = 1; 5063 } 5064 5065 /* 5066 * We must lock all members of this large page or we cannot 5067 * relocate any part of it. 
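 *
 * In sketch form, the protocol used below is to take the group lock with
 *
 *	if (grouplock != 0 && !group_page_trylock(targ, SE_EXCL))
 *		return (EBUSY);
 *
 * and to call group_page_unlock(targ) only on the error paths.
 * group_page_unlock() drops the locks on the other constituent pages but
 * leaves the root page itself locked; on success every constituent page of
 * both target and replacement is handed back to the caller still locked.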
5068 */ 5069 if (grouplock != 0 && !group_page_trylock(targ, SE_EXCL)) { 5070 VM_STAT_ADD(vmm_vmstats.ppr_relocnolock[targ->p_szc]); 5071 return (EBUSY); 5072 } 5073 5074 /* 5075 * reread szc it could have been decreased before 5076 * group_page_trylock() was done. 5077 */ 5078 szc = targ->p_szc; 5079 ASSERT(szc < mmu_page_sizes); 5080 VM_STAT_ADD(vmm_vmstats.ppr_reloc[szc]); 5081 ASSERT(pfn == PFN_BASE(pfn, szc)); 5082 5083 npgs = page_get_pagecnt(targ->p_szc); 5084 5085 if (repl == NULL) { 5086 dofree = npgs; /* Size of target page in MMU pages */ 5087 if (!page_create_wait(dofree, 0)) { 5088 if (grouplock != 0) { 5089 group_page_unlock(targ); 5090 } 5091 VM_STAT_ADD(vmm_vmstats.ppr_relocnomem[szc]); 5092 return (ENOMEM); 5093 } 5094 5095 /* 5096 * seg kmem pages require that the target and replacement 5097 * page be the same pagesize. 5098 */ 5099 flags = (targ->p_vnode == &kvp) ? PGR_SAMESZC : 0; 5100 repl = page_get_replacement_page(targ, lgrp, flags); 5101 if (repl == NULL) { 5102 if (grouplock != 0) { 5103 group_page_unlock(targ); 5104 } 5105 page_create_putback(dofree); 5106 VM_STAT_ADD(vmm_vmstats.ppr_relocnomem[szc]); 5107 return (ENOMEM); 5108 } 5109 } 5110 #ifdef DEBUG 5111 else { 5112 ASSERT(PAGE_LOCKED(repl)); 5113 } 5114 #endif /* DEBUG */ 5115 5116 #if defined(__sparc) 5117 /* 5118 * Let hat_page_relocate() complete the relocation if it's kernel page 5119 */ 5120 if (targ->p_vnode == &kvp) { 5121 *replacement = repl; 5122 if (hat_page_relocate(target, replacement, nrelocp) != 0) { 5123 if (grouplock != 0) { 5124 group_page_unlock(targ); 5125 } 5126 if (dofree) { 5127 *replacement = NULL; 5128 page_free_replacement_page(repl); 5129 page_create_putback(dofree); 5130 } 5131 VM_STAT_ADD(vmm_vmstats.ppr_krelocfail[szc]); 5132 return (EAGAIN); 5133 } 5134 VM_STAT_ADD(vmm_vmstats.ppr_relocok[szc]); 5135 return (0); 5136 } 5137 #else 5138 #if defined(lint) 5139 dofree = dofree; 5140 #endif 5141 #endif 5142 5143 #ifdef DEBUG 5144 first_repl = repl; 5145 #endif /* DEBUG */ 5146 5147 for (i = 0; i < npgs; i++) { 5148 ASSERT(PAGE_EXCL(targ)); 5149 5150 (void) hat_pageunload(targ, HAT_FORCE_PGUNLOAD); 5151 5152 ASSERT(hat_page_getshare(targ) == 0); 5153 ASSERT(!PP_ISFREE(targ)); 5154 ASSERT(targ->p_pagenum == (pfn + i)); 5155 ASSERT(repl_contig == 0 || 5156 repl->p_pagenum == (repl_pfn + i)); 5157 5158 /* 5159 * Copy the page contents and attributes then 5160 * relocate the page in the page hash. 5161 */ 5162 ppcopy(targ, repl); 5163 ppattr = hat_page_getattr(targ, (P_MOD | P_REF | P_RO)); 5164 page_clr_all_props(repl); 5165 page_set_props(repl, ppattr); 5166 page_relocate_hash(repl, targ); 5167 5168 ASSERT(hat_page_getshare(targ) == 0); 5169 ASSERT(hat_page_getshare(repl) == 0); 5170 /* 5171 * Now clear the props on targ, after the 5172 * page_relocate_hash(), they no longer 5173 * have any meaning. 5174 */ 5175 page_clr_all_props(targ); 5176 ASSERT(targ->p_next == targ); 5177 ASSERT(targ->p_prev == targ); 5178 page_list_concat(&pl, &targ); 5179 5180 targ++; 5181 if (repl_contig != 0) { 5182 repl++; 5183 } else { 5184 repl = repl->p_next; 5185 } 5186 } 5187 /* assert that we have come full circle with repl */ 5188 ASSERT(repl_contig == 1 || first_repl == repl); 5189 5190 *target = pl; 5191 if (*replacement == NULL) { 5192 ASSERT(first_repl == repl); 5193 *replacement = repl; 5194 } 5195 VM_STAT_ADD(vmm_vmstats.ppr_relocok[szc]); 5196 *nrelocp = npgs; 5197 return (0); 5198 } 5199 /* 5200 * On success returns 0 and *nrelocp the number of PAGESIZE pages relocated. 
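 *
 * A minimal caller sketch (hypothetical; tpp is an SE_EXCL locked target
 * page, error handling elided):
 *
 *	spgcnt_t npgs;
 *	page_t *rpp = NULL;
 *
 *	if (page_relocate(&tpp, &rpp, 0, 1, &npgs, NULL) == 0)
 *		... rpp now heads the relocated pages ...
 *
 * Passing a NULL replacement asks do_page_relocate() to allocate the
 * replacement itself; with freetarget nonzero the old target pages have
 * already been freed by the time 0 is returned, and npgs is the number of
 * PAGESIZE pages relocated.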
5201 */ 5202 int 5203 page_relocate( 5204 page_t **target, 5205 page_t **replacement, 5206 int grouplock, 5207 int freetarget, 5208 spgcnt_t *nrelocp, 5209 lgrp_t *lgrp) 5210 { 5211 spgcnt_t ret; 5212 5213 /* do_page_relocate returns 0 on success or errno value */ 5214 ret = do_page_relocate(target, replacement, grouplock, nrelocp, lgrp); 5215 5216 if (ret != 0 || freetarget == 0) { 5217 return (ret); 5218 } 5219 if (*nrelocp == 1) { 5220 ASSERT(*target != NULL); 5221 page_free(*target, 1); 5222 } else { 5223 page_t *tpp = *target; 5224 uint_t szc = tpp->p_szc; 5225 pgcnt_t npgs = page_get_pagecnt(szc); 5226 ASSERT(npgs > 1); 5227 ASSERT(szc != 0); 5228 do { 5229 ASSERT(PAGE_EXCL(tpp)); 5230 ASSERT(!hat_page_is_mapped(tpp)); 5231 ASSERT(tpp->p_szc == szc); 5232 PP_SETFREE(tpp); 5233 PP_SETAGED(tpp); 5234 npgs--; 5235 } while ((tpp = tpp->p_next) != *target); 5236 ASSERT(npgs == 0); 5237 page_list_add_pages(*target, 0); 5238 npgs = page_get_pagecnt(szc); 5239 page_create_putback(npgs); 5240 } 5241 return (ret); 5242 } 5243 5244 /* 5245 * it is up to the caller to deal with pcf accounting. 5246 */ 5247 void 5248 page_free_replacement_page(page_t *pplist) 5249 { 5250 page_t *pp; 5251 5252 while (pplist != NULL) { 5253 /* 5254 * pp_targ is a linked list. 5255 */ 5256 pp = pplist; 5257 if (pp->p_szc == 0) { 5258 page_sub(&pplist, pp); 5259 page_clr_all_props(pp); 5260 PP_SETFREE(pp); 5261 PP_SETAGED(pp); 5262 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL); 5263 page_unlock(pp); 5264 VM_STAT_ADD(pagecnt.pc_free_replacement_page[0]); 5265 } else { 5266 spgcnt_t curnpgs = page_get_pagecnt(pp->p_szc); 5267 page_t *tpp; 5268 page_list_break(&pp, &pplist, curnpgs); 5269 tpp = pp; 5270 do { 5271 ASSERT(PAGE_EXCL(tpp)); 5272 ASSERT(!hat_page_is_mapped(tpp)); 5273 page_clr_all_props(pp); 5274 PP_SETFREE(tpp); 5275 PP_SETAGED(tpp); 5276 } while ((tpp = tpp->p_next) != pp); 5277 page_list_add_pages(pp, 0); 5278 VM_STAT_ADD(pagecnt.pc_free_replacement_page[1]); 5279 } 5280 } 5281 } 5282 5283 /* 5284 * Relocate target to non-relocatable replacement page. 5285 */ 5286 int 5287 page_relocate_cage(page_t **target, page_t **replacement) 5288 { 5289 page_t *tpp, *rpp; 5290 spgcnt_t pgcnt, npgs; 5291 int result; 5292 5293 tpp = *target; 5294 5295 ASSERT(PAGE_EXCL(tpp)); 5296 ASSERT(tpp->p_szc == 0); 5297 5298 pgcnt = btop(page_get_pagesize(tpp->p_szc)); 5299 5300 do { 5301 (void) page_create_wait(pgcnt, PG_WAIT | PG_NORELOC); 5302 rpp = page_get_replacement_page(tpp, NULL, PGR_NORELOC); 5303 if (rpp == NULL) { 5304 page_create_putback(pgcnt); 5305 kcage_cageout_wakeup(); 5306 } 5307 } while (rpp == NULL); 5308 5309 ASSERT(PP_ISNORELOC(rpp)); 5310 5311 result = page_relocate(&tpp, &rpp, 0, 1, &npgs, NULL); 5312 5313 if (result == 0) { 5314 *replacement = rpp; 5315 if (pgcnt != npgs) 5316 panic("page_relocate_cage: partial relocation"); 5317 } 5318 5319 return (result); 5320 } 5321 5322 /* 5323 * Release the page lock on a page, place on cachelist 5324 * tail if no longer mapped. Caller can let us know if 5325 * the page is known to be clean. 
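 *
 * The return value distinguishes three cases; in each of them the caller no
 * longer holds the page lock afterwards (illustration only):
 *
 *	switch (page_release(pp, 1)) {
 *	case PGREL_CLEAN:
 *		... page was clean and has gone to the cachelist ...
 *		break;
 *	case PGREL_MOD:
 *		... page is dirty and keeps its identity; push it later ...
 *		break;
 *	case PGREL_NOTREL:
 *		... page was mapped, locked, a swap page, or the lock
 *		... could not be upgraded; nothing was done beyond unlock ...
 *		break;
 *	}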
5326 */ 5327 int 5328 page_release(page_t *pp, int checkmod) 5329 { 5330 int status; 5331 5332 ASSERT(PAGE_LOCKED(pp) && !PP_ISFREE(pp) && 5333 (pp->p_vnode != NULL)); 5334 5335 if (!hat_page_is_mapped(pp) && !IS_SWAPVP(pp->p_vnode) && 5336 ((PAGE_SHARED(pp) && page_tryupgrade(pp)) || PAGE_EXCL(pp)) && 5337 pp->p_lckcnt == 0 && pp->p_cowcnt == 0 && 5338 !hat_page_is_mapped(pp)) { 5339 5340 /* 5341 * If page is modified, unlock it 5342 * 5343 * (p_nrm & P_MOD) bit has the latest stuff because: 5344 * (1) We found that this page doesn't have any mappings 5345 * _after_ holding SE_EXCL and 5346 * (2) We didn't drop SE_EXCL lock after the check in (1) 5347 */ 5348 if (checkmod && hat_ismod(pp)) { 5349 page_unlock(pp); 5350 status = PGREL_MOD; 5351 } else { 5352 /*LINTED: constant in conditional context*/ 5353 VN_DISPOSE(pp, B_FREE, 0, kcred); 5354 status = PGREL_CLEAN; 5355 } 5356 } else { 5357 page_unlock(pp); 5358 status = PGREL_NOTREL; 5359 } 5360 return (status); 5361 } 5362 5363 int 5364 page_try_demote_pages(page_t *pp) 5365 { 5366 page_t *tpp, *rootpp = pp; 5367 pfn_t pfn = page_pptonum(pp); 5368 spgcnt_t i, npgs; 5369 uint_t szc = pp->p_szc; 5370 vnode_t *vp = pp->p_vnode; 5371 5372 ASSERT(PAGE_EXCL(rootpp)); 5373 5374 VM_STAT_ADD(pagecnt.pc_try_demote_pages[0]); 5375 5376 if (rootpp->p_szc == 0) { 5377 VM_STAT_ADD(pagecnt.pc_try_demote_pages[1]); 5378 return (1); 5379 } 5380 5381 if (vp != NULL && !IS_SWAPFSVP(vp) && vp != &kvp) { 5382 VM_STAT_ADD(pagecnt.pc_try_demote_pages[2]); 5383 page_demote_vp_pages(rootpp); 5384 ASSERT(pp->p_szc == 0); 5385 return (1); 5386 } 5387 5388 /* 5389 * Adjust rootpp if passed in is not the base 5390 * constituent page. 5391 */ 5392 npgs = page_get_pagecnt(rootpp->p_szc); 5393 ASSERT(npgs > 1); 5394 if (!IS_P2ALIGNED(pfn, npgs)) { 5395 pfn = P2ALIGN(pfn, npgs); 5396 rootpp = page_numtopp_nolock(pfn); 5397 VM_STAT_ADD(pagecnt.pc_try_demote_pages[3]); 5398 ASSERT(rootpp->p_vnode != NULL); 5399 ASSERT(rootpp->p_szc == szc); 5400 } 5401 5402 /* 5403 * We can't demote kernel pages since we can't hat_unload() 5404 * the mappings. 5405 */ 5406 if (rootpp->p_vnode == &kvp) 5407 return (0); 5408 5409 /* 5410 * Attempt to lock all constituent pages except the page passed 5411 * in since it's already locked. 5412 */ 5413 for (tpp = rootpp, i = 0; i < npgs; i++, tpp = page_next(tpp)) { 5414 ASSERT(!PP_ISFREE(tpp)); 5415 ASSERT(tpp->p_vnode != NULL); 5416 5417 if (tpp != pp && !page_trylock(tpp, SE_EXCL)) 5418 break; 5419 ASSERT(tpp->p_szc == rootpp->p_szc); 5420 ASSERT(page_pptonum(tpp) == page_pptonum(rootpp) + i); 5421 (void) hat_pageunload(tpp, HAT_FORCE_PGUNLOAD); 5422 } 5423 5424 /* 5425 * If we failed to lock them all then unlock what we have locked 5426 * so far and bail. 5427 */ 5428 if (i < npgs) { 5429 tpp = rootpp; 5430 while (i-- > 0) { 5431 if (tpp != pp) 5432 page_unlock(tpp); 5433 tpp = page_next(tpp); 5434 } 5435 VM_STAT_ADD(pagecnt.pc_try_demote_pages[4]); 5436 return (0); 5437 } 5438 5439 /* 5440 * XXX probably p_szc clearing and page unlocking can be done within 5441 * one loop but since this is rare code we can play very safe. 5442 */ 5443 for (tpp = rootpp, i = 0; i < npgs; i++, tpp = page_next(tpp)) { 5444 ASSERT(PAGE_EXCL(tpp)); 5445 tpp->p_szc = 0; 5446 } 5447 5448 /* 5449 * Unlock all pages except the page passed in. 
5450 */ 5451 for (tpp = rootpp, i = 0; i < npgs; i++, tpp = page_next(tpp)) { 5452 ASSERT(!hat_page_is_mapped(tpp)); 5453 if (tpp != pp) 5454 page_unlock(tpp); 5455 } 5456 VM_STAT_ADD(pagecnt.pc_try_demote_pages[5]); 5457 return (1); 5458 } 5459 5460 /* 5461 * Called by page_free() and page_destroy() to demote the page size code 5462 * (p_szc) to 0 (since we can't just put a single PAGESIZE page with nonzero 5463 * p_szc on the free list, nor can we simply clear p_szc of a single page_t 5464 * within a large page, since that would break other code that relies on p_szc 5465 * being the same for all page_t's of a large page). Anonymous pages should 5466 * never end up here because anon_map_getpages() cannot deal with p_szc 5467 * changes after a single constituent page is locked. While anonymous or 5468 * kernel large pages are demoted or freed an entire large page at a time, 5469 * with all constituent pages locked EXCL, for file system pages we 5470 * have to be able to demote a large page (i.e. decrease the p_szc of all 5471 * constituent pages) with only an EXCL lock on one of the constituent pages. The reason 5472 * we can easily deal with anonymous page demotion an entire large page at a 5473 * time is that those operations originate at the address space level and concern 5474 * the entire large page region, with actual demotion only done when pages are 5475 * not shared with any other processes (therefore we can always get an EXCL lock 5476 * on all anonymous constituent pages after clearing the segment page 5477 * cache). However, file system pages can be truncated or invalidated at 5478 * PAGESIZE granularity from the file system side and end up in page_free() or 5479 * page_destroy() (we also allow only part of a large page to be SOFTLOCKed, 5480 * and therefore pageout should be able to demote a large page by EXCL locking 5481 * any constituent page that is not under SOFTLOCK). In those cases we cannot 5482 * rely on being able to lock all constituent pages EXCL. 5483 * 5484 * To prevent szc changes on file system pages one has to lock all constituent 5485 * pages at least SHARED (or call page_szc_lock()). The only subsystem that 5486 * doesn't rely on locking all constituent pages (or on page_szc_lock()) to 5487 * prevent szc changes is the hat layer, which uses its own page-level mlist 5488 * locks. The hat assumes that szc doesn't change after the mlist lock for a page is 5489 * taken. Therefore we need to change szc under hat-level locks if we only 5490 * have an EXCL lock on a single constituent page and the hat still references any 5491 * of the constituent pages. (Note we can't "ignore" the hat layer by simply 5492 * hat_pageunload()ing all constituent pages without having EXCL locks on all of 5493 * the constituent pages.) We use the hat_page_demote() call to safely demote the szc of 5494 * all constituent pages under hat locks when we only have an EXCL lock on one 5495 * of the constituent pages. 5496 * 5497 * This routine calls page_szc_lock() before calling hat_page_demote() to 5498 * allow segvn, in one special case, not to lock all constituent pages SHARED 5499 * before calling hat_memload_array(), which relies on p_szc not changing even 5500 * before the hat-level mlist lock is taken. In that case segvn uses 5501 * page_szc_lock() to prevent hat_page_demote() from changing p_szc values.
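 *
 * In sketch form, a reader that needs the p_szc of a file system page to
 * stay stable, without locking every constituent page, can bracket its use
 * as follows (illustration only):
 *
 *	if ((mtx = page_szc_lock(pp)) != NULL) {
 *		... p_szc of pp's large page cannot change here ...
 *		mutex_exit(mtx);
 *	}
 *
 * which is the same bracket page_demote_vp_pages() itself uses below.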
5502 * 5503 * Anonymous or kernel page demotion still has to lock all pages exclusively 5504 * and do hat_pageunload() on all constituent pages before demoting the page 5505 * therefore there's no need for anonymous or kernel page demotion to use 5506 * hat_page_demote() mechanism. 5507 * 5508 * hat_page_demote() removes all large mappings that map pp and then decreases 5509 * p_szc starting from the last constituent page of the large page. By working 5510 * from the tail of a large page in pfn decreasing order allows one looking at 5511 * the root page to know that hat_page_demote() is done for root's szc area. 5512 * e.g. if a root page has szc 1 one knows it only has to lock all constituent 5513 * pages within szc 1 area to prevent szc changes because hat_page_demote() 5514 * that started on this page when it had szc > 1 is done for this szc 1 area. 5515 * 5516 * We are guranteed that all constituent pages of pp's large page belong to 5517 * the same vnode with the consecutive offsets increasing in the direction of 5518 * the pfn i.e. the identity of constituent pages can't change until their 5519 * p_szc is decreased. Therefore it's safe for hat_page_demote() to remove 5520 * large mappings to pp even though we don't lock any constituent page except 5521 * pp (i.e. we won't unload e.g. kernel locked page). 5522 */ 5523 static void 5524 page_demote_vp_pages(page_t *pp) 5525 { 5526 kmutex_t *mtx; 5527 5528 ASSERT(PAGE_EXCL(pp)); 5529 ASSERT(!PP_ISFREE(pp)); 5530 ASSERT(pp->p_vnode != NULL); 5531 ASSERT(!IS_SWAPFSVP(pp->p_vnode)); 5532 ASSERT(pp->p_vnode != &kvp); 5533 5534 VM_STAT_ADD(pagecnt.pc_demote_pages[0]); 5535 5536 mtx = page_szc_lock(pp); 5537 if (mtx != NULL) { 5538 hat_page_demote(pp); 5539 mutex_exit(mtx); 5540 } 5541 ASSERT(pp->p_szc == 0); 5542 } 5543 5544 /* 5545 * Page retire operation. 5546 * 5547 * page_retire() 5548 * Attempt to retire (throw away) page pp. We cannot do this if 5549 * the page is dirty; if the page is clean, we can try. We return 0 on 5550 * success, -1 on failure. This routine should be invoked by the platform's 5551 * memory error detection code. 5552 * 5553 * pages_retired_limit_exceeded() 5554 * We set a limit on the number of pages which may be retired. This 5555 * is set to a percentage of total physical memory. This limit is 5556 * enforced here. 5557 */ 5558 5559 static pgcnt_t retired_pgcnt = 0; 5560 5561 /* 5562 * routines to update the count of retired pages 5563 */ 5564 static void 5565 page_retired(page_t *pp) 5566 { 5567 ASSERT(pp); 5568 5569 page_settoxic(pp, PAGE_IS_RETIRED); 5570 atomic_add_long(&retired_pgcnt, 1); 5571 } 5572 5573 static void 5574 retired_page_removed(page_t *pp) 5575 { 5576 ASSERT(pp); 5577 ASSERT(page_isretired(pp)); 5578 ASSERT(retired_pgcnt > 0); 5579 5580 page_clrtoxic(pp); 5581 atomic_add_long(&retired_pgcnt, -1); 5582 } 5583 5584 5585 static int 5586 pages_retired_limit_exceeded() 5587 { 5588 pgcnt_t retired_max; 5589 5590 /* 5591 * If the percentage is zero or is not set correctly, 5592 * return TRUE so that pages are not retired. 5593 */ 5594 if (max_pages_retired_bps <= 0 || 5595 max_pages_retired_bps >= 10000) 5596 return (1); 5597 5598 /* 5599 * Calculate the maximum number of pages allowed to 5600 * be retired as a percentage of total physical memory 5601 * (Remember that we are using basis points, hence the 10000.) 
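 *
 * For illustration, with the default max_pages_retired_bps of 10 basis
 * points (0.1%) and, say, physmem of 1048576 pages (4GB of 4K pages):
 *
 *	retired_max = (1048576 * 10) / 10000 = 1048 pages
 *
 * i.e. roughly 4MB worth of pages may be retired before the limit trips.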
5602 */ 5603 retired_max = (physmem * max_pages_retired_bps) / 10000; 5604 5605 /* 5606 * return 'TRUE' if we have already retired more 5607 * than the legal limit 5608 */ 5609 return (retired_pgcnt >= retired_max); 5610 } 5611 5612 #define PAGE_RETIRE_SELOCK 0 5613 #define PAGE_RETIRE_NORECLAIM 1 5614 #define PAGE_RETIRE_LOCKED 2 5615 #define PAGE_RETIRE_COW 3 5616 #define PAGE_RETIRE_DIRTY 4 5617 #define PAGE_RETIRE_LPAGE 5 5618 #define PAGE_RETIRE_SUCCESS 6 5619 #define PAGE_RETIRE_LIMIT 7 5620 #define PAGE_RETIRE_NCODES 8 5621 5622 typedef struct page_retire_op { 5623 int pr_count; 5624 short pr_unlock; 5625 short pr_retval; 5626 char *pr_message; 5627 } page_retire_op_t; 5628 5629 page_retire_op_t page_retire_ops[PAGE_RETIRE_NCODES] = { 5630 { 0, 0, -1, "cannot lock page" }, 5631 { 0, 0, -1, "cannot reclaim cached page" }, 5632 { 0, 1, -1, "page is locked" }, 5633 { 0, 1, -1, "copy-on-write page" }, 5634 { 0, 1, -1, "page is dirty" }, 5635 { 0, 1, -1, "cannot demote large page" }, 5636 { 0, 0, 0, "page successfully retired" }, 5637 { 0, 0, -1, "excess pages retired already" }, 5638 }; 5639 5640 static int 5641 page_retire_done(page_t *pp, int code) 5642 { 5643 page_retire_op_t *prop = &page_retire_ops[code]; 5644 5645 prop->pr_count++; 5646 5647 if (prop->pr_unlock) 5648 page_unlock(pp); 5649 5650 if (page_retire_messages > 1) { 5651 printf("page_retire(%p) pfn 0x%lx %s: %s\n", 5652 (void *)pp, page_pptonum(pp), 5653 prop->pr_retval == -1 ? "failed" : "succeeded", 5654 prop->pr_message); 5655 } 5656 5657 return (prop->pr_retval); 5658 } 5659 5660 int 5661 page_retire(page_t *pp, uchar_t flag) 5662 { 5663 uint64_t pa = ptob((uint64_t)page_pptonum(pp)); 5664 5665 ASSERT(flag == PAGE_IS_FAILING || flag == PAGE_IS_TOXIC); 5666 5667 /* 5668 * DR operations change the association between a page_t 5669 * and the physical page it represents. Check if the 5670 * page is still bad. 5671 */ 5672 if (!page_isfaulty(pp)) { 5673 page_clrtoxic(pp); 5674 return (page_retire_done(pp, PAGE_RETIRE_SUCCESS)); 5675 } 5676 5677 /* 5678 * We set the flag here so that even if we fail due 5679 * to exceeding the limit for retired pages, the 5680 * page will still be checked and either cleared 5681 * or retired in page_free(). 5682 */ 5683 page_settoxic(pp, flag); 5684 5685 if (flag == PAGE_IS_TOXIC) { 5686 if (page_retire_messages) { 5687 cmn_err(CE_NOTE, "Scheduling clearing of error on" 5688 " page 0x%08x.%08x", 5689 (uint32_t)(pa >> 32), (uint32_t)pa); 5690 } 5691 5692 } else { /* PAGE_IS_FAILING */ 5693 if (pages_retired_limit_exceeded()) { 5694 /* 5695 * Return as we have already exceeded the 5696 * maximum number of pages allowed to be 5697 * retired 5698 */ 5699 return (page_retire_done(pp, PAGE_RETIRE_LIMIT)); 5700 } 5701 5702 if (page_retire_messages) { 5703 cmn_err(CE_NOTE, "Scheduling removal of " 5704 "page 0x%08x.%08x", 5705 (uint32_t)(pa >> 32), (uint32_t)pa); 5706 } 5707 } 5708 5709 if (PAGE_LOCKED(pp) || !page_trylock(pp, SE_EXCL)) 5710 return (page_retire_done(pp, PAGE_RETIRE_SELOCK)); 5711 5712 /* 5713 * If this is a large page we first try and demote it 5714 * to PAGESIZE pages and then dispose of the toxic page. 5715 * On failure we will let the page free/destroy 5716 * code handle it later since this is a mapped page. 5717 * Note that free large pages can always be demoted. 
5718 * 5719 */ 5720 if (pp->p_szc != 0) { 5721 if (PP_ISFREE(pp)) 5722 (void) page_demote_free_pages(pp); 5723 else 5724 (void) page_try_demote_pages(pp); 5725 5726 if (pp->p_szc != 0) 5727 return (page_retire_done(pp, PAGE_RETIRE_LPAGE)); 5728 } 5729 5730 if (PP_ISFREE(pp)) { 5731 if (!page_reclaim(pp, NULL)) 5732 return (page_retire_done(pp, PAGE_RETIRE_NORECLAIM)); 5733 /*LINTED: constant in conditional context*/ 5734 VN_DISPOSE(pp, pp->p_vnode ? B_INVAL : B_FREE, 0, kcred) 5735 return (page_retire_done(pp, PAGE_RETIRE_SUCCESS)); 5736 } 5737 5738 if (pp->p_lckcnt != 0) 5739 return (page_retire_done(pp, PAGE_RETIRE_LOCKED)); 5740 5741 if (pp->p_cowcnt != 0) 5742 return (page_retire_done(pp, PAGE_RETIRE_COW)); 5743 5744 /* 5745 * Unload all translations to this page. No new translations 5746 * can be created while we hold the exclusive lock on the page. 5747 */ 5748 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 5749 5750 if (hat_ismod(pp)) 5751 return (page_retire_done(pp, PAGE_RETIRE_DIRTY)); 5752 5753 /*LINTED: constant in conditional context*/ 5754 VN_DISPOSE(pp, B_INVAL, 0, kcred); 5755 5756 return (page_retire_done(pp, PAGE_RETIRE_SUCCESS)); 5757 } 5758 5759 /* 5760 * Mark any existing pages for migration in the given range 5761 */ 5762 void 5763 page_mark_migrate(struct seg *seg, caddr_t addr, size_t len, 5764 struct anon_map *amp, ulong_t anon_index, vnode_t *vp, 5765 u_offset_t vnoff, int rflag) 5766 { 5767 struct anon *ap; 5768 vnode_t *curvp; 5769 lgrp_t *from; 5770 pgcnt_t i; 5771 pgcnt_t nlocked; 5772 u_offset_t off; 5773 pfn_t pfn; 5774 size_t pgsz; 5775 size_t segpgsz; 5776 pgcnt_t pages; 5777 uint_t pszc; 5778 page_t **ppa; 5779 pgcnt_t ppa_nentries; 5780 page_t *pp; 5781 caddr_t va; 5782 ulong_t an_idx; 5783 anon_sync_obj_t cookie; 5784 5785 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 5786 5787 /* 5788 * Don't do anything if don't need to do lgroup optimizations 5789 * on this system 5790 */ 5791 if (!lgrp_optimizations()) 5792 return; 5793 5794 /* 5795 * Align address and length to (potentially large) page boundary 5796 */ 5797 segpgsz = page_get_pagesize(seg->s_szc); 5798 addr = (caddr_t)P2ALIGN((uintptr_t)addr, segpgsz); 5799 if (rflag) 5800 len = P2ROUNDUP(len, segpgsz); 5801 5802 /* 5803 * Allocate page array to accomodate largest page size 5804 */ 5805 pgsz = page_get_pagesize(page_num_pagesizes() - 1); 5806 ppa_nentries = btop(pgsz); 5807 ppa = kmem_zalloc(ppa_nentries * sizeof (page_t *), KM_SLEEP); 5808 5809 /* 5810 * Do one (large) page at a time 5811 */ 5812 va = addr; 5813 while (va < addr + len) { 5814 /* 5815 * Lookup (root) page for vnode and offset corresponding to 5816 * this virtual address 5817 * Try anonmap first since there may be copy-on-write 5818 * pages, but initialize vnode pointer and offset using 5819 * vnode arguments just in case there isn't an amp. 
5820 */ 5821 curvp = vp; 5822 off = vnoff + va - seg->s_base; 5823 if (amp) { 5824 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 5825 an_idx = anon_index + seg_page(seg, va); 5826 anon_array_enter(amp, an_idx, &cookie); 5827 ap = anon_get_ptr(amp->ahp, an_idx); 5828 if (ap) 5829 swap_xlate(ap, &curvp, &off); 5830 anon_array_exit(&cookie); 5831 ANON_LOCK_EXIT(&->a_rwlock); 5832 } 5833 5834 pp = NULL; 5835 if (curvp) 5836 pp = page_lookup(curvp, off, SE_SHARED); 5837 5838 /* 5839 * If there isn't a page at this virtual address, 5840 * skip to next page 5841 */ 5842 if (pp == NULL) { 5843 va += PAGESIZE; 5844 continue; 5845 } 5846 5847 /* 5848 * Figure out which lgroup this page is in for kstats 5849 */ 5850 pfn = page_pptonum(pp); 5851 from = lgrp_pfn_to_lgrp(pfn); 5852 5853 /* 5854 * Get page size, and round up and skip to next page boundary 5855 * if unaligned address 5856 */ 5857 pszc = pp->p_szc; 5858 pgsz = page_get_pagesize(pszc); 5859 pages = btop(pgsz); 5860 if (!IS_P2ALIGNED(va, pgsz) || 5861 !IS_P2ALIGNED(pfn, pages) || 5862 pgsz > segpgsz) { 5863 pgsz = MIN(pgsz, segpgsz); 5864 page_unlock(pp); 5865 i = btop(P2END((uintptr_t)va, pgsz) - 5866 (uintptr_t)va); 5867 va = (caddr_t)P2END((uintptr_t)va, pgsz); 5868 lgrp_stat_add(from->lgrp_id, LGRP_PMM_FAIL_PGS, i); 5869 continue; 5870 } 5871 5872 /* 5873 * Upgrade to exclusive lock on page 5874 */ 5875 if (!page_tryupgrade(pp)) { 5876 page_unlock(pp); 5877 va += pgsz; 5878 lgrp_stat_add(from->lgrp_id, LGRP_PMM_FAIL_PGS, 5879 btop(pgsz)); 5880 continue; 5881 } 5882 5883 /* 5884 * Remember pages locked exclusively and how many 5885 */ 5886 ppa[0] = pp; 5887 nlocked = 1; 5888 5889 /* 5890 * Lock constituent pages if this is large page 5891 */ 5892 if (pages > 1) { 5893 /* 5894 * Lock all constituents except root page, since it 5895 * should be locked already. 5896 */ 5897 for (i = 1; i < pages; i++) { 5898 pp = page_next(pp); 5899 if (!page_trylock(pp, SE_EXCL)) { 5900 break; 5901 } 5902 if (PP_ISFREE(pp) || 5903 pp->p_szc != pszc) { 5904 /* 5905 * hat_page_demote() raced in with us. 5906 */ 5907 ASSERT(!IS_SWAPFSVP(curvp)); 5908 page_unlock(pp); 5909 break; 5910 } 5911 ppa[nlocked] = pp; 5912 nlocked++; 5913 } 5914 } 5915 5916 /* 5917 * If all constituent pages couldn't be locked, 5918 * unlock pages locked so far and skip to next page. 5919 */ 5920 if (nlocked != pages) { 5921 for (i = 0; i < nlocked; i++) 5922 page_unlock(ppa[i]); 5923 va += pgsz; 5924 lgrp_stat_add(from->lgrp_id, LGRP_PMM_FAIL_PGS, 5925 btop(pgsz)); 5926 continue; 5927 } 5928 5929 /* 5930 * hat_page_demote() can no longer happen 5931 * since last cons page had the right p_szc after 5932 * all cons pages were locked. all cons pages 5933 * should now have the same p_szc. 
5934 */ 5935 5936 /* 5937 * All constituent pages locked successfully, so mark 5938 * large page for migration and unload the mappings of 5939 * constituent pages, so a fault will occur on any part of the 5940 * large page 5941 */ 5942 PP_SETMIGRATE(ppa[0]); 5943 for (i = 0; i < nlocked; i++) { 5944 pp = ppa[i]; 5945 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 5946 ASSERT(hat_page_getshare(pp) == 0); 5947 page_unlock(pp); 5948 } 5949 lgrp_stat_add(from->lgrp_id, LGRP_PMM_PGS, nlocked); 5950 5951 va += pgsz; 5952 } 5953 kmem_free(ppa, ppa_nentries * sizeof (page_t *)); 5954 } 5955 5956 /* 5957 * Migrate any pages that have been marked for migration in the given range 5958 */ 5959 void 5960 page_migrate( 5961 struct seg *seg, 5962 caddr_t addr, 5963 page_t **ppa, 5964 pgcnt_t npages) 5965 { 5966 lgrp_t *from; 5967 lgrp_t *to; 5968 page_t *newpp; 5969 page_t *pp; 5970 pfn_t pfn; 5971 size_t pgsz; 5972 spgcnt_t page_cnt; 5973 spgcnt_t i; 5974 uint_t pszc; 5975 5976 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 5977 5978 while (npages > 0) { 5979 pp = *ppa; 5980 pszc = pp->p_szc; 5981 pgsz = page_get_pagesize(pszc); 5982 page_cnt = btop(pgsz); 5983 5984 /* 5985 * Check to see whether this page is marked for migration 5986 * 5987 * Assume that root page of large page is marked for 5988 * migration and none of the other constituent pages 5989 * are marked. This really simplifies clearing the 5990 * migrate bit by not having to clear it from each 5991 * constituent page. 5992 * 5993 * note we don't want to relocate an entire large page if 5994 * someone is only using one subpage. 5995 */ 5996 if (npages < page_cnt) 5997 break; 5998 5999 /* 6000 * Is it marked for migration? 6001 */ 6002 if (!PP_ISMIGRATE(pp)) 6003 goto next; 6004 6005 /* 6006 * Determine lgroups that page is being migrated between 6007 */ 6008 pfn = page_pptonum(pp); 6009 if (!IS_P2ALIGNED(pfn, page_cnt)) { 6010 break; 6011 } 6012 from = lgrp_pfn_to_lgrp(pfn); 6013 to = lgrp_mem_choose(seg, addr, pgsz); 6014 6015 /* 6016 * Check to see whether we are trying to migrate page to lgroup 6017 * where it is allocated already 6018 */ 6019 if (to == from) { 6020 PP_CLRMIGRATE(pp); 6021 goto next; 6022 } 6023 6024 /* 6025 * Need to get exclusive lock's to migrate 6026 */ 6027 for (i = 0; i < page_cnt; i++) { 6028 ASSERT(PAGE_LOCKED(ppa[i])); 6029 if (page_pptonum(ppa[i]) != pfn + i || 6030 ppa[i]->p_szc != pszc) { 6031 break; 6032 } 6033 if (!page_tryupgrade(ppa[i])) { 6034 lgrp_stat_add(from->lgrp_id, 6035 LGRP_PM_FAIL_LOCK_PGS, 6036 page_cnt); 6037 break; 6038 } 6039 } 6040 if (i != page_cnt) { 6041 while (--i != -1) { 6042 page_downgrade(ppa[i]); 6043 } 6044 goto next; 6045 } 6046 6047 (void) page_create_wait(page_cnt, PG_WAIT); 6048 newpp = page_get_replacement_page(pp, to, PGR_SAMESZC); 6049 if (newpp == NULL) { 6050 page_create_putback(page_cnt); 6051 for (i = 0; i < page_cnt; i++) { 6052 page_downgrade(ppa[i]); 6053 } 6054 lgrp_stat_add(to->lgrp_id, LGRP_PM_FAIL_ALLOC_PGS, 6055 page_cnt); 6056 goto next; 6057 } 6058 ASSERT(newpp->p_szc == pszc); 6059 /* 6060 * Clear migrate bit and relocate page 6061 */ 6062 PP_CLRMIGRATE(pp); 6063 if (page_relocate(&pp, &newpp, 0, 1, &page_cnt, to)) { 6064 panic("page_migrate: page_relocate failed"); 6065 } 6066 ASSERT(page_cnt * PAGESIZE == pgsz); 6067 6068 /* 6069 * Keep stats for number of pages migrated from and to 6070 * each lgroup 6071 */ 6072 lgrp_stat_add(from->lgrp_id, LGRP_PM_SRC_PGS, page_cnt); 6073 lgrp_stat_add(to->lgrp_id, LGRP_PM_DEST_PGS, page_cnt); 6074 /* 
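 * page_relocate() hands the replacement pages back as a list headed by
 * newpp; each pass of the loop below stores the current head into ppa[],
 * detaches it from the list with page_sub(), and downgrades its lock to
 * shared for the caller.  In other words, we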
6075 * update the page_t array we were passed in and 6076 * unlink constituent pages of a large page. 6077 */ 6078 for (i = 0; i < page_cnt; ++i, ++pp) { 6079 ASSERT(PAGE_EXCL(newpp)); 6080 ASSERT(newpp->p_szc == pszc); 6081 ppa[i] = newpp; 6082 pp = newpp; 6083 page_sub(&newpp, pp); 6084 page_downgrade(pp); 6085 } 6086 ASSERT(newpp == NULL); 6087 next: 6088 addr += pgsz; 6089 ppa += page_cnt; 6090 npages -= page_cnt; 6091 } 6092 } 6093 6094 /* 6095 * initialize the vnode for retired pages 6096 */ 6097 static void 6098 page_retired_init(void) 6099 { 6100 vn_setops(&retired_ppages, &retired_vnodeops); 6101 } 6102 6103 /* ARGSUSED */ 6104 static void 6105 retired_dispose(vnode_t *vp, page_t *pp, int flag, int dn, cred_t *cr) 6106 { 6107 panic("retired_dispose invoked"); 6108 } 6109 6110 /* ARGSUSED */ 6111 static void 6112 retired_inactive(vnode_t *vp, cred_t *cr) 6113 {} 6114 6115 void 6116 page_unretire_pages(void) 6117 { 6118 page_t *pp; 6119 kmutex_t *vphm; 6120 vnode_t *vp; 6121 page_t *rpages[UNRETIRE_PAGES]; 6122 pgcnt_t i, npages, rmem; 6123 uint64_t pa; 6124 6125 rmem = 0; 6126 6127 for (;;) { 6128 /* 6129 * We do this in 2 steps: 6130 * 6131 * 1. We walk the retired pages list and collect a list of 6132 * pages that have the toxic field cleared. 6133 * 6134 * 2. We iterate through the page list and unretire each one. 6135 * 6136 * We have to do it in two steps on account of the mutexes that 6137 * we need to acquire. 6138 */ 6139 6140 vp = &retired_ppages; 6141 vphm = page_vnode_mutex(vp); 6142 mutex_enter(vphm); 6143 6144 if ((pp = vp->v_pages) == NULL) { 6145 mutex_exit(vphm); 6146 break; 6147 } 6148 6149 i = 0; 6150 do { 6151 ASSERT(pp != NULL); 6152 ASSERT(pp->p_vnode == vp); 6153 6154 /* 6155 * DR operations change the association between a page_t 6156 * and the physical page it represents. Check if the 6157 * page is still bad. If not, unretire it. 6158 */ 6159 if (!page_isfaulty(pp)) 6160 rpages[i++] = pp; 6161 6162 pp = pp->p_vpnext; 6163 } while ((pp != vp->v_pages) && (i < UNRETIRE_PAGES)); 6164 6165 mutex_exit(vphm); 6166 6167 npages = i; 6168 for (i = 0; i < npages; i++) { 6169 pp = rpages[i]; 6170 pa = ptob((uint64_t)page_pptonum(pp)); 6171 6172 /* 6173 * Need to upgrade the shared lock to an exclusive 6174 * lock in order to hash out the page. 6175 * 6176 * The page could have been retired but the page lock 6177 * may not have been downgraded yet. If so, skip this 6178 * page. page_free() will call this function after the 6179 * lock is downgraded. 6180 */ 6181 6182 if (!PAGE_SHARED(pp) || !page_tryupgrade(pp)) 6183 continue; 6184 6185 /* 6186 * Both page_free() and DR call this function. They 6187 * can potentially call this function at the same 6188 * time and race with each other. 6189 */ 6190 if (!page_isretired(pp) || page_isfaulty(pp)) { 6191 page_downgrade(pp); 6192 continue; 6193 } 6194 6195 cmn_err(CE_NOTE, 6196 "unretiring retired page 0x%08x.%08x", 6197 (uint32_t)(pa >> 32), (uint32_t)pa); 6198 6199 /* 6200 * When a page is removed from the retired pages vnode, 6201 * its toxic field is also cleared. So, we do not have 6202 * to do that seperately here. 6203 */ 6204 page_hashout(pp, (kmutex_t *)NULL); 6205 6206 /* 6207 * This is a good page. So, free it. 6208 */ 6209 pp->p_vnode = NULL; 6210 page_free(pp, 1); 6211 rmem++; 6212 } 6213 6214 /* 6215 * If the rpages array was filled up, then there could be more 6216 * retired pages that are not faulty. We need to iterate 6217 * again and unretire them. Otherwise, we are done. 
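 * (For example, a pass that fills the rpages array completely, so that
 * npages equals UNRETIRE_PAGES, forces another trip through the outer loop,
 * since more non-faulty retired pages may still be queued on the vnode.)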
6218 */ 6219 if (npages < UNRETIRE_PAGES) 6220 break; 6221 } 6222 6223 mutex_enter(&freemem_lock); 6224 availrmem += rmem; 6225 mutex_exit(&freemem_lock); 6226 } 6227 6228 ulong_t mem_waiters = 0; 6229 ulong_t max_count = 20; 6230 #define MAX_DELAY 0x1ff 6231 6232 /* 6233 * Check if enough memory is available to proceed. 6234 * Depending on system configuration and how much memory is 6235 * reserved for swap we need to check against two variables. 6236 * e.g. on systems with little physical swap availrmem can be 6237 * more reliable indicator of how much memory is available. 6238 * On systems with large phys swap freemem can be better indicator. 6239 * If freemem drops below threshold level don't return an error 6240 * immediately but wake up pageout to free memory and block. 6241 * This is done number of times. If pageout is not able to free 6242 * memory within certain time return an error. 6243 * The same applies for availrmem but kmem_reap is used to 6244 * free memory. 6245 */ 6246 int 6247 page_mem_avail(pgcnt_t npages) 6248 { 6249 ulong_t count; 6250 6251 #if defined(__i386) 6252 if (freemem > desfree + npages && 6253 availrmem > swapfs_reserve + npages && 6254 btop(vmem_size(heap_arena, VMEM_FREE)) > tune.t_minarmem + 6255 npages) 6256 return (1); 6257 #else 6258 if (freemem > desfree + npages && 6259 availrmem > swapfs_reserve + npages) 6260 return (1); 6261 #endif 6262 6263 count = max_count; 6264 atomic_add_long(&mem_waiters, 1); 6265 6266 while (freemem < desfree + npages && --count) { 6267 cv_signal(&proc_pageout->p_cv); 6268 if (delay_sig(hz + (mem_waiters & MAX_DELAY))) { 6269 atomic_add_long(&mem_waiters, -1); 6270 return (0); 6271 } 6272 } 6273 if (count == 0) { 6274 atomic_add_long(&mem_waiters, -1); 6275 return (0); 6276 } 6277 6278 count = max_count; 6279 while (availrmem < swapfs_reserve + npages && --count) { 6280 kmem_reap(); 6281 if (delay_sig(hz + (mem_waiters & MAX_DELAY))) { 6282 atomic_add_long(&mem_waiters, -1); 6283 return (0); 6284 } 6285 } 6286 atomic_add_long(&mem_waiters, -1); 6287 if (count == 0) 6288 return (0); 6289 6290 #if defined(__i386) 6291 if (btop(vmem_size(heap_arena, VMEM_FREE)) < 6292 tune.t_minarmem + npages) 6293 return (0); 6294 #endif 6295 return (1); 6296 } 6297 6298 6299 /* 6300 * Search the memory segments to locate the desired page. Within a 6301 * segment, pages increase linearly with one page structure per 6302 * physical page frame (size PAGESIZE). The search begins 6303 * with the segment that was accessed last, to take advantage of locality. 6304 * If the hint misses, we start from the beginning of the sorted memseg list 6305 */ 6306 6307 6308 /* 6309 * Some data structures for pfn to pp lookup. 6310 */ 6311 ulong_t mhash_per_slot; 6312 struct memseg *memseg_hash[N_MEM_SLOTS]; 6313 6314 page_t * 6315 page_numtopp_nolock(pfn_t pfnum) 6316 { 6317 static struct memseg *last_memseg_by_pfnum = NULL; 6318 struct memseg *seg; 6319 page_t *pp; 6320 6321 /* 6322 * XXX - Since page_numtopp_nolock is called in many places where 6323 * the search fails more than it succeeds. It maybe worthwhile 6324 * to put a check for pf_is_memory or a pfnum <= max_pfn (set at 6325 * boot time). 
/*
 * Search the memory segments to locate the desired page. Within a
 * segment, pages increase linearly with one page structure per
 * physical page frame (size PAGESIZE). The search begins
 * with the segment that was accessed last, to take advantage of locality.
 * If the hint misses, we start from the beginning of the sorted memseg list.
 */


/*
 * Some data structures for pfn to pp lookup.
 */
ulong_t mhash_per_slot;
struct memseg *memseg_hash[N_MEM_SLOTS];

page_t *
page_numtopp_nolock(pfn_t pfnum)
{
	static struct memseg *last_memseg_by_pfnum = NULL;
	struct memseg *seg;
	page_t *pp;

	/*
	 * XXX - Since page_numtopp_nolock is called in many places where
	 * the search fails more often than it succeeds, it may be
	 * worthwhile to add a check for pf_is_memory() or for
	 * pfnum <= max_pfn (set at boot time), e.g.:
	 *
	 * if (!pf_is_memory(pfnum) || (pfnum > max_pfn))
	 *	return (NULL);
	 */

	MEMSEG_STAT_INCR(nsearch);

	/* Try last winner first */
	if (((seg = last_memseg_by_pfnum) != NULL) &&
	    (pfnum >= seg->pages_base) && (pfnum < seg->pages_end)) {
		MEMSEG_STAT_INCR(nlastwon);
		pp = seg->pages + (pfnum - seg->pages_base);
		if (pp->p_pagenum == pfnum)
			return ((page_t *)pp);
	}

	/* Else try the hash */
	if (((seg = memseg_hash[MEMSEG_PFN_HASH(pfnum)]) != NULL) &&
	    (pfnum >= seg->pages_base) && (pfnum < seg->pages_end)) {
		MEMSEG_STAT_INCR(nhashwon);
		last_memseg_by_pfnum = seg;
		pp = seg->pages + (pfnum - seg->pages_base);
		if (pp->p_pagenum == pfnum)
			return ((page_t *)pp);
	}

	/* Else brute force */
	for (seg = memsegs; seg != NULL; seg = seg->next) {
		if (pfnum >= seg->pages_base && pfnum < seg->pages_end) {
			last_memseg_by_pfnum = seg;
			pp = seg->pages + (pfnum - seg->pages_base);
			return ((page_t *)pp);
		}
	}
	last_memseg_by_pfnum = NULL;
	MEMSEG_STAT_INCR(nnotfound);
	return ((page_t *)NULL);

}

struct memseg *
page_numtomemseg_nolock(pfn_t pfnum)
{
	struct memseg *seg;
	page_t *pp;

	/* Try hash */
	if (((seg = memseg_hash[MEMSEG_PFN_HASH(pfnum)]) != NULL) &&
	    (pfnum >= seg->pages_base) && (pfnum < seg->pages_end)) {
		pp = seg->pages + (pfnum - seg->pages_base);
		if (pp->p_pagenum == pfnum)
			return (seg);
	}

	/* Else brute force */
	for (seg = memsegs; seg != NULL; seg = seg->next) {
		if (pfnum >= seg->pages_base && pfnum < seg->pages_end) {
			return (seg);
		}
	}
	return ((struct memseg *)NULL);
}

/*
 * Given a page and a count, return the page struct that is
 * n structs away from the current one in the global page
 * list.
 *
 * This function wraps to the first page upon
 * reaching the end of the memseg list.
 */
page_t *
page_nextn(page_t *pp, ulong_t n)
{
	static struct memseg *last_page_next_memseg = NULL;
	struct memseg *seg;
	page_t *ppn;

	if (((seg = last_page_next_memseg) == NULL) ||
	    (seg->pages_base == seg->pages_end) ||
	    !(pp >= seg->pages && pp < seg->epages)) {

		for (seg = memsegs; seg; seg = seg->next) {
			if (pp >= seg->pages && pp < seg->epages)
				break;
		}

		if (seg == NULL) {
			/* Memory delete got in, return something valid. */
			/* TODO: fix me. */
			seg = memsegs;
			pp = seg->pages;
		}
	}

	/* check for wraparound - possible if n is large */
	while ((ppn = (pp + n)) >= seg->epages || ppn < pp) {
		n -= seg->epages - pp;
		seg = seg->next;
		if (seg == NULL)
			seg = memsegs;
		pp = seg->pages;
	}
	last_page_next_memseg = seg;
	return (ppn);
}

/*
 * Initialize for a loop using page_next_scan_large().
 */
page_t *
page_next_scan_init(void **cookie)
{
	ASSERT(cookie != NULL);
	*cookie = (void *)memsegs;
	return ((page_t *)memsegs->pages);
}
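
/*
 * Example (editor's sketch, not part of the original file): the usual
 * pfn-to-page_t round trip.  page_numtopp_nolock() consults the last-hit
 * hint, then memseg_hash[], then the full memseg list; a non-NULL result can
 * be cross-checked with page_pptonum().  example_pfn_lookup() is a
 * hypothetical name.  Kept under #if 0 so it does not alter the build.
 */
#if 0
static page_t *
example_pfn_lookup(pfn_t pfnum)
{
	page_t *pp = page_numtopp_nolock(pfnum);

	/* The pfn may simply not be backed by managed memory. */
	if (pp == NULL)
		return (NULL);

	ASSERT(page_pptonum(pp) == pfnum);
	return (pp);
}
#endif
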
/*
 * Return the next page in a scan of page_t's, assuming we want
 * to skip over sub-pages within larger page sizes.
 *
 * The cookie is used to keep track of the current memseg.
 */
page_t *
page_next_scan_large(
	page_t		*pp,
	ulong_t		*n,
	void		**cookie)
{
	struct memseg *seg = (struct memseg *)*cookie;
	page_t *new_pp;
	ulong_t cnt;
	pfn_t pfn;

	/*
	 * get the count of page_t's to skip based on the page size
	 */
	ASSERT(pp != NULL);
	if (pp->p_szc == 0) {
		cnt = 1;
	} else {
		pfn = page_pptonum(pp);
		cnt = page_get_pagecnt(pp->p_szc);
		cnt -= pfn & (cnt - 1);
	}
	*n += cnt;
	new_pp = pp + cnt;

	/*
	 * Catch if we went past the end of the current memory segment. If so,
	 * just move to the next segment with pages.
	 */
	if (new_pp >= seg->epages) {
		do {
			seg = seg->next;
			if (seg == NULL)
				seg = memsegs;
		} while (seg->pages == seg->epages);
		new_pp = seg->pages;
		*cookie = (void *)seg;
	}

	return (new_pp);
}


/*
 * Returns the next page in the list. Note: this function wraps
 * to the first page in the list upon reaching the end
 * of the list. Callers should be aware of this fact.
 */

/* We should change this to be a #define */

page_t *
page_next(page_t *pp)
{
	return (page_nextn(pp, 1));
}

/*
 * Special for routines processing an array of page_t.
 */
page_t *
page_nextn_raw(page_t *pp, ulong_t n)
{
	return (pp + n);
}

page_t *
page_first()
{
	return ((page_t *)memsegs->pages);
}


/*
 * This routine is called at boot with the initial memory configuration
 * and when memory is added or removed.
 */
void
build_pfn_hash()
{
	pfn_t cur;
	pgcnt_t index;
	struct memseg *pseg;
	int i;

	/*
	 * Clear the memseg_hash array.
	 * Since memory add/delete is designed to operate concurrently
	 * with normal operation, the hash rebuild must be able to run
	 * concurrently with page_numtopp_nolock(). To support this
	 * functionality, assignments to memseg_hash array members must
	 * be done atomically.
	 *
	 * NOTE: bzero() does not currently guarantee this for kernel
	 * threads, and cannot be used here.
	 */
	for (i = 0; i < N_MEM_SLOTS; i++)
		memseg_hash[i] = NULL;

	hat_kpm_mseghash_clear(N_MEM_SLOTS);

	/*
	 * Physmax is the last valid pfn.
	 */
	mhash_per_slot = (physmax + 1) >> MEM_HASH_SHIFT;
	for (pseg = memsegs; pseg != NULL; pseg = pseg->next) {
		index = MEMSEG_PFN_HASH(pseg->pages_base);
		cur = pseg->pages_base;
		do {
			if (index >= N_MEM_SLOTS)
				index = MEMSEG_PFN_HASH(cur);

			if (memseg_hash[index] == NULL ||
			    memseg_hash[index]->pages_base > pseg->pages_base) {
				memseg_hash[index] = pseg;
				hat_kpm_mseghash_update(index, pseg);
			}
			cur += mhash_per_slot;
			index++;
		} while (cur < pseg->pages_end);
	}
}

/*
 * Return the pagenum for the pp.
 */
pfn_t
page_pptonum(page_t *pp)
{
	return (pp->p_pagenum);
}
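
/*
 * Example (editor's sketch, not part of the original file): one full pass
 * over every page_t that skips the constituent pages of large pages, the
 * pattern page_next_scan_init()/page_next_scan_large() exist to support.
 * The counter accumulates base pages covered, so the walk ends after roughly
 * total_pages worth of pages has been accounted for.  example_scan_pages()
 * is a hypothetical name.  Kept under #if 0 so it does not alter the build.
 */
#if 0
static ulong_t
example_scan_pages(void)
{
	void *cookie;
	ulong_t visited = 0;	/* base pages accounted for so far */
	ulong_t examined = 0;	/* page_t's actually looked at */
	page_t *pp = page_next_scan_init(&cookie);

	while (visited < total_pages) {
		examined++;
		/* ... inspect pp here ... */
		pp = page_next_scan_large(pp, &visited, &cookie);
	}
	return (examined);
}
#endif
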
/*
 * Interface to the referenced, modified, etc. bits
 * in the PSM part of the page struct,
 * for use when no locking is desired.
 */
void
page_set_props(page_t *pp, uint_t flags)
{
	ASSERT((flags & ~(P_MOD | P_REF | P_RO)) == 0);
	pp->p_nrm |= (uchar_t)flags;
}

void
page_clr_all_props(page_t *pp)
{
	pp->p_nrm = 0;
}

/*
 * The following function is called from free_vp_pages()
 * for an inexact estimate of a newly freed page...
 */
ulong_t
page_share_cnt(page_t *pp)
{
	return (hat_page_getshare(pp));
}

/*
 * The following functions are used in handling memory
 * errors.
 */

int
page_istoxic(page_t *pp)
{
	return ((pp->p_toxic & PAGE_IS_TOXIC) == PAGE_IS_TOXIC);
}

int
page_isfailing(page_t *pp)
{
	return ((pp->p_toxic & PAGE_IS_FAILING) == PAGE_IS_FAILING);
}

int
page_isretired(page_t *pp)
{
	return ((pp->p_toxic & PAGE_IS_RETIRED) == PAGE_IS_RETIRED);
}

int
page_deteriorating(page_t *pp)
{
	return ((pp->p_toxic & (PAGE_IS_TOXIC | PAGE_IS_FAILING)) != 0);
}

void
page_settoxic(page_t *pp, uchar_t flag)
{
	uchar_t new_flag = 0;
	while ((new_flag & flag) != flag) {
		uchar_t old_flag = pp->p_toxic;
		new_flag = old_flag | flag;
		(void) cas8(&pp->p_toxic, old_flag, new_flag);
		new_flag = ((volatile page_t *)pp)->p_toxic;
	}
}

void
page_clrtoxic(page_t *pp)
{
	/*
	 * We don't need to worry about atomicity on the
	 * p_toxic flag here, as this is only called from
	 * page_free() while holding an exclusive lock on
	 * the page.
	 */
	pp->p_toxic = PAGE_IS_OK;
}

void
page_clrtoxic_flag(page_t *pp, uchar_t flag)
{
	uchar_t new_flag = ((volatile page_t *)pp)->p_toxic;
	while ((new_flag & flag) == flag) {
		uchar_t old_flag = new_flag;
		new_flag = old_flag & ~flag;
		(void) cas8(&pp->p_toxic, old_flag, new_flag);
		new_flag = ((volatile page_t *)pp)->p_toxic;
	}
}

int
page_isfaulty(page_t *pp)
{
	return ((pp->p_toxic & PAGE_IS_FAULTY) == PAGE_IS_FAULTY);
}

/*
 * The following four functions are called from /proc code
 * for the /proc/<pid>/xmap interface.
 */
int
page_isshared(page_t *pp)
{
	return (hat_page_getshare(pp) > 1);
}

int
page_isfree(page_t *pp)
{
	return (PP_ISFREE(pp));
}

int
page_isref(page_t *pp)
{
	return (hat_page_getattr(pp, P_REF));
}

int
page_ismod(page_t *pp)
{
	return (hat_page_getattr(pp, P_MOD));
}
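
/*
 * Example (editor's sketch, not part of the original file): the intended use
 * of the cas8()-based helpers above.  page_settoxic() and
 * page_clrtoxic_flag() retry until the requested bits are observed set or
 * clear, so callers can update p_toxic without holding the page exclusively
 * locked.  example_note_error() is a hypothetical name.  Kept under #if 0 so
 * it does not alter the build.
 */
#if 0
static void
example_note_error(page_t *pp)
{
	/* Record that the frame has seen a memory error. */
	page_settoxic(pp, PAGE_IS_TOXIC);

	/* A page is "deteriorating" once it is toxic or failing. */
	ASSERT(page_deteriorating(pp));
}
#endif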