1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ 28 /* All Rights Reserved */ 29 30 /* 31 * University Copyright- Copyright (c) 1982, 1986, 1988 32 * The Regents of the University of California 33 * All Rights Reserved 34 * 35 * University Acknowledgment- Portions of this document are derived from 36 * software developed by the University of California, Berkeley, and its 37 * contributors. 38 */ 39 40 #pragma ident "%Z%%M% %I% %E% SMI" 41 42 /* 43 * VM - physical page management. 44 */ 45 46 #include <sys/types.h> 47 #include <sys/t_lock.h> 48 #include <sys/param.h> 49 #include <sys/systm.h> 50 #include <sys/errno.h> 51 #include <sys/time.h> 52 #include <sys/vnode.h> 53 #include <sys/vm.h> 54 #include <sys/vtrace.h> 55 #include <sys/swap.h> 56 #include <sys/cmn_err.h> 57 #include <sys/tuneable.h> 58 #include <sys/sysmacros.h> 59 #include <sys/cpuvar.h> 60 #include <sys/callb.h> 61 #include <sys/debug.h> 62 #include <sys/tnf_probe.h> 63 #include <sys/condvar_impl.h> 64 #include <sys/mem_config.h> 65 #include <sys/mem_cage.h> 66 #include <sys/kmem.h> 67 #include <sys/atomic.h> 68 #include <sys/strlog.h> 69 #include <sys/mman.h> 70 #include <sys/ontrap.h> 71 #include <sys/lgrp.h> 72 #include <sys/vfs.h> 73 74 #include <vm/hat.h> 75 #include <vm/anon.h> 76 #include <vm/page.h> 77 #include <vm/seg.h> 78 #include <vm/pvn.h> 79 #include <vm/seg_kmem.h> 80 #include <vm/vm_dep.h> 81 82 #include <fs/fs_subr.h> 83 84 static int nopageage = 0; 85 86 static pgcnt_t max_page_get; /* max page_get request size in pages */ 87 pgcnt_t total_pages = 0; /* total number of pages (used by /proc) */ 88 89 /* 90 * vnode for all pages which are retired from the VM system; 91 * such as pages with Uncorrectable Errors. 92 */ 93 struct vnode retired_ppages; 94 95 static void page_retired_init(void); 96 static void retired_dispose(vnode_t *vp, page_t *pp, int flag, 97 int dn, cred_t *cr); 98 static void retired_inactive(vnode_t *vp, cred_t *cr); 99 static void page_retired(page_t *pp); 100 static void retired_page_removed(page_t *pp); 101 void page_unretire_pages(void); 102 103 /* 104 * The maximum number of pages that will be unretired in one iteration. 105 * This number is totally arbitrary. 106 */ 107 #define UNRETIRE_PAGES 256 108 109 /* 110 * We limit the number of pages that may be retired to 111 * a percentage of the total physical memory. Note that 112 * the percentage values are stored as 'basis points', 113 * ie, 100 basis points is 1%. 
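 *
 * For example, with the default of 10 basis points (0.1%), a system
 * with 1,000,000 physical pages may retire at most
 *
 *	1,000,000 * 10 / 10000 = 1,000 pages.
 *
 * This is the kind of computation the pages_retired_limit_exceeded()
 * routine declared below is expected to perform; its definition, and the
 * retired-page counter it compares against, are outside this excerpt.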
 */
#define	MAX_PAGES_RETIRED_BPS_DEFAULT	10	/* .1% */

uint64_t max_pages_retired_bps = MAX_PAGES_RETIRED_BPS_DEFAULT;

static int pages_retired_limit_exceeded(void);

/*
 * operations vector for vnode with retired pages. Only VOP_DISPOSE
 * and VOP_INACTIVE are intercepted.
 */
struct vnodeops retired_vnodeops = {
	"retired_vnodeops",
	fs_nosys,	/* open */
	fs_nosys,	/* close */
	fs_nosys,	/* read */
	fs_nosys,	/* write */
	fs_nosys,	/* ioctl */
	fs_nosys,	/* setfl */
	fs_nosys,	/* getattr */
	fs_nosys,	/* setattr */
	fs_nosys,	/* access */
	fs_nosys,	/* lookup */
	fs_nosys,	/* create */
	fs_nosys,	/* remove */
	fs_nosys,	/* link */
	fs_nosys,	/* rename */
	fs_nosys,	/* mkdir */
	fs_nosys,	/* rmdir */
	fs_nosys,	/* readdir */
	fs_nosys,	/* symlink */
	fs_nosys,	/* readlink */
	fs_nosys,	/* fsync */
	retired_inactive,
	fs_nosys,	/* fid */
	fs_rwlock,	/* rwlock */
	fs_rwunlock,	/* rwunlock */
	fs_nosys,	/* seek */
	fs_nosys,	/* cmp */
	fs_nosys,	/* frlock */
	fs_nosys,	/* space */
	fs_nosys,	/* realvp */
	fs_nosys,	/* getpage */
	fs_nosys,	/* putpage */
	fs_nosys_map,
	fs_nosys_addmap,
	fs_nosys,	/* delmap */
	fs_nosys_poll,
	fs_nosys,	/* dump */
	fs_nosys,	/* l_pathconf */
	fs_nosys,	/* pageio */
	fs_nosys,	/* dumpctl */
	retired_dispose,
	fs_nosys,	/* setsecattr */
	fs_nosys,	/* getsecattr */
	fs_nosys,	/* shrlock */
	fs_vnevent_nosupport	/* vnevent */
};

/*
 * freemem_lock protects all freemem variables:
 * availrmem. This lock also protects the globals which track
 * availrmem changes for accurate kernel footprint calculation.
 * See below for an explanation of these globals.
 */
kmutex_t	freemem_lock;
pgcnt_t		availrmem;
pgcnt_t		availrmem_initial;

/*
 * These globals track availrmem changes to get a more accurate
 * estimate of the kernel size. Historically pp_kernel is used for
 * kernel size and is based on availrmem. But availrmem is adjusted for
 * locked pages in the system not just for kernel locked pages.
 * These new counters will track the pages locked through segvn and
 * by explicit user locking.
 *
 * segvn_pages_locked : This tracks, on a global basis, how many pages
 * are currently locked because of I/O.
 *
 * pages_locked : How many pages are locked because of user specified
 * locking through mlock or plock.
 *
 * pages_useclaim,pages_claimed : These two variables track the
 * claim adjustments because of the protection changes on a segvn segment.
 *
 * All these globals are protected by the same lock which protects availrmem.
 */
pgcnt_t		segvn_pages_locked;
pgcnt_t		pages_locked;
pgcnt_t		pages_useclaim;
pgcnt_t		pages_claimed;


/*
 * new_freemem_lock protects freemem, freemem_wait & freemem_cv.
 */
static kmutex_t	new_freemem_lock;
static uint_t	freemem_wait;	/* someone waiting for freemem */
static kcondvar_t freemem_cv;

/*
 * The logical page free list is maintained as two lists, the 'free'
 * and the 'cache' lists.
 * The free list contains those pages that should be reused first.
 *
 * The implementation of the lists is machine dependent.
 * page_get_freelist(), page_get_cachelist(),
 * page_list_sub(), and page_list_add()
 * form the interface to the machine dependent implementation.
 *
 * Pages with p_free set are on the cache list.
 * Pages with p_free and p_age set are on the free list.
 *
 * A page may be locked while on either list.
 */

/*
 * free list accounting stuff.
 *
 *
 * Spread out the value for the number of pages on the
 * page free and page cache lists. If there is just one
 * value, then it must be under just one lock.
 * The lock contention and cache traffic are a real bother.
 *
 * When we acquire and then drop a single pcf lock
 * we can start in the middle of the array of pcf structures.
 * If we acquire more than one pcf lock at a time, we need to
 * start at the front to avoid deadlocking.
 *
 * pcf_count holds the number of pages in each pool.
 *
 * pcf_block is set when page_create_get_something() has asked the
 * PSM page freelist and page cachelist routines without specifying
 * a color and nothing came back. This is used to block anything
 * else from moving pages from one list to the other while the
 * lists are searched again. If a page is freed while pcf_block is
 * set, then pcf_reserve is incremented. pcgs_unblock() takes care
 * of clearing pcf_block, doing the wakeups, etc.
 */

#if NCPU <= 4
#define	PAD	1
#define	PCF_FANOUT	4
static	uint_t	pcf_mask = PCF_FANOUT - 1;
#else
#define	PAD	9
#ifdef	sun4v
#define	PCF_FANOUT	32
#else
#define	PCF_FANOUT	128
#endif
static	uint_t	pcf_mask = PCF_FANOUT - 1;
#endif

struct pcf {
	uint_t		pcf_touch;	/* just to help the cache */
	uint_t		pcf_count;	/* page count */
	kmutex_t	pcf_lock;	/* protects the structure */
	uint_t		pcf_wait;	/* number of waiters */
	uint_t		pcf_block;	/* pcgs flag to page_free() */
	uint_t		pcf_reserve;	/* pages freed after pcf_block set */
	uint_t		pcf_fill[PAD];	/* to line up on the caches */
};

static struct	pcf	pcf[PCF_FANOUT];
#define	PCF_INDEX()	((CPU->cpu_id) & (pcf_mask))

kmutex_t	pcgs_lock;		/* serializes page_create_get_ */
kmutex_t	pcgs_cagelock;		/* serializes NOSLEEP cage allocs */
kmutex_t	pcgs_wait_lock;		/* used for delay in pcgs */
static kcondvar_t	pcgs_cv;	/* cv for delay in pcgs */

#define	PAGE_LOCK_MAXIMUM \
	((1 << (sizeof (((page_t *)0)->p_lckcnt) * NBBY)) - 1)

/*
 * Control over the verbosity of page retirement. When set to zero, no messages
 * will be printed. A value of one will trigger messages for retirement
 * operations, and is intended for processors which don't yet support FMA
 * (spitfire). Two will cause verbose messages to be printed when retirements
 * complete, and is intended only for debugging purposes.
 */
int page_retire_messages = 0;

#ifdef VM_STATS

/*
 * No locks, but so what, they are only statistics.
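 * (A VM_STAT_ADD()-style update here is just an ordinary increment,
 * e.g.
 *
 *	pagecnt.pc_find_hit++;
 *
 * so concurrent updates can occasionally be lost, which is acceptable
 * for counters that only feed statistics. Contrast MEMSEG_STAT_INCR()
 * further below, which does use atomic_add_32().)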
305 */ 306 307 static struct page_tcnt { 308 int pc_free_cache; /* free's into cache list */ 309 int pc_free_dontneed; /* free's with dontneed */ 310 int pc_free_pageout; /* free's from pageout */ 311 int pc_free_free; /* free's into free list */ 312 int pc_free_pages; /* free's into large page free list */ 313 int pc_destroy_pages; /* large page destroy's */ 314 int pc_get_cache; /* get's from cache list */ 315 int pc_get_free; /* get's from free list */ 316 int pc_reclaim; /* reclaim's */ 317 int pc_abortfree; /* abort's of free pages */ 318 int pc_find_hit; /* find's that find page */ 319 int pc_find_miss; /* find's that don't find page */ 320 int pc_destroy_free; /* # of free pages destroyed */ 321 #define PC_HASH_CNT (4*PAGE_HASHAVELEN) 322 int pc_find_hashlen[PC_HASH_CNT+1]; 323 int pc_addclaim_pages; 324 int pc_subclaim_pages; 325 int pc_free_replacement_page[2]; 326 int pc_try_demote_pages[6]; 327 int pc_demote_pages[2]; 328 } pagecnt; 329 330 uint_t hashin_count; 331 uint_t hashin_not_held; 332 uint_t hashin_already; 333 334 uint_t hashout_count; 335 uint_t hashout_not_held; 336 337 uint_t page_create_count; 338 uint_t page_create_not_enough; 339 uint_t page_create_not_enough_again; 340 uint_t page_create_zero; 341 uint_t page_create_hashout; 342 uint_t page_create_page_lock_failed; 343 uint_t page_create_trylock_failed; 344 uint_t page_create_found_one; 345 uint_t page_create_hashin_failed; 346 uint_t page_create_dropped_phm; 347 348 uint_t page_create_new; 349 uint_t page_create_exists; 350 uint_t page_create_putbacks; 351 uint_t page_create_overshoot; 352 353 uint_t page_reclaim_zero; 354 uint_t page_reclaim_zero_locked; 355 356 uint_t page_rename_exists; 357 uint_t page_rename_count; 358 359 uint_t page_lookup_cnt[20]; 360 uint_t page_lookup_nowait_cnt[10]; 361 uint_t page_find_cnt; 362 uint_t page_exists_cnt; 363 uint_t page_exists_forreal_cnt; 364 uint_t page_lookup_dev_cnt; 365 uint_t get_cachelist_cnt; 366 uint_t page_create_cnt[10]; 367 uint_t alloc_pages[8]; 368 uint_t page_exphcontg[19]; 369 uint_t page_create_large_cnt[10]; 370 371 /* 372 * Collects statistics. 
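 *
 * A sketch of how this macro is typically used elsewhere in this file
 * (see page_find() below): the caller holds the hash chain's mutex so
 * that page identities cannot change during the walk:
 *
 *	index = PAGE_HASH_FUNC(vp, off);
 *	phm = PAGE_HASH_MUTEX(index);
 *	mutex_enter(phm);
 *	PAGE_HASH_SEARCH(index, pp, vp, off);
 *	mutex_exit(phm);
 *
 * page_lookup_create() deliberately searches without the mutex first
 * and revalidates the identity after locking the page it found.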
373 */ 374 #define PAGE_HASH_SEARCH(index, pp, vp, off) { \ 375 uint_t mylen = 0; \ 376 \ 377 for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash, mylen++) { \ 378 if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \ 379 break; \ 380 } \ 381 if ((pp) != NULL) \ 382 pagecnt.pc_find_hit++; \ 383 else \ 384 pagecnt.pc_find_miss++; \ 385 if (mylen > PC_HASH_CNT) \ 386 mylen = PC_HASH_CNT; \ 387 pagecnt.pc_find_hashlen[mylen]++; \ 388 } 389 390 #else /* VM_STATS */ 391 392 /* 393 * Don't collect statistics 394 */ 395 #define PAGE_HASH_SEARCH(index, pp, vp, off) { \ 396 for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash) { \ 397 if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \ 398 break; \ 399 } \ 400 } 401 402 #endif /* VM_STATS */ 403 404 405 406 #ifdef DEBUG 407 #define MEMSEG_SEARCH_STATS 408 #endif 409 410 #ifdef MEMSEG_SEARCH_STATS 411 struct memseg_stats { 412 uint_t nsearch; 413 uint_t nlastwon; 414 uint_t nhashwon; 415 uint_t nnotfound; 416 } memseg_stats; 417 418 #define MEMSEG_STAT_INCR(v) \ 419 atomic_add_32(&memseg_stats.v, 1) 420 #else 421 #define MEMSEG_STAT_INCR(x) 422 #endif 423 424 struct memseg *memsegs; /* list of memory segments */ 425 426 427 static void page_init_mem_config(void); 428 static int page_do_hashin(page_t *, vnode_t *, u_offset_t); 429 static void page_do_hashout(page_t *); 430 431 static void page_demote_vp_pages(page_t *); 432 433 /* 434 * vm subsystem related initialization 435 */ 436 void 437 vm_init(void) 438 { 439 boolean_t callb_vm_cpr(void *, int); 440 441 (void) callb_add(callb_vm_cpr, 0, CB_CL_CPR_VM, "vm"); 442 page_init_mem_config(); 443 444 /* 445 * initialise the vnode for retired pages 446 */ 447 page_retired_init(); 448 } 449 450 /* 451 * This function is called at startup and when memory is added or deleted. 452 */ 453 void 454 init_pages_pp_maximum() 455 { 456 static pgcnt_t p_min; 457 static pgcnt_t pages_pp_maximum_startup; 458 static pgcnt_t avrmem_delta; 459 static int init_done; 460 static int user_set; /* true if set in /etc/system */ 461 462 if (init_done == 0) { 463 464 /* If the user specified a value, save it */ 465 if (pages_pp_maximum != 0) { 466 user_set = 1; 467 pages_pp_maximum_startup = pages_pp_maximum; 468 } 469 470 /* 471 * Setting of pages_pp_maximum is based first time 472 * on the value of availrmem just after the start-up 473 * allocations. To preserve this relationship at run 474 * time, use a delta from availrmem_initial. 475 */ 476 ASSERT(availrmem_initial >= availrmem); 477 avrmem_delta = availrmem_initial - availrmem; 478 479 /* The allowable floor of pages_pp_maximum */ 480 p_min = tune.t_minarmem + 100; 481 482 /* Make sure we don't come through here again. */ 483 init_done = 1; 484 } 485 /* 486 * Determine pages_pp_maximum, the number of currently available 487 * pages (availrmem) that can't be `locked'. If not set by 488 * the user, we set it to 4% of the currently available memory 489 * plus 4MB. 490 * But we also insist that it be greater than tune.t_minarmem; 491 * otherwise a process could lock down a lot of memory, get swapped 492 * out, and never have enough to get swapped back in. 
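	 *
	 * As a worked example (illustrative numbers only): if availrmem
	 * just after the start-up allocations comes to 500,000 pages, the
	 * default computed below is
	 *
	 *	500,000 / 25 + btop(4MB) = 20,000 + 1,024 = 21,024 pages
	 *
	 * on a 4K-page machine. The divide-by-25 is how "4%" is expressed
	 * in the code, and the result is then raised to at least
	 * p_min (tune.t_minarmem + 100) if it falls below that floor.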
493 */ 494 if (user_set) 495 pages_pp_maximum = pages_pp_maximum_startup; 496 else 497 pages_pp_maximum = ((availrmem_initial - avrmem_delta) / 25) 498 + btop(4 * 1024 * 1024); 499 500 if (pages_pp_maximum <= p_min) { 501 pages_pp_maximum = p_min; 502 } 503 } 504 505 void 506 set_max_page_get(pgcnt_t target_total_pages) 507 { 508 max_page_get = target_total_pages / 2; 509 } 510 511 static pgcnt_t pending_delete; 512 513 /*ARGSUSED*/ 514 static void 515 page_mem_config_post_add( 516 void *arg, 517 pgcnt_t delta_pages) 518 { 519 set_max_page_get(total_pages - pending_delete); 520 init_pages_pp_maximum(); 521 } 522 523 /*ARGSUSED*/ 524 static int 525 page_mem_config_pre_del( 526 void *arg, 527 pgcnt_t delta_pages) 528 { 529 pgcnt_t nv; 530 531 nv = atomic_add_long_nv(&pending_delete, (spgcnt_t)delta_pages); 532 set_max_page_get(total_pages - nv); 533 return (0); 534 } 535 536 /*ARGSUSED*/ 537 static void 538 page_mem_config_post_del( 539 void *arg, 540 pgcnt_t delta_pages, 541 int cancelled) 542 { 543 pgcnt_t nv; 544 545 nv = atomic_add_long_nv(&pending_delete, -(spgcnt_t)delta_pages); 546 set_max_page_get(total_pages - nv); 547 if (!cancelled) 548 init_pages_pp_maximum(); 549 } 550 551 static kphysm_setup_vector_t page_mem_config_vec = { 552 KPHYSM_SETUP_VECTOR_VERSION, 553 page_mem_config_post_add, 554 page_mem_config_pre_del, 555 page_mem_config_post_del, 556 }; 557 558 static void 559 page_init_mem_config(void) 560 { 561 int ret; 562 563 ret = kphysm_setup_func_register(&page_mem_config_vec, (void *)NULL); 564 ASSERT(ret == 0); 565 } 566 567 /* 568 * Evenly spread out the PCF counters for large free pages 569 */ 570 static void 571 page_free_large_ctr(pgcnt_t npages) 572 { 573 static struct pcf *p = pcf; 574 pgcnt_t lump; 575 576 freemem += npages; 577 578 lump = roundup(npages, PCF_FANOUT) / PCF_FANOUT; 579 580 while (npages > 0) { 581 582 ASSERT(!p->pcf_block); 583 584 if (lump < npages) { 585 p->pcf_count += (uint_t)lump; 586 npages -= lump; 587 } else { 588 p->pcf_count += (uint_t)npages; 589 npages = 0; 590 } 591 592 ASSERT(!p->pcf_wait); 593 594 if (++p > &pcf[PCF_FANOUT - 1]) 595 p = pcf; 596 } 597 598 ASSERT(npages == 0); 599 } 600 601 /* 602 * Add a physical chunk of memory to the system freee lists during startup. 603 * Platform specific startup() allocates the memory for the page structs. 604 * 605 * num - number of page structures 606 * base - page number (pfn) to be associated with the first page. 607 * 608 * Since we are doing this during startup (ie. single threaded), we will 609 * use shortcut routines to avoid any locking overhead while putting all 610 * these pages on the freelists. 611 * 612 * NOTE: Any changes performed to page_free(), must also be performed to 613 * add_physmem() since this is how we initialize all page_t's at 614 * boot time. 615 */ 616 void 617 add_physmem( 618 page_t *pp, 619 pgcnt_t num, 620 pfn_t pnum) 621 { 622 page_t *root = NULL; 623 uint_t szc = page_num_pagesizes() - 1; 624 pgcnt_t large = page_get_pagecnt(szc); 625 pgcnt_t cnt = 0; 626 627 TRACE_2(TR_FAC_VM, TR_PAGE_INIT, 628 "add_physmem:pp %p num %lu", pp, num); 629 630 /* 631 * Arbitrarily limit the max page_get request 632 * to 1/2 of the page structs we have. 633 */ 634 total_pages += num; 635 set_max_page_get(total_pages); 636 637 /* 638 * The physical space for the pages array 639 * representing ram pages has already been 640 * allocated. 
Here we initialize each lock 641 * in the page structure, and put each on 642 * the free list 643 */ 644 for (; num; pp++, pnum++, num--) { 645 646 /* 647 * this needs to fill in the page number 648 * and do any other arch specific initialization 649 */ 650 add_physmem_cb(pp, pnum); 651 652 /* 653 * Initialize the page lock as unlocked, since nobody 654 * can see or access this page yet. 655 */ 656 pp->p_selock = 0; 657 658 /* 659 * Initialize IO lock 660 */ 661 page_iolock_init(pp); 662 663 /* 664 * initialize other fields in the page_t 665 */ 666 PP_SETFREE(pp); 667 page_clr_all_props(pp); 668 PP_SETAGED(pp); 669 pp->p_offset = (u_offset_t)-1; 670 pp->p_next = pp; 671 pp->p_prev = pp; 672 673 /* 674 * Simple case: System doesn't support large pages. 675 */ 676 if (szc == 0) { 677 pp->p_szc = 0; 678 page_free_at_startup(pp); 679 continue; 680 } 681 682 /* 683 * Handle unaligned pages, we collect them up onto 684 * the root page until we have a full large page. 685 */ 686 if (!IS_P2ALIGNED(pnum, large)) { 687 688 /* 689 * If not in a large page, 690 * just free as small page. 691 */ 692 if (root == NULL) { 693 pp->p_szc = 0; 694 page_free_at_startup(pp); 695 continue; 696 } 697 698 /* 699 * Link a constituent page into the large page. 700 */ 701 pp->p_szc = szc; 702 page_list_concat(&root, &pp); 703 704 /* 705 * When large page is fully formed, free it. 706 */ 707 if (++cnt == large) { 708 page_free_large_ctr(cnt); 709 page_list_add_pages(root, PG_LIST_ISINIT); 710 root = NULL; 711 cnt = 0; 712 } 713 continue; 714 } 715 716 /* 717 * At this point we have a page number which 718 * is aligned. We assert that we aren't already 719 * in a different large page. 720 */ 721 ASSERT(IS_P2ALIGNED(pnum, large)); 722 ASSERT(root == NULL && cnt == 0); 723 724 /* 725 * If insufficient number of pages left to form 726 * a large page, just free the small page. 727 */ 728 if (num < large) { 729 pp->p_szc = 0; 730 page_free_at_startup(pp); 731 continue; 732 } 733 734 /* 735 * Otherwise start a new large page. 736 */ 737 pp->p_szc = szc; 738 cnt++; 739 root = pp; 740 } 741 ASSERT(root == NULL && cnt == 0); 742 } 743 744 /* 745 * Find a page representing the specified [vp, offset]. 746 * If we find the page but it is intransit coming in, 747 * it will have an "exclusive" lock and we wait for 748 * the i/o to complete. A page found on the free list 749 * is always reclaimed and then locked. On success, the page 750 * is locked, its data is valid and it isn't on the free 751 * list, while a NULL is returned if the page doesn't exist. 752 */ 753 page_t * 754 page_lookup(vnode_t *vp, u_offset_t off, se_t se) 755 { 756 return (page_lookup_create(vp, off, se, NULL, NULL, 0)); 757 } 758 759 /* 760 * Find a page representing the specified [vp, offset]. 761 * We either return the one we found or, if passed in, 762 * create one with identity of [vp, offset] of the 763 * pre-allocated page. If we find exsisting page but it is 764 * intransit coming in, it will have an "exclusive" lock 765 * and we wait for the i/o to complete. A page found on 766 * the free list is always reclaimed and then locked. 
767 * On success, the page is locked, its data is valid and 768 * it isn't on the free list, while a NULL is returned 769 * if the page doesn't exist and newpp is NULL; 770 */ 771 page_t * 772 page_lookup_create( 773 vnode_t *vp, 774 u_offset_t off, 775 se_t se, 776 page_t *newpp, 777 spgcnt_t *nrelocp, 778 int flags) 779 { 780 page_t *pp; 781 kmutex_t *phm; 782 ulong_t index; 783 uint_t hash_locked; 784 uint_t es; 785 786 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); 787 VM_STAT_ADD(page_lookup_cnt[0]); 788 ASSERT(newpp ? PAGE_EXCL(newpp) : 1); 789 790 /* 791 * Acquire the appropriate page hash lock since 792 * we have to search the hash list. Pages that 793 * hash to this list can't change identity while 794 * this lock is held. 795 */ 796 hash_locked = 0; 797 index = PAGE_HASH_FUNC(vp, off); 798 phm = NULL; 799 top: 800 PAGE_HASH_SEARCH(index, pp, vp, off); 801 if (pp != NULL) { 802 VM_STAT_ADD(page_lookup_cnt[1]); 803 es = (newpp != NULL) ? 1 : 0; 804 es |= flags; 805 if (!hash_locked) { 806 VM_STAT_ADD(page_lookup_cnt[2]); 807 if (!page_try_reclaim_lock(pp, se, es)) { 808 /* 809 * On a miss, acquire the phm. Then 810 * next time, page_lock() will be called, 811 * causing a wait if the page is busy. 812 * just looping with page_trylock() would 813 * get pretty boring. 814 */ 815 VM_STAT_ADD(page_lookup_cnt[3]); 816 phm = PAGE_HASH_MUTEX(index); 817 mutex_enter(phm); 818 hash_locked = 1; 819 goto top; 820 } 821 } else { 822 VM_STAT_ADD(page_lookup_cnt[4]); 823 if (!page_lock_es(pp, se, phm, P_RECLAIM, es)) { 824 VM_STAT_ADD(page_lookup_cnt[5]); 825 goto top; 826 } 827 } 828 829 /* 830 * Since `pp' is locked it can not change identity now. 831 * Reconfirm we locked the correct page. 832 * 833 * Both the p_vnode and p_offset *must* be cast volatile 834 * to force a reload of their values: The PAGE_HASH_SEARCH 835 * macro will have stuffed p_vnode and p_offset into 836 * registers before calling page_trylock(); another thread, 837 * actually holding the hash lock, could have changed the 838 * page's identity in memory, but our registers would not 839 * be changed, fooling the reconfirmation. If the hash 840 * lock was held during the search, the casting would 841 * not be needed. 842 */ 843 VM_STAT_ADD(page_lookup_cnt[6]); 844 if (((volatile struct vnode *)(pp->p_vnode) != vp) || 845 ((volatile u_offset_t)(pp->p_offset) != off)) { 846 VM_STAT_ADD(page_lookup_cnt[7]); 847 if (hash_locked) { 848 panic("page_lookup_create: lost page %p", 849 (void *)pp); 850 /*NOTREACHED*/ 851 } 852 page_unlock(pp); 853 phm = PAGE_HASH_MUTEX(index); 854 mutex_enter(phm); 855 hash_locked = 1; 856 goto top; 857 } 858 859 /* 860 * If page_trylock() was called, then pp may still be on 861 * the cachelist (can't be on the free list, it would not 862 * have been found in the search). If it is on the 863 * cachelist it must be pulled now. To pull the page from 864 * the cachelist, it must be exclusively locked. 865 * 866 * The other big difference between page_trylock() and 867 * page_lock(), is that page_lock() will pull the 868 * page from whatever free list (the cache list in this 869 * case) the page is on. If page_trylock() was used 870 * above, then we have to do the reclaim ourselves. 871 */ 872 if ((!hash_locked) && (PP_ISFREE(pp))) { 873 ASSERT(PP_ISAGED(pp) == 0); 874 VM_STAT_ADD(page_lookup_cnt[8]); 875 876 /* 877 * page_relcaim will insure that we 878 * have this page exclusively 879 */ 880 881 if (!page_reclaim(pp, NULL)) { 882 /* 883 * Page_reclaim dropped whatever lock 884 * we held. 
885 */ 886 VM_STAT_ADD(page_lookup_cnt[9]); 887 phm = PAGE_HASH_MUTEX(index); 888 mutex_enter(phm); 889 hash_locked = 1; 890 goto top; 891 } else if (se == SE_SHARED && newpp == NULL) { 892 VM_STAT_ADD(page_lookup_cnt[10]); 893 page_downgrade(pp); 894 } 895 } 896 897 if (hash_locked) { 898 mutex_exit(phm); 899 } 900 901 if (newpp != NULL && pp->p_szc < newpp->p_szc && 902 PAGE_EXCL(pp) && nrelocp != NULL) { 903 ASSERT(nrelocp != NULL); 904 (void) page_relocate(&pp, &newpp, 1, 1, nrelocp, 905 NULL); 906 if (*nrelocp > 0) { 907 VM_STAT_COND_ADD(*nrelocp == 1, 908 page_lookup_cnt[11]); 909 VM_STAT_COND_ADD(*nrelocp > 1, 910 page_lookup_cnt[12]); 911 pp = newpp; 912 se = SE_EXCL; 913 } else { 914 if (se == SE_SHARED) { 915 page_downgrade(pp); 916 } 917 VM_STAT_ADD(page_lookup_cnt[13]); 918 } 919 } else if (newpp != NULL && nrelocp != NULL) { 920 if (PAGE_EXCL(pp) && se == SE_SHARED) { 921 page_downgrade(pp); 922 } 923 VM_STAT_COND_ADD(pp->p_szc < newpp->p_szc, 924 page_lookup_cnt[14]); 925 VM_STAT_COND_ADD(pp->p_szc == newpp->p_szc, 926 page_lookup_cnt[15]); 927 VM_STAT_COND_ADD(pp->p_szc > newpp->p_szc, 928 page_lookup_cnt[16]); 929 } else if (newpp != NULL && PAGE_EXCL(pp)) { 930 se = SE_EXCL; 931 } 932 } else if (!hash_locked) { 933 VM_STAT_ADD(page_lookup_cnt[17]); 934 phm = PAGE_HASH_MUTEX(index); 935 mutex_enter(phm); 936 hash_locked = 1; 937 goto top; 938 } else if (newpp != NULL) { 939 /* 940 * If we have a preallocated page then 941 * insert it now and basically behave like 942 * page_create. 943 */ 944 VM_STAT_ADD(page_lookup_cnt[18]); 945 /* 946 * Since we hold the page hash mutex and 947 * just searched for this page, page_hashin 948 * had better not fail. If it does, that 949 * means some thread did not follow the 950 * page hash mutex rules. Panic now and 951 * get it over with. As usual, go down 952 * holding all the locks. 953 */ 954 ASSERT(MUTEX_HELD(phm)); 955 if (!page_hashin(newpp, vp, off, phm)) { 956 ASSERT(MUTEX_HELD(phm)); 957 panic("page_lookup_create: hashin failed %p %p %llx %p", 958 (void *)newpp, (void *)vp, off, (void *)phm); 959 /*NOTREACHED*/ 960 } 961 ASSERT(MUTEX_HELD(phm)); 962 mutex_exit(phm); 963 phm = NULL; 964 page_set_props(newpp, P_REF); 965 page_io_lock(newpp); 966 pp = newpp; 967 se = SE_EXCL; 968 } else { 969 VM_STAT_ADD(page_lookup_cnt[19]); 970 mutex_exit(phm); 971 } 972 973 ASSERT(pp ? PAGE_LOCKED_SE(pp, se) : 1); 974 975 ASSERT(pp ? ((PP_ISFREE(pp) == 0) && (PP_ISAGED(pp) == 0)) : 1); 976 977 return (pp); 978 } 979 980 /* 981 * Search the hash list for the page representing the 982 * specified [vp, offset] and return it locked. Skip 983 * free pages and pages that cannot be locked as requested. 984 * Used while attempting to kluster pages. 
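 *
 * A hypothetical caller sketch (klustsize and the loop are illustrative,
 * not taken from this file): probe neighbouring offsets and simply skip
 * any page that is free, busy, or absent:
 *
 *	for (o = off; o < off + klustsize; o += PAGESIZE) {
 *		if ((pp = page_lookup_nowait(vp, o, SE_SHARED)) == NULL)
 *			continue;
 *		... add pp to the kluster ...
 *	}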
985 */ 986 page_t * 987 page_lookup_nowait(vnode_t *vp, u_offset_t off, se_t se) 988 { 989 page_t *pp; 990 kmutex_t *phm; 991 ulong_t index; 992 uint_t locked; 993 994 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); 995 VM_STAT_ADD(page_lookup_nowait_cnt[0]); 996 997 index = PAGE_HASH_FUNC(vp, off); 998 PAGE_HASH_SEARCH(index, pp, vp, off); 999 locked = 0; 1000 if (pp == NULL) { 1001 top: 1002 VM_STAT_ADD(page_lookup_nowait_cnt[1]); 1003 locked = 1; 1004 phm = PAGE_HASH_MUTEX(index); 1005 mutex_enter(phm); 1006 PAGE_HASH_SEARCH(index, pp, vp, off); 1007 } 1008 1009 if (pp == NULL || PP_ISFREE(pp)) { 1010 VM_STAT_ADD(page_lookup_nowait_cnt[2]); 1011 pp = NULL; 1012 } else { 1013 if (!page_trylock(pp, se)) { 1014 VM_STAT_ADD(page_lookup_nowait_cnt[3]); 1015 pp = NULL; 1016 } else { 1017 VM_STAT_ADD(page_lookup_nowait_cnt[4]); 1018 /* 1019 * See the comment in page_lookup() 1020 */ 1021 if (((volatile struct vnode *)(pp->p_vnode) != vp) || 1022 ((u_offset_t)(pp->p_offset) != off)) { 1023 VM_STAT_ADD(page_lookup_nowait_cnt[5]); 1024 if (locked) { 1025 panic("page_lookup_nowait %p", 1026 (void *)pp); 1027 /*NOTREACHED*/ 1028 } 1029 page_unlock(pp); 1030 goto top; 1031 } 1032 if (PP_ISFREE(pp)) { 1033 VM_STAT_ADD(page_lookup_nowait_cnt[6]); 1034 page_unlock(pp); 1035 pp = NULL; 1036 } 1037 } 1038 } 1039 if (locked) { 1040 VM_STAT_ADD(page_lookup_nowait_cnt[7]); 1041 mutex_exit(phm); 1042 } 1043 1044 ASSERT(pp ? PAGE_LOCKED_SE(pp, se) : 1); 1045 1046 return (pp); 1047 } 1048 1049 /* 1050 * Search the hash list for a page with the specified [vp, off] 1051 * that is known to exist and is already locked. This routine 1052 * is typically used by segment SOFTUNLOCK routines. 1053 */ 1054 page_t * 1055 page_find(vnode_t *vp, u_offset_t off) 1056 { 1057 page_t *pp; 1058 kmutex_t *phm; 1059 ulong_t index; 1060 1061 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); 1062 VM_STAT_ADD(page_find_cnt); 1063 1064 index = PAGE_HASH_FUNC(vp, off); 1065 phm = PAGE_HASH_MUTEX(index); 1066 1067 mutex_enter(phm); 1068 PAGE_HASH_SEARCH(index, pp, vp, off); 1069 mutex_exit(phm); 1070 1071 ASSERT(pp != NULL); 1072 ASSERT(PAGE_LOCKED(pp) || panicstr); 1073 return (pp); 1074 } 1075 1076 /* 1077 * Determine whether a page with the specified [vp, off] 1078 * currently exists in the system. Obviously this should 1079 * only be considered as a hint since nothing prevents the 1080 * page from disappearing or appearing immediately after 1081 * the return from this routine. Subsequently, we don't 1082 * even bother to lock the list. 1083 */ 1084 page_t * 1085 page_exists(vnode_t *vp, u_offset_t off) 1086 { 1087 page_t *pp; 1088 ulong_t index; 1089 1090 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); 1091 VM_STAT_ADD(page_exists_cnt); 1092 1093 index = PAGE_HASH_FUNC(vp, off); 1094 PAGE_HASH_SEARCH(index, pp, vp, off); 1095 1096 return (pp); 1097 } 1098 1099 /* 1100 * Determine if physically contiguous pages exist for [vp, off] - [vp, off + 1101 * page_size(szc)) range. if they exist and ppa is not NULL fill ppa array 1102 * with these pages locked SHARED. If necessary reclaim pages from 1103 * freelist. Return 1 if contiguous pages exist and 0 otherwise. 1104 * 1105 * If we fail to lock pages still return 1 if pages exist and contiguous. 1106 * But in this case return value is just a hint. ppa array won't be filled. 1107 * Caller should initialize ppa[0] as NULL to distinguish return value. 1108 * 1109 * Returns 0 if pages don't exist or not physically contiguous. 1110 * 1111 * This routine doesn't work for anonymous(swapfs) pages. 
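 *
 * A hedged caller sketch (MAX_PGCNT is a caller-side bound, not a name
 * from this file): ppa must have room for page_get_pagecnt(szc) entries
 * plus a NULL terminator, and ppa[0] is pre-set to NULL so that the
 * "pages exist but could not be locked" hint can be told apart from a
 * filled array:
 *
 *	page_t *ppa[MAX_PGCNT + 1];
 *	ppa[0] = NULL;
 *	if (page_exists_physcontig(vp, off, szc, ppa) && ppa[0] != NULL) {
 *		... all constituent pages are now held SE_SHARED in ppa ...
 *	}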
1112 */ 1113 int 1114 page_exists_physcontig(vnode_t *vp, u_offset_t off, uint_t szc, page_t *ppa[]) 1115 { 1116 pgcnt_t pages; 1117 pfn_t pfn; 1118 page_t *rootpp; 1119 pgcnt_t i; 1120 pgcnt_t j; 1121 u_offset_t save_off = off; 1122 ulong_t index; 1123 kmutex_t *phm; 1124 page_t *pp; 1125 uint_t pszc; 1126 int loopcnt = 0; 1127 1128 ASSERT(szc != 0); 1129 ASSERT(vp != NULL); 1130 ASSERT(!IS_SWAPFSVP(vp)); 1131 ASSERT(vp != &kvp); 1132 1133 again: 1134 if (++loopcnt > 3) { 1135 VM_STAT_ADD(page_exphcontg[0]); 1136 return (0); 1137 } 1138 1139 index = PAGE_HASH_FUNC(vp, off); 1140 phm = PAGE_HASH_MUTEX(index); 1141 1142 mutex_enter(phm); 1143 PAGE_HASH_SEARCH(index, pp, vp, off); 1144 mutex_exit(phm); 1145 1146 VM_STAT_ADD(page_exphcontg[1]); 1147 1148 if (pp == NULL) { 1149 VM_STAT_ADD(page_exphcontg[2]); 1150 return (0); 1151 } 1152 1153 pages = page_get_pagecnt(szc); 1154 rootpp = pp; 1155 pfn = rootpp->p_pagenum; 1156 1157 if ((pszc = pp->p_szc) >= szc && ppa != NULL) { 1158 VM_STAT_ADD(page_exphcontg[3]); 1159 if (!page_trylock(pp, SE_SHARED)) { 1160 VM_STAT_ADD(page_exphcontg[4]); 1161 return (1); 1162 } 1163 if (pp->p_szc != pszc || pp->p_vnode != vp || 1164 pp->p_offset != off) { 1165 VM_STAT_ADD(page_exphcontg[5]); 1166 page_unlock(pp); 1167 off = save_off; 1168 goto again; 1169 } 1170 /* 1171 * szc was non zero and vnode and offset matched after we 1172 * locked the page it means it can't become free on us. 1173 */ 1174 ASSERT(!PP_ISFREE(pp)); 1175 if (!IS_P2ALIGNED(pfn, pages)) { 1176 page_unlock(pp); 1177 return (0); 1178 } 1179 ppa[0] = pp; 1180 pp++; 1181 off += PAGESIZE; 1182 pfn++; 1183 for (i = 1; i < pages; i++, pp++, off += PAGESIZE, pfn++) { 1184 if (!page_trylock(pp, SE_SHARED)) { 1185 VM_STAT_ADD(page_exphcontg[6]); 1186 pp--; 1187 while (i-- > 0) { 1188 page_unlock(pp); 1189 pp--; 1190 } 1191 ppa[0] = NULL; 1192 return (1); 1193 } 1194 if (pp->p_szc != pszc) { 1195 VM_STAT_ADD(page_exphcontg[7]); 1196 page_unlock(pp); 1197 pp--; 1198 while (i-- > 0) { 1199 page_unlock(pp); 1200 pp--; 1201 } 1202 ppa[0] = NULL; 1203 off = save_off; 1204 goto again; 1205 } 1206 /* 1207 * szc the same as for previous already locked pages 1208 * with right identity. Since this page had correct 1209 * szc after we locked it can't get freed or destroyed 1210 * and therefore must have the expected identity. 1211 */ 1212 ASSERT(!PP_ISFREE(pp)); 1213 if (pp->p_vnode != vp || 1214 pp->p_offset != off) { 1215 panic("page_exists_physcontig: " 1216 "large page identity doesn't match"); 1217 } 1218 ppa[i] = pp; 1219 ASSERT(pp->p_pagenum == pfn); 1220 } 1221 VM_STAT_ADD(page_exphcontg[8]); 1222 ppa[pages] = NULL; 1223 return (1); 1224 } else if (pszc >= szc) { 1225 VM_STAT_ADD(page_exphcontg[9]); 1226 if (!IS_P2ALIGNED(pfn, pages)) { 1227 return (0); 1228 } 1229 return (1); 1230 } 1231 1232 if (!IS_P2ALIGNED(pfn, pages)) { 1233 VM_STAT_ADD(page_exphcontg[10]); 1234 return (0); 1235 } 1236 1237 if (page_numtomemseg_nolock(pfn) != 1238 page_numtomemseg_nolock(pfn + pages - 1)) { 1239 VM_STAT_ADD(page_exphcontg[11]); 1240 return (0); 1241 } 1242 1243 /* 1244 * We loop up 4 times across pages to promote page size. 1245 * We're extra cautious to promote page size atomically with respect 1246 * to everybody else. But we can probably optimize into 1 loop if 1247 * this becomes an issue. 
1248 */ 1249 1250 for (i = 0; i < pages; i++, pp++, off += PAGESIZE, pfn++) { 1251 ASSERT(pp->p_pagenum == pfn); 1252 if (!page_trylock(pp, SE_EXCL)) { 1253 VM_STAT_ADD(page_exphcontg[12]); 1254 break; 1255 } 1256 if (pp->p_vnode != vp || 1257 pp->p_offset != off) { 1258 VM_STAT_ADD(page_exphcontg[13]); 1259 page_unlock(pp); 1260 break; 1261 } 1262 if (pp->p_szc >= szc) { 1263 ASSERT(i == 0); 1264 page_unlock(pp); 1265 off = save_off; 1266 goto again; 1267 } 1268 } 1269 1270 if (i != pages) { 1271 VM_STAT_ADD(page_exphcontg[14]); 1272 --pp; 1273 while (i-- > 0) { 1274 page_unlock(pp); 1275 --pp; 1276 } 1277 return (0); 1278 } 1279 1280 pp = rootpp; 1281 for (i = 0; i < pages; i++, pp++) { 1282 if (PP_ISFREE(pp)) { 1283 VM_STAT_ADD(page_exphcontg[15]); 1284 ASSERT(!PP_ISAGED(pp)); 1285 ASSERT(pp->p_szc == 0); 1286 if (!page_reclaim(pp, NULL)) { 1287 break; 1288 } 1289 } else { 1290 ASSERT(pp->p_szc < szc); 1291 VM_STAT_ADD(page_exphcontg[16]); 1292 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 1293 } 1294 } 1295 if (i < pages) { 1296 VM_STAT_ADD(page_exphcontg[17]); 1297 /* 1298 * page_reclaim failed because we were out of memory. 1299 * drop the rest of the locks and return because this page 1300 * must be already reallocated anyway. 1301 */ 1302 pp = rootpp; 1303 for (j = 0; j < pages; j++, pp++) { 1304 if (j != i) { 1305 page_unlock(pp); 1306 } 1307 } 1308 return (0); 1309 } 1310 1311 off = save_off; 1312 pp = rootpp; 1313 for (i = 0; i < pages; i++, pp++, off += PAGESIZE) { 1314 ASSERT(PAGE_EXCL(pp)); 1315 ASSERT(!PP_ISFREE(pp)); 1316 ASSERT(!hat_page_is_mapped(pp)); 1317 ASSERT(pp->p_vnode == vp); 1318 ASSERT(pp->p_offset == off); 1319 pp->p_szc = szc; 1320 } 1321 pp = rootpp; 1322 for (i = 0; i < pages; i++, pp++) { 1323 if (ppa == NULL) { 1324 page_unlock(pp); 1325 } else { 1326 ppa[i] = pp; 1327 page_downgrade(ppa[i]); 1328 } 1329 } 1330 if (ppa != NULL) { 1331 ppa[pages] = NULL; 1332 } 1333 VM_STAT_ADD(page_exphcontg[18]); 1334 ASSERT(vp->v_pages != NULL); 1335 return (1); 1336 } 1337 1338 /* 1339 * Determine whether a page with the specified [vp, off] 1340 * currently exists in the system and if so return its 1341 * size code. Obviously this should only be considered as 1342 * a hint since nothing prevents the page from disappearing 1343 * or appearing immediately after the return from this routine. 1344 */ 1345 int 1346 page_exists_forreal(vnode_t *vp, u_offset_t off, uint_t *szc) 1347 { 1348 page_t *pp; 1349 kmutex_t *phm; 1350 ulong_t index; 1351 int rc = 0; 1352 1353 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); 1354 ASSERT(szc != NULL); 1355 VM_STAT_ADD(page_exists_forreal_cnt); 1356 1357 index = PAGE_HASH_FUNC(vp, off); 1358 phm = PAGE_HASH_MUTEX(index); 1359 1360 mutex_enter(phm); 1361 PAGE_HASH_SEARCH(index, pp, vp, off); 1362 if (pp != NULL) { 1363 *szc = pp->p_szc; 1364 rc = 1; 1365 } 1366 mutex_exit(phm); 1367 return (rc); 1368 } 1369 1370 /* wakeup threads waiting for pages in page_create_get_something() */ 1371 void 1372 wakeup_pcgs(void) 1373 { 1374 if (!CV_HAS_WAITERS(&pcgs_cv)) 1375 return; 1376 cv_broadcast(&pcgs_cv); 1377 } 1378 1379 /* 1380 * 'freemem' is used all over the kernel as an indication of how many 1381 * pages are free (either on the cache list or on the free page list) 1382 * in the system. In very few places is a really accurate 'freemem' 1383 * needed. To avoid contention of the lock protecting a the 1384 * single freemem, it was spread out into NCPU buckets. Set_freemem 1385 * sets freemem to the total of all NCPU buckets. 
It is called from 1386 * clock() on each TICK. 1387 */ 1388 void 1389 set_freemem() 1390 { 1391 struct pcf *p; 1392 ulong_t t; 1393 uint_t i; 1394 1395 t = 0; 1396 p = pcf; 1397 for (i = 0; i < PCF_FANOUT; i++) { 1398 t += p->pcf_count; 1399 p++; 1400 } 1401 freemem = t; 1402 1403 /* 1404 * Don't worry about grabbing mutex. It's not that 1405 * critical if we miss a tick or two. This is 1406 * where we wakeup possible delayers in 1407 * page_create_get_something(). 1408 */ 1409 wakeup_pcgs(); 1410 } 1411 1412 ulong_t 1413 get_freemem() 1414 { 1415 struct pcf *p; 1416 ulong_t t; 1417 uint_t i; 1418 1419 t = 0; 1420 p = pcf; 1421 for (i = 0; i < PCF_FANOUT; i++) { 1422 t += p->pcf_count; 1423 p++; 1424 } 1425 /* 1426 * We just calculated it, might as well set it. 1427 */ 1428 freemem = t; 1429 return (t); 1430 } 1431 1432 /* 1433 * Acquire all of the page cache & free (pcf) locks. 1434 */ 1435 void 1436 pcf_acquire_all() 1437 { 1438 struct pcf *p; 1439 uint_t i; 1440 1441 p = pcf; 1442 for (i = 0; i < PCF_FANOUT; i++) { 1443 p->pcf_touch = 1; 1444 mutex_enter(&p->pcf_lock); 1445 p++; 1446 } 1447 } 1448 1449 /* 1450 * Release all the pcf_locks. 1451 */ 1452 void 1453 pcf_release_all() 1454 { 1455 struct pcf *p; 1456 uint_t i; 1457 1458 p = pcf; 1459 for (i = 0; i < PCF_FANOUT; i++) { 1460 mutex_exit(&p->pcf_lock); 1461 p++; 1462 } 1463 } 1464 1465 /* 1466 * Inform the VM system that we need some pages freed up. 1467 * Calls must be symmetric, e.g.: 1468 * 1469 * page_needfree(100); 1470 * wait a bit; 1471 * page_needfree(-100); 1472 */ 1473 void 1474 page_needfree(spgcnt_t npages) 1475 { 1476 mutex_enter(&new_freemem_lock); 1477 needfree += npages; 1478 mutex_exit(&new_freemem_lock); 1479 } 1480 1481 /* 1482 * Throttle for page_create(): try to prevent freemem from dropping 1483 * below throttlefree. We can't provide a 100% guarantee because 1484 * KM_NOSLEEP allocations, page_reclaim(), and various other things 1485 * nibble away at the freelist. However, we can block all PG_WAIT 1486 * allocations until memory becomes available. The motivation is 1487 * that several things can fall apart when there's no free memory: 1488 * 1489 * (1) If pageout() needs memory to push a page, the system deadlocks. 1490 * 1491 * (2) By (broken) specification, timeout(9F) can neither fail nor 1492 * block, so it has no choice but to panic the system if it 1493 * cannot allocate a callout structure. 1494 * 1495 * (3) Like timeout(), ddi_set_callback() cannot fail and cannot block; 1496 * it panics if it cannot allocate a callback structure. 1497 * 1498 * (4) Untold numbers of third-party drivers have not yet been hardened 1499 * against KM_NOSLEEP and/or allocb() failures; they simply assume 1500 * success and panic the system with a data fault on failure. 1501 * (The long-term solution to this particular problem is to ship 1502 * hostile fault-injecting DEBUG kernels with the DDK.) 1503 * 1504 * It is theoretically impossible to guarantee success of non-blocking 1505 * allocations, but in practice, this throttle is very hard to break. 
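 *
 * To make the thresholds concrete (illustrative numbers): with
 * throttlefree = 256 and pageout_reserve = 64, an ordinary non-blocking
 * request succeeds only while freemem >= npages + 64, a PG_WAIT request
 * sleeps until freemem >= npages + 256, and a PG_WAIT | PG_PUSHPAGE
 * request only needs freemem >= npages + 192, since pageout's own
 * pushes are allowed to dip into the reserve.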
1506 */ 1507 static int 1508 page_create_throttle(pgcnt_t npages, int flags) 1509 { 1510 ulong_t fm; 1511 uint_t i; 1512 pgcnt_t tf; /* effective value of throttlefree */ 1513 1514 /* 1515 * Never deny pages when: 1516 * - it's a thread that cannot block [NOMEMWAIT()] 1517 * - the allocation cannot block and must not fail 1518 * - the allocation cannot block and is pageout dispensated 1519 */ 1520 if (NOMEMWAIT() || 1521 ((flags & (PG_WAIT | PG_PANIC)) == PG_PANIC) || 1522 ((flags & (PG_WAIT | PG_PUSHPAGE)) == PG_PUSHPAGE)) 1523 return (1); 1524 1525 /* 1526 * If the allocation can't block, we look favorably upon it 1527 * unless we're below pageout_reserve. In that case we fail 1528 * the allocation because we want to make sure there are a few 1529 * pages available for pageout. 1530 */ 1531 if ((flags & PG_WAIT) == 0) 1532 return (freemem >= npages + pageout_reserve); 1533 1534 /* Calculate the effective throttlefree value */ 1535 tf = throttlefree - 1536 ((flags & PG_PUSHPAGE) ? pageout_reserve : 0); 1537 1538 cv_signal(&proc_pageout->p_cv); 1539 1540 while (freemem < npages + tf) { 1541 pcf_acquire_all(); 1542 mutex_enter(&new_freemem_lock); 1543 fm = 0; 1544 for (i = 0; i < PCF_FANOUT; i++) { 1545 fm += pcf[i].pcf_count; 1546 pcf[i].pcf_wait++; 1547 mutex_exit(&pcf[i].pcf_lock); 1548 } 1549 freemem = fm; 1550 needfree += npages; 1551 freemem_wait++; 1552 cv_wait(&freemem_cv, &new_freemem_lock); 1553 freemem_wait--; 1554 needfree -= npages; 1555 mutex_exit(&new_freemem_lock); 1556 } 1557 return (1); 1558 } 1559 1560 /* 1561 * page_create_wait() is called to either coalecse pages from the 1562 * different pcf buckets or to wait because there simply are not 1563 * enough pages to satisfy the caller's request. 1564 * 1565 * Sadly, this is called from platform/vm/vm_machdep.c 1566 */ 1567 int 1568 page_create_wait(size_t npages, uint_t flags) 1569 { 1570 pgcnt_t total; 1571 uint_t i; 1572 struct pcf *p; 1573 1574 /* 1575 * Wait until there are enough free pages to satisfy our 1576 * entire request. 1577 * We set needfree += npages before prodding pageout, to make sure 1578 * it does real work when npages > lotsfree > freemem. 1579 */ 1580 VM_STAT_ADD(page_create_not_enough); 1581 1582 ASSERT(!kcage_on ? !(flags & PG_NORELOC) : 1); 1583 checkagain: 1584 if ((flags & PG_NORELOC) && 1585 kcage_freemem < kcage_throttlefree + npages) 1586 (void) kcage_create_throttle(npages, flags); 1587 1588 if (freemem < npages + throttlefree) 1589 if (!page_create_throttle(npages, flags)) 1590 return (0); 1591 1592 /* 1593 * Since page_create_va() looked at every 1594 * bucket, assume we are going to have to wait. 1595 * Get all of the pcf locks. 1596 */ 1597 total = 0; 1598 p = pcf; 1599 for (i = 0; i < PCF_FANOUT; i++) { 1600 p->pcf_touch = 1; 1601 mutex_enter(&p->pcf_lock); 1602 total += p->pcf_count; 1603 if (total >= npages) { 1604 /* 1605 * Wow! There are enough pages laying around 1606 * to satisfy the request. Do the accounting, 1607 * drop the locks we acquired, and go back. 1608 * 1609 * freemem is not protected by any lock. So, 1610 * we cannot have any assertion containing 1611 * freemem. 
1612 */ 1613 freemem -= npages; 1614 1615 while (p >= pcf) { 1616 if (p->pcf_count <= npages) { 1617 npages -= p->pcf_count; 1618 p->pcf_count = 0; 1619 } else { 1620 p->pcf_count -= (uint_t)npages; 1621 npages = 0; 1622 } 1623 mutex_exit(&p->pcf_lock); 1624 p--; 1625 } 1626 ASSERT(npages == 0); 1627 return (1); 1628 } 1629 p++; 1630 } 1631 1632 /* 1633 * All of the pcf locks are held, there are not enough pages 1634 * to satisfy the request (npages < total). 1635 * Be sure to acquire the new_freemem_lock before dropping 1636 * the pcf locks. This prevents dropping wakeups in page_free(). 1637 * The order is always pcf_lock then new_freemem_lock. 1638 * 1639 * Since we hold all the pcf locks, it is a good time to set freemem. 1640 * 1641 * If the caller does not want to wait, return now. 1642 * Else turn the pageout daemon loose to find something 1643 * and wait till it does. 1644 * 1645 */ 1646 freemem = total; 1647 1648 if ((flags & PG_WAIT) == 0) { 1649 pcf_release_all(); 1650 1651 TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_NOMEM, 1652 "page_create_nomem:npages %ld freemem %ld", npages, freemem); 1653 return (0); 1654 } 1655 1656 ASSERT(proc_pageout != NULL); 1657 cv_signal(&proc_pageout->p_cv); 1658 1659 TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SLEEP_START, 1660 "page_create_sleep_start: freemem %ld needfree %ld", 1661 freemem, needfree); 1662 1663 /* 1664 * We are going to wait. 1665 * We currently hold all of the pcf_locks, 1666 * get the new_freemem_lock (it protects freemem_wait), 1667 * before dropping the pcf_locks. 1668 */ 1669 mutex_enter(&new_freemem_lock); 1670 1671 p = pcf; 1672 for (i = 0; i < PCF_FANOUT; i++) { 1673 p->pcf_wait++; 1674 mutex_exit(&p->pcf_lock); 1675 p++; 1676 } 1677 1678 needfree += npages; 1679 freemem_wait++; 1680 1681 cv_wait(&freemem_cv, &new_freemem_lock); 1682 1683 freemem_wait--; 1684 needfree -= npages; 1685 1686 mutex_exit(&new_freemem_lock); 1687 1688 TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SLEEP_END, 1689 "page_create_sleep_end: freemem %ld needfree %ld", 1690 freemem, needfree); 1691 1692 VM_STAT_ADD(page_create_not_enough_again); 1693 goto checkagain; 1694 } 1695 1696 /* 1697 * A routine to do the opposite of page_create_wait(). 1698 */ 1699 void 1700 page_create_putback(spgcnt_t npages) 1701 { 1702 struct pcf *p; 1703 pgcnt_t lump; 1704 uint_t *which; 1705 1706 /* 1707 * When a contiguous lump is broken up, we have to 1708 * deal with lots of pages (min 64) so lets spread 1709 * the wealth around. 1710 */ 1711 lump = roundup(npages, PCF_FANOUT) / PCF_FANOUT; 1712 freemem += npages; 1713 1714 for (p = pcf; (npages > 0) && (p < &pcf[PCF_FANOUT]); p++) { 1715 which = &p->pcf_count; 1716 1717 mutex_enter(&p->pcf_lock); 1718 1719 if (p->pcf_block) { 1720 which = &p->pcf_reserve; 1721 } 1722 1723 if (lump < npages) { 1724 *which += (uint_t)lump; 1725 npages -= lump; 1726 } else { 1727 *which += (uint_t)npages; 1728 npages = 0; 1729 } 1730 1731 if (p->pcf_wait) { 1732 mutex_enter(&new_freemem_lock); 1733 /* 1734 * Check to see if some other thread 1735 * is actually waiting. Another bucket 1736 * may have woken it up by now. If there 1737 * are no waiters, then set our pcf_wait 1738 * count to zero to avoid coming in here 1739 * next time. 
1740 */ 1741 if (freemem_wait) { 1742 if (npages > 1) { 1743 cv_broadcast(&freemem_cv); 1744 } else { 1745 cv_signal(&freemem_cv); 1746 } 1747 p->pcf_wait--; 1748 } else { 1749 p->pcf_wait = 0; 1750 } 1751 mutex_exit(&new_freemem_lock); 1752 } 1753 mutex_exit(&p->pcf_lock); 1754 } 1755 ASSERT(npages == 0); 1756 } 1757 1758 /* 1759 * A helper routine for page_create_get_something. 1760 * The indenting got to deep down there. 1761 * Unblock the pcf counters. Any pages freed after 1762 * pcf_block got set are moved to pcf_count and 1763 * wakeups (cv_broadcast() or cv_signal()) are done as needed. 1764 */ 1765 static void 1766 pcgs_unblock(void) 1767 { 1768 int i; 1769 struct pcf *p; 1770 1771 /* Update freemem while we're here. */ 1772 freemem = 0; 1773 p = pcf; 1774 for (i = 0; i < PCF_FANOUT; i++) { 1775 mutex_enter(&p->pcf_lock); 1776 ASSERT(p->pcf_count == 0); 1777 p->pcf_count = p->pcf_reserve; 1778 p->pcf_block = 0; 1779 freemem += p->pcf_count; 1780 if (p->pcf_wait) { 1781 mutex_enter(&new_freemem_lock); 1782 if (freemem_wait) { 1783 if (p->pcf_reserve > 1) { 1784 cv_broadcast(&freemem_cv); 1785 p->pcf_wait = 0; 1786 } else { 1787 cv_signal(&freemem_cv); 1788 p->pcf_wait--; 1789 } 1790 } else { 1791 p->pcf_wait = 0; 1792 } 1793 mutex_exit(&new_freemem_lock); 1794 } 1795 p->pcf_reserve = 0; 1796 mutex_exit(&p->pcf_lock); 1797 p++; 1798 } 1799 } 1800 1801 /* 1802 * Called from page_create_va() when both the cache and free lists 1803 * have been checked once. 1804 * 1805 * Either returns a page or panics since the accounting was done 1806 * way before we got here. 1807 * 1808 * We don't come here often, so leave the accounting on permanently. 1809 */ 1810 1811 #define MAX_PCGS 100 1812 1813 #ifdef DEBUG 1814 #define PCGS_TRIES 100 1815 #else /* DEBUG */ 1816 #define PCGS_TRIES 10 1817 #endif /* DEBUG */ 1818 1819 #ifdef VM_STATS 1820 uint_t pcgs_counts[PCGS_TRIES]; 1821 uint_t pcgs_too_many; 1822 uint_t pcgs_entered; 1823 uint_t pcgs_entered_noreloc; 1824 uint_t pcgs_locked; 1825 uint_t pcgs_cagelocked; 1826 #endif /* VM_STATS */ 1827 1828 static page_t * 1829 page_create_get_something(vnode_t *vp, u_offset_t off, struct seg *seg, 1830 caddr_t vaddr, uint_t flags) 1831 { 1832 uint_t count; 1833 page_t *pp; 1834 uint_t locked, i; 1835 struct pcf *p; 1836 lgrp_t *lgrp; 1837 int cagelocked = 0; 1838 1839 VM_STAT_ADD(pcgs_entered); 1840 1841 /* 1842 * Tap any reserve freelists: if we fail now, we'll die 1843 * since the page(s) we're looking for have already been 1844 * accounted for. 1845 */ 1846 flags |= PG_PANIC; 1847 1848 if ((flags & PG_NORELOC) != 0) { 1849 VM_STAT_ADD(pcgs_entered_noreloc); 1850 /* 1851 * Requests for free pages from critical threads 1852 * such as pageout still won't throttle here, but 1853 * we must try again, to give the cageout thread 1854 * another chance to catch up. Since we already 1855 * accounted for the pages, we had better get them 1856 * this time. 1857 * 1858 * N.B. All non-critical threads acquire the pcgs_cagelock 1859 * to serialize access to the freelists. This implements a 1860 * turnstile-type synchornization to avoid starvation of 1861 * critical requests for PG_NORELOC memory by non-critical 1862 * threads: all non-critical threads must acquire a 'ticket' 1863 * before passing through, which entails making sure 1864 * kcage_freemem won't fall below minfree prior to grabbing 1865 * pages from the freelists. 
		 */
		if (kcage_create_throttle(1, flags) == KCT_NONCRIT) {
			mutex_enter(&pcgs_cagelock);
			cagelocked = 1;
			VM_STAT_ADD(pcgs_cagelocked);
		}
	}

	/*
	 * Time to get serious.
	 * We failed to get a `correctly colored' page from both the
	 * free and cache lists.
	 * We escalate in stages.
	 *
	 * First try both lists without worrying about color.
	 *
	 * Then, grab all page accounting locks (ie. pcf[]) and
	 * steal any pages that they have and set the pcf_block flag to
	 * stop deletions from the lists. This will help because
	 * a page can get added to the free list while we are looking
	 * at the cache list, then another page could be added to the cache
	 * list allowing the page on the free list to be removed as we
	 * move from looking at the cache list to the free list. This
	 * could happen over and over. We would never find the page
	 * we have accounted for.
	 *
	 * Noreloc pages are a subset of the global (relocatable) page pool.
	 * They are not tracked separately in the pcf bins, so it is
	 * impossible to know when doing pcf accounting if the available
	 * page(s) are noreloc pages or not. When looking for a noreloc page
	 * it is quite easy to end up here even if the global (relocatable)
	 * page pool has plenty of free pages but the noreloc pool is empty.
	 *
	 * When the noreloc pool is empty (or low), additional noreloc pages
	 * are created by converting pages from the global page pool. This
	 * process will stall during pcf accounting if the pcf bins are
	 * already locked. Such is the case when a noreloc allocation is
	 * looping here in page_create_get_something waiting for more noreloc
	 * pages to appear.
	 *
	 * Short of adding a new field to the pcf bins to accurately track
	 * the number of free noreloc pages, we instead do not grab the
	 * pcgs_lock, do not set the pcf blocks and do not timeout when
	 * allocating a noreloc page. This allows noreloc allocations to
	 * loop without blocking global page pool allocations.
	 *
	 * NOTE: the behaviour of page_create_get_something has not changed
	 * for the case of global page pool allocations.
	 */

	flags &= ~PG_MATCH_COLOR;
	locked = 0;
#ifndef __sparc
	/*
	 * page_create_get_something may be called because 4g memory may be
	 * depleted. Set flags to allow for relocation of base page below
	 * 4g if necessary.
	 */
	if (physmax4g)
		flags |= (PGI_PGCPSZC0 | PGI_PGCPHIPRI);
#endif

	lgrp = lgrp_mem_choose(seg, vaddr, PAGESIZE);

	for (count = 0; kcage_on || count < MAX_PCGS; count++) {
		pp = page_get_freelist(vp, off, seg, vaddr, PAGESIZE,
		    flags, lgrp);
		if (pp == NULL) {
			pp = page_get_cachelist(vp, off, seg, vaddr,
			    flags, lgrp);
		}
		if (pp == NULL) {
			/*
			 * Serialize. Don't fight with other pcgs().
1940 */ 1941 if (!locked && (!kcage_on || !(flags & PG_NORELOC))) { 1942 mutex_enter(&pcgs_lock); 1943 VM_STAT_ADD(pcgs_locked); 1944 locked = 1; 1945 p = pcf; 1946 for (i = 0; i < PCF_FANOUT; i++) { 1947 mutex_enter(&p->pcf_lock); 1948 ASSERT(p->pcf_block == 0); 1949 p->pcf_block = 1; 1950 p->pcf_reserve = p->pcf_count; 1951 p->pcf_count = 0; 1952 mutex_exit(&p->pcf_lock); 1953 p++; 1954 } 1955 freemem = 0; 1956 } 1957 1958 if (count) { 1959 /* 1960 * Since page_free() puts pages on 1961 * a list then accounts for it, we 1962 * just have to wait for page_free() 1963 * to unlock any page it was working 1964 * with. The page_lock()-page_reclaim() 1965 * path falls in the same boat. 1966 * 1967 * We don't need to check on the 1968 * PG_WAIT flag, we have already 1969 * accounted for the page we are 1970 * looking for in page_create_va(). 1971 * 1972 * We just wait a moment to let any 1973 * locked pages on the lists free up, 1974 * then continue around and try again. 1975 * 1976 * Will be awakened by set_freemem(). 1977 */ 1978 mutex_enter(&pcgs_wait_lock); 1979 cv_wait(&pcgs_cv, &pcgs_wait_lock); 1980 mutex_exit(&pcgs_wait_lock); 1981 } 1982 } else { 1983 #ifdef VM_STATS 1984 if (count >= PCGS_TRIES) { 1985 VM_STAT_ADD(pcgs_too_many); 1986 } else { 1987 VM_STAT_ADD(pcgs_counts[count]); 1988 } 1989 #endif 1990 if (locked) { 1991 pcgs_unblock(); 1992 mutex_exit(&pcgs_lock); 1993 } 1994 if (cagelocked) 1995 mutex_exit(&pcgs_cagelock); 1996 return (pp); 1997 } 1998 } 1999 /* 2000 * we go down holding the pcf locks. 2001 */ 2002 panic("no %spage found %d", 2003 ((flags & PG_NORELOC) ? "non-reloc " : ""), count); 2004 /*NOTREACHED*/ 2005 } 2006 2007 /* 2008 * Create enough pages for "bytes" worth of data starting at 2009 * "off" in "vp". 2010 * 2011 * Where flag must be one of: 2012 * 2013 * PG_EXCL: Exclusive create (fail if any page already 2014 * exists in the page cache) which does not 2015 * wait for memory to become available. 2016 * 2017 * PG_WAIT: Non-exclusive create which can wait for 2018 * memory to become available. 2019 * 2020 * PG_PHYSCONTIG: Allocate physically contiguous pages. 2021 * (Not Supported) 2022 * 2023 * A doubly linked list of pages is returned to the caller. Each page 2024 * on the list has the "exclusive" (p_selock) lock and "iolock" (p_iolock) 2025 * lock. 2026 * 2027 * Unable to change the parameters to page_create() in a minor release, 2028 * we renamed page_create() to page_create_va(), changed all known calls 2029 * from page_create() to page_create_va(), and created this wrapper. 2030 * 2031 * Upon a major release, we should break compatibility by deleting this 2032 * wrapper, and replacing all the strings "page_create_va", with "page_create". 2033 * 2034 * NOTE: There is a copy of this interface as page_create_io() in 2035 * i86/vm/vm_machdep.c. Any bugs fixed here should be applied 2036 * there. 2037 */ 2038 page_t * 2039 page_create(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags) 2040 { 2041 caddr_t random_vaddr; 2042 struct seg kseg; 2043 2044 #ifdef DEBUG 2045 cmn_err(CE_WARN, "Using deprecated interface page_create: caller %p", 2046 (void *)caller()); 2047 #endif 2048 2049 random_vaddr = (caddr_t)(((uintptr_t)vp >> 7) ^ 2050 (uintptr_t)(off >> PAGESHIFT)); 2051 kseg.s_as = &kas; 2052 2053 return (page_create_va(vp, off, bytes, flags, &kseg, random_vaddr)); 2054 } 2055 2056 #ifdef DEBUG 2057 uint32_t pg_alloc_pgs_mtbf = 0; 2058 #endif 2059 2060 /* 2061 * Used for large page support. 
It will attempt to allocate 2062 * a large page(s) off the freelist. 2063 * 2064 * Returns non zero on failure. 2065 */ 2066 int 2067 page_alloc_pages(struct seg *seg, caddr_t addr, page_t **basepp, 2068 page_t *ppa[], uint_t szc, int anypgsz) 2069 { 2070 pgcnt_t npgs, curnpgs, totpgs; 2071 size_t pgsz; 2072 page_t *pplist = NULL, *pp; 2073 int err = 0; 2074 lgrp_t *lgrp; 2075 2076 ASSERT(szc != 0 && szc <= (page_num_pagesizes() - 1)); 2077 2078 VM_STAT_ADD(alloc_pages[0]); 2079 2080 #ifdef DEBUG 2081 if (pg_alloc_pgs_mtbf && !(gethrtime() % pg_alloc_pgs_mtbf)) { 2082 return (ENOMEM); 2083 } 2084 #endif 2085 2086 pgsz = page_get_pagesize(szc); 2087 totpgs = curnpgs = npgs = pgsz >> PAGESHIFT; 2088 2089 ASSERT(((uintptr_t)addr & (pgsz - 1)) == 0); 2090 /* 2091 * One must be NULL but not both. 2092 * And one must be non NULL but not both. 2093 */ 2094 ASSERT(basepp != NULL || ppa != NULL); 2095 ASSERT(basepp == NULL || ppa == NULL); 2096 2097 (void) page_create_wait(npgs, PG_WAIT); 2098 2099 while (npgs && szc) { 2100 lgrp = lgrp_mem_choose(seg, addr, pgsz); 2101 pp = page_get_freelist(NULL, 0, seg, addr, pgsz, 0, lgrp); 2102 if (pp != NULL) { 2103 VM_STAT_ADD(alloc_pages[1]); 2104 page_list_concat(&pplist, &pp); 2105 ASSERT(npgs >= curnpgs); 2106 npgs -= curnpgs; 2107 } else if (anypgsz) { 2108 VM_STAT_ADD(alloc_pages[2]); 2109 szc--; 2110 pgsz = page_get_pagesize(szc); 2111 curnpgs = pgsz >> PAGESHIFT; 2112 } else { 2113 VM_STAT_ADD(alloc_pages[3]); 2114 ASSERT(npgs == totpgs); 2115 page_create_putback(npgs); 2116 return (ENOMEM); 2117 } 2118 } 2119 if (szc == 0) { 2120 VM_STAT_ADD(alloc_pages[4]); 2121 ASSERT(npgs != 0); 2122 page_create_putback(npgs); 2123 err = ENOMEM; 2124 } else if (basepp != NULL) { 2125 ASSERT(npgs == 0); 2126 ASSERT(ppa == NULL); 2127 *basepp = pplist; 2128 } 2129 2130 npgs = totpgs - npgs; 2131 pp = pplist; 2132 2133 /* 2134 * Clear the free and age bits. Also if we were passed in a ppa then 2135 * fill it in with all the constituent pages from the large page. But 2136 * if we failed to allocate all the pages just free what we got. 2137 */ 2138 while (npgs != 0) { 2139 ASSERT(PP_ISFREE(pp)); 2140 ASSERT(PP_ISAGED(pp)); 2141 if (ppa != NULL || err != 0) { 2142 if (err == 0) { 2143 VM_STAT_ADD(alloc_pages[5]); 2144 PP_CLRFREE(pp); 2145 PP_CLRAGED(pp); 2146 page_sub(&pplist, pp); 2147 *ppa++ = pp; 2148 npgs--; 2149 } else { 2150 VM_STAT_ADD(alloc_pages[6]); 2151 ASSERT(pp->p_szc != 0); 2152 curnpgs = page_get_pagecnt(pp->p_szc); 2153 page_list_break(&pp, &pplist, curnpgs); 2154 page_list_add_pages(pp, 0); 2155 page_create_putback(curnpgs); 2156 ASSERT(npgs >= curnpgs); 2157 npgs -= curnpgs; 2158 } 2159 pp = pplist; 2160 } else { 2161 VM_STAT_ADD(alloc_pages[7]); 2162 PP_CLRFREE(pp); 2163 PP_CLRAGED(pp); 2164 pp = pp->p_next; 2165 npgs--; 2166 } 2167 } 2168 return (err); 2169 } 2170 2171 /* 2172 * Get a single large page off of the freelists, and set it up for use. 2173 * Number of bytes requested must be a supported page size. 2174 * 2175 * Note that this call may fail even if there is sufficient 2176 * memory available or PG_WAIT is set, so the caller must 2177 * be willing to fallback on page_create_va(), block and retry, 2178 * or fail the requester. 
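 *
 * As a rough illustration of that fallback pattern (the caller and
 * its local variables here are hypothetical, not taken from this
 * file), a user of this routine might do:
 *
 *	pp = page_create_va_large(vp, off, pgsz, PG_EXCL, seg,
 *	    vaddr, NULL);
 *	if (pp == NULL) {
 *		pp = page_create_va(vp, off, pgsz, PG_EXCL | PG_WAIT,
 *		    seg, vaddr);
 *	}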
2179 */ 2180 page_t * 2181 page_create_va_large(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags, 2182 struct seg *seg, caddr_t vaddr, void *arg) 2183 { 2184 pgcnt_t npages, pcftotal; 2185 page_t *pp; 2186 page_t *rootpp; 2187 lgrp_t *lgrp; 2188 uint_t enough; 2189 uint_t pcf_index; 2190 uint_t i; 2191 struct pcf *p; 2192 struct pcf *q; 2193 lgrp_id_t *lgrpid = (lgrp_id_t *)arg; 2194 2195 ASSERT(vp != NULL); 2196 2197 ASSERT((flags & ~(PG_EXCL | PG_WAIT | 2198 PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == 0); 2199 /* but no others */ 2200 2201 ASSERT((flags & PG_EXCL) == PG_EXCL); 2202 2203 npages = btop(bytes); 2204 2205 if (!kcage_on || panicstr) { 2206 /* 2207 * Cage is OFF, or we are single threaded in 2208 * panic, so make everything a RELOC request. 2209 */ 2210 flags &= ~PG_NORELOC; 2211 } 2212 2213 /* 2214 * Make sure there's adequate physical memory available. 2215 * Note: PG_WAIT is ignored here. 2216 */ 2217 if (freemem <= throttlefree + npages) { 2218 VM_STAT_ADD(page_create_large_cnt[1]); 2219 return (NULL); 2220 } 2221 2222 /* 2223 * If cage is on, dampen draw from cage when available 2224 * cage space is low. 2225 */ 2226 if ((flags & (PG_NORELOC | PG_WAIT)) == (PG_NORELOC | PG_WAIT) && 2227 kcage_freemem < kcage_throttlefree + npages) { 2228 2229 /* 2230 * The cage is on, the caller wants PG_NORELOC 2231 * pages and available cage memory is very low. 2232 * Call kcage_create_throttle() to attempt to 2233 * control demand on the cage. 2234 */ 2235 if (kcage_create_throttle(npages, flags) == KCT_FAILURE) { 2236 VM_STAT_ADD(page_create_large_cnt[2]); 2237 return (NULL); 2238 } 2239 } 2240 2241 enough = 0; 2242 pcf_index = PCF_INDEX(); 2243 p = &pcf[pcf_index]; 2244 p->pcf_touch = 1; 2245 q = &pcf[PCF_FANOUT]; 2246 for (pcftotal = 0, i = 0; i < PCF_FANOUT; i++) { 2247 if (p->pcf_count > npages) { 2248 /* 2249 * a good one to try. 2250 */ 2251 mutex_enter(&p->pcf_lock); 2252 if (p->pcf_count > npages) { 2253 p->pcf_count -= (uint_t)npages; 2254 /* 2255 * freemem is not protected by any lock. 2256 * Thus, we cannot have any assertion 2257 * containing freemem here. 2258 */ 2259 freemem -= npages; 2260 enough = 1; 2261 mutex_exit(&p->pcf_lock); 2262 break; 2263 } 2264 mutex_exit(&p->pcf_lock); 2265 } 2266 pcftotal += p->pcf_count; 2267 p++; 2268 if (p >= q) { 2269 p = pcf; 2270 } 2271 p->pcf_touch = 1; 2272 } 2273 2274 if (!enough) { 2275 /* If there isn't enough memory available, give up. */ 2276 if (pcftotal < npages) { 2277 VM_STAT_ADD(page_create_large_cnt[3]); 2278 return (NULL); 2279 } 2280 2281 /* try to collect pages from several pcf bins */ 2282 for (p = pcf, pcftotal = 0, i = 0; i < PCF_FANOUT; i++) { 2283 p->pcf_touch = 1; 2284 mutex_enter(&p->pcf_lock); 2285 pcftotal += p->pcf_count; 2286 if (pcftotal >= npages) { 2287 /* 2288 * Wow! There are enough pages laying around 2289 * to satisfy the request. Do the accounting, 2290 * drop the locks we acquired, and go back. 2291 * 2292 * freemem is not protected by any lock. So, 2293 * we cannot have any assertion containing 2294 * freemem. 
2295 */ 2296 pgcnt_t tpages = npages; 2297 freemem -= npages; 2298 while (p >= pcf) { 2299 if (p->pcf_count <= tpages) { 2300 tpages -= p->pcf_count; 2301 p->pcf_count = 0; 2302 } else { 2303 p->pcf_count -= (uint_t)tpages; 2304 tpages = 0; 2305 } 2306 mutex_exit(&p->pcf_lock); 2307 p--; 2308 } 2309 ASSERT(tpages == 0); 2310 break; 2311 } 2312 p++; 2313 } 2314 if (i == PCF_FANOUT) { 2315 /* failed to collect pages - release the locks */ 2316 while (--p >= pcf) { 2317 mutex_exit(&p->pcf_lock); 2318 } 2319 VM_STAT_ADD(page_create_large_cnt[4]); 2320 return (NULL); 2321 } 2322 } 2323 2324 /* 2325 * This is where this function behaves fundamentally differently 2326 * from page_create_va(); since we're intending to map the page 2327 * with a single TTE, we have to get it as a physically contiguous 2328 * hardware pagesize chunk. If we can't, we fail. 2329 */ 2330 if (lgrpid != NULL && *lgrpid >= 0 && *lgrpid <= lgrp_alloc_max && 2331 LGRP_EXISTS(lgrp_table[*lgrpid])) 2332 lgrp = lgrp_table[*lgrpid]; 2333 else 2334 lgrp = lgrp_mem_choose(seg, vaddr, bytes); 2335 2336 if ((rootpp = page_get_freelist(&kvp, off, seg, vaddr, 2337 bytes, flags & ~PG_MATCH_COLOR, lgrp)) == NULL) { 2338 page_create_putback(npages); 2339 VM_STAT_ADD(page_create_large_cnt[5]); 2340 return (NULL); 2341 } 2342 2343 /* 2344 * If we got the page with the wrong mtype, give it back; this is a 2345 * workaround for CR 6249718. When CR 6249718 is fixed we never get 2346 * inside the "if" and the workaround becomes just a nop. 2347 */ 2348 if (kcage_on && (flags & PG_NORELOC) && !PP_ISNORELOC(rootpp)) { 2349 page_list_add_pages(rootpp, 0); 2350 page_create_putback(npages); 2351 VM_STAT_ADD(page_create_large_cnt[6]); 2352 return (NULL); 2353 } 2354 2355 /* 2356 * If satisfying this request has left us with too little 2357 * memory, start the wheels turning to get some back. The 2358 * first clause of the test prevents waking up the pageout 2359 * daemon in situations where it would decide that there's 2360 * nothing to do.
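 * (Roughly speaking, nscan is how many pages the pageout scanner has
 * already examined in its current pass and desscan is its target for
 * that pass, so once nscan reaches desscan a wakeup would give the
 * scanner nothing new to do.)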
2361 */ 2362 if (nscan < desscan && freemem < minfree) { 2363 TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL, 2364 "pageout_cv_signal:freemem %ld", freemem); 2365 cv_signal(&proc_pageout->p_cv); 2366 } 2367 2368 pp = rootpp; 2369 while (npages--) { 2370 ASSERT(PAGE_EXCL(pp)); 2371 ASSERT(pp->p_vnode == NULL); 2372 ASSERT(!hat_page_is_mapped(pp)); 2373 PP_CLRFREE(pp); 2374 PP_CLRAGED(pp); 2375 if (!page_hashin(pp, vp, off, NULL)) 2376 panic("page_create_large: hashin failed: page %p", 2377 (void *)pp); 2378 page_io_lock(pp); 2379 off += PAGESIZE; 2380 pp = pp->p_next; 2381 } 2382 2383 VM_STAT_ADD(page_create_large_cnt[0]); 2384 return (rootpp); 2385 } 2386 2387 page_t * 2388 page_create_va(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags, 2389 struct seg *seg, caddr_t vaddr) 2390 { 2391 page_t *plist = NULL; 2392 pgcnt_t npages; 2393 pgcnt_t found_on_free = 0; 2394 pgcnt_t pages_req; 2395 page_t *npp = NULL; 2396 uint_t enough; 2397 uint_t i; 2398 uint_t pcf_index; 2399 struct pcf *p; 2400 struct pcf *q; 2401 lgrp_t *lgrp; 2402 2403 TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_START, 2404 "page_create_start:vp %p off %llx bytes %lu flags %x", 2405 vp, off, bytes, flags); 2406 2407 ASSERT(bytes != 0 && vp != NULL); 2408 2409 if ((flags & PG_EXCL) == 0 && (flags & PG_WAIT) == 0) { 2410 panic("page_create: invalid flags"); 2411 /*NOTREACHED*/ 2412 } 2413 ASSERT((flags & ~(PG_EXCL | PG_WAIT | 2414 PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == 0); 2415 /* but no others */ 2416 2417 pages_req = npages = btopr(bytes); 2418 /* 2419 * Try to see whether request is too large to *ever* be 2420 * satisfied, in order to prevent deadlock. We arbitrarily 2421 * decide to limit maximum size requests to max_page_get. 2422 */ 2423 if (npages >= max_page_get) { 2424 if ((flags & PG_WAIT) == 0) { 2425 TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_TOOBIG, 2426 "page_create_toobig:vp %p off %llx npages " 2427 "%lu max_page_get %lu", 2428 vp, off, npages, max_page_get); 2429 return (NULL); 2430 } else { 2431 cmn_err(CE_WARN, 2432 "Request for too much kernel memory " 2433 "(%lu bytes), will hang forever", bytes); 2434 for (;;) 2435 delay(1000000000); 2436 } 2437 } 2438 2439 if (!kcage_on || panicstr) { 2440 /* 2441 * Cage is OFF, or we are single threaded in 2442 * panic, so make everything a RELOC request. 2443 */ 2444 flags &= ~PG_NORELOC; 2445 } 2446 2447 if (freemem <= throttlefree + npages) 2448 if (!page_create_throttle(npages, flags)) 2449 return (NULL); 2450 2451 /* 2452 * If cage is on, dampen draw from cage when available 2453 * cage space is low. 2454 */ 2455 if ((flags & PG_NORELOC) && 2456 kcage_freemem < kcage_throttlefree + npages) { 2457 2458 /* 2459 * The cage is on, the caller wants PG_NORELOC 2460 * pages and available cage memory is very low. 2461 * Call kcage_create_throttle() to attempt to 2462 * control demand on the cage. 2463 */ 2464 if (kcage_create_throttle(npages, flags) == KCT_FAILURE) 2465 return (NULL); 2466 } 2467 2468 VM_STAT_ADD(page_create_cnt[0]); 2469 2470 enough = 0; 2471 pcf_index = PCF_INDEX(); 2472 2473 p = &pcf[pcf_index]; 2474 p->pcf_touch = 1; 2475 q = &pcf[PCF_FANOUT]; 2476 for (i = 0; i < PCF_FANOUT; i++) { 2477 if (p->pcf_count > npages) { 2478 /* 2479 * a good one to try. 2480 */ 2481 mutex_enter(&p->pcf_lock); 2482 if (p->pcf_count > npages) { 2483 p->pcf_count -= (uint_t)npages; 2484 /* 2485 * freemem is not protected by any lock. 2486 * Thus, we cannot have any assertion 2487 * containing freemem here. 
2488 */ 2489 freemem -= npages; 2490 enough = 1; 2491 mutex_exit(&p->pcf_lock); 2492 break; 2493 } 2494 mutex_exit(&p->pcf_lock); 2495 } 2496 p++; 2497 if (p >= q) { 2498 p = pcf; 2499 } 2500 p->pcf_touch = 1; 2501 } 2502 2503 if (!enough) { 2504 /* 2505 * Have to look harder. If npages is greater than 2506 * one, then we might have to coalecse the counters. 2507 * 2508 * Go wait. We come back having accounted 2509 * for the memory. 2510 */ 2511 VM_STAT_ADD(page_create_cnt[1]); 2512 if (!page_create_wait(npages, flags)) { 2513 VM_STAT_ADD(page_create_cnt[2]); 2514 return (NULL); 2515 } 2516 } 2517 2518 TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SUCCESS, 2519 "page_create_success:vp %p off %llx", vp, off); 2520 2521 /* 2522 * If satisfying this request has left us with too little 2523 * memory, start the wheels turning to get some back. The 2524 * first clause of the test prevents waking up the pageout 2525 * daemon in situations where it would decide that there's 2526 * nothing to do. 2527 */ 2528 if (nscan < desscan && freemem < minfree) { 2529 TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL, 2530 "pageout_cv_signal:freemem %ld", freemem); 2531 cv_signal(&proc_pageout->p_cv); 2532 } 2533 2534 /* 2535 * Loop around collecting the requested number of pages. 2536 * Most of the time, we have to `create' a new page. With 2537 * this in mind, pull the page off the free list before 2538 * getting the hash lock. This will minimize the hash 2539 * lock hold time, nesting, and the like. If it turns 2540 * out we don't need the page, we put it back at the end. 2541 */ 2542 while (npages--) { 2543 page_t *pp; 2544 kmutex_t *phm = NULL; 2545 ulong_t index; 2546 2547 index = PAGE_HASH_FUNC(vp, off); 2548 top: 2549 ASSERT(phm == NULL); 2550 ASSERT(index == PAGE_HASH_FUNC(vp, off)); 2551 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); 2552 2553 if (npp == NULL) { 2554 /* 2555 * Try to get a page from the freelist (ie, 2556 * a page with no [vp, off] tag). If that 2557 * fails, use the cachelist. 2558 * 2559 * During the first attempt at both the free 2560 * and cache lists we try for the correct color. 2561 */ 2562 /* 2563 * XXXX-how do we deal with virtual indexed 2564 * caches and and colors? 2565 */ 2566 VM_STAT_ADD(page_create_cnt[4]); 2567 /* 2568 * Get lgroup to allocate next page of shared memory 2569 * from and use it to specify where to allocate 2570 * the physical memory 2571 */ 2572 lgrp = lgrp_mem_choose(seg, vaddr, PAGESIZE); 2573 npp = page_get_freelist(vp, off, seg, vaddr, PAGESIZE, 2574 flags | PG_MATCH_COLOR, lgrp); 2575 if (npp == NULL) { 2576 npp = page_get_cachelist(vp, off, seg, 2577 vaddr, flags | PG_MATCH_COLOR, lgrp); 2578 if (npp == NULL) { 2579 npp = page_create_get_something(vp, 2580 off, seg, vaddr, 2581 flags & ~PG_MATCH_COLOR); 2582 } 2583 2584 if (PP_ISAGED(npp) == 0) { 2585 /* 2586 * Since this page came from the 2587 * cachelist, we must destroy the 2588 * old vnode association. 2589 */ 2590 page_hashout(npp, NULL); 2591 } 2592 } 2593 } 2594 2595 /* 2596 * We own this page! 2597 */ 2598 ASSERT(PAGE_EXCL(npp)); 2599 ASSERT(npp->p_vnode == NULL); 2600 ASSERT(!hat_page_is_mapped(npp)); 2601 PP_CLRFREE(npp); 2602 PP_CLRAGED(npp); 2603 2604 /* 2605 * Here we have a page in our hot little mits and are 2606 * just waiting to stuff it on the appropriate lists. 2607 * Get the mutex and check to see if it really does 2608 * not exist. 
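 * If some other thread created [vp, off] while we were off getting
 * npp, we either undo all of our work and fail (PG_EXCL) or lock the
 * existing page and use it instead (PG_WAIT); any npp we end up not
 * needing is put back on the free list after the loop.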
2609 */ 2610 phm = PAGE_HASH_MUTEX(index); 2611 mutex_enter(phm); 2612 PAGE_HASH_SEARCH(index, pp, vp, off); 2613 if (pp == NULL) { 2614 VM_STAT_ADD(page_create_new); 2615 pp = npp; 2616 npp = NULL; 2617 if (!page_hashin(pp, vp, off, phm)) { 2618 /* 2619 * Since we hold the page hash mutex and 2620 * just searched for this page, page_hashin 2621 * had better not fail. If it does, that 2622 * means somethread did not follow the 2623 * page hash mutex rules. Panic now and 2624 * get it over with. As usual, go down 2625 * holding all the locks. 2626 */ 2627 ASSERT(MUTEX_HELD(phm)); 2628 panic("page_create: " 2629 "hashin failed %p %p %llx %p", 2630 (void *)pp, (void *)vp, off, (void *)phm); 2631 /*NOTREACHED*/ 2632 } 2633 ASSERT(MUTEX_HELD(phm)); 2634 mutex_exit(phm); 2635 phm = NULL; 2636 2637 /* 2638 * Hat layer locking need not be done to set 2639 * the following bits since the page is not hashed 2640 * and was on the free list (i.e., had no mappings). 2641 * 2642 * Set the reference bit to protect 2643 * against immediate pageout 2644 * 2645 * XXXmh modify freelist code to set reference 2646 * bit so we don't have to do it here. 2647 */ 2648 page_set_props(pp, P_REF); 2649 found_on_free++; 2650 } else { 2651 VM_STAT_ADD(page_create_exists); 2652 if (flags & PG_EXCL) { 2653 /* 2654 * Found an existing page, and the caller 2655 * wanted all new pages. Undo all of the work 2656 * we have done. 2657 */ 2658 mutex_exit(phm); 2659 phm = NULL; 2660 while (plist != NULL) { 2661 pp = plist; 2662 page_sub(&plist, pp); 2663 page_io_unlock(pp); 2664 /* large pages should not end up here */ 2665 ASSERT(pp->p_szc == 0); 2666 /*LINTED: constant in conditional ctx*/ 2667 VN_DISPOSE(pp, B_INVAL, 0, kcred); 2668 } 2669 VM_STAT_ADD(page_create_found_one); 2670 goto fail; 2671 } 2672 ASSERT(flags & PG_WAIT); 2673 if (!page_lock(pp, SE_EXCL, phm, P_NO_RECLAIM)) { 2674 /* 2675 * Start all over again if we blocked trying 2676 * to lock the page. 2677 */ 2678 mutex_exit(phm); 2679 VM_STAT_ADD(page_create_page_lock_failed); 2680 phm = NULL; 2681 goto top; 2682 } 2683 mutex_exit(phm); 2684 phm = NULL; 2685 2686 if (PP_ISFREE(pp)) { 2687 ASSERT(PP_ISAGED(pp) == 0); 2688 VM_STAT_ADD(pagecnt.pc_get_cache); 2689 page_list_sub(pp, PG_CACHE_LIST); 2690 PP_CLRFREE(pp); 2691 found_on_free++; 2692 } 2693 } 2694 2695 /* 2696 * Got a page! It is locked. Acquire the i/o 2697 * lock since we are going to use the p_next and 2698 * p_prev fields to link the requested pages together. 2699 */ 2700 page_io_lock(pp); 2701 page_add(&plist, pp); 2702 plist = plist->p_next; 2703 off += PAGESIZE; 2704 vaddr += PAGESIZE; 2705 } 2706 2707 ASSERT((flags & PG_EXCL) ? (found_on_free == pages_req) : 1); 2708 fail: 2709 if (npp != NULL) { 2710 /* 2711 * Did not need this page after all. 2712 * Put it back on the free list. 
2713 */ 2714 VM_STAT_ADD(page_create_putbacks); 2715 PP_SETFREE(npp); 2716 PP_SETAGED(npp); 2717 npp->p_offset = (u_offset_t)-1; 2718 page_list_add(npp, PG_FREE_LIST | PG_LIST_TAIL); 2719 page_unlock(npp); 2720 2721 } 2722 2723 ASSERT(pages_req >= found_on_free); 2724 2725 { 2726 uint_t overshoot = (uint_t)(pages_req - found_on_free); 2727 2728 if (overshoot) { 2729 VM_STAT_ADD(page_create_overshoot); 2730 p = &pcf[pcf_index]; 2731 p->pcf_touch = 1; 2732 mutex_enter(&p->pcf_lock); 2733 if (p->pcf_block) { 2734 p->pcf_reserve += overshoot; 2735 } else { 2736 p->pcf_count += overshoot; 2737 if (p->pcf_wait) { 2738 mutex_enter(&new_freemem_lock); 2739 if (freemem_wait) { 2740 cv_signal(&freemem_cv); 2741 p->pcf_wait--; 2742 } else { 2743 p->pcf_wait = 0; 2744 } 2745 mutex_exit(&new_freemem_lock); 2746 } 2747 } 2748 mutex_exit(&p->pcf_lock); 2749 /* freemem is approximate, so this test OK */ 2750 if (!p->pcf_block) 2751 freemem += overshoot; 2752 } 2753 } 2754 2755 return (plist); 2756 } 2757 2758 /* 2759 * One or more constituent pages of this large page has been marked 2760 * toxic. Simply demote the large page to PAGESIZE pages and let 2761 * page_free() handle it. This routine should only be called by 2762 * large page free routines (page_free_pages() and page_destroy_pages(). 2763 * All pages are locked SE_EXCL and have already been marked free. 2764 */ 2765 static void 2766 page_free_toxic_pages(page_t *rootpp) 2767 { 2768 page_t *tpp; 2769 pgcnt_t i, pgcnt = page_get_pagecnt(rootpp->p_szc); 2770 uint_t szc = rootpp->p_szc; 2771 2772 for (i = 0, tpp = rootpp; i < pgcnt; i++, tpp = tpp->p_next) { 2773 ASSERT(tpp->p_szc == szc); 2774 ASSERT((PAGE_EXCL(tpp) && 2775 !page_iolock_assert(tpp)) || panicstr); 2776 tpp->p_szc = 0; 2777 } 2778 2779 while (rootpp != NULL) { 2780 tpp = rootpp; 2781 page_sub(&rootpp, tpp); 2782 ASSERT(PP_ISFREE(tpp)); 2783 PP_CLRFREE(tpp); 2784 page_free(tpp, 1); 2785 } 2786 } 2787 2788 /* 2789 * Put page on the "free" list. 2790 * The free list is really two lists maintained by 2791 * the PSM of whatever machine we happen to be on. 2792 */ 2793 void 2794 page_free(page_t *pp, int dontneed) 2795 { 2796 struct pcf *p; 2797 uint_t pcf_index; 2798 2799 ASSERT((PAGE_EXCL(pp) && 2800 !page_iolock_assert(pp)) || panicstr); 2801 2802 if (page_deteriorating(pp)) { 2803 volatile int i = 0; 2804 char *kaddr; 2805 volatile int rb, wb; 2806 uint64_t pa; 2807 volatile int ue = 0; 2808 on_trap_data_t otd; 2809 2810 if (pp->p_vnode != NULL) { 2811 /* 2812 * Let page_destroy() do its bean counting and 2813 * hash out the page; it will then call back 2814 * into page_free() with pp->p_vnode == NULL. 2815 */ 2816 page_destroy(pp, 0); 2817 return; 2818 } 2819 2820 if (page_isfailing(pp)) { 2821 /* 2822 * If we have already exceeded the limit for 2823 * pages retired, we will treat this page as 2824 * 'toxic' rather than failing. That will ensure 2825 * that the page is at least cleaned, and if 2826 * a UE is detected, the page will be retired 2827 * anyway. 
2828 */ 2829 if (pages_retired_limit_exceeded()) { 2830 /* 2831 * clear the flag and reset to toxic 2832 */ 2833 page_clrtoxic(pp); 2834 page_settoxic(pp, PAGE_IS_TOXIC); 2835 } else { 2836 pa = ptob((uint64_t)page_pptonum(pp)); 2837 if (page_retire_messages) { 2838 cmn_err(CE_NOTE, "Page 0x%08x.%08x " 2839 "removed from service", 2840 (uint32_t)(pa >> 32), (uint32_t)pa); 2841 } 2842 goto page_failed; 2843 } 2844 } 2845 2846 pagescrub(pp, 0, PAGESIZE); 2847 2848 /* 2849 * We want to determine whether the error that occurred on 2850 * this page is transient or persistent, so we get a mapping 2851 * to the page and try every possible bit pattern to compare 2852 * what we write with what we read back. A smaller number 2853 * of bit patterns might suffice, but there's no point in 2854 * getting fancy. If this is the hot path on your system, 2855 * you've got bigger problems. 2856 */ 2857 kaddr = ppmapin(pp, PROT_READ | PROT_WRITE, (caddr_t)-1); 2858 for (wb = 0xff; wb >= 0; wb--) { 2859 if (on_trap(&otd, OT_DATA_EC)) { 2860 pa = ptob((uint64_t)page_pptonum(pp)) + i; 2861 page_settoxic(pp, PAGE_IS_FAILING); 2862 2863 if (page_retire_messages) { 2864 cmn_err(CE_WARN, "Uncorrectable Error " 2865 "occurred at PA 0x%08x.%08x while " 2866 "attempting to clear previously " 2867 "reported error; page removed from " 2868 "service", (uint32_t)(pa >> 32), 2869 (uint32_t)pa); 2870 } 2871 2872 ue++; 2873 break; 2874 } 2875 2876 /* 2877 * Write out the bit pattern, flush it to memory, and 2878 * read it back while under on_trap() protection. 2879 */ 2880 for (i = 0; i < PAGESIZE; i++) 2881 kaddr[i] = wb; 2882 2883 sync_data_memory(kaddr, PAGESIZE); 2884 2885 for (i = 0; i < PAGESIZE; i++) { 2886 if ((rb = (uchar_t)kaddr[i]) != wb) { 2887 page_settoxic(pp, PAGE_IS_FAILING); 2888 goto out; 2889 } 2890 } 2891 } 2892 out: 2893 no_trap(); 2894 ppmapout(kaddr); 2895 2896 if (wb >= 0 && !ue) { 2897 pa = ptob((uint64_t)page_pptonum(pp)) + i; 2898 if (page_retire_messages) { 2899 cmn_err(CE_WARN, "Data Mismatch occurred at PA " 2900 "0x%08x.%08x [ 0x%x != 0x%x ] while " 2901 "attempting to clear previously reported " 2902 "error; page removed from service", 2903 (uint32_t)(pa >> 32), (uint32_t)pa, rb, wb); 2904 } 2905 } 2906 page_failed: 2907 /* 2908 * DR operations change the association between a page_t 2909 * and the physical page it represents. Check if the 2910 * page is still bad. If it is, then retire it. 2911 */ 2912 if (page_isfaulty(pp) && page_isfailing(pp)) { 2913 /* 2914 * In the future, it might be useful to have a platform 2915 * callback here to tell the hardware to fence off this 2916 * page during the next reboot. 2917 * 2918 * We move the page to the retired_vnode here 2919 */ 2920 (void) page_hashin(pp, &retired_ppages, 2921 (u_offset_t)ptob((uint64_t)page_pptonum(pp)), NULL); 2922 mutex_enter(&freemem_lock); 2923 availrmem--; 2924 mutex_exit(&freemem_lock); 2925 page_retired(pp); 2926 page_downgrade(pp); 2927 2928 /* 2929 * If DR raced with the above page retirement code, 2930 * we might have retired a good page. If so, unretire 2931 * the page. 
2932 */ 2933 if (!page_isfaulty(pp)) 2934 page_unretire_pages(); 2935 return; 2936 } 2937 2938 pa = ptob((uint64_t)page_pptonum(pp)); 2939 2940 if (page_retire_messages) { 2941 cmn_err(CE_NOTE, "Previously reported error on page " 2942 "0x%08x.%08x cleared", (uint32_t)(pa >> 32), 2943 (uint32_t)pa); 2944 } 2945 2946 page_clrtoxic(pp); 2947 } 2948 2949 if (PP_ISFREE(pp)) { 2950 panic("page_free: page %p is free", (void *)pp); 2951 } 2952 2953 if (pp->p_szc != 0) { 2954 if (pp->p_vnode == NULL || IS_SWAPFSVP(pp->p_vnode) || 2955 pp->p_vnode == &kvp) { 2956 panic("page_free: anon or kernel " 2957 "or no vnode large page %p", (void *)pp); 2958 } 2959 page_demote_vp_pages(pp); 2960 ASSERT(pp->p_szc == 0); 2961 } 2962 2963 /* 2964 * The page_struct_lock need not be acquired to examine these 2965 * fields since the page has an "exclusive" lock. 2966 */ 2967 if (hat_page_is_mapped(pp) || pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { 2968 panic("page_free pp=%p, pfn=%lx, lckcnt=%d, cowcnt=%d", 2969 pp, page_pptonum(pp), pp->p_lckcnt, pp->p_cowcnt); 2970 /*NOTREACHED*/ 2971 } 2972 2973 ASSERT(!hat_page_getshare(pp)); 2974 2975 PP_SETFREE(pp); 2976 ASSERT(pp->p_vnode == NULL || !IS_VMODSORT(pp->p_vnode) || 2977 !hat_ismod(pp)); 2978 page_clr_all_props(pp); 2979 ASSERT(!hat_page_getshare(pp)); 2980 2981 /* 2982 * Now we add the page to the head of the free list. 2983 * But if this page is associated with a paged vnode 2984 * then we adjust the head forward so that the page is 2985 * effectively at the end of the list. 2986 */ 2987 if (pp->p_vnode == NULL) { 2988 /* 2989 * Page has no identity, put it on the free list. 2990 */ 2991 PP_SETAGED(pp); 2992 pp->p_offset = (u_offset_t)-1; 2993 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL); 2994 VM_STAT_ADD(pagecnt.pc_free_free); 2995 TRACE_1(TR_FAC_VM, TR_PAGE_FREE_FREE, 2996 "page_free_free:pp %p", pp); 2997 } else { 2998 PP_CLRAGED(pp); 2999 3000 if (!dontneed || nopageage) { 3001 /* move it to the tail of the list */ 3002 page_list_add(pp, PG_CACHE_LIST | PG_LIST_TAIL); 3003 3004 VM_STAT_ADD(pagecnt.pc_free_cache); 3005 TRACE_1(TR_FAC_VM, TR_PAGE_FREE_CACHE_TAIL, 3006 "page_free_cache_tail:pp %p", pp); 3007 } else { 3008 page_list_add(pp, PG_CACHE_LIST | PG_LIST_HEAD); 3009 3010 VM_STAT_ADD(pagecnt.pc_free_dontneed); 3011 TRACE_1(TR_FAC_VM, TR_PAGE_FREE_CACHE_HEAD, 3012 "page_free_cache_head:pp %p", pp); 3013 } 3014 } 3015 page_unlock(pp); 3016 3017 /* 3018 * Now do the `freemem' accounting. 3019 */ 3020 pcf_index = PCF_INDEX(); 3021 p = &pcf[pcf_index]; 3022 p->pcf_touch = 1; 3023 3024 mutex_enter(&p->pcf_lock); 3025 if (p->pcf_block) { 3026 p->pcf_reserve += 1; 3027 } else { 3028 p->pcf_count += 1; 3029 if (p->pcf_wait) { 3030 mutex_enter(&new_freemem_lock); 3031 /* 3032 * Check to see if some other thread 3033 * is actually waiting. Another bucket 3034 * may have woken it up by now. If there 3035 * are no waiters, then set our pcf_wait 3036 * count to zero to avoid coming in here 3037 * next time. Also, since only one page 3038 * was put on the free list, just wake 3039 * up one waiter. 3040 */ 3041 if (freemem_wait) { 3042 cv_signal(&freemem_cv); 3043 p->pcf_wait--; 3044 } else { 3045 p->pcf_wait = 0; 3046 } 3047 mutex_exit(&new_freemem_lock); 3048 } 3049 } 3050 mutex_exit(&p->pcf_lock); 3051 3052 /* freemem is approximate, so this test OK */ 3053 if (!p->pcf_block) 3054 freemem += 1; 3055 } 3056 3057 /* 3058 * Put page on the "free" list during intial startup. 3059 * This happens during initial single threaded execution. 
3060 */ 3061 void 3062 page_free_at_startup(page_t *pp) 3063 { 3064 struct pcf *p; 3065 uint_t pcf_index; 3066 3067 page_list_add(pp, PG_FREE_LIST | PG_LIST_HEAD | PG_LIST_ISINIT); 3068 VM_STAT_ADD(pagecnt.pc_free_free); 3069 3070 /* 3071 * Now do the `freemem' accounting. 3072 */ 3073 pcf_index = PCF_INDEX(); 3074 p = &pcf[pcf_index]; 3075 p->pcf_touch = 1; 3076 3077 ASSERT(p->pcf_block == 0); 3078 ASSERT(p->pcf_wait == 0); 3079 p->pcf_count += 1; 3080 3081 /* freemem is approximate, so this is OK */ 3082 freemem += 1; 3083 } 3084 3085 void 3086 page_free_pages(page_t *pp) 3087 { 3088 page_t *tpp, *rootpp = NULL; 3089 pgcnt_t pgcnt = page_get_pagecnt(pp->p_szc); 3090 pgcnt_t i; 3091 uint_t szc = pp->p_szc; 3092 int toxic = 0; 3093 3094 VM_STAT_ADD(pagecnt.pc_free_pages); 3095 TRACE_1(TR_FAC_VM, TR_PAGE_FREE_FREE, 3096 "page_free_free:pp %p", pp); 3097 3098 ASSERT(pp->p_szc != 0 && pp->p_szc < page_num_pagesizes()); 3099 if ((page_pptonum(pp) & (pgcnt - 1)) != 0) { 3100 panic("page_free_pages: not root page %p", (void *)pp); 3101 /*NOTREACHED*/ 3102 } 3103 3104 for (i = 0, tpp = pp; i < pgcnt; i++, tpp++) { 3105 ASSERT((PAGE_EXCL(tpp) && 3106 !page_iolock_assert(tpp)) || panicstr); 3107 if (PP_ISFREE(tpp)) { 3108 panic("page_free_pages: page %p is free", (void *)tpp); 3109 /*NOTREACHED*/ 3110 } 3111 if (hat_page_is_mapped(tpp) || tpp->p_lckcnt != 0 || 3112 tpp->p_cowcnt != 0) { 3113 panic("page_free_pages %p", (void *)tpp); 3114 /*NOTREACHED*/ 3115 } 3116 3117 ASSERT(!hat_page_getshare(tpp)); 3118 ASSERT(tpp->p_vnode == NULL); 3119 ASSERT(tpp->p_szc == szc); 3120 3121 if (page_deteriorating(tpp)) 3122 toxic = 1; 3123 3124 PP_SETFREE(tpp); 3125 page_clr_all_props(tpp); 3126 PP_SETAGED(tpp); 3127 tpp->p_offset = (u_offset_t)-1; 3128 ASSERT(tpp->p_next == tpp); 3129 ASSERT(tpp->p_prev == tpp); 3130 page_list_concat(&rootpp, &tpp); 3131 } 3132 ASSERT(rootpp == pp); 3133 3134 if (toxic) { 3135 page_free_toxic_pages(rootpp); 3136 return; 3137 } 3138 page_list_add_pages(rootpp, 0); 3139 page_create_putback(pgcnt); 3140 } 3141 3142 int free_pages = 1; 3143 3144 /* 3145 * This routine attempts to return pages to the cachelist via page_release(). 3146 * It does not *have* to be successful in all cases, since the pageout scanner 3147 * will catch any pages it misses. It does need to be fast and not introduce 3148 * too much overhead. 3149 * 3150 * If a page isn't found on the unlocked sweep of the page_hash bucket, we 3151 * don't lock and retry. This is ok, since the page scanner will eventually 3152 * find any page we miss in free_vp_pages(). 3153 */ 3154 void 3155 free_vp_pages(vnode_t *vp, u_offset_t off, size_t len) 3156 { 3157 page_t *pp; 3158 u_offset_t eoff; 3159 extern int swap_in_range(vnode_t *, u_offset_t, size_t); 3160 3161 eoff = off + len; 3162 3163 if (free_pages == 0) 3164 return; 3165 if (swap_in_range(vp, off, len)) 3166 return; 3167 3168 for (; off < eoff; off += PAGESIZE) { 3169 3170 /* 3171 * find the page using a fast, but inexact search. It'll be OK 3172 * if a few pages slip through the cracks here. 3173 */ 3174 pp = page_exists(vp, off); 3175 3176 /* 3177 * If we didn't find the page (it may not exist), the page 3178 * is free, looks still in use (shared), or we can't lock it, 3179 * just give up. 
3180 */ 3181 if (pp == NULL || 3182 PP_ISFREE(pp) || 3183 page_share_cnt(pp) > 0 || 3184 !page_trylock(pp, SE_EXCL)) 3185 continue; 3186 3187 /* 3188 * Once we have locked pp, verify that it's still the 3189 * correct page and not already free 3190 */ 3191 ASSERT(PAGE_LOCKED_SE(pp, SE_EXCL)); 3192 if (pp->p_vnode != vp || pp->p_offset != off || PP_ISFREE(pp)) { 3193 page_unlock(pp); 3194 continue; 3195 } 3196 3197 /* 3198 * try to release the page... 3199 */ 3200 (void) page_release(pp, 1); 3201 } 3202 } 3203 3204 /* 3205 * Reclaim the given page from the free list. 3206 * Returns 1 on success or 0 on failure. 3207 * 3208 * The page is unlocked if it can't be reclaimed (when freemem == 0). 3209 * If `lock' is non-null, it will be dropped and re-acquired if 3210 * the routine must wait while freemem is 0. 3211 * 3212 * As it turns out, boot_getpages() does this. It picks a page, 3213 * based on where OBP mapped in some address, gets its pfn, searches 3214 * the memsegs, locks the page, then pulls it off the free list! 3215 */ 3216 int 3217 page_reclaim(page_t *pp, kmutex_t *lock) 3218 { 3219 struct pcf *p; 3220 uint_t pcf_index; 3221 struct cpu *cpup; 3222 int enough; 3223 uint_t i; 3224 3225 ASSERT(lock != NULL ? MUTEX_HELD(lock) : 1); 3226 ASSERT(PAGE_EXCL(pp) && PP_ISFREE(pp)); 3227 ASSERT(pp->p_szc == 0); 3228 3229 /* 3230 * If `freemem' is 0, we cannot reclaim this page from the 3231 * freelist, so release every lock we might hold: the page, 3232 * and the `lock' before blocking. 3233 * 3234 * The only way `freemem' can become 0 while there are pages 3235 * marked free (have their p->p_free bit set) is when the 3236 * system is low on memory and doing a page_create(). In 3237 * order to guarantee that once page_create() starts acquiring 3238 * pages it will be able to get all that it needs since `freemem' 3239 * was decreased by the requested amount. So, we need to release 3240 * this page, and let page_create() have it. 3241 * 3242 * Since `freemem' being zero is not supposed to happen, just 3243 * use the usual hash stuff as a starting point. If that bucket 3244 * is empty, then assume the worst, and start at the beginning 3245 * of the pcf array. If we always start at the beginning 3246 * when acquiring more than one pcf lock, there won't be any 3247 * deadlock problems. 3248 */ 3249 3250 /* TODO: Do we need to test kcage_freemem if PG_NORELOC(pp)? */ 3251 3252 if (freemem <= throttlefree && !page_create_throttle(1l, 0)) { 3253 pcf_acquire_all(); 3254 goto page_reclaim_nomem; 3255 } 3256 3257 enough = 0; 3258 pcf_index = PCF_INDEX(); 3259 p = &pcf[pcf_index]; 3260 p->pcf_touch = 1; 3261 mutex_enter(&p->pcf_lock); 3262 if (p->pcf_count >= 1) { 3263 enough = 1; 3264 p->pcf_count--; 3265 } 3266 mutex_exit(&p->pcf_lock); 3267 3268 if (!enough) { 3269 VM_STAT_ADD(page_reclaim_zero); 3270 /* 3271 * Check again. Its possible that some other thread 3272 * could have been right behind us, and added one 3273 * to a list somewhere. Acquire each of the pcf locks 3274 * until we find a page. 3275 */ 3276 p = pcf; 3277 for (i = 0; i < PCF_FANOUT; i++) { 3278 p->pcf_touch = 1; 3279 mutex_enter(&p->pcf_lock); 3280 if (p->pcf_count >= 1) { 3281 p->pcf_count -= 1; 3282 enough = 1; 3283 break; 3284 } 3285 p++; 3286 } 3287 3288 if (!enough) { 3289 page_reclaim_nomem: 3290 /* 3291 * We really can't have page `pp'. 3292 * Time for the no-memory dance with 3293 * page_free(). This is just like 3294 * page_create_wait(). 
Plus the added 3295 * attraction of releasing whatever mutex 3296 * we held when we were called with in `lock'. 3297 * Page_unlock() will wakeup any thread 3298 * waiting around for this page. 3299 */ 3300 if (lock) { 3301 VM_STAT_ADD(page_reclaim_zero_locked); 3302 mutex_exit(lock); 3303 } 3304 page_unlock(pp); 3305 3306 /* 3307 * get this before we drop all the pcf locks. 3308 */ 3309 mutex_enter(&new_freemem_lock); 3310 3311 p = pcf; 3312 for (i = 0; i < PCF_FANOUT; i++) { 3313 p->pcf_wait++; 3314 mutex_exit(&p->pcf_lock); 3315 p++; 3316 } 3317 3318 freemem_wait++; 3319 cv_wait(&freemem_cv, &new_freemem_lock); 3320 freemem_wait--; 3321 3322 mutex_exit(&new_freemem_lock); 3323 3324 if (lock) { 3325 mutex_enter(lock); 3326 } 3327 return (0); 3328 } 3329 3330 /* 3331 * There was a page to be found. 3332 * The pcf accounting has been done, 3333 * though none of the pcf_wait flags have been set, 3334 * drop the locks and continue on. 3335 */ 3336 while (p >= pcf) { 3337 mutex_exit(&p->pcf_lock); 3338 p--; 3339 } 3340 } 3341 3342 /* 3343 * freemem is not protected by any lock. Thus, we cannot 3344 * have any assertion containing freemem here. 3345 */ 3346 freemem -= 1; 3347 3348 VM_STAT_ADD(pagecnt.pc_reclaim); 3349 if (PP_ISAGED(pp)) { 3350 page_list_sub(pp, PG_FREE_LIST); 3351 TRACE_1(TR_FAC_VM, TR_PAGE_UNFREE_FREE, 3352 "page_reclaim_free:pp %p", pp); 3353 } else { 3354 page_list_sub(pp, PG_CACHE_LIST); 3355 TRACE_1(TR_FAC_VM, TR_PAGE_UNFREE_CACHE, 3356 "page_reclaim_cache:pp %p", pp); 3357 } 3358 3359 /* 3360 * clear the p_free & p_age bits since this page is no longer 3361 * on the free list. Notice that there was a brief time where 3362 * a page is marked as free, but is not on the list. 3363 * 3364 * Set the reference bit to protect against immediate pageout. 3365 */ 3366 PP_CLRFREE(pp); 3367 PP_CLRAGED(pp); 3368 page_set_props(pp, P_REF); 3369 3370 CPU_STATS_ENTER_K(); 3371 cpup = CPU; /* get cpup now that CPU cannot change */ 3372 CPU_STATS_ADDQ(cpup, vm, pgrec, 1); 3373 CPU_STATS_ADDQ(cpup, vm, pgfrec, 1); 3374 CPU_STATS_EXIT_K(); 3375 3376 return (1); 3377 } 3378 3379 3380 3381 /* 3382 * Destroy identity of the page and put it back on 3383 * the page free list. Assumes that the caller has 3384 * acquired the "exclusive" lock on the page. 3385 */ 3386 void 3387 page_destroy(page_t *pp, int dontfree) 3388 { 3389 ASSERT((PAGE_EXCL(pp) && 3390 !page_iolock_assert(pp)) || panicstr); 3391 3392 if (pp->p_szc != 0) { 3393 if (pp->p_vnode == NULL || IS_SWAPFSVP(pp->p_vnode) || 3394 pp->p_vnode == &kvp) { 3395 panic("page_destroy: anon or kernel or no vnode " 3396 "large page %p", (void *)pp); 3397 } 3398 page_demote_vp_pages(pp); 3399 ASSERT(pp->p_szc == 0); 3400 } 3401 3402 TRACE_1(TR_FAC_VM, TR_PAGE_DESTROY, "page_destroy:pp %p", pp); 3403 3404 /* 3405 * Unload translations, if any, then hash out the 3406 * page to erase its identity. 3407 */ 3408 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 3409 page_hashout(pp, NULL); 3410 3411 if (!dontfree) { 3412 /* 3413 * Acquire the "freemem_lock" for availrmem. 3414 * The page_struct_lock need not be acquired for lckcnt 3415 * and cowcnt since the page has an "exclusive" lock. 3416 */ 3417 if ((pp->p_lckcnt != 0) || (pp->p_cowcnt != 0)) { 3418 mutex_enter(&freemem_lock); 3419 if (pp->p_lckcnt != 0) { 3420 availrmem++; 3421 pp->p_lckcnt = 0; 3422 } 3423 if (pp->p_cowcnt != 0) { 3424 availrmem += pp->p_cowcnt; 3425 pp->p_cowcnt = 0; 3426 } 3427 mutex_exit(&freemem_lock); 3428 } 3429 /* 3430 * Put the page on the "free" list. 
3431 */ 3432 page_free(pp, 0); 3433 } 3434 } 3435 3436 void 3437 page_destroy_pages(page_t *pp) 3438 { 3439 3440 page_t *tpp, *rootpp = NULL; 3441 pgcnt_t pgcnt = page_get_pagecnt(pp->p_szc); 3442 pgcnt_t i, pglcks = 0; 3443 uint_t szc = pp->p_szc; 3444 int toxic = 0; 3445 3446 ASSERT(pp->p_szc != 0 && pp->p_szc < page_num_pagesizes()); 3447 3448 VM_STAT_ADD(pagecnt.pc_destroy_pages); 3449 3450 TRACE_1(TR_FAC_VM, TR_PAGE_DESTROY, "page_destroy_pages:pp %p", pp); 3451 3452 if ((page_pptonum(pp) & (pgcnt - 1)) != 0) { 3453 panic("page_destroy_pages: not root page %p", (void *)pp); 3454 /*NOTREACHED*/ 3455 } 3456 3457 for (i = 0, tpp = pp; i < pgcnt; i++, tpp++) { 3458 ASSERT((PAGE_EXCL(tpp) && 3459 !page_iolock_assert(tpp)) || panicstr); 3460 (void) hat_pageunload(tpp, HAT_FORCE_PGUNLOAD); 3461 page_hashout(tpp, NULL); 3462 ASSERT(tpp->p_offset == (u_offset_t)-1); 3463 if (tpp->p_lckcnt != 0) { 3464 pglcks++; 3465 tpp->p_lckcnt = 0; 3466 } else if (tpp->p_cowcnt != 0) { 3467 pglcks += tpp->p_cowcnt; 3468 tpp->p_cowcnt = 0; 3469 } 3470 ASSERT(!hat_page_getshare(tpp)); 3471 ASSERT(tpp->p_vnode == NULL); 3472 ASSERT(tpp->p_szc == szc); 3473 3474 if (page_deteriorating(tpp)) 3475 toxic = 1; 3476 3477 PP_SETFREE(tpp); 3478 page_clr_all_props(tpp); 3479 PP_SETAGED(tpp); 3480 ASSERT(tpp->p_next == tpp); 3481 ASSERT(tpp->p_prev == tpp); 3482 page_list_concat(&rootpp, &tpp); 3483 } 3484 3485 ASSERT(rootpp == pp); 3486 if (pglcks != 0) { 3487 mutex_enter(&freemem_lock); 3488 availrmem += pglcks; 3489 mutex_exit(&freemem_lock); 3490 } 3491 3492 if (toxic) { 3493 page_free_toxic_pages(rootpp); 3494 return; 3495 } 3496 page_list_add_pages(rootpp, 0); 3497 page_create_putback(pgcnt); 3498 } 3499 3500 /* 3501 * Similar to page_destroy(), but destroys pages which are 3502 * locked and known to be on the page free list. Since 3503 * the page is known to be free and locked, no one can access 3504 * it. 3505 * 3506 * Also, the number of free pages does not change. 3507 */ 3508 void 3509 page_destroy_free(page_t *pp) 3510 { 3511 ASSERT(PAGE_EXCL(pp)); 3512 ASSERT(PP_ISFREE(pp)); 3513 ASSERT(pp->p_vnode); 3514 ASSERT(hat_page_getattr(pp, P_MOD | P_REF | P_RO) == 0); 3515 ASSERT(!hat_page_is_mapped(pp)); 3516 ASSERT(PP_ISAGED(pp) == 0); 3517 ASSERT(pp->p_szc == 0); 3518 3519 VM_STAT_ADD(pagecnt.pc_destroy_free); 3520 page_list_sub(pp, PG_CACHE_LIST); 3521 3522 page_hashout(pp, NULL); 3523 ASSERT(pp->p_vnode == NULL); 3524 ASSERT(pp->p_offset == (u_offset_t)-1); 3525 ASSERT(pp->p_hash == NULL); 3526 3527 PP_SETAGED(pp); 3528 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL); 3529 page_unlock(pp); 3530 3531 mutex_enter(&new_freemem_lock); 3532 if (freemem_wait) { 3533 cv_signal(&freemem_cv); 3534 } 3535 mutex_exit(&new_freemem_lock); 3536 } 3537 3538 /* 3539 * Rename the page "opp" to have an identity specified 3540 * by [vp, off]. If a page already exists with this name 3541 * it is locked and destroyed. Note that the page's 3542 * translations are not unloaded during the rename. 3543 * 3544 * This routine is used by the anon layer to "steal" the 3545 * original page and is not unlike destroying a page and 3546 * creating a new page using the same page frame. 3547 * 3548 * XXX -- Could deadlock if caller 1 tries to rename A to B while 3549 * caller 2 tries to rename B to A. 
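 * (Each caller enters holding the exclusive lock on its own source
 * page and then tries to take the exclusive lock on the destination
 * page below, so two opposing renames can each end up waiting for
 * the page the other caller holds.)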
3550 */ 3551 void 3552 page_rename(page_t *opp, vnode_t *vp, u_offset_t off) 3553 { 3554 page_t *pp; 3555 int olckcnt = 0; 3556 int ocowcnt = 0; 3557 kmutex_t *phm; 3558 ulong_t index; 3559 3560 ASSERT(PAGE_EXCL(opp) && !page_iolock_assert(opp)); 3561 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); 3562 ASSERT(PP_ISFREE(opp) == 0); 3563 3564 VM_STAT_ADD(page_rename_count); 3565 3566 TRACE_3(TR_FAC_VM, TR_PAGE_RENAME, 3567 "page rename:pp %p vp %p off %llx", opp, vp, off); 3568 3569 /* 3570 * CacheFS may call page_rename for a large NFS page 3571 * when both CacheFS and NFS mount points are used 3572 * by applications. Demote this large page before 3573 * renaming it, to ensure that there are no "partial" 3574 * large pages left lying around. 3575 */ 3576 if (opp->p_szc != 0) { 3577 vnode_t *ovp = opp->p_vnode; 3578 ASSERT(ovp != NULL); 3579 ASSERT(!IS_SWAPFSVP(ovp)); 3580 ASSERT(ovp != &kvp); 3581 page_demote_vp_pages(opp); 3582 ASSERT(opp->p_szc == 0); 3583 } 3584 3585 page_hashout(opp, NULL); 3586 PP_CLRAGED(opp); 3587 3588 /* 3589 * Acquire the appropriate page hash lock, since 3590 * we're going to rename the page. 3591 */ 3592 index = PAGE_HASH_FUNC(vp, off); 3593 phm = PAGE_HASH_MUTEX(index); 3594 mutex_enter(phm); 3595 top: 3596 /* 3597 * Look for an existing page with this name and destroy it if found. 3598 * By holding the page hash lock all the way to the page_hashin() 3599 * call, we are assured that no page can be created with this 3600 * identity. In the case when the phm lock is dropped to undo any 3601 * hat layer mappings, the existing page is held with an "exclusive" 3602 * lock, again preventing another page from being created with 3603 * this identity. 3604 */ 3605 PAGE_HASH_SEARCH(index, pp, vp, off); 3606 if (pp != NULL) { 3607 VM_STAT_ADD(page_rename_exists); 3608 3609 /* 3610 * As it turns out, this is one of only two places where 3611 * page_lock() needs to hold the passed-in lock in the 3612 * successful case. In all of the others, the lock could 3613 * be dropped as soon as the attempt is made to lock 3614 * the page. It is tempting to add yet another argument, 3615 * PL_KEEP or PL_DROP, to let page_lock know what to do. 3616 */ 3617 if (!page_lock(pp, SE_EXCL, phm, P_RECLAIM)) { 3618 /* 3619 * Went to sleep because the page could not 3620 * be locked. We were woken up when the page 3621 * was unlocked, or when the page was destroyed. 3622 * In either case, `phm' was dropped while we 3623 * slept. Hence we should not just roar through 3624 * this loop. 3625 */ 3626 goto top; 3627 } 3628 3629 /* 3630 * If an existing page is a large page, then demote 3631 * it to ensure that no "partial" large pages are 3632 * "created" after page_rename. An existing page 3633 * can be a CacheFS page, and can't belong to swapfs. 3634 */ 3635 if (hat_page_is_mapped(pp)) { 3636 /* 3637 * Unload translations. Since we hold the 3638 * exclusive lock on this page, the page 3639 * cannot be changed while we drop phm. 3640 * This is also not a lock protocol violation, 3641 * but rather the proper way to do things.
3642 */ 3643 mutex_exit(phm); 3644 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 3645 if (pp->p_szc != 0) { 3646 ASSERT(!IS_SWAPFSVP(vp)); 3647 ASSERT(vp != &kvp); 3648 page_demote_vp_pages(pp); 3649 ASSERT(pp->p_szc == 0); 3650 } 3651 mutex_enter(phm); 3652 } else if (pp->p_szc != 0) { 3653 ASSERT(!IS_SWAPFSVP(vp)); 3654 ASSERT(vp != &kvp); 3655 mutex_exit(phm); 3656 page_demote_vp_pages(pp); 3657 ASSERT(pp->p_szc == 0); 3658 mutex_enter(phm); 3659 } 3660 page_hashout(pp, phm); 3661 } 3662 /* 3663 * Hash in the page with the new identity. 3664 */ 3665 if (!page_hashin(opp, vp, off, phm)) { 3666 /* 3667 * We were holding phm while we searched for [vp, off] 3668 * and only dropped phm if we found and locked a page. 3669 * If we can't create this page now, then some thing 3670 * is really broken. 3671 */ 3672 panic("page_rename: Can't hash in page: %p", (void *)pp); 3673 /*NOTREACHED*/ 3674 } 3675 3676 ASSERT(MUTEX_HELD(phm)); 3677 mutex_exit(phm); 3678 3679 /* 3680 * Now that we have dropped phm, lets get around to finishing up 3681 * with pp. 3682 */ 3683 if (pp != NULL) { 3684 ASSERT(!hat_page_is_mapped(pp)); 3685 /* for now large pages should not end up here */ 3686 ASSERT(pp->p_szc == 0); 3687 /* 3688 * Save the locks for transfer to the new page and then 3689 * clear them so page_free doesn't think they're important. 3690 * The page_struct_lock need not be acquired for lckcnt and 3691 * cowcnt since the page has an "exclusive" lock. 3692 */ 3693 olckcnt = pp->p_lckcnt; 3694 ocowcnt = pp->p_cowcnt; 3695 pp->p_lckcnt = pp->p_cowcnt = 0; 3696 3697 /* 3698 * Put the page on the "free" list after we drop 3699 * the lock. The less work under the lock the better. 3700 */ 3701 /*LINTED: constant in conditional context*/ 3702 VN_DISPOSE(pp, B_FREE, 0, kcred); 3703 } 3704 3705 /* 3706 * Transfer the lock count from the old page (if any). 3707 * The page_struct_lock need not be acquired for lckcnt and 3708 * cowcnt since the page has an "exclusive" lock. 3709 */ 3710 opp->p_lckcnt += olckcnt; 3711 opp->p_cowcnt += ocowcnt; 3712 } 3713 3714 /* 3715 * low level routine to add page `pp' to the hash and vp chains for [vp, offset] 3716 * 3717 * Pages are normally inserted at the start of a vnode's v_pages list. 3718 * If the vnode is VMODSORT and the page is modified, it goes at the end. 3719 * This can happen when a modified page is relocated for DR. 3720 * 3721 * Returns 1 on success and 0 on failure. 3722 */ 3723 static int 3724 page_do_hashin(page_t *pp, vnode_t *vp, u_offset_t offset) 3725 { 3726 page_t **listp; 3727 page_t *tp; 3728 ulong_t index; 3729 3730 ASSERT(PAGE_EXCL(pp)); 3731 ASSERT(vp != NULL); 3732 ASSERT(MUTEX_HELD(page_vnode_mutex(vp))); 3733 3734 /* 3735 * Be sure to set these up before the page is inserted on the hash 3736 * list. As soon as the page is placed on the list some other 3737 * thread might get confused and wonder how this page could 3738 * possibly hash to this list. 3739 */ 3740 pp->p_vnode = vp; 3741 pp->p_offset = offset; 3742 3743 /* 3744 * record if this page is on a swap vnode 3745 */ 3746 if ((vp->v_flag & VISSWAP) != 0) 3747 PP_SETSWAP(pp); 3748 3749 index = PAGE_HASH_FUNC(vp, offset); 3750 ASSERT(MUTEX_HELD(PAGE_HASH_MUTEX(index))); 3751 listp = &page_hash[index]; 3752 3753 /* 3754 * If this page is already hashed in, fail this attempt to add it. 
3755 */ 3756 for (tp = *listp; tp != NULL; tp = tp->p_hash) { 3757 if (tp->p_vnode == vp && tp->p_offset == offset) { 3758 pp->p_vnode = NULL; 3759 pp->p_offset = (u_offset_t)(-1); 3760 return (0); 3761 } 3762 } 3763 pp->p_hash = *listp; 3764 *listp = pp; 3765 3766 /* 3767 * Add the page to the vnode's list of pages 3768 */ 3769 if (vp->v_pages != NULL && IS_VMODSORT(vp) && hat_ismod(pp)) 3770 listp = &vp->v_pages->p_vpprev->p_vpnext; 3771 else 3772 listp = &vp->v_pages; 3773 3774 page_vpadd(listp, pp); 3775 3776 return (1); 3777 } 3778 3779 /* 3780 * Add page `pp' to both the hash and vp chains for [vp, offset]. 3781 * 3782 * Returns 1 on success and 0 on failure. 3783 * If hold is passed in, it is not dropped. 3784 */ 3785 int 3786 page_hashin(page_t *pp, vnode_t *vp, u_offset_t offset, kmutex_t *hold) 3787 { 3788 kmutex_t *phm = NULL; 3789 kmutex_t *vphm; 3790 int rc; 3791 3792 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); 3793 3794 TRACE_3(TR_FAC_VM, TR_PAGE_HASHIN, 3795 "page_hashin:pp %p vp %p offset %llx", 3796 pp, vp, offset); 3797 3798 VM_STAT_ADD(hashin_count); 3799 3800 if (hold != NULL) 3801 phm = hold; 3802 else { 3803 VM_STAT_ADD(hashin_not_held); 3804 phm = PAGE_HASH_MUTEX(PAGE_HASH_FUNC(vp, offset)); 3805 mutex_enter(phm); 3806 } 3807 3808 vphm = page_vnode_mutex(vp); 3809 mutex_enter(vphm); 3810 rc = page_do_hashin(pp, vp, offset); 3811 mutex_exit(vphm); 3812 if (hold == NULL) 3813 mutex_exit(phm); 3814 if (rc == 0) 3815 VM_STAT_ADD(hashin_already); 3816 return (rc); 3817 } 3818 3819 /* 3820 * Remove page ``pp'' from the hash and vp chains and remove vp association. 3821 * All mutexes must be held 3822 */ 3823 static void 3824 page_do_hashout(page_t *pp) 3825 { 3826 page_t **hpp; 3827 page_t *hp; 3828 vnode_t *vp = pp->p_vnode; 3829 3830 ASSERT(vp != NULL); 3831 ASSERT(MUTEX_HELD(page_vnode_mutex(vp))); 3832 3833 /* 3834 * First, take pp off of its hash chain. 3835 */ 3836 hpp = &page_hash[PAGE_HASH_FUNC(vp, pp->p_offset)]; 3837 3838 for (;;) { 3839 hp = *hpp; 3840 if (hp == pp) 3841 break; 3842 if (hp == NULL) { 3843 panic("page_do_hashout"); 3844 /*NOTREACHED*/ 3845 } 3846 hpp = &hp->p_hash; 3847 } 3848 *hpp = pp->p_hash; 3849 3850 /* 3851 * Now remove it from its associated vnode. 3852 */ 3853 if (vp->v_pages) 3854 page_vpsub(&vp->v_pages, pp); 3855 3856 pp->p_hash = NULL; 3857 page_clr_all_props(pp); 3858 PP_CLRSWAP(pp); 3859 pp->p_vnode = NULL; 3860 pp->p_offset = (u_offset_t)-1; 3861 } 3862 3863 /* 3864 * Remove page ``pp'' from the hash and vp chains and remove vp association. 3865 * 3866 * When `phm' is non-NULL it contains the address of the mutex protecting the 3867 * hash list pp is on. It is not dropped. 3868 */ 3869 void 3870 page_hashout(page_t *pp, kmutex_t *phm) 3871 { 3872 vnode_t *vp; 3873 ulong_t index; 3874 kmutex_t *nphm; 3875 kmutex_t *vphm; 3876 kmutex_t *sep; 3877 3878 ASSERT(phm != NULL ? 
MUTEX_HELD(phm) : 1); 3879 ASSERT(pp->p_vnode != NULL); 3880 ASSERT((PAGE_EXCL(pp) && !page_iolock_assert(pp)) || panicstr); 3881 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(pp->p_vnode))); 3882 3883 vp = pp->p_vnode; 3884 3885 TRACE_2(TR_FAC_VM, TR_PAGE_HASHOUT, 3886 "page_hashout:pp %p vp %p", pp, vp); 3887 3888 /* Kernel probe */ 3889 TNF_PROBE_2(page_unmap, "vm pagefault", /* CSTYLED */, 3890 tnf_opaque, vnode, vp, 3891 tnf_offset, offset, pp->p_offset); 3892 3893 /* 3894 * 3895 */ 3896 VM_STAT_ADD(hashout_count); 3897 index = PAGE_HASH_FUNC(vp, pp->p_offset); 3898 if (phm == NULL) { 3899 VM_STAT_ADD(hashout_not_held); 3900 nphm = PAGE_HASH_MUTEX(index); 3901 mutex_enter(nphm); 3902 } 3903 ASSERT(phm ? phm == PAGE_HASH_MUTEX(index) : 1); 3904 3905 3906 /* 3907 * grab page vnode mutex and remove it... 3908 */ 3909 vphm = page_vnode_mutex(vp); 3910 mutex_enter(vphm); 3911 3912 page_do_hashout(pp); 3913 3914 mutex_exit(vphm); 3915 if (phm == NULL) 3916 mutex_exit(nphm); 3917 3918 /* 3919 * If the page was retired, update the pages_retired 3920 * total and clear the page flag 3921 */ 3922 if (page_isretired(pp)) { 3923 retired_page_removed(pp); 3924 } 3925 3926 /* 3927 * Wake up processes waiting for this page. The page's 3928 * identity has been changed, and is probably not the 3929 * desired page any longer. 3930 */ 3931 sep = page_se_mutex(pp); 3932 mutex_enter(sep); 3933 if (CV_HAS_WAITERS(&pp->p_cv)) 3934 cv_broadcast(&pp->p_cv); 3935 mutex_exit(sep); 3936 } 3937 3938 /* 3939 * Add the page to the front of a linked list of pages 3940 * using the p_next & p_prev pointers for the list. 3941 * The caller is responsible for protecting the list pointers. 3942 */ 3943 void 3944 page_add(page_t **ppp, page_t *pp) 3945 { 3946 ASSERT(PAGE_EXCL(pp) || (PAGE_SHARED(pp) && page_iolock_assert(pp))); 3947 3948 page_add_common(ppp, pp); 3949 } 3950 3951 3952 3953 /* 3954 * Common code for page_add() and mach_page_add() 3955 */ 3956 void 3957 page_add_common(page_t **ppp, page_t *pp) 3958 { 3959 if (*ppp == NULL) { 3960 pp->p_next = pp->p_prev = pp; 3961 } else { 3962 pp->p_next = *ppp; 3963 pp->p_prev = (*ppp)->p_prev; 3964 (*ppp)->p_prev = pp; 3965 pp->p_prev->p_next = pp; 3966 } 3967 *ppp = pp; 3968 } 3969 3970 3971 /* 3972 * Remove this page from a linked list of pages 3973 * using the p_next & p_prev pointers for the list. 3974 * 3975 * The caller is responsible for protecting the list pointers. 3976 */ 3977 void 3978 page_sub(page_t **ppp, page_t *pp) 3979 { 3980 ASSERT((PP_ISFREE(pp)) ? 1 : 3981 (PAGE_EXCL(pp)) || (PAGE_SHARED(pp) && page_iolock_assert(pp))); 3982 3983 if (*ppp == NULL || pp == NULL) { 3984 panic("page_sub: bad arg(s): pp %p, *ppp %p", 3985 (void *)pp, (void *)(*ppp)); 3986 /*NOTREACHED*/ 3987 } 3988 3989 page_sub_common(ppp, pp); 3990 } 3991 3992 3993 /* 3994 * Common code for page_sub() and mach_page_sub() 3995 */ 3996 void 3997 page_sub_common(page_t **ppp, page_t *pp) 3998 { 3999 if (*ppp == pp) 4000 *ppp = pp->p_next; /* go to next page */ 4001 4002 if (*ppp == pp) 4003 *ppp = NULL; /* page list is gone */ 4004 else { 4005 pp->p_prev->p_next = pp->p_next; 4006 pp->p_next->p_prev = pp->p_prev; 4007 } 4008 pp->p_prev = pp->p_next = pp; /* make pp a list of one */ 4009 } 4010 4011 4012 /* 4013 * Break page list cppp into two lists with npages in the first list. 4014 * The tail is returned in nppp. 
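 * For example, with a six page list passed in through oppp and
 * npages == 2, the first two pages stay on *oppp and the remaining
 * four are returned through *nppp; npages == 0 moves the entire
 * list to *nppp.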
4015 */ 4016 void 4017 page_list_break(page_t **oppp, page_t **nppp, pgcnt_t npages) 4018 { 4019 page_t *s1pp = *oppp; 4020 page_t *s2pp; 4021 page_t *e1pp, *e2pp; 4022 long n = 0; 4023 4024 if (s1pp == NULL) { 4025 *nppp = NULL; 4026 return; 4027 } 4028 if (npages == 0) { 4029 *nppp = s1pp; 4030 *oppp = NULL; 4031 return; 4032 } 4033 for (n = 0, s2pp = *oppp; n < npages; n++) { 4034 s2pp = s2pp->p_next; 4035 } 4036 /* Fix head and tail of new lists */ 4037 e1pp = s2pp->p_prev; 4038 e2pp = s1pp->p_prev; 4039 s1pp->p_prev = e1pp; 4040 e1pp->p_next = s1pp; 4041 s2pp->p_prev = e2pp; 4042 e2pp->p_next = s2pp; 4043 4044 /* second list empty */ 4045 if (s2pp == s1pp) { 4046 *oppp = s1pp; 4047 *nppp = NULL; 4048 } else { 4049 *oppp = s1pp; 4050 *nppp = s2pp; 4051 } 4052 } 4053 4054 /* 4055 * Concatenate page list nppp onto the end of list ppp. 4056 */ 4057 void 4058 page_list_concat(page_t **ppp, page_t **nppp) 4059 { 4060 page_t *s1pp, *s2pp, *e1pp, *e2pp; 4061 4062 if (*nppp == NULL) { 4063 return; 4064 } 4065 if (*ppp == NULL) { 4066 *ppp = *nppp; 4067 return; 4068 } 4069 s1pp = *ppp; 4070 e1pp = s1pp->p_prev; 4071 s2pp = *nppp; 4072 e2pp = s2pp->p_prev; 4073 s1pp->p_prev = e2pp; 4074 e2pp->p_next = s1pp; 4075 e1pp->p_next = s2pp; 4076 s2pp->p_prev = e1pp; 4077 } 4078 4079 /* 4080 * return the next page in the page list 4081 */ 4082 page_t * 4083 page_list_next(page_t *pp) 4084 { 4085 return (pp->p_next); 4086 } 4087 4088 4089 /* 4090 * Add the page to the front of the linked list of pages 4091 * using p_vpnext/p_vpprev pointers for the list. 4092 * 4093 * The caller is responsible for protecting the lists. 4094 */ 4095 void 4096 page_vpadd(page_t **ppp, page_t *pp) 4097 { 4098 if (*ppp == NULL) { 4099 pp->p_vpnext = pp->p_vpprev = pp; 4100 } else { 4101 pp->p_vpnext = *ppp; 4102 pp->p_vpprev = (*ppp)->p_vpprev; 4103 (*ppp)->p_vpprev = pp; 4104 pp->p_vpprev->p_vpnext = pp; 4105 } 4106 *ppp = pp; 4107 } 4108 4109 /* 4110 * Remove this page from the linked list of pages 4111 * using p_vpnext/p_vpprev pointers for the list. 4112 * 4113 * The caller is responsible for protecting the lists. 4114 */ 4115 void 4116 page_vpsub(page_t **ppp, page_t *pp) 4117 { 4118 if (*ppp == NULL || pp == NULL) { 4119 panic("page_vpsub: bad arg(s): pp %p, *ppp %p", 4120 (void *)pp, (void *)(*ppp)); 4121 /*NOTREACHED*/ 4122 } 4123 4124 if (*ppp == pp) 4125 *ppp = pp->p_vpnext; /* go to next page */ 4126 4127 if (*ppp == pp) 4128 *ppp = NULL; /* page list is gone */ 4129 else { 4130 pp->p_vpprev->p_vpnext = pp->p_vpnext; 4131 pp->p_vpnext->p_vpprev = pp->p_vpprev; 4132 } 4133 pp->p_vpprev = pp->p_vpnext = pp; /* make pp a list of one */ 4134 } 4135 4136 /* 4137 * Lock a physical page into memory "long term". Used to support "lock 4138 * in memory" functions. Accepts the page to be locked, and a cow variable 4139 * to indicate whether a the lock will travel to the new page during 4140 * a potential copy-on-write. 4141 */ 4142 int 4143 page_pp_lock( 4144 page_t *pp, /* page to be locked */ 4145 int cow, /* cow lock */ 4146 int kernel) /* must succeed -- ignore checking */ 4147 { 4148 int r = 0; /* result -- assume failure */ 4149 4150 ASSERT(PAGE_LOCKED(pp)); 4151 4152 page_struct_lock(pp); 4153 /* 4154 * Acquire the "freemem_lock" for availrmem. 
4155 */ 4156 if (cow) { 4157 mutex_enter(&freemem_lock); 4158 if ((availrmem > pages_pp_maximum) && 4159 (pp->p_cowcnt < (ushort_t)PAGE_LOCK_MAXIMUM)) { 4160 availrmem--; 4161 pages_locked++; 4162 mutex_exit(&freemem_lock); 4163 r = 1; 4164 if (++pp->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) { 4165 cmn_err(CE_WARN, 4166 "COW lock limit reached on pfn 0x%lx", 4167 page_pptonum(pp)); 4168 } 4169 } else 4170 mutex_exit(&freemem_lock); 4171 } else { 4172 if (pp->p_lckcnt) { 4173 if (pp->p_lckcnt < (ushort_t)PAGE_LOCK_MAXIMUM) { 4174 r = 1; 4175 if (++pp->p_lckcnt == 4176 (ushort_t)PAGE_LOCK_MAXIMUM) { 4177 cmn_err(CE_WARN, "Page lock limit " 4178 "reached on pfn 0x%lx", 4179 page_pptonum(pp)); 4180 } 4181 } 4182 } else { 4183 if (kernel) { 4184 /* availrmem accounting done by caller */ 4185 ++pp->p_lckcnt; 4186 r = 1; 4187 } else { 4188 mutex_enter(&freemem_lock); 4189 if (availrmem > pages_pp_maximum) { 4190 availrmem--; 4191 pages_locked++; 4192 ++pp->p_lckcnt; 4193 r = 1; 4194 } 4195 mutex_exit(&freemem_lock); 4196 } 4197 } 4198 } 4199 page_struct_unlock(pp); 4200 return (r); 4201 } 4202 4203 /* 4204 * Decommit a lock on a physical page frame. Account for cow locks if 4205 * appropriate. 4206 */ 4207 void 4208 page_pp_unlock( 4209 page_t *pp, /* page to be unlocked */ 4210 int cow, /* expect cow lock */ 4211 int kernel) /* this was a kernel lock */ 4212 { 4213 ASSERT(PAGE_LOCKED(pp)); 4214 4215 page_struct_lock(pp); 4216 /* 4217 * Acquire the "freemem_lock" for availrmem. 4218 * If cowcnt or lcknt is already 0 do nothing; i.e., we 4219 * could be called to unlock even if nothing is locked. This could 4220 * happen if locked file pages were truncated (removing the lock) 4221 * and the file was grown again and new pages faulted in; the new 4222 * pages are unlocked but the segment still thinks they're locked. 4223 */ 4224 if (cow) { 4225 if (pp->p_cowcnt) { 4226 mutex_enter(&freemem_lock); 4227 pp->p_cowcnt--; 4228 availrmem++; 4229 pages_locked--; 4230 mutex_exit(&freemem_lock); 4231 } 4232 } else { 4233 if (pp->p_lckcnt && --pp->p_lckcnt == 0) { 4234 if (!kernel) { 4235 mutex_enter(&freemem_lock); 4236 availrmem++; 4237 pages_locked--; 4238 mutex_exit(&freemem_lock); 4239 } 4240 } 4241 } 4242 page_struct_unlock(pp); 4243 } 4244 4245 /* 4246 * This routine reserves availrmem for npages; 4247 * flags: KM_NOSLEEP or KM_SLEEP 4248 * returns 1 on success or 0 on failure 4249 */ 4250 int 4251 page_resv(pgcnt_t npages, uint_t flags) 4252 { 4253 mutex_enter(&freemem_lock); 4254 while (availrmem < tune.t_minarmem + npages) { 4255 if (flags & KM_NOSLEEP) { 4256 mutex_exit(&freemem_lock); 4257 return (0); 4258 } 4259 mutex_exit(&freemem_lock); 4260 page_needfree(npages); 4261 kmem_reap(); 4262 delay(hz >> 2); 4263 page_needfree(-(spgcnt_t)npages); 4264 mutex_enter(&freemem_lock); 4265 } 4266 availrmem -= npages; 4267 mutex_exit(&freemem_lock); 4268 return (1); 4269 } 4270 4271 /* 4272 * This routine unreserves availrmem for npages; 4273 */ 4274 void 4275 page_unresv(pgcnt_t npages) 4276 { 4277 mutex_enter(&freemem_lock); 4278 availrmem += npages; 4279 mutex_exit(&freemem_lock); 4280 } 4281 4282 /* 4283 * See Statement at the beginning of segvn_lockop() regarding 4284 * the way we handle cowcnts and lckcnts. 4285 * 4286 * Transfer cowcnt on 'opp' to cowcnt on 'npp' if the vpage 4287 * that breaks COW has PROT_WRITE. 4288 * 4289 * Note that, we may also break COW in case we are softlocking 4290 * on read access during physio; 4291 * in this softlock case, the vpage may not have PROT_WRITE. 
4292 * So, we need to transfer lckcnt on 'opp' to lckcnt on 'npp' 4293 * if the vpage doesn't have PROT_WRITE. 4294 * 4295 * This routine is never called if we are stealing a page 4296 * in anon_private. 4297 * 4298 * The caller subtracted from availrmem for read only mapping. 4299 * if lckcnt is 1 increment availrmem. 4300 */ 4301 void 4302 page_pp_useclaim( 4303 page_t *opp, /* original page frame losing lock */ 4304 page_t *npp, /* new page frame gaining lock */ 4305 uint_t write_perm) /* set if vpage has PROT_WRITE */ 4306 { 4307 int payback = 0; 4308 4309 ASSERT(PAGE_LOCKED(opp)); 4310 ASSERT(PAGE_LOCKED(npp)); 4311 4312 page_struct_lock(opp); 4313 4314 ASSERT(npp->p_cowcnt == 0); 4315 ASSERT(npp->p_lckcnt == 0); 4316 4317 /* Don't use claim if nothing is locked (see page_pp_unlock above) */ 4318 if ((write_perm && opp->p_cowcnt != 0) || 4319 (!write_perm && opp->p_lckcnt != 0)) { 4320 4321 if (write_perm) { 4322 npp->p_cowcnt++; 4323 ASSERT(opp->p_cowcnt != 0); 4324 opp->p_cowcnt--; 4325 } else { 4326 4327 ASSERT(opp->p_lckcnt != 0); 4328 4329 /* 4330 * We didn't need availrmem decremented if p_lckcnt on 4331 * original page is 1. Here, we are unlocking 4332 * read-only copy belonging to original page and 4333 * are locking a copy belonging to new page. 4334 */ 4335 if (opp->p_lckcnt == 1) 4336 payback = 1; 4337 4338 npp->p_lckcnt++; 4339 opp->p_lckcnt--; 4340 } 4341 } 4342 if (payback) { 4343 mutex_enter(&freemem_lock); 4344 availrmem++; 4345 pages_useclaim--; 4346 mutex_exit(&freemem_lock); 4347 } 4348 page_struct_unlock(opp); 4349 } 4350 4351 /* 4352 * Simple claim adjust functions -- used to support changes in 4353 * claims due to changes in access permissions. Used by segvn_setprot(). 4354 */ 4355 int 4356 page_addclaim(page_t *pp) 4357 { 4358 int r = 0; /* result */ 4359 4360 ASSERT(PAGE_LOCKED(pp)); 4361 4362 page_struct_lock(pp); 4363 ASSERT(pp->p_lckcnt != 0); 4364 4365 if (pp->p_lckcnt == 1) { 4366 if (pp->p_cowcnt < (ushort_t)PAGE_LOCK_MAXIMUM) { 4367 --pp->p_lckcnt; 4368 r = 1; 4369 if (++pp->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) { 4370 cmn_err(CE_WARN, 4371 "COW lock limit reached on pfn 0x%lx", 4372 page_pptonum(pp)); 4373 } 4374 } 4375 } else { 4376 mutex_enter(&freemem_lock); 4377 if ((availrmem > pages_pp_maximum) && 4378 (pp->p_cowcnt < (ushort_t)PAGE_LOCK_MAXIMUM)) { 4379 --availrmem; 4380 ++pages_claimed; 4381 mutex_exit(&freemem_lock); 4382 --pp->p_lckcnt; 4383 r = 1; 4384 if (++pp->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) { 4385 cmn_err(CE_WARN, 4386 "COW lock limit reached on pfn 0x%lx", 4387 page_pptonum(pp)); 4388 } 4389 } else 4390 mutex_exit(&freemem_lock); 4391 } 4392 page_struct_unlock(pp); 4393 return (r); 4394 } 4395 4396 int 4397 page_subclaim(page_t *pp) 4398 { 4399 int r = 0; 4400 4401 ASSERT(PAGE_LOCKED(pp)); 4402 4403 page_struct_lock(pp); 4404 ASSERT(pp->p_cowcnt != 0); 4405 4406 if (pp->p_lckcnt) { 4407 if (pp->p_lckcnt < (ushort_t)PAGE_LOCK_MAXIMUM) { 4408 r = 1; 4409 /* 4410 * for availrmem 4411 */ 4412 mutex_enter(&freemem_lock); 4413 availrmem++; 4414 pages_claimed--; 4415 mutex_exit(&freemem_lock); 4416 4417 pp->p_cowcnt--; 4418 4419 if (++pp->p_lckcnt == (ushort_t)PAGE_LOCK_MAXIMUM) { 4420 cmn_err(CE_WARN, 4421 "Page lock limit reached on pfn 0x%lx", 4422 page_pptonum(pp)); 4423 } 4424 } 4425 } else { 4426 r = 1; 4427 pp->p_cowcnt--; 4428 pp->p_lckcnt++; 4429 } 4430 page_struct_unlock(pp); 4431 return (r); 4432 } 4433 4434 int 4435 page_addclaim_pages(page_t **ppa) 4436 { 4437 4438 pgcnt_t lckpgs = 0, pg_idx; 4439 4440 
VM_STAT_ADD(pagecnt.pc_addclaim_pages); 4441 4442 mutex_enter(&page_llock); 4443 for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) { 4444 4445 ASSERT(PAGE_LOCKED(ppa[pg_idx])); 4446 ASSERT(ppa[pg_idx]->p_lckcnt != 0); 4447 if (ppa[pg_idx]->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) { 4448 mutex_exit(&page_llock); 4449 return (0); 4450 } 4451 if (ppa[pg_idx]->p_lckcnt > 1) 4452 lckpgs++; 4453 } 4454 4455 if (lckpgs != 0) { 4456 mutex_enter(&freemem_lock); 4457 if (availrmem >= pages_pp_maximum + lckpgs) { 4458 availrmem -= lckpgs; 4459 pages_claimed += lckpgs; 4460 } else { 4461 mutex_exit(&freemem_lock); 4462 mutex_exit(&page_llock); 4463 return (0); 4464 } 4465 mutex_exit(&freemem_lock); 4466 } 4467 4468 for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) { 4469 ppa[pg_idx]->p_lckcnt--; 4470 ppa[pg_idx]->p_cowcnt++; 4471 } 4472 mutex_exit(&page_llock); 4473 return (1); 4474 } 4475 4476 int 4477 page_subclaim_pages(page_t **ppa) 4478 { 4479 pgcnt_t ulckpgs = 0, pg_idx; 4480 4481 VM_STAT_ADD(pagecnt.pc_subclaim_pages); 4482 4483 mutex_enter(&page_llock); 4484 for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) { 4485 4486 ASSERT(PAGE_LOCKED(ppa[pg_idx])); 4487 ASSERT(ppa[pg_idx]->p_cowcnt != 0); 4488 if (ppa[pg_idx]->p_lckcnt == (ushort_t)PAGE_LOCK_MAXIMUM) { 4489 mutex_exit(&page_llock); 4490 return (0); 4491 } 4492 if (ppa[pg_idx]->p_lckcnt != 0) 4493 ulckpgs++; 4494 } 4495 4496 if (ulckpgs != 0) { 4497 mutex_enter(&freemem_lock); 4498 availrmem += ulckpgs; 4499 pages_claimed -= ulckpgs; 4500 mutex_exit(&freemem_lock); 4501 } 4502 4503 for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) { 4504 ppa[pg_idx]->p_cowcnt--; 4505 ppa[pg_idx]->p_lckcnt++; 4506 4507 } 4508 mutex_exit(&page_llock); 4509 return (1); 4510 } 4511 4512 page_t * 4513 page_numtopp(pfn_t pfnum, se_t se) 4514 { 4515 page_t *pp; 4516 4517 retry: 4518 pp = page_numtopp_nolock(pfnum); 4519 if (pp == NULL) { 4520 return ((page_t *)NULL); 4521 } 4522 4523 /* 4524 * Acquire the appropriate lock on the page. 4525 */ 4526 while (!page_lock(pp, se, (kmutex_t *)NULL, P_RECLAIM)) { 4527 if (page_pptonum(pp) != pfnum) 4528 goto retry; 4529 continue; 4530 } 4531 4532 if (page_pptonum(pp) != pfnum) { 4533 page_unlock(pp); 4534 goto retry; 4535 } 4536 4537 return (pp); 4538 } 4539 4540 page_t * 4541 page_numtopp_noreclaim(pfn_t pfnum, se_t se) 4542 { 4543 page_t *pp; 4544 4545 retry: 4546 pp = page_numtopp_nolock(pfnum); 4547 if (pp == NULL) { 4548 return ((page_t *)NULL); 4549 } 4550 4551 /* 4552 * Acquire the appropriate lock on the page. 4553 */ 4554 while (!page_lock(pp, se, (kmutex_t *)NULL, P_NO_RECLAIM)) { 4555 if (page_pptonum(pp) != pfnum) 4556 goto retry; 4557 continue; 4558 } 4559 4560 if (page_pptonum(pp) != pfnum) { 4561 page_unlock(pp); 4562 goto retry; 4563 } 4564 4565 return (pp); 4566 } 4567 4568 /* 4569 * This routine is like page_numtopp, but will only return page structs 4570 * for pages which are ok for loading into hardware using the page struct. 4571 */ 4572 page_t * 4573 page_numtopp_nowait(pfn_t pfnum, se_t se) 4574 { 4575 page_t *pp; 4576 4577 retry: 4578 pp = page_numtopp_nolock(pfnum); 4579 if (pp == NULL) { 4580 return ((page_t *)NULL); 4581 } 4582 4583 /* 4584 * Try to acquire the appropriate lock on the page. 
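	 * Unlike page_numtopp(), this variant never blocks: a page that
	 * is free, or whose lock cannot be taken immediately, simply
	 * results in a NULL return (editorial note, inferred from the
	 * code below).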
4585 */ 4586 if (PP_ISFREE(pp)) 4587 pp = NULL; 4588 else { 4589 if (!page_trylock(pp, se)) 4590 pp = NULL; 4591 else { 4592 if (page_pptonum(pp) != pfnum) { 4593 page_unlock(pp); 4594 goto retry; 4595 } 4596 if (PP_ISFREE(pp)) { 4597 page_unlock(pp); 4598 pp = NULL; 4599 } 4600 } 4601 } 4602 return (pp); 4603 } 4604 4605 /* 4606 * Returns a count of dirty pages that are in the process 4607 * of being written out. If 'cleanit' is set, try to push the page. 4608 */ 4609 pgcnt_t 4610 page_busy(int cleanit) 4611 { 4612 page_t *page0 = page_first(); 4613 page_t *pp = page0; 4614 pgcnt_t nppbusy = 0; 4615 u_offset_t off; 4616 4617 do { 4618 vnode_t *vp = pp->p_vnode; 4619 4620 /* 4621 * A page is a candidate for syncing if it is: 4622 * 4623 * (a) On neither the freelist nor the cachelist 4624 * (b) Hashed onto a vnode 4625 * (c) Not a kernel page 4626 * (d) Dirty 4627 * (e) Not part of a swapfile 4628 * (f) a page which belongs to a real vnode; eg has a non-null 4629 * v_vfsp pointer. 4630 * (g) Backed by a filesystem which doesn't have a 4631 * stubbed-out sync operation 4632 */ 4633 if (!PP_ISFREE(pp) && vp != NULL && vp != &kvp && 4634 hat_ismod(pp) && !IS_SWAPVP(vp) && vp->v_vfsp != NULL && 4635 vfs_can_sync(vp->v_vfsp)) { 4636 nppbusy++; 4637 vfs_syncprogress(); 4638 4639 if (!cleanit) 4640 continue; 4641 if (!page_trylock(pp, SE_EXCL)) 4642 continue; 4643 4644 if (PP_ISFREE(pp) || vp == NULL || IS_SWAPVP(vp) || 4645 pp->p_lckcnt != 0 || pp->p_cowcnt != 0 || 4646 !(hat_pagesync(pp, 4647 HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD) & P_MOD)) { 4648 page_unlock(pp); 4649 continue; 4650 } 4651 off = pp->p_offset; 4652 VN_HOLD(vp); 4653 page_unlock(pp); 4654 (void) VOP_PUTPAGE(vp, off, PAGESIZE, 4655 B_ASYNC | B_FREE, kcred); 4656 VN_RELE(vp); 4657 } 4658 } while ((pp = page_next(pp)) != page0); 4659 4660 return (nppbusy); 4661 } 4662 4663 void page_invalidate_pages(void); 4664 4665 /* 4666 * callback handler to vm sub-system 4667 * 4668 * callers make sure no recursive entries to this func. 4669 */ 4670 /*ARGSUSED*/ 4671 boolean_t 4672 callb_vm_cpr(void *arg, int code) 4673 { 4674 if (code == CB_CODE_CPR_CHKPT) 4675 page_invalidate_pages(); 4676 return (B_TRUE); 4677 } 4678 4679 /* 4680 * Invalidate all pages of the system. 4681 * It shouldn't be called until all user page activities are all stopped. 4682 */ 4683 void 4684 page_invalidate_pages() 4685 { 4686 page_t *pp; 4687 page_t *page0; 4688 pgcnt_t nbusypages; 4689 int retry = 0; 4690 const int MAXRETRIES = 4; 4691 #if defined(__sparc) 4692 extern struct vnode prom_ppages; 4693 #endif /* __sparc */ 4694 4695 top: 4696 /* 4697 * Flush dirty pages and destory the clean ones. 4698 */ 4699 nbusypages = 0; 4700 4701 pp = page0 = page_first(); 4702 do { 4703 struct vnode *vp; 4704 u_offset_t offset; 4705 int mod; 4706 4707 /* 4708 * skip the page if it has no vnode or the page associated 4709 * with the kernel vnode or prom allocated kernel mem. 4710 */ 4711 #if defined(__sparc) 4712 if ((vp = pp->p_vnode) == NULL || vp == &kvp || 4713 vp == &prom_ppages) 4714 #else /* x86 doesn't have prom or prom_ppage */ 4715 if ((vp = pp->p_vnode) == NULL || vp == &kvp) 4716 #endif /* __sparc */ 4717 continue; 4718 4719 /* 4720 * skip the page which is already free invalidated. 4721 */ 4722 if (PP_ISFREE(pp) && PP_ISAGED(pp)) 4723 continue; 4724 4725 /* 4726 * skip pages that are already locked or can't be "exclusively" 4727 * locked or are already free. After we lock the page, check 4728 * the free and age bits again to be sure it's not destroied 4729 * yet. 
4730 		 * To achieve max. parallelization, we use page_trylock instead
4731 		 * of page_lock so that we don't get blocked on individual pages
4732 		 * while we have thousands of other pages to process.
4733 		 */
4734 		if (!page_trylock(pp, SE_EXCL)) {
4735 			nbusypages++;
4736 			continue;
4737 		} else if (PP_ISFREE(pp)) {
4738 			if (!PP_ISAGED(pp)) {
4739 				page_destroy_free(pp);
4740 			} else {
4741 				page_unlock(pp);
4742 			}
4743 			continue;
4744 		}
4745 		/*
4746 		 * Is this page involved in some I/O? shared?
4747 		 *
4748 		 * The page_struct_lock need not be acquired to
4749 		 * examine these fields since the page has an
4750 		 * "exclusive" lock.
4751 		 */
4752 		if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
4753 			page_unlock(pp);
4754 			continue;
4755 		}
4756 
4757 		if (vp->v_type == VCHR) {
4758 			panic("vp->v_type == VCHR");
4759 			/*NOTREACHED*/
4760 		}
4761 
4762 		if (!page_try_demote_pages(pp)) {
4763 			page_unlock(pp);
4764 			continue;
4765 		}
4766 
4767 		/*
4768 		 * Check the modified bit. Leave the bits alone in hardware
4769 		 * (they will be modified if we do the putpage).
4770 		 */
4771 		mod = (hat_pagesync(pp, HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD)
4772 		    & P_MOD);
4773 		if (mod) {
4774 			offset = pp->p_offset;
4775 			/*
4776 			 * Hold the vnode before releasing the page lock
4777 			 * to prevent it from being freed and re-used by
4778 			 * some other thread.
4779 			 */
4780 			VN_HOLD(vp);
4781 			page_unlock(pp);
4782 			/*
4783 			 * No error return is checked here. Callers such as
4784 			 * cpr deal with the dirty pages at dump time
4785 			 * if this putpage fails.
4786 			 */
4787 			(void) VOP_PUTPAGE(vp, offset, PAGESIZE, B_INVAL,
4788 			    kcred);
4789 			VN_RELE(vp);
4790 		} else {
4791 			page_destroy(pp, 0);
4792 		}
4793 	} while ((pp = page_next(pp)) != page0);
4794 	if (nbusypages && retry++ < MAXRETRIES) {
4795 		delay(1);
4796 		goto top;
4797 	}
4798 }
4799 
4800 /*
4801  * Replace the page "old" with the page "new" on the page hash and vnode lists
4802  *
4803  * The replacement must be done in place, i.e., the equivalent sequence:
4804  *
4805  * vp = old->p_vnode;
4806  * off = old->p_offset;
4807  * page_do_hashout(old)
4808  * page_do_hashin(new, vp, off)
4809  *
4810  * doesn't work, since
4811  * 1) if old is the only page on the vnode, the v_pages list has a window
4812  *    where it looks empty. This will break file system assumptions.
4813  * and
4814  * 2) pvn_vplist_dirty() can't deal with pages moving on the v_pages list.
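 *
 * Editorial sketch (an assumption drawn from the ASSERTs in the routine
 * below, not original commentary): the caller enters with both pages
 * EXCL locked and with the hash-chain and vnode mutexes held, roughly
 *
 *	phm = PAGE_HASH_MUTEX(PAGE_HASH_FUNC(vp, off));
 *	mutex_enter(phm);
 *	mutex_enter(page_vnode_mutex(vp));
 *	page_do_relocate_hash(new, old);
 *	mutex_exit(page_vnode_mutex(vp));
 *	mutex_exit(phm);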
4815 */ 4816 static void 4817 page_do_relocate_hash(page_t *new, page_t *old) 4818 { 4819 page_t **hash_list; 4820 vnode_t *vp = old->p_vnode; 4821 kmutex_t *sep; 4822 4823 ASSERT(PAGE_EXCL(old)); 4824 ASSERT(PAGE_EXCL(new)); 4825 ASSERT(vp != NULL); 4826 ASSERT(MUTEX_HELD(page_vnode_mutex(vp))); 4827 ASSERT(MUTEX_HELD(PAGE_HASH_MUTEX(PAGE_HASH_FUNC(vp, old->p_offset)))); 4828 4829 /* 4830 * First find old page on the page hash list 4831 */ 4832 hash_list = &page_hash[PAGE_HASH_FUNC(vp, old->p_offset)]; 4833 4834 for (;;) { 4835 if (*hash_list == old) 4836 break; 4837 if (*hash_list == NULL) { 4838 panic("page_do_hashout"); 4839 /*NOTREACHED*/ 4840 } 4841 hash_list = &(*hash_list)->p_hash; 4842 } 4843 4844 /* 4845 * update new and replace old with new on the page hash list 4846 */ 4847 new->p_vnode = old->p_vnode; 4848 new->p_offset = old->p_offset; 4849 new->p_hash = old->p_hash; 4850 *hash_list = new; 4851 4852 if ((new->p_vnode->v_flag & VISSWAP) != 0) 4853 PP_SETSWAP(new); 4854 4855 /* 4856 * replace old with new on the vnode's page list 4857 */ 4858 if (old->p_vpnext == old) { 4859 new->p_vpnext = new; 4860 new->p_vpprev = new; 4861 } else { 4862 new->p_vpnext = old->p_vpnext; 4863 new->p_vpprev = old->p_vpprev; 4864 new->p_vpnext->p_vpprev = new; 4865 new->p_vpprev->p_vpnext = new; 4866 } 4867 if (vp->v_pages == old) 4868 vp->v_pages = new; 4869 4870 /* 4871 * clear out the old page 4872 */ 4873 old->p_hash = NULL; 4874 old->p_vpnext = NULL; 4875 old->p_vpprev = NULL; 4876 old->p_vnode = NULL; 4877 PP_CLRSWAP(old); 4878 old->p_offset = (u_offset_t)-1; 4879 page_clr_all_props(old); 4880 4881 /* 4882 * Wake up processes waiting for this page. The page's 4883 * identity has been changed, and is probably not the 4884 * desired page any longer. 4885 */ 4886 sep = page_se_mutex(old); 4887 mutex_enter(sep); 4888 if (CV_HAS_WAITERS(&old->p_cv)) 4889 cv_broadcast(&old->p_cv); 4890 mutex_exit(sep); 4891 } 4892 4893 /* 4894 * This function moves the identity of page "pp_old" to page "pp_new". 4895 * Both pages must be locked on entry. "pp_new" is free, has no identity, 4896 * and need not be hashed out from anywhere. 4897 */ 4898 void 4899 page_relocate_hash(page_t *pp_new, page_t *pp_old) 4900 { 4901 vnode_t *vp = pp_old->p_vnode; 4902 u_offset_t off = pp_old->p_offset; 4903 kmutex_t *phm, *vphm; 4904 4905 /* 4906 * Rehash two pages 4907 */ 4908 ASSERT(PAGE_EXCL(pp_old)); 4909 ASSERT(PAGE_EXCL(pp_new)); 4910 ASSERT(vp != NULL); 4911 ASSERT(pp_new->p_vnode == NULL); 4912 4913 /* 4914 * hashout then hashin while holding the mutexes 4915 */ 4916 phm = PAGE_HASH_MUTEX(PAGE_HASH_FUNC(vp, off)); 4917 mutex_enter(phm); 4918 vphm = page_vnode_mutex(vp); 4919 mutex_enter(vphm); 4920 4921 page_do_relocate_hash(pp_new, pp_old); 4922 4923 mutex_exit(vphm); 4924 mutex_exit(phm); 4925 4926 /* 4927 * The page_struct_lock need not be acquired for lckcnt and 4928 * cowcnt since the page has an "exclusive" lock. 4929 */ 4930 ASSERT(pp_new->p_lckcnt == 0); 4931 ASSERT(pp_new->p_cowcnt == 0); 4932 pp_new->p_lckcnt = pp_old->p_lckcnt; 4933 pp_new->p_cowcnt = pp_old->p_cowcnt; 4934 pp_old->p_lckcnt = pp_old->p_cowcnt = 0; 4935 4936 /* The following comment preserved from page_flip(). */ 4937 /* XXX - Do we need to protect fsdata? */ 4938 pp_new->p_fsdata = pp_old->p_fsdata; 4939 } 4940 4941 /* 4942 * Helper routine used to lock all remaining members of a 4943 * large page. The caller is responsible for passing in a locked 4944 * pp. 
If pp is a large page, then it succeeds in locking all the 4945 * remaining constituent pages or it returns with only the 4946 * original page locked. 4947 * 4948 * Returns 1 on success, 0 on failure. 4949 * 4950 * If success is returned this routine gurantees p_szc for all constituent 4951 * pages of a large page pp belongs to can't change. To achieve this we 4952 * recheck szc of pp after locking all constituent pages and retry if szc 4953 * changed (it could only decrease). Since hat_page_demote() needs an EXCL 4954 * lock on one of constituent pages it can't be running after all constituent 4955 * pages are locked. hat_page_demote() with a lock on a constituent page 4956 * outside of this large page (i.e. pp belonged to a larger large page) is 4957 * already done with all constituent pages of pp since the root's p_szc is 4958 * changed last. Thefore no need to synchronize with hat_page_demote() that 4959 * locked a constituent page outside of pp's current large page. 4960 */ 4961 #ifdef DEBUG 4962 uint32_t gpg_trylock_mtbf = 0; 4963 #endif 4964 4965 int 4966 group_page_trylock(page_t *pp, se_t se) 4967 { 4968 page_t *tpp; 4969 pgcnt_t npgs, i, j; 4970 uint_t pszc = pp->p_szc; 4971 4972 #ifdef DEBUG 4973 if (gpg_trylock_mtbf && !(gethrtime() % gpg_trylock_mtbf)) { 4974 return (0); 4975 } 4976 #endif 4977 4978 if (pp != PP_GROUPLEADER(pp, pszc)) { 4979 return (0); 4980 } 4981 4982 retry: 4983 ASSERT(PAGE_LOCKED_SE(pp, se)); 4984 ASSERT(!PP_ISFREE(pp)); 4985 if (pszc == 0) { 4986 return (1); 4987 } 4988 npgs = page_get_pagecnt(pszc); 4989 tpp = pp + 1; 4990 for (i = 1; i < npgs; i++, tpp++) { 4991 if (!page_trylock(tpp, se)) { 4992 tpp = pp + 1; 4993 for (j = 1; j < i; j++, tpp++) { 4994 page_unlock(tpp); 4995 } 4996 return (0); 4997 } 4998 } 4999 if (pp->p_szc != pszc) { 5000 ASSERT(pp->p_szc < pszc); 5001 ASSERT(pp->p_vnode != NULL && pp->p_vnode != &kvp && 5002 !IS_SWAPFSVP(pp->p_vnode)); 5003 tpp = pp + 1; 5004 for (i = 1; i < npgs; i++, tpp++) { 5005 page_unlock(tpp); 5006 } 5007 pszc = pp->p_szc; 5008 goto retry; 5009 } 5010 return (1); 5011 } 5012 5013 void 5014 group_page_unlock(page_t *pp) 5015 { 5016 page_t *tpp; 5017 pgcnt_t npgs, i; 5018 5019 ASSERT(PAGE_LOCKED(pp)); 5020 ASSERT(!PP_ISFREE(pp)); 5021 ASSERT(pp == PP_PAGEROOT(pp)); 5022 npgs = page_get_pagecnt(pp->p_szc); 5023 for (i = 1, tpp = pp + 1; i < npgs; i++, tpp++) { 5024 page_unlock(tpp); 5025 } 5026 } 5027 5028 /* 5029 * returns 5030 * 0 : on success and *nrelocp is number of relocated PAGESIZE pages 5031 * ERANGE : this is not a base page 5032 * EBUSY : failure to get locks on the page/pages 5033 * ENOMEM : failure to obtain replacement pages 5034 * EAGAIN : OBP has not yet completed its boot-time handoff to the kernel 5035 * 5036 * Return with all constituent members of target and replacement 5037 * SE_EXCL locked. It is the callers responsibility to drop the 5038 * locks. 
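 *
 * Editorial sketch of a caller (illustrative only, not taken from the
 * original source; "targ" is assumed to be an SE_EXCL locked base page):
 *
 *	spgcnt_t nreloc;
 *	page_t *repl = NULL;
 *
 *	switch (do_page_relocate(&targ, &repl, 1, &nreloc, NULL)) {
 *	case 0:
 *		("targ" now heads the relocated originals, "repl" the
 *		replacements; the caller unlocks or frees them)
 *		break;
 *	case EBUSY:
 *	case ENOMEM:
 *		(transient conditions, retry later)
 *		break;
 *	default:
 *		(ERANGE or EAGAIN, give up on this page for now)
 *		break;
 *	}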
5039 */ 5040 int 5041 do_page_relocate( 5042 page_t **target, 5043 page_t **replacement, 5044 int grouplock, 5045 spgcnt_t *nrelocp, 5046 lgrp_t *lgrp) 5047 { 5048 #ifdef DEBUG 5049 page_t *first_repl; 5050 #endif /* DEBUG */ 5051 page_t *repl; 5052 page_t *targ; 5053 page_t *pl = NULL; 5054 uint_t ppattr; 5055 pfn_t pfn, repl_pfn; 5056 uint_t szc; 5057 spgcnt_t npgs, i; 5058 int repl_contig = 0; 5059 uint_t flags = 0; 5060 spgcnt_t dofree = 0; 5061 5062 *nrelocp = 0; 5063 5064 #if defined(__sparc) 5065 /* 5066 * We need to wait till OBP has completed 5067 * its boot-time handoff of its resources to the kernel 5068 * before we allow page relocation 5069 */ 5070 if (page_relocate_ready == 0) { 5071 return (EAGAIN); 5072 } 5073 #endif 5074 5075 /* 5076 * If this is not a base page, 5077 * just return with 0x0 pages relocated. 5078 */ 5079 targ = *target; 5080 ASSERT(PAGE_EXCL(targ)); 5081 ASSERT(!PP_ISFREE(targ)); 5082 szc = targ->p_szc; 5083 ASSERT(szc < mmu_page_sizes); 5084 VM_STAT_ADD(vmm_vmstats.ppr_reloc[szc]); 5085 pfn = targ->p_pagenum; 5086 if (pfn != PFN_BASE(pfn, szc)) { 5087 VM_STAT_ADD(vmm_vmstats.ppr_relocnoroot[szc]); 5088 return (ERANGE); 5089 } 5090 5091 if ((repl = *replacement) != NULL && repl->p_szc >= szc) { 5092 repl_pfn = repl->p_pagenum; 5093 if (repl_pfn != PFN_BASE(repl_pfn, szc)) { 5094 VM_STAT_ADD(vmm_vmstats.ppr_reloc_replnoroot[szc]); 5095 return (ERANGE); 5096 } 5097 repl_contig = 1; 5098 } 5099 5100 /* 5101 * We must lock all members of this large page or we cannot 5102 * relocate any part of it. 5103 */ 5104 if (grouplock != 0 && !group_page_trylock(targ, SE_EXCL)) { 5105 VM_STAT_ADD(vmm_vmstats.ppr_relocnolock[targ->p_szc]); 5106 return (EBUSY); 5107 } 5108 5109 /* 5110 * reread szc it could have been decreased before 5111 * group_page_trylock() was done. 5112 */ 5113 szc = targ->p_szc; 5114 ASSERT(szc < mmu_page_sizes); 5115 VM_STAT_ADD(vmm_vmstats.ppr_reloc[szc]); 5116 ASSERT(pfn == PFN_BASE(pfn, szc)); 5117 5118 npgs = page_get_pagecnt(targ->p_szc); 5119 5120 if (repl == NULL) { 5121 dofree = npgs; /* Size of target page in MMU pages */ 5122 if (!page_create_wait(dofree, 0)) { 5123 if (grouplock != 0) { 5124 group_page_unlock(targ); 5125 } 5126 VM_STAT_ADD(vmm_vmstats.ppr_relocnomem[szc]); 5127 return (ENOMEM); 5128 } 5129 5130 /* 5131 * seg kmem pages require that the target and replacement 5132 * page be the same pagesize. 5133 */ 5134 flags = (targ->p_vnode == &kvp) ? 
PGR_SAMESZC : 0; 5135 repl = page_get_replacement_page(targ, lgrp, flags); 5136 if (repl == NULL) { 5137 if (grouplock != 0) { 5138 group_page_unlock(targ); 5139 } 5140 page_create_putback(dofree); 5141 VM_STAT_ADD(vmm_vmstats.ppr_relocnomem[szc]); 5142 return (ENOMEM); 5143 } 5144 } 5145 #ifdef DEBUG 5146 else { 5147 ASSERT(PAGE_LOCKED(repl)); 5148 } 5149 #endif /* DEBUG */ 5150 5151 #if defined(__sparc) 5152 /* 5153 * Let hat_page_relocate() complete the relocation if it's kernel page 5154 */ 5155 if (targ->p_vnode == &kvp) { 5156 *replacement = repl; 5157 if (hat_page_relocate(target, replacement, nrelocp) != 0) { 5158 if (grouplock != 0) { 5159 group_page_unlock(targ); 5160 } 5161 if (dofree) { 5162 *replacement = NULL; 5163 page_free_replacement_page(repl); 5164 page_create_putback(dofree); 5165 } 5166 VM_STAT_ADD(vmm_vmstats.ppr_krelocfail[szc]); 5167 return (EAGAIN); 5168 } 5169 VM_STAT_ADD(vmm_vmstats.ppr_relocok[szc]); 5170 return (0); 5171 } 5172 #else 5173 #if defined(lint) 5174 dofree = dofree; 5175 #endif 5176 #endif 5177 5178 #ifdef DEBUG 5179 first_repl = repl; 5180 #endif /* DEBUG */ 5181 5182 for (i = 0; i < npgs; i++) { 5183 ASSERT(PAGE_EXCL(targ)); 5184 5185 (void) hat_pageunload(targ, HAT_FORCE_PGUNLOAD); 5186 5187 ASSERT(hat_page_getshare(targ) == 0); 5188 ASSERT(!PP_ISFREE(targ)); 5189 ASSERT(targ->p_pagenum == (pfn + i)); 5190 ASSERT(repl_contig == 0 || 5191 repl->p_pagenum == (repl_pfn + i)); 5192 5193 /* 5194 * Copy the page contents and attributes then 5195 * relocate the page in the page hash. 5196 */ 5197 ppcopy(targ, repl); 5198 ppattr = hat_page_getattr(targ, (P_MOD | P_REF | P_RO)); 5199 page_clr_all_props(repl); 5200 page_set_props(repl, ppattr); 5201 page_relocate_hash(repl, targ); 5202 5203 ASSERT(hat_page_getshare(targ) == 0); 5204 ASSERT(hat_page_getshare(repl) == 0); 5205 /* 5206 * Now clear the props on targ, after the 5207 * page_relocate_hash(), they no longer 5208 * have any meaning. 5209 */ 5210 page_clr_all_props(targ); 5211 ASSERT(targ->p_next == targ); 5212 ASSERT(targ->p_prev == targ); 5213 page_list_concat(&pl, &targ); 5214 5215 targ++; 5216 if (repl_contig != 0) { 5217 repl++; 5218 } else { 5219 repl = repl->p_next; 5220 } 5221 } 5222 /* assert that we have come full circle with repl */ 5223 ASSERT(repl_contig == 1 || first_repl == repl); 5224 5225 *target = pl; 5226 if (*replacement == NULL) { 5227 ASSERT(first_repl == repl); 5228 *replacement = repl; 5229 } 5230 VM_STAT_ADD(vmm_vmstats.ppr_relocok[szc]); 5231 *nrelocp = npgs; 5232 return (0); 5233 } 5234 /* 5235 * On success returns 0 and *nrelocp the number of PAGESIZE pages relocated. 
5236 */ 5237 int 5238 page_relocate( 5239 page_t **target, 5240 page_t **replacement, 5241 int grouplock, 5242 int freetarget, 5243 spgcnt_t *nrelocp, 5244 lgrp_t *lgrp) 5245 { 5246 spgcnt_t ret; 5247 5248 /* do_page_relocate returns 0 on success or errno value */ 5249 ret = do_page_relocate(target, replacement, grouplock, nrelocp, lgrp); 5250 5251 if (ret != 0 || freetarget == 0) { 5252 return (ret); 5253 } 5254 if (*nrelocp == 1) { 5255 ASSERT(*target != NULL); 5256 page_free(*target, 1); 5257 } else { 5258 page_t *tpp = *target; 5259 uint_t szc = tpp->p_szc; 5260 pgcnt_t npgs = page_get_pagecnt(szc); 5261 ASSERT(npgs > 1); 5262 ASSERT(szc != 0); 5263 do { 5264 ASSERT(PAGE_EXCL(tpp)); 5265 ASSERT(!hat_page_is_mapped(tpp)); 5266 ASSERT(tpp->p_szc == szc); 5267 PP_SETFREE(tpp); 5268 PP_SETAGED(tpp); 5269 npgs--; 5270 } while ((tpp = tpp->p_next) != *target); 5271 ASSERT(npgs == 0); 5272 page_list_add_pages(*target, 0); 5273 npgs = page_get_pagecnt(szc); 5274 page_create_putback(npgs); 5275 } 5276 return (ret); 5277 } 5278 5279 /* 5280 * it is up to the caller to deal with pcf accounting. 5281 */ 5282 void 5283 page_free_replacement_page(page_t *pplist) 5284 { 5285 page_t *pp; 5286 5287 while (pplist != NULL) { 5288 /* 5289 * pp_targ is a linked list. 5290 */ 5291 pp = pplist; 5292 if (pp->p_szc == 0) { 5293 page_sub(&pplist, pp); 5294 page_clr_all_props(pp); 5295 PP_SETFREE(pp); 5296 PP_SETAGED(pp); 5297 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL); 5298 page_unlock(pp); 5299 VM_STAT_ADD(pagecnt.pc_free_replacement_page[0]); 5300 } else { 5301 spgcnt_t curnpgs = page_get_pagecnt(pp->p_szc); 5302 page_t *tpp; 5303 page_list_break(&pp, &pplist, curnpgs); 5304 tpp = pp; 5305 do { 5306 ASSERT(PAGE_EXCL(tpp)); 5307 ASSERT(!hat_page_is_mapped(tpp)); 5308 page_clr_all_props(pp); 5309 PP_SETFREE(tpp); 5310 PP_SETAGED(tpp); 5311 } while ((tpp = tpp->p_next) != pp); 5312 page_list_add_pages(pp, 0); 5313 VM_STAT_ADD(pagecnt.pc_free_replacement_page[1]); 5314 } 5315 } 5316 } 5317 5318 /* 5319 * Relocate target to non-relocatable replacement page. 5320 */ 5321 int 5322 page_relocate_cage(page_t **target, page_t **replacement) 5323 { 5324 page_t *tpp, *rpp; 5325 spgcnt_t pgcnt, npgs; 5326 int result; 5327 5328 tpp = *target; 5329 5330 ASSERT(PAGE_EXCL(tpp)); 5331 ASSERT(tpp->p_szc == 0); 5332 5333 pgcnt = btop(page_get_pagesize(tpp->p_szc)); 5334 5335 do { 5336 (void) page_create_wait(pgcnt, PG_WAIT | PG_NORELOC); 5337 rpp = page_get_replacement_page(tpp, NULL, PGR_NORELOC); 5338 if (rpp == NULL) { 5339 page_create_putback(pgcnt); 5340 kcage_cageout_wakeup(); 5341 } 5342 } while (rpp == NULL); 5343 5344 ASSERT(PP_ISNORELOC(rpp)); 5345 5346 result = page_relocate(&tpp, &rpp, 0, 1, &npgs, NULL); 5347 5348 if (result == 0) { 5349 *replacement = rpp; 5350 if (pgcnt != npgs) 5351 panic("page_relocate_cage: partial relocation"); 5352 } 5353 5354 return (result); 5355 } 5356 5357 /* 5358 * Release the page lock on a page, place on cachelist 5359 * tail if no longer mapped. Caller can let us know if 5360 * the page is known to be clean. 
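 *
 * Editorial sketch of typical use (an assumption, not from the original
 * source): callers dispatch on the return value, e.g.
 *
 *	switch (page_release(pp, 1)) {
 *	case PGREL_CLEAN:
 *		(page was disposed of and freed)
 *		break;
 *	case PGREL_MOD:
 *		(page was dirty, so it was only unlocked)
 *		break;
 *	case PGREL_NOTREL:
 *		(page still mapped, locked or swap-backed; only unlocked)
 *		break;
 *	}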
5361 */ 5362 int 5363 page_release(page_t *pp, int checkmod) 5364 { 5365 int status; 5366 5367 ASSERT(PAGE_LOCKED(pp) && !PP_ISFREE(pp) && 5368 (pp->p_vnode != NULL)); 5369 5370 if (!hat_page_is_mapped(pp) && !IS_SWAPVP(pp->p_vnode) && 5371 ((PAGE_SHARED(pp) && page_tryupgrade(pp)) || PAGE_EXCL(pp)) && 5372 pp->p_lckcnt == 0 && pp->p_cowcnt == 0 && 5373 !hat_page_is_mapped(pp)) { 5374 5375 /* 5376 * If page is modified, unlock it 5377 * 5378 * (p_nrm & P_MOD) bit has the latest stuff because: 5379 * (1) We found that this page doesn't have any mappings 5380 * _after_ holding SE_EXCL and 5381 * (2) We didn't drop SE_EXCL lock after the check in (1) 5382 */ 5383 if (checkmod && hat_ismod(pp)) { 5384 page_unlock(pp); 5385 status = PGREL_MOD; 5386 } else { 5387 /*LINTED: constant in conditional context*/ 5388 VN_DISPOSE(pp, B_FREE, 0, kcred); 5389 status = PGREL_CLEAN; 5390 } 5391 } else { 5392 page_unlock(pp); 5393 status = PGREL_NOTREL; 5394 } 5395 return (status); 5396 } 5397 5398 int 5399 page_try_demote_pages(page_t *pp) 5400 { 5401 page_t *tpp, *rootpp = pp; 5402 pfn_t pfn = page_pptonum(pp); 5403 spgcnt_t i, npgs; 5404 uint_t szc = pp->p_szc; 5405 vnode_t *vp = pp->p_vnode; 5406 5407 ASSERT(PAGE_EXCL(rootpp)); 5408 5409 VM_STAT_ADD(pagecnt.pc_try_demote_pages[0]); 5410 5411 if (rootpp->p_szc == 0) { 5412 VM_STAT_ADD(pagecnt.pc_try_demote_pages[1]); 5413 return (1); 5414 } 5415 5416 if (vp != NULL && !IS_SWAPFSVP(vp) && vp != &kvp) { 5417 VM_STAT_ADD(pagecnt.pc_try_demote_pages[2]); 5418 page_demote_vp_pages(rootpp); 5419 ASSERT(pp->p_szc == 0); 5420 return (1); 5421 } 5422 5423 /* 5424 * Adjust rootpp if passed in is not the base 5425 * constituent page. 5426 */ 5427 npgs = page_get_pagecnt(rootpp->p_szc); 5428 ASSERT(npgs > 1); 5429 if (!IS_P2ALIGNED(pfn, npgs)) { 5430 pfn = P2ALIGN(pfn, npgs); 5431 rootpp = page_numtopp_nolock(pfn); 5432 VM_STAT_ADD(pagecnt.pc_try_demote_pages[3]); 5433 ASSERT(rootpp->p_vnode != NULL); 5434 ASSERT(rootpp->p_szc == szc); 5435 } 5436 5437 /* 5438 * We can't demote kernel pages since we can't hat_unload() 5439 * the mappings. 5440 */ 5441 if (rootpp->p_vnode == &kvp) 5442 return (0); 5443 5444 /* 5445 * Attempt to lock all constituent pages except the page passed 5446 * in since it's already locked. 5447 */ 5448 for (tpp = rootpp, i = 0; i < npgs; i++, tpp++) { 5449 ASSERT(!PP_ISFREE(tpp)); 5450 ASSERT(tpp->p_vnode != NULL); 5451 5452 if (tpp != pp && !page_trylock(tpp, SE_EXCL)) 5453 break; 5454 ASSERT(tpp->p_szc == rootpp->p_szc); 5455 ASSERT(page_pptonum(tpp) == page_pptonum(rootpp) + i); 5456 (void) hat_pageunload(tpp, HAT_FORCE_PGUNLOAD); 5457 } 5458 5459 /* 5460 * If we failed to lock them all then unlock what we have locked 5461 * so far and bail. 5462 */ 5463 if (i < npgs) { 5464 tpp = rootpp; 5465 while (i-- > 0) { 5466 if (tpp != pp) 5467 page_unlock(tpp); 5468 tpp++; 5469 } 5470 VM_STAT_ADD(pagecnt.pc_try_demote_pages[4]); 5471 return (0); 5472 } 5473 5474 /* 5475 * XXX probably p_szc clearing and page unlocking can be done within 5476 * one loop but since this is rare code we can play very safe. 5477 */ 5478 for (tpp = rootpp, i = 0; i < npgs; i++, tpp++) { 5479 ASSERT(PAGE_EXCL(tpp)); 5480 tpp->p_szc = 0; 5481 } 5482 5483 /* 5484 * Unlock all pages except the page passed in. 
5485 	 */
5486 	for (tpp = rootpp, i = 0; i < npgs; i++, tpp++) {
5487 		ASSERT(!hat_page_is_mapped(tpp));
5488 		if (tpp != pp)
5489 			page_unlock(tpp);
5490 	}
5491 	VM_STAT_ADD(pagecnt.pc_try_demote_pages[5]);
5492 	return (1);
5493 }
5494 
5495 /*
5496  * Called by page_free() and page_destroy() to demote the page size code
5497  * (p_szc) to 0 (since we can't just put a single PAGESIZE page with nonzero
5498  * p_szc on the free list, nor can we just clear p_szc of a single page_t
5499  * within a large page, since that would break other code that relies on p_szc
5500  * being the same for all page_t's of a large page). Anonymous pages should
5501  * never end up here because anon_map_getpages() cannot deal with p_szc
5502  * changes after a single constituent page is locked. While anonymous and
5503  * kernel large pages are demoted or freed an entire large page at a time,
5504  * with all constituent pages locked EXCL, for file system pages we
5505  * have to be able to demote a large page (i.e. decrease all constituent
5506  * pages' p_szc) with only an EXCL lock on one of the constituent pages. The
5507  * reason we can deal with anonymous page demotion an entire large page at a
5508  * time is that those operations originate at the address space level and
5509  * concern the entire large page region, with actual demotion only done when
5510  * pages are not shared with any other processes (therefore we can always get
5511  * an EXCL lock on all anonymous constituent pages after clearing the segment
5512  * page cache). However, file system pages can be truncated or invalidated at
5513  * a PAGESIZE granularity from the file system side and end up in page_free()
5514  * or page_destroy() (we also allow only part of the large page to be
5515  * SOFTLOCKed, and therefore pageout should be able to demote a large page by
5516  * EXCL locking any constituent page that is not under SOFTLOCK). In those
5517  * cases we cannot rely on being able to lock all constituent pages EXCL.
5518  *
5519  * To prevent szc changes on file system pages one has to lock all constituent
5520  * pages at least SHARED (or call page_szc_lock()). The only subsystem that
5521  * doesn't rely on locking all constituent pages (or on page_szc_lock()) to
5522  * prevent szc changes is the hat layer, which uses its own page-level mlist
5523  * locks. The hat assumes that szc doesn't change after the mlist lock for a
5524  * page is taken. Therefore we need to change szc under hat-level locks if we
5525  * only have an EXCL lock on a single constituent page and the hat still
5526  * references any of the constituent pages. (Note that we can't "ignore" the
5527  * hat layer by simply calling hat_pageunload() on all constituent pages
5528  * without holding EXCL locks on all of them.) We use the hat_page_demote()
5529  * call to safely demote the szc of all constituent pages under hat locks when
5530  * we only have an EXCL lock on one of the constituent pages.
5531  *
5532  * This routine calls page_szc_lock() before calling hat_page_demote() to
5533  * allow segvn, in one special case, not to lock all constituent pages SHARED
5534  * before calling hat_memload_array(), which relies on p_szc not changing even
5535  * before the hat-level mlist lock is taken. In that case segvn uses
5536  * page_szc_lock() to prevent hat_page_demote() changing p_szc values.
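 *
 * As an editorial illustration of that rule (assumed usage, not taken
 * from segvn itself), a reader that needs p_szc to stay stable without
 * locking every constituent page would bracket its use of the field
 * with page_szc_lock(), roughly
 *
 *	mtx = page_szc_lock(pp);
 *	szc = pp->p_szc;	(stable against hat_page_demote() here)
 *	...
 *	if (mtx != NULL)
 *		mutex_exit(mtx);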
5537 * 5538 * Anonymous or kernel page demotion still has to lock all pages exclusively 5539 * and do hat_pageunload() on all constituent pages before demoting the page 5540 * therefore there's no need for anonymous or kernel page demotion to use 5541 * hat_page_demote() mechanism. 5542 * 5543 * hat_page_demote() removes all large mappings that map pp and then decreases 5544 * p_szc starting from the last constituent page of the large page. By working 5545 * from the tail of a large page in pfn decreasing order allows one looking at 5546 * the root page to know that hat_page_demote() is done for root's szc area. 5547 * e.g. if a root page has szc 1 one knows it only has to lock all constituent 5548 * pages within szc 1 area to prevent szc changes because hat_page_demote() 5549 * that started on this page when it had szc > 1 is done for this szc 1 area. 5550 * 5551 * We are guranteed that all constituent pages of pp's large page belong to 5552 * the same vnode with the consecutive offsets increasing in the direction of 5553 * the pfn i.e. the identity of constituent pages can't change until their 5554 * p_szc is decreased. Therefore it's safe for hat_page_demote() to remove 5555 * large mappings to pp even though we don't lock any constituent page except 5556 * pp (i.e. we won't unload e.g. kernel locked page). 5557 */ 5558 static void 5559 page_demote_vp_pages(page_t *pp) 5560 { 5561 kmutex_t *mtx; 5562 5563 ASSERT(PAGE_EXCL(pp)); 5564 ASSERT(!PP_ISFREE(pp)); 5565 ASSERT(pp->p_vnode != NULL); 5566 ASSERT(!IS_SWAPFSVP(pp->p_vnode)); 5567 ASSERT(pp->p_vnode != &kvp); 5568 5569 VM_STAT_ADD(pagecnt.pc_demote_pages[0]); 5570 5571 mtx = page_szc_lock(pp); 5572 if (mtx != NULL) { 5573 hat_page_demote(pp); 5574 mutex_exit(mtx); 5575 } 5576 ASSERT(pp->p_szc == 0); 5577 } 5578 5579 /* 5580 * Page retire operation. 5581 * 5582 * page_retire() 5583 * Attempt to retire (throw away) page pp. We cannot do this if 5584 * the page is dirty; if the page is clean, we can try. We return 0 on 5585 * success, -1 on failure. This routine should be invoked by the platform's 5586 * memory error detection code. 5587 * 5588 * pages_retired_limit_exceeded() 5589 * We set a limit on the number of pages which may be retired. This 5590 * is set to a percentage of total physical memory. This limit is 5591 * enforced here. 5592 */ 5593 5594 static pgcnt_t retired_pgcnt = 0; 5595 5596 /* 5597 * routines to update the count of retired pages 5598 */ 5599 static void 5600 page_retired(page_t *pp) 5601 { 5602 ASSERT(pp); 5603 5604 page_settoxic(pp, PAGE_IS_RETIRED); 5605 atomic_add_long(&retired_pgcnt, 1); 5606 } 5607 5608 static void 5609 retired_page_removed(page_t *pp) 5610 { 5611 ASSERT(pp); 5612 ASSERT(page_isretired(pp)); 5613 ASSERT(retired_pgcnt > 0); 5614 5615 page_clrtoxic(pp); 5616 atomic_add_long(&retired_pgcnt, -1); 5617 } 5618 5619 5620 static int 5621 pages_retired_limit_exceeded() 5622 { 5623 pgcnt_t retired_max; 5624 5625 /* 5626 * If the percentage is zero or is not set correctly, 5627 * return TRUE so that pages are not retired. 5628 */ 5629 if (max_pages_retired_bps <= 0 || 5630 max_pages_retired_bps >= 10000) 5631 return (1); 5632 5633 /* 5634 * Calculate the maximum number of pages allowed to 5635 * be retired as a percentage of total physical memory 5636 * (Remember that we are using basis points, hence the 10000.) 
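	 *
	 * Worked example (editorial): with the default
	 * max_pages_retired_bps of 10 and a physmem of 1048576 pages,
	 * retired_max is (1048576 * 10) / 10000 = 1048 pages, i.e.
	 * roughly 0.1% of physical memory.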
5637 */ 5638 retired_max = (physmem * max_pages_retired_bps) / 10000; 5639 5640 /* 5641 * return 'TRUE' if we have already retired more 5642 * than the legal limit 5643 */ 5644 return (retired_pgcnt >= retired_max); 5645 } 5646 5647 #define PAGE_RETIRE_SELOCK 0 5648 #define PAGE_RETIRE_NORECLAIM 1 5649 #define PAGE_RETIRE_LOCKED 2 5650 #define PAGE_RETIRE_COW 3 5651 #define PAGE_RETIRE_DIRTY 4 5652 #define PAGE_RETIRE_LPAGE 5 5653 #define PAGE_RETIRE_SUCCESS 6 5654 #define PAGE_RETIRE_LIMIT 7 5655 #define PAGE_RETIRE_NCODES 8 5656 5657 typedef struct page_retire_op { 5658 int pr_count; 5659 short pr_unlock; 5660 short pr_retval; 5661 char *pr_message; 5662 } page_retire_op_t; 5663 5664 page_retire_op_t page_retire_ops[PAGE_RETIRE_NCODES] = { 5665 { 0, 0, -1, "cannot lock page" }, 5666 { 0, 0, -1, "cannot reclaim cached page" }, 5667 { 0, 1, -1, "page is locked" }, 5668 { 0, 1, -1, "copy-on-write page" }, 5669 { 0, 1, -1, "page is dirty" }, 5670 { 0, 1, -1, "cannot demote large page" }, 5671 { 0, 0, 0, "page successfully retired" }, 5672 { 0, 0, -1, "excess pages retired already" }, 5673 }; 5674 5675 static int 5676 page_retire_done(page_t *pp, int code) 5677 { 5678 page_retire_op_t *prop = &page_retire_ops[code]; 5679 5680 prop->pr_count++; 5681 5682 if (prop->pr_unlock) 5683 page_unlock(pp); 5684 5685 if (page_retire_messages > 1) { 5686 printf("page_retire(%p) pfn 0x%lx %s: %s\n", 5687 (void *)pp, page_pptonum(pp), 5688 prop->pr_retval == -1 ? "failed" : "succeeded", 5689 prop->pr_message); 5690 } 5691 5692 return (prop->pr_retval); 5693 } 5694 5695 int 5696 page_retire(page_t *pp, uchar_t flag) 5697 { 5698 uint64_t pa = ptob((uint64_t)page_pptonum(pp)); 5699 5700 ASSERT(flag == PAGE_IS_FAILING || flag == PAGE_IS_TOXIC); 5701 5702 /* 5703 * DR operations change the association between a page_t 5704 * and the physical page it represents. Check if the 5705 * page is still bad. 5706 */ 5707 if (!page_isfaulty(pp)) { 5708 page_clrtoxic(pp); 5709 return (page_retire_done(pp, PAGE_RETIRE_SUCCESS)); 5710 } 5711 5712 /* 5713 * We set the flag here so that even if we fail due 5714 * to exceeding the limit for retired pages, the 5715 * page will still be checked and either cleared 5716 * or retired in page_free(). 5717 */ 5718 page_settoxic(pp, flag); 5719 5720 if (flag == PAGE_IS_TOXIC) { 5721 if (page_retire_messages) { 5722 cmn_err(CE_NOTE, "Scheduling clearing of error on" 5723 " page 0x%08x.%08x", 5724 (uint32_t)(pa >> 32), (uint32_t)pa); 5725 } 5726 5727 } else { /* PAGE_IS_FAILING */ 5728 if (pages_retired_limit_exceeded()) { 5729 /* 5730 * Return as we have already exceeded the 5731 * maximum number of pages allowed to be 5732 * retired 5733 */ 5734 return (page_retire_done(pp, PAGE_RETIRE_LIMIT)); 5735 } 5736 5737 if (page_retire_messages) { 5738 cmn_err(CE_NOTE, "Scheduling removal of " 5739 "page 0x%08x.%08x", 5740 (uint32_t)(pa >> 32), (uint32_t)pa); 5741 } 5742 } 5743 5744 if (PAGE_LOCKED(pp) || !page_trylock(pp, SE_EXCL)) 5745 return (page_retire_done(pp, PAGE_RETIRE_SELOCK)); 5746 5747 /* 5748 * If this is a large page we first try and demote it 5749 * to PAGESIZE pages and then dispose of the toxic page. 5750 * On failure we will let the page free/destroy 5751 * code handle it later since this is a mapped page. 5752 * Note that free large pages can always be demoted. 
5753 * 5754 */ 5755 if (pp->p_szc != 0) { 5756 if (PP_ISFREE(pp)) 5757 (void) page_demote_free_pages(pp); 5758 else 5759 (void) page_try_demote_pages(pp); 5760 5761 if (pp->p_szc != 0) 5762 return (page_retire_done(pp, PAGE_RETIRE_LPAGE)); 5763 } 5764 5765 if (PP_ISFREE(pp)) { 5766 if (!page_reclaim(pp, NULL)) 5767 return (page_retire_done(pp, PAGE_RETIRE_NORECLAIM)); 5768 /*LINTED: constant in conditional context*/ 5769 VN_DISPOSE(pp, pp->p_vnode ? B_INVAL : B_FREE, 0, kcred) 5770 return (page_retire_done(pp, PAGE_RETIRE_SUCCESS)); 5771 } 5772 5773 if (pp->p_lckcnt != 0) 5774 return (page_retire_done(pp, PAGE_RETIRE_LOCKED)); 5775 5776 if (pp->p_cowcnt != 0) 5777 return (page_retire_done(pp, PAGE_RETIRE_COW)); 5778 5779 /* 5780 * Unload all translations to this page. No new translations 5781 * can be created while we hold the exclusive lock on the page. 5782 */ 5783 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 5784 5785 if (hat_ismod(pp)) 5786 return (page_retire_done(pp, PAGE_RETIRE_DIRTY)); 5787 5788 /*LINTED: constant in conditional context*/ 5789 VN_DISPOSE(pp, B_INVAL, 0, kcred); 5790 5791 return (page_retire_done(pp, PAGE_RETIRE_SUCCESS)); 5792 } 5793 5794 /* 5795 * Mark any existing pages for migration in the given range 5796 */ 5797 void 5798 page_mark_migrate(struct seg *seg, caddr_t addr, size_t len, 5799 struct anon_map *amp, ulong_t anon_index, vnode_t *vp, 5800 u_offset_t vnoff, int rflag) 5801 { 5802 struct anon *ap; 5803 vnode_t *curvp; 5804 lgrp_t *from; 5805 pgcnt_t i; 5806 pgcnt_t nlocked; 5807 u_offset_t off; 5808 pfn_t pfn; 5809 size_t pgsz; 5810 size_t segpgsz; 5811 pgcnt_t pages; 5812 uint_t pszc; 5813 page_t **ppa; 5814 pgcnt_t ppa_nentries; 5815 page_t *pp; 5816 caddr_t va; 5817 ulong_t an_idx; 5818 anon_sync_obj_t cookie; 5819 5820 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 5821 5822 /* 5823 * Don't do anything if don't need to do lgroup optimizations 5824 * on this system 5825 */ 5826 if (!lgrp_optimizations()) 5827 return; 5828 5829 /* 5830 * Align address and length to (potentially large) page boundary 5831 */ 5832 segpgsz = page_get_pagesize(seg->s_szc); 5833 addr = (caddr_t)P2ALIGN((uintptr_t)addr, segpgsz); 5834 if (rflag) 5835 len = P2ROUNDUP(len, segpgsz); 5836 5837 /* 5838 * Allocate page array to accomodate largest page size 5839 */ 5840 pgsz = page_get_pagesize(page_num_pagesizes() - 1); 5841 ppa_nentries = btop(pgsz); 5842 ppa = kmem_zalloc(ppa_nentries * sizeof (page_t *), KM_SLEEP); 5843 5844 /* 5845 * Do one (large) page at a time 5846 */ 5847 va = addr; 5848 while (va < addr + len) { 5849 /* 5850 * Lookup (root) page for vnode and offset corresponding to 5851 * this virtual address 5852 * Try anonmap first since there may be copy-on-write 5853 * pages, but initialize vnode pointer and offset using 5854 * vnode arguments just in case there isn't an amp. 
5855 */ 5856 curvp = vp; 5857 off = vnoff + va - seg->s_base; 5858 if (amp) { 5859 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 5860 an_idx = anon_index + seg_page(seg, va); 5861 anon_array_enter(amp, an_idx, &cookie); 5862 ap = anon_get_ptr(amp->ahp, an_idx); 5863 if (ap) 5864 swap_xlate(ap, &curvp, &off); 5865 anon_array_exit(&cookie); 5866 ANON_LOCK_EXIT(&->a_rwlock); 5867 } 5868 5869 pp = NULL; 5870 if (curvp) 5871 pp = page_lookup(curvp, off, SE_SHARED); 5872 5873 /* 5874 * If there isn't a page at this virtual address, 5875 * skip to next page 5876 */ 5877 if (pp == NULL) { 5878 va += PAGESIZE; 5879 continue; 5880 } 5881 5882 /* 5883 * Figure out which lgroup this page is in for kstats 5884 */ 5885 pfn = page_pptonum(pp); 5886 from = lgrp_pfn_to_lgrp(pfn); 5887 5888 /* 5889 * Get page size, and round up and skip to next page boundary 5890 * if unaligned address 5891 */ 5892 pszc = pp->p_szc; 5893 pgsz = page_get_pagesize(pszc); 5894 pages = btop(pgsz); 5895 if (!IS_P2ALIGNED(va, pgsz) || 5896 !IS_P2ALIGNED(pfn, pages) || 5897 pgsz > segpgsz) { 5898 pgsz = MIN(pgsz, segpgsz); 5899 page_unlock(pp); 5900 i = btop(P2END((uintptr_t)va, pgsz) - 5901 (uintptr_t)va); 5902 va = (caddr_t)P2END((uintptr_t)va, pgsz); 5903 lgrp_stat_add(from->lgrp_id, LGRP_PMM_FAIL_PGS, i); 5904 continue; 5905 } 5906 5907 /* 5908 * Upgrade to exclusive lock on page 5909 */ 5910 if (!page_tryupgrade(pp)) { 5911 page_unlock(pp); 5912 va += pgsz; 5913 lgrp_stat_add(from->lgrp_id, LGRP_PMM_FAIL_PGS, 5914 btop(pgsz)); 5915 continue; 5916 } 5917 5918 /* 5919 * Remember pages locked exclusively and how many 5920 */ 5921 ppa[0] = pp; 5922 nlocked = 1; 5923 5924 /* 5925 * Lock constituent pages if this is large page 5926 */ 5927 if (pages > 1) { 5928 /* 5929 * Lock all constituents except root page, since it 5930 * should be locked already. 5931 */ 5932 for (i = 1; i < pages; i++) { 5933 pp++; 5934 if (!page_trylock(pp, SE_EXCL)) { 5935 break; 5936 } 5937 if (PP_ISFREE(pp) || 5938 pp->p_szc != pszc) { 5939 /* 5940 * hat_page_demote() raced in with us. 5941 */ 5942 ASSERT(!IS_SWAPFSVP(curvp)); 5943 page_unlock(pp); 5944 break; 5945 } 5946 ppa[nlocked] = pp; 5947 nlocked++; 5948 } 5949 } 5950 5951 /* 5952 * If all constituent pages couldn't be locked, 5953 * unlock pages locked so far and skip to next page. 5954 */ 5955 if (nlocked != pages) { 5956 for (i = 0; i < nlocked; i++) 5957 page_unlock(ppa[i]); 5958 va += pgsz; 5959 lgrp_stat_add(from->lgrp_id, LGRP_PMM_FAIL_PGS, 5960 btop(pgsz)); 5961 continue; 5962 } 5963 5964 /* 5965 * hat_page_demote() can no longer happen 5966 * since last cons page had the right p_szc after 5967 * all cons pages were locked. all cons pages 5968 * should now have the same p_szc. 
5969 */ 5970 5971 /* 5972 * All constituent pages locked successfully, so mark 5973 * large page for migration and unload the mappings of 5974 * constituent pages, so a fault will occur on any part of the 5975 * large page 5976 */ 5977 PP_SETMIGRATE(ppa[0]); 5978 for (i = 0; i < nlocked; i++) { 5979 pp = ppa[i]; 5980 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 5981 ASSERT(hat_page_getshare(pp) == 0); 5982 page_unlock(pp); 5983 } 5984 lgrp_stat_add(from->lgrp_id, LGRP_PMM_PGS, nlocked); 5985 5986 va += pgsz; 5987 } 5988 kmem_free(ppa, ppa_nentries * sizeof (page_t *)); 5989 } 5990 5991 /* 5992 * Migrate any pages that have been marked for migration in the given range 5993 */ 5994 void 5995 page_migrate( 5996 struct seg *seg, 5997 caddr_t addr, 5998 page_t **ppa, 5999 pgcnt_t npages) 6000 { 6001 lgrp_t *from; 6002 lgrp_t *to; 6003 page_t *newpp; 6004 page_t *pp; 6005 pfn_t pfn; 6006 size_t pgsz; 6007 spgcnt_t page_cnt; 6008 spgcnt_t i; 6009 uint_t pszc; 6010 6011 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6012 6013 while (npages > 0) { 6014 pp = *ppa; 6015 pszc = pp->p_szc; 6016 pgsz = page_get_pagesize(pszc); 6017 page_cnt = btop(pgsz); 6018 6019 /* 6020 * Check to see whether this page is marked for migration 6021 * 6022 * Assume that root page of large page is marked for 6023 * migration and none of the other constituent pages 6024 * are marked. This really simplifies clearing the 6025 * migrate bit by not having to clear it from each 6026 * constituent page. 6027 * 6028 * note we don't want to relocate an entire large page if 6029 * someone is only using one subpage. 6030 */ 6031 if (npages < page_cnt) 6032 break; 6033 6034 /* 6035 * Is it marked for migration? 6036 */ 6037 if (!PP_ISMIGRATE(pp)) 6038 goto next; 6039 6040 /* 6041 * Determine lgroups that page is being migrated between 6042 */ 6043 pfn = page_pptonum(pp); 6044 if (!IS_P2ALIGNED(pfn, page_cnt)) { 6045 break; 6046 } 6047 from = lgrp_pfn_to_lgrp(pfn); 6048 to = lgrp_mem_choose(seg, addr, pgsz); 6049 6050 /* 6051 * Check to see whether we are trying to migrate page to lgroup 6052 * where it is allocated already 6053 */ 6054 if (to == from) { 6055 PP_CLRMIGRATE(pp); 6056 goto next; 6057 } 6058 6059 /* 6060 * Need to get exclusive lock's to migrate 6061 */ 6062 for (i = 0; i < page_cnt; i++) { 6063 ASSERT(PAGE_LOCKED(ppa[i])); 6064 if (page_pptonum(ppa[i]) != pfn + i || 6065 ppa[i]->p_szc != pszc) { 6066 break; 6067 } 6068 if (!page_tryupgrade(ppa[i])) { 6069 lgrp_stat_add(from->lgrp_id, 6070 LGRP_PM_FAIL_LOCK_PGS, 6071 page_cnt); 6072 break; 6073 } 6074 } 6075 if (i != page_cnt) { 6076 while (--i != -1) { 6077 page_downgrade(ppa[i]); 6078 } 6079 goto next; 6080 } 6081 6082 (void) page_create_wait(page_cnt, PG_WAIT); 6083 newpp = page_get_replacement_page(pp, to, PGR_SAMESZC); 6084 if (newpp == NULL) { 6085 page_create_putback(page_cnt); 6086 for (i = 0; i < page_cnt; i++) { 6087 page_downgrade(ppa[i]); 6088 } 6089 lgrp_stat_add(to->lgrp_id, LGRP_PM_FAIL_ALLOC_PGS, 6090 page_cnt); 6091 goto next; 6092 } 6093 ASSERT(newpp->p_szc == pszc); 6094 /* 6095 * Clear migrate bit and relocate page 6096 */ 6097 PP_CLRMIGRATE(pp); 6098 if (page_relocate(&pp, &newpp, 0, 1, &page_cnt, to)) { 6099 panic("page_migrate: page_relocate failed"); 6100 } 6101 ASSERT(page_cnt * PAGESIZE == pgsz); 6102 6103 /* 6104 * Keep stats for number of pages migrated from and to 6105 * each lgroup 6106 */ 6107 lgrp_stat_add(from->lgrp_id, LGRP_PM_SRC_PGS, page_cnt); 6108 lgrp_stat_add(to->lgrp_id, LGRP_PM_DEST_PGS, page_cnt); 6109 /* 
6110 * update the page_t array we were passed in and 6111 * unlink constituent pages of a large page. 6112 */ 6113 for (i = 0; i < page_cnt; ++i, ++pp) { 6114 ASSERT(PAGE_EXCL(newpp)); 6115 ASSERT(newpp->p_szc == pszc); 6116 ppa[i] = newpp; 6117 pp = newpp; 6118 page_sub(&newpp, pp); 6119 page_downgrade(pp); 6120 } 6121 ASSERT(newpp == NULL); 6122 next: 6123 addr += pgsz; 6124 ppa += page_cnt; 6125 npages -= page_cnt; 6126 } 6127 } 6128 6129 /* 6130 * initialize the vnode for retired pages 6131 */ 6132 static void 6133 page_retired_init(void) 6134 { 6135 vn_setops(&retired_ppages, &retired_vnodeops); 6136 } 6137 6138 /* ARGSUSED */ 6139 static void 6140 retired_dispose(vnode_t *vp, page_t *pp, int flag, int dn, cred_t *cr) 6141 { 6142 panic("retired_dispose invoked"); 6143 } 6144 6145 /* ARGSUSED */ 6146 static void 6147 retired_inactive(vnode_t *vp, cred_t *cr) 6148 {} 6149 6150 void 6151 page_unretire_pages(void) 6152 { 6153 page_t *pp; 6154 kmutex_t *vphm; 6155 vnode_t *vp; 6156 page_t *rpages[UNRETIRE_PAGES]; 6157 pgcnt_t i, npages, rmem; 6158 uint64_t pa; 6159 6160 rmem = 0; 6161 6162 for (;;) { 6163 /* 6164 * We do this in 2 steps: 6165 * 6166 * 1. We walk the retired pages list and collect a list of 6167 * pages that have the toxic field cleared. 6168 * 6169 * 2. We iterate through the page list and unretire each one. 6170 * 6171 * We have to do it in two steps on account of the mutexes that 6172 * we need to acquire. 6173 */ 6174 6175 vp = &retired_ppages; 6176 vphm = page_vnode_mutex(vp); 6177 mutex_enter(vphm); 6178 6179 if ((pp = vp->v_pages) == NULL) { 6180 mutex_exit(vphm); 6181 break; 6182 } 6183 6184 i = 0; 6185 do { 6186 ASSERT(pp != NULL); 6187 ASSERT(pp->p_vnode == vp); 6188 6189 /* 6190 * DR operations change the association between a page_t 6191 * and the physical page it represents. Check if the 6192 * page is still bad. If not, unretire it. 6193 */ 6194 if (!page_isfaulty(pp)) 6195 rpages[i++] = pp; 6196 6197 pp = pp->p_vpnext; 6198 } while ((pp != vp->v_pages) && (i < UNRETIRE_PAGES)); 6199 6200 mutex_exit(vphm); 6201 6202 npages = i; 6203 for (i = 0; i < npages; i++) { 6204 pp = rpages[i]; 6205 pa = ptob((uint64_t)page_pptonum(pp)); 6206 6207 /* 6208 * Need to upgrade the shared lock to an exclusive 6209 * lock in order to hash out the page. 6210 * 6211 * The page could have been retired but the page lock 6212 * may not have been downgraded yet. If so, skip this 6213 * page. page_free() will call this function after the 6214 * lock is downgraded. 6215 */ 6216 6217 if (!PAGE_SHARED(pp) || !page_tryupgrade(pp)) 6218 continue; 6219 6220 /* 6221 * Both page_free() and DR call this function. They 6222 * can potentially call this function at the same 6223 * time and race with each other. 6224 */ 6225 if (!page_isretired(pp) || page_isfaulty(pp)) { 6226 page_downgrade(pp); 6227 continue; 6228 } 6229 6230 cmn_err(CE_NOTE, 6231 "unretiring retired page 0x%08x.%08x", 6232 (uint32_t)(pa >> 32), (uint32_t)pa); 6233 6234 /* 6235 * When a page is removed from the retired pages vnode, 6236 * its toxic field is also cleared. So, we do not have 6237 * to do that seperately here. 6238 */ 6239 page_hashout(pp, (kmutex_t *)NULL); 6240 6241 /* 6242 * This is a good page. So, free it. 6243 */ 6244 pp->p_vnode = NULL; 6245 page_free(pp, 1); 6246 rmem++; 6247 } 6248 6249 /* 6250 * If the rpages array was filled up, then there could be more 6251 * retired pages that are not faulty. We need to iterate 6252 * again and unretire them. Otherwise, we are done. 
6253 */ 6254 if (npages < UNRETIRE_PAGES) 6255 break; 6256 } 6257 6258 mutex_enter(&freemem_lock); 6259 availrmem += rmem; 6260 mutex_exit(&freemem_lock); 6261 } 6262 6263 ulong_t mem_waiters = 0; 6264 ulong_t max_count = 20; 6265 #define MAX_DELAY 0x1ff 6266 6267 /* 6268 * Check if enough memory is available to proceed. 6269 * Depending on system configuration and how much memory is 6270 * reserved for swap we need to check against two variables. 6271 * e.g. on systems with little physical swap availrmem can be 6272 * more reliable indicator of how much memory is available. 6273 * On systems with large phys swap freemem can be better indicator. 6274 * If freemem drops below threshold level don't return an error 6275 * immediately but wake up pageout to free memory and block. 6276 * This is done number of times. If pageout is not able to free 6277 * memory within certain time return an error. 6278 * The same applies for availrmem but kmem_reap is used to 6279 * free memory. 6280 */ 6281 int 6282 page_mem_avail(pgcnt_t npages) 6283 { 6284 ulong_t count; 6285 6286 #if defined(__i386) 6287 if (freemem > desfree + npages && 6288 availrmem > swapfs_reserve + npages && 6289 btop(vmem_size(heap_arena, VMEM_FREE)) > tune.t_minarmem + 6290 npages) 6291 return (1); 6292 #else 6293 if (freemem > desfree + npages && 6294 availrmem > swapfs_reserve + npages) 6295 return (1); 6296 #endif 6297 6298 count = max_count; 6299 atomic_add_long(&mem_waiters, 1); 6300 6301 while (freemem < desfree + npages && --count) { 6302 cv_signal(&proc_pageout->p_cv); 6303 if (delay_sig(hz + (mem_waiters & MAX_DELAY))) { 6304 atomic_add_long(&mem_waiters, -1); 6305 return (0); 6306 } 6307 } 6308 if (count == 0) { 6309 atomic_add_long(&mem_waiters, -1); 6310 return (0); 6311 } 6312 6313 count = max_count; 6314 while (availrmem < swapfs_reserve + npages && --count) { 6315 kmem_reap(); 6316 if (delay_sig(hz + (mem_waiters & MAX_DELAY))) { 6317 atomic_add_long(&mem_waiters, -1); 6318 return (0); 6319 } 6320 } 6321 atomic_add_long(&mem_waiters, -1); 6322 if (count == 0) 6323 return (0); 6324 6325 #if defined(__i386) 6326 if (btop(vmem_size(heap_arena, VMEM_FREE)) < 6327 tune.t_minarmem + npages) 6328 return (0); 6329 #endif 6330 return (1); 6331 } 6332 6333 6334 /* 6335 * Search the memory segments to locate the desired page. Within a 6336 * segment, pages increase linearly with one page structure per 6337 * physical page frame (size PAGESIZE). The search begins 6338 * with the segment that was accessed last, to take advantage of locality. 6339 * If the hint misses, we start from the beginning of the sorted memseg list 6340 */ 6341 6342 6343 /* 6344 * Some data structures for pfn to pp lookup. 
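 *
 * Editorial summary of the lookup order used by page_numtopp_nolock()
 * below (not original commentary): the per-CPU "last winner" memseg is
 * probed first, then the memseg_hash[] slot for the pfn, and only then
 * is the memsegs list scanned linearly, roughly
 *
 *	seg = memseg_hash[MEMSEG_PFN_HASH(pfnum)];
 *	if (seg != NULL && pfnum >= seg->pages_base &&
 *	    pfnum < seg->pages_end)
 *		pp = seg->pages + (pfnum - seg->pages_base);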

/*
 * Search the memory segments to locate the desired page. Within a
 * segment, pages increase linearly with one page structure per
 * physical page frame (size PAGESIZE). The search begins
 * with the segment that was accessed last, to take advantage of locality.
 * If the hint misses, we start from the beginning of the sorted memseg list
 */


/*
 * Some data structures for pfn to pp lookup.
 */
ulong_t mhash_per_slot;
struct memseg *memseg_hash[N_MEM_SLOTS];

page_t *
page_numtopp_nolock(pfn_t pfnum)
{
	struct memseg *seg;
	page_t *pp;
	vm_cpu_data_t *vc = CPU->cpu_vm_data;

	ASSERT(vc != NULL);

	MEMSEG_STAT_INCR(nsearch);

	/* Try last winner first */
	if (((seg = vc->vc_pnum_memseg) != NULL) &&
	    (pfnum >= seg->pages_base) && (pfnum < seg->pages_end)) {
		MEMSEG_STAT_INCR(nlastwon);
		pp = seg->pages + (pfnum - seg->pages_base);
		if (pp->p_pagenum == pfnum)
			return ((page_t *)pp);
	}

	/* Else Try hash */
	if (((seg = memseg_hash[MEMSEG_PFN_HASH(pfnum)]) != NULL) &&
	    (pfnum >= seg->pages_base) && (pfnum < seg->pages_end)) {
		MEMSEG_STAT_INCR(nhashwon);
		vc->vc_pnum_memseg = seg;
		pp = seg->pages + (pfnum - seg->pages_base);
		if (pp->p_pagenum == pfnum)
			return ((page_t *)pp);
	}

	/* Else Brute force */
	for (seg = memsegs; seg != NULL; seg = seg->next) {
		if (pfnum >= seg->pages_base && pfnum < seg->pages_end) {
			vc->vc_pnum_memseg = seg;
			pp = seg->pages + (pfnum - seg->pages_base);
			return ((page_t *)pp);
		}
	}
	vc->vc_pnum_memseg = NULL;
	MEMSEG_STAT_INCR(nnotfound);
	return ((page_t *)NULL);

}

struct memseg *
page_numtomemseg_nolock(pfn_t pfnum)
{
	struct memseg *seg;
	page_t *pp;

	/* Try hash */
	if (((seg = memseg_hash[MEMSEG_PFN_HASH(pfnum)]) != NULL) &&
	    (pfnum >= seg->pages_base) && (pfnum < seg->pages_end)) {
		pp = seg->pages + (pfnum - seg->pages_base);
		if (pp->p_pagenum == pfnum)
			return (seg);
	}

	/* Else Brute force */
	for (seg = memsegs; seg != NULL; seg = seg->next) {
		if (pfnum >= seg->pages_base && pfnum < seg->pages_end) {
			return (seg);
		}
	}
	return ((struct memseg *)NULL);
}

/*
 * Given a page and a count return the page struct that is
 * n structs away from the current one in the global page
 * list.
 *
 * This function wraps to the first page upon
 * reaching the end of the memseg list.
 */
page_t *
page_nextn(page_t *pp, ulong_t n)
{
	struct memseg *seg;
	page_t *ppn;
	vm_cpu_data_t *vc = (vm_cpu_data_t *)CPU->cpu_vm_data;

	ASSERT(vc != NULL);

	if (((seg = vc->vc_pnext_memseg) == NULL) ||
	    (seg->pages_base == seg->pages_end) ||
	    !(pp >= seg->pages && pp < seg->epages)) {

		for (seg = memsegs; seg; seg = seg->next) {
			if (pp >= seg->pages && pp < seg->epages)
				break;
		}

		if (seg == NULL) {
			/* Memory delete got in, return something valid. */
			/* TODO: fix me. */
			seg = memsegs;
			pp = seg->pages;
		}
	}

	/* check for wraparound - possible if n is large */
	while ((ppn = (pp + n)) >= seg->epages || ppn < pp) {
		n -= seg->epages - pp;
		seg = seg->next;
		if (seg == NULL)
			seg = memsegs;
		pp = seg->pages;
	}
	vc->vc_pnext_memseg = seg;
	return (ppn);
}

/*
 * Initialize for a loop using page_next_scan_large().
 */
page_t *
page_next_scan_init(void **cookie)
{
	ASSERT(cookie != NULL);
	*cookie = (void *)memsegs;
	return ((page_t *)memsegs->pages);
}
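
/*
 * Illustrative sketch (not part of the original source): the intended use
 * of page_next_scan_init()/page_next_scan_large() is a scan that steps
 * over whole large pages at a time.  example_scan(), the EXAMPLE_ONLY
 * guard and the use of total_pages as a loop bound are assumptions made
 * only for this example, so the code is compiled out.
 */
#ifdef	EXAMPLE_ONLY
static void
example_scan(void)
{
	void *cookie;
	ulong_t count = 0;
	page_t *pp = page_next_scan_init(&cookie);

	while (count < total_pages) {
		/* inspect pp here; the scan itself takes no page locks */
		pp = page_next_scan_large(pp, &count, &cookie);
	}
}
#endif	/* EXAMPLE_ONLY */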

/*
 * Return the next page in a scan of page_t's, assuming we want
 * to skip over sub-pages within larger page sizes.
 *
 * The cookie is used to keep track of the current memseg.
 */
page_t *
page_next_scan_large(
	page_t *pp,
	ulong_t *n,
	void **cookie)
{
	struct memseg *seg = (struct memseg *)*cookie;
	page_t *new_pp;
	ulong_t cnt;
	pfn_t pfn;


	/*
	 * get the count of page_t's to skip based on the page size
	 */
	ASSERT(pp != NULL);
	if (pp->p_szc == 0) {
		cnt = 1;
	} else {
		pfn = page_pptonum(pp);
		cnt = page_get_pagecnt(pp->p_szc);
		cnt -= pfn & (cnt - 1);
	}
	*n += cnt;
	new_pp = pp + cnt;

	/*
	 * Catch if we went past the end of the current memory segment. If so,
	 * just move to the next segment with pages.
	 */
	if (new_pp >= seg->epages) {
		do {
			seg = seg->next;
			if (seg == NULL)
				seg = memsegs;
		} while (seg->pages == seg->epages);
		new_pp = seg->pages;
		*cookie = (void *)seg;
	}

	return (new_pp);
}


/*
 * Returns next page in list. Note: this function wraps
 * to the first page in the list upon reaching the end
 * of the list. Callers should be aware of this fact.
 */

/* We should change this to be a #define */

page_t *
page_next(page_t *pp)
{
	return (page_nextn(pp, 1));
}

page_t *
page_first()
{
	return ((page_t *)memsegs->pages);
}


/*
 * This routine is called at boot with the initial memory configuration
 * and when memory is added or removed.
 */
void
build_pfn_hash()
{
	pfn_t cur;
	pgcnt_t index;
	struct memseg *pseg;
	int i;

	/*
	 * Clear memseg_hash array.
	 * Since memory add/delete is designed to operate concurrently
	 * with normal operation, the hash rebuild must be able to run
	 * concurrently with page_numtopp_nolock(). To support this
	 * functionality, assignments to memseg_hash array members must
	 * be done atomically.
	 *
	 * NOTE: bzero() does not currently guarantee this for kernel
	 * threads, and cannot be used here.
	 */
	for (i = 0; i < N_MEM_SLOTS; i++)
		memseg_hash[i] = NULL;

	hat_kpm_mseghash_clear(N_MEM_SLOTS);

	/*
	 * Physmax is the last valid pfn.
	 */
	mhash_per_slot = (physmax + 1) >> MEM_HASH_SHIFT;
	for (pseg = memsegs; pseg != NULL; pseg = pseg->next) {
		index = MEMSEG_PFN_HASH(pseg->pages_base);
		cur = pseg->pages_base;
		do {
			if (index >= N_MEM_SLOTS)
				index = MEMSEG_PFN_HASH(cur);

			if (memseg_hash[index] == NULL ||
			    memseg_hash[index]->pages_base > pseg->pages_base) {
				memseg_hash[index] = pseg;
				hat_kpm_mseghash_update(index, pseg);
			}
			cur += mhash_per_slot;
			index++;
		} while (cur < pseg->pages_end);
	}
}

/*
 * Return the pagenum for the pp
 */
pfn_t
page_pptonum(page_t *pp)
{
	return (pp->p_pagenum);
}

/*
 * interface to the referenced and modified etc bits
 * in the PSM part of the page struct
 * when no locking is desired.
 */
void
page_set_props(page_t *pp, uint_t flags)
{
	ASSERT((flags & ~(P_MOD | P_REF | P_RO)) == 0);
	pp->p_nrm |= (uchar_t)flags;
}

void
page_clr_all_props(page_t *pp)
{
	pp->p_nrm = 0;
}
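
/*
 * Illustrative sketch (not part of the original source): since page_next()
 * wraps back to the first page, a full walk of the page list is normally
 * terminated by comparing against the starting page rather than by
 * counting.  example_walk_all_pages() and the EXAMPLE_ONLY guard are
 * invented names, so the code is compiled out.
 */
#ifdef	EXAMPLE_ONLY
static void
example_walk_all_pages(void)
{
	page_t *first = page_first();
	page_t *pp = first;

	do {
		/* examine *pp; nothing here assumes any page locks */
		pp = page_next(pp);
	} while (pp != first);
}
#endif	/* EXAMPLE_ONLY */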

/*
 * The following function is called from free_vp_pages()
 * for an inexact estimate of a newly freed page...
 */
ulong_t
page_share_cnt(page_t *pp)
{
	return (hat_page_getshare(pp));
}

/*
 * The following functions are used in handling memory
 * errors.
 */

int
page_istoxic(page_t *pp)
{
	return ((pp->p_toxic & PAGE_IS_TOXIC) == PAGE_IS_TOXIC);
}

int
page_isfailing(page_t *pp)
{
	return ((pp->p_toxic & PAGE_IS_FAILING) == PAGE_IS_FAILING);
}

int
page_isretired(page_t *pp)
{
	return ((pp->p_toxic & PAGE_IS_RETIRED) == PAGE_IS_RETIRED);
}

int
page_deteriorating(page_t *pp)
{
	return ((pp->p_toxic & (PAGE_IS_TOXIC | PAGE_IS_FAILING)) != 0);
}

void
page_settoxic(page_t *pp, uchar_t flag)
{
	uchar_t new_flag = 0;
	while ((new_flag & flag) != flag) {
		uchar_t old_flag = pp->p_toxic;
		new_flag = old_flag | flag;
		(void) cas8(&pp->p_toxic, old_flag, new_flag);
		new_flag = ((volatile page_t *)pp)->p_toxic;
	}
}

void
page_clrtoxic(page_t *pp)
{
	/*
	 * We don't need to worry about atomicity on the
	 * p_toxic flag here as this is only called from
	 * page_free() while holding an exclusive lock on
	 * the page.
	 */
	pp->p_toxic = PAGE_IS_OK;
}

void
page_clrtoxic_flag(page_t *pp, uchar_t flag)
{
	uchar_t new_flag = ((volatile page_t *)pp)->p_toxic;
	while ((new_flag & flag) == flag) {
		uchar_t old_flag = new_flag;
		new_flag = old_flag & ~flag;
		(void) cas8(&pp->p_toxic, old_flag, new_flag);
		new_flag = ((volatile page_t *)pp)->p_toxic;
	}
}

int
page_isfaulty(page_t *pp)
{
	return ((pp->p_toxic & PAGE_IS_FAULTY) == PAGE_IS_FAULTY);
}

/*
 * The following four functions are called from /proc code
 * for the /proc/<pid>/xmap interface.
 */
int
page_isshared(page_t *pp)
{
	return (hat_page_getshare(pp) > 1);
}

int
page_isfree(page_t *pp)
{
	return (PP_ISFREE(pp));
}

int
page_isref(page_t *pp)
{
	return (hat_page_getattr(pp, P_REF));
}

int
page_ismod(page_t *pp)
{
	return (hat_page_getattr(pp, P_MOD));
}
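
/*
 * Illustrative sketch (not part of the original source): packing the
 * per-page predicates above into a small flag word, in the spirit of the
 * /proc/<pid>/xmap consumers.  The EX_* values, example_page_flags() and
 * the EXAMPLE_ONLY guard are invented for this example (they are not the
 * real /proc encoding), so the code is compiled out.
 */
#ifdef	EXAMPLE_ONLY
#define	EX_SHARED	0x1
#define	EX_FREE		0x2
#define	EX_REF		0x4
#define	EX_MOD		0x8

static uint_t
example_page_flags(page_t *pp)
{
	uint_t flags = 0;

	if (page_isshared(pp))
		flags |= EX_SHARED;
	if (page_isfree(pp))
		flags |= EX_FREE;
	if (page_isref(pp))
		flags |= EX_REF;
	if (page_ismod(pp))
		flags |= EX_MOD;
	return (flags);
}
#endif	/* EXAMPLE_ONLY */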