1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ 28 /* All Rights Reserved */ 29 30 /* 31 * University Copyright- Copyright (c) 1982, 1986, 1988 32 * The Regents of the University of California 33 * All Rights Reserved 34 * 35 * University Acknowledgment- Portions of this document are derived from 36 * software developed by the University of California, Berkeley, and its 37 * contributors. 38 */ 39 40 #pragma ident "%Z%%M% %I% %E% SMI" 41 42 /* 43 * VM - physical page management. 44 */ 45 46 #include <sys/types.h> 47 #include <sys/t_lock.h> 48 #include <sys/param.h> 49 #include <sys/systm.h> 50 #include <sys/errno.h> 51 #include <sys/time.h> 52 #include <sys/vnode.h> 53 #include <sys/vm.h> 54 #include <sys/vtrace.h> 55 #include <sys/swap.h> 56 #include <sys/cmn_err.h> 57 #include <sys/tuneable.h> 58 #include <sys/sysmacros.h> 59 #include <sys/cpuvar.h> 60 #include <sys/callb.h> 61 #include <sys/debug.h> 62 #include <sys/tnf_probe.h> 63 #include <sys/condvar_impl.h> 64 #include <sys/mem_config.h> 65 #include <sys/mem_cage.h> 66 #include <sys/kmem.h> 67 #include <sys/atomic.h> 68 #include <sys/strlog.h> 69 #include <sys/mman.h> 70 #include <sys/ontrap.h> 71 #include <sys/lgrp.h> 72 #include <sys/vfs.h> 73 74 #include <vm/hat.h> 75 #include <vm/anon.h> 76 #include <vm/page.h> 77 #include <vm/seg.h> 78 #include <vm/pvn.h> 79 #include <vm/seg_kmem.h> 80 #include <vm/vm_dep.h> 81 82 #include <fs/fs_subr.h> 83 84 static int nopageage = 0; 85 86 static pgcnt_t max_page_get; /* max page_get request size in pages */ 87 pgcnt_t total_pages = 0; /* total number of pages (used by /proc) */ 88 89 /* 90 * vnode for all pages which are retired from the VM system; 91 * such as pages with Uncorrectable Errors. 92 */ 93 struct vnode retired_ppages; 94 95 static void page_retired_init(void); 96 static void retired_dispose(vnode_t *vp, page_t *pp, int flag, 97 int dn, cred_t *cr); 98 static void retired_inactive(vnode_t *vp, cred_t *cr); 99 static void page_retired(page_t *pp); 100 static void retired_page_removed(page_t *pp); 101 void page_unretire_pages(void); 102 103 /* 104 * The maximum number of pages that will be unretired in one iteration. 105 * This number is totally arbitrary. 106 */ 107 #define UNRETIRE_PAGES 256 108 109 /* 110 * We limit the number of pages that may be retired to 111 * a percentage of the total physical memory. Note that 112 * the percentage values are stored as 'basis points', 113 * ie, 100 basis points is 1%. 114 */ 115 #define MAX_PAGES_RETIRED_BPS_DEFAULT 10 /* .1% */ 116 117 uint64_t max_pages_retired_bps = MAX_PAGES_RETIRED_BPS_DEFAULT; 118 119 static int pages_retired_limit_exceeded(void); 120 121 /* 122 * operations vector for vnode with retired pages. Only VOP_DISPOSE 123 * and VOP_INACTIVE are intercepted. 124 */ 125 struct vnodeops retired_vnodeops = { 126 "retired_vnodeops", 127 fs_nosys, /* open */ 128 fs_nosys, /* close */ 129 fs_nosys, /* read */ 130 fs_nosys, /* write */ 131 fs_nosys, /* ioctl */ 132 fs_nosys, /* setfl */ 133 fs_nosys, /* getattr */ 134 fs_nosys, /* setattr */ 135 fs_nosys, /* access */ 136 fs_nosys, /* lookup */ 137 fs_nosys, /* create */ 138 fs_nosys, /* remove */ 139 fs_nosys, /* link */ 140 fs_nosys, /* rename */ 141 fs_nosys, /* mkdir */ 142 fs_nosys, /* rmdir */ 143 fs_nosys, /* readdir */ 144 fs_nosys, /* symlink */ 145 fs_nosys, /* readlink */ 146 fs_nosys, /* fsync */ 147 retired_inactive, 148 fs_nosys, /* fid */ 149 fs_rwlock, /* rwlock */ 150 fs_rwunlock, /* rwunlock */ 151 fs_nosys, /* seek */ 152 fs_nosys, /* cmp */ 153 fs_nosys, /* frlock */ 154 fs_nosys, /* space */ 155 fs_nosys, /* realvp */ 156 fs_nosys, /* getpage */ 157 fs_nosys, /* putpage */ 158 fs_nosys_map, 159 fs_nosys_addmap, 160 fs_nosys, /* delmap */ 161 fs_nosys_poll, 162 fs_nosys, /* dump */ 163 fs_nosys, /* l_pathconf */ 164 fs_nosys, /* pageio */ 165 fs_nosys, /* dumpctl */ 166 retired_dispose, 167 fs_nosys, /* setsecattr */ 168 fs_nosys, /* getsecatt */ 169 fs_nosys, /* shrlock */ 170 fs_vnevent_nosupport /* vnevent */ 171 }; 172 173 /* 174 * freemem_lock protects all freemem variables: 175 * availrmem. Also this lock protects the globals which track the 176 * availrmem changes for accurate kernel footprint calculation. 177 * See below for an explanation of these 178 * globals. 179 */ 180 kmutex_t freemem_lock; 181 pgcnt_t availrmem; 182 pgcnt_t availrmem_initial; 183 184 /* 185 * These globals track availrmem changes to get a more accurate 186 * estimate of tke kernel size. Historically pp_kernel is used for 187 * kernel size and is based on availrmem. But availrmem is adjusted for 188 * locked pages in the system not just for kernel locked pages. 189 * These new counters will track the pages locked through segvn and 190 * by explicit user locking. 191 * 192 * segvn_pages_locked : This keeps track on a global basis how many pages 193 * are currently locked because of I/O. 194 * 195 * pages_locked : How many pages are locked becuase of user specified 196 * locking through mlock or plock. 197 * 198 * pages_useclaim,pages_claimed : These two variables track the 199 * cliam adjustments because of the protection changes on a segvn segment. 200 * 201 * All these globals are protected by the same lock which protects availrmem. 202 */ 203 pgcnt_t segvn_pages_locked; 204 pgcnt_t pages_locked; 205 pgcnt_t pages_useclaim; 206 pgcnt_t pages_claimed; 207 208 209 /* 210 * new_freemem_lock protects freemem, freemem_wait & freemem_cv. 211 */ 212 static kmutex_t new_freemem_lock; 213 static uint_t freemem_wait; /* someone waiting for freemem */ 214 static kcondvar_t freemem_cv; 215 216 /* 217 * The logical page free list is maintained as two lists, the 'free' 218 * and the 'cache' lists. 219 * The free list contains those pages that should be reused first. 220 * 221 * The implementation of the lists is machine dependent. 222 * page_get_freelist(), page_get_cachelist(), 223 * page_list_sub(), and page_list_add() 224 * form the interface to the machine dependent implementation. 225 * 226 * Pages with p_free set are on the cache list. 227 * Pages with p_free and p_age set are on the free list, 228 * 229 * A page may be locked while on either list. 230 */ 231 232 /* 233 * free list accounting stuff. 234 * 235 * 236 * Spread out the value for the number of pages on the 237 * page free and page cache lists. If there is just one 238 * value, then it must be under just one lock. 239 * The lock contention and cache traffic are a real bother. 240 * 241 * When we acquire and then drop a single pcf lock 242 * we can start in the middle of the array of pcf structures. 243 * If we acquire more than one pcf lock at a time, we need to 244 * start at the front to avoid deadlocking. 245 * 246 * pcf_count holds the number of pages in each pool. 247 * 248 * pcf_block is set when page_create_get_something() has asked the 249 * PSM page freelist and page cachelist routines without specifying 250 * a color and nothing came back. This is used to block anything 251 * else from moving pages from one list to the other while the 252 * lists are searched again. If a page is freeed while pcf_block is 253 * set, then pcf_reserve is incremented. pcgs_unblock() takes care 254 * of clearning pcf_block, doing the wakeups, etc. 255 */ 256 257 #if NCPU <= 4 258 #define PAD 1 259 #define PCF_FANOUT 4 260 static uint_t pcf_mask = PCF_FANOUT - 1; 261 #else 262 #define PAD 9 263 #ifdef sun4v 264 #define PCF_FANOUT 32 265 #else 266 #define PCF_FANOUT 128 267 #endif 268 static uint_t pcf_mask = PCF_FANOUT - 1; 269 #endif 270 271 struct pcf { 272 uint_t pcf_touch; /* just to help the cache */ 273 uint_t pcf_count; /* page count */ 274 kmutex_t pcf_lock; /* protects the structure */ 275 uint_t pcf_wait; /* number of waiters */ 276 uint_t pcf_block; /* pcgs flag to page_free() */ 277 uint_t pcf_reserve; /* pages freed after pcf_block set */ 278 uint_t pcf_fill[PAD]; /* to line up on the caches */ 279 }; 280 281 static struct pcf pcf[PCF_FANOUT]; 282 #define PCF_INDEX() ((CPU->cpu_id) & (pcf_mask)) 283 284 kmutex_t pcgs_lock; /* serializes page_create_get_ */ 285 kmutex_t pcgs_cagelock; /* serializes NOSLEEP cage allocs */ 286 kmutex_t pcgs_wait_lock; /* used for delay in pcgs */ 287 static kcondvar_t pcgs_cv; /* cv for delay in pcgs */ 288 289 #define PAGE_LOCK_MAXIMUM \ 290 ((1 << (sizeof (((page_t *)0)->p_lckcnt) * NBBY)) - 1) 291 292 /* 293 * Control over the verbosity of page retirement. When set to zero, no messages 294 * will be printed. A value of one will trigger messages for retirement 295 * operations, and is intended for processors which don't yet support FMA 296 * (spitfire). Two will cause verbose messages to be printed when retirements 297 * complete, and is intended only for debugging purposes. 298 */ 299 int page_retire_messages = 0; 300 301 #ifdef VM_STATS 302 303 /* 304 * No locks, but so what, they are only statistics. 305 */ 306 307 static struct page_tcnt { 308 int pc_free_cache; /* free's into cache list */ 309 int pc_free_dontneed; /* free's with dontneed */ 310 int pc_free_pageout; /* free's from pageout */ 311 int pc_free_free; /* free's into free list */ 312 int pc_free_pages; /* free's into large page free list */ 313 int pc_destroy_pages; /* large page destroy's */ 314 int pc_get_cache; /* get's from cache list */ 315 int pc_get_free; /* get's from free list */ 316 int pc_reclaim; /* reclaim's */ 317 int pc_abortfree; /* abort's of free pages */ 318 int pc_find_hit; /* find's that find page */ 319 int pc_find_miss; /* find's that don't find page */ 320 int pc_destroy_free; /* # of free pages destroyed */ 321 #define PC_HASH_CNT (4*PAGE_HASHAVELEN) 322 int pc_find_hashlen[PC_HASH_CNT+1]; 323 int pc_addclaim_pages; 324 int pc_subclaim_pages; 325 int pc_free_replacement_page[2]; 326 int pc_try_demote_pages[6]; 327 int pc_demote_pages[2]; 328 } pagecnt; 329 330 uint_t hashin_count; 331 uint_t hashin_not_held; 332 uint_t hashin_already; 333 334 uint_t hashout_count; 335 uint_t hashout_not_held; 336 337 uint_t page_create_count; 338 uint_t page_create_not_enough; 339 uint_t page_create_not_enough_again; 340 uint_t page_create_zero; 341 uint_t page_create_hashout; 342 uint_t page_create_page_lock_failed; 343 uint_t page_create_trylock_failed; 344 uint_t page_create_found_one; 345 uint_t page_create_hashin_failed; 346 uint_t page_create_dropped_phm; 347 348 uint_t page_create_new; 349 uint_t page_create_exists; 350 uint_t page_create_putbacks; 351 uint_t page_create_overshoot; 352 353 uint_t page_reclaim_zero; 354 uint_t page_reclaim_zero_locked; 355 356 uint_t page_rename_exists; 357 uint_t page_rename_count; 358 359 uint_t page_lookup_cnt[20]; 360 uint_t page_lookup_nowait_cnt[10]; 361 uint_t page_find_cnt; 362 uint_t page_exists_cnt; 363 uint_t page_exists_forreal_cnt; 364 uint_t page_lookup_dev_cnt; 365 uint_t get_cachelist_cnt; 366 uint_t page_create_cnt[10]; 367 uint_t alloc_pages[8]; 368 uint_t page_exphcontg[19]; 369 uint_t page_create_large_cnt[10]; 370 371 /* 372 * Collects statistics. 373 */ 374 #define PAGE_HASH_SEARCH(index, pp, vp, off) { \ 375 uint_t mylen = 0; \ 376 \ 377 for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash, mylen++) { \ 378 if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \ 379 break; \ 380 } \ 381 if ((pp) != NULL) \ 382 pagecnt.pc_find_hit++; \ 383 else \ 384 pagecnt.pc_find_miss++; \ 385 if (mylen > PC_HASH_CNT) \ 386 mylen = PC_HASH_CNT; \ 387 pagecnt.pc_find_hashlen[mylen]++; \ 388 } 389 390 #else /* VM_STATS */ 391 392 /* 393 * Don't collect statistics 394 */ 395 #define PAGE_HASH_SEARCH(index, pp, vp, off) { \ 396 for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash) { \ 397 if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \ 398 break; \ 399 } \ 400 } 401 402 #endif /* VM_STATS */ 403 404 405 406 #ifdef DEBUG 407 #define MEMSEG_SEARCH_STATS 408 #endif 409 410 #ifdef MEMSEG_SEARCH_STATS 411 struct memseg_stats { 412 uint_t nsearch; 413 uint_t nlastwon; 414 uint_t nhashwon; 415 uint_t nnotfound; 416 } memseg_stats; 417 418 #define MEMSEG_STAT_INCR(v) \ 419 atomic_add_32(&memseg_stats.v, 1) 420 #else 421 #define MEMSEG_STAT_INCR(x) 422 #endif 423 424 struct memseg *memsegs; /* list of memory segments */ 425 426 427 static void page_init_mem_config(void); 428 static int page_do_hashin(page_t *, vnode_t *, u_offset_t); 429 static void page_do_hashout(page_t *); 430 431 static void page_demote_vp_pages(page_t *); 432 433 /* 434 * vm subsystem related initialization 435 */ 436 void 437 vm_init(void) 438 { 439 boolean_t callb_vm_cpr(void *, int); 440 441 (void) callb_add(callb_vm_cpr, 0, CB_CL_CPR_VM, "vm"); 442 page_init_mem_config(); 443 444 /* 445 * initialise the vnode for retired pages 446 */ 447 page_retired_init(); 448 } 449 450 /* 451 * This function is called at startup and when memory is added or deleted. 452 */ 453 void 454 init_pages_pp_maximum() 455 { 456 static pgcnt_t p_min; 457 static pgcnt_t pages_pp_maximum_startup; 458 static pgcnt_t avrmem_delta; 459 static int init_done; 460 static int user_set; /* true if set in /etc/system */ 461 462 if (init_done == 0) { 463 464 /* If the user specified a value, save it */ 465 if (pages_pp_maximum != 0) { 466 user_set = 1; 467 pages_pp_maximum_startup = pages_pp_maximum; 468 } 469 470 /* 471 * Setting of pages_pp_maximum is based first time 472 * on the value of availrmem just after the start-up 473 * allocations. To preserve this relationship at run 474 * time, use a delta from availrmem_initial. 475 */ 476 ASSERT(availrmem_initial >= availrmem); 477 avrmem_delta = availrmem_initial - availrmem; 478 479 /* The allowable floor of pages_pp_maximum */ 480 p_min = tune.t_minarmem + 100; 481 482 /* Make sure we don't come through here again. */ 483 init_done = 1; 484 } 485 /* 486 * Determine pages_pp_maximum, the number of currently available 487 * pages (availrmem) that can't be `locked'. If not set by 488 * the user, we set it to 4% of the currently available memory 489 * plus 4MB. 490 * But we also insist that it be greater than tune.t_minarmem; 491 * otherwise a process could lock down a lot of memory, get swapped 492 * out, and never have enough to get swapped back in. 493 */ 494 if (user_set) 495 pages_pp_maximum = pages_pp_maximum_startup; 496 else 497 pages_pp_maximum = ((availrmem_initial - avrmem_delta) / 25) 498 + btop(4 * 1024 * 1024); 499 500 if (pages_pp_maximum <= p_min) { 501 pages_pp_maximum = p_min; 502 } 503 } 504 505 void 506 set_max_page_get(pgcnt_t target_total_pages) 507 { 508 max_page_get = target_total_pages / 2; 509 } 510 511 static pgcnt_t pending_delete; 512 513 /*ARGSUSED*/ 514 static void 515 page_mem_config_post_add( 516 void *arg, 517 pgcnt_t delta_pages) 518 { 519 set_max_page_get(total_pages - pending_delete); 520 init_pages_pp_maximum(); 521 } 522 523 /*ARGSUSED*/ 524 static int 525 page_mem_config_pre_del( 526 void *arg, 527 pgcnt_t delta_pages) 528 { 529 pgcnt_t nv; 530 531 nv = atomic_add_long_nv(&pending_delete, (spgcnt_t)delta_pages); 532 set_max_page_get(total_pages - nv); 533 return (0); 534 } 535 536 /*ARGSUSED*/ 537 static void 538 page_mem_config_post_del( 539 void *arg, 540 pgcnt_t delta_pages, 541 int cancelled) 542 { 543 pgcnt_t nv; 544 545 nv = atomic_add_long_nv(&pending_delete, -(spgcnt_t)delta_pages); 546 set_max_page_get(total_pages - nv); 547 if (!cancelled) 548 init_pages_pp_maximum(); 549 } 550 551 static kphysm_setup_vector_t page_mem_config_vec = { 552 KPHYSM_SETUP_VECTOR_VERSION, 553 page_mem_config_post_add, 554 page_mem_config_pre_del, 555 page_mem_config_post_del, 556 }; 557 558 static void 559 page_init_mem_config(void) 560 { 561 int ret; 562 563 ret = kphysm_setup_func_register(&page_mem_config_vec, (void *)NULL); 564 ASSERT(ret == 0); 565 } 566 567 /* 568 * Evenly spread out the PCF counters for large free pages 569 */ 570 static void 571 page_free_large_ctr(pgcnt_t npages) 572 { 573 static struct pcf *p = pcf; 574 pgcnt_t lump; 575 576 freemem += npages; 577 578 lump = roundup(npages, PCF_FANOUT) / PCF_FANOUT; 579 580 while (npages > 0) { 581 582 ASSERT(!p->pcf_block); 583 584 if (lump < npages) { 585 p->pcf_count += (uint_t)lump; 586 npages -= lump; 587 } else { 588 p->pcf_count += (uint_t)npages; 589 npages = 0; 590 } 591 592 ASSERT(!p->pcf_wait); 593 594 if (++p > &pcf[PCF_FANOUT - 1]) 595 p = pcf; 596 } 597 598 ASSERT(npages == 0); 599 } 600 601 /* 602 * Add a physical chunk of memory to the system freee lists during startup. 603 * Platform specific startup() allocates the memory for the page structs. 604 * 605 * num - number of page structures 606 * base - page number (pfn) to be associated with the first page. 607 * 608 * Since we are doing this during startup (ie. single threaded), we will 609 * use shortcut routines to avoid any locking overhead while putting all 610 * these pages on the freelists. 611 * 612 * NOTE: Any changes performed to page_free(), must also be performed to 613 * add_physmem() since this is how we initialize all page_t's at 614 * boot time. 615 */ 616 void 617 add_physmem( 618 page_t *pp, 619 pgcnt_t num, 620 pfn_t pnum) 621 { 622 page_t *root = NULL; 623 uint_t szc = page_num_pagesizes() - 1; 624 pgcnt_t large = page_get_pagecnt(szc); 625 pgcnt_t cnt = 0; 626 627 TRACE_2(TR_FAC_VM, TR_PAGE_INIT, 628 "add_physmem:pp %p num %lu", pp, num); 629 630 /* 631 * Arbitrarily limit the max page_get request 632 * to 1/2 of the page structs we have. 633 */ 634 total_pages += num; 635 set_max_page_get(total_pages); 636 637 /* 638 * The physical space for the pages array 639 * representing ram pages has already been 640 * allocated. Here we initialize each lock 641 * in the page structure, and put each on 642 * the free list 643 */ 644 for (; num; pp++, pnum++, num--) { 645 646 /* 647 * this needs to fill in the page number 648 * and do any other arch specific initialization 649 */ 650 add_physmem_cb(pp, pnum); 651 652 /* 653 * Initialize the page lock as unlocked, since nobody 654 * can see or access this page yet. 655 */ 656 pp->p_selock = 0; 657 658 /* 659 * Initialize IO lock 660 */ 661 page_iolock_init(pp); 662 663 /* 664 * initialize other fields in the page_t 665 */ 666 PP_SETFREE(pp); 667 page_clr_all_props(pp); 668 PP_SETAGED(pp); 669 pp->p_offset = (u_offset_t)-1; 670 pp->p_next = pp; 671 pp->p_prev = pp; 672 673 /* 674 * Simple case: System doesn't support large pages. 675 */ 676 if (szc == 0) { 677 pp->p_szc = 0; 678 page_free_at_startup(pp); 679 continue; 680 } 681 682 /* 683 * Handle unaligned pages, we collect them up onto 684 * the root page until we have a full large page. 685 */ 686 if (!IS_P2ALIGNED(pnum, large)) { 687 688 /* 689 * If not in a large page, 690 * just free as small page. 691 */ 692 if (root == NULL) { 693 pp->p_szc = 0; 694 page_free_at_startup(pp); 695 continue; 696 } 697 698 /* 699 * Link a constituent page into the large page. 700 */ 701 pp->p_szc = szc; 702 page_list_concat(&root, &pp); 703 704 /* 705 * When large page is fully formed, free it. 706 */ 707 if (++cnt == large) { 708 page_free_large_ctr(cnt); 709 page_list_add_pages(root, PG_LIST_ISINIT); 710 root = NULL; 711 cnt = 0; 712 } 713 continue; 714 } 715 716 /* 717 * At this point we have a page number which 718 * is aligned. We assert that we aren't already 719 * in a different large page. 720 */ 721 ASSERT(IS_P2ALIGNED(pnum, large)); 722 ASSERT(root == NULL && cnt == 0); 723 724 /* 725 * If insufficient number of pages left to form 726 * a large page, just free the small page. 727 */ 728 if (num < large) { 729 pp->p_szc = 0; 730 page_free_at_startup(pp); 731 continue; 732 } 733 734 /* 735 * Otherwise start a new large page. 736 */ 737 pp->p_szc = szc; 738 cnt++; 739 root = pp; 740 } 741 ASSERT(root == NULL && cnt == 0); 742 } 743 744 /* 745 * Find a page representing the specified [vp, offset]. 746 * If we find the page but it is intransit coming in, 747 * it will have an "exclusive" lock and we wait for 748 * the i/o to complete. A page found on the free list 749 * is always reclaimed and then locked. On success, the page 750 * is locked, its data is valid and it isn't on the free 751 * list, while a NULL is returned if the page doesn't exist. 752 */ 753 page_t * 754 page_lookup(vnode_t *vp, u_offset_t off, se_t se) 755 { 756 return (page_lookup_create(vp, off, se, NULL, NULL, 0)); 757 } 758 759 /* 760 * Find a page representing the specified [vp, offset]. 761 * We either return the one we found or, if passed in, 762 * create one with identity of [vp, offset] of the 763 * pre-allocated page. If we find exsisting page but it is 764 * intransit coming in, it will have an "exclusive" lock 765 * and we wait for the i/o to complete. A page found on 766 * the free list is always reclaimed and then locked. 767 * On success, the page is locked, its data is valid and 768 * it isn't on the free list, while a NULL is returned 769 * if the page doesn't exist and newpp is NULL; 770 */ 771 page_t * 772 page_lookup_create( 773 vnode_t *vp, 774 u_offset_t off, 775 se_t se, 776 page_t *newpp, 777 spgcnt_t *nrelocp, 778 int flags) 779 { 780 page_t *pp; 781 kmutex_t *phm; 782 ulong_t index; 783 uint_t hash_locked; 784 uint_t es; 785 786 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); 787 VM_STAT_ADD(page_lookup_cnt[0]); 788 ASSERT(newpp ? PAGE_EXCL(newpp) : 1); 789 790 /* 791 * Acquire the appropriate page hash lock since 792 * we have to search the hash list. Pages that 793 * hash to this list can't change identity while 794 * this lock is held. 795 */ 796 hash_locked = 0; 797 index = PAGE_HASH_FUNC(vp, off); 798 phm = NULL; 799 top: 800 PAGE_HASH_SEARCH(index, pp, vp, off); 801 if (pp != NULL) { 802 VM_STAT_ADD(page_lookup_cnt[1]); 803 es = (newpp != NULL) ? 1 : 0; 804 es |= flags; 805 if (!hash_locked) { 806 VM_STAT_ADD(page_lookup_cnt[2]); 807 if (!page_try_reclaim_lock(pp, se, es)) { 808 /* 809 * On a miss, acquire the phm. Then 810 * next time, page_lock() will be called, 811 * causing a wait if the page is busy. 812 * just looping with page_trylock() would 813 * get pretty boring. 814 */ 815 VM_STAT_ADD(page_lookup_cnt[3]); 816 phm = PAGE_HASH_MUTEX(index); 817 mutex_enter(phm); 818 hash_locked = 1; 819 goto top; 820 } 821 } else { 822 VM_STAT_ADD(page_lookup_cnt[4]); 823 if (!page_lock_es(pp, se, phm, P_RECLAIM, es)) { 824 VM_STAT_ADD(page_lookup_cnt[5]); 825 goto top; 826 } 827 } 828 829 /* 830 * Since `pp' is locked it can not change identity now. 831 * Reconfirm we locked the correct page. 832 * 833 * Both the p_vnode and p_offset *must* be cast volatile 834 * to force a reload of their values: The PAGE_HASH_SEARCH 835 * macro will have stuffed p_vnode and p_offset into 836 * registers before calling page_trylock(); another thread, 837 * actually holding the hash lock, could have changed the 838 * page's identity in memory, but our registers would not 839 * be changed, fooling the reconfirmation. If the hash 840 * lock was held during the search, the casting would 841 * not be needed. 842 */ 843 VM_STAT_ADD(page_lookup_cnt[6]); 844 if (((volatile struct vnode *)(pp->p_vnode) != vp) || 845 ((volatile u_offset_t)(pp->p_offset) != off)) { 846 VM_STAT_ADD(page_lookup_cnt[7]); 847 if (hash_locked) { 848 panic("page_lookup_create: lost page %p", 849 (void *)pp); 850 /*NOTREACHED*/ 851 } 852 page_unlock(pp); 853 phm = PAGE_HASH_MUTEX(index); 854 mutex_enter(phm); 855 hash_locked = 1; 856 goto top; 857 } 858 859 /* 860 * If page_trylock() was called, then pp may still be on 861 * the cachelist (can't be on the free list, it would not 862 * have been found in the search). If it is on the 863 * cachelist it must be pulled now. To pull the page from 864 * the cachelist, it must be exclusively locked. 865 * 866 * The other big difference between page_trylock() and 867 * page_lock(), is that page_lock() will pull the 868 * page from whatever free list (the cache list in this 869 * case) the page is on. If page_trylock() was used 870 * above, then we have to do the reclaim ourselves. 871 */ 872 if ((!hash_locked) && (PP_ISFREE(pp))) { 873 ASSERT(PP_ISAGED(pp) == 0); 874 VM_STAT_ADD(page_lookup_cnt[8]); 875 876 /* 877 * page_relcaim will insure that we 878 * have this page exclusively 879 */ 880 881 if (!page_reclaim(pp, NULL)) { 882 /* 883 * Page_reclaim dropped whatever lock 884 * we held. 885 */ 886 VM_STAT_ADD(page_lookup_cnt[9]); 887 phm = PAGE_HASH_MUTEX(index); 888 mutex_enter(phm); 889 hash_locked = 1; 890 goto top; 891 } else if (se == SE_SHARED && newpp == NULL) { 892 VM_STAT_ADD(page_lookup_cnt[10]); 893 page_downgrade(pp); 894 } 895 } 896 897 if (hash_locked) { 898 mutex_exit(phm); 899 } 900 901 if (newpp != NULL && pp->p_szc < newpp->p_szc && 902 PAGE_EXCL(pp) && nrelocp != NULL) { 903 ASSERT(nrelocp != NULL); 904 (void) page_relocate(&pp, &newpp, 1, 1, nrelocp, 905 NULL); 906 if (*nrelocp > 0) { 907 VM_STAT_COND_ADD(*nrelocp == 1, 908 page_lookup_cnt[11]); 909 VM_STAT_COND_ADD(*nrelocp > 1, 910 page_lookup_cnt[12]); 911 pp = newpp; 912 se = SE_EXCL; 913 } else { 914 if (se == SE_SHARED) { 915 page_downgrade(pp); 916 } 917 VM_STAT_ADD(page_lookup_cnt[13]); 918 } 919 } else if (newpp != NULL && nrelocp != NULL) { 920 if (PAGE_EXCL(pp) && se == SE_SHARED) { 921 page_downgrade(pp); 922 } 923 VM_STAT_COND_ADD(pp->p_szc < newpp->p_szc, 924 page_lookup_cnt[14]); 925 VM_STAT_COND_ADD(pp->p_szc == newpp->p_szc, 926 page_lookup_cnt[15]); 927 VM_STAT_COND_ADD(pp->p_szc > newpp->p_szc, 928 page_lookup_cnt[16]); 929 } else if (newpp != NULL && PAGE_EXCL(pp)) { 930 se = SE_EXCL; 931 } 932 } else if (!hash_locked) { 933 VM_STAT_ADD(page_lookup_cnt[17]); 934 phm = PAGE_HASH_MUTEX(index); 935 mutex_enter(phm); 936 hash_locked = 1; 937 goto top; 938 } else if (newpp != NULL) { 939 /* 940 * If we have a preallocated page then 941 * insert it now and basically behave like 942 * page_create. 943 */ 944 VM_STAT_ADD(page_lookup_cnt[18]); 945 /* 946 * Since we hold the page hash mutex and 947 * just searched for this page, page_hashin 948 * had better not fail. If it does, that 949 * means some thread did not follow the 950 * page hash mutex rules. Panic now and 951 * get it over with. As usual, go down 952 * holding all the locks. 953 */ 954 ASSERT(MUTEX_HELD(phm)); 955 if (!page_hashin(newpp, vp, off, phm)) { 956 ASSERT(MUTEX_HELD(phm)); 957 panic("page_lookup_create: hashin failed %p %p %llx %p", 958 (void *)newpp, (void *)vp, off, (void *)phm); 959 /*NOTREACHED*/ 960 } 961 ASSERT(MUTEX_HELD(phm)); 962 mutex_exit(phm); 963 phm = NULL; 964 page_set_props(newpp, P_REF); 965 page_io_lock(newpp); 966 pp = newpp; 967 se = SE_EXCL; 968 } else { 969 VM_STAT_ADD(page_lookup_cnt[19]); 970 mutex_exit(phm); 971 } 972 973 ASSERT(pp ? PAGE_LOCKED_SE(pp, se) : 1); 974 975 ASSERT(pp ? ((PP_ISFREE(pp) == 0) && (PP_ISAGED(pp) == 0)) : 1); 976 977 return (pp); 978 } 979 980 /* 981 * Search the hash list for the page representing the 982 * specified [vp, offset] and return it locked. Skip 983 * free pages and pages that cannot be locked as requested. 984 * Used while attempting to kluster pages. 985 */ 986 page_t * 987 page_lookup_nowait(vnode_t *vp, u_offset_t off, se_t se) 988 { 989 page_t *pp; 990 kmutex_t *phm; 991 ulong_t index; 992 uint_t locked; 993 994 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); 995 VM_STAT_ADD(page_lookup_nowait_cnt[0]); 996 997 index = PAGE_HASH_FUNC(vp, off); 998 PAGE_HASH_SEARCH(index, pp, vp, off); 999 locked = 0; 1000 if (pp == NULL) { 1001 top: 1002 VM_STAT_ADD(page_lookup_nowait_cnt[1]); 1003 locked = 1; 1004 phm = PAGE_HASH_MUTEX(index); 1005 mutex_enter(phm); 1006 PAGE_HASH_SEARCH(index, pp, vp, off); 1007 } 1008 1009 if (pp == NULL || PP_ISFREE(pp)) { 1010 VM_STAT_ADD(page_lookup_nowait_cnt[2]); 1011 pp = NULL; 1012 } else { 1013 if (!page_trylock(pp, se)) { 1014 VM_STAT_ADD(page_lookup_nowait_cnt[3]); 1015 pp = NULL; 1016 } else { 1017 VM_STAT_ADD(page_lookup_nowait_cnt[4]); 1018 /* 1019 * See the comment in page_lookup() 1020 */ 1021 if (((volatile struct vnode *)(pp->p_vnode) != vp) || 1022 ((u_offset_t)(pp->p_offset) != off)) { 1023 VM_STAT_ADD(page_lookup_nowait_cnt[5]); 1024 if (locked) { 1025 panic("page_lookup_nowait %p", 1026 (void *)pp); 1027 /*NOTREACHED*/ 1028 } 1029 page_unlock(pp); 1030 goto top; 1031 } 1032 if (PP_ISFREE(pp)) { 1033 VM_STAT_ADD(page_lookup_nowait_cnt[6]); 1034 page_unlock(pp); 1035 pp = NULL; 1036 } 1037 } 1038 } 1039 if (locked) { 1040 VM_STAT_ADD(page_lookup_nowait_cnt[7]); 1041 mutex_exit(phm); 1042 } 1043 1044 ASSERT(pp ? PAGE_LOCKED_SE(pp, se) : 1); 1045 1046 return (pp); 1047 } 1048 1049 /* 1050 * Search the hash list for a page with the specified [vp, off] 1051 * that is known to exist and is already locked. This routine 1052 * is typically used by segment SOFTUNLOCK routines. 1053 */ 1054 page_t * 1055 page_find(vnode_t *vp, u_offset_t off) 1056 { 1057 page_t *pp; 1058 kmutex_t *phm; 1059 ulong_t index; 1060 1061 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); 1062 VM_STAT_ADD(page_find_cnt); 1063 1064 index = PAGE_HASH_FUNC(vp, off); 1065 phm = PAGE_HASH_MUTEX(index); 1066 1067 mutex_enter(phm); 1068 PAGE_HASH_SEARCH(index, pp, vp, off); 1069 mutex_exit(phm); 1070 1071 ASSERT(pp != NULL); 1072 ASSERT(PAGE_LOCKED(pp) || panicstr); 1073 return (pp); 1074 } 1075 1076 /* 1077 * Determine whether a page with the specified [vp, off] 1078 * currently exists in the system. Obviously this should 1079 * only be considered as a hint since nothing prevents the 1080 * page from disappearing or appearing immediately after 1081 * the return from this routine. Subsequently, we don't 1082 * even bother to lock the list. 1083 */ 1084 page_t * 1085 page_exists(vnode_t *vp, u_offset_t off) 1086 { 1087 page_t *pp; 1088 ulong_t index; 1089 1090 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); 1091 VM_STAT_ADD(page_exists_cnt); 1092 1093 index = PAGE_HASH_FUNC(vp, off); 1094 PAGE_HASH_SEARCH(index, pp, vp, off); 1095 1096 return (pp); 1097 } 1098 1099 /* 1100 * Determine if physically contiguous pages exist for [vp, off] - [vp, off + 1101 * page_size(szc)) range. if they exist and ppa is not NULL fill ppa array 1102 * with these pages locked SHARED. If necessary reclaim pages from 1103 * freelist. Return 1 if contiguous pages exist and 0 otherwise. 1104 * 1105 * If we fail to lock pages still return 1 if pages exist and contiguous. 1106 * But in this case return value is just a hint. ppa array won't be filled. 1107 * Caller should initialize ppa[0] as NULL to distinguish return value. 1108 * 1109 * Returns 0 if pages don't exist or not physically contiguous. 1110 * 1111 * This routine doesn't work for anonymous(swapfs) pages. 1112 */ 1113 int 1114 page_exists_physcontig(vnode_t *vp, u_offset_t off, uint_t szc, page_t *ppa[]) 1115 { 1116 pgcnt_t pages; 1117 pfn_t pfn; 1118 page_t *rootpp; 1119 pgcnt_t i; 1120 pgcnt_t j; 1121 u_offset_t save_off = off; 1122 ulong_t index; 1123 kmutex_t *phm; 1124 page_t *pp; 1125 uint_t pszc; 1126 int loopcnt = 0; 1127 1128 ASSERT(szc != 0); 1129 ASSERT(vp != NULL); 1130 ASSERT(!IS_SWAPFSVP(vp)); 1131 ASSERT(vp != &kvp); 1132 1133 again: 1134 if (++loopcnt > 3) { 1135 VM_STAT_ADD(page_exphcontg[0]); 1136 return (0); 1137 } 1138 1139 index = PAGE_HASH_FUNC(vp, off); 1140 phm = PAGE_HASH_MUTEX(index); 1141 1142 mutex_enter(phm); 1143 PAGE_HASH_SEARCH(index, pp, vp, off); 1144 mutex_exit(phm); 1145 1146 VM_STAT_ADD(page_exphcontg[1]); 1147 1148 if (pp == NULL) { 1149 VM_STAT_ADD(page_exphcontg[2]); 1150 return (0); 1151 } 1152 1153 pages = page_get_pagecnt(szc); 1154 rootpp = pp; 1155 pfn = rootpp->p_pagenum; 1156 1157 if ((pszc = pp->p_szc) >= szc && ppa != NULL) { 1158 VM_STAT_ADD(page_exphcontg[3]); 1159 if (!page_trylock(pp, SE_SHARED)) { 1160 VM_STAT_ADD(page_exphcontg[4]); 1161 return (1); 1162 } 1163 if (pp->p_szc != pszc || pp->p_vnode != vp || 1164 pp->p_offset != off) { 1165 VM_STAT_ADD(page_exphcontg[5]); 1166 page_unlock(pp); 1167 off = save_off; 1168 goto again; 1169 } 1170 /* 1171 * szc was non zero and vnode and offset matched after we 1172 * locked the page it means it can't become free on us. 1173 */ 1174 ASSERT(!PP_ISFREE(pp)); 1175 if (!IS_P2ALIGNED(pfn, pages)) { 1176 page_unlock(pp); 1177 return (0); 1178 } 1179 ppa[0] = pp; 1180 pp++; 1181 off += PAGESIZE; 1182 pfn++; 1183 for (i = 1; i < pages; i++, pp++, off += PAGESIZE, pfn++) { 1184 if (!page_trylock(pp, SE_SHARED)) { 1185 VM_STAT_ADD(page_exphcontg[6]); 1186 pp--; 1187 while (i-- > 0) { 1188 page_unlock(pp); 1189 pp--; 1190 } 1191 ppa[0] = NULL; 1192 return (1); 1193 } 1194 if (pp->p_szc != pszc) { 1195 VM_STAT_ADD(page_exphcontg[7]); 1196 page_unlock(pp); 1197 pp--; 1198 while (i-- > 0) { 1199 page_unlock(pp); 1200 pp--; 1201 } 1202 ppa[0] = NULL; 1203 off = save_off; 1204 goto again; 1205 } 1206 /* 1207 * szc the same as for previous already locked pages 1208 * with right identity. Since this page had correct 1209 * szc after we locked it can't get freed or destroyed 1210 * and therefore must have the expected identity. 1211 */ 1212 ASSERT(!PP_ISFREE(pp)); 1213 if (pp->p_vnode != vp || 1214 pp->p_offset != off) { 1215 panic("page_exists_physcontig: " 1216 "large page identity doesn't match"); 1217 } 1218 ppa[i] = pp; 1219 ASSERT(pp->p_pagenum == pfn); 1220 } 1221 VM_STAT_ADD(page_exphcontg[8]); 1222 ppa[pages] = NULL; 1223 return (1); 1224 } else if (pszc >= szc) { 1225 VM_STAT_ADD(page_exphcontg[9]); 1226 if (!IS_P2ALIGNED(pfn, pages)) { 1227 return (0); 1228 } 1229 return (1); 1230 } 1231 1232 if (!IS_P2ALIGNED(pfn, pages)) { 1233 VM_STAT_ADD(page_exphcontg[10]); 1234 return (0); 1235 } 1236 1237 if (page_numtomemseg_nolock(pfn) != 1238 page_numtomemseg_nolock(pfn + pages - 1)) { 1239 VM_STAT_ADD(page_exphcontg[11]); 1240 return (0); 1241 } 1242 1243 /* 1244 * We loop up 4 times across pages to promote page size. 1245 * We're extra cautious to promote page size atomically with respect 1246 * to everybody else. But we can probably optimize into 1 loop if 1247 * this becomes an issue. 1248 */ 1249 1250 for (i = 0; i < pages; i++, pp++, off += PAGESIZE, pfn++) { 1251 ASSERT(pp->p_pagenum == pfn); 1252 if (!page_trylock(pp, SE_EXCL)) { 1253 VM_STAT_ADD(page_exphcontg[12]); 1254 break; 1255 } 1256 if (pp->p_vnode != vp || 1257 pp->p_offset != off) { 1258 VM_STAT_ADD(page_exphcontg[13]); 1259 page_unlock(pp); 1260 break; 1261 } 1262 if (pp->p_szc >= szc) { 1263 ASSERT(i == 0); 1264 page_unlock(pp); 1265 off = save_off; 1266 goto again; 1267 } 1268 } 1269 1270 if (i != pages) { 1271 VM_STAT_ADD(page_exphcontg[14]); 1272 --pp; 1273 while (i-- > 0) { 1274 page_unlock(pp); 1275 --pp; 1276 } 1277 return (0); 1278 } 1279 1280 pp = rootpp; 1281 for (i = 0; i < pages; i++, pp++) { 1282 if (PP_ISFREE(pp)) { 1283 VM_STAT_ADD(page_exphcontg[15]); 1284 ASSERT(!PP_ISAGED(pp)); 1285 ASSERT(pp->p_szc == 0); 1286 if (!page_reclaim(pp, NULL)) { 1287 break; 1288 } 1289 } else { 1290 ASSERT(pp->p_szc < szc); 1291 VM_STAT_ADD(page_exphcontg[16]); 1292 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 1293 } 1294 } 1295 if (i < pages) { 1296 VM_STAT_ADD(page_exphcontg[17]); 1297 /* 1298 * page_reclaim failed because we were out of memory. 1299 * drop the rest of the locks and return because this page 1300 * must be already reallocated anyway. 1301 */ 1302 pp = rootpp; 1303 for (j = 0; j < pages; j++, pp++) { 1304 if (j != i) { 1305 page_unlock(pp); 1306 } 1307 } 1308 return (0); 1309 } 1310 1311 off = save_off; 1312 pp = rootpp; 1313 for (i = 0; i < pages; i++, pp++, off += PAGESIZE) { 1314 ASSERT(PAGE_EXCL(pp)); 1315 ASSERT(!PP_ISFREE(pp)); 1316 ASSERT(!hat_page_is_mapped(pp)); 1317 ASSERT(pp->p_vnode == vp); 1318 ASSERT(pp->p_offset == off); 1319 pp->p_szc = szc; 1320 } 1321 pp = rootpp; 1322 for (i = 0; i < pages; i++, pp++) { 1323 if (ppa == NULL) { 1324 page_unlock(pp); 1325 } else { 1326 ppa[i] = pp; 1327 page_downgrade(ppa[i]); 1328 } 1329 } 1330 if (ppa != NULL) { 1331 ppa[pages] = NULL; 1332 } 1333 VM_STAT_ADD(page_exphcontg[18]); 1334 ASSERT(vp->v_pages != NULL); 1335 return (1); 1336 } 1337 1338 /* 1339 * Determine whether a page with the specified [vp, off] 1340 * currently exists in the system and if so return its 1341 * size code. Obviously this should only be considered as 1342 * a hint since nothing prevents the page from disappearing 1343 * or appearing immediately after the return from this routine. 1344 */ 1345 int 1346 page_exists_forreal(vnode_t *vp, u_offset_t off, uint_t *szc) 1347 { 1348 page_t *pp; 1349 kmutex_t *phm; 1350 ulong_t index; 1351 int rc = 0; 1352 1353 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); 1354 ASSERT(szc != NULL); 1355 VM_STAT_ADD(page_exists_forreal_cnt); 1356 1357 index = PAGE_HASH_FUNC(vp, off); 1358 phm = PAGE_HASH_MUTEX(index); 1359 1360 mutex_enter(phm); 1361 PAGE_HASH_SEARCH(index, pp, vp, off); 1362 if (pp != NULL) { 1363 *szc = pp->p_szc; 1364 rc = 1; 1365 } 1366 mutex_exit(phm); 1367 return (rc); 1368 } 1369 1370 /* wakeup threads waiting for pages in page_create_get_something() */ 1371 void 1372 wakeup_pcgs(void) 1373 { 1374 if (!CV_HAS_WAITERS(&pcgs_cv)) 1375 return; 1376 cv_broadcast(&pcgs_cv); 1377 } 1378 1379 /* 1380 * 'freemem' is used all over the kernel as an indication of how many 1381 * pages are free (either on the cache list or on the free page list) 1382 * in the system. In very few places is a really accurate 'freemem' 1383 * needed. To avoid contention of the lock protecting a the 1384 * single freemem, it was spread out into NCPU buckets. Set_freemem 1385 * sets freemem to the total of all NCPU buckets. It is called from 1386 * clock() on each TICK. 1387 */ 1388 void 1389 set_freemem() 1390 { 1391 struct pcf *p; 1392 ulong_t t; 1393 uint_t i; 1394 1395 t = 0; 1396 p = pcf; 1397 for (i = 0; i < PCF_FANOUT; i++) { 1398 t += p->pcf_count; 1399 p++; 1400 } 1401 freemem = t; 1402 1403 /* 1404 * Don't worry about grabbing mutex. It's not that 1405 * critical if we miss a tick or two. This is 1406 * where we wakeup possible delayers in 1407 * page_create_get_something(). 1408 */ 1409 wakeup_pcgs(); 1410 } 1411 1412 ulong_t 1413 get_freemem() 1414 { 1415 struct pcf *p; 1416 ulong_t t; 1417 uint_t i; 1418 1419 t = 0; 1420 p = pcf; 1421 for (i = 0; i < PCF_FANOUT; i++) { 1422 t += p->pcf_count; 1423 p++; 1424 } 1425 /* 1426 * We just calculated it, might as well set it. 1427 */ 1428 freemem = t; 1429 return (t); 1430 } 1431 1432 /* 1433 * Acquire all of the page cache & free (pcf) locks. 1434 */ 1435 void 1436 pcf_acquire_all() 1437 { 1438 struct pcf *p; 1439 uint_t i; 1440 1441 p = pcf; 1442 for (i = 0; i < PCF_FANOUT; i++) { 1443 p->pcf_touch = 1; 1444 mutex_enter(&p->pcf_lock); 1445 p++; 1446 } 1447 } 1448 1449 /* 1450 * Release all the pcf_locks. 1451 */ 1452 void 1453 pcf_release_all() 1454 { 1455 struct pcf *p; 1456 uint_t i; 1457 1458 p = pcf; 1459 for (i = 0; i < PCF_FANOUT; i++) { 1460 mutex_exit(&p->pcf_lock); 1461 p++; 1462 } 1463 } 1464 1465 /* 1466 * Inform the VM system that we need some pages freed up. 1467 * Calls must be symmetric, e.g.: 1468 * 1469 * page_needfree(100); 1470 * wait a bit; 1471 * page_needfree(-100); 1472 */ 1473 void 1474 page_needfree(spgcnt_t npages) 1475 { 1476 mutex_enter(&new_freemem_lock); 1477 needfree += npages; 1478 mutex_exit(&new_freemem_lock); 1479 } 1480 1481 /* 1482 * Throttle for page_create(): try to prevent freemem from dropping 1483 * below throttlefree. We can't provide a 100% guarantee because 1484 * KM_NOSLEEP allocations, page_reclaim(), and various other things 1485 * nibble away at the freelist. However, we can block all PG_WAIT 1486 * allocations until memory becomes available. The motivation is 1487 * that several things can fall apart when there's no free memory: 1488 * 1489 * (1) If pageout() needs memory to push a page, the system deadlocks. 1490 * 1491 * (2) By (broken) specification, timeout(9F) can neither fail nor 1492 * block, so it has no choice but to panic the system if it 1493 * cannot allocate a callout structure. 1494 * 1495 * (3) Like timeout(), ddi_set_callback() cannot fail and cannot block; 1496 * it panics if it cannot allocate a callback structure. 1497 * 1498 * (4) Untold numbers of third-party drivers have not yet been hardened 1499 * against KM_NOSLEEP and/or allocb() failures; they simply assume 1500 * success and panic the system with a data fault on failure. 1501 * (The long-term solution to this particular problem is to ship 1502 * hostile fault-injecting DEBUG kernels with the DDK.) 1503 * 1504 * It is theoretically impossible to guarantee success of non-blocking 1505 * allocations, but in practice, this throttle is very hard to break. 1506 */ 1507 static int 1508 page_create_throttle(pgcnt_t npages, int flags) 1509 { 1510 ulong_t fm; 1511 uint_t i; 1512 pgcnt_t tf; /* effective value of throttlefree */ 1513 1514 /* 1515 * Never deny pages when: 1516 * - it's a thread that cannot block [NOMEMWAIT()] 1517 * - the allocation cannot block and must not fail 1518 * - the allocation cannot block and is pageout dispensated 1519 */ 1520 if (NOMEMWAIT() || 1521 ((flags & (PG_WAIT | PG_PANIC)) == PG_PANIC) || 1522 ((flags & (PG_WAIT | PG_PUSHPAGE)) == PG_PUSHPAGE)) 1523 return (1); 1524 1525 /* 1526 * If the allocation can't block, we look favorably upon it 1527 * unless we're below pageout_reserve. In that case we fail 1528 * the allocation because we want to make sure there are a few 1529 * pages available for pageout. 1530 */ 1531 if ((flags & PG_WAIT) == 0) 1532 return (freemem >= npages + pageout_reserve); 1533 1534 /* Calculate the effective throttlefree value */ 1535 tf = throttlefree - 1536 ((flags & PG_PUSHPAGE) ? pageout_reserve : 0); 1537 1538 cv_signal(&proc_pageout->p_cv); 1539 1540 while (freemem < npages + tf) { 1541 pcf_acquire_all(); 1542 mutex_enter(&new_freemem_lock); 1543 fm = 0; 1544 for (i = 0; i < PCF_FANOUT; i++) { 1545 fm += pcf[i].pcf_count; 1546 pcf[i].pcf_wait++; 1547 mutex_exit(&pcf[i].pcf_lock); 1548 } 1549 freemem = fm; 1550 needfree += npages; 1551 freemem_wait++; 1552 cv_wait(&freemem_cv, &new_freemem_lock); 1553 freemem_wait--; 1554 needfree -= npages; 1555 mutex_exit(&new_freemem_lock); 1556 } 1557 return (1); 1558 } 1559 1560 /* 1561 * page_create_wait() is called to either coalecse pages from the 1562 * different pcf buckets or to wait because there simply are not 1563 * enough pages to satisfy the caller's request. 1564 * 1565 * Sadly, this is called from platform/vm/vm_machdep.c 1566 */ 1567 int 1568 page_create_wait(size_t npages, uint_t flags) 1569 { 1570 pgcnt_t total; 1571 uint_t i; 1572 struct pcf *p; 1573 1574 /* 1575 * Wait until there are enough free pages to satisfy our 1576 * entire request. 1577 * We set needfree += npages before prodding pageout, to make sure 1578 * it does real work when npages > lotsfree > freemem. 1579 */ 1580 VM_STAT_ADD(page_create_not_enough); 1581 1582 ASSERT(!kcage_on ? !(flags & PG_NORELOC) : 1); 1583 checkagain: 1584 if ((flags & PG_NORELOC) && 1585 kcage_freemem < kcage_throttlefree + npages) 1586 (void) kcage_create_throttle(npages, flags); 1587 1588 if (freemem < npages + throttlefree) 1589 if (!page_create_throttle(npages, flags)) 1590 return (0); 1591 1592 /* 1593 * Since page_create_va() looked at every 1594 * bucket, assume we are going to have to wait. 1595 * Get all of the pcf locks. 1596 */ 1597 total = 0; 1598 p = pcf; 1599 for (i = 0; i < PCF_FANOUT; i++) { 1600 p->pcf_touch = 1; 1601 mutex_enter(&p->pcf_lock); 1602 total += p->pcf_count; 1603 if (total >= npages) { 1604 /* 1605 * Wow! There are enough pages laying around 1606 * to satisfy the request. Do the accounting, 1607 * drop the locks we acquired, and go back. 1608 * 1609 * freemem is not protected by any lock. So, 1610 * we cannot have any assertion containing 1611 * freemem. 1612 */ 1613 freemem -= npages; 1614 1615 while (p >= pcf) { 1616 if (p->pcf_count <= npages) { 1617 npages -= p->pcf_count; 1618 p->pcf_count = 0; 1619 } else { 1620 p->pcf_count -= (uint_t)npages; 1621 npages = 0; 1622 } 1623 mutex_exit(&p->pcf_lock); 1624 p--; 1625 } 1626 ASSERT(npages == 0); 1627 return (1); 1628 } 1629 p++; 1630 } 1631 1632 /* 1633 * All of the pcf locks are held, there are not enough pages 1634 * to satisfy the request (npages < total). 1635 * Be sure to acquire the new_freemem_lock before dropping 1636 * the pcf locks. This prevents dropping wakeups in page_free(). 1637 * The order is always pcf_lock then new_freemem_lock. 1638 * 1639 * Since we hold all the pcf locks, it is a good time to set freemem. 1640 * 1641 * If the caller does not want to wait, return now. 1642 * Else turn the pageout daemon loose to find something 1643 * and wait till it does. 1644 * 1645 */ 1646 freemem = total; 1647 1648 if ((flags & PG_WAIT) == 0) { 1649 pcf_release_all(); 1650 1651 TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_NOMEM, 1652 "page_create_nomem:npages %ld freemem %ld", npages, freemem); 1653 return (0); 1654 } 1655 1656 ASSERT(proc_pageout != NULL); 1657 cv_signal(&proc_pageout->p_cv); 1658 1659 TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SLEEP_START, 1660 "page_create_sleep_start: freemem %ld needfree %ld", 1661 freemem, needfree); 1662 1663 /* 1664 * We are going to wait. 1665 * We currently hold all of the pcf_locks, 1666 * get the new_freemem_lock (it protects freemem_wait), 1667 * before dropping the pcf_locks. 1668 */ 1669 mutex_enter(&new_freemem_lock); 1670 1671 p = pcf; 1672 for (i = 0; i < PCF_FANOUT; i++) { 1673 p->pcf_wait++; 1674 mutex_exit(&p->pcf_lock); 1675 p++; 1676 } 1677 1678 needfree += npages; 1679 freemem_wait++; 1680 1681 cv_wait(&freemem_cv, &new_freemem_lock); 1682 1683 freemem_wait--; 1684 needfree -= npages; 1685 1686 mutex_exit(&new_freemem_lock); 1687 1688 TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SLEEP_END, 1689 "page_create_sleep_end: freemem %ld needfree %ld", 1690 freemem, needfree); 1691 1692 VM_STAT_ADD(page_create_not_enough_again); 1693 goto checkagain; 1694 } 1695 1696 /* 1697 * A routine to do the opposite of page_create_wait(). 1698 */ 1699 void 1700 page_create_putback(spgcnt_t npages) 1701 { 1702 struct pcf *p; 1703 pgcnt_t lump; 1704 uint_t *which; 1705 1706 /* 1707 * When a contiguous lump is broken up, we have to 1708 * deal with lots of pages (min 64) so lets spread 1709 * the wealth around. 1710 */ 1711 lump = roundup(npages, PCF_FANOUT) / PCF_FANOUT; 1712 freemem += npages; 1713 1714 for (p = pcf; (npages > 0) && (p < &pcf[PCF_FANOUT]); p++) { 1715 which = &p->pcf_count; 1716 1717 mutex_enter(&p->pcf_lock); 1718 1719 if (p->pcf_block) { 1720 which = &p->pcf_reserve; 1721 } 1722 1723 if (lump < npages) { 1724 *which += (uint_t)lump; 1725 npages -= lump; 1726 } else { 1727 *which += (uint_t)npages; 1728 npages = 0; 1729 } 1730 1731 if (p->pcf_wait) { 1732 mutex_enter(&new_freemem_lock); 1733 /* 1734 * Check to see if some other thread 1735 * is actually waiting. Another bucket 1736 * may have woken it up by now. If there 1737 * are no waiters, then set our pcf_wait 1738 * count to zero to avoid coming in here 1739 * next time. 1740 */ 1741 if (freemem_wait) { 1742 if (npages > 1) { 1743 cv_broadcast(&freemem_cv); 1744 } else { 1745 cv_signal(&freemem_cv); 1746 } 1747 p->pcf_wait--; 1748 } else { 1749 p->pcf_wait = 0; 1750 } 1751 mutex_exit(&new_freemem_lock); 1752 } 1753 mutex_exit(&p->pcf_lock); 1754 } 1755 ASSERT(npages == 0); 1756 } 1757 1758 /* 1759 * A helper routine for page_create_get_something. 1760 * The indenting got to deep down there. 1761 * Unblock the pcf counters. Any pages freed after 1762 * pcf_block got set are moved to pcf_count and 1763 * wakeups (cv_broadcast() or cv_signal()) are done as needed. 1764 */ 1765 static void 1766 pcgs_unblock(void) 1767 { 1768 int i; 1769 struct pcf *p; 1770 1771 /* Update freemem while we're here. */ 1772 freemem = 0; 1773 p = pcf; 1774 for (i = 0; i < PCF_FANOUT; i++) { 1775 mutex_enter(&p->pcf_lock); 1776 ASSERT(p->pcf_count == 0); 1777 p->pcf_count = p->pcf_reserve; 1778 p->pcf_block = 0; 1779 freemem += p->pcf_count; 1780 if (p->pcf_wait) { 1781 mutex_enter(&new_freemem_lock); 1782 if (freemem_wait) { 1783 if (p->pcf_reserve > 1) { 1784 cv_broadcast(&freemem_cv); 1785 p->pcf_wait = 0; 1786 } else { 1787 cv_signal(&freemem_cv); 1788 p->pcf_wait--; 1789 } 1790 } else { 1791 p->pcf_wait = 0; 1792 } 1793 mutex_exit(&new_freemem_lock); 1794 } 1795 p->pcf_reserve = 0; 1796 mutex_exit(&p->pcf_lock); 1797 p++; 1798 } 1799 } 1800 1801 /* 1802 * Called from page_create_va() when both the cache and free lists 1803 * have been checked once. 1804 * 1805 * Either returns a page or panics since the accounting was done 1806 * way before we got here. 1807 * 1808 * We don't come here often, so leave the accounting on permanently. 1809 */ 1810 1811 #define MAX_PCGS 100 1812 1813 #ifdef DEBUG 1814 #define PCGS_TRIES 100 1815 #else /* DEBUG */ 1816 #define PCGS_TRIES 10 1817 #endif /* DEBUG */ 1818 1819 #ifdef VM_STATS 1820 uint_t pcgs_counts[PCGS_TRIES]; 1821 uint_t pcgs_too_many; 1822 uint_t pcgs_entered; 1823 uint_t pcgs_entered_noreloc; 1824 uint_t pcgs_locked; 1825 uint_t pcgs_cagelocked; 1826 #endif /* VM_STATS */ 1827 1828 static page_t * 1829 page_create_get_something(vnode_t *vp, u_offset_t off, struct seg *seg, 1830 caddr_t vaddr, uint_t flags) 1831 { 1832 uint_t count; 1833 page_t *pp; 1834 uint_t locked, i; 1835 struct pcf *p; 1836 lgrp_t *lgrp; 1837 int cagelocked = 0; 1838 1839 VM_STAT_ADD(pcgs_entered); 1840 1841 /* 1842 * Tap any reserve freelists: if we fail now, we'll die 1843 * since the page(s) we're looking for have already been 1844 * accounted for. 1845 */ 1846 flags |= PG_PANIC; 1847 1848 if ((flags & PG_NORELOC) != 0) { 1849 VM_STAT_ADD(pcgs_entered_noreloc); 1850 /* 1851 * Requests for free pages from critical threads 1852 * such as pageout still won't throttle here, but 1853 * we must try again, to give the cageout thread 1854 * another chance to catch up. Since we already 1855 * accounted for the pages, we had better get them 1856 * this time. 1857 * 1858 * N.B. All non-critical threads acquire the pcgs_cagelock 1859 * to serialize access to the freelists. This implements a 1860 * turnstile-type synchornization to avoid starvation of 1861 * critical requests for PG_NORELOC memory by non-critical 1862 * threads: all non-critical threads must acquire a 'ticket' 1863 * before passing through, which entails making sure 1864 * kcage_freemem won't fall below minfree prior to grabbing 1865 * pages from the freelists. 1866 */ 1867 if (kcage_create_throttle(1, flags) == KCT_NONCRIT) { 1868 mutex_enter(&pcgs_cagelock); 1869 cagelocked = 1; 1870 VM_STAT_ADD(pcgs_cagelocked); 1871 } 1872 } 1873 1874 /* 1875 * Time to get serious. 1876 * We failed to get a `correctly colored' page from both the 1877 * free and cache lists. 1878 * We escalate in stage. 1879 * 1880 * First try both lists without worring about color. 1881 * 1882 * Then, grab all page accounting locks (ie. pcf[]) and 1883 * steal any pages that they have and set the pcf_block flag to 1884 * stop deletions from the lists. This will help because 1885 * a page can get added to the free list while we are looking 1886 * at the cache list, then another page could be added to the cache 1887 * list allowing the page on the free list to be removed as we 1888 * move from looking at the cache list to the free list. This 1889 * could happen over and over. We would never find the page 1890 * we have accounted for. 1891 * 1892 * Noreloc pages are a subset of the global (relocatable) page pool. 1893 * They are not tracked separately in the pcf bins, so it is 1894 * impossible to know when doing pcf accounting if the available 1895 * page(s) are noreloc pages or not. When looking for a noreloc page 1896 * it is quite easy to end up here even if the global (relocatable) 1897 * page pool has plenty of free pages but the noreloc pool is empty. 1898 * 1899 * When the noreloc pool is empty (or low), additional noreloc pages 1900 * are created by converting pages from the global page pool. This 1901 * process will stall during pcf accounting if the pcf bins are 1902 * already locked. Such is the case when a noreloc allocation is 1903 * looping here in page_create_get_something waiting for more noreloc 1904 * pages to appear. 1905 * 1906 * Short of adding a new field to the pcf bins to accurately track 1907 * the number of free noreloc pages, we instead do not grab the 1908 * pcgs_lock, do not set the pcf blocks and do not timeout when 1909 * allocating a noreloc page. This allows noreloc allocations to 1910 * loop without blocking global page pool allocations. 1911 * 1912 * NOTE: the behaviour of page_create_get_something has not changed 1913 * for the case of global page pool allocations. 1914 */ 1915 1916 flags &= ~PG_MATCH_COLOR; 1917 locked = 0; 1918 #ifndef __sparc 1919 /* 1920 * page_create_get_something may be called because 4g memory may be 1921 * depleted. Set flags to allow for relocation of base page below 1922 * 4g if necessary. 1923 */ 1924 if (physmax4g) 1925 flags |= (PGI_PGCPSZC0 | PGI_PGCPHIPRI); 1926 #endif 1927 1928 lgrp = lgrp_mem_choose(seg, vaddr, PAGESIZE); 1929 1930 for (count = 0; kcage_on || count < MAX_PCGS; count++) { 1931 pp = page_get_freelist(vp, off, seg, vaddr, PAGESIZE, 1932 flags, lgrp); 1933 if (pp == NULL) { 1934 pp = page_get_cachelist(vp, off, seg, vaddr, 1935 flags, lgrp); 1936 } 1937 if (pp == NULL) { 1938 /* 1939 * Serialize. Don't fight with other pcgs(). 1940 */ 1941 if (!locked && (!kcage_on || !(flags & PG_NORELOC))) { 1942 mutex_enter(&pcgs_lock); 1943 VM_STAT_ADD(pcgs_locked); 1944 locked = 1; 1945 p = pcf; 1946 for (i = 0; i < PCF_FANOUT; i++) { 1947 mutex_enter(&p->pcf_lock); 1948 ASSERT(p->pcf_block == 0); 1949 p->pcf_block = 1; 1950 p->pcf_reserve = p->pcf_count; 1951 p->pcf_count = 0; 1952 mutex_exit(&p->pcf_lock); 1953 p++; 1954 } 1955 freemem = 0; 1956 } 1957 1958 if (count) { 1959 /* 1960 * Since page_free() puts pages on 1961 * a list then accounts for it, we 1962 * just have to wait for page_free() 1963 * to unlock any page it was working 1964 * with. The page_lock()-page_reclaim() 1965 * path falls in the same boat. 1966 * 1967 * We don't need to check on the 1968 * PG_WAIT flag, we have already 1969 * accounted for the page we are 1970 * looking for in page_create_va(). 1971 * 1972 * We just wait a moment to let any 1973 * locked pages on the lists free up, 1974 * then continue around and try again. 1975 * 1976 * Will be awakened by set_freemem(). 1977 */ 1978 mutex_enter(&pcgs_wait_lock); 1979 cv_wait(&pcgs_cv, &pcgs_wait_lock); 1980 mutex_exit(&pcgs_wait_lock); 1981 } 1982 } else { 1983 #ifdef VM_STATS 1984 if (count >= PCGS_TRIES) { 1985 VM_STAT_ADD(pcgs_too_many); 1986 } else { 1987 VM_STAT_ADD(pcgs_counts[count]); 1988 } 1989 #endif 1990 if (locked) { 1991 pcgs_unblock(); 1992 mutex_exit(&pcgs_lock); 1993 } 1994 if (cagelocked) 1995 mutex_exit(&pcgs_cagelock); 1996 return (pp); 1997 } 1998 } 1999 /* 2000 * we go down holding the pcf locks. 2001 */ 2002 panic("no %spage found %d", 2003 ((flags & PG_NORELOC) ? "non-reloc " : ""), count); 2004 /*NOTREACHED*/ 2005 } 2006 2007 /* 2008 * Create enough pages for "bytes" worth of data starting at 2009 * "off" in "vp". 2010 * 2011 * Where flag must be one of: 2012 * 2013 * PG_EXCL: Exclusive create (fail if any page already 2014 * exists in the page cache) which does not 2015 * wait for memory to become available. 2016 * 2017 * PG_WAIT: Non-exclusive create which can wait for 2018 * memory to become available. 2019 * 2020 * PG_PHYSCONTIG: Allocate physically contiguous pages. 2021 * (Not Supported) 2022 * 2023 * A doubly linked list of pages is returned to the caller. Each page 2024 * on the list has the "exclusive" (p_selock) lock and "iolock" (p_iolock) 2025 * lock. 2026 * 2027 * Unable to change the parameters to page_create() in a minor release, 2028 * we renamed page_create() to page_create_va(), changed all known calls 2029 * from page_create() to page_create_va(), and created this wrapper. 2030 * 2031 * Upon a major release, we should break compatibility by deleting this 2032 * wrapper, and replacing all the strings "page_create_va", with "page_create". 2033 * 2034 * NOTE: There is a copy of this interface as page_create_io() in 2035 * i86/vm/vm_machdep.c. Any bugs fixed here should be applied 2036 * there. 2037 */ 2038 page_t * 2039 page_create(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags) 2040 { 2041 caddr_t random_vaddr; 2042 struct seg kseg; 2043 2044 #ifdef DEBUG 2045 cmn_err(CE_WARN, "Using deprecated interface page_create: caller %p", 2046 (void *)caller()); 2047 #endif 2048 2049 random_vaddr = (caddr_t)(((uintptr_t)vp >> 7) ^ 2050 (uintptr_t)(off >> PAGESHIFT)); 2051 kseg.s_as = &kas; 2052 2053 return (page_create_va(vp, off, bytes, flags, &kseg, random_vaddr)); 2054 } 2055 2056 #ifdef DEBUG 2057 uint32_t pg_alloc_pgs_mtbf = 0; 2058 #endif 2059 2060 /* 2061 * Used for large page support. It will attempt to allocate 2062 * a large page(s) off the freelist. 2063 * 2064 * Returns non zero on failure. 2065 */ 2066 int 2067 page_alloc_pages(struct vnode *vp, struct seg *seg, caddr_t addr, 2068 page_t **basepp, page_t *ppa[], uint_t szc, int anypgsz) 2069 { 2070 pgcnt_t npgs, curnpgs, totpgs; 2071 size_t pgsz; 2072 page_t *pplist = NULL, *pp; 2073 int err = 0; 2074 lgrp_t *lgrp; 2075 2076 ASSERT(szc != 0 && szc <= (page_num_pagesizes() - 1)); 2077 2078 VM_STAT_ADD(alloc_pages[0]); 2079 2080 #ifdef DEBUG 2081 if (pg_alloc_pgs_mtbf && !(gethrtime() % pg_alloc_pgs_mtbf)) { 2082 return (ENOMEM); 2083 } 2084 #endif 2085 2086 pgsz = page_get_pagesize(szc); 2087 totpgs = curnpgs = npgs = pgsz >> PAGESHIFT; 2088 2089 ASSERT(((uintptr_t)addr & (pgsz - 1)) == 0); 2090 /* 2091 * One must be NULL but not both. 2092 * And one must be non NULL but not both. 2093 */ 2094 ASSERT(basepp != NULL || ppa != NULL); 2095 ASSERT(basepp == NULL || ppa == NULL); 2096 2097 (void) page_create_wait(npgs, PG_WAIT); 2098 2099 while (npgs && szc) { 2100 lgrp = lgrp_mem_choose(seg, addr, pgsz); 2101 pp = page_get_freelist(vp, 0, seg, addr, pgsz, 0, lgrp); 2102 if (pp != NULL) { 2103 VM_STAT_ADD(alloc_pages[1]); 2104 page_list_concat(&pplist, &pp); 2105 ASSERT(npgs >= curnpgs); 2106 npgs -= curnpgs; 2107 } else if (anypgsz) { 2108 VM_STAT_ADD(alloc_pages[2]); 2109 szc--; 2110 pgsz = page_get_pagesize(szc); 2111 curnpgs = pgsz >> PAGESHIFT; 2112 } else { 2113 VM_STAT_ADD(alloc_pages[3]); 2114 ASSERT(npgs == totpgs); 2115 page_create_putback(npgs); 2116 return (ENOMEM); 2117 } 2118 } 2119 if (szc == 0) { 2120 VM_STAT_ADD(alloc_pages[4]); 2121 ASSERT(npgs != 0); 2122 page_create_putback(npgs); 2123 err = ENOMEM; 2124 } else if (basepp != NULL) { 2125 ASSERT(npgs == 0); 2126 ASSERT(ppa == NULL); 2127 *basepp = pplist; 2128 } 2129 2130 npgs = totpgs - npgs; 2131 pp = pplist; 2132 2133 /* 2134 * Clear the free and age bits. Also if we were passed in a ppa then 2135 * fill it in with all the constituent pages from the large page. But 2136 * if we failed to allocate all the pages just free what we got. 2137 */ 2138 while (npgs != 0) { 2139 ASSERT(PP_ISFREE(pp)); 2140 ASSERT(PP_ISAGED(pp)); 2141 if (ppa != NULL || err != 0) { 2142 if (err == 0) { 2143 VM_STAT_ADD(alloc_pages[5]); 2144 PP_CLRFREE(pp); 2145 PP_CLRAGED(pp); 2146 page_sub(&pplist, pp); 2147 *ppa++ = pp; 2148 npgs--; 2149 } else { 2150 VM_STAT_ADD(alloc_pages[6]); 2151 ASSERT(pp->p_szc != 0); 2152 curnpgs = page_get_pagecnt(pp->p_szc); 2153 page_list_break(&pp, &pplist, curnpgs); 2154 page_list_add_pages(pp, 0); 2155 page_create_putback(curnpgs); 2156 ASSERT(npgs >= curnpgs); 2157 npgs -= curnpgs; 2158 } 2159 pp = pplist; 2160 } else { 2161 VM_STAT_ADD(alloc_pages[7]); 2162 PP_CLRFREE(pp); 2163 PP_CLRAGED(pp); 2164 pp = pp->p_next; 2165 npgs--; 2166 } 2167 } 2168 return (err); 2169 } 2170 2171 /* 2172 * Get a single large page off of the freelists, and set it up for use. 2173 * Number of bytes requested must be a supported page size. 2174 * 2175 * Note that this call may fail even if there is sufficient 2176 * memory available or PG_WAIT is set, so the caller must 2177 * be willing to fallback on page_create_va(), block and retry, 2178 * or fail the requester. 2179 */ 2180 page_t * 2181 page_create_va_large(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags, 2182 struct seg *seg, caddr_t vaddr, void *arg) 2183 { 2184 pgcnt_t npages, pcftotal; 2185 page_t *pp; 2186 page_t *rootpp; 2187 lgrp_t *lgrp; 2188 uint_t enough; 2189 uint_t pcf_index; 2190 uint_t i; 2191 struct pcf *p; 2192 struct pcf *q; 2193 lgrp_id_t *lgrpid = (lgrp_id_t *)arg; 2194 2195 ASSERT(vp != NULL); 2196 2197 ASSERT((flags & ~(PG_EXCL | PG_WAIT | 2198 PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == 0); 2199 /* but no others */ 2200 2201 ASSERT((flags & PG_EXCL) == PG_EXCL); 2202 2203 npages = btop(bytes); 2204 2205 if (!kcage_on || panicstr) { 2206 /* 2207 * Cage is OFF, or we are single threaded in 2208 * panic, so make everything a RELOC request. 2209 */ 2210 flags &= ~PG_NORELOC; 2211 } 2212 2213 /* 2214 * Make sure there's adequate physical memory available. 2215 * Note: PG_WAIT is ignored here. 2216 */ 2217 if (freemem <= throttlefree + npages) { 2218 VM_STAT_ADD(page_create_large_cnt[1]); 2219 return (NULL); 2220 } 2221 2222 /* 2223 * If cage is on, dampen draw from cage when available 2224 * cage space is low. 2225 */ 2226 if ((flags & (PG_NORELOC | PG_WAIT)) == (PG_NORELOC | PG_WAIT) && 2227 kcage_freemem < kcage_throttlefree + npages) { 2228 2229 /* 2230 * The cage is on, the caller wants PG_NORELOC 2231 * pages and available cage memory is very low. 2232 * Call kcage_create_throttle() to attempt to 2233 * control demand on the cage. 2234 */ 2235 if (kcage_create_throttle(npages, flags) == KCT_FAILURE) { 2236 VM_STAT_ADD(page_create_large_cnt[2]); 2237 return (NULL); 2238 } 2239 } 2240 2241 enough = 0; 2242 pcf_index = PCF_INDEX(); 2243 p = &pcf[pcf_index]; 2244 p->pcf_touch = 1; 2245 q = &pcf[PCF_FANOUT]; 2246 for (pcftotal = 0, i = 0; i < PCF_FANOUT; i++) { 2247 if (p->pcf_count > npages) { 2248 /* 2249 * a good one to try. 2250 */ 2251 mutex_enter(&p->pcf_lock); 2252 if (p->pcf_count > npages) { 2253 p->pcf_count -= (uint_t)npages; 2254 /* 2255 * freemem is not protected by any lock. 2256 * Thus, we cannot have any assertion 2257 * containing freemem here. 2258 */ 2259 freemem -= npages; 2260 enough = 1; 2261 mutex_exit(&p->pcf_lock); 2262 break; 2263 } 2264 mutex_exit(&p->pcf_lock); 2265 } 2266 pcftotal += p->pcf_count; 2267 p++; 2268 if (p >= q) { 2269 p = pcf; 2270 } 2271 p->pcf_touch = 1; 2272 } 2273 2274 if (!enough) { 2275 /* If there isn't enough memory available, give up. */ 2276 if (pcftotal < npages) { 2277 VM_STAT_ADD(page_create_large_cnt[3]); 2278 return (NULL); 2279 } 2280 2281 /* try to collect pages from several pcf bins */ 2282 for (p = pcf, pcftotal = 0, i = 0; i < PCF_FANOUT; i++) { 2283 p->pcf_touch = 1; 2284 mutex_enter(&p->pcf_lock); 2285 pcftotal += p->pcf_count; 2286 if (pcftotal >= npages) { 2287 /* 2288 * Wow! There are enough pages laying around 2289 * to satisfy the request. Do the accounting, 2290 * drop the locks we acquired, and go back. 2291 * 2292 * freemem is not protected by any lock. So, 2293 * we cannot have any assertion containing 2294 * freemem. 2295 */ 2296 pgcnt_t tpages = npages; 2297 freemem -= npages; 2298 while (p >= pcf) { 2299 if (p->pcf_count <= tpages) { 2300 tpages -= p->pcf_count; 2301 p->pcf_count = 0; 2302 } else { 2303 p->pcf_count -= (uint_t)tpages; 2304 tpages = 0; 2305 } 2306 mutex_exit(&p->pcf_lock); 2307 p--; 2308 } 2309 ASSERT(tpages == 0); 2310 break; 2311 } 2312 p++; 2313 } 2314 if (i == PCF_FANOUT) { 2315 /* failed to collect pages - release the locks */ 2316 while (--p >= pcf) { 2317 mutex_exit(&p->pcf_lock); 2318 } 2319 VM_STAT_ADD(page_create_large_cnt[4]); 2320 return (NULL); 2321 } 2322 } 2323 2324 /* 2325 * This is where this function behaves fundamentally differently 2326 * than page_create_va(); since we're intending to map the page 2327 * with a single TTE, we have to get it as a physically contiguous 2328 * hardware pagesize chunk. If we can't, we fail. 2329 */ 2330 if (lgrpid != NULL && *lgrpid >= 0 && *lgrpid <= lgrp_alloc_max && 2331 LGRP_EXISTS(lgrp_table[*lgrpid])) 2332 lgrp = lgrp_table[*lgrpid]; 2333 else 2334 lgrp = lgrp_mem_choose(seg, vaddr, bytes); 2335 2336 if ((rootpp = page_get_freelist(&kvp, off, seg, vaddr, 2337 bytes, flags & ~PG_MATCH_COLOR, lgrp)) == NULL) { 2338 page_create_putback(npages); 2339 VM_STAT_ADD(page_create_large_cnt[5]); 2340 return (NULL); 2341 } 2342 2343 /* 2344 * if we got the page with the wrong mtype give it back this is a 2345 * workaround for CR 6249718. When CR 6249718 is fixed we never get 2346 * inside "if" and the workaround becomes just a nop 2347 */ 2348 if (kcage_on && (flags & PG_NORELOC) && !PP_ISNORELOC(rootpp)) { 2349 page_list_add_pages(rootpp, 0); 2350 page_create_putback(npages); 2351 VM_STAT_ADD(page_create_large_cnt[6]); 2352 return (NULL); 2353 } 2354 2355 /* 2356 * If satisfying this request has left us with too little 2357 * memory, start the wheels turning to get some back. The 2358 * first clause of the test prevents waking up the pageout 2359 * daemon in situations where it would decide that there's 2360 * nothing to do. 2361 */ 2362 if (nscan < desscan && freemem < minfree) { 2363 TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL, 2364 "pageout_cv_signal:freemem %ld", freemem); 2365 cv_signal(&proc_pageout->p_cv); 2366 } 2367 2368 pp = rootpp; 2369 while (npages--) { 2370 ASSERT(PAGE_EXCL(pp)); 2371 ASSERT(pp->p_vnode == NULL); 2372 ASSERT(!hat_page_is_mapped(pp)); 2373 PP_CLRFREE(pp); 2374 PP_CLRAGED(pp); 2375 if (!page_hashin(pp, vp, off, NULL)) 2376 panic("page_create_large: hashin failed: page %p", 2377 (void *)pp); 2378 page_io_lock(pp); 2379 off += PAGESIZE; 2380 pp = pp->p_next; 2381 } 2382 2383 VM_STAT_ADD(page_create_large_cnt[0]); 2384 return (rootpp); 2385 } 2386 2387 page_t * 2388 page_create_va(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags, 2389 struct seg *seg, caddr_t vaddr) 2390 { 2391 page_t *plist = NULL; 2392 pgcnt_t npages; 2393 pgcnt_t found_on_free = 0; 2394 pgcnt_t pages_req; 2395 page_t *npp = NULL; 2396 uint_t enough; 2397 uint_t i; 2398 uint_t pcf_index; 2399 struct pcf *p; 2400 struct pcf *q; 2401 lgrp_t *lgrp; 2402 2403 TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_START, 2404 "page_create_start:vp %p off %llx bytes %lu flags %x", 2405 vp, off, bytes, flags); 2406 2407 ASSERT(bytes != 0 && vp != NULL); 2408 2409 if ((flags & PG_EXCL) == 0 && (flags & PG_WAIT) == 0) { 2410 panic("page_create: invalid flags"); 2411 /*NOTREACHED*/ 2412 } 2413 ASSERT((flags & ~(PG_EXCL | PG_WAIT | 2414 PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == 0); 2415 /* but no others */ 2416 2417 pages_req = npages = btopr(bytes); 2418 /* 2419 * Try to see whether request is too large to *ever* be 2420 * satisfied, in order to prevent deadlock. We arbitrarily 2421 * decide to limit maximum size requests to max_page_get. 2422 */ 2423 if (npages >= max_page_get) { 2424 if ((flags & PG_WAIT) == 0) { 2425 TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_TOOBIG, 2426 "page_create_toobig:vp %p off %llx npages " 2427 "%lu max_page_get %lu", 2428 vp, off, npages, max_page_get); 2429 return (NULL); 2430 } else { 2431 cmn_err(CE_WARN, 2432 "Request for too much kernel memory " 2433 "(%lu bytes), will hang forever", bytes); 2434 for (;;) 2435 delay(1000000000); 2436 } 2437 } 2438 2439 if (!kcage_on || panicstr) { 2440 /* 2441 * Cage is OFF, or we are single threaded in 2442 * panic, so make everything a RELOC request. 2443 */ 2444 flags &= ~PG_NORELOC; 2445 } 2446 2447 if (freemem <= throttlefree + npages) 2448 if (!page_create_throttle(npages, flags)) 2449 return (NULL); 2450 2451 /* 2452 * If cage is on, dampen draw from cage when available 2453 * cage space is low. 2454 */ 2455 if ((flags & PG_NORELOC) && 2456 kcage_freemem < kcage_throttlefree + npages) { 2457 2458 /* 2459 * The cage is on, the caller wants PG_NORELOC 2460 * pages and available cage memory is very low. 2461 * Call kcage_create_throttle() to attempt to 2462 * control demand on the cage. 2463 */ 2464 if (kcage_create_throttle(npages, flags) == KCT_FAILURE) 2465 return (NULL); 2466 } 2467 2468 VM_STAT_ADD(page_create_cnt[0]); 2469 2470 enough = 0; 2471 pcf_index = PCF_INDEX(); 2472 2473 p = &pcf[pcf_index]; 2474 p->pcf_touch = 1; 2475 q = &pcf[PCF_FANOUT]; 2476 for (i = 0; i < PCF_FANOUT; i++) { 2477 if (p->pcf_count > npages) { 2478 /* 2479 * a good one to try. 2480 */ 2481 mutex_enter(&p->pcf_lock); 2482 if (p->pcf_count > npages) { 2483 p->pcf_count -= (uint_t)npages; 2484 /* 2485 * freemem is not protected by any lock. 2486 * Thus, we cannot have any assertion 2487 * containing freemem here. 2488 */ 2489 freemem -= npages; 2490 enough = 1; 2491 mutex_exit(&p->pcf_lock); 2492 break; 2493 } 2494 mutex_exit(&p->pcf_lock); 2495 } 2496 p++; 2497 if (p >= q) { 2498 p = pcf; 2499 } 2500 p->pcf_touch = 1; 2501 } 2502 2503 if (!enough) { 2504 /* 2505 * Have to look harder. If npages is greater than 2506 * one, then we might have to coalecse the counters. 2507 * 2508 * Go wait. We come back having accounted 2509 * for the memory. 2510 */ 2511 VM_STAT_ADD(page_create_cnt[1]); 2512 if (!page_create_wait(npages, flags)) { 2513 VM_STAT_ADD(page_create_cnt[2]); 2514 return (NULL); 2515 } 2516 } 2517 2518 TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SUCCESS, 2519 "page_create_success:vp %p off %llx", vp, off); 2520 2521 /* 2522 * If satisfying this request has left us with too little 2523 * memory, start the wheels turning to get some back. The 2524 * first clause of the test prevents waking up the pageout 2525 * daemon in situations where it would decide that there's 2526 * nothing to do. 2527 */ 2528 if (nscan < desscan && freemem < minfree) { 2529 TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL, 2530 "pageout_cv_signal:freemem %ld", freemem); 2531 cv_signal(&proc_pageout->p_cv); 2532 } 2533 2534 /* 2535 * Loop around collecting the requested number of pages. 2536 * Most of the time, we have to `create' a new page. With 2537 * this in mind, pull the page off the free list before 2538 * getting the hash lock. This will minimize the hash 2539 * lock hold time, nesting, and the like. If it turns 2540 * out we don't need the page, we put it back at the end. 2541 */ 2542 while (npages--) { 2543 page_t *pp; 2544 kmutex_t *phm = NULL; 2545 ulong_t index; 2546 2547 index = PAGE_HASH_FUNC(vp, off); 2548 top: 2549 ASSERT(phm == NULL); 2550 ASSERT(index == PAGE_HASH_FUNC(vp, off)); 2551 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); 2552 2553 if (npp == NULL) { 2554 /* 2555 * Try to get a page from the freelist (ie, 2556 * a page with no [vp, off] tag). If that 2557 * fails, use the cachelist. 2558 * 2559 * During the first attempt at both the free 2560 * and cache lists we try for the correct color. 2561 */ 2562 /* 2563 * XXXX-how do we deal with virtual indexed 2564 * caches and and colors? 2565 */ 2566 VM_STAT_ADD(page_create_cnt[4]); 2567 /* 2568 * Get lgroup to allocate next page of shared memory 2569 * from and use it to specify where to allocate 2570 * the physical memory 2571 */ 2572 lgrp = lgrp_mem_choose(seg, vaddr, PAGESIZE); 2573 npp = page_get_freelist(vp, off, seg, vaddr, PAGESIZE, 2574 flags | PG_MATCH_COLOR, lgrp); 2575 if (npp == NULL) { 2576 npp = page_get_cachelist(vp, off, seg, 2577 vaddr, flags | PG_MATCH_COLOR, lgrp); 2578 if (npp == NULL) { 2579 npp = page_create_get_something(vp, 2580 off, seg, vaddr, 2581 flags & ~PG_MATCH_COLOR); 2582 } 2583 2584 if (PP_ISAGED(npp) == 0) { 2585 /* 2586 * Since this page came from the 2587 * cachelist, we must destroy the 2588 * old vnode association. 2589 */ 2590 page_hashout(npp, NULL); 2591 } 2592 } 2593 } 2594 2595 /* 2596 * We own this page! 2597 */ 2598 ASSERT(PAGE_EXCL(npp)); 2599 ASSERT(npp->p_vnode == NULL); 2600 ASSERT(!hat_page_is_mapped(npp)); 2601 PP_CLRFREE(npp); 2602 PP_CLRAGED(npp); 2603 2604 /* 2605 * Here we have a page in our hot little mits and are 2606 * just waiting to stuff it on the appropriate lists. 2607 * Get the mutex and check to see if it really does 2608 * not exist. 2609 */ 2610 phm = PAGE_HASH_MUTEX(index); 2611 mutex_enter(phm); 2612 PAGE_HASH_SEARCH(index, pp, vp, off); 2613 if (pp == NULL) { 2614 VM_STAT_ADD(page_create_new); 2615 pp = npp; 2616 npp = NULL; 2617 if (!page_hashin(pp, vp, off, phm)) { 2618 /* 2619 * Since we hold the page hash mutex and 2620 * just searched for this page, page_hashin 2621 * had better not fail. If it does, that 2622 * means somethread did not follow the 2623 * page hash mutex rules. Panic now and 2624 * get it over with. As usual, go down 2625 * holding all the locks. 2626 */ 2627 ASSERT(MUTEX_HELD(phm)); 2628 panic("page_create: " 2629 "hashin failed %p %p %llx %p", 2630 (void *)pp, (void *)vp, off, (void *)phm); 2631 /*NOTREACHED*/ 2632 } 2633 ASSERT(MUTEX_HELD(phm)); 2634 mutex_exit(phm); 2635 phm = NULL; 2636 2637 /* 2638 * Hat layer locking need not be done to set 2639 * the following bits since the page is not hashed 2640 * and was on the free list (i.e., had no mappings). 2641 * 2642 * Set the reference bit to protect 2643 * against immediate pageout 2644 * 2645 * XXXmh modify freelist code to set reference 2646 * bit so we don't have to do it here. 2647 */ 2648 page_set_props(pp, P_REF); 2649 found_on_free++; 2650 } else { 2651 VM_STAT_ADD(page_create_exists); 2652 if (flags & PG_EXCL) { 2653 /* 2654 * Found an existing page, and the caller 2655 * wanted all new pages. Undo all of the work 2656 * we have done. 2657 */ 2658 mutex_exit(phm); 2659 phm = NULL; 2660 while (plist != NULL) { 2661 pp = plist; 2662 page_sub(&plist, pp); 2663 page_io_unlock(pp); 2664 /* large pages should not end up here */ 2665 ASSERT(pp->p_szc == 0); 2666 /*LINTED: constant in conditional ctx*/ 2667 VN_DISPOSE(pp, B_INVAL, 0, kcred); 2668 } 2669 VM_STAT_ADD(page_create_found_one); 2670 goto fail; 2671 } 2672 ASSERT(flags & PG_WAIT); 2673 if (!page_lock(pp, SE_EXCL, phm, P_NO_RECLAIM)) { 2674 /* 2675 * Start all over again if we blocked trying 2676 * to lock the page. 2677 */ 2678 mutex_exit(phm); 2679 VM_STAT_ADD(page_create_page_lock_failed); 2680 phm = NULL; 2681 goto top; 2682 } 2683 mutex_exit(phm); 2684 phm = NULL; 2685 2686 if (PP_ISFREE(pp)) { 2687 ASSERT(PP_ISAGED(pp) == 0); 2688 VM_STAT_ADD(pagecnt.pc_get_cache); 2689 page_list_sub(pp, PG_CACHE_LIST); 2690 PP_CLRFREE(pp); 2691 found_on_free++; 2692 } 2693 } 2694 2695 /* 2696 * Got a page! It is locked. Acquire the i/o 2697 * lock since we are going to use the p_next and 2698 * p_prev fields to link the requested pages together. 2699 */ 2700 page_io_lock(pp); 2701 page_add(&plist, pp); 2702 plist = plist->p_next; 2703 off += PAGESIZE; 2704 vaddr += PAGESIZE; 2705 } 2706 2707 ASSERT((flags & PG_EXCL) ? (found_on_free == pages_req) : 1); 2708 fail: 2709 if (npp != NULL) { 2710 /* 2711 * Did not need this page after all. 2712 * Put it back on the free list. 2713 */ 2714 VM_STAT_ADD(page_create_putbacks); 2715 PP_SETFREE(npp); 2716 PP_SETAGED(npp); 2717 npp->p_offset = (u_offset_t)-1; 2718 page_list_add(npp, PG_FREE_LIST | PG_LIST_TAIL); 2719 page_unlock(npp); 2720 2721 } 2722 2723 ASSERT(pages_req >= found_on_free); 2724 2725 { 2726 uint_t overshoot = (uint_t)(pages_req - found_on_free); 2727 2728 if (overshoot) { 2729 VM_STAT_ADD(page_create_overshoot); 2730 p = &pcf[pcf_index]; 2731 p->pcf_touch = 1; 2732 mutex_enter(&p->pcf_lock); 2733 if (p->pcf_block) { 2734 p->pcf_reserve += overshoot; 2735 } else { 2736 p->pcf_count += overshoot; 2737 if (p->pcf_wait) { 2738 mutex_enter(&new_freemem_lock); 2739 if (freemem_wait) { 2740 cv_signal(&freemem_cv); 2741 p->pcf_wait--; 2742 } else { 2743 p->pcf_wait = 0; 2744 } 2745 mutex_exit(&new_freemem_lock); 2746 } 2747 } 2748 mutex_exit(&p->pcf_lock); 2749 /* freemem is approximate, so this test OK */ 2750 if (!p->pcf_block) 2751 freemem += overshoot; 2752 } 2753 } 2754 2755 return (plist); 2756 } 2757 2758 /* 2759 * One or more constituent pages of this large page has been marked 2760 * toxic. Simply demote the large page to PAGESIZE pages and let 2761 * page_free() handle it. This routine should only be called by 2762 * large page free routines (page_free_pages() and page_destroy_pages(). 2763 * All pages are locked SE_EXCL and have already been marked free. 2764 */ 2765 static void 2766 page_free_toxic_pages(page_t *rootpp) 2767 { 2768 page_t *tpp; 2769 pgcnt_t i, pgcnt = page_get_pagecnt(rootpp->p_szc); 2770 uint_t szc = rootpp->p_szc; 2771 2772 for (i = 0, tpp = rootpp; i < pgcnt; i++, tpp = tpp->p_next) { 2773 ASSERT(tpp->p_szc == szc); 2774 ASSERT((PAGE_EXCL(tpp) && 2775 !page_iolock_assert(tpp)) || panicstr); 2776 tpp->p_szc = 0; 2777 } 2778 2779 while (rootpp != NULL) { 2780 tpp = rootpp; 2781 page_sub(&rootpp, tpp); 2782 ASSERT(PP_ISFREE(tpp)); 2783 PP_CLRFREE(tpp); 2784 page_free(tpp, 1); 2785 } 2786 } 2787 2788 /* 2789 * Put page on the "free" list. 2790 * The free list is really two lists maintained by 2791 * the PSM of whatever machine we happen to be on. 2792 */ 2793 void 2794 page_free(page_t *pp, int dontneed) 2795 { 2796 struct pcf *p; 2797 uint_t pcf_index; 2798 2799 ASSERT((PAGE_EXCL(pp) && 2800 !page_iolock_assert(pp)) || panicstr); 2801 2802 if (page_deteriorating(pp)) { 2803 volatile int i = 0; 2804 char *kaddr; 2805 volatile int rb, wb; 2806 uint64_t pa; 2807 volatile int ue = 0; 2808 on_trap_data_t otd; 2809 2810 if (pp->p_vnode != NULL) { 2811 /* 2812 * Let page_destroy() do its bean counting and 2813 * hash out the page; it will then call back 2814 * into page_free() with pp->p_vnode == NULL. 2815 */ 2816 page_destroy(pp, 0); 2817 return; 2818 } 2819 2820 if (page_isfailing(pp)) { 2821 /* 2822 * If we have already exceeded the limit for 2823 * pages retired, we will treat this page as 2824 * 'toxic' rather than failing. That will ensure 2825 * that the page is at least cleaned, and if 2826 * a UE is detected, the page will be retired 2827 * anyway. 2828 */ 2829 if (pages_retired_limit_exceeded()) { 2830 /* 2831 * clear the flag and reset to toxic 2832 */ 2833 page_clrtoxic(pp); 2834 page_settoxic(pp, PAGE_IS_TOXIC); 2835 } else { 2836 pa = ptob((uint64_t)page_pptonum(pp)); 2837 if (page_retire_messages) { 2838 cmn_err(CE_NOTE, "Page 0x%08x.%08x " 2839 "removed from service", 2840 (uint32_t)(pa >> 32), (uint32_t)pa); 2841 } 2842 goto page_failed; 2843 } 2844 } 2845 2846 pagescrub(pp, 0, PAGESIZE); 2847 2848 /* 2849 * We want to determine whether the error that occurred on 2850 * this page is transient or persistent, so we get a mapping 2851 * to the page and try every possible bit pattern to compare 2852 * what we write with what we read back. A smaller number 2853 * of bit patterns might suffice, but there's no point in 2854 * getting fancy. If this is the hot path on your system, 2855 * you've got bigger problems. 2856 */ 2857 kaddr = ppmapin(pp, PROT_READ | PROT_WRITE, (caddr_t)-1); 2858 for (wb = 0xff; wb >= 0; wb--) { 2859 if (on_trap(&otd, OT_DATA_EC)) { 2860 pa = ptob((uint64_t)page_pptonum(pp)) + i; 2861 page_settoxic(pp, PAGE_IS_FAILING); 2862 2863 if (page_retire_messages) { 2864 cmn_err(CE_WARN, "Uncorrectable Error " 2865 "occurred at PA 0x%08x.%08x while " 2866 "attempting to clear previously " 2867 "reported error; page removed from " 2868 "service", (uint32_t)(pa >> 32), 2869 (uint32_t)pa); 2870 } 2871 2872 ue++; 2873 break; 2874 } 2875 2876 /* 2877 * Write out the bit pattern, flush it to memory, and 2878 * read it back while under on_trap() protection. 2879 */ 2880 for (i = 0; i < PAGESIZE; i++) 2881 kaddr[i] = wb; 2882 2883 sync_data_memory(kaddr, PAGESIZE); 2884 2885 for (i = 0; i < PAGESIZE; i++) { 2886 if ((rb = (uchar_t)kaddr[i]) != wb) { 2887 page_settoxic(pp, PAGE_IS_FAILING); 2888 goto out; 2889 } 2890 } 2891 } 2892 out: 2893 no_trap(); 2894 ppmapout(kaddr); 2895 2896 if (wb >= 0 && !ue) { 2897 pa = ptob((uint64_t)page_pptonum(pp)) + i; 2898 if (page_retire_messages) { 2899 cmn_err(CE_WARN, "Data Mismatch occurred at PA " 2900 "0x%08x.%08x [ 0x%x != 0x%x ] while " 2901 "attempting to clear previously reported " 2902 "error; page removed from service", 2903 (uint32_t)(pa >> 32), (uint32_t)pa, rb, wb); 2904 } 2905 } 2906 page_failed: 2907 /* 2908 * DR operations change the association between a page_t 2909 * and the physical page it represents. Check if the 2910 * page is still bad. If it is, then retire it. 2911 */ 2912 if (page_isfaulty(pp) && page_isfailing(pp)) { 2913 /* 2914 * In the future, it might be useful to have a platform 2915 * callback here to tell the hardware to fence off this 2916 * page during the next reboot. 2917 * 2918 * We move the page to the retired_vnode here 2919 */ 2920 (void) page_hashin(pp, &retired_ppages, 2921 (u_offset_t)ptob((uint64_t)page_pptonum(pp)), NULL); 2922 mutex_enter(&freemem_lock); 2923 availrmem--; 2924 mutex_exit(&freemem_lock); 2925 page_retired(pp); 2926 page_downgrade(pp); 2927 2928 /* 2929 * If DR raced with the above page retirement code, 2930 * we might have retired a good page. If so, unretire 2931 * the page. 2932 */ 2933 if (!page_isfaulty(pp)) 2934 page_unretire_pages(); 2935 return; 2936 } 2937 2938 pa = ptob((uint64_t)page_pptonum(pp)); 2939 2940 if (page_retire_messages) { 2941 cmn_err(CE_NOTE, "Previously reported error on page " 2942 "0x%08x.%08x cleared", (uint32_t)(pa >> 32), 2943 (uint32_t)pa); 2944 } 2945 2946 page_clrtoxic(pp); 2947 } 2948 2949 if (PP_ISFREE(pp)) { 2950 panic("page_free: page %p is free", (void *)pp); 2951 } 2952 2953 if (pp->p_szc != 0) { 2954 if (pp->p_vnode == NULL || IS_SWAPFSVP(pp->p_vnode) || 2955 pp->p_vnode == &kvp) { 2956 panic("page_free: anon or kernel " 2957 "or no vnode large page %p", (void *)pp); 2958 } 2959 page_demote_vp_pages(pp); 2960 ASSERT(pp->p_szc == 0); 2961 } 2962 2963 /* 2964 * The page_struct_lock need not be acquired to examine these 2965 * fields since the page has an "exclusive" lock. 2966 */ 2967 if (hat_page_is_mapped(pp) || pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { 2968 panic("page_free pp=%p, pfn=%lx, lckcnt=%d, cowcnt=%d", 2969 pp, page_pptonum(pp), pp->p_lckcnt, pp->p_cowcnt); 2970 /*NOTREACHED*/ 2971 } 2972 2973 ASSERT(!hat_page_getshare(pp)); 2974 2975 PP_SETFREE(pp); 2976 ASSERT(pp->p_vnode == NULL || !IS_VMODSORT(pp->p_vnode) || 2977 !hat_ismod(pp)); 2978 page_clr_all_props(pp); 2979 ASSERT(!hat_page_getshare(pp)); 2980 2981 /* 2982 * Now we add the page to the head of the free list. 2983 * But if this page is associated with a paged vnode 2984 * then we adjust the head forward so that the page is 2985 * effectively at the end of the list. 2986 */ 2987 if (pp->p_vnode == NULL) { 2988 /* 2989 * Page has no identity, put it on the free list. 2990 */ 2991 PP_SETAGED(pp); 2992 pp->p_offset = (u_offset_t)-1; 2993 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL); 2994 VM_STAT_ADD(pagecnt.pc_free_free); 2995 TRACE_1(TR_FAC_VM, TR_PAGE_FREE_FREE, 2996 "page_free_free:pp %p", pp); 2997 } else { 2998 PP_CLRAGED(pp); 2999 3000 if (!dontneed || nopageage) { 3001 /* move it to the tail of the list */ 3002 page_list_add(pp, PG_CACHE_LIST | PG_LIST_TAIL); 3003 3004 VM_STAT_ADD(pagecnt.pc_free_cache); 3005 TRACE_1(TR_FAC_VM, TR_PAGE_FREE_CACHE_TAIL, 3006 "page_free_cache_tail:pp %p", pp); 3007 } else { 3008 page_list_add(pp, PG_CACHE_LIST | PG_LIST_HEAD); 3009 3010 VM_STAT_ADD(pagecnt.pc_free_dontneed); 3011 TRACE_1(TR_FAC_VM, TR_PAGE_FREE_CACHE_HEAD, 3012 "page_free_cache_head:pp %p", pp); 3013 } 3014 } 3015 page_unlock(pp); 3016 3017 /* 3018 * Now do the `freemem' accounting. 3019 */ 3020 pcf_index = PCF_INDEX(); 3021 p = &pcf[pcf_index]; 3022 p->pcf_touch = 1; 3023 3024 mutex_enter(&p->pcf_lock); 3025 if (p->pcf_block) { 3026 p->pcf_reserve += 1; 3027 } else { 3028 p->pcf_count += 1; 3029 if (p->pcf_wait) { 3030 mutex_enter(&new_freemem_lock); 3031 /* 3032 * Check to see if some other thread 3033 * is actually waiting. Another bucket 3034 * may have woken it up by now. If there 3035 * are no waiters, then set our pcf_wait 3036 * count to zero to avoid coming in here 3037 * next time. Also, since only one page 3038 * was put on the free list, just wake 3039 * up one waiter. 3040 */ 3041 if (freemem_wait) { 3042 cv_signal(&freemem_cv); 3043 p->pcf_wait--; 3044 } else { 3045 p->pcf_wait = 0; 3046 } 3047 mutex_exit(&new_freemem_lock); 3048 } 3049 } 3050 mutex_exit(&p->pcf_lock); 3051 3052 /* freemem is approximate, so this test OK */ 3053 if (!p->pcf_block) 3054 freemem += 1; 3055 } 3056 3057 /* 3058 * Put page on the "free" list during intial startup. 3059 * This happens during initial single threaded execution. 3060 */ 3061 void 3062 page_free_at_startup(page_t *pp) 3063 { 3064 struct pcf *p; 3065 uint_t pcf_index; 3066 3067 page_list_add(pp, PG_FREE_LIST | PG_LIST_HEAD | PG_LIST_ISINIT); 3068 VM_STAT_ADD(pagecnt.pc_free_free); 3069 3070 /* 3071 * Now do the `freemem' accounting. 3072 */ 3073 pcf_index = PCF_INDEX(); 3074 p = &pcf[pcf_index]; 3075 p->pcf_touch = 1; 3076 3077 ASSERT(p->pcf_block == 0); 3078 ASSERT(p->pcf_wait == 0); 3079 p->pcf_count += 1; 3080 3081 /* freemem is approximate, so this is OK */ 3082 freemem += 1; 3083 } 3084 3085 void 3086 page_free_pages(page_t *pp) 3087 { 3088 page_t *tpp, *rootpp = NULL; 3089 pgcnt_t pgcnt = page_get_pagecnt(pp->p_szc); 3090 pgcnt_t i; 3091 uint_t szc = pp->p_szc; 3092 int toxic = 0; 3093 3094 VM_STAT_ADD(pagecnt.pc_free_pages); 3095 TRACE_1(TR_FAC_VM, TR_PAGE_FREE_FREE, 3096 "page_free_free:pp %p", pp); 3097 3098 ASSERT(pp->p_szc != 0 && pp->p_szc < page_num_pagesizes()); 3099 if ((page_pptonum(pp) & (pgcnt - 1)) != 0) { 3100 panic("page_free_pages: not root page %p", (void *)pp); 3101 /*NOTREACHED*/ 3102 } 3103 3104 for (i = 0, tpp = pp; i < pgcnt; i++, tpp++) { 3105 ASSERT((PAGE_EXCL(tpp) && 3106 !page_iolock_assert(tpp)) || panicstr); 3107 if (PP_ISFREE(tpp)) { 3108 panic("page_free_pages: page %p is free", (void *)tpp); 3109 /*NOTREACHED*/ 3110 } 3111 if (hat_page_is_mapped(tpp) || tpp->p_lckcnt != 0 || 3112 tpp->p_cowcnt != 0) { 3113 panic("page_free_pages %p", (void *)tpp); 3114 /*NOTREACHED*/ 3115 } 3116 3117 ASSERT(!hat_page_getshare(tpp)); 3118 ASSERT(tpp->p_vnode == NULL); 3119 ASSERT(tpp->p_szc == szc); 3120 3121 if (page_deteriorating(tpp)) 3122 toxic = 1; 3123 3124 PP_SETFREE(tpp); 3125 page_clr_all_props(tpp); 3126 PP_SETAGED(tpp); 3127 tpp->p_offset = (u_offset_t)-1; 3128 ASSERT(tpp->p_next == tpp); 3129 ASSERT(tpp->p_prev == tpp); 3130 page_list_concat(&rootpp, &tpp); 3131 } 3132 ASSERT(rootpp == pp); 3133 3134 if (toxic) { 3135 page_free_toxic_pages(rootpp); 3136 return; 3137 } 3138 page_list_add_pages(rootpp, 0); 3139 page_create_putback(pgcnt); 3140 } 3141 3142 int free_pages = 1; 3143 3144 /* 3145 * This routine attempts to return pages to the cachelist via page_release(). 3146 * It does not *have* to be successful in all cases, since the pageout scanner 3147 * will catch any pages it misses. It does need to be fast and not introduce 3148 * too much overhead. 3149 * 3150 * If a page isn't found on the unlocked sweep of the page_hash bucket, we 3151 * don't lock and retry. This is ok, since the page scanner will eventually 3152 * find any page we miss in free_vp_pages(). 3153 */ 3154 void 3155 free_vp_pages(vnode_t *vp, u_offset_t off, size_t len) 3156 { 3157 page_t *pp; 3158 u_offset_t eoff; 3159 extern int swap_in_range(vnode_t *, u_offset_t, size_t); 3160 3161 eoff = off + len; 3162 3163 if (free_pages == 0) 3164 return; 3165 if (swap_in_range(vp, off, len)) 3166 return; 3167 3168 for (; off < eoff; off += PAGESIZE) { 3169 3170 /* 3171 * find the page using a fast, but inexact search. It'll be OK 3172 * if a few pages slip through the cracks here. 3173 */ 3174 pp = page_exists(vp, off); 3175 3176 /* 3177 * If we didn't find the page (it may not exist), the page 3178 * is free, looks still in use (shared), or we can't lock it, 3179 * just give up. 3180 */ 3181 if (pp == NULL || 3182 PP_ISFREE(pp) || 3183 page_share_cnt(pp) > 0 || 3184 !page_trylock(pp, SE_EXCL)) 3185 continue; 3186 3187 /* 3188 * Once we have locked pp, verify that it's still the 3189 * correct page and not already free 3190 */ 3191 ASSERT(PAGE_LOCKED_SE(pp, SE_EXCL)); 3192 if (pp->p_vnode != vp || pp->p_offset != off || PP_ISFREE(pp)) { 3193 page_unlock(pp); 3194 continue; 3195 } 3196 3197 /* 3198 * try to release the page... 3199 */ 3200 (void) page_release(pp, 1); 3201 } 3202 } 3203 3204 /* 3205 * Reclaim the given page from the free list. 3206 * Returns 1 on success or 0 on failure. 3207 * 3208 * The page is unlocked if it can't be reclaimed (when freemem == 0). 3209 * If `lock' is non-null, it will be dropped and re-acquired if 3210 * the routine must wait while freemem is 0. 3211 * 3212 * As it turns out, boot_getpages() does this. It picks a page, 3213 * based on where OBP mapped in some address, gets its pfn, searches 3214 * the memsegs, locks the page, then pulls it off the free list! 3215 */ 3216 int 3217 page_reclaim(page_t *pp, kmutex_t *lock) 3218 { 3219 struct pcf *p; 3220 uint_t pcf_index; 3221 struct cpu *cpup; 3222 int enough; 3223 uint_t i; 3224 3225 ASSERT(lock != NULL ? MUTEX_HELD(lock) : 1); 3226 ASSERT(PAGE_EXCL(pp) && PP_ISFREE(pp)); 3227 ASSERT(pp->p_szc == 0); 3228 3229 /* 3230 * If `freemem' is 0, we cannot reclaim this page from the 3231 * freelist, so release every lock we might hold: the page, 3232 * and the `lock' before blocking. 3233 * 3234 * The only way `freemem' can become 0 while there are pages 3235 * marked free (have their p->p_free bit set) is when the 3236 * system is low on memory and doing a page_create(). In 3237 * order to guarantee that once page_create() starts acquiring 3238 * pages it will be able to get all that it needs since `freemem' 3239 * was decreased by the requested amount. So, we need to release 3240 * this page, and let page_create() have it. 3241 * 3242 * Since `freemem' being zero is not supposed to happen, just 3243 * use the usual hash stuff as a starting point. If that bucket 3244 * is empty, then assume the worst, and start at the beginning 3245 * of the pcf array. If we always start at the beginning 3246 * when acquiring more than one pcf lock, there won't be any 3247 * deadlock problems. 3248 */ 3249 3250 /* TODO: Do we need to test kcage_freemem if PG_NORELOC(pp)? */ 3251 3252 if (freemem <= throttlefree && !page_create_throttle(1l, 0)) { 3253 pcf_acquire_all(); 3254 goto page_reclaim_nomem; 3255 } 3256 3257 enough = 0; 3258 pcf_index = PCF_INDEX(); 3259 p = &pcf[pcf_index]; 3260 p->pcf_touch = 1; 3261 mutex_enter(&p->pcf_lock); 3262 if (p->pcf_count >= 1) { 3263 enough = 1; 3264 p->pcf_count--; 3265 } 3266 mutex_exit(&p->pcf_lock); 3267 3268 if (!enough) { 3269 VM_STAT_ADD(page_reclaim_zero); 3270 /* 3271 * Check again. Its possible that some other thread 3272 * could have been right behind us, and added one 3273 * to a list somewhere. Acquire each of the pcf locks 3274 * until we find a page. 3275 */ 3276 p = pcf; 3277 for (i = 0; i < PCF_FANOUT; i++) { 3278 p->pcf_touch = 1; 3279 mutex_enter(&p->pcf_lock); 3280 if (p->pcf_count >= 1) { 3281 p->pcf_count -= 1; 3282 enough = 1; 3283 break; 3284 } 3285 p++; 3286 } 3287 3288 if (!enough) { 3289 page_reclaim_nomem: 3290 /* 3291 * We really can't have page `pp'. 3292 * Time for the no-memory dance with 3293 * page_free(). This is just like 3294 * page_create_wait(). Plus the added 3295 * attraction of releasing whatever mutex 3296 * we held when we were called with in `lock'. 3297 * Page_unlock() will wakeup any thread 3298 * waiting around for this page. 3299 */ 3300 if (lock) { 3301 VM_STAT_ADD(page_reclaim_zero_locked); 3302 mutex_exit(lock); 3303 } 3304 page_unlock(pp); 3305 3306 /* 3307 * get this before we drop all the pcf locks. 3308 */ 3309 mutex_enter(&new_freemem_lock); 3310 3311 p = pcf; 3312 for (i = 0; i < PCF_FANOUT; i++) { 3313 p->pcf_wait++; 3314 mutex_exit(&p->pcf_lock); 3315 p++; 3316 } 3317 3318 freemem_wait++; 3319 cv_wait(&freemem_cv, &new_freemem_lock); 3320 freemem_wait--; 3321 3322 mutex_exit(&new_freemem_lock); 3323 3324 if (lock) { 3325 mutex_enter(lock); 3326 } 3327 return (0); 3328 } 3329 3330 /* 3331 * There was a page to be found. 3332 * The pcf accounting has been done, 3333 * though none of the pcf_wait flags have been set, 3334 * drop the locks and continue on. 3335 */ 3336 while (p >= pcf) { 3337 mutex_exit(&p->pcf_lock); 3338 p--; 3339 } 3340 } 3341 3342 /* 3343 * freemem is not protected by any lock. Thus, we cannot 3344 * have any assertion containing freemem here. 3345 */ 3346 freemem -= 1; 3347 3348 VM_STAT_ADD(pagecnt.pc_reclaim); 3349 if (PP_ISAGED(pp)) { 3350 page_list_sub(pp, PG_FREE_LIST); 3351 TRACE_1(TR_FAC_VM, TR_PAGE_UNFREE_FREE, 3352 "page_reclaim_free:pp %p", pp); 3353 } else { 3354 page_list_sub(pp, PG_CACHE_LIST); 3355 TRACE_1(TR_FAC_VM, TR_PAGE_UNFREE_CACHE, 3356 "page_reclaim_cache:pp %p", pp); 3357 } 3358 3359 /* 3360 * clear the p_free & p_age bits since this page is no longer 3361 * on the free list. Notice that there was a brief time where 3362 * a page is marked as free, but is not on the list. 3363 * 3364 * Set the reference bit to protect against immediate pageout. 3365 */ 3366 PP_CLRFREE(pp); 3367 PP_CLRAGED(pp); 3368 page_set_props(pp, P_REF); 3369 3370 CPU_STATS_ENTER_K(); 3371 cpup = CPU; /* get cpup now that CPU cannot change */ 3372 CPU_STATS_ADDQ(cpup, vm, pgrec, 1); 3373 CPU_STATS_ADDQ(cpup, vm, pgfrec, 1); 3374 CPU_STATS_EXIT_K(); 3375 3376 return (1); 3377 } 3378 3379 3380 3381 /* 3382 * Destroy identity of the page and put it back on 3383 * the page free list. Assumes that the caller has 3384 * acquired the "exclusive" lock on the page. 3385 */ 3386 void 3387 page_destroy(page_t *pp, int dontfree) 3388 { 3389 ASSERT((PAGE_EXCL(pp) && 3390 !page_iolock_assert(pp)) || panicstr); 3391 3392 if (pp->p_szc != 0) { 3393 if (pp->p_vnode == NULL || IS_SWAPFSVP(pp->p_vnode) || 3394 pp->p_vnode == &kvp) { 3395 panic("page_destroy: anon or kernel or no vnode " 3396 "large page %p", (void *)pp); 3397 } 3398 page_demote_vp_pages(pp); 3399 ASSERT(pp->p_szc == 0); 3400 } 3401 3402 TRACE_1(TR_FAC_VM, TR_PAGE_DESTROY, "page_destroy:pp %p", pp); 3403 3404 /* 3405 * Unload translations, if any, then hash out the 3406 * page to erase its identity. 3407 */ 3408 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 3409 page_hashout(pp, NULL); 3410 3411 if (!dontfree) { 3412 /* 3413 * Acquire the "freemem_lock" for availrmem. 3414 * The page_struct_lock need not be acquired for lckcnt 3415 * and cowcnt since the page has an "exclusive" lock. 3416 */ 3417 if ((pp->p_lckcnt != 0) || (pp->p_cowcnt != 0)) { 3418 mutex_enter(&freemem_lock); 3419 if (pp->p_lckcnt != 0) { 3420 availrmem++; 3421 pp->p_lckcnt = 0; 3422 } 3423 if (pp->p_cowcnt != 0) { 3424 availrmem += pp->p_cowcnt; 3425 pp->p_cowcnt = 0; 3426 } 3427 mutex_exit(&freemem_lock); 3428 } 3429 /* 3430 * Put the page on the "free" list. 3431 */ 3432 page_free(pp, 0); 3433 } 3434 } 3435 3436 void 3437 page_destroy_pages(page_t *pp) 3438 { 3439 3440 page_t *tpp, *rootpp = NULL; 3441 pgcnt_t pgcnt = page_get_pagecnt(pp->p_szc); 3442 pgcnt_t i, pglcks = 0; 3443 uint_t szc = pp->p_szc; 3444 int toxic = 0; 3445 3446 ASSERT(pp->p_szc != 0 && pp->p_szc < page_num_pagesizes()); 3447 3448 VM_STAT_ADD(pagecnt.pc_destroy_pages); 3449 3450 TRACE_1(TR_FAC_VM, TR_PAGE_DESTROY, "page_destroy_pages:pp %p", pp); 3451 3452 if ((page_pptonum(pp) & (pgcnt - 1)) != 0) { 3453 panic("page_destroy_pages: not root page %p", (void *)pp); 3454 /*NOTREACHED*/ 3455 } 3456 3457 for (i = 0, tpp = pp; i < pgcnt; i++, tpp++) { 3458 ASSERT((PAGE_EXCL(tpp) && 3459 !page_iolock_assert(tpp)) || panicstr); 3460 (void) hat_pageunload(tpp, HAT_FORCE_PGUNLOAD); 3461 page_hashout(tpp, NULL); 3462 ASSERT(tpp->p_offset == (u_offset_t)-1); 3463 if (tpp->p_lckcnt != 0) { 3464 pglcks++; 3465 tpp->p_lckcnt = 0; 3466 } else if (tpp->p_cowcnt != 0) { 3467 pglcks += tpp->p_cowcnt; 3468 tpp->p_cowcnt = 0; 3469 } 3470 ASSERT(!hat_page_getshare(tpp)); 3471 ASSERT(tpp->p_vnode == NULL); 3472 ASSERT(tpp->p_szc == szc); 3473 3474 if (page_deteriorating(tpp)) 3475 toxic = 1; 3476 3477 PP_SETFREE(tpp); 3478 page_clr_all_props(tpp); 3479 PP_SETAGED(tpp); 3480 ASSERT(tpp->p_next == tpp); 3481 ASSERT(tpp->p_prev == tpp); 3482 page_list_concat(&rootpp, &tpp); 3483 } 3484 3485 ASSERT(rootpp == pp); 3486 if (pglcks != 0) { 3487 mutex_enter(&freemem_lock); 3488 availrmem += pglcks; 3489 mutex_exit(&freemem_lock); 3490 } 3491 3492 if (toxic) { 3493 page_free_toxic_pages(rootpp); 3494 return; 3495 } 3496 page_list_add_pages(rootpp, 0); 3497 page_create_putback(pgcnt); 3498 } 3499 3500 /* 3501 * Similar to page_destroy(), but destroys pages which are 3502 * locked and known to be on the page free list. Since 3503 * the page is known to be free and locked, no one can access 3504 * it. 3505 * 3506 * Also, the number of free pages does not change. 3507 */ 3508 void 3509 page_destroy_free(page_t *pp) 3510 { 3511 ASSERT(PAGE_EXCL(pp)); 3512 ASSERT(PP_ISFREE(pp)); 3513 ASSERT(pp->p_vnode); 3514 ASSERT(hat_page_getattr(pp, P_MOD | P_REF | P_RO) == 0); 3515 ASSERT(!hat_page_is_mapped(pp)); 3516 ASSERT(PP_ISAGED(pp) == 0); 3517 ASSERT(pp->p_szc == 0); 3518 3519 VM_STAT_ADD(pagecnt.pc_destroy_free); 3520 page_list_sub(pp, PG_CACHE_LIST); 3521 3522 page_hashout(pp, NULL); 3523 ASSERT(pp->p_vnode == NULL); 3524 ASSERT(pp->p_offset == (u_offset_t)-1); 3525 ASSERT(pp->p_hash == NULL); 3526 3527 PP_SETAGED(pp); 3528 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL); 3529 page_unlock(pp); 3530 3531 mutex_enter(&new_freemem_lock); 3532 if (freemem_wait) { 3533 cv_signal(&freemem_cv); 3534 } 3535 mutex_exit(&new_freemem_lock); 3536 } 3537 3538 /* 3539 * Rename the page "opp" to have an identity specified 3540 * by [vp, off]. If a page already exists with this name 3541 * it is locked and destroyed. Note that the page's 3542 * translations are not unloaded during the rename. 3543 * 3544 * This routine is used by the anon layer to "steal" the 3545 * original page and is not unlike destroying a page and 3546 * creating a new page using the same page frame. 3547 * 3548 * XXX -- Could deadlock if caller 1 tries to rename A to B while 3549 * caller 2 tries to rename B to A. 3550 */ 3551 void 3552 page_rename(page_t *opp, vnode_t *vp, u_offset_t off) 3553 { 3554 page_t *pp; 3555 int olckcnt = 0; 3556 int ocowcnt = 0; 3557 kmutex_t *phm; 3558 ulong_t index; 3559 3560 ASSERT(PAGE_EXCL(opp) && !page_iolock_assert(opp)); 3561 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); 3562 ASSERT(PP_ISFREE(opp) == 0); 3563 3564 VM_STAT_ADD(page_rename_count); 3565 3566 TRACE_3(TR_FAC_VM, TR_PAGE_RENAME, 3567 "page rename:pp %p vp %p off %llx", opp, vp, off); 3568 3569 /* 3570 * CacheFS may call page_rename for a large NFS page 3571 * when both CacheFS and NFS mount points are used 3572 * by applications. Demote this large page before 3573 * renaming it, to ensure that there are no "partial" 3574 * large pages left lying around. 3575 */ 3576 if (opp->p_szc != 0) { 3577 vnode_t *ovp = opp->p_vnode; 3578 ASSERT(ovp != NULL); 3579 ASSERT(!IS_SWAPFSVP(ovp)); 3580 ASSERT(ovp != &kvp); 3581 page_demote_vp_pages(opp); 3582 ASSERT(opp->p_szc == 0); 3583 } 3584 3585 page_hashout(opp, NULL); 3586 PP_CLRAGED(opp); 3587 3588 /* 3589 * Acquire the appropriate page hash lock, since 3590 * we're going to rename the page. 3591 */ 3592 index = PAGE_HASH_FUNC(vp, off); 3593 phm = PAGE_HASH_MUTEX(index); 3594 mutex_enter(phm); 3595 top: 3596 /* 3597 * Look for an existing page with this name and destroy it if found. 3598 * By holding the page hash lock all the way to the page_hashin() 3599 * call, we are assured that no page can be created with this 3600 * identity. In the case when the phm lock is dropped to undo any 3601 * hat layer mappings, the existing page is held with an "exclusive" 3602 * lock, again preventing another page from being created with 3603 * this identity. 3604 */ 3605 PAGE_HASH_SEARCH(index, pp, vp, off); 3606 if (pp != NULL) { 3607 VM_STAT_ADD(page_rename_exists); 3608 3609 /* 3610 * As it turns out, this is one of only two places where 3611 * page_lock() needs to hold the passed in lock in the 3612 * successful case. In all of the others, the lock could 3613 * be dropped as soon as the attempt is made to lock 3614 * the page. It is tempting to add yet another arguement, 3615 * PL_KEEP or PL_DROP, to let page_lock know what to do. 3616 */ 3617 if (!page_lock(pp, SE_EXCL, phm, P_RECLAIM)) { 3618 /* 3619 * Went to sleep because the page could not 3620 * be locked. We were woken up when the page 3621 * was unlocked, or when the page was destroyed. 3622 * In either case, `phm' was dropped while we 3623 * slept. Hence we should not just roar through 3624 * this loop. 3625 */ 3626 goto top; 3627 } 3628 3629 /* 3630 * If an existing page is a large page, then demote 3631 * it to ensure that no "partial" large pages are 3632 * "created" after page_rename. An existing page 3633 * can be a CacheFS page, and can't belong to swapfs. 3634 */ 3635 if (hat_page_is_mapped(pp)) { 3636 /* 3637 * Unload translations. Since we hold the 3638 * exclusive lock on this page, the page 3639 * can not be changed while we drop phm. 3640 * This is also not a lock protocol violation, 3641 * but rather the proper way to do things. 3642 */ 3643 mutex_exit(phm); 3644 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 3645 if (pp->p_szc != 0) { 3646 ASSERT(!IS_SWAPFSVP(vp)); 3647 ASSERT(vp != &kvp); 3648 page_demote_vp_pages(pp); 3649 ASSERT(pp->p_szc == 0); 3650 } 3651 mutex_enter(phm); 3652 } else if (pp->p_szc != 0) { 3653 ASSERT(!IS_SWAPFSVP(vp)); 3654 ASSERT(vp != &kvp); 3655 mutex_exit(phm); 3656 page_demote_vp_pages(pp); 3657 ASSERT(pp->p_szc == 0); 3658 mutex_enter(phm); 3659 } 3660 page_hashout(pp, phm); 3661 } 3662 /* 3663 * Hash in the page with the new identity. 3664 */ 3665 if (!page_hashin(opp, vp, off, phm)) { 3666 /* 3667 * We were holding phm while we searched for [vp, off] 3668 * and only dropped phm if we found and locked a page. 3669 * If we can't create this page now, then some thing 3670 * is really broken. 3671 */ 3672 panic("page_rename: Can't hash in page: %p", (void *)pp); 3673 /*NOTREACHED*/ 3674 } 3675 3676 ASSERT(MUTEX_HELD(phm)); 3677 mutex_exit(phm); 3678 3679 /* 3680 * Now that we have dropped phm, lets get around to finishing up 3681 * with pp. 3682 */ 3683 if (pp != NULL) { 3684 ASSERT(!hat_page_is_mapped(pp)); 3685 /* for now large pages should not end up here */ 3686 ASSERT(pp->p_szc == 0); 3687 /* 3688 * Save the locks for transfer to the new page and then 3689 * clear them so page_free doesn't think they're important. 3690 * The page_struct_lock need not be acquired for lckcnt and 3691 * cowcnt since the page has an "exclusive" lock. 3692 */ 3693 olckcnt = pp->p_lckcnt; 3694 ocowcnt = pp->p_cowcnt; 3695 pp->p_lckcnt = pp->p_cowcnt = 0; 3696 3697 /* 3698 * Put the page on the "free" list after we drop 3699 * the lock. The less work under the lock the better. 3700 */ 3701 /*LINTED: constant in conditional context*/ 3702 VN_DISPOSE(pp, B_FREE, 0, kcred); 3703 } 3704 3705 /* 3706 * Transfer the lock count from the old page (if any). 3707 * The page_struct_lock need not be acquired for lckcnt and 3708 * cowcnt since the page has an "exclusive" lock. 3709 */ 3710 opp->p_lckcnt += olckcnt; 3711 opp->p_cowcnt += ocowcnt; 3712 } 3713 3714 /* 3715 * low level routine to add page `pp' to the hash and vp chains for [vp, offset] 3716 * 3717 * Pages are normally inserted at the start of a vnode's v_pages list. 3718 * If the vnode is VMODSORT and the page is modified, it goes at the end. 3719 * This can happen when a modified page is relocated for DR. 3720 * 3721 * Returns 1 on success and 0 on failure. 3722 */ 3723 static int 3724 page_do_hashin(page_t *pp, vnode_t *vp, u_offset_t offset) 3725 { 3726 page_t **listp; 3727 page_t *tp; 3728 ulong_t index; 3729 3730 ASSERT(PAGE_EXCL(pp)); 3731 ASSERT(vp != NULL); 3732 ASSERT(MUTEX_HELD(page_vnode_mutex(vp))); 3733 3734 /* 3735 * Be sure to set these up before the page is inserted on the hash 3736 * list. As soon as the page is placed on the list some other 3737 * thread might get confused and wonder how this page could 3738 * possibly hash to this list. 3739 */ 3740 pp->p_vnode = vp; 3741 pp->p_offset = offset; 3742 3743 /* 3744 * record if this page is on a swap vnode 3745 */ 3746 if ((vp->v_flag & VISSWAP) != 0) 3747 PP_SETSWAP(pp); 3748 3749 index = PAGE_HASH_FUNC(vp, offset); 3750 ASSERT(MUTEX_HELD(PAGE_HASH_MUTEX(index))); 3751 listp = &page_hash[index]; 3752 3753 /* 3754 * If this page is already hashed in, fail this attempt to add it. 3755 */ 3756 for (tp = *listp; tp != NULL; tp = tp->p_hash) { 3757 if (tp->p_vnode == vp && tp->p_offset == offset) { 3758 pp->p_vnode = NULL; 3759 pp->p_offset = (u_offset_t)(-1); 3760 return (0); 3761 } 3762 } 3763 pp->p_hash = *listp; 3764 *listp = pp; 3765 3766 /* 3767 * Add the page to the vnode's list of pages 3768 */ 3769 if (vp->v_pages != NULL && IS_VMODSORT(vp) && hat_ismod(pp)) 3770 listp = &vp->v_pages->p_vpprev->p_vpnext; 3771 else 3772 listp = &vp->v_pages; 3773 3774 page_vpadd(listp, pp); 3775 3776 return (1); 3777 } 3778 3779 /* 3780 * Add page `pp' to both the hash and vp chains for [vp, offset]. 3781 * 3782 * Returns 1 on success and 0 on failure. 3783 * If hold is passed in, it is not dropped. 3784 */ 3785 int 3786 page_hashin(page_t *pp, vnode_t *vp, u_offset_t offset, kmutex_t *hold) 3787 { 3788 kmutex_t *phm = NULL; 3789 kmutex_t *vphm; 3790 int rc; 3791 3792 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); 3793 3794 TRACE_3(TR_FAC_VM, TR_PAGE_HASHIN, 3795 "page_hashin:pp %p vp %p offset %llx", 3796 pp, vp, offset); 3797 3798 VM_STAT_ADD(hashin_count); 3799 3800 if (hold != NULL) 3801 phm = hold; 3802 else { 3803 VM_STAT_ADD(hashin_not_held); 3804 phm = PAGE_HASH_MUTEX(PAGE_HASH_FUNC(vp, offset)); 3805 mutex_enter(phm); 3806 } 3807 3808 vphm = page_vnode_mutex(vp); 3809 mutex_enter(vphm); 3810 rc = page_do_hashin(pp, vp, offset); 3811 mutex_exit(vphm); 3812 if (hold == NULL) 3813 mutex_exit(phm); 3814 if (rc == 0) 3815 VM_STAT_ADD(hashin_already); 3816 return (rc); 3817 } 3818 3819 /* 3820 * Remove page ``pp'' from the hash and vp chains and remove vp association. 3821 * All mutexes must be held 3822 */ 3823 static void 3824 page_do_hashout(page_t *pp) 3825 { 3826 page_t **hpp; 3827 page_t *hp; 3828 vnode_t *vp = pp->p_vnode; 3829 3830 ASSERT(vp != NULL); 3831 ASSERT(MUTEX_HELD(page_vnode_mutex(vp))); 3832 3833 /* 3834 * First, take pp off of its hash chain. 3835 */ 3836 hpp = &page_hash[PAGE_HASH_FUNC(vp, pp->p_offset)]; 3837 3838 for (;;) { 3839 hp = *hpp; 3840 if (hp == pp) 3841 break; 3842 if (hp == NULL) { 3843 panic("page_do_hashout"); 3844 /*NOTREACHED*/ 3845 } 3846 hpp = &hp->p_hash; 3847 } 3848 *hpp = pp->p_hash; 3849 3850 /* 3851 * Now remove it from its associated vnode. 3852 */ 3853 if (vp->v_pages) 3854 page_vpsub(&vp->v_pages, pp); 3855 3856 pp->p_hash = NULL; 3857 page_clr_all_props(pp); 3858 PP_CLRSWAP(pp); 3859 pp->p_vnode = NULL; 3860 pp->p_offset = (u_offset_t)-1; 3861 } 3862 3863 /* 3864 * Remove page ``pp'' from the hash and vp chains and remove vp association. 3865 * 3866 * When `phm' is non-NULL it contains the address of the mutex protecting the 3867 * hash list pp is on. It is not dropped. 3868 */ 3869 void 3870 page_hashout(page_t *pp, kmutex_t *phm) 3871 { 3872 vnode_t *vp; 3873 ulong_t index; 3874 kmutex_t *nphm; 3875 kmutex_t *vphm; 3876 kmutex_t *sep; 3877 3878 ASSERT(phm != NULL ? MUTEX_HELD(phm) : 1); 3879 ASSERT(pp->p_vnode != NULL); 3880 ASSERT((PAGE_EXCL(pp) && !page_iolock_assert(pp)) || panicstr); 3881 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(pp->p_vnode))); 3882 3883 vp = pp->p_vnode; 3884 3885 TRACE_2(TR_FAC_VM, TR_PAGE_HASHOUT, 3886 "page_hashout:pp %p vp %p", pp, vp); 3887 3888 /* Kernel probe */ 3889 TNF_PROBE_2(page_unmap, "vm pagefault", /* CSTYLED */, 3890 tnf_opaque, vnode, vp, 3891 tnf_offset, offset, pp->p_offset); 3892 3893 /* 3894 * 3895 */ 3896 VM_STAT_ADD(hashout_count); 3897 index = PAGE_HASH_FUNC(vp, pp->p_offset); 3898 if (phm == NULL) { 3899 VM_STAT_ADD(hashout_not_held); 3900 nphm = PAGE_HASH_MUTEX(index); 3901 mutex_enter(nphm); 3902 } 3903 ASSERT(phm ? phm == PAGE_HASH_MUTEX(index) : 1); 3904 3905 3906 /* 3907 * grab page vnode mutex and remove it... 3908 */ 3909 vphm = page_vnode_mutex(vp); 3910 mutex_enter(vphm); 3911 3912 page_do_hashout(pp); 3913 3914 mutex_exit(vphm); 3915 if (phm == NULL) 3916 mutex_exit(nphm); 3917 3918 /* 3919 * If the page was retired, update the pages_retired 3920 * total and clear the page flag 3921 */ 3922 if (page_isretired(pp)) { 3923 retired_page_removed(pp); 3924 } 3925 3926 /* 3927 * Wake up processes waiting for this page. The page's 3928 * identity has been changed, and is probably not the 3929 * desired page any longer. 3930 */ 3931 sep = page_se_mutex(pp); 3932 mutex_enter(sep); 3933 pp->p_selock &= ~SE_EWANTED; 3934 if (CV_HAS_WAITERS(&pp->p_cv)) 3935 cv_broadcast(&pp->p_cv); 3936 mutex_exit(sep); 3937 } 3938 3939 /* 3940 * Add the page to the front of a linked list of pages 3941 * using the p_next & p_prev pointers for the list. 3942 * The caller is responsible for protecting the list pointers. 3943 */ 3944 void 3945 page_add(page_t **ppp, page_t *pp) 3946 { 3947 ASSERT(PAGE_EXCL(pp) || (PAGE_SHARED(pp) && page_iolock_assert(pp))); 3948 3949 page_add_common(ppp, pp); 3950 } 3951 3952 3953 3954 /* 3955 * Common code for page_add() and mach_page_add() 3956 */ 3957 void 3958 page_add_common(page_t **ppp, page_t *pp) 3959 { 3960 if (*ppp == NULL) { 3961 pp->p_next = pp->p_prev = pp; 3962 } else { 3963 pp->p_next = *ppp; 3964 pp->p_prev = (*ppp)->p_prev; 3965 (*ppp)->p_prev = pp; 3966 pp->p_prev->p_next = pp; 3967 } 3968 *ppp = pp; 3969 } 3970 3971 3972 /* 3973 * Remove this page from a linked list of pages 3974 * using the p_next & p_prev pointers for the list. 3975 * 3976 * The caller is responsible for protecting the list pointers. 3977 */ 3978 void 3979 page_sub(page_t **ppp, page_t *pp) 3980 { 3981 ASSERT((PP_ISFREE(pp)) ? 1 : 3982 (PAGE_EXCL(pp)) || (PAGE_SHARED(pp) && page_iolock_assert(pp))); 3983 3984 if (*ppp == NULL || pp == NULL) { 3985 panic("page_sub: bad arg(s): pp %p, *ppp %p", 3986 (void *)pp, (void *)(*ppp)); 3987 /*NOTREACHED*/ 3988 } 3989 3990 page_sub_common(ppp, pp); 3991 } 3992 3993 3994 /* 3995 * Common code for page_sub() and mach_page_sub() 3996 */ 3997 void 3998 page_sub_common(page_t **ppp, page_t *pp) 3999 { 4000 if (*ppp == pp) 4001 *ppp = pp->p_next; /* go to next page */ 4002 4003 if (*ppp == pp) 4004 *ppp = NULL; /* page list is gone */ 4005 else { 4006 pp->p_prev->p_next = pp->p_next; 4007 pp->p_next->p_prev = pp->p_prev; 4008 } 4009 pp->p_prev = pp->p_next = pp; /* make pp a list of one */ 4010 } 4011 4012 4013 /* 4014 * Break page list cppp into two lists with npages in the first list. 4015 * The tail is returned in nppp. 4016 */ 4017 void 4018 page_list_break(page_t **oppp, page_t **nppp, pgcnt_t npages) 4019 { 4020 page_t *s1pp = *oppp; 4021 page_t *s2pp; 4022 page_t *e1pp, *e2pp; 4023 long n = 0; 4024 4025 if (s1pp == NULL) { 4026 *nppp = NULL; 4027 return; 4028 } 4029 if (npages == 0) { 4030 *nppp = s1pp; 4031 *oppp = NULL; 4032 return; 4033 } 4034 for (n = 0, s2pp = *oppp; n < npages; n++) { 4035 s2pp = s2pp->p_next; 4036 } 4037 /* Fix head and tail of new lists */ 4038 e1pp = s2pp->p_prev; 4039 e2pp = s1pp->p_prev; 4040 s1pp->p_prev = e1pp; 4041 e1pp->p_next = s1pp; 4042 s2pp->p_prev = e2pp; 4043 e2pp->p_next = s2pp; 4044 4045 /* second list empty */ 4046 if (s2pp == s1pp) { 4047 *oppp = s1pp; 4048 *nppp = NULL; 4049 } else { 4050 *oppp = s1pp; 4051 *nppp = s2pp; 4052 } 4053 } 4054 4055 /* 4056 * Concatenate page list nppp onto the end of list ppp. 4057 */ 4058 void 4059 page_list_concat(page_t **ppp, page_t **nppp) 4060 { 4061 page_t *s1pp, *s2pp, *e1pp, *e2pp; 4062 4063 if (*nppp == NULL) { 4064 return; 4065 } 4066 if (*ppp == NULL) { 4067 *ppp = *nppp; 4068 return; 4069 } 4070 s1pp = *ppp; 4071 e1pp = s1pp->p_prev; 4072 s2pp = *nppp; 4073 e2pp = s2pp->p_prev; 4074 s1pp->p_prev = e2pp; 4075 e2pp->p_next = s1pp; 4076 e1pp->p_next = s2pp; 4077 s2pp->p_prev = e1pp; 4078 } 4079 4080 /* 4081 * return the next page in the page list 4082 */ 4083 page_t * 4084 page_list_next(page_t *pp) 4085 { 4086 return (pp->p_next); 4087 } 4088 4089 4090 /* 4091 * Add the page to the front of the linked list of pages 4092 * using p_vpnext/p_vpprev pointers for the list. 4093 * 4094 * The caller is responsible for protecting the lists. 4095 */ 4096 void 4097 page_vpadd(page_t **ppp, page_t *pp) 4098 { 4099 if (*ppp == NULL) { 4100 pp->p_vpnext = pp->p_vpprev = pp; 4101 } else { 4102 pp->p_vpnext = *ppp; 4103 pp->p_vpprev = (*ppp)->p_vpprev; 4104 (*ppp)->p_vpprev = pp; 4105 pp->p_vpprev->p_vpnext = pp; 4106 } 4107 *ppp = pp; 4108 } 4109 4110 /* 4111 * Remove this page from the linked list of pages 4112 * using p_vpnext/p_vpprev pointers for the list. 4113 * 4114 * The caller is responsible for protecting the lists. 4115 */ 4116 void 4117 page_vpsub(page_t **ppp, page_t *pp) 4118 { 4119 if (*ppp == NULL || pp == NULL) { 4120 panic("page_vpsub: bad arg(s): pp %p, *ppp %p", 4121 (void *)pp, (void *)(*ppp)); 4122 /*NOTREACHED*/ 4123 } 4124 4125 if (*ppp == pp) 4126 *ppp = pp->p_vpnext; /* go to next page */ 4127 4128 if (*ppp == pp) 4129 *ppp = NULL; /* page list is gone */ 4130 else { 4131 pp->p_vpprev->p_vpnext = pp->p_vpnext; 4132 pp->p_vpnext->p_vpprev = pp->p_vpprev; 4133 } 4134 pp->p_vpprev = pp->p_vpnext = pp; /* make pp a list of one */ 4135 } 4136 4137 /* 4138 * Lock a physical page into memory "long term". Used to support "lock 4139 * in memory" functions. Accepts the page to be locked, and a cow variable 4140 * to indicate whether a the lock will travel to the new page during 4141 * a potential copy-on-write. 4142 */ 4143 int 4144 page_pp_lock( 4145 page_t *pp, /* page to be locked */ 4146 int cow, /* cow lock */ 4147 int kernel) /* must succeed -- ignore checking */ 4148 { 4149 int r = 0; /* result -- assume failure */ 4150 4151 ASSERT(PAGE_LOCKED(pp)); 4152 4153 page_struct_lock(pp); 4154 /* 4155 * Acquire the "freemem_lock" for availrmem. 4156 */ 4157 if (cow) { 4158 mutex_enter(&freemem_lock); 4159 if ((availrmem > pages_pp_maximum) && 4160 (pp->p_cowcnt < (ushort_t)PAGE_LOCK_MAXIMUM)) { 4161 availrmem--; 4162 pages_locked++; 4163 mutex_exit(&freemem_lock); 4164 r = 1; 4165 if (++pp->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) { 4166 cmn_err(CE_WARN, 4167 "COW lock limit reached on pfn 0x%lx", 4168 page_pptonum(pp)); 4169 } 4170 } else 4171 mutex_exit(&freemem_lock); 4172 } else { 4173 if (pp->p_lckcnt) { 4174 if (pp->p_lckcnt < (ushort_t)PAGE_LOCK_MAXIMUM) { 4175 r = 1; 4176 if (++pp->p_lckcnt == 4177 (ushort_t)PAGE_LOCK_MAXIMUM) { 4178 cmn_err(CE_WARN, "Page lock limit " 4179 "reached on pfn 0x%lx", 4180 page_pptonum(pp)); 4181 } 4182 } 4183 } else { 4184 if (kernel) { 4185 /* availrmem accounting done by caller */ 4186 ++pp->p_lckcnt; 4187 r = 1; 4188 } else { 4189 mutex_enter(&freemem_lock); 4190 if (availrmem > pages_pp_maximum) { 4191 availrmem--; 4192 pages_locked++; 4193 ++pp->p_lckcnt; 4194 r = 1; 4195 } 4196 mutex_exit(&freemem_lock); 4197 } 4198 } 4199 } 4200 page_struct_unlock(pp); 4201 return (r); 4202 } 4203 4204 /* 4205 * Decommit a lock on a physical page frame. Account for cow locks if 4206 * appropriate. 4207 */ 4208 void 4209 page_pp_unlock( 4210 page_t *pp, /* page to be unlocked */ 4211 int cow, /* expect cow lock */ 4212 int kernel) /* this was a kernel lock */ 4213 { 4214 ASSERT(PAGE_LOCKED(pp)); 4215 4216 page_struct_lock(pp); 4217 /* 4218 * Acquire the "freemem_lock" for availrmem. 4219 * If cowcnt or lcknt is already 0 do nothing; i.e., we 4220 * could be called to unlock even if nothing is locked. This could 4221 * happen if locked file pages were truncated (removing the lock) 4222 * and the file was grown again and new pages faulted in; the new 4223 * pages are unlocked but the segment still thinks they're locked. 4224 */ 4225 if (cow) { 4226 if (pp->p_cowcnt) { 4227 mutex_enter(&freemem_lock); 4228 pp->p_cowcnt--; 4229 availrmem++; 4230 pages_locked--; 4231 mutex_exit(&freemem_lock); 4232 } 4233 } else { 4234 if (pp->p_lckcnt && --pp->p_lckcnt == 0) { 4235 if (!kernel) { 4236 mutex_enter(&freemem_lock); 4237 availrmem++; 4238 pages_locked--; 4239 mutex_exit(&freemem_lock); 4240 } 4241 } 4242 } 4243 page_struct_unlock(pp); 4244 } 4245 4246 /* 4247 * This routine reserves availrmem for npages; 4248 * flags: KM_NOSLEEP or KM_SLEEP 4249 * returns 1 on success or 0 on failure 4250 */ 4251 int 4252 page_resv(pgcnt_t npages, uint_t flags) 4253 { 4254 mutex_enter(&freemem_lock); 4255 while (availrmem < tune.t_minarmem + npages) { 4256 if (flags & KM_NOSLEEP) { 4257 mutex_exit(&freemem_lock); 4258 return (0); 4259 } 4260 mutex_exit(&freemem_lock); 4261 page_needfree(npages); 4262 kmem_reap(); 4263 delay(hz >> 2); 4264 page_needfree(-(spgcnt_t)npages); 4265 mutex_enter(&freemem_lock); 4266 } 4267 availrmem -= npages; 4268 mutex_exit(&freemem_lock); 4269 return (1); 4270 } 4271 4272 /* 4273 * This routine unreserves availrmem for npages; 4274 */ 4275 void 4276 page_unresv(pgcnt_t npages) 4277 { 4278 mutex_enter(&freemem_lock); 4279 availrmem += npages; 4280 mutex_exit(&freemem_lock); 4281 } 4282 4283 /* 4284 * See Statement at the beginning of segvn_lockop() regarding 4285 * the way we handle cowcnts and lckcnts. 4286 * 4287 * Transfer cowcnt on 'opp' to cowcnt on 'npp' if the vpage 4288 * that breaks COW has PROT_WRITE. 4289 * 4290 * Note that, we may also break COW in case we are softlocking 4291 * on read access during physio; 4292 * in this softlock case, the vpage may not have PROT_WRITE. 4293 * So, we need to transfer lckcnt on 'opp' to lckcnt on 'npp' 4294 * if the vpage doesn't have PROT_WRITE. 4295 * 4296 * This routine is never called if we are stealing a page 4297 * in anon_private. 4298 * 4299 * The caller subtracted from availrmem for read only mapping. 4300 * if lckcnt is 1 increment availrmem. 4301 */ 4302 void 4303 page_pp_useclaim( 4304 page_t *opp, /* original page frame losing lock */ 4305 page_t *npp, /* new page frame gaining lock */ 4306 uint_t write_perm) /* set if vpage has PROT_WRITE */ 4307 { 4308 int payback = 0; 4309 4310 ASSERT(PAGE_LOCKED(opp)); 4311 ASSERT(PAGE_LOCKED(npp)); 4312 4313 page_struct_lock(opp); 4314 4315 ASSERT(npp->p_cowcnt == 0); 4316 ASSERT(npp->p_lckcnt == 0); 4317 4318 /* Don't use claim if nothing is locked (see page_pp_unlock above) */ 4319 if ((write_perm && opp->p_cowcnt != 0) || 4320 (!write_perm && opp->p_lckcnt != 0)) { 4321 4322 if (write_perm) { 4323 npp->p_cowcnt++; 4324 ASSERT(opp->p_cowcnt != 0); 4325 opp->p_cowcnt--; 4326 } else { 4327 4328 ASSERT(opp->p_lckcnt != 0); 4329 4330 /* 4331 * We didn't need availrmem decremented if p_lckcnt on 4332 * original page is 1. Here, we are unlocking 4333 * read-only copy belonging to original page and 4334 * are locking a copy belonging to new page. 4335 */ 4336 if (opp->p_lckcnt == 1) 4337 payback = 1; 4338 4339 npp->p_lckcnt++; 4340 opp->p_lckcnt--; 4341 } 4342 } 4343 if (payback) { 4344 mutex_enter(&freemem_lock); 4345 availrmem++; 4346 pages_useclaim--; 4347 mutex_exit(&freemem_lock); 4348 } 4349 page_struct_unlock(opp); 4350 } 4351 4352 /* 4353 * Simple claim adjust functions -- used to support changes in 4354 * claims due to changes in access permissions. Used by segvn_setprot(). 4355 */ 4356 int 4357 page_addclaim(page_t *pp) 4358 { 4359 int r = 0; /* result */ 4360 4361 ASSERT(PAGE_LOCKED(pp)); 4362 4363 page_struct_lock(pp); 4364 ASSERT(pp->p_lckcnt != 0); 4365 4366 if (pp->p_lckcnt == 1) { 4367 if (pp->p_cowcnt < (ushort_t)PAGE_LOCK_MAXIMUM) { 4368 --pp->p_lckcnt; 4369 r = 1; 4370 if (++pp->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) { 4371 cmn_err(CE_WARN, 4372 "COW lock limit reached on pfn 0x%lx", 4373 page_pptonum(pp)); 4374 } 4375 } 4376 } else { 4377 mutex_enter(&freemem_lock); 4378 if ((availrmem > pages_pp_maximum) && 4379 (pp->p_cowcnt < (ushort_t)PAGE_LOCK_MAXIMUM)) { 4380 --availrmem; 4381 ++pages_claimed; 4382 mutex_exit(&freemem_lock); 4383 --pp->p_lckcnt; 4384 r = 1; 4385 if (++pp->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) { 4386 cmn_err(CE_WARN, 4387 "COW lock limit reached on pfn 0x%lx", 4388 page_pptonum(pp)); 4389 } 4390 } else 4391 mutex_exit(&freemem_lock); 4392 } 4393 page_struct_unlock(pp); 4394 return (r); 4395 } 4396 4397 int 4398 page_subclaim(page_t *pp) 4399 { 4400 int r = 0; 4401 4402 ASSERT(PAGE_LOCKED(pp)); 4403 4404 page_struct_lock(pp); 4405 ASSERT(pp->p_cowcnt != 0); 4406 4407 if (pp->p_lckcnt) { 4408 if (pp->p_lckcnt < (ushort_t)PAGE_LOCK_MAXIMUM) { 4409 r = 1; 4410 /* 4411 * for availrmem 4412 */ 4413 mutex_enter(&freemem_lock); 4414 availrmem++; 4415 pages_claimed--; 4416 mutex_exit(&freemem_lock); 4417 4418 pp->p_cowcnt--; 4419 4420 if (++pp->p_lckcnt == (ushort_t)PAGE_LOCK_MAXIMUM) { 4421 cmn_err(CE_WARN, 4422 "Page lock limit reached on pfn 0x%lx", 4423 page_pptonum(pp)); 4424 } 4425 } 4426 } else { 4427 r = 1; 4428 pp->p_cowcnt--; 4429 pp->p_lckcnt++; 4430 } 4431 page_struct_unlock(pp); 4432 return (r); 4433 } 4434 4435 int 4436 page_addclaim_pages(page_t **ppa) 4437 { 4438 4439 pgcnt_t lckpgs = 0, pg_idx; 4440 4441 VM_STAT_ADD(pagecnt.pc_addclaim_pages); 4442 4443 mutex_enter(&page_llock); 4444 for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) { 4445 4446 ASSERT(PAGE_LOCKED(ppa[pg_idx])); 4447 ASSERT(ppa[pg_idx]->p_lckcnt != 0); 4448 if (ppa[pg_idx]->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) { 4449 mutex_exit(&page_llock); 4450 return (0); 4451 } 4452 if (ppa[pg_idx]->p_lckcnt > 1) 4453 lckpgs++; 4454 } 4455 4456 if (lckpgs != 0) { 4457 mutex_enter(&freemem_lock); 4458 if (availrmem >= pages_pp_maximum + lckpgs) { 4459 availrmem -= lckpgs; 4460 pages_claimed += lckpgs; 4461 } else { 4462 mutex_exit(&freemem_lock); 4463 mutex_exit(&page_llock); 4464 return (0); 4465 } 4466 mutex_exit(&freemem_lock); 4467 } 4468 4469 for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) { 4470 ppa[pg_idx]->p_lckcnt--; 4471 ppa[pg_idx]->p_cowcnt++; 4472 } 4473 mutex_exit(&page_llock); 4474 return (1); 4475 } 4476 4477 int 4478 page_subclaim_pages(page_t **ppa) 4479 { 4480 pgcnt_t ulckpgs = 0, pg_idx; 4481 4482 VM_STAT_ADD(pagecnt.pc_subclaim_pages); 4483 4484 mutex_enter(&page_llock); 4485 for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) { 4486 4487 ASSERT(PAGE_LOCKED(ppa[pg_idx])); 4488 ASSERT(ppa[pg_idx]->p_cowcnt != 0); 4489 if (ppa[pg_idx]->p_lckcnt == (ushort_t)PAGE_LOCK_MAXIMUM) { 4490 mutex_exit(&page_llock); 4491 return (0); 4492 } 4493 if (ppa[pg_idx]->p_lckcnt != 0) 4494 ulckpgs++; 4495 } 4496 4497 if (ulckpgs != 0) { 4498 mutex_enter(&freemem_lock); 4499 availrmem += ulckpgs; 4500 pages_claimed -= ulckpgs; 4501 mutex_exit(&freemem_lock); 4502 } 4503 4504 for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) { 4505 ppa[pg_idx]->p_cowcnt--; 4506 ppa[pg_idx]->p_lckcnt++; 4507 4508 } 4509 mutex_exit(&page_llock); 4510 return (1); 4511 } 4512 4513 page_t * 4514 page_numtopp(pfn_t pfnum, se_t se) 4515 { 4516 page_t *pp; 4517 4518 retry: 4519 pp = page_numtopp_nolock(pfnum); 4520 if (pp == NULL) { 4521 return ((page_t *)NULL); 4522 } 4523 4524 /* 4525 * Acquire the appropriate lock on the page. 4526 */ 4527 while (!page_lock(pp, se, (kmutex_t *)NULL, P_RECLAIM)) { 4528 if (page_pptonum(pp) != pfnum) 4529 goto retry; 4530 continue; 4531 } 4532 4533 if (page_pptonum(pp) != pfnum) { 4534 page_unlock(pp); 4535 goto retry; 4536 } 4537 4538 return (pp); 4539 } 4540 4541 page_t * 4542 page_numtopp_noreclaim(pfn_t pfnum, se_t se) 4543 { 4544 page_t *pp; 4545 4546 retry: 4547 pp = page_numtopp_nolock(pfnum); 4548 if (pp == NULL) { 4549 return ((page_t *)NULL); 4550 } 4551 4552 /* 4553 * Acquire the appropriate lock on the page. 4554 */ 4555 while (!page_lock(pp, se, (kmutex_t *)NULL, P_NO_RECLAIM)) { 4556 if (page_pptonum(pp) != pfnum) 4557 goto retry; 4558 continue; 4559 } 4560 4561 if (page_pptonum(pp) != pfnum) { 4562 page_unlock(pp); 4563 goto retry; 4564 } 4565 4566 return (pp); 4567 } 4568 4569 /* 4570 * This routine is like page_numtopp, but will only return page structs 4571 * for pages which are ok for loading into hardware using the page struct. 4572 */ 4573 page_t * 4574 page_numtopp_nowait(pfn_t pfnum, se_t se) 4575 { 4576 page_t *pp; 4577 4578 retry: 4579 pp = page_numtopp_nolock(pfnum); 4580 if (pp == NULL) { 4581 return ((page_t *)NULL); 4582 } 4583 4584 /* 4585 * Try to acquire the appropriate lock on the page. 4586 */ 4587 if (PP_ISFREE(pp)) 4588 pp = NULL; 4589 else { 4590 if (!page_trylock(pp, se)) 4591 pp = NULL; 4592 else { 4593 if (page_pptonum(pp) != pfnum) { 4594 page_unlock(pp); 4595 goto retry; 4596 } 4597 if (PP_ISFREE(pp)) { 4598 page_unlock(pp); 4599 pp = NULL; 4600 } 4601 } 4602 } 4603 return (pp); 4604 } 4605 4606 /* 4607 * Returns a count of dirty pages that are in the process 4608 * of being written out. If 'cleanit' is set, try to push the page. 4609 */ 4610 pgcnt_t 4611 page_busy(int cleanit) 4612 { 4613 page_t *page0 = page_first(); 4614 page_t *pp = page0; 4615 pgcnt_t nppbusy = 0; 4616 u_offset_t off; 4617 4618 do { 4619 vnode_t *vp = pp->p_vnode; 4620 4621 /* 4622 * A page is a candidate for syncing if it is: 4623 * 4624 * (a) On neither the freelist nor the cachelist 4625 * (b) Hashed onto a vnode 4626 * (c) Not a kernel page 4627 * (d) Dirty 4628 * (e) Not part of a swapfile 4629 * (f) a page which belongs to a real vnode; eg has a non-null 4630 * v_vfsp pointer. 4631 * (g) Backed by a filesystem which doesn't have a 4632 * stubbed-out sync operation 4633 */ 4634 if (!PP_ISFREE(pp) && vp != NULL && vp != &kvp && 4635 hat_ismod(pp) && !IS_SWAPVP(vp) && vp->v_vfsp != NULL && 4636 vfs_can_sync(vp->v_vfsp)) { 4637 nppbusy++; 4638 vfs_syncprogress(); 4639 4640 if (!cleanit) 4641 continue; 4642 if (!page_trylock(pp, SE_EXCL)) 4643 continue; 4644 4645 if (PP_ISFREE(pp) || vp == NULL || IS_SWAPVP(vp) || 4646 pp->p_lckcnt != 0 || pp->p_cowcnt != 0 || 4647 !(hat_pagesync(pp, 4648 HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD) & P_MOD)) { 4649 page_unlock(pp); 4650 continue; 4651 } 4652 off = pp->p_offset; 4653 VN_HOLD(vp); 4654 page_unlock(pp); 4655 (void) VOP_PUTPAGE(vp, off, PAGESIZE, 4656 B_ASYNC | B_FREE, kcred); 4657 VN_RELE(vp); 4658 } 4659 } while ((pp = page_next(pp)) != page0); 4660 4661 return (nppbusy); 4662 } 4663 4664 void page_invalidate_pages(void); 4665 4666 /* 4667 * callback handler to vm sub-system 4668 * 4669 * callers make sure no recursive entries to this func. 4670 */ 4671 /*ARGSUSED*/ 4672 boolean_t 4673 callb_vm_cpr(void *arg, int code) 4674 { 4675 if (code == CB_CODE_CPR_CHKPT) 4676 page_invalidate_pages(); 4677 return (B_TRUE); 4678 } 4679 4680 /* 4681 * Invalidate all pages of the system. 4682 * It shouldn't be called until all user page activities are all stopped. 4683 */ 4684 void 4685 page_invalidate_pages() 4686 { 4687 page_t *pp; 4688 page_t *page0; 4689 pgcnt_t nbusypages; 4690 int retry = 0; 4691 const int MAXRETRIES = 4; 4692 #if defined(__sparc) 4693 extern struct vnode prom_ppages; 4694 #endif /* __sparc */ 4695 4696 top: 4697 /* 4698 * Flush dirty pages and destory the clean ones. 4699 */ 4700 nbusypages = 0; 4701 4702 pp = page0 = page_first(); 4703 do { 4704 struct vnode *vp; 4705 u_offset_t offset; 4706 int mod; 4707 4708 /* 4709 * skip the page if it has no vnode or the page associated 4710 * with the kernel vnode or prom allocated kernel mem. 4711 */ 4712 #if defined(__sparc) 4713 if ((vp = pp->p_vnode) == NULL || vp == &kvp || 4714 vp == &prom_ppages) 4715 #else /* x86 doesn't have prom or prom_ppage */ 4716 if ((vp = pp->p_vnode) == NULL || vp == &kvp) 4717 #endif /* __sparc */ 4718 continue; 4719 4720 /* 4721 * skip the page which is already free invalidated. 4722 */ 4723 if (PP_ISFREE(pp) && PP_ISAGED(pp)) 4724 continue; 4725 4726 /* 4727 * skip pages that are already locked or can't be "exclusively" 4728 * locked or are already free. After we lock the page, check 4729 * the free and age bits again to be sure it's not destroied 4730 * yet. 4731 * To achieve max. parallelization, we use page_trylock instead 4732 * of page_lock so that we don't get block on individual pages 4733 * while we have thousands of other pages to process. 4734 */ 4735 if (!page_trylock(pp, SE_EXCL)) { 4736 nbusypages++; 4737 continue; 4738 } else if (PP_ISFREE(pp)) { 4739 if (!PP_ISAGED(pp)) { 4740 page_destroy_free(pp); 4741 } else { 4742 page_unlock(pp); 4743 } 4744 continue; 4745 } 4746 /* 4747 * Is this page involved in some I/O? shared? 4748 * 4749 * The page_struct_lock need not be acquired to 4750 * examine these fields since the page has an 4751 * "exclusive" lock. 4752 */ 4753 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { 4754 page_unlock(pp); 4755 continue; 4756 } 4757 4758 if (vp->v_type == VCHR) { 4759 panic("vp->v_type == VCHR"); 4760 /*NOTREACHED*/ 4761 } 4762 4763 if (!page_try_demote_pages(pp)) { 4764 page_unlock(pp); 4765 continue; 4766 } 4767 4768 /* 4769 * Check the modified bit. Leave the bits alone in hardware 4770 * (they will be modified if we do the putpage). 4771 */ 4772 mod = (hat_pagesync(pp, HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD) 4773 & P_MOD); 4774 if (mod) { 4775 offset = pp->p_offset; 4776 /* 4777 * Hold the vnode before releasing the page lock 4778 * to prevent it from being freed and re-used by 4779 * some other thread. 4780 */ 4781 VN_HOLD(vp); 4782 page_unlock(pp); 4783 /* 4784 * No error return is checked here. Callers such as 4785 * cpr deals with the dirty pages at the dump time 4786 * if this putpage fails. 4787 */ 4788 (void) VOP_PUTPAGE(vp, offset, PAGESIZE, B_INVAL, 4789 kcred); 4790 VN_RELE(vp); 4791 } else { 4792 page_destroy(pp, 0); 4793 } 4794 } while ((pp = page_next(pp)) != page0); 4795 if (nbusypages && retry++ < MAXRETRIES) { 4796 delay(1); 4797 goto top; 4798 } 4799 } 4800 4801 /* 4802 * Replace the page "old" with the page "new" on the page hash and vnode lists 4803 * 4804 * the replacemnt must be done in place, ie the equivalent sequence: 4805 * 4806 * vp = old->p_vnode; 4807 * off = old->p_offset; 4808 * page_do_hashout(old) 4809 * page_do_hashin(new, vp, off) 4810 * 4811 * doesn't work, since 4812 * 1) if old is the only page on the vnode, the v_pages list has a window 4813 * where it looks empty. This will break file system assumptions. 4814 * and 4815 * 2) pvn_vplist_dirty() can't deal with pages moving on the v_pages list. 4816 */ 4817 static void 4818 page_do_relocate_hash(page_t *new, page_t *old) 4819 { 4820 page_t **hash_list; 4821 vnode_t *vp = old->p_vnode; 4822 kmutex_t *sep; 4823 4824 ASSERT(PAGE_EXCL(old)); 4825 ASSERT(PAGE_EXCL(new)); 4826 ASSERT(vp != NULL); 4827 ASSERT(MUTEX_HELD(page_vnode_mutex(vp))); 4828 ASSERT(MUTEX_HELD(PAGE_HASH_MUTEX(PAGE_HASH_FUNC(vp, old->p_offset)))); 4829 4830 /* 4831 * First find old page on the page hash list 4832 */ 4833 hash_list = &page_hash[PAGE_HASH_FUNC(vp, old->p_offset)]; 4834 4835 for (;;) { 4836 if (*hash_list == old) 4837 break; 4838 if (*hash_list == NULL) { 4839 panic("page_do_hashout"); 4840 /*NOTREACHED*/ 4841 } 4842 hash_list = &(*hash_list)->p_hash; 4843 } 4844 4845 /* 4846 * update new and replace old with new on the page hash list 4847 */ 4848 new->p_vnode = old->p_vnode; 4849 new->p_offset = old->p_offset; 4850 new->p_hash = old->p_hash; 4851 *hash_list = new; 4852 4853 if ((new->p_vnode->v_flag & VISSWAP) != 0) 4854 PP_SETSWAP(new); 4855 4856 /* 4857 * replace old with new on the vnode's page list 4858 */ 4859 if (old->p_vpnext == old) { 4860 new->p_vpnext = new; 4861 new->p_vpprev = new; 4862 } else { 4863 new->p_vpnext = old->p_vpnext; 4864 new->p_vpprev = old->p_vpprev; 4865 new->p_vpnext->p_vpprev = new; 4866 new->p_vpprev->p_vpnext = new; 4867 } 4868 if (vp->v_pages == old) 4869 vp->v_pages = new; 4870 4871 /* 4872 * clear out the old page 4873 */ 4874 old->p_hash = NULL; 4875 old->p_vpnext = NULL; 4876 old->p_vpprev = NULL; 4877 old->p_vnode = NULL; 4878 PP_CLRSWAP(old); 4879 old->p_offset = (u_offset_t)-1; 4880 page_clr_all_props(old); 4881 4882 /* 4883 * Wake up processes waiting for this page. The page's 4884 * identity has been changed, and is probably not the 4885 * desired page any longer. 4886 */ 4887 sep = page_se_mutex(old); 4888 mutex_enter(sep); 4889 old->p_selock &= ~SE_EWANTED; 4890 if (CV_HAS_WAITERS(&old->p_cv)) 4891 cv_broadcast(&old->p_cv); 4892 mutex_exit(sep); 4893 } 4894 4895 /* 4896 * This function moves the identity of page "pp_old" to page "pp_new". 4897 * Both pages must be locked on entry. "pp_new" is free, has no identity, 4898 * and need not be hashed out from anywhere. 4899 */ 4900 void 4901 page_relocate_hash(page_t *pp_new, page_t *pp_old) 4902 { 4903 vnode_t *vp = pp_old->p_vnode; 4904 u_offset_t off = pp_old->p_offset; 4905 kmutex_t *phm, *vphm; 4906 4907 /* 4908 * Rehash two pages 4909 */ 4910 ASSERT(PAGE_EXCL(pp_old)); 4911 ASSERT(PAGE_EXCL(pp_new)); 4912 ASSERT(vp != NULL); 4913 ASSERT(pp_new->p_vnode == NULL); 4914 4915 /* 4916 * hashout then hashin while holding the mutexes 4917 */ 4918 phm = PAGE_HASH_MUTEX(PAGE_HASH_FUNC(vp, off)); 4919 mutex_enter(phm); 4920 vphm = page_vnode_mutex(vp); 4921 mutex_enter(vphm); 4922 4923 page_do_relocate_hash(pp_new, pp_old); 4924 4925 mutex_exit(vphm); 4926 mutex_exit(phm); 4927 4928 /* 4929 * The page_struct_lock need not be acquired for lckcnt and 4930 * cowcnt since the page has an "exclusive" lock. 4931 */ 4932 ASSERT(pp_new->p_lckcnt == 0); 4933 ASSERT(pp_new->p_cowcnt == 0); 4934 pp_new->p_lckcnt = pp_old->p_lckcnt; 4935 pp_new->p_cowcnt = pp_old->p_cowcnt; 4936 pp_old->p_lckcnt = pp_old->p_cowcnt = 0; 4937 4938 /* The following comment preserved from page_flip(). */ 4939 /* XXX - Do we need to protect fsdata? */ 4940 pp_new->p_fsdata = pp_old->p_fsdata; 4941 } 4942 4943 /* 4944 * Helper routine used to lock all remaining members of a 4945 * large page. The caller is responsible for passing in a locked 4946 * pp. If pp is a large page, then it succeeds in locking all the 4947 * remaining constituent pages or it returns with only the 4948 * original page locked. 4949 * 4950 * Returns 1 on success, 0 on failure. 4951 * 4952 * If success is returned this routine gurantees p_szc for all constituent 4953 * pages of a large page pp belongs to can't change. To achieve this we 4954 * recheck szc of pp after locking all constituent pages and retry if szc 4955 * changed (it could only decrease). Since hat_page_demote() needs an EXCL 4956 * lock on one of constituent pages it can't be running after all constituent 4957 * pages are locked. hat_page_demote() with a lock on a constituent page 4958 * outside of this large page (i.e. pp belonged to a larger large page) is 4959 * already done with all constituent pages of pp since the root's p_szc is 4960 * changed last. Thefore no need to synchronize with hat_page_demote() that 4961 * locked a constituent page outside of pp's current large page. 4962 */ 4963 #ifdef DEBUG 4964 uint32_t gpg_trylock_mtbf = 0; 4965 #endif 4966 4967 int 4968 group_page_trylock(page_t *pp, se_t se) 4969 { 4970 page_t *tpp; 4971 pgcnt_t npgs, i, j; 4972 uint_t pszc = pp->p_szc; 4973 4974 #ifdef DEBUG 4975 if (gpg_trylock_mtbf && !(gethrtime() % gpg_trylock_mtbf)) { 4976 return (0); 4977 } 4978 #endif 4979 4980 if (pp != PP_GROUPLEADER(pp, pszc)) { 4981 return (0); 4982 } 4983 4984 retry: 4985 ASSERT(PAGE_LOCKED_SE(pp, se)); 4986 ASSERT(!PP_ISFREE(pp)); 4987 if (pszc == 0) { 4988 return (1); 4989 } 4990 npgs = page_get_pagecnt(pszc); 4991 tpp = pp + 1; 4992 for (i = 1; i < npgs; i++, tpp++) { 4993 if (!page_trylock(tpp, se)) { 4994 tpp = pp + 1; 4995 for (j = 1; j < i; j++, tpp++) { 4996 page_unlock(tpp); 4997 } 4998 return (0); 4999 } 5000 } 5001 if (pp->p_szc != pszc) { 5002 ASSERT(pp->p_szc < pszc); 5003 ASSERT(pp->p_vnode != NULL && pp->p_vnode != &kvp && 5004 !IS_SWAPFSVP(pp->p_vnode)); 5005 tpp = pp + 1; 5006 for (i = 1; i < npgs; i++, tpp++) { 5007 page_unlock(tpp); 5008 } 5009 pszc = pp->p_szc; 5010 goto retry; 5011 } 5012 return (1); 5013 } 5014 5015 void 5016 group_page_unlock(page_t *pp) 5017 { 5018 page_t *tpp; 5019 pgcnt_t npgs, i; 5020 5021 ASSERT(PAGE_LOCKED(pp)); 5022 ASSERT(!PP_ISFREE(pp)); 5023 ASSERT(pp == PP_PAGEROOT(pp)); 5024 npgs = page_get_pagecnt(pp->p_szc); 5025 for (i = 1, tpp = pp + 1; i < npgs; i++, tpp++) { 5026 page_unlock(tpp); 5027 } 5028 } 5029 5030 /* 5031 * returns 5032 * 0 : on success and *nrelocp is number of relocated PAGESIZE pages 5033 * ERANGE : this is not a base page 5034 * EBUSY : failure to get locks on the page/pages 5035 * ENOMEM : failure to obtain replacement pages 5036 * EAGAIN : OBP has not yet completed its boot-time handoff to the kernel 5037 * 5038 * Return with all constituent members of target and replacement 5039 * SE_EXCL locked. It is the callers responsibility to drop the 5040 * locks. 5041 */ 5042 int 5043 do_page_relocate( 5044 page_t **target, 5045 page_t **replacement, 5046 int grouplock, 5047 spgcnt_t *nrelocp, 5048 lgrp_t *lgrp) 5049 { 5050 #ifdef DEBUG 5051 page_t *first_repl; 5052 #endif /* DEBUG */ 5053 page_t *repl; 5054 page_t *targ; 5055 page_t *pl = NULL; 5056 uint_t ppattr; 5057 pfn_t pfn, repl_pfn; 5058 uint_t szc; 5059 spgcnt_t npgs, i; 5060 int repl_contig = 0; 5061 uint_t flags = 0; 5062 spgcnt_t dofree = 0; 5063 5064 *nrelocp = 0; 5065 5066 #if defined(__sparc) 5067 /* 5068 * We need to wait till OBP has completed 5069 * its boot-time handoff of its resources to the kernel 5070 * before we allow page relocation 5071 */ 5072 if (page_relocate_ready == 0) { 5073 return (EAGAIN); 5074 } 5075 #endif 5076 5077 /* 5078 * If this is not a base page, 5079 * just return with 0x0 pages relocated. 5080 */ 5081 targ = *target; 5082 ASSERT(PAGE_EXCL(targ)); 5083 ASSERT(!PP_ISFREE(targ)); 5084 szc = targ->p_szc; 5085 ASSERT(szc < mmu_page_sizes); 5086 VM_STAT_ADD(vmm_vmstats.ppr_reloc[szc]); 5087 pfn = targ->p_pagenum; 5088 if (pfn != PFN_BASE(pfn, szc)) { 5089 VM_STAT_ADD(vmm_vmstats.ppr_relocnoroot[szc]); 5090 return (ERANGE); 5091 } 5092 5093 if ((repl = *replacement) != NULL && repl->p_szc >= szc) { 5094 repl_pfn = repl->p_pagenum; 5095 if (repl_pfn != PFN_BASE(repl_pfn, szc)) { 5096 VM_STAT_ADD(vmm_vmstats.ppr_reloc_replnoroot[szc]); 5097 return (ERANGE); 5098 } 5099 repl_contig = 1; 5100 } 5101 5102 /* 5103 * We must lock all members of this large page or we cannot 5104 * relocate any part of it. 5105 */ 5106 if (grouplock != 0 && !group_page_trylock(targ, SE_EXCL)) { 5107 VM_STAT_ADD(vmm_vmstats.ppr_relocnolock[targ->p_szc]); 5108 return (EBUSY); 5109 } 5110 5111 /* 5112 * reread szc it could have been decreased before 5113 * group_page_trylock() was done. 5114 */ 5115 szc = targ->p_szc; 5116 ASSERT(szc < mmu_page_sizes); 5117 VM_STAT_ADD(vmm_vmstats.ppr_reloc[szc]); 5118 ASSERT(pfn == PFN_BASE(pfn, szc)); 5119 5120 npgs = page_get_pagecnt(targ->p_szc); 5121 5122 if (repl == NULL) { 5123 dofree = npgs; /* Size of target page in MMU pages */ 5124 if (!page_create_wait(dofree, 0)) { 5125 if (grouplock != 0) { 5126 group_page_unlock(targ); 5127 } 5128 VM_STAT_ADD(vmm_vmstats.ppr_relocnomem[szc]); 5129 return (ENOMEM); 5130 } 5131 5132 /* 5133 * seg kmem pages require that the target and replacement 5134 * page be the same pagesize. 5135 */ 5136 flags = (targ->p_vnode == &kvp) ? PGR_SAMESZC : 0; 5137 repl = page_get_replacement_page(targ, lgrp, flags); 5138 if (repl == NULL) { 5139 if (grouplock != 0) { 5140 group_page_unlock(targ); 5141 } 5142 page_create_putback(dofree); 5143 VM_STAT_ADD(vmm_vmstats.ppr_relocnomem[szc]); 5144 return (ENOMEM); 5145 } 5146 } 5147 #ifdef DEBUG 5148 else { 5149 ASSERT(PAGE_LOCKED(repl)); 5150 } 5151 #endif /* DEBUG */ 5152 5153 #if defined(__sparc) 5154 /* 5155 * Let hat_page_relocate() complete the relocation if it's kernel page 5156 */ 5157 if (targ->p_vnode == &kvp) { 5158 *replacement = repl; 5159 if (hat_page_relocate(target, replacement, nrelocp) != 0) { 5160 if (grouplock != 0) { 5161 group_page_unlock(targ); 5162 } 5163 if (dofree) { 5164 *replacement = NULL; 5165 page_free_replacement_page(repl); 5166 page_create_putback(dofree); 5167 } 5168 VM_STAT_ADD(vmm_vmstats.ppr_krelocfail[szc]); 5169 return (EAGAIN); 5170 } 5171 VM_STAT_ADD(vmm_vmstats.ppr_relocok[szc]); 5172 return (0); 5173 } 5174 #else 5175 #if defined(lint) 5176 dofree = dofree; 5177 #endif 5178 #endif 5179 5180 #ifdef DEBUG 5181 first_repl = repl; 5182 #endif /* DEBUG */ 5183 5184 for (i = 0; i < npgs; i++) { 5185 ASSERT(PAGE_EXCL(targ)); 5186 5187 (void) hat_pageunload(targ, HAT_FORCE_PGUNLOAD); 5188 5189 ASSERT(hat_page_getshare(targ) == 0); 5190 ASSERT(!PP_ISFREE(targ)); 5191 ASSERT(targ->p_pagenum == (pfn + i)); 5192 ASSERT(repl_contig == 0 || 5193 repl->p_pagenum == (repl_pfn + i)); 5194 5195 /* 5196 * Copy the page contents and attributes then 5197 * relocate the page in the page hash. 5198 */ 5199 ppcopy(targ, repl); 5200 ppattr = hat_page_getattr(targ, (P_MOD | P_REF | P_RO)); 5201 page_clr_all_props(repl); 5202 page_set_props(repl, ppattr); 5203 page_relocate_hash(repl, targ); 5204 5205 ASSERT(hat_page_getshare(targ) == 0); 5206 ASSERT(hat_page_getshare(repl) == 0); 5207 /* 5208 * Now clear the props on targ, after the 5209 * page_relocate_hash(), they no longer 5210 * have any meaning. 5211 */ 5212 page_clr_all_props(targ); 5213 ASSERT(targ->p_next == targ); 5214 ASSERT(targ->p_prev == targ); 5215 page_list_concat(&pl, &targ); 5216 5217 targ++; 5218 if (repl_contig != 0) { 5219 repl++; 5220 } else { 5221 repl = repl->p_next; 5222 } 5223 } 5224 /* assert that we have come full circle with repl */ 5225 ASSERT(repl_contig == 1 || first_repl == repl); 5226 5227 *target = pl; 5228 if (*replacement == NULL) { 5229 ASSERT(first_repl == repl); 5230 *replacement = repl; 5231 } 5232 VM_STAT_ADD(vmm_vmstats.ppr_relocok[szc]); 5233 *nrelocp = npgs; 5234 return (0); 5235 } 5236 /* 5237 * On success returns 0 and *nrelocp the number of PAGESIZE pages relocated. 5238 */ 5239 int 5240 page_relocate( 5241 page_t **target, 5242 page_t **replacement, 5243 int grouplock, 5244 int freetarget, 5245 spgcnt_t *nrelocp, 5246 lgrp_t *lgrp) 5247 { 5248 spgcnt_t ret; 5249 5250 /* do_page_relocate returns 0 on success or errno value */ 5251 ret = do_page_relocate(target, replacement, grouplock, nrelocp, lgrp); 5252 5253 if (ret != 0 || freetarget == 0) { 5254 return (ret); 5255 } 5256 if (*nrelocp == 1) { 5257 ASSERT(*target != NULL); 5258 page_free(*target, 1); 5259 } else { 5260 page_t *tpp = *target; 5261 uint_t szc = tpp->p_szc; 5262 pgcnt_t npgs = page_get_pagecnt(szc); 5263 ASSERT(npgs > 1); 5264 ASSERT(szc != 0); 5265 do { 5266 ASSERT(PAGE_EXCL(tpp)); 5267 ASSERT(!hat_page_is_mapped(tpp)); 5268 ASSERT(tpp->p_szc == szc); 5269 PP_SETFREE(tpp); 5270 PP_SETAGED(tpp); 5271 npgs--; 5272 } while ((tpp = tpp->p_next) != *target); 5273 ASSERT(npgs == 0); 5274 page_list_add_pages(*target, 0); 5275 npgs = page_get_pagecnt(szc); 5276 page_create_putback(npgs); 5277 } 5278 return (ret); 5279 } 5280 5281 /* 5282 * it is up to the caller to deal with pcf accounting. 5283 */ 5284 void 5285 page_free_replacement_page(page_t *pplist) 5286 { 5287 page_t *pp; 5288 5289 while (pplist != NULL) { 5290 /* 5291 * pp_targ is a linked list. 5292 */ 5293 pp = pplist; 5294 if (pp->p_szc == 0) { 5295 page_sub(&pplist, pp); 5296 page_clr_all_props(pp); 5297 PP_SETFREE(pp); 5298 PP_SETAGED(pp); 5299 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL); 5300 page_unlock(pp); 5301 VM_STAT_ADD(pagecnt.pc_free_replacement_page[0]); 5302 } else { 5303 spgcnt_t curnpgs = page_get_pagecnt(pp->p_szc); 5304 page_t *tpp; 5305 page_list_break(&pp, &pplist, curnpgs); 5306 tpp = pp; 5307 do { 5308 ASSERT(PAGE_EXCL(tpp)); 5309 ASSERT(!hat_page_is_mapped(tpp)); 5310 page_clr_all_props(pp); 5311 PP_SETFREE(tpp); 5312 PP_SETAGED(tpp); 5313 } while ((tpp = tpp->p_next) != pp); 5314 page_list_add_pages(pp, 0); 5315 VM_STAT_ADD(pagecnt.pc_free_replacement_page[1]); 5316 } 5317 } 5318 } 5319 5320 /* 5321 * Relocate target to non-relocatable replacement page. 5322 */ 5323 int 5324 page_relocate_cage(page_t **target, page_t **replacement) 5325 { 5326 page_t *tpp, *rpp; 5327 spgcnt_t pgcnt, npgs; 5328 int result; 5329 5330 tpp = *target; 5331 5332 ASSERT(PAGE_EXCL(tpp)); 5333 ASSERT(tpp->p_szc == 0); 5334 5335 pgcnt = btop(page_get_pagesize(tpp->p_szc)); 5336 5337 do { 5338 (void) page_create_wait(pgcnt, PG_WAIT | PG_NORELOC); 5339 rpp = page_get_replacement_page(tpp, NULL, PGR_NORELOC); 5340 if (rpp == NULL) { 5341 page_create_putback(pgcnt); 5342 kcage_cageout_wakeup(); 5343 } 5344 } while (rpp == NULL); 5345 5346 ASSERT(PP_ISNORELOC(rpp)); 5347 5348 result = page_relocate(&tpp, &rpp, 0, 1, &npgs, NULL); 5349 5350 if (result == 0) { 5351 *replacement = rpp; 5352 if (pgcnt != npgs) 5353 panic("page_relocate_cage: partial relocation"); 5354 } 5355 5356 return (result); 5357 } 5358 5359 /* 5360 * Release the page lock on a page, place on cachelist 5361 * tail if no longer mapped. Caller can let us know if 5362 * the page is known to be clean. 5363 */ 5364 int 5365 page_release(page_t *pp, int checkmod) 5366 { 5367 int status; 5368 5369 ASSERT(PAGE_LOCKED(pp) && !PP_ISFREE(pp) && 5370 (pp->p_vnode != NULL)); 5371 5372 if (!hat_page_is_mapped(pp) && !IS_SWAPVP(pp->p_vnode) && 5373 ((PAGE_SHARED(pp) && page_tryupgrade(pp)) || PAGE_EXCL(pp)) && 5374 pp->p_lckcnt == 0 && pp->p_cowcnt == 0 && 5375 !hat_page_is_mapped(pp)) { 5376 5377 /* 5378 * If page is modified, unlock it 5379 * 5380 * (p_nrm & P_MOD) bit has the latest stuff because: 5381 * (1) We found that this page doesn't have any mappings 5382 * _after_ holding SE_EXCL and 5383 * (2) We didn't drop SE_EXCL lock after the check in (1) 5384 */ 5385 if (checkmod && hat_ismod(pp)) { 5386 page_unlock(pp); 5387 status = PGREL_MOD; 5388 } else { 5389 /*LINTED: constant in conditional context*/ 5390 VN_DISPOSE(pp, B_FREE, 0, kcred); 5391 status = PGREL_CLEAN; 5392 } 5393 } else { 5394 page_unlock(pp); 5395 status = PGREL_NOTREL; 5396 } 5397 return (status); 5398 } 5399 5400 int 5401 page_try_demote_pages(page_t *pp) 5402 { 5403 page_t *tpp, *rootpp = pp; 5404 pfn_t pfn = page_pptonum(pp); 5405 spgcnt_t i, npgs; 5406 uint_t szc = pp->p_szc; 5407 vnode_t *vp = pp->p_vnode; 5408 5409 ASSERT(PAGE_EXCL(rootpp)); 5410 5411 VM_STAT_ADD(pagecnt.pc_try_demote_pages[0]); 5412 5413 if (rootpp->p_szc == 0) { 5414 VM_STAT_ADD(pagecnt.pc_try_demote_pages[1]); 5415 return (1); 5416 } 5417 5418 if (vp != NULL && !IS_SWAPFSVP(vp) && vp != &kvp) { 5419 VM_STAT_ADD(pagecnt.pc_try_demote_pages[2]); 5420 page_demote_vp_pages(rootpp); 5421 ASSERT(pp->p_szc == 0); 5422 return (1); 5423 } 5424 5425 /* 5426 * Adjust rootpp if passed in is not the base 5427 * constituent page. 5428 */ 5429 npgs = page_get_pagecnt(rootpp->p_szc); 5430 ASSERT(npgs > 1); 5431 if (!IS_P2ALIGNED(pfn, npgs)) { 5432 pfn = P2ALIGN(pfn, npgs); 5433 rootpp = page_numtopp_nolock(pfn); 5434 VM_STAT_ADD(pagecnt.pc_try_demote_pages[3]); 5435 ASSERT(rootpp->p_vnode != NULL); 5436 ASSERT(rootpp->p_szc == szc); 5437 } 5438 5439 /* 5440 * We can't demote kernel pages since we can't hat_unload() 5441 * the mappings. 5442 */ 5443 if (rootpp->p_vnode == &kvp) 5444 return (0); 5445 5446 /* 5447 * Attempt to lock all constituent pages except the page passed 5448 * in since it's already locked. 5449 */ 5450 for (tpp = rootpp, i = 0; i < npgs; i++, tpp++) { 5451 ASSERT(!PP_ISFREE(tpp)); 5452 ASSERT(tpp->p_vnode != NULL); 5453 5454 if (tpp != pp && !page_trylock(tpp, SE_EXCL)) 5455 break; 5456 ASSERT(tpp->p_szc == rootpp->p_szc); 5457 ASSERT(page_pptonum(tpp) == page_pptonum(rootpp) + i); 5458 (void) hat_pageunload(tpp, HAT_FORCE_PGUNLOAD); 5459 } 5460 5461 /* 5462 * If we failed to lock them all then unlock what we have locked 5463 * so far and bail. 5464 */ 5465 if (i < npgs) { 5466 tpp = rootpp; 5467 while (i-- > 0) { 5468 if (tpp != pp) 5469 page_unlock(tpp); 5470 tpp++; 5471 } 5472 VM_STAT_ADD(pagecnt.pc_try_demote_pages[4]); 5473 return (0); 5474 } 5475 5476 /* 5477 * XXX probably p_szc clearing and page unlocking can be done within 5478 * one loop but since this is rare code we can play very safe. 5479 */ 5480 for (tpp = rootpp, i = 0; i < npgs; i++, tpp++) { 5481 ASSERT(PAGE_EXCL(tpp)); 5482 tpp->p_szc = 0; 5483 } 5484 5485 /* 5486 * Unlock all pages except the page passed in. 5487 */ 5488 for (tpp = rootpp, i = 0; i < npgs; i++, tpp++) { 5489 ASSERT(!hat_page_is_mapped(tpp)); 5490 if (tpp != pp) 5491 page_unlock(tpp); 5492 } 5493 VM_STAT_ADD(pagecnt.pc_try_demote_pages[5]); 5494 return (1); 5495 } 5496 5497 /* 5498 * Called by page_free() and page_destroy() to demote the page size code 5499 * (p_szc) to 0 (since we can't just put a single PAGESIZE page with non zero 5500 * p_szc on free list, neither can we just clear p_szc of a single page_t 5501 * within a large page since it will break other code that relies on p_szc 5502 * being the same for all page_t's of a large page). Anonymous pages should 5503 * never end up here because anon_map_getpages() cannot deal with p_szc 5504 * changes after a single constituent page is locked. While anonymous or 5505 * kernel large pages are demoted or freed the entire large page at a time 5506 * with all constituent pages locked EXCL for the file system pages we 5507 * have to be able to demote a large page (i.e. decrease all constituent pages 5508 * p_szc) with only just an EXCL lock on one of constituent pages. The reason 5509 * we can easily deal with anonymous page demotion the entire large page at a 5510 * time is that those operation originate at address space level and concern 5511 * the entire large page region with actual demotion only done when pages are 5512 * not shared with any other processes (therefore we can always get EXCL lock 5513 * on all anonymous constituent pages after clearing segment page 5514 * cache). However file system pages can be truncated or invalidated at a 5515 * PAGESIZE level from the file system side and end up in page_free() or 5516 * page_destroy() (we also allow only part of the large page to be SOFTLOCKed 5517 * and therfore pageout should be able to demote a large page by EXCL locking 5518 * any constituent page that is not under SOFTLOCK). In those cases we cannot 5519 * rely on being able to lock EXCL all constituent pages. 5520 * 5521 * To prevent szc changes on file system pages one has to lock all constituent 5522 * pages at least SHARED (or call page_szc_lock()). The only subsystem that 5523 * doesn't rely on locking all constituent pages (or using page_szc_lock()) to 5524 * prevent szc changes is hat layer that uses its own page level mlist 5525 * locks. hat assumes that szc doesn't change after mlist lock for a page is 5526 * taken. Therefore we need to change szc under hat level locks if we only 5527 * have an EXCL lock on a single constituent page and hat still references any 5528 * of constituent pages. (Note we can't "ignore" hat layer by simply 5529 * hat_pageunload() all constituent pages without having EXCL locks on all of 5530 * constituent pages). We use hat_page_demote() call to safely demote szc of 5531 * all constituent pages under hat locks when we only have an EXCL lock on one 5532 * of constituent pages. 5533 * 5534 * This routine calls page_szc_lock() before calling hat_page_demote() to 5535 * allow segvn in one special case not to lock all constituent pages SHARED 5536 * before calling hat_memload_array() that relies on p_szc not changeing even 5537 * before hat level mlist lock is taken. In that case segvn uses 5538 * page_szc_lock() to prevent hat_page_demote() changeing p_szc values. 5539 * 5540 * Anonymous or kernel page demotion still has to lock all pages exclusively 5541 * and do hat_pageunload() on all constituent pages before demoting the page 5542 * therefore there's no need for anonymous or kernel page demotion to use 5543 * hat_page_demote() mechanism. 5544 * 5545 * hat_page_demote() removes all large mappings that map pp and then decreases 5546 * p_szc starting from the last constituent page of the large page. By working 5547 * from the tail of a large page in pfn decreasing order allows one looking at 5548 * the root page to know that hat_page_demote() is done for root's szc area. 5549 * e.g. if a root page has szc 1 one knows it only has to lock all constituent 5550 * pages within szc 1 area to prevent szc changes because hat_page_demote() 5551 * that started on this page when it had szc > 1 is done for this szc 1 area. 5552 * 5553 * We are guranteed that all constituent pages of pp's large page belong to 5554 * the same vnode with the consecutive offsets increasing in the direction of 5555 * the pfn i.e. the identity of constituent pages can't change until their 5556 * p_szc is decreased. Therefore it's safe for hat_page_demote() to remove 5557 * large mappings to pp even though we don't lock any constituent page except 5558 * pp (i.e. we won't unload e.g. kernel locked page). 5559 */ 5560 static void 5561 page_demote_vp_pages(page_t *pp) 5562 { 5563 kmutex_t *mtx; 5564 5565 ASSERT(PAGE_EXCL(pp)); 5566 ASSERT(!PP_ISFREE(pp)); 5567 ASSERT(pp->p_vnode != NULL); 5568 ASSERT(!IS_SWAPFSVP(pp->p_vnode)); 5569 ASSERT(pp->p_vnode != &kvp); 5570 5571 VM_STAT_ADD(pagecnt.pc_demote_pages[0]); 5572 5573 mtx = page_szc_lock(pp); 5574 if (mtx != NULL) { 5575 hat_page_demote(pp); 5576 mutex_exit(mtx); 5577 } 5578 ASSERT(pp->p_szc == 0); 5579 } 5580 5581 /* 5582 * Page retire operation. 5583 * 5584 * page_retire() 5585 * Attempt to retire (throw away) page pp. We cannot do this if 5586 * the page is dirty; if the page is clean, we can try. We return 0 on 5587 * success, -1 on failure. This routine should be invoked by the platform's 5588 * memory error detection code. 5589 * 5590 * pages_retired_limit_exceeded() 5591 * We set a limit on the number of pages which may be retired. This 5592 * is set to a percentage of total physical memory. This limit is 5593 * enforced here. 5594 */ 5595 5596 static pgcnt_t retired_pgcnt = 0; 5597 5598 /* 5599 * routines to update the count of retired pages 5600 */ 5601 static void 5602 page_retired(page_t *pp) 5603 { 5604 ASSERT(pp); 5605 5606 page_settoxic(pp, PAGE_IS_RETIRED); 5607 atomic_add_long(&retired_pgcnt, 1); 5608 } 5609 5610 static void 5611 retired_page_removed(page_t *pp) 5612 { 5613 ASSERT(pp); 5614 ASSERT(page_isretired(pp)); 5615 ASSERT(retired_pgcnt > 0); 5616 5617 page_clrtoxic(pp); 5618 atomic_add_long(&retired_pgcnt, -1); 5619 } 5620 5621 5622 static int 5623 pages_retired_limit_exceeded() 5624 { 5625 pgcnt_t retired_max; 5626 5627 /* 5628 * If the percentage is zero or is not set correctly, 5629 * return TRUE so that pages are not retired. 5630 */ 5631 if (max_pages_retired_bps <= 0 || 5632 max_pages_retired_bps >= 10000) 5633 return (1); 5634 5635 /* 5636 * Calculate the maximum number of pages allowed to 5637 * be retired as a percentage of total physical memory 5638 * (Remember that we are using basis points, hence the 10000.) 5639 */ 5640 retired_max = (physmem * max_pages_retired_bps) / 10000; 5641 5642 /* 5643 * return 'TRUE' if we have already retired more 5644 * than the legal limit 5645 */ 5646 return (retired_pgcnt >= retired_max); 5647 } 5648 5649 #define PAGE_RETIRE_SELOCK 0 5650 #define PAGE_RETIRE_NORECLAIM 1 5651 #define PAGE_RETIRE_LOCKED 2 5652 #define PAGE_RETIRE_COW 3 5653 #define PAGE_RETIRE_DIRTY 4 5654 #define PAGE_RETIRE_LPAGE 5 5655 #define PAGE_RETIRE_SUCCESS 6 5656 #define PAGE_RETIRE_LIMIT 7 5657 #define PAGE_RETIRE_NCODES 8 5658 5659 typedef struct page_retire_op { 5660 int pr_count; 5661 short pr_unlock; 5662 short pr_retval; 5663 char *pr_message; 5664 } page_retire_op_t; 5665 5666 page_retire_op_t page_retire_ops[PAGE_RETIRE_NCODES] = { 5667 { 0, 0, -1, "cannot lock page" }, 5668 { 0, 0, -1, "cannot reclaim cached page" }, 5669 { 0, 1, -1, "page is locked" }, 5670 { 0, 1, -1, "copy-on-write page" }, 5671 { 0, 1, -1, "page is dirty" }, 5672 { 0, 1, -1, "cannot demote large page" }, 5673 { 0, 0, 0, "page successfully retired" }, 5674 { 0, 0, -1, "excess pages retired already" }, 5675 }; 5676 5677 static int 5678 page_retire_done(page_t *pp, int code) 5679 { 5680 page_retire_op_t *prop = &page_retire_ops[code]; 5681 5682 prop->pr_count++; 5683 5684 if (prop->pr_unlock) 5685 page_unlock(pp); 5686 5687 if (page_retire_messages > 1) { 5688 printf("page_retire(%p) pfn 0x%lx %s: %s\n", 5689 (void *)pp, page_pptonum(pp), 5690 prop->pr_retval == -1 ? "failed" : "succeeded", 5691 prop->pr_message); 5692 } 5693 5694 return (prop->pr_retval); 5695 } 5696 5697 int 5698 page_retire(page_t *pp, uchar_t flag) 5699 { 5700 uint64_t pa = ptob((uint64_t)page_pptonum(pp)); 5701 5702 ASSERT(flag == PAGE_IS_FAILING || flag == PAGE_IS_TOXIC); 5703 5704 /* 5705 * DR operations change the association between a page_t 5706 * and the physical page it represents. Check if the 5707 * page is still bad. 5708 */ 5709 if (!page_isfaulty(pp)) { 5710 page_clrtoxic(pp); 5711 return (page_retire_done(pp, PAGE_RETIRE_SUCCESS)); 5712 } 5713 5714 /* 5715 * We set the flag here so that even if we fail due 5716 * to exceeding the limit for retired pages, the 5717 * page will still be checked and either cleared 5718 * or retired in page_free(). 5719 */ 5720 page_settoxic(pp, flag); 5721 5722 if (flag == PAGE_IS_TOXIC) { 5723 if (page_retire_messages) { 5724 cmn_err(CE_NOTE, "Scheduling clearing of error on" 5725 " page 0x%08x.%08x", 5726 (uint32_t)(pa >> 32), (uint32_t)pa); 5727 } 5728 5729 } else { /* PAGE_IS_FAILING */ 5730 if (pages_retired_limit_exceeded()) { 5731 /* 5732 * Return as we have already exceeded the 5733 * maximum number of pages allowed to be 5734 * retired 5735 */ 5736 return (page_retire_done(pp, PAGE_RETIRE_LIMIT)); 5737 } 5738 5739 if (page_retire_messages) { 5740 cmn_err(CE_NOTE, "Scheduling removal of " 5741 "page 0x%08x.%08x", 5742 (uint32_t)(pa >> 32), (uint32_t)pa); 5743 } 5744 } 5745 5746 if (PAGE_LOCKED(pp) || !page_trylock(pp, SE_EXCL)) 5747 return (page_retire_done(pp, PAGE_RETIRE_SELOCK)); 5748 5749 /* 5750 * If this is a large page we first try and demote it 5751 * to PAGESIZE pages and then dispose of the toxic page. 5752 * On failure we will let the page free/destroy 5753 * code handle it later since this is a mapped page. 5754 * Note that free large pages can always be demoted. 5755 * 5756 */ 5757 if (pp->p_szc != 0) { 5758 if (PP_ISFREE(pp)) 5759 (void) page_demote_free_pages(pp); 5760 else 5761 (void) page_try_demote_pages(pp); 5762 5763 if (pp->p_szc != 0) 5764 return (page_retire_done(pp, PAGE_RETIRE_LPAGE)); 5765 } 5766 5767 if (PP_ISFREE(pp)) { 5768 if (!page_reclaim(pp, NULL)) 5769 return (page_retire_done(pp, PAGE_RETIRE_NORECLAIM)); 5770 /*LINTED: constant in conditional context*/ 5771 VN_DISPOSE(pp, pp->p_vnode ? B_INVAL : B_FREE, 0, kcred) 5772 return (page_retire_done(pp, PAGE_RETIRE_SUCCESS)); 5773 } 5774 5775 if (pp->p_lckcnt != 0) 5776 return (page_retire_done(pp, PAGE_RETIRE_LOCKED)); 5777 5778 if (pp->p_cowcnt != 0) 5779 return (page_retire_done(pp, PAGE_RETIRE_COW)); 5780 5781 /* 5782 * Unload all translations to this page. No new translations 5783 * can be created while we hold the exclusive lock on the page. 5784 */ 5785 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 5786 5787 if (hat_ismod(pp)) 5788 return (page_retire_done(pp, PAGE_RETIRE_DIRTY)); 5789 5790 /*LINTED: constant in conditional context*/ 5791 VN_DISPOSE(pp, B_INVAL, 0, kcred); 5792 5793 return (page_retire_done(pp, PAGE_RETIRE_SUCCESS)); 5794 } 5795 5796 /* 5797 * Mark any existing pages for migration in the given range 5798 */ 5799 void 5800 page_mark_migrate(struct seg *seg, caddr_t addr, size_t len, 5801 struct anon_map *amp, ulong_t anon_index, vnode_t *vp, 5802 u_offset_t vnoff, int rflag) 5803 { 5804 struct anon *ap; 5805 vnode_t *curvp; 5806 lgrp_t *from; 5807 pgcnt_t i; 5808 pgcnt_t nlocked; 5809 u_offset_t off; 5810 pfn_t pfn; 5811 size_t pgsz; 5812 size_t segpgsz; 5813 pgcnt_t pages; 5814 uint_t pszc; 5815 page_t **ppa; 5816 pgcnt_t ppa_nentries; 5817 page_t *pp; 5818 caddr_t va; 5819 ulong_t an_idx; 5820 anon_sync_obj_t cookie; 5821 5822 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 5823 5824 /* 5825 * Don't do anything if don't need to do lgroup optimizations 5826 * on this system 5827 */ 5828 if (!lgrp_optimizations()) 5829 return; 5830 5831 /* 5832 * Align address and length to (potentially large) page boundary 5833 */ 5834 segpgsz = page_get_pagesize(seg->s_szc); 5835 addr = (caddr_t)P2ALIGN((uintptr_t)addr, segpgsz); 5836 if (rflag) 5837 len = P2ROUNDUP(len, segpgsz); 5838 5839 /* 5840 * Allocate page array to accomodate largest page size 5841 */ 5842 pgsz = page_get_pagesize(page_num_pagesizes() - 1); 5843 ppa_nentries = btop(pgsz); 5844 ppa = kmem_zalloc(ppa_nentries * sizeof (page_t *), KM_SLEEP); 5845 5846 /* 5847 * Do one (large) page at a time 5848 */ 5849 va = addr; 5850 while (va < addr + len) { 5851 /* 5852 * Lookup (root) page for vnode and offset corresponding to 5853 * this virtual address 5854 * Try anonmap first since there may be copy-on-write 5855 * pages, but initialize vnode pointer and offset using 5856 * vnode arguments just in case there isn't an amp. 5857 */ 5858 curvp = vp; 5859 off = vnoff + va - seg->s_base; 5860 if (amp) { 5861 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 5862 an_idx = anon_index + seg_page(seg, va); 5863 anon_array_enter(amp, an_idx, &cookie); 5864 ap = anon_get_ptr(amp->ahp, an_idx); 5865 if (ap) 5866 swap_xlate(ap, &curvp, &off); 5867 anon_array_exit(&cookie); 5868 ANON_LOCK_EXIT(&->a_rwlock); 5869 } 5870 5871 pp = NULL; 5872 if (curvp) 5873 pp = page_lookup(curvp, off, SE_SHARED); 5874 5875 /* 5876 * If there isn't a page at this virtual address, 5877 * skip to next page 5878 */ 5879 if (pp == NULL) { 5880 va += PAGESIZE; 5881 continue; 5882 } 5883 5884 /* 5885 * Figure out which lgroup this page is in for kstats 5886 */ 5887 pfn = page_pptonum(pp); 5888 from = lgrp_pfn_to_lgrp(pfn); 5889 5890 /* 5891 * Get page size, and round up and skip to next page boundary 5892 * if unaligned address 5893 */ 5894 pszc = pp->p_szc; 5895 pgsz = page_get_pagesize(pszc); 5896 pages = btop(pgsz); 5897 if (!IS_P2ALIGNED(va, pgsz) || 5898 !IS_P2ALIGNED(pfn, pages) || 5899 pgsz > segpgsz) { 5900 pgsz = MIN(pgsz, segpgsz); 5901 page_unlock(pp); 5902 i = btop(P2END((uintptr_t)va, pgsz) - 5903 (uintptr_t)va); 5904 va = (caddr_t)P2END((uintptr_t)va, pgsz); 5905 lgrp_stat_add(from->lgrp_id, LGRP_PMM_FAIL_PGS, i); 5906 continue; 5907 } 5908 5909 /* 5910 * Upgrade to exclusive lock on page 5911 */ 5912 if (!page_tryupgrade(pp)) { 5913 page_unlock(pp); 5914 va += pgsz; 5915 lgrp_stat_add(from->lgrp_id, LGRP_PMM_FAIL_PGS, 5916 btop(pgsz)); 5917 continue; 5918 } 5919 5920 /* 5921 * Remember pages locked exclusively and how many 5922 */ 5923 ppa[0] = pp; 5924 nlocked = 1; 5925 5926 /* 5927 * Lock constituent pages if this is large page 5928 */ 5929 if (pages > 1) { 5930 /* 5931 * Lock all constituents except root page, since it 5932 * should be locked already. 5933 */ 5934 for (i = 1; i < pages; i++) { 5935 pp++; 5936 if (!page_trylock(pp, SE_EXCL)) { 5937 break; 5938 } 5939 if (PP_ISFREE(pp) || 5940 pp->p_szc != pszc) { 5941 /* 5942 * hat_page_demote() raced in with us. 5943 */ 5944 ASSERT(!IS_SWAPFSVP(curvp)); 5945 page_unlock(pp); 5946 break; 5947 } 5948 ppa[nlocked] = pp; 5949 nlocked++; 5950 } 5951 } 5952 5953 /* 5954 * If all constituent pages couldn't be locked, 5955 * unlock pages locked so far and skip to next page. 5956 */ 5957 if (nlocked != pages) { 5958 for (i = 0; i < nlocked; i++) 5959 page_unlock(ppa[i]); 5960 va += pgsz; 5961 lgrp_stat_add(from->lgrp_id, LGRP_PMM_FAIL_PGS, 5962 btop(pgsz)); 5963 continue; 5964 } 5965 5966 /* 5967 * hat_page_demote() can no longer happen 5968 * since last cons page had the right p_szc after 5969 * all cons pages were locked. all cons pages 5970 * should now have the same p_szc. 5971 */ 5972 5973 /* 5974 * All constituent pages locked successfully, so mark 5975 * large page for migration and unload the mappings of 5976 * constituent pages, so a fault will occur on any part of the 5977 * large page 5978 */ 5979 PP_SETMIGRATE(ppa[0]); 5980 for (i = 0; i < nlocked; i++) { 5981 pp = ppa[i]; 5982 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 5983 ASSERT(hat_page_getshare(pp) == 0); 5984 page_unlock(pp); 5985 } 5986 lgrp_stat_add(from->lgrp_id, LGRP_PMM_PGS, nlocked); 5987 5988 va += pgsz; 5989 } 5990 kmem_free(ppa, ppa_nentries * sizeof (page_t *)); 5991 } 5992 5993 /* 5994 * Migrate any pages that have been marked for migration in the given range 5995 */ 5996 void 5997 page_migrate( 5998 struct seg *seg, 5999 caddr_t addr, 6000 page_t **ppa, 6001 pgcnt_t npages) 6002 { 6003 lgrp_t *from; 6004 lgrp_t *to; 6005 page_t *newpp; 6006 page_t *pp; 6007 pfn_t pfn; 6008 size_t pgsz; 6009 spgcnt_t page_cnt; 6010 spgcnt_t i; 6011 uint_t pszc; 6012 6013 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 6014 6015 while (npages > 0) { 6016 pp = *ppa; 6017 pszc = pp->p_szc; 6018 pgsz = page_get_pagesize(pszc); 6019 page_cnt = btop(pgsz); 6020 6021 /* 6022 * Check to see whether this page is marked for migration 6023 * 6024 * Assume that root page of large page is marked for 6025 * migration and none of the other constituent pages 6026 * are marked. This really simplifies clearing the 6027 * migrate bit by not having to clear it from each 6028 * constituent page. 6029 * 6030 * note we don't want to relocate an entire large page if 6031 * someone is only using one subpage. 6032 */ 6033 if (npages < page_cnt) 6034 break; 6035 6036 /* 6037 * Is it marked for migration? 6038 */ 6039 if (!PP_ISMIGRATE(pp)) 6040 goto next; 6041 6042 /* 6043 * Determine lgroups that page is being migrated between 6044 */ 6045 pfn = page_pptonum(pp); 6046 if (!IS_P2ALIGNED(pfn, page_cnt)) { 6047 break; 6048 } 6049 from = lgrp_pfn_to_lgrp(pfn); 6050 to = lgrp_mem_choose(seg, addr, pgsz); 6051 6052 /* 6053 * Check to see whether we are trying to migrate page to lgroup 6054 * where it is allocated already 6055 */ 6056 if (to == from) { 6057 PP_CLRMIGRATE(pp); 6058 goto next; 6059 } 6060 6061 /* 6062 * Need to get exclusive lock's to migrate 6063 */ 6064 for (i = 0; i < page_cnt; i++) { 6065 ASSERT(PAGE_LOCKED(ppa[i])); 6066 if (page_pptonum(ppa[i]) != pfn + i || 6067 ppa[i]->p_szc != pszc) { 6068 break; 6069 } 6070 if (!page_tryupgrade(ppa[i])) { 6071 lgrp_stat_add(from->lgrp_id, 6072 LGRP_PM_FAIL_LOCK_PGS, 6073 page_cnt); 6074 break; 6075 } 6076 } 6077 if (i != page_cnt) { 6078 while (--i != -1) { 6079 page_downgrade(ppa[i]); 6080 } 6081 goto next; 6082 } 6083 6084 (void) page_create_wait(page_cnt, PG_WAIT); 6085 newpp = page_get_replacement_page(pp, to, PGR_SAMESZC); 6086 if (newpp == NULL) { 6087 page_create_putback(page_cnt); 6088 for (i = 0; i < page_cnt; i++) { 6089 page_downgrade(ppa[i]); 6090 } 6091 lgrp_stat_add(to->lgrp_id, LGRP_PM_FAIL_ALLOC_PGS, 6092 page_cnt); 6093 goto next; 6094 } 6095 ASSERT(newpp->p_szc == pszc); 6096 /* 6097 * Clear migrate bit and relocate page 6098 */ 6099 PP_CLRMIGRATE(pp); 6100 if (page_relocate(&pp, &newpp, 0, 1, &page_cnt, to)) { 6101 panic("page_migrate: page_relocate failed"); 6102 } 6103 ASSERT(page_cnt * PAGESIZE == pgsz); 6104 6105 /* 6106 * Keep stats for number of pages migrated from and to 6107 * each lgroup 6108 */ 6109 lgrp_stat_add(from->lgrp_id, LGRP_PM_SRC_PGS, page_cnt); 6110 lgrp_stat_add(to->lgrp_id, LGRP_PM_DEST_PGS, page_cnt); 6111 /* 6112 * update the page_t array we were passed in and 6113 * unlink constituent pages of a large page. 6114 */ 6115 for (i = 0; i < page_cnt; ++i, ++pp) { 6116 ASSERT(PAGE_EXCL(newpp)); 6117 ASSERT(newpp->p_szc == pszc); 6118 ppa[i] = newpp; 6119 pp = newpp; 6120 page_sub(&newpp, pp); 6121 page_downgrade(pp); 6122 } 6123 ASSERT(newpp == NULL); 6124 next: 6125 addr += pgsz; 6126 ppa += page_cnt; 6127 npages -= page_cnt; 6128 } 6129 } 6130 6131 /* 6132 * initialize the vnode for retired pages 6133 */ 6134 static void 6135 page_retired_init(void) 6136 { 6137 vn_setops(&retired_ppages, &retired_vnodeops); 6138 } 6139 6140 /* ARGSUSED */ 6141 static void 6142 retired_dispose(vnode_t *vp, page_t *pp, int flag, int dn, cred_t *cr) 6143 { 6144 panic("retired_dispose invoked"); 6145 } 6146 6147 /* ARGSUSED */ 6148 static void 6149 retired_inactive(vnode_t *vp, cred_t *cr) 6150 {} 6151 6152 void 6153 page_unretire_pages(void) 6154 { 6155 page_t *pp; 6156 kmutex_t *vphm; 6157 vnode_t *vp; 6158 page_t *rpages[UNRETIRE_PAGES]; 6159 pgcnt_t i, npages, rmem; 6160 uint64_t pa; 6161 6162 rmem = 0; 6163 6164 for (;;) { 6165 /* 6166 * We do this in 2 steps: 6167 * 6168 * 1. We walk the retired pages list and collect a list of 6169 * pages that have the toxic field cleared. 6170 * 6171 * 2. We iterate through the page list and unretire each one. 6172 * 6173 * We have to do it in two steps on account of the mutexes that 6174 * we need to acquire. 6175 */ 6176 6177 vp = &retired_ppages; 6178 vphm = page_vnode_mutex(vp); 6179 mutex_enter(vphm); 6180 6181 if ((pp = vp->v_pages) == NULL) { 6182 mutex_exit(vphm); 6183 break; 6184 } 6185 6186 i = 0; 6187 do { 6188 ASSERT(pp != NULL); 6189 ASSERT(pp->p_vnode == vp); 6190 6191 /* 6192 * DR operations change the association between a page_t 6193 * and the physical page it represents. Check if the 6194 * page is still bad. If not, unretire it. 6195 */ 6196 if (!page_isfaulty(pp)) 6197 rpages[i++] = pp; 6198 6199 pp = pp->p_vpnext; 6200 } while ((pp != vp->v_pages) && (i < UNRETIRE_PAGES)); 6201 6202 mutex_exit(vphm); 6203 6204 npages = i; 6205 for (i = 0; i < npages; i++) { 6206 pp = rpages[i]; 6207 pa = ptob((uint64_t)page_pptonum(pp)); 6208 6209 /* 6210 * Need to upgrade the shared lock to an exclusive 6211 * lock in order to hash out the page. 6212 * 6213 * The page could have been retired but the page lock 6214 * may not have been downgraded yet. If so, skip this 6215 * page. page_free() will call this function after the 6216 * lock is downgraded. 6217 */ 6218 6219 if (!PAGE_SHARED(pp) || !page_tryupgrade(pp)) 6220 continue; 6221 6222 /* 6223 * Both page_free() and DR call this function. They 6224 * can potentially call this function at the same 6225 * time and race with each other. 6226 */ 6227 if (!page_isretired(pp) || page_isfaulty(pp)) { 6228 page_downgrade(pp); 6229 continue; 6230 } 6231 6232 cmn_err(CE_NOTE, 6233 "unretiring retired page 0x%08x.%08x", 6234 (uint32_t)(pa >> 32), (uint32_t)pa); 6235 6236 /* 6237 * When a page is removed from the retired pages vnode, 6238 * its toxic field is also cleared. So, we do not have 6239 * to do that seperately here. 6240 */ 6241 page_hashout(pp, (kmutex_t *)NULL); 6242 6243 /* 6244 * This is a good page. So, free it. 6245 */ 6246 pp->p_vnode = NULL; 6247 page_free(pp, 1); 6248 rmem++; 6249 } 6250 6251 /* 6252 * If the rpages array was filled up, then there could be more 6253 * retired pages that are not faulty. We need to iterate 6254 * again and unretire them. Otherwise, we are done. 6255 */ 6256 if (npages < UNRETIRE_PAGES) 6257 break; 6258 } 6259 6260 mutex_enter(&freemem_lock); 6261 availrmem += rmem; 6262 mutex_exit(&freemem_lock); 6263 } 6264 6265 ulong_t mem_waiters = 0; 6266 ulong_t max_count = 20; 6267 #define MAX_DELAY 0x1ff 6268 6269 /* 6270 * Check if enough memory is available to proceed. 6271 * Depending on system configuration and how much memory is 6272 * reserved for swap we need to check against two variables. 6273 * e.g. on systems with little physical swap availrmem can be 6274 * more reliable indicator of how much memory is available. 6275 * On systems with large phys swap freemem can be better indicator. 6276 * If freemem drops below threshold level don't return an error 6277 * immediately but wake up pageout to free memory and block. 6278 * This is done number of times. If pageout is not able to free 6279 * memory within certain time return an error. 6280 * The same applies for availrmem but kmem_reap is used to 6281 * free memory. 6282 */ 6283 int 6284 page_mem_avail(pgcnt_t npages) 6285 { 6286 ulong_t count; 6287 6288 #if defined(__i386) 6289 if (freemem > desfree + npages && 6290 availrmem > swapfs_reserve + npages && 6291 btop(vmem_size(heap_arena, VMEM_FREE)) > tune.t_minarmem + 6292 npages) 6293 return (1); 6294 #else 6295 if (freemem > desfree + npages && 6296 availrmem > swapfs_reserve + npages) 6297 return (1); 6298 #endif 6299 6300 count = max_count; 6301 atomic_add_long(&mem_waiters, 1); 6302 6303 while (freemem < desfree + npages && --count) { 6304 cv_signal(&proc_pageout->p_cv); 6305 if (delay_sig(hz + (mem_waiters & MAX_DELAY))) { 6306 atomic_add_long(&mem_waiters, -1); 6307 return (0); 6308 } 6309 } 6310 if (count == 0) { 6311 atomic_add_long(&mem_waiters, -1); 6312 return (0); 6313 } 6314 6315 count = max_count; 6316 while (availrmem < swapfs_reserve + npages && --count) { 6317 kmem_reap(); 6318 if (delay_sig(hz + (mem_waiters & MAX_DELAY))) { 6319 atomic_add_long(&mem_waiters, -1); 6320 return (0); 6321 } 6322 } 6323 atomic_add_long(&mem_waiters, -1); 6324 if (count == 0) 6325 return (0); 6326 6327 #if defined(__i386) 6328 if (btop(vmem_size(heap_arena, VMEM_FREE)) < 6329 tune.t_minarmem + npages) 6330 return (0); 6331 #endif 6332 return (1); 6333 } 6334 6335 6336 /* 6337 * Search the memory segments to locate the desired page. Within a 6338 * segment, pages increase linearly with one page structure per 6339 * physical page frame (size PAGESIZE). The search begins 6340 * with the segment that was accessed last, to take advantage of locality. 6341 * If the hint misses, we start from the beginning of the sorted memseg list 6342 */ 6343 6344 6345 /* 6346 * Some data structures for pfn to pp lookup. 6347 */ 6348 ulong_t mhash_per_slot; 6349 struct memseg *memseg_hash[N_MEM_SLOTS]; 6350 6351 page_t * 6352 page_numtopp_nolock(pfn_t pfnum) 6353 { 6354 struct memseg *seg; 6355 page_t *pp; 6356 vm_cpu_data_t *vc = CPU->cpu_vm_data; 6357 6358 ASSERT(vc != NULL); 6359 6360 MEMSEG_STAT_INCR(nsearch); 6361 6362 /* Try last winner first */ 6363 if (((seg = vc->vc_pnum_memseg) != NULL) && 6364 (pfnum >= seg->pages_base) && (pfnum < seg->pages_end)) { 6365 MEMSEG_STAT_INCR(nlastwon); 6366 pp = seg->pages + (pfnum - seg->pages_base); 6367 if (pp->p_pagenum == pfnum) 6368 return ((page_t *)pp); 6369 } 6370 6371 /* Else Try hash */ 6372 if (((seg = memseg_hash[MEMSEG_PFN_HASH(pfnum)]) != NULL) && 6373 (pfnum >= seg->pages_base) && (pfnum < seg->pages_end)) { 6374 MEMSEG_STAT_INCR(nhashwon); 6375 vc->vc_pnum_memseg = seg; 6376 pp = seg->pages + (pfnum - seg->pages_base); 6377 if (pp->p_pagenum == pfnum) 6378 return ((page_t *)pp); 6379 } 6380 6381 /* Else Brute force */ 6382 for (seg = memsegs; seg != NULL; seg = seg->next) { 6383 if (pfnum >= seg->pages_base && pfnum < seg->pages_end) { 6384 vc->vc_pnum_memseg = seg; 6385 pp = seg->pages + (pfnum - seg->pages_base); 6386 return ((page_t *)pp); 6387 } 6388 } 6389 vc->vc_pnum_memseg = NULL; 6390 MEMSEG_STAT_INCR(nnotfound); 6391 return ((page_t *)NULL); 6392 6393 } 6394 6395 struct memseg * 6396 page_numtomemseg_nolock(pfn_t pfnum) 6397 { 6398 struct memseg *seg; 6399 page_t *pp; 6400 6401 /* Try hash */ 6402 if (((seg = memseg_hash[MEMSEG_PFN_HASH(pfnum)]) != NULL) && 6403 (pfnum >= seg->pages_base) && (pfnum < seg->pages_end)) { 6404 pp = seg->pages + (pfnum - seg->pages_base); 6405 if (pp->p_pagenum == pfnum) 6406 return (seg); 6407 } 6408 6409 /* Else Brute force */ 6410 for (seg = memsegs; seg != NULL; seg = seg->next) { 6411 if (pfnum >= seg->pages_base && pfnum < seg->pages_end) { 6412 return (seg); 6413 } 6414 } 6415 return ((struct memseg *)NULL); 6416 } 6417 6418 /* 6419 * Given a page and a count return the page struct that is 6420 * n structs away from the current one in the global page 6421 * list. 6422 * 6423 * This function wraps to the first page upon 6424 * reaching the end of the memseg list. 6425 */ 6426 page_t * 6427 page_nextn(page_t *pp, ulong_t n) 6428 { 6429 struct memseg *seg; 6430 page_t *ppn; 6431 vm_cpu_data_t *vc = (vm_cpu_data_t *)CPU->cpu_vm_data; 6432 6433 ASSERT(vc != NULL); 6434 6435 if (((seg = vc->vc_pnext_memseg) == NULL) || 6436 (seg->pages_base == seg->pages_end) || 6437 !(pp >= seg->pages && pp < seg->epages)) { 6438 6439 for (seg = memsegs; seg; seg = seg->next) { 6440 if (pp >= seg->pages && pp < seg->epages) 6441 break; 6442 } 6443 6444 if (seg == NULL) { 6445 /* Memory delete got in, return something valid. */ 6446 /* TODO: fix me. */ 6447 seg = memsegs; 6448 pp = seg->pages; 6449 } 6450 } 6451 6452 /* check for wraparound - possible if n is large */ 6453 while ((ppn = (pp + n)) >= seg->epages || ppn < pp) { 6454 n -= seg->epages - pp; 6455 seg = seg->next; 6456 if (seg == NULL) 6457 seg = memsegs; 6458 pp = seg->pages; 6459 } 6460 vc->vc_pnext_memseg = seg; 6461 return (ppn); 6462 } 6463 6464 /* 6465 * Initialize for a loop using page_next_scan_large(). 6466 */ 6467 page_t * 6468 page_next_scan_init(void **cookie) 6469 { 6470 ASSERT(cookie != NULL); 6471 *cookie = (void *)memsegs; 6472 return ((page_t *)memsegs->pages); 6473 } 6474 6475 /* 6476 * Return the next page in a scan of page_t's, assuming we want 6477 * to skip over sub-pages within larger page sizes. 6478 * 6479 * The cookie is used to keep track of the current memseg. 6480 */ 6481 page_t * 6482 page_next_scan_large( 6483 page_t *pp, 6484 ulong_t *n, 6485 void **cookie) 6486 { 6487 struct memseg *seg = (struct memseg *)*cookie; 6488 page_t *new_pp; 6489 ulong_t cnt; 6490 pfn_t pfn; 6491 6492 6493 /* 6494 * get the count of page_t's to skip based on the page size 6495 */ 6496 ASSERT(pp != NULL); 6497 if (pp->p_szc == 0) { 6498 cnt = 1; 6499 } else { 6500 pfn = page_pptonum(pp); 6501 cnt = page_get_pagecnt(pp->p_szc); 6502 cnt -= pfn & (cnt - 1); 6503 } 6504 *n += cnt; 6505 new_pp = pp + cnt; 6506 6507 /* 6508 * Catch if we went past the end of the current memory segment. If so, 6509 * just move to the next segment with pages. 6510 */ 6511 if (new_pp >= seg->epages) { 6512 do { 6513 seg = seg->next; 6514 if (seg == NULL) 6515 seg = memsegs; 6516 } while (seg->pages == seg->epages); 6517 new_pp = seg->pages; 6518 *cookie = (void *)seg; 6519 } 6520 6521 return (new_pp); 6522 } 6523 6524 6525 /* 6526 * Returns next page in list. Note: this function wraps 6527 * to the first page in the list upon reaching the end 6528 * of the list. Callers should be aware of this fact. 6529 */ 6530 6531 /* We should change this be a #define */ 6532 6533 page_t * 6534 page_next(page_t *pp) 6535 { 6536 return (page_nextn(pp, 1)); 6537 } 6538 6539 page_t * 6540 page_first() 6541 { 6542 return ((page_t *)memsegs->pages); 6543 } 6544 6545 6546 /* 6547 * This routine is called at boot with the initial memory configuration 6548 * and when memory is added or removed. 6549 */ 6550 void 6551 build_pfn_hash() 6552 { 6553 pfn_t cur; 6554 pgcnt_t index; 6555 struct memseg *pseg; 6556 int i; 6557 6558 /* 6559 * Clear memseg_hash array. 6560 * Since memory add/delete is designed to operate concurrently 6561 * with normal operation, the hash rebuild must be able to run 6562 * concurrently with page_numtopp_nolock(). To support this 6563 * functionality, assignments to memseg_hash array members must 6564 * be done atomically. 6565 * 6566 * NOTE: bzero() does not currently guarantee this for kernel 6567 * threads, and cannot be used here. 6568 */ 6569 for (i = 0; i < N_MEM_SLOTS; i++) 6570 memseg_hash[i] = NULL; 6571 6572 hat_kpm_mseghash_clear(N_MEM_SLOTS); 6573 6574 /* 6575 * Physmax is the last valid pfn. 6576 */ 6577 mhash_per_slot = (physmax + 1) >> MEM_HASH_SHIFT; 6578 for (pseg = memsegs; pseg != NULL; pseg = pseg->next) { 6579 index = MEMSEG_PFN_HASH(pseg->pages_base); 6580 cur = pseg->pages_base; 6581 do { 6582 if (index >= N_MEM_SLOTS) 6583 index = MEMSEG_PFN_HASH(cur); 6584 6585 if (memseg_hash[index] == NULL || 6586 memseg_hash[index]->pages_base > pseg->pages_base) { 6587 memseg_hash[index] = pseg; 6588 hat_kpm_mseghash_update(index, pseg); 6589 } 6590 cur += mhash_per_slot; 6591 index++; 6592 } while (cur < pseg->pages_end); 6593 } 6594 } 6595 6596 /* 6597 * Return the pagenum for the pp 6598 */ 6599 pfn_t 6600 page_pptonum(page_t *pp) 6601 { 6602 return (pp->p_pagenum); 6603 } 6604 6605 /* 6606 * interface to the referenced and modified etc bits 6607 * in the PSM part of the page struct 6608 * when no locking is desired. 6609 */ 6610 void 6611 page_set_props(page_t *pp, uint_t flags) 6612 { 6613 ASSERT((flags & ~(P_MOD | P_REF | P_RO)) == 0); 6614 pp->p_nrm |= (uchar_t)flags; 6615 } 6616 6617 void 6618 page_clr_all_props(page_t *pp) 6619 { 6620 pp->p_nrm = 0; 6621 } 6622 6623 /* 6624 * The following functions is called from free_vp_pages() 6625 * for an inexact estimate of a newly free'd page... 6626 */ 6627 ulong_t 6628 page_share_cnt(page_t *pp) 6629 { 6630 return (hat_page_getshare(pp)); 6631 } 6632 6633 /* 6634 * The following functions are used in handling memory 6635 * errors. 6636 */ 6637 6638 int 6639 page_istoxic(page_t *pp) 6640 { 6641 return ((pp->p_toxic & PAGE_IS_TOXIC) == PAGE_IS_TOXIC); 6642 } 6643 6644 int 6645 page_isfailing(page_t *pp) 6646 { 6647 return ((pp->p_toxic & PAGE_IS_FAILING) == PAGE_IS_FAILING); 6648 } 6649 6650 int 6651 page_isretired(page_t *pp) 6652 { 6653 return ((pp->p_toxic & PAGE_IS_RETIRED) == PAGE_IS_RETIRED); 6654 } 6655 6656 int 6657 page_deteriorating(page_t *pp) 6658 { 6659 return ((pp->p_toxic & (PAGE_IS_TOXIC | PAGE_IS_FAILING)) != 0); 6660 } 6661 6662 void 6663 page_settoxic(page_t *pp, uchar_t flag) 6664 { 6665 uchar_t new_flag = 0; 6666 while ((new_flag & flag) != flag) { 6667 uchar_t old_flag = pp->p_toxic; 6668 new_flag = old_flag | flag; 6669 (void) cas8(&pp->p_toxic, old_flag, new_flag); 6670 new_flag = ((volatile page_t *)pp)->p_toxic; 6671 } 6672 } 6673 6674 void 6675 page_clrtoxic(page_t *pp) 6676 { 6677 /* 6678 * We don't need to worry about atomicity on the 6679 * p_toxic flag here as this is only called from 6680 * page_free() while holding an exclusive lock on 6681 * the page 6682 */ 6683 pp->p_toxic = PAGE_IS_OK; 6684 } 6685 6686 void 6687 page_clrtoxic_flag(page_t *pp, uchar_t flag) 6688 { 6689 uchar_t new_flag = ((volatile page_t *)pp)->p_toxic; 6690 while ((new_flag & flag) == flag) { 6691 uchar_t old_flag = new_flag; 6692 new_flag = old_flag & ~flag; 6693 (void) cas8(&pp->p_toxic, old_flag, new_flag); 6694 new_flag = ((volatile page_t *)pp)->p_toxic; 6695 } 6696 } 6697 6698 int 6699 page_isfaulty(page_t *pp) 6700 { 6701 return ((pp->p_toxic & PAGE_IS_FAULTY) == PAGE_IS_FAULTY); 6702 } 6703 6704 /* 6705 * The following four functions are called from /proc code 6706 * for the /proc/<pid>/xmap interface. 6707 */ 6708 int 6709 page_isshared(page_t *pp) 6710 { 6711 return (hat_page_getshare(pp) > 1); 6712 } 6713 6714 int 6715 page_isfree(page_t *pp) 6716 { 6717 return (PP_ISFREE(pp)); 6718 } 6719 6720 int 6721 page_isref(page_t *pp) 6722 { 6723 return (hat_page_getattr(pp, P_REF)); 6724 } 6725 6726 int 6727 page_ismod(page_t *pp) 6728 { 6729 return (hat_page_getattr(pp, P_MOD)); 6730 } 6731