1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ 27 /* All Rights Reserved */ 28 29 /* 30 * University Copyright- Copyright (c) 1982, 1986, 1988 31 * The Regents of the University of California 32 * All Rights Reserved 33 * 34 * University Acknowledgment- Portions of this document are derived from 35 * software developed by the University of California, Berkeley, and its 36 * contributors. 37 */ 38 39 #pragma ident "%Z%%M% %I% %E% SMI" 40 41 /* 42 * VM - physical page management. 43 */ 44 45 #include <sys/types.h> 46 #include <sys/t_lock.h> 47 #include <sys/param.h> 48 #include <sys/systm.h> 49 #include <sys/errno.h> 50 #include <sys/time.h> 51 #include <sys/vnode.h> 52 #include <sys/vm.h> 53 #include <sys/vtrace.h> 54 #include <sys/swap.h> 55 #include <sys/cmn_err.h> 56 #include <sys/tuneable.h> 57 #include <sys/sysmacros.h> 58 #include <sys/cpuvar.h> 59 #include <sys/callb.h> 60 #include <sys/debug.h> 61 #include <sys/tnf_probe.h> 62 #include <sys/condvar_impl.h> 63 #include <sys/mem_config.h> 64 #include <sys/mem_cage.h> 65 #include <sys/kmem.h> 66 #include <sys/atomic.h> 67 #include <sys/strlog.h> 68 #include <sys/mman.h> 69 #include <sys/ontrap.h> 70 #include <sys/lgrp.h> 71 #include <sys/vfs.h> 72 73 #include <vm/hat.h> 74 #include <vm/anon.h> 75 #include <vm/page.h> 76 #include <vm/seg.h> 77 #include <vm/pvn.h> 78 #include <vm/seg_kmem.h> 79 #include <vm/vm_dep.h> 80 #include <sys/vm_usage.h> 81 #include <fs/fs_subr.h> 82 #include <sys/ddi.h> 83 #include <sys/modctl.h> 84 85 static int nopageage = 0; 86 87 static pgcnt_t max_page_get; /* max page_get request size in pages */ 88 pgcnt_t total_pages = 0; /* total number of pages (used by /proc) */ 89 90 /* 91 * freemem_lock protects all freemem variables: 92 * availrmem. Also this lock protects the globals which track the 93 * availrmem changes for accurate kernel footprint calculation. 94 * See below for an explanation of these 95 * globals. 96 */ 97 kmutex_t freemem_lock; 98 pgcnt_t availrmem; 99 pgcnt_t availrmem_initial; 100 101 /* 102 * These globals track availrmem changes to get a more accurate 103 * estimate of tke kernel size. Historically pp_kernel is used for 104 * kernel size and is based on availrmem. But availrmem is adjusted for 105 * locked pages in the system not just for kernel locked pages. 106 * These new counters will track the pages locked through segvn and 107 * by explicit user locking. 108 * 109 * pages_locked : How many pages are locked because of user specified 110 * locking through mlock or plock. 
111 * 112 * pages_useclaim,pages_claimed : These two variables track the 113 * claim adjustments because of the protection changes on a segvn segment. 114 * 115 * All these globals are protected by the same lock which protects availrmem. 116 */ 117 pgcnt_t pages_locked = 0; 118 pgcnt_t pages_useclaim = 0; 119 pgcnt_t pages_claimed = 0; 120 121 122 /* 123 * new_freemem_lock protects freemem, freemem_wait & freemem_cv. 124 */ 125 static kmutex_t new_freemem_lock; 126 static uint_t freemem_wait; /* someone waiting for freemem */ 127 static kcondvar_t freemem_cv; 128 129 /* 130 * The logical page free list is maintained as two lists, the 'free' 131 * and the 'cache' lists. 132 * The free list contains those pages that should be reused first. 133 * 134 * The implementation of the lists is machine dependent. 135 * page_get_freelist(), page_get_cachelist(), 136 * page_list_sub(), and page_list_add() 137 * form the interface to the machine dependent implementation. 138 * 139 * Pages with p_free set are on the cache list. 140 * Pages with p_free and p_age set are on the free list, 141 * 142 * A page may be locked while on either list. 143 */ 144 145 /* 146 * free list accounting stuff. 147 * 148 * 149 * Spread out the value for the number of pages on the 150 * page free and page cache lists. If there is just one 151 * value, then it must be under just one lock. 152 * The lock contention and cache traffic are a real bother. 153 * 154 * When we acquire and then drop a single pcf lock 155 * we can start in the middle of the array of pcf structures. 156 * If we acquire more than one pcf lock at a time, we need to 157 * start at the front to avoid deadlocking. 158 * 159 * pcf_count holds the number of pages in each pool. 160 * 161 * pcf_block is set when page_create_get_something() has asked the 162 * PSM page freelist and page cachelist routines without specifying 163 * a color and nothing came back. This is used to block anything 164 * else from moving pages from one list to the other while the 165 * lists are searched again. If a page is freeed while pcf_block is 166 * set, then pcf_reserve is incremented. pcgs_unblock() takes care 167 * of clearning pcf_block, doing the wakeups, etc. 168 */ 169 170 #define MAX_PCF_FANOUT NCPU 171 static uint_t pcf_fanout = 1; /* Will get changed at boot time */ 172 static uint_t pcf_fanout_mask = 0; 173 174 struct pcf { 175 kmutex_t pcf_lock; /* protects the structure */ 176 uint_t pcf_count; /* page count */ 177 uint_t pcf_wait; /* number of waiters */ 178 uint_t pcf_block; /* pcgs flag to page_free() */ 179 uint_t pcf_reserve; /* pages freed after pcf_block set */ 180 uint_t pcf_fill[10]; /* to line up on the caches */ 181 }; 182 183 /* 184 * PCF_INDEX hash needs to be dynamic (every so often the hash changes where 185 * it will hash the cpu to). This is done to prevent a drain condition 186 * from happening. This drain condition will occur when pcf_count decrement 187 * occurs on cpu A and the increment of pcf_count always occurs on cpu B. An 188 * example of this shows up with device interrupts. The dma buffer is allocated 189 * by the cpu requesting the IO thus the pcf_count is decremented based on that. 190 * When the memory is returned by the interrupt thread, the pcf_count will be 191 * incremented based on the cpu servicing the interrupt. 
192 */ 193 static struct pcf pcf[MAX_PCF_FANOUT]; 194 #define PCF_INDEX() ((int)(((long)CPU->cpu_seqid) + \ 195 (randtick() >> 24)) & (pcf_fanout_mask)) 196 197 static int pcf_decrement_bucket(pgcnt_t); 198 static int pcf_decrement_multiple(pgcnt_t *, pgcnt_t, int); 199 200 kmutex_t pcgs_lock; /* serializes page_create_get_ */ 201 kmutex_t pcgs_cagelock; /* serializes NOSLEEP cage allocs */ 202 kmutex_t pcgs_wait_lock; /* used for delay in pcgs */ 203 static kcondvar_t pcgs_cv; /* cv for delay in pcgs */ 204 205 #ifdef VM_STATS 206 207 /* 208 * No locks, but so what, they are only statistics. 209 */ 210 211 static struct page_tcnt { 212 int pc_free_cache; /* free's into cache list */ 213 int pc_free_dontneed; /* free's with dontneed */ 214 int pc_free_pageout; /* free's from pageout */ 215 int pc_free_free; /* free's into free list */ 216 int pc_free_pages; /* free's into large page free list */ 217 int pc_destroy_pages; /* large page destroy's */ 218 int pc_get_cache; /* get's from cache list */ 219 int pc_get_free; /* get's from free list */ 220 int pc_reclaim; /* reclaim's */ 221 int pc_abortfree; /* abort's of free pages */ 222 int pc_find_hit; /* find's that find page */ 223 int pc_find_miss; /* find's that don't find page */ 224 int pc_destroy_free; /* # of free pages destroyed */ 225 #define PC_HASH_CNT (4*PAGE_HASHAVELEN) 226 int pc_find_hashlen[PC_HASH_CNT+1]; 227 int pc_addclaim_pages; 228 int pc_subclaim_pages; 229 int pc_free_replacement_page[2]; 230 int pc_try_demote_pages[6]; 231 int pc_demote_pages[2]; 232 } pagecnt; 233 234 uint_t hashin_count; 235 uint_t hashin_not_held; 236 uint_t hashin_already; 237 238 uint_t hashout_count; 239 uint_t hashout_not_held; 240 241 uint_t page_create_count; 242 uint_t page_create_not_enough; 243 uint_t page_create_not_enough_again; 244 uint_t page_create_zero; 245 uint_t page_create_hashout; 246 uint_t page_create_page_lock_failed; 247 uint_t page_create_trylock_failed; 248 uint_t page_create_found_one; 249 uint_t page_create_hashin_failed; 250 uint_t page_create_dropped_phm; 251 252 uint_t page_create_new; 253 uint_t page_create_exists; 254 uint_t page_create_putbacks; 255 uint_t page_create_overshoot; 256 257 uint_t page_reclaim_zero; 258 uint_t page_reclaim_zero_locked; 259 260 uint_t page_rename_exists; 261 uint_t page_rename_count; 262 263 uint_t page_lookup_cnt[20]; 264 uint_t page_lookup_nowait_cnt[10]; 265 uint_t page_find_cnt; 266 uint_t page_exists_cnt; 267 uint_t page_exists_forreal_cnt; 268 uint_t page_lookup_dev_cnt; 269 uint_t get_cachelist_cnt; 270 uint_t page_create_cnt[10]; 271 uint_t alloc_pages[9]; 272 uint_t page_exphcontg[19]; 273 uint_t page_create_large_cnt[10]; 274 275 /* 276 * Collects statistics. 
277 */ 278 #define PAGE_HASH_SEARCH(index, pp, vp, off) { \ 279 uint_t mylen = 0; \ 280 \ 281 for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash, mylen++) { \ 282 if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \ 283 break; \ 284 } \ 285 if ((pp) != NULL) \ 286 pagecnt.pc_find_hit++; \ 287 else \ 288 pagecnt.pc_find_miss++; \ 289 if (mylen > PC_HASH_CNT) \ 290 mylen = PC_HASH_CNT; \ 291 pagecnt.pc_find_hashlen[mylen]++; \ 292 } 293 294 #else /* VM_STATS */ 295 296 /* 297 * Don't collect statistics 298 */ 299 #define PAGE_HASH_SEARCH(index, pp, vp, off) { \ 300 for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash) { \ 301 if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \ 302 break; \ 303 } \ 304 } 305 306 #endif /* VM_STATS */ 307 308 309 310 #ifdef DEBUG 311 #define MEMSEG_SEARCH_STATS 312 #endif 313 314 #ifdef MEMSEG_SEARCH_STATS 315 struct memseg_stats { 316 uint_t nsearch; 317 uint_t nlastwon; 318 uint_t nhashwon; 319 uint_t nnotfound; 320 } memseg_stats; 321 322 #define MEMSEG_STAT_INCR(v) \ 323 atomic_add_32(&memseg_stats.v, 1) 324 #else 325 #define MEMSEG_STAT_INCR(x) 326 #endif 327 328 struct memseg *memsegs; /* list of memory segments */ 329 330 /* 331 * /etc/system tunable to control large page allocation hueristic. 332 * 333 * Setting to LPAP_LOCAL will heavily prefer the local lgroup over remote lgroup 334 * for large page allocation requests. If a large page is not readily 335 * avaliable on the local freelists we will go through additional effort 336 * to create a large page, potentially moving smaller pages around to coalesce 337 * larger pages in the local lgroup. 338 * Default value of LPAP_DEFAULT will go to remote freelists if large pages 339 * are not readily available in the local lgroup. 340 */ 341 enum lpap { 342 LPAP_DEFAULT, /* default large page allocation policy */ 343 LPAP_LOCAL /* local large page allocation policy */ 344 }; 345 346 enum lpap lpg_alloc_prefer = LPAP_DEFAULT; 347 348 static void page_init_mem_config(void); 349 static int page_do_hashin(page_t *, vnode_t *, u_offset_t); 350 static void page_do_hashout(page_t *); 351 static void page_capture_init(); 352 int page_capture_take_action(page_t *, uint_t, void *); 353 354 static void page_demote_vp_pages(page_t *); 355 356 357 void 358 pcf_init(void) 359 360 { 361 int i; 362 363 if (boot_ncpus != -1) { 364 pcf_fanout = boot_ncpus; 365 } else { 366 pcf_fanout = max_ncpus; 367 } 368 #ifdef sun4v 369 /* 370 * Force at least 4 buckets if possible for sun4v. 371 */ 372 pcf_fanout = MAX(pcf_fanout, 4); 373 #endif /* sun4v */ 374 375 /* 376 * Round up to the nearest power of 2. 377 */ 378 pcf_fanout = MIN(pcf_fanout, MAX_PCF_FANOUT); 379 if (!ISP2(pcf_fanout)) { 380 pcf_fanout = 1 << highbit(pcf_fanout); 381 382 if (pcf_fanout > MAX_PCF_FANOUT) { 383 pcf_fanout = 1 << (highbit(MAX_PCF_FANOUT) - 1); 384 } 385 } 386 pcf_fanout_mask = pcf_fanout - 1; 387 } 388 389 /* 390 * vm subsystem related initialization 391 */ 392 void 393 vm_init(void) 394 { 395 boolean_t callb_vm_cpr(void *, int); 396 397 (void) callb_add(callb_vm_cpr, 0, CB_CL_CPR_VM, "vm"); 398 page_init_mem_config(); 399 page_retire_init(); 400 vm_usage_init(); 401 page_capture_init(); 402 } 403 404 /* 405 * This function is called at startup and when memory is added or deleted. 
406 */ 407 void 408 init_pages_pp_maximum() 409 { 410 static pgcnt_t p_min; 411 static pgcnt_t pages_pp_maximum_startup; 412 static pgcnt_t avrmem_delta; 413 static int init_done; 414 static int user_set; /* true if set in /etc/system */ 415 416 if (init_done == 0) { 417 418 /* If the user specified a value, save it */ 419 if (pages_pp_maximum != 0) { 420 user_set = 1; 421 pages_pp_maximum_startup = pages_pp_maximum; 422 } 423 424 /* 425 * Setting of pages_pp_maximum is based first time 426 * on the value of availrmem just after the start-up 427 * allocations. To preserve this relationship at run 428 * time, use a delta from availrmem_initial. 429 */ 430 ASSERT(availrmem_initial >= availrmem); 431 avrmem_delta = availrmem_initial - availrmem; 432 433 /* The allowable floor of pages_pp_maximum */ 434 p_min = tune.t_minarmem + 100; 435 436 /* Make sure we don't come through here again. */ 437 init_done = 1; 438 } 439 /* 440 * Determine pages_pp_maximum, the number of currently available 441 * pages (availrmem) that can't be `locked'. If not set by 442 * the user, we set it to 4% of the currently available memory 443 * plus 4MB. 444 * But we also insist that it be greater than tune.t_minarmem; 445 * otherwise a process could lock down a lot of memory, get swapped 446 * out, and never have enough to get swapped back in. 447 */ 448 if (user_set) 449 pages_pp_maximum = pages_pp_maximum_startup; 450 else 451 pages_pp_maximum = ((availrmem_initial - avrmem_delta) / 25) 452 + btop(4 * 1024 * 1024); 453 454 if (pages_pp_maximum <= p_min) { 455 pages_pp_maximum = p_min; 456 } 457 } 458 459 void 460 set_max_page_get(pgcnt_t target_total_pages) 461 { 462 max_page_get = target_total_pages / 2; 463 } 464 465 static pgcnt_t pending_delete; 466 467 /*ARGSUSED*/ 468 static void 469 page_mem_config_post_add( 470 void *arg, 471 pgcnt_t delta_pages) 472 { 473 set_max_page_get(total_pages - pending_delete); 474 init_pages_pp_maximum(); 475 } 476 477 /*ARGSUSED*/ 478 static int 479 page_mem_config_pre_del( 480 void *arg, 481 pgcnt_t delta_pages) 482 { 483 pgcnt_t nv; 484 485 nv = atomic_add_long_nv(&pending_delete, (spgcnt_t)delta_pages); 486 set_max_page_get(total_pages - nv); 487 return (0); 488 } 489 490 /*ARGSUSED*/ 491 static void 492 page_mem_config_post_del( 493 void *arg, 494 pgcnt_t delta_pages, 495 int cancelled) 496 { 497 pgcnt_t nv; 498 499 nv = atomic_add_long_nv(&pending_delete, -(spgcnt_t)delta_pages); 500 set_max_page_get(total_pages - nv); 501 if (!cancelled) 502 init_pages_pp_maximum(); 503 } 504 505 static kphysm_setup_vector_t page_mem_config_vec = { 506 KPHYSM_SETUP_VECTOR_VERSION, 507 page_mem_config_post_add, 508 page_mem_config_pre_del, 509 page_mem_config_post_del, 510 }; 511 512 static void 513 page_init_mem_config(void) 514 { 515 int ret; 516 517 ret = kphysm_setup_func_register(&page_mem_config_vec, (void *)NULL); 518 ASSERT(ret == 0); 519 } 520 521 /* 522 * Evenly spread out the PCF counters for large free pages 523 */ 524 static void 525 page_free_large_ctr(pgcnt_t npages) 526 { 527 static struct pcf *p = pcf; 528 pgcnt_t lump; 529 530 freemem += npages; 531 532 lump = roundup(npages, pcf_fanout) / pcf_fanout; 533 534 while (npages > 0) { 535 536 ASSERT(!p->pcf_block); 537 538 if (lump < npages) { 539 p->pcf_count += (uint_t)lump; 540 npages -= lump; 541 } else { 542 p->pcf_count += (uint_t)npages; 543 npages = 0; 544 } 545 546 ASSERT(!p->pcf_wait); 547 548 if (++p > &pcf[pcf_fanout - 1]) 549 p = pcf; 550 } 551 552 ASSERT(npages == 0); 553 } 554 555 /* 556 * Add a physical chunk of 
memory to the system free lists during startup. 557 * Platform specific startup() allocates the memory for the page structs. 558 * 559 * num - number of page structures 560 * base - page number (pfn) to be associated with the first page. 561 * 562 * Since we are doing this during startup (ie. single threaded), we will 563 * use shortcut routines to avoid any locking overhead while putting all 564 * these pages on the freelists. 565 * 566 * NOTE: Any changes performed to page_free(), must also be performed to 567 * add_physmem() since this is how we initialize all page_t's at 568 * boot time. 569 */ 570 void 571 add_physmem( 572 page_t *pp, 573 pgcnt_t num, 574 pfn_t pnum) 575 { 576 page_t *root = NULL; 577 uint_t szc = page_num_pagesizes() - 1; 578 pgcnt_t large = page_get_pagecnt(szc); 579 pgcnt_t cnt = 0; 580 581 TRACE_2(TR_FAC_VM, TR_PAGE_INIT, 582 "add_physmem:pp %p num %lu", pp, num); 583 584 /* 585 * Arbitrarily limit the max page_get request 586 * to 1/2 of the page structs we have. 587 */ 588 total_pages += num; 589 set_max_page_get(total_pages); 590 591 PLCNT_MODIFY_MAX(pnum, (long)num); 592 593 /* 594 * The physical space for the pages array 595 * representing ram pages has already been 596 * allocated. Here we initialize each lock 597 * in the page structure, and put each on 598 * the free list 599 */ 600 for (; num; pp++, pnum++, num--) { 601 602 /* 603 * this needs to fill in the page number 604 * and do any other arch specific initialization 605 */ 606 add_physmem_cb(pp, pnum); 607 608 pp->p_lckcnt = 0; 609 pp->p_cowcnt = 0; 610 pp->p_slckcnt = 0; 611 612 /* 613 * Initialize the page lock as unlocked, since nobody 614 * can see or access this page yet. 615 */ 616 pp->p_selock = 0; 617 618 /* 619 * Initialize IO lock 620 */ 621 page_iolock_init(pp); 622 623 /* 624 * initialize other fields in the page_t 625 */ 626 PP_SETFREE(pp); 627 page_clr_all_props(pp); 628 PP_SETAGED(pp); 629 pp->p_offset = (u_offset_t)-1; 630 pp->p_next = pp; 631 pp->p_prev = pp; 632 633 /* 634 * Simple case: System doesn't support large pages. 635 */ 636 if (szc == 0) { 637 pp->p_szc = 0; 638 page_free_at_startup(pp); 639 continue; 640 } 641 642 /* 643 * Handle unaligned pages, we collect them up onto 644 * the root page until we have a full large page. 645 */ 646 if (!IS_P2ALIGNED(pnum, large)) { 647 648 /* 649 * If not in a large page, 650 * just free as small page. 651 */ 652 if (root == NULL) { 653 pp->p_szc = 0; 654 page_free_at_startup(pp); 655 continue; 656 } 657 658 /* 659 * Link a constituent page into the large page. 660 */ 661 pp->p_szc = szc; 662 page_list_concat(&root, &pp); 663 664 /* 665 * When large page is fully formed, free it. 666 */ 667 if (++cnt == large) { 668 page_free_large_ctr(cnt); 669 page_list_add_pages(root, PG_LIST_ISINIT); 670 root = NULL; 671 cnt = 0; 672 } 673 continue; 674 } 675 676 /* 677 * At this point we have a page number which 678 * is aligned. We assert that we aren't already 679 * in a different large page. 680 */ 681 ASSERT(IS_P2ALIGNED(pnum, large)); 682 ASSERT(root == NULL && cnt == 0); 683 684 /* 685 * If insufficient number of pages left to form 686 * a large page, just free the small page. 687 */ 688 if (num < large) { 689 pp->p_szc = 0; 690 page_free_at_startup(pp); 691 continue; 692 } 693 694 /* 695 * Otherwise start a new large page. 696 */ 697 pp->p_szc = szc; 698 cnt++; 699 root = pp; 700 } 701 ASSERT(root == NULL && cnt == 0); 702 } 703 704 /* 705 * Find a page representing the specified [vp, offset]. 
706 * If we find the page but it is intransit coming in, 707 * it will have an "exclusive" lock and we wait for 708 * the i/o to complete. A page found on the free list 709 * is always reclaimed and then locked. On success, the page 710 * is locked, its data is valid and it isn't on the free 711 * list, while a NULL is returned if the page doesn't exist. 712 */ 713 page_t * 714 page_lookup(vnode_t *vp, u_offset_t off, se_t se) 715 { 716 return (page_lookup_create(vp, off, se, NULL, NULL, 0)); 717 } 718 719 /* 720 * Find a page representing the specified [vp, offset]. 721 * We either return the one we found or, if passed in, 722 * create one with identity of [vp, offset] of the 723 * pre-allocated page. If we find existing page but it is 724 * intransit coming in, it will have an "exclusive" lock 725 * and we wait for the i/o to complete. A page found on 726 * the free list is always reclaimed and then locked. 727 * On success, the page is locked, its data is valid and 728 * it isn't on the free list, while a NULL is returned 729 * if the page doesn't exist and newpp is NULL; 730 */ 731 page_t * 732 page_lookup_create( 733 vnode_t *vp, 734 u_offset_t off, 735 se_t se, 736 page_t *newpp, 737 spgcnt_t *nrelocp, 738 int flags) 739 { 740 page_t *pp; 741 kmutex_t *phm; 742 ulong_t index; 743 uint_t hash_locked; 744 uint_t es; 745 746 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); 747 VM_STAT_ADD(page_lookup_cnt[0]); 748 ASSERT(newpp ? PAGE_EXCL(newpp) : 1); 749 750 /* 751 * Acquire the appropriate page hash lock since 752 * we have to search the hash list. Pages that 753 * hash to this list can't change identity while 754 * this lock is held. 755 */ 756 hash_locked = 0; 757 index = PAGE_HASH_FUNC(vp, off); 758 phm = NULL; 759 top: 760 PAGE_HASH_SEARCH(index, pp, vp, off); 761 if (pp != NULL) { 762 VM_STAT_ADD(page_lookup_cnt[1]); 763 es = (newpp != NULL) ? 1 : 0; 764 es |= flags; 765 if (!hash_locked) { 766 VM_STAT_ADD(page_lookup_cnt[2]); 767 if (!page_try_reclaim_lock(pp, se, es)) { 768 /* 769 * On a miss, acquire the phm. Then 770 * next time, page_lock() will be called, 771 * causing a wait if the page is busy. 772 * just looping with page_trylock() would 773 * get pretty boring. 774 */ 775 VM_STAT_ADD(page_lookup_cnt[3]); 776 phm = PAGE_HASH_MUTEX(index); 777 mutex_enter(phm); 778 hash_locked = 1; 779 goto top; 780 } 781 } else { 782 VM_STAT_ADD(page_lookup_cnt[4]); 783 if (!page_lock_es(pp, se, phm, P_RECLAIM, es)) { 784 VM_STAT_ADD(page_lookup_cnt[5]); 785 goto top; 786 } 787 } 788 789 /* 790 * Since `pp' is locked it can not change identity now. 791 * Reconfirm we locked the correct page. 792 * 793 * Both the p_vnode and p_offset *must* be cast volatile 794 * to force a reload of their values: The PAGE_HASH_SEARCH 795 * macro will have stuffed p_vnode and p_offset into 796 * registers before calling page_trylock(); another thread, 797 * actually holding the hash lock, could have changed the 798 * page's identity in memory, but our registers would not 799 * be changed, fooling the reconfirmation. If the hash 800 * lock was held during the search, the casting would 801 * not be needed. 
802 */ 803 VM_STAT_ADD(page_lookup_cnt[6]); 804 if (((volatile struct vnode *)(pp->p_vnode) != vp) || 805 ((volatile u_offset_t)(pp->p_offset) != off)) { 806 VM_STAT_ADD(page_lookup_cnt[7]); 807 if (hash_locked) { 808 panic("page_lookup_create: lost page %p", 809 (void *)pp); 810 /*NOTREACHED*/ 811 } 812 page_unlock(pp); 813 phm = PAGE_HASH_MUTEX(index); 814 mutex_enter(phm); 815 hash_locked = 1; 816 goto top; 817 } 818 819 /* 820 * If page_trylock() was called, then pp may still be on 821 * the cachelist (can't be on the free list, it would not 822 * have been found in the search). If it is on the 823 * cachelist it must be pulled now. To pull the page from 824 * the cachelist, it must be exclusively locked. 825 * 826 * The other big difference between page_trylock() and 827 * page_lock(), is that page_lock() will pull the 828 * page from whatever free list (the cache list in this 829 * case) the page is on. If page_trylock() was used 830 * above, then we have to do the reclaim ourselves. 831 */ 832 if ((!hash_locked) && (PP_ISFREE(pp))) { 833 ASSERT(PP_ISAGED(pp) == 0); 834 VM_STAT_ADD(page_lookup_cnt[8]); 835 836 /* 837 * page_relcaim will insure that we 838 * have this page exclusively 839 */ 840 841 if (!page_reclaim(pp, NULL)) { 842 /* 843 * Page_reclaim dropped whatever lock 844 * we held. 845 */ 846 VM_STAT_ADD(page_lookup_cnt[9]); 847 phm = PAGE_HASH_MUTEX(index); 848 mutex_enter(phm); 849 hash_locked = 1; 850 goto top; 851 } else if (se == SE_SHARED && newpp == NULL) { 852 VM_STAT_ADD(page_lookup_cnt[10]); 853 page_downgrade(pp); 854 } 855 } 856 857 if (hash_locked) { 858 mutex_exit(phm); 859 } 860 861 if (newpp != NULL && pp->p_szc < newpp->p_szc && 862 PAGE_EXCL(pp) && nrelocp != NULL) { 863 ASSERT(nrelocp != NULL); 864 (void) page_relocate(&pp, &newpp, 1, 1, nrelocp, 865 NULL); 866 if (*nrelocp > 0) { 867 VM_STAT_COND_ADD(*nrelocp == 1, 868 page_lookup_cnt[11]); 869 VM_STAT_COND_ADD(*nrelocp > 1, 870 page_lookup_cnt[12]); 871 pp = newpp; 872 se = SE_EXCL; 873 } else { 874 if (se == SE_SHARED) { 875 page_downgrade(pp); 876 } 877 VM_STAT_ADD(page_lookup_cnt[13]); 878 } 879 } else if (newpp != NULL && nrelocp != NULL) { 880 if (PAGE_EXCL(pp) && se == SE_SHARED) { 881 page_downgrade(pp); 882 } 883 VM_STAT_COND_ADD(pp->p_szc < newpp->p_szc, 884 page_lookup_cnt[14]); 885 VM_STAT_COND_ADD(pp->p_szc == newpp->p_szc, 886 page_lookup_cnt[15]); 887 VM_STAT_COND_ADD(pp->p_szc > newpp->p_szc, 888 page_lookup_cnt[16]); 889 } else if (newpp != NULL && PAGE_EXCL(pp)) { 890 se = SE_EXCL; 891 } 892 } else if (!hash_locked) { 893 VM_STAT_ADD(page_lookup_cnt[17]); 894 phm = PAGE_HASH_MUTEX(index); 895 mutex_enter(phm); 896 hash_locked = 1; 897 goto top; 898 } else if (newpp != NULL) { 899 /* 900 * If we have a preallocated page then 901 * insert it now and basically behave like 902 * page_create. 903 */ 904 VM_STAT_ADD(page_lookup_cnt[18]); 905 /* 906 * Since we hold the page hash mutex and 907 * just searched for this page, page_hashin 908 * had better not fail. If it does, that 909 * means some thread did not follow the 910 * page hash mutex rules. Panic now and 911 * get it over with. As usual, go down 912 * holding all the locks. 
913 */ 914 ASSERT(MUTEX_HELD(phm)); 915 if (!page_hashin(newpp, vp, off, phm)) { 916 ASSERT(MUTEX_HELD(phm)); 917 panic("page_lookup_create: hashin failed %p %p %llx %p", 918 (void *)newpp, (void *)vp, off, (void *)phm); 919 /*NOTREACHED*/ 920 } 921 ASSERT(MUTEX_HELD(phm)); 922 mutex_exit(phm); 923 phm = NULL; 924 page_set_props(newpp, P_REF); 925 page_io_lock(newpp); 926 pp = newpp; 927 se = SE_EXCL; 928 } else { 929 VM_STAT_ADD(page_lookup_cnt[19]); 930 mutex_exit(phm); 931 } 932 933 ASSERT(pp ? PAGE_LOCKED_SE(pp, se) : 1); 934 935 ASSERT(pp ? ((PP_ISFREE(pp) == 0) && (PP_ISAGED(pp) == 0)) : 1); 936 937 return (pp); 938 } 939 940 /* 941 * Search the hash list for the page representing the 942 * specified [vp, offset] and return it locked. Skip 943 * free pages and pages that cannot be locked as requested. 944 * Used while attempting to kluster pages. 945 */ 946 page_t * 947 page_lookup_nowait(vnode_t *vp, u_offset_t off, se_t se) 948 { 949 page_t *pp; 950 kmutex_t *phm; 951 ulong_t index; 952 uint_t locked; 953 954 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); 955 VM_STAT_ADD(page_lookup_nowait_cnt[0]); 956 957 index = PAGE_HASH_FUNC(vp, off); 958 PAGE_HASH_SEARCH(index, pp, vp, off); 959 locked = 0; 960 if (pp == NULL) { 961 top: 962 VM_STAT_ADD(page_lookup_nowait_cnt[1]); 963 locked = 1; 964 phm = PAGE_HASH_MUTEX(index); 965 mutex_enter(phm); 966 PAGE_HASH_SEARCH(index, pp, vp, off); 967 } 968 969 if (pp == NULL || PP_ISFREE(pp)) { 970 VM_STAT_ADD(page_lookup_nowait_cnt[2]); 971 pp = NULL; 972 } else { 973 if (!page_trylock(pp, se)) { 974 VM_STAT_ADD(page_lookup_nowait_cnt[3]); 975 pp = NULL; 976 } else { 977 VM_STAT_ADD(page_lookup_nowait_cnt[4]); 978 /* 979 * See the comment in page_lookup() 980 */ 981 if (((volatile struct vnode *)(pp->p_vnode) != vp) || 982 ((u_offset_t)(pp->p_offset) != off)) { 983 VM_STAT_ADD(page_lookup_nowait_cnt[5]); 984 if (locked) { 985 panic("page_lookup_nowait %p", 986 (void *)pp); 987 /*NOTREACHED*/ 988 } 989 page_unlock(pp); 990 goto top; 991 } 992 if (PP_ISFREE(pp)) { 993 VM_STAT_ADD(page_lookup_nowait_cnt[6]); 994 page_unlock(pp); 995 pp = NULL; 996 } 997 } 998 } 999 if (locked) { 1000 VM_STAT_ADD(page_lookup_nowait_cnt[7]); 1001 mutex_exit(phm); 1002 } 1003 1004 ASSERT(pp ? PAGE_LOCKED_SE(pp, se) : 1); 1005 1006 return (pp); 1007 } 1008 1009 /* 1010 * Search the hash list for a page with the specified [vp, off] 1011 * that is known to exist and is already locked. This routine 1012 * is typically used by segment SOFTUNLOCK routines. 1013 */ 1014 page_t * 1015 page_find(vnode_t *vp, u_offset_t off) 1016 { 1017 page_t *pp; 1018 kmutex_t *phm; 1019 ulong_t index; 1020 1021 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); 1022 VM_STAT_ADD(page_find_cnt); 1023 1024 index = PAGE_HASH_FUNC(vp, off); 1025 phm = PAGE_HASH_MUTEX(index); 1026 1027 mutex_enter(phm); 1028 PAGE_HASH_SEARCH(index, pp, vp, off); 1029 mutex_exit(phm); 1030 1031 ASSERT(pp == NULL || PAGE_LOCKED(pp) || panicstr); 1032 return (pp); 1033 } 1034 1035 /* 1036 * Determine whether a page with the specified [vp, off] 1037 * currently exists in the system. Obviously this should 1038 * only be considered as a hint since nothing prevents the 1039 * page from disappearing or appearing immediately after 1040 * the return from this routine. Subsequently, we don't 1041 * even bother to lock the list. 
1042 */ 1043 page_t * 1044 page_exists(vnode_t *vp, u_offset_t off) 1045 { 1046 page_t *pp; 1047 ulong_t index; 1048 1049 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); 1050 VM_STAT_ADD(page_exists_cnt); 1051 1052 index = PAGE_HASH_FUNC(vp, off); 1053 PAGE_HASH_SEARCH(index, pp, vp, off); 1054 1055 return (pp); 1056 } 1057 1058 /* 1059 * Determine if physically contiguous pages exist for [vp, off] - [vp, off + 1060 * page_size(szc)) range. if they exist and ppa is not NULL fill ppa array 1061 * with these pages locked SHARED. If necessary reclaim pages from 1062 * freelist. Return 1 if contiguous pages exist and 0 otherwise. 1063 * 1064 * If we fail to lock pages still return 1 if pages exist and contiguous. 1065 * But in this case return value is just a hint. ppa array won't be filled. 1066 * Caller should initialize ppa[0] as NULL to distinguish return value. 1067 * 1068 * Returns 0 if pages don't exist or not physically contiguous. 1069 * 1070 * This routine doesn't work for anonymous(swapfs) pages. 1071 */ 1072 int 1073 page_exists_physcontig(vnode_t *vp, u_offset_t off, uint_t szc, page_t *ppa[]) 1074 { 1075 pgcnt_t pages; 1076 pfn_t pfn; 1077 page_t *rootpp; 1078 pgcnt_t i; 1079 pgcnt_t j; 1080 u_offset_t save_off = off; 1081 ulong_t index; 1082 kmutex_t *phm; 1083 page_t *pp; 1084 uint_t pszc; 1085 int loopcnt = 0; 1086 1087 ASSERT(szc != 0); 1088 ASSERT(vp != NULL); 1089 ASSERT(!IS_SWAPFSVP(vp)); 1090 ASSERT(!VN_ISKAS(vp)); 1091 1092 again: 1093 if (++loopcnt > 3) { 1094 VM_STAT_ADD(page_exphcontg[0]); 1095 return (0); 1096 } 1097 1098 index = PAGE_HASH_FUNC(vp, off); 1099 phm = PAGE_HASH_MUTEX(index); 1100 1101 mutex_enter(phm); 1102 PAGE_HASH_SEARCH(index, pp, vp, off); 1103 mutex_exit(phm); 1104 1105 VM_STAT_ADD(page_exphcontg[1]); 1106 1107 if (pp == NULL) { 1108 VM_STAT_ADD(page_exphcontg[2]); 1109 return (0); 1110 } 1111 1112 pages = page_get_pagecnt(szc); 1113 rootpp = pp; 1114 pfn = rootpp->p_pagenum; 1115 1116 if ((pszc = pp->p_szc) >= szc && ppa != NULL) { 1117 VM_STAT_ADD(page_exphcontg[3]); 1118 if (!page_trylock(pp, SE_SHARED)) { 1119 VM_STAT_ADD(page_exphcontg[4]); 1120 return (1); 1121 } 1122 if (pp->p_szc != pszc || pp->p_vnode != vp || 1123 pp->p_offset != off) { 1124 VM_STAT_ADD(page_exphcontg[5]); 1125 page_unlock(pp); 1126 off = save_off; 1127 goto again; 1128 } 1129 /* 1130 * szc was non zero and vnode and offset matched after we 1131 * locked the page it means it can't become free on us. 1132 */ 1133 ASSERT(!PP_ISFREE(pp)); 1134 if (!IS_P2ALIGNED(pfn, pages)) { 1135 page_unlock(pp); 1136 return (0); 1137 } 1138 ppa[0] = pp; 1139 pp++; 1140 off += PAGESIZE; 1141 pfn++; 1142 for (i = 1; i < pages; i++, pp++, off += PAGESIZE, pfn++) { 1143 if (!page_trylock(pp, SE_SHARED)) { 1144 VM_STAT_ADD(page_exphcontg[6]); 1145 pp--; 1146 while (i-- > 0) { 1147 page_unlock(pp); 1148 pp--; 1149 } 1150 ppa[0] = NULL; 1151 return (1); 1152 } 1153 if (pp->p_szc != pszc) { 1154 VM_STAT_ADD(page_exphcontg[7]); 1155 page_unlock(pp); 1156 pp--; 1157 while (i-- > 0) { 1158 page_unlock(pp); 1159 pp--; 1160 } 1161 ppa[0] = NULL; 1162 off = save_off; 1163 goto again; 1164 } 1165 /* 1166 * szc the same as for previous already locked pages 1167 * with right identity. Since this page had correct 1168 * szc after we locked it can't get freed or destroyed 1169 * and therefore must have the expected identity. 
1170 */ 1171 ASSERT(!PP_ISFREE(pp)); 1172 if (pp->p_vnode != vp || 1173 pp->p_offset != off) { 1174 panic("page_exists_physcontig: " 1175 "large page identity doesn't match"); 1176 } 1177 ppa[i] = pp; 1178 ASSERT(pp->p_pagenum == pfn); 1179 } 1180 VM_STAT_ADD(page_exphcontg[8]); 1181 ppa[pages] = NULL; 1182 return (1); 1183 } else if (pszc >= szc) { 1184 VM_STAT_ADD(page_exphcontg[9]); 1185 if (!IS_P2ALIGNED(pfn, pages)) { 1186 return (0); 1187 } 1188 return (1); 1189 } 1190 1191 if (!IS_P2ALIGNED(pfn, pages)) { 1192 VM_STAT_ADD(page_exphcontg[10]); 1193 return (0); 1194 } 1195 1196 if (page_numtomemseg_nolock(pfn) != 1197 page_numtomemseg_nolock(pfn + pages - 1)) { 1198 VM_STAT_ADD(page_exphcontg[11]); 1199 return (0); 1200 } 1201 1202 /* 1203 * We loop up 4 times across pages to promote page size. 1204 * We're extra cautious to promote page size atomically with respect 1205 * to everybody else. But we can probably optimize into 1 loop if 1206 * this becomes an issue. 1207 */ 1208 1209 for (i = 0; i < pages; i++, pp++, off += PAGESIZE, pfn++) { 1210 ASSERT(pp->p_pagenum == pfn); 1211 if (!page_trylock(pp, SE_EXCL)) { 1212 VM_STAT_ADD(page_exphcontg[12]); 1213 break; 1214 } 1215 if (pp->p_vnode != vp || 1216 pp->p_offset != off) { 1217 VM_STAT_ADD(page_exphcontg[13]); 1218 page_unlock(pp); 1219 break; 1220 } 1221 if (pp->p_szc >= szc) { 1222 ASSERT(i == 0); 1223 page_unlock(pp); 1224 off = save_off; 1225 goto again; 1226 } 1227 } 1228 1229 if (i != pages) { 1230 VM_STAT_ADD(page_exphcontg[14]); 1231 --pp; 1232 while (i-- > 0) { 1233 page_unlock(pp); 1234 --pp; 1235 } 1236 return (0); 1237 } 1238 1239 pp = rootpp; 1240 for (i = 0; i < pages; i++, pp++) { 1241 if (PP_ISFREE(pp)) { 1242 VM_STAT_ADD(page_exphcontg[15]); 1243 ASSERT(!PP_ISAGED(pp)); 1244 ASSERT(pp->p_szc == 0); 1245 if (!page_reclaim(pp, NULL)) { 1246 break; 1247 } 1248 } else { 1249 ASSERT(pp->p_szc < szc); 1250 VM_STAT_ADD(page_exphcontg[16]); 1251 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 1252 } 1253 } 1254 if (i < pages) { 1255 VM_STAT_ADD(page_exphcontg[17]); 1256 /* 1257 * page_reclaim failed because we were out of memory. 1258 * drop the rest of the locks and return because this page 1259 * must be already reallocated anyway. 1260 */ 1261 pp = rootpp; 1262 for (j = 0; j < pages; j++, pp++) { 1263 if (j != i) { 1264 page_unlock(pp); 1265 } 1266 } 1267 return (0); 1268 } 1269 1270 off = save_off; 1271 pp = rootpp; 1272 for (i = 0; i < pages; i++, pp++, off += PAGESIZE) { 1273 ASSERT(PAGE_EXCL(pp)); 1274 ASSERT(!PP_ISFREE(pp)); 1275 ASSERT(!hat_page_is_mapped(pp)); 1276 ASSERT(pp->p_vnode == vp); 1277 ASSERT(pp->p_offset == off); 1278 pp->p_szc = szc; 1279 } 1280 pp = rootpp; 1281 for (i = 0; i < pages; i++, pp++) { 1282 if (ppa == NULL) { 1283 page_unlock(pp); 1284 } else { 1285 ppa[i] = pp; 1286 page_downgrade(ppa[i]); 1287 } 1288 } 1289 if (ppa != NULL) { 1290 ppa[pages] = NULL; 1291 } 1292 VM_STAT_ADD(page_exphcontg[18]); 1293 ASSERT(vp->v_pages != NULL); 1294 return (1); 1295 } 1296 1297 /* 1298 * Determine whether a page with the specified [vp, off] 1299 * currently exists in the system and if so return its 1300 * size code. Obviously this should only be considered as 1301 * a hint since nothing prevents the page from disappearing 1302 * or appearing immediately after the return from this routine. 
1303 */ 1304 int 1305 page_exists_forreal(vnode_t *vp, u_offset_t off, uint_t *szc) 1306 { 1307 page_t *pp; 1308 kmutex_t *phm; 1309 ulong_t index; 1310 int rc = 0; 1311 1312 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); 1313 ASSERT(szc != NULL); 1314 VM_STAT_ADD(page_exists_forreal_cnt); 1315 1316 index = PAGE_HASH_FUNC(vp, off); 1317 phm = PAGE_HASH_MUTEX(index); 1318 1319 mutex_enter(phm); 1320 PAGE_HASH_SEARCH(index, pp, vp, off); 1321 if (pp != NULL) { 1322 *szc = pp->p_szc; 1323 rc = 1; 1324 } 1325 mutex_exit(phm); 1326 return (rc); 1327 } 1328 1329 /* wakeup threads waiting for pages in page_create_get_something() */ 1330 void 1331 wakeup_pcgs(void) 1332 { 1333 if (!CV_HAS_WAITERS(&pcgs_cv)) 1334 return; 1335 cv_broadcast(&pcgs_cv); 1336 } 1337 1338 /* 1339 * 'freemem' is used all over the kernel as an indication of how many 1340 * pages are free (either on the cache list or on the free page list) 1341 * in the system. In very few places is a really accurate 'freemem' 1342 * needed. To avoid contention of the lock protecting a the 1343 * single freemem, it was spread out into NCPU buckets. Set_freemem 1344 * sets freemem to the total of all NCPU buckets. It is called from 1345 * clock() on each TICK. 1346 */ 1347 void 1348 set_freemem() 1349 { 1350 struct pcf *p; 1351 ulong_t t; 1352 uint_t i; 1353 1354 t = 0; 1355 p = pcf; 1356 for (i = 0; i < pcf_fanout; i++) { 1357 t += p->pcf_count; 1358 p++; 1359 } 1360 freemem = t; 1361 1362 /* 1363 * Don't worry about grabbing mutex. It's not that 1364 * critical if we miss a tick or two. This is 1365 * where we wakeup possible delayers in 1366 * page_create_get_something(). 1367 */ 1368 wakeup_pcgs(); 1369 } 1370 1371 ulong_t 1372 get_freemem() 1373 { 1374 struct pcf *p; 1375 ulong_t t; 1376 uint_t i; 1377 1378 t = 0; 1379 p = pcf; 1380 for (i = 0; i < pcf_fanout; i++) { 1381 t += p->pcf_count; 1382 p++; 1383 } 1384 /* 1385 * We just calculated it, might as well set it. 1386 */ 1387 freemem = t; 1388 return (t); 1389 } 1390 1391 /* 1392 * Acquire all of the page cache & free (pcf) locks. 1393 */ 1394 void 1395 pcf_acquire_all() 1396 { 1397 struct pcf *p; 1398 uint_t i; 1399 1400 p = pcf; 1401 for (i = 0; i < pcf_fanout; i++) { 1402 mutex_enter(&p->pcf_lock); 1403 p++; 1404 } 1405 } 1406 1407 /* 1408 * Release all the pcf_locks. 1409 */ 1410 void 1411 pcf_release_all() 1412 { 1413 struct pcf *p; 1414 uint_t i; 1415 1416 p = pcf; 1417 for (i = 0; i < pcf_fanout; i++) { 1418 mutex_exit(&p->pcf_lock); 1419 p++; 1420 } 1421 } 1422 1423 /* 1424 * Inform the VM system that we need some pages freed up. 1425 * Calls must be symmetric, e.g.: 1426 * 1427 * page_needfree(100); 1428 * wait a bit; 1429 * page_needfree(-100); 1430 */ 1431 void 1432 page_needfree(spgcnt_t npages) 1433 { 1434 mutex_enter(&new_freemem_lock); 1435 needfree += npages; 1436 mutex_exit(&new_freemem_lock); 1437 } 1438 1439 /* 1440 * Throttle for page_create(): try to prevent freemem from dropping 1441 * below throttlefree. We can't provide a 100% guarantee because 1442 * KM_NOSLEEP allocations, page_reclaim(), and various other things 1443 * nibble away at the freelist. However, we can block all PG_WAIT 1444 * allocations until memory becomes available. The motivation is 1445 * that several things can fall apart when there's no free memory: 1446 * 1447 * (1) If pageout() needs memory to push a page, the system deadlocks. 
1448 * 1449 * (2) By (broken) specification, timeout(9F) can neither fail nor 1450 * block, so it has no choice but to panic the system if it 1451 * cannot allocate a callout structure. 1452 * 1453 * (3) Like timeout(), ddi_set_callback() cannot fail and cannot block; 1454 * it panics if it cannot allocate a callback structure. 1455 * 1456 * (4) Untold numbers of third-party drivers have not yet been hardened 1457 * against KM_NOSLEEP and/or allocb() failures; they simply assume 1458 * success and panic the system with a data fault on failure. 1459 * (The long-term solution to this particular problem is to ship 1460 * hostile fault-injecting DEBUG kernels with the DDK.) 1461 * 1462 * It is theoretically impossible to guarantee success of non-blocking 1463 * allocations, but in practice, this throttle is very hard to break. 1464 */ 1465 static int 1466 page_create_throttle(pgcnt_t npages, int flags) 1467 { 1468 ulong_t fm; 1469 uint_t i; 1470 pgcnt_t tf; /* effective value of throttlefree */ 1471 1472 /* 1473 * Never deny pages when: 1474 * - it's a thread that cannot block [NOMEMWAIT()] 1475 * - the allocation cannot block and must not fail 1476 * - the allocation cannot block and is pageout dispensated 1477 */ 1478 if (NOMEMWAIT() || 1479 ((flags & (PG_WAIT | PG_PANIC)) == PG_PANIC) || 1480 ((flags & (PG_WAIT | PG_PUSHPAGE)) == PG_PUSHPAGE)) 1481 return (1); 1482 1483 /* 1484 * If the allocation can't block, we look favorably upon it 1485 * unless we're below pageout_reserve. In that case we fail 1486 * the allocation because we want to make sure there are a few 1487 * pages available for pageout. 1488 */ 1489 if ((flags & PG_WAIT) == 0) 1490 return (freemem >= npages + pageout_reserve); 1491 1492 /* Calculate the effective throttlefree value */ 1493 tf = throttlefree - 1494 ((flags & PG_PUSHPAGE) ? pageout_reserve : 0); 1495 1496 cv_signal(&proc_pageout->p_cv); 1497 1498 for (;;) { 1499 fm = 0; 1500 pcf_acquire_all(); 1501 mutex_enter(&new_freemem_lock); 1502 for (i = 0; i < pcf_fanout; i++) { 1503 fm += pcf[i].pcf_count; 1504 pcf[i].pcf_wait++; 1505 mutex_exit(&pcf[i].pcf_lock); 1506 } 1507 freemem = fm; 1508 if (freemem >= npages + tf) { 1509 mutex_exit(&new_freemem_lock); 1510 break; 1511 } 1512 needfree += npages; 1513 freemem_wait++; 1514 cv_wait(&freemem_cv, &new_freemem_lock); 1515 freemem_wait--; 1516 needfree -= npages; 1517 mutex_exit(&new_freemem_lock); 1518 } 1519 return (1); 1520 } 1521 1522 /* 1523 * page_create_wait() is called to either coalesce pages from the 1524 * different pcf buckets or to wait because there simply are not 1525 * enough pages to satisfy the caller's request. 1526 * 1527 * Sadly, this is called from platform/vm/vm_machdep.c 1528 */ 1529 int 1530 page_create_wait(pgcnt_t npages, uint_t flags) 1531 { 1532 pgcnt_t total; 1533 uint_t i; 1534 struct pcf *p; 1535 1536 /* 1537 * Wait until there are enough free pages to satisfy our 1538 * entire request. 1539 * We set needfree += npages before prodding pageout, to make sure 1540 * it does real work when npages > lotsfree > freemem. 1541 */ 1542 VM_STAT_ADD(page_create_not_enough); 1543 1544 ASSERT(!kcage_on ? 
!(flags & PG_NORELOC) : 1); 1545 checkagain: 1546 if ((flags & PG_NORELOC) && 1547 kcage_freemem < kcage_throttlefree + npages) 1548 (void) kcage_create_throttle(npages, flags); 1549 1550 if (freemem < npages + throttlefree) 1551 if (!page_create_throttle(npages, flags)) 1552 return (0); 1553 1554 if (pcf_decrement_bucket(npages) || 1555 pcf_decrement_multiple(&total, npages, 0)) 1556 return (1); 1557 1558 /* 1559 * All of the pcf locks are held, there are not enough pages 1560 * to satisfy the request (npages < total). 1561 * Be sure to acquire the new_freemem_lock before dropping 1562 * the pcf locks. This prevents dropping wakeups in page_free(). 1563 * The order is always pcf_lock then new_freemem_lock. 1564 * 1565 * Since we hold all the pcf locks, it is a good time to set freemem. 1566 * 1567 * If the caller does not want to wait, return now. 1568 * Else turn the pageout daemon loose to find something 1569 * and wait till it does. 1570 * 1571 */ 1572 freemem = total; 1573 1574 if ((flags & PG_WAIT) == 0) { 1575 pcf_release_all(); 1576 1577 TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_NOMEM, 1578 "page_create_nomem:npages %ld freemem %ld", npages, freemem); 1579 return (0); 1580 } 1581 1582 ASSERT(proc_pageout != NULL); 1583 cv_signal(&proc_pageout->p_cv); 1584 1585 TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SLEEP_START, 1586 "page_create_sleep_start: freemem %ld needfree %ld", 1587 freemem, needfree); 1588 1589 /* 1590 * We are going to wait. 1591 * We currently hold all of the pcf_locks, 1592 * get the new_freemem_lock (it protects freemem_wait), 1593 * before dropping the pcf_locks. 1594 */ 1595 mutex_enter(&new_freemem_lock); 1596 1597 p = pcf; 1598 for (i = 0; i < pcf_fanout; i++) { 1599 p->pcf_wait++; 1600 mutex_exit(&p->pcf_lock); 1601 p++; 1602 } 1603 1604 needfree += npages; 1605 freemem_wait++; 1606 1607 cv_wait(&freemem_cv, &new_freemem_lock); 1608 1609 freemem_wait--; 1610 needfree -= npages; 1611 1612 mutex_exit(&new_freemem_lock); 1613 1614 TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SLEEP_END, 1615 "page_create_sleep_end: freemem %ld needfree %ld", 1616 freemem, needfree); 1617 1618 VM_STAT_ADD(page_create_not_enough_again); 1619 goto checkagain; 1620 } 1621 /* 1622 * A routine to do the opposite of page_create_wait(). 1623 */ 1624 void 1625 page_create_putback(spgcnt_t npages) 1626 { 1627 struct pcf *p; 1628 pgcnt_t lump; 1629 uint_t *which; 1630 1631 /* 1632 * When a contiguous lump is broken up, we have to 1633 * deal with lots of pages (min 64) so lets spread 1634 * the wealth around. 1635 */ 1636 lump = roundup(npages, pcf_fanout) / pcf_fanout; 1637 freemem += npages; 1638 1639 for (p = pcf; (npages > 0) && (p < &pcf[pcf_fanout]); p++) { 1640 which = &p->pcf_count; 1641 1642 mutex_enter(&p->pcf_lock); 1643 1644 if (p->pcf_block) { 1645 which = &p->pcf_reserve; 1646 } 1647 1648 if (lump < npages) { 1649 *which += (uint_t)lump; 1650 npages -= lump; 1651 } else { 1652 *which += (uint_t)npages; 1653 npages = 0; 1654 } 1655 1656 if (p->pcf_wait) { 1657 mutex_enter(&new_freemem_lock); 1658 /* 1659 * Check to see if some other thread 1660 * is actually waiting. Another bucket 1661 * may have woken it up by now. If there 1662 * are no waiters, then set our pcf_wait 1663 * count to zero to avoid coming in here 1664 * next time. 
1665 */ 1666 if (freemem_wait) { 1667 if (npages > 1) { 1668 cv_broadcast(&freemem_cv); 1669 } else { 1670 cv_signal(&freemem_cv); 1671 } 1672 p->pcf_wait--; 1673 } else { 1674 p->pcf_wait = 0; 1675 } 1676 mutex_exit(&new_freemem_lock); 1677 } 1678 mutex_exit(&p->pcf_lock); 1679 } 1680 ASSERT(npages == 0); 1681 } 1682 1683 /* 1684 * A helper routine for page_create_get_something. 1685 * The indenting got to deep down there. 1686 * Unblock the pcf counters. Any pages freed after 1687 * pcf_block got set are moved to pcf_count and 1688 * wakeups (cv_broadcast() or cv_signal()) are done as needed. 1689 */ 1690 static void 1691 pcgs_unblock(void) 1692 { 1693 int i; 1694 struct pcf *p; 1695 1696 /* Update freemem while we're here. */ 1697 freemem = 0; 1698 p = pcf; 1699 for (i = 0; i < pcf_fanout; i++) { 1700 mutex_enter(&p->pcf_lock); 1701 ASSERT(p->pcf_count == 0); 1702 p->pcf_count = p->pcf_reserve; 1703 p->pcf_block = 0; 1704 freemem += p->pcf_count; 1705 if (p->pcf_wait) { 1706 mutex_enter(&new_freemem_lock); 1707 if (freemem_wait) { 1708 if (p->pcf_reserve > 1) { 1709 cv_broadcast(&freemem_cv); 1710 p->pcf_wait = 0; 1711 } else { 1712 cv_signal(&freemem_cv); 1713 p->pcf_wait--; 1714 } 1715 } else { 1716 p->pcf_wait = 0; 1717 } 1718 mutex_exit(&new_freemem_lock); 1719 } 1720 p->pcf_reserve = 0; 1721 mutex_exit(&p->pcf_lock); 1722 p++; 1723 } 1724 } 1725 1726 /* 1727 * Called from page_create_va() when both the cache and free lists 1728 * have been checked once. 1729 * 1730 * Either returns a page or panics since the accounting was done 1731 * way before we got here. 1732 * 1733 * We don't come here often, so leave the accounting on permanently. 1734 */ 1735 1736 #define MAX_PCGS 100 1737 1738 #ifdef DEBUG 1739 #define PCGS_TRIES 100 1740 #else /* DEBUG */ 1741 #define PCGS_TRIES 10 1742 #endif /* DEBUG */ 1743 1744 #ifdef VM_STATS 1745 uint_t pcgs_counts[PCGS_TRIES]; 1746 uint_t pcgs_too_many; 1747 uint_t pcgs_entered; 1748 uint_t pcgs_entered_noreloc; 1749 uint_t pcgs_locked; 1750 uint_t pcgs_cagelocked; 1751 #endif /* VM_STATS */ 1752 1753 static page_t * 1754 page_create_get_something(vnode_t *vp, u_offset_t off, struct seg *seg, 1755 caddr_t vaddr, uint_t flags) 1756 { 1757 uint_t count; 1758 page_t *pp; 1759 uint_t locked, i; 1760 struct pcf *p; 1761 lgrp_t *lgrp; 1762 int cagelocked = 0; 1763 1764 VM_STAT_ADD(pcgs_entered); 1765 1766 /* 1767 * Tap any reserve freelists: if we fail now, we'll die 1768 * since the page(s) we're looking for have already been 1769 * accounted for. 1770 */ 1771 flags |= PG_PANIC; 1772 1773 if ((flags & PG_NORELOC) != 0) { 1774 VM_STAT_ADD(pcgs_entered_noreloc); 1775 /* 1776 * Requests for free pages from critical threads 1777 * such as pageout still won't throttle here, but 1778 * we must try again, to give the cageout thread 1779 * another chance to catch up. Since we already 1780 * accounted for the pages, we had better get them 1781 * this time. 1782 * 1783 * N.B. All non-critical threads acquire the pcgs_cagelock 1784 * to serialize access to the freelists. This implements a 1785 * turnstile-type synchornization to avoid starvation of 1786 * critical requests for PG_NORELOC memory by non-critical 1787 * threads: all non-critical threads must acquire a 'ticket' 1788 * before passing through, which entails making sure 1789 * kcage_freemem won't fall below minfree prior to grabbing 1790 * pages from the freelists. 
1791 */ 1792 if (kcage_create_throttle(1, flags) == KCT_NONCRIT) { 1793 mutex_enter(&pcgs_cagelock); 1794 cagelocked = 1; 1795 VM_STAT_ADD(pcgs_cagelocked); 1796 } 1797 } 1798 1799 /* 1800 * Time to get serious. 1801 * We failed to get a `correctly colored' page from both the 1802 * free and cache lists. 1803 * We escalate in stage. 1804 * 1805 * First try both lists without worring about color. 1806 * 1807 * Then, grab all page accounting locks (ie. pcf[]) and 1808 * steal any pages that they have and set the pcf_block flag to 1809 * stop deletions from the lists. This will help because 1810 * a page can get added to the free list while we are looking 1811 * at the cache list, then another page could be added to the cache 1812 * list allowing the page on the free list to be removed as we 1813 * move from looking at the cache list to the free list. This 1814 * could happen over and over. We would never find the page 1815 * we have accounted for. 1816 * 1817 * Noreloc pages are a subset of the global (relocatable) page pool. 1818 * They are not tracked separately in the pcf bins, so it is 1819 * impossible to know when doing pcf accounting if the available 1820 * page(s) are noreloc pages or not. When looking for a noreloc page 1821 * it is quite easy to end up here even if the global (relocatable) 1822 * page pool has plenty of free pages but the noreloc pool is empty. 1823 * 1824 * When the noreloc pool is empty (or low), additional noreloc pages 1825 * are created by converting pages from the global page pool. This 1826 * process will stall during pcf accounting if the pcf bins are 1827 * already locked. Such is the case when a noreloc allocation is 1828 * looping here in page_create_get_something waiting for more noreloc 1829 * pages to appear. 1830 * 1831 * Short of adding a new field to the pcf bins to accurately track 1832 * the number of free noreloc pages, we instead do not grab the 1833 * pcgs_lock, do not set the pcf blocks and do not timeout when 1834 * allocating a noreloc page. This allows noreloc allocations to 1835 * loop without blocking global page pool allocations. 1836 * 1837 * NOTE: the behaviour of page_create_get_something has not changed 1838 * for the case of global page pool allocations. 1839 */ 1840 1841 flags &= ~PG_MATCH_COLOR; 1842 locked = 0; 1843 #if defined(__i386) || defined(__amd64) 1844 flags = page_create_update_flags_x86(flags); 1845 #endif 1846 1847 lgrp = lgrp_mem_choose(seg, vaddr, PAGESIZE); 1848 1849 for (count = 0; kcage_on || count < MAX_PCGS; count++) { 1850 pp = page_get_freelist(vp, off, seg, vaddr, PAGESIZE, 1851 flags, lgrp); 1852 if (pp == NULL) { 1853 pp = page_get_cachelist(vp, off, seg, vaddr, 1854 flags, lgrp); 1855 } 1856 if (pp == NULL) { 1857 /* 1858 * Serialize. Don't fight with other pcgs(). 1859 */ 1860 if (!locked && (!kcage_on || !(flags & PG_NORELOC))) { 1861 mutex_enter(&pcgs_lock); 1862 VM_STAT_ADD(pcgs_locked); 1863 locked = 1; 1864 p = pcf; 1865 for (i = 0; i < pcf_fanout; i++) { 1866 mutex_enter(&p->pcf_lock); 1867 ASSERT(p->pcf_block == 0); 1868 p->pcf_block = 1; 1869 p->pcf_reserve = p->pcf_count; 1870 p->pcf_count = 0; 1871 mutex_exit(&p->pcf_lock); 1872 p++; 1873 } 1874 freemem = 0; 1875 } 1876 1877 if (count) { 1878 /* 1879 * Since page_free() puts pages on 1880 * a list then accounts for it, we 1881 * just have to wait for page_free() 1882 * to unlock any page it was working 1883 * with. The page_lock()-page_reclaim() 1884 * path falls in the same boat. 
1885 * 1886 * We don't need to check on the 1887 * PG_WAIT flag, we have already 1888 * accounted for the page we are 1889 * looking for in page_create_va(). 1890 * 1891 * We just wait a moment to let any 1892 * locked pages on the lists free up, 1893 * then continue around and try again. 1894 * 1895 * Will be awakened by set_freemem(). 1896 */ 1897 mutex_enter(&pcgs_wait_lock); 1898 cv_wait(&pcgs_cv, &pcgs_wait_lock); 1899 mutex_exit(&pcgs_wait_lock); 1900 } 1901 } else { 1902 #ifdef VM_STATS 1903 if (count >= PCGS_TRIES) { 1904 VM_STAT_ADD(pcgs_too_many); 1905 } else { 1906 VM_STAT_ADD(pcgs_counts[count]); 1907 } 1908 #endif 1909 if (locked) { 1910 pcgs_unblock(); 1911 mutex_exit(&pcgs_lock); 1912 } 1913 if (cagelocked) 1914 mutex_exit(&pcgs_cagelock); 1915 return (pp); 1916 } 1917 } 1918 /* 1919 * we go down holding the pcf locks. 1920 */ 1921 panic("no %spage found %d", 1922 ((flags & PG_NORELOC) ? "non-reloc " : ""), count); 1923 /*NOTREACHED*/ 1924 } 1925 1926 /* 1927 * Create enough pages for "bytes" worth of data starting at 1928 * "off" in "vp". 1929 * 1930 * Where flag must be one of: 1931 * 1932 * PG_EXCL: Exclusive create (fail if any page already 1933 * exists in the page cache) which does not 1934 * wait for memory to become available. 1935 * 1936 * PG_WAIT: Non-exclusive create which can wait for 1937 * memory to become available. 1938 * 1939 * PG_PHYSCONTIG: Allocate physically contiguous pages. 1940 * (Not Supported) 1941 * 1942 * A doubly linked list of pages is returned to the caller. Each page 1943 * on the list has the "exclusive" (p_selock) lock and "iolock" (p_iolock) 1944 * lock. 1945 * 1946 * Unable to change the parameters to page_create() in a minor release, 1947 * we renamed page_create() to page_create_va(), changed all known calls 1948 * from page_create() to page_create_va(), and created this wrapper. 1949 * 1950 * Upon a major release, we should break compatibility by deleting this 1951 * wrapper, and replacing all the strings "page_create_va", with "page_create". 1952 * 1953 * NOTE: There is a copy of this interface as page_create_io() in 1954 * i86/vm/vm_machdep.c. Any bugs fixed here should be applied 1955 * there. 1956 */ 1957 page_t * 1958 page_create(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags) 1959 { 1960 caddr_t random_vaddr; 1961 struct seg kseg; 1962 1963 #ifdef DEBUG 1964 cmn_err(CE_WARN, "Using deprecated interface page_create: caller %p", 1965 (void *)caller()); 1966 #endif 1967 1968 random_vaddr = (caddr_t)(((uintptr_t)vp >> 7) ^ 1969 (uintptr_t)(off >> PAGESHIFT)); 1970 kseg.s_as = &kas; 1971 1972 return (page_create_va(vp, off, bytes, flags, &kseg, random_vaddr)); 1973 } 1974 1975 #ifdef DEBUG 1976 uint32_t pg_alloc_pgs_mtbf = 0; 1977 #endif 1978 1979 /* 1980 * Used for large page support. It will attempt to allocate 1981 * a large page(s) off the freelist. 1982 * 1983 * Returns non zero on failure. 1984 */ 1985 int 1986 page_alloc_pages(struct vnode *vp, struct seg *seg, caddr_t addr, 1987 page_t **basepp, page_t *ppa[], uint_t szc, int anypgsz, int pgflags) 1988 { 1989 pgcnt_t npgs, curnpgs, totpgs; 1990 size_t pgsz; 1991 page_t *pplist = NULL, *pp; 1992 int err = 0; 1993 lgrp_t *lgrp; 1994 1995 ASSERT(szc != 0 && szc <= (page_num_pagesizes() - 1)); 1996 ASSERT(pgflags == 0 || pgflags == PG_LOCAL); 1997 1998 /* 1999 * Check if system heavily prefers local large pages over remote 2000 * on systems with multiple lgroups. 
2001 */ 2002 if (lpg_alloc_prefer == LPAP_LOCAL && nlgrps > 1) { 2003 pgflags = PG_LOCAL; 2004 } 2005 2006 VM_STAT_ADD(alloc_pages[0]); 2007 2008 #ifdef DEBUG 2009 if (pg_alloc_pgs_mtbf && !(gethrtime() % pg_alloc_pgs_mtbf)) { 2010 return (ENOMEM); 2011 } 2012 #endif 2013 2014 /* 2015 * One must be NULL but not both. 2016 * And one must be non NULL but not both. 2017 */ 2018 ASSERT(basepp != NULL || ppa != NULL); 2019 ASSERT(basepp == NULL || ppa == NULL); 2020 2021 #if defined(__i386) || defined(__amd64) 2022 while (page_chk_freelist(szc) == 0) { 2023 VM_STAT_ADD(alloc_pages[8]); 2024 if (anypgsz == 0 || --szc == 0) 2025 return (ENOMEM); 2026 } 2027 #endif 2028 2029 pgsz = page_get_pagesize(szc); 2030 totpgs = curnpgs = npgs = pgsz >> PAGESHIFT; 2031 2032 ASSERT(((uintptr_t)addr & (pgsz - 1)) == 0); 2033 2034 (void) page_create_wait(npgs, PG_WAIT); 2035 2036 while (npgs && szc) { 2037 lgrp = lgrp_mem_choose(seg, addr, pgsz); 2038 if (pgflags == PG_LOCAL) { 2039 pp = page_get_freelist(vp, 0, seg, addr, pgsz, 2040 pgflags, lgrp); 2041 if (pp == NULL) { 2042 pp = page_get_freelist(vp, 0, seg, addr, pgsz, 2043 0, lgrp); 2044 } 2045 } else { 2046 pp = page_get_freelist(vp, 0, seg, addr, pgsz, 2047 0, lgrp); 2048 } 2049 if (pp != NULL) { 2050 VM_STAT_ADD(alloc_pages[1]); 2051 page_list_concat(&pplist, &pp); 2052 ASSERT(npgs >= curnpgs); 2053 npgs -= curnpgs; 2054 } else if (anypgsz) { 2055 VM_STAT_ADD(alloc_pages[2]); 2056 szc--; 2057 pgsz = page_get_pagesize(szc); 2058 curnpgs = pgsz >> PAGESHIFT; 2059 } else { 2060 VM_STAT_ADD(alloc_pages[3]); 2061 ASSERT(npgs == totpgs); 2062 page_create_putback(npgs); 2063 return (ENOMEM); 2064 } 2065 } 2066 if (szc == 0) { 2067 VM_STAT_ADD(alloc_pages[4]); 2068 ASSERT(npgs != 0); 2069 page_create_putback(npgs); 2070 err = ENOMEM; 2071 } else if (basepp != NULL) { 2072 ASSERT(npgs == 0); 2073 ASSERT(ppa == NULL); 2074 *basepp = pplist; 2075 } 2076 2077 npgs = totpgs - npgs; 2078 pp = pplist; 2079 2080 /* 2081 * Clear the free and age bits. Also if we were passed in a ppa then 2082 * fill it in with all the constituent pages from the large page. But 2083 * if we failed to allocate all the pages just free what we got. 2084 */ 2085 while (npgs != 0) { 2086 ASSERT(PP_ISFREE(pp)); 2087 ASSERT(PP_ISAGED(pp)); 2088 if (ppa != NULL || err != 0) { 2089 if (err == 0) { 2090 VM_STAT_ADD(alloc_pages[5]); 2091 PP_CLRFREE(pp); 2092 PP_CLRAGED(pp); 2093 page_sub(&pplist, pp); 2094 *ppa++ = pp; 2095 npgs--; 2096 } else { 2097 VM_STAT_ADD(alloc_pages[6]); 2098 ASSERT(pp->p_szc != 0); 2099 curnpgs = page_get_pagecnt(pp->p_szc); 2100 page_list_break(&pp, &pplist, curnpgs); 2101 page_list_add_pages(pp, 0); 2102 page_create_putback(curnpgs); 2103 ASSERT(npgs >= curnpgs); 2104 npgs -= curnpgs; 2105 } 2106 pp = pplist; 2107 } else { 2108 VM_STAT_ADD(alloc_pages[7]); 2109 PP_CLRFREE(pp); 2110 PP_CLRAGED(pp); 2111 pp = pp->p_next; 2112 npgs--; 2113 } 2114 } 2115 return (err); 2116 } 2117 2118 /* 2119 * Get a single large page off of the freelists, and set it up for use. 2120 * Number of bytes requested must be a supported page size. 2121 * 2122 * Note that this call may fail even if there is sufficient 2123 * memory available or PG_WAIT is set, so the caller must 2124 * be willing to fallback on page_create_va(), block and retry, 2125 * or fail the requester. 
2126 */ 2127 page_t * 2128 page_create_va_large(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags, 2129 struct seg *seg, caddr_t vaddr, void *arg) 2130 { 2131 pgcnt_t npages; 2132 page_t *pp; 2133 page_t *rootpp; 2134 lgrp_t *lgrp; 2135 lgrp_id_t *lgrpid = (lgrp_id_t *)arg; 2136 2137 ASSERT(vp != NULL); 2138 2139 ASSERT((flags & ~(PG_EXCL | PG_WAIT | 2140 PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == 0); 2141 /* but no others */ 2142 2143 ASSERT((flags & PG_EXCL) == PG_EXCL); 2144 2145 npages = btop(bytes); 2146 2147 if (!kcage_on || panicstr) { 2148 /* 2149 * Cage is OFF, or we are single threaded in 2150 * panic, so make everything a RELOC request. 2151 */ 2152 flags &= ~PG_NORELOC; 2153 } 2154 2155 /* 2156 * Make sure there's adequate physical memory available. 2157 * Note: PG_WAIT is ignored here. 2158 */ 2159 if (freemem <= throttlefree + npages) { 2160 VM_STAT_ADD(page_create_large_cnt[1]); 2161 return (NULL); 2162 } 2163 2164 /* 2165 * If cage is on, dampen draw from cage when available 2166 * cage space is low. 2167 */ 2168 if ((flags & (PG_NORELOC | PG_WAIT)) == (PG_NORELOC | PG_WAIT) && 2169 kcage_freemem < kcage_throttlefree + npages) { 2170 2171 /* 2172 * The cage is on, the caller wants PG_NORELOC 2173 * pages and available cage memory is very low. 2174 * Call kcage_create_throttle() to attempt to 2175 * control demand on the cage. 2176 */ 2177 if (kcage_create_throttle(npages, flags) == KCT_FAILURE) { 2178 VM_STAT_ADD(page_create_large_cnt[2]); 2179 return (NULL); 2180 } 2181 } 2182 2183 if (!pcf_decrement_bucket(npages) && 2184 !pcf_decrement_multiple(NULL, npages, 1)) { 2185 VM_STAT_ADD(page_create_large_cnt[4]); 2186 return (NULL); 2187 } 2188 2189 /* 2190 * This is where this function behaves fundamentally differently 2191 * than page_create_va(); since we're intending to map the page 2192 * with a single TTE, we have to get it as a physically contiguous 2193 * hardware pagesize chunk. If we can't, we fail. 2194 */ 2195 if (lgrpid != NULL && *lgrpid >= 0 && *lgrpid <= lgrp_alloc_max && 2196 LGRP_EXISTS(lgrp_table[*lgrpid])) 2197 lgrp = lgrp_table[*lgrpid]; 2198 else 2199 lgrp = lgrp_mem_choose(seg, vaddr, bytes); 2200 2201 if ((rootpp = page_get_freelist(&kvp, off, seg, vaddr, 2202 bytes, flags & ~PG_MATCH_COLOR, lgrp)) == NULL) { 2203 page_create_putback(npages); 2204 VM_STAT_ADD(page_create_large_cnt[5]); 2205 return (NULL); 2206 } 2207 2208 /* 2209 * if we got the page with the wrong mtype give it back this is a 2210 * workaround for CR 6249718. When CR 6249718 is fixed we never get 2211 * inside "if" and the workaround becomes just a nop 2212 */ 2213 if (kcage_on && (flags & PG_NORELOC) && !PP_ISNORELOC(rootpp)) { 2214 page_list_add_pages(rootpp, 0); 2215 page_create_putback(npages); 2216 VM_STAT_ADD(page_create_large_cnt[6]); 2217 return (NULL); 2218 } 2219 2220 /* 2221 * If satisfying this request has left us with too little 2222 * memory, start the wheels turning to get some back. The 2223 * first clause of the test prevents waking up the pageout 2224 * daemon in situations where it would decide that there's 2225 * nothing to do. 
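 * (page_create_va() below performs the same nscan/freemem check and * pageout wakeup once it has accounted for its pages.)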
2226 */ 2227 if (nscan < desscan && freemem < minfree) { 2228 TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL, 2229 "pageout_cv_signal:freemem %ld", freemem); 2230 cv_signal(&proc_pageout->p_cv); 2231 } 2232 2233 pp = rootpp; 2234 while (npages--) { 2235 ASSERT(PAGE_EXCL(pp)); 2236 ASSERT(pp->p_vnode == NULL); 2237 ASSERT(!hat_page_is_mapped(pp)); 2238 PP_CLRFREE(pp); 2239 PP_CLRAGED(pp); 2240 if (!page_hashin(pp, vp, off, NULL)) 2241 panic("page_create_large: hashin failed: page %p", 2242 (void *)pp); 2243 page_io_lock(pp); 2244 off += PAGESIZE; 2245 pp = pp->p_next; 2246 } 2247 2248 VM_STAT_ADD(page_create_large_cnt[0]); 2249 return (rootpp); 2250 } 2251 2252 page_t * 2253 page_create_va(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags, 2254 struct seg *seg, caddr_t vaddr) 2255 { 2256 page_t *plist = NULL; 2257 pgcnt_t npages; 2258 pgcnt_t found_on_free = 0; 2259 pgcnt_t pages_req; 2260 page_t *npp = NULL; 2261 struct pcf *p; 2262 lgrp_t *lgrp; 2263 2264 TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_START, 2265 "page_create_start:vp %p off %llx bytes %lu flags %x", 2266 vp, off, bytes, flags); 2267 2268 ASSERT(bytes != 0 && vp != NULL); 2269 2270 if ((flags & PG_EXCL) == 0 && (flags & PG_WAIT) == 0) { 2271 panic("page_create: invalid flags"); 2272 /*NOTREACHED*/ 2273 } 2274 ASSERT((flags & ~(PG_EXCL | PG_WAIT | 2275 PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == 0); 2276 /* but no others */ 2277 2278 pages_req = npages = btopr(bytes); 2279 /* 2280 * Try to see whether request is too large to *ever* be 2281 * satisfied, in order to prevent deadlock. We arbitrarily 2282 * decide to limit maximum size requests to max_page_get. 2283 */ 2284 if (npages >= max_page_get) { 2285 if ((flags & PG_WAIT) == 0) { 2286 TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_TOOBIG, 2287 "page_create_toobig:vp %p off %llx npages " 2288 "%lu max_page_get %lu", 2289 vp, off, npages, max_page_get); 2290 return (NULL); 2291 } else { 2292 cmn_err(CE_WARN, 2293 "Request for too much kernel memory " 2294 "(%lu bytes), will hang forever", bytes); 2295 for (;;) 2296 delay(1000000000); 2297 } 2298 } 2299 2300 if (!kcage_on || panicstr) { 2301 /* 2302 * Cage is OFF, or we are single threaded in 2303 * panic, so make everything a RELOC request. 2304 */ 2305 flags &= ~PG_NORELOC; 2306 } 2307 2308 if (freemem <= throttlefree + npages) 2309 if (!page_create_throttle(npages, flags)) 2310 return (NULL); 2311 2312 /* 2313 * If cage is on, dampen draw from cage when available 2314 * cage space is low. 2315 */ 2316 if ((flags & PG_NORELOC) && 2317 kcage_freemem < kcage_throttlefree + npages) { 2318 2319 /* 2320 * The cage is on, the caller wants PG_NORELOC 2321 * pages and available cage memory is very low. 2322 * Call kcage_create_throttle() to attempt to 2323 * control demand on the cage. 2324 */ 2325 if (kcage_create_throttle(npages, flags) == KCT_FAILURE) 2326 return (NULL); 2327 } 2328 2329 VM_STAT_ADD(page_create_cnt[0]); 2330 2331 if (!pcf_decrement_bucket(npages)) { 2332 /* 2333 * Have to look harder. If npages is greater than 2334 * one, then we might have to coalesce the counters. 2335 * 2336 * Go wait. We come back having accounted 2337 * for the memory. 2338 */ 2339 VM_STAT_ADD(page_create_cnt[1]); 2340 if (!page_create_wait(npages, flags)) { 2341 VM_STAT_ADD(page_create_cnt[2]); 2342 return (NULL); 2343 } 2344 } 2345 2346 TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SUCCESS, 2347 "page_create_success:vp %p off %llx", vp, off); 2348 2349 /* 2350 * If satisfying this request has left us with too little 2351 * memory, start the wheels turning to get some back. 
The 2352 * first clause of the test prevents waking up the pageout 2353 * daemon in situations where it would decide that there's 2354 * nothing to do. 2355 */ 2356 if (nscan < desscan && freemem < minfree) { 2357 TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL, 2358 "pageout_cv_signal:freemem %ld", freemem); 2359 cv_signal(&proc_pageout->p_cv); 2360 } 2361 2362 /* 2363 * Loop around collecting the requested number of pages. 2364 * Most of the time, we have to `create' a new page. With 2365 * this in mind, pull the page off the free list before 2366 * getting the hash lock. This will minimize the hash 2367 * lock hold time, nesting, and the like. If it turns 2368 * out we don't need the page, we put it back at the end. 2369 */ 2370 while (npages--) { 2371 page_t *pp; 2372 kmutex_t *phm = NULL; 2373 ulong_t index; 2374 2375 index = PAGE_HASH_FUNC(vp, off); 2376 top: 2377 ASSERT(phm == NULL); 2378 ASSERT(index == PAGE_HASH_FUNC(vp, off)); 2379 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); 2380 2381 if (npp == NULL) { 2382 /* 2383 * Try to get a page from the freelist (i.e., 2384 * a page with no [vp, off] tag). If that 2385 * fails, use the cachelist. 2386 * 2387 * During the first attempt at both the free 2388 * and cache lists we try for the correct color. 2389 */ 2390 /* 2391 * XXXX-how do we deal with virtually indexed 2392 * caches and colors? 2393 */ 2394 VM_STAT_ADD(page_create_cnt[4]); 2395 /* 2396 * Get lgroup to allocate next page of shared memory 2397 * from and use it to specify where to allocate 2398 * the physical memory 2399 */ 2400 lgrp = lgrp_mem_choose(seg, vaddr, PAGESIZE); 2401 npp = page_get_freelist(vp, off, seg, vaddr, PAGESIZE, 2402 flags | PG_MATCH_COLOR, lgrp); 2403 if (npp == NULL) { 2404 npp = page_get_cachelist(vp, off, seg, 2405 vaddr, flags | PG_MATCH_COLOR, lgrp); 2406 if (npp == NULL) { 2407 npp = page_create_get_something(vp, 2408 off, seg, vaddr, 2409 flags & ~PG_MATCH_COLOR); 2410 } 2411 2412 if (PP_ISAGED(npp) == 0) { 2413 /* 2414 * Since this page came from the 2415 * cachelist, we must destroy the 2416 * old vnode association. 2417 */ 2418 page_hashout(npp, NULL); 2419 } 2420 } 2421 } 2422 2423 /* 2424 * We own this page! 2425 */ 2426 ASSERT(PAGE_EXCL(npp)); 2427 ASSERT(npp->p_vnode == NULL); 2428 ASSERT(!hat_page_is_mapped(npp)); 2429 PP_CLRFREE(npp); 2430 PP_CLRAGED(npp); 2431 2432 /* 2433 * Here we have a page in our hot little mitts and are 2434 * just waiting to stuff it on the appropriate lists. 2435 * Get the mutex and check to see if it really does 2436 * not exist. 2437 */ 2438 phm = PAGE_HASH_MUTEX(index); 2439 mutex_enter(phm); 2440 PAGE_HASH_SEARCH(index, pp, vp, off); 2441 if (pp == NULL) { 2442 VM_STAT_ADD(page_create_new); 2443 pp = npp; 2444 npp = NULL; 2445 if (!page_hashin(pp, vp, off, phm)) { 2446 /* 2447 * Since we hold the page hash mutex and 2448 * just searched for this page, page_hashin 2449 * had better not fail. If it does, that 2450 * means some thread did not follow the 2451 * page hash mutex rules. Panic now and 2452 * get it over with. As usual, go down 2453 * holding all the locks. 2454 */ 2455 ASSERT(MUTEX_HELD(phm)); 2456 panic("page_create: " 2457 "hashin failed %p %p %llx %p", 2458 (void *)pp, (void *)vp, off, (void *)phm); 2459 /*NOTREACHED*/ 2460 } 2461 ASSERT(MUTEX_HELD(phm)); 2462 mutex_exit(phm); 2463 phm = NULL; 2464 2465 /* 2466 * Hat layer locking need not be done to set 2467 * the following bits since the page is not hashed 2468 * and was on the free list (i.e., had no mappings).
2469 * 2470 * Set the reference bit to protect 2471 * against immediate pageout 2472 * 2473 * XXXmh modify freelist code to set reference 2474 * bit so we don't have to do it here. 2475 */ 2476 page_set_props(pp, P_REF); 2477 found_on_free++; 2478 } else { 2479 VM_STAT_ADD(page_create_exists); 2480 if (flags & PG_EXCL) { 2481 /* 2482 * Found an existing page, and the caller 2483 * wanted all new pages. Undo all of the work 2484 * we have done. 2485 */ 2486 mutex_exit(phm); 2487 phm = NULL; 2488 while (plist != NULL) { 2489 pp = plist; 2490 page_sub(&plist, pp); 2491 page_io_unlock(pp); 2492 /* large pages should not end up here */ 2493 ASSERT(pp->p_szc == 0); 2494 /*LINTED: constant in conditional ctx*/ 2495 VN_DISPOSE(pp, B_INVAL, 0, kcred); 2496 } 2497 VM_STAT_ADD(page_create_found_one); 2498 goto fail; 2499 } 2500 ASSERT(flags & PG_WAIT); 2501 if (!page_lock(pp, SE_EXCL, phm, P_NO_RECLAIM)) { 2502 /* 2503 * Start all over again if we blocked trying 2504 * to lock the page. 2505 */ 2506 mutex_exit(phm); 2507 VM_STAT_ADD(page_create_page_lock_failed); 2508 phm = NULL; 2509 goto top; 2510 } 2511 mutex_exit(phm); 2512 phm = NULL; 2513 2514 if (PP_ISFREE(pp)) { 2515 ASSERT(PP_ISAGED(pp) == 0); 2516 VM_STAT_ADD(pagecnt.pc_get_cache); 2517 page_list_sub(pp, PG_CACHE_LIST); 2518 PP_CLRFREE(pp); 2519 found_on_free++; 2520 } 2521 } 2522 2523 /* 2524 * Got a page! It is locked. Acquire the i/o 2525 * lock since we are going to use the p_next and 2526 * p_prev fields to link the requested pages together. 2527 */ 2528 page_io_lock(pp); 2529 page_add(&plist, pp); 2530 plist = plist->p_next; 2531 off += PAGESIZE; 2532 vaddr += PAGESIZE; 2533 } 2534 2535 ASSERT((flags & PG_EXCL) ? (found_on_free == pages_req) : 1); 2536 fail: 2537 if (npp != NULL) { 2538 /* 2539 * Did not need this page after all. 2540 * Put it back on the free list. 2541 */ 2542 VM_STAT_ADD(page_create_putbacks); 2543 PP_SETFREE(npp); 2544 PP_SETAGED(npp); 2545 npp->p_offset = (u_offset_t)-1; 2546 page_list_add(npp, PG_FREE_LIST | PG_LIST_TAIL); 2547 page_unlock(npp); 2548 2549 } 2550 2551 ASSERT(pages_req >= found_on_free); 2552 2553 { 2554 uint_t overshoot = (uint_t)(pages_req - found_on_free); 2555 2556 if (overshoot) { 2557 VM_STAT_ADD(page_create_overshoot); 2558 p = &pcf[PCF_INDEX()]; 2559 mutex_enter(&p->pcf_lock); 2560 if (p->pcf_block) { 2561 p->pcf_reserve += overshoot; 2562 } else { 2563 p->pcf_count += overshoot; 2564 if (p->pcf_wait) { 2565 mutex_enter(&new_freemem_lock); 2566 if (freemem_wait) { 2567 cv_signal(&freemem_cv); 2568 p->pcf_wait--; 2569 } else { 2570 p->pcf_wait = 0; 2571 } 2572 mutex_exit(&new_freemem_lock); 2573 } 2574 } 2575 mutex_exit(&p->pcf_lock); 2576 /* freemem is approximate, so this test OK */ 2577 if (!p->pcf_block) 2578 freemem += overshoot; 2579 } 2580 } 2581 2582 return (plist); 2583 } 2584 2585 /* 2586 * One or more constituent pages of this large page has been marked 2587 * toxic. Simply demote the large page to PAGESIZE pages and let 2588 * page_free() handle it. This routine should only be called by 2589 * large page free routines (page_free_pages() and page_destroy_pages(). 2590 * All pages are locked SE_EXCL and have already been marked free. 
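 * Concretely, the code below resets p_szc to zero on every constituent * page, then pulls each page off the list, clears its free bit, and * hands it to page_free() one PAGESIZE page at a time.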
2591 */ 2592 static void 2593 page_free_toxic_pages(page_t *rootpp) 2594 { 2595 page_t *tpp; 2596 pgcnt_t i, pgcnt = page_get_pagecnt(rootpp->p_szc); 2597 uint_t szc = rootpp->p_szc; 2598 2599 for (i = 0, tpp = rootpp; i < pgcnt; i++, tpp = tpp->p_next) { 2600 ASSERT(tpp->p_szc == szc); 2601 ASSERT((PAGE_EXCL(tpp) && 2602 !page_iolock_assert(tpp)) || panicstr); 2603 tpp->p_szc = 0; 2604 } 2605 2606 while (rootpp != NULL) { 2607 tpp = rootpp; 2608 page_sub(&rootpp, tpp); 2609 ASSERT(PP_ISFREE(tpp)); 2610 PP_CLRFREE(tpp); 2611 page_free(tpp, 1); 2612 } 2613 } 2614 2615 /* 2616 * Put page on the "free" list. 2617 * The free list is really two lists maintained by 2618 * the PSM of whatever machine we happen to be on. 2619 */ 2620 void 2621 page_free(page_t *pp, int dontneed) 2622 { 2623 struct pcf *p; 2624 uint_t pcf_index; 2625 2626 ASSERT((PAGE_EXCL(pp) && 2627 !page_iolock_assert(pp)) || panicstr); 2628 2629 if (PP_ISFREE(pp)) { 2630 panic("page_free: page %p is free", (void *)pp); 2631 } 2632 2633 if (pp->p_szc != 0) { 2634 if (pp->p_vnode == NULL || IS_SWAPFSVP(pp->p_vnode) || 2635 PP_ISKAS(pp)) { 2636 panic("page_free: anon or kernel " 2637 "or no vnode large page %p", (void *)pp); 2638 } 2639 page_demote_vp_pages(pp); 2640 ASSERT(pp->p_szc == 0); 2641 } 2642 2643 /* 2644 * The page_struct_lock need not be acquired to examine these 2645 * fields since the page has an "exclusive" lock. 2646 */ 2647 if (hat_page_is_mapped(pp) || pp->p_lckcnt != 0 || pp->p_cowcnt != 0 || 2648 pp->p_slckcnt != 0) { 2649 panic("page_free pp=%p, pfn=%lx, lckcnt=%d, cowcnt=%d " 2650 "slckcnt = %d", pp, page_pptonum(pp), pp->p_lckcnt, 2651 pp->p_cowcnt, pp->p_slckcnt); 2652 /*NOTREACHED*/ 2653 } 2654 2655 ASSERT(!hat_page_getshare(pp)); 2656 2657 PP_SETFREE(pp); 2658 ASSERT(pp->p_vnode == NULL || !IS_VMODSORT(pp->p_vnode) || 2659 !hat_ismod(pp)); 2660 page_clr_all_props(pp); 2661 ASSERT(!hat_page_getshare(pp)); 2662 2663 /* 2664 * Now we add the page to the head of the free list. 2665 * But if this page is associated with a paged vnode 2666 * then we adjust the head forward so that the page is 2667 * effectively at the end of the list. 2668 */ 2669 if (pp->p_vnode == NULL) { 2670 /* 2671 * Page has no identity, put it on the free list. 2672 */ 2673 PP_SETAGED(pp); 2674 pp->p_offset = (u_offset_t)-1; 2675 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL); 2676 VM_STAT_ADD(pagecnt.pc_free_free); 2677 TRACE_1(TR_FAC_VM, TR_PAGE_FREE_FREE, 2678 "page_free_free:pp %p", pp); 2679 } else { 2680 PP_CLRAGED(pp); 2681 2682 if (!dontneed || nopageage) { 2683 /* move it to the tail of the list */ 2684 page_list_add(pp, PG_CACHE_LIST | PG_LIST_TAIL); 2685 2686 VM_STAT_ADD(pagecnt.pc_free_cache); 2687 TRACE_1(TR_FAC_VM, TR_PAGE_FREE_CACHE_TAIL, 2688 "page_free_cache_tail:pp %p", pp); 2689 } else { 2690 page_list_add(pp, PG_CACHE_LIST | PG_LIST_HEAD); 2691 2692 VM_STAT_ADD(pagecnt.pc_free_dontneed); 2693 TRACE_1(TR_FAC_VM, TR_PAGE_FREE_CACHE_HEAD, 2694 "page_free_cache_head:pp %p", pp); 2695 } 2696 } 2697 page_unlock(pp); 2698 2699 /* 2700 * Now do the `freemem' accounting. 2701 */ 2702 pcf_index = PCF_INDEX(); 2703 p = &pcf[pcf_index]; 2704 2705 mutex_enter(&p->pcf_lock); 2706 if (p->pcf_block) { 2707 p->pcf_reserve += 1; 2708 } else { 2709 p->pcf_count += 1; 2710 if (p->pcf_wait) { 2711 mutex_enter(&new_freemem_lock); 2712 /* 2713 * Check to see if some other thread 2714 * is actually waiting. Another bucket 2715 * may have woken it up by now. 
If there 2716 * are no waiters, then set our pcf_wait 2717 * count to zero to avoid coming in here 2718 * next time. Also, since only one page 2719 * was put on the free list, just wake 2720 * up one waiter. 2721 */ 2722 if (freemem_wait) { 2723 cv_signal(&freemem_cv); 2724 p->pcf_wait--; 2725 } else { 2726 p->pcf_wait = 0; 2727 } 2728 mutex_exit(&new_freemem_lock); 2729 } 2730 } 2731 mutex_exit(&p->pcf_lock); 2732 2733 /* freemem is approximate, so this test OK */ 2734 if (!p->pcf_block) 2735 freemem += 1; 2736 } 2737 2738 /* 2739 * Put page on the "free" list during intial startup. 2740 * This happens during initial single threaded execution. 2741 */ 2742 void 2743 page_free_at_startup(page_t *pp) 2744 { 2745 struct pcf *p; 2746 uint_t pcf_index; 2747 2748 page_list_add(pp, PG_FREE_LIST | PG_LIST_HEAD | PG_LIST_ISINIT); 2749 VM_STAT_ADD(pagecnt.pc_free_free); 2750 2751 /* 2752 * Now do the `freemem' accounting. 2753 */ 2754 pcf_index = PCF_INDEX(); 2755 p = &pcf[pcf_index]; 2756 2757 ASSERT(p->pcf_block == 0); 2758 ASSERT(p->pcf_wait == 0); 2759 p->pcf_count += 1; 2760 2761 /* freemem is approximate, so this is OK */ 2762 freemem += 1; 2763 } 2764 2765 void 2766 page_free_pages(page_t *pp) 2767 { 2768 page_t *tpp, *rootpp = NULL; 2769 pgcnt_t pgcnt = page_get_pagecnt(pp->p_szc); 2770 pgcnt_t i; 2771 uint_t szc = pp->p_szc; 2772 2773 VM_STAT_ADD(pagecnt.pc_free_pages); 2774 TRACE_1(TR_FAC_VM, TR_PAGE_FREE_FREE, 2775 "page_free_free:pp %p", pp); 2776 2777 ASSERT(pp->p_szc != 0 && pp->p_szc < page_num_pagesizes()); 2778 if ((page_pptonum(pp) & (pgcnt - 1)) != 0) { 2779 panic("page_free_pages: not root page %p", (void *)pp); 2780 /*NOTREACHED*/ 2781 } 2782 2783 for (i = 0, tpp = pp; i < pgcnt; i++, tpp++) { 2784 ASSERT((PAGE_EXCL(tpp) && 2785 !page_iolock_assert(tpp)) || panicstr); 2786 if (PP_ISFREE(tpp)) { 2787 panic("page_free_pages: page %p is free", (void *)tpp); 2788 /*NOTREACHED*/ 2789 } 2790 if (hat_page_is_mapped(tpp) || tpp->p_lckcnt != 0 || 2791 tpp->p_cowcnt != 0 || tpp->p_slckcnt != 0) { 2792 panic("page_free_pages %p", (void *)tpp); 2793 /*NOTREACHED*/ 2794 } 2795 2796 ASSERT(!hat_page_getshare(tpp)); 2797 ASSERT(tpp->p_vnode == NULL); 2798 ASSERT(tpp->p_szc == szc); 2799 2800 PP_SETFREE(tpp); 2801 page_clr_all_props(tpp); 2802 PP_SETAGED(tpp); 2803 tpp->p_offset = (u_offset_t)-1; 2804 ASSERT(tpp->p_next == tpp); 2805 ASSERT(tpp->p_prev == tpp); 2806 page_list_concat(&rootpp, &tpp); 2807 } 2808 ASSERT(rootpp == pp); 2809 2810 page_list_add_pages(rootpp, 0); 2811 page_create_putback(pgcnt); 2812 } 2813 2814 int free_pages = 1; 2815 2816 /* 2817 * This routine attempts to return pages to the cachelist via page_release(). 2818 * It does not *have* to be successful in all cases, since the pageout scanner 2819 * will catch any pages it misses. It does need to be fast and not introduce 2820 * too much overhead. 2821 * 2822 * If a page isn't found on the unlocked sweep of the page_hash bucket, we 2823 * don't lock and retry. This is ok, since the page scanner will eventually 2824 * find any page we miss in free_vp_pages(). 2825 */ 2826 void 2827 free_vp_pages(vnode_t *vp, u_offset_t off, size_t len) 2828 { 2829 page_t *pp; 2830 u_offset_t eoff; 2831 extern int swap_in_range(vnode_t *, u_offset_t, size_t); 2832 2833 eoff = off + len; 2834 2835 if (free_pages == 0) 2836 return; 2837 if (swap_in_range(vp, off, len)) 2838 return; 2839 2840 for (; off < eoff; off += PAGESIZE) { 2841 2842 /* 2843 * find the page using a fast, but inexact search. 
It'll be OK 2844 * if a few pages slip through the cracks here. 2845 */ 2846 pp = page_exists(vp, off); 2847 2848 /* 2849 * If we didn't find the page (it may not exist), the page 2850 * is free, looks still in use (shared), or we can't lock it, 2851 * just give up. 2852 */ 2853 if (pp == NULL || 2854 PP_ISFREE(pp) || 2855 page_share_cnt(pp) > 0 || 2856 !page_trylock(pp, SE_EXCL)) 2857 continue; 2858 2859 /* 2860 * Once we have locked pp, verify that it's still the 2861 * correct page and not already free 2862 */ 2863 ASSERT(PAGE_LOCKED_SE(pp, SE_EXCL)); 2864 if (pp->p_vnode != vp || pp->p_offset != off || PP_ISFREE(pp)) { 2865 page_unlock(pp); 2866 continue; 2867 } 2868 2869 /* 2870 * try to release the page... 2871 */ 2872 (void) page_release(pp, 1); 2873 } 2874 } 2875 2876 /* 2877 * Reclaim the given page from the free list. 2878 * If pp is part of a large page, only the given constituent page is reclaimed 2879 * and the large page it belonged to will be demoted. This can only happen 2880 * if the page is not on the cachelist. 2881 * 2882 * Returns 1 on success or 0 on failure. 2883 * 2884 * The page is unlocked if it can't be reclaimed (when freemem == 0). 2885 * If `lock' is non-null, it will be dropped and re-acquired if 2886 * the routine must wait while freemem is 0. 2887 * 2888 * As it turns out, boot_getpages() does this. It picks a page, 2889 * based on where OBP mapped in some address, gets its pfn, searches 2890 * the memsegs, locks the page, then pulls it off the free list! 2891 */ 2892 int 2893 page_reclaim(page_t *pp, kmutex_t *lock) 2894 { 2895 struct pcf *p; 2896 struct cpu *cpup; 2897 int enough; 2898 uint_t i; 2899 2900 ASSERT(lock != NULL ? MUTEX_HELD(lock) : 1); 2901 ASSERT(PAGE_EXCL(pp) && PP_ISFREE(pp)); 2902 2903 /* 2904 * If `freemem' is 0, we cannot reclaim this page from the 2905 * freelist, so release every lock we might hold: the page, 2906 * and the `lock' before blocking. 2907 * 2908 * The only way `freemem' can become 0 while there are pages 2909 * marked free (have their p->p_free bit set) is when the 2910 * system is low on memory and doing a page_create(). Once 2911 * page_create() starts acquiring pages it must be guaranteed 2912 * to get all that it needs, since `freemem' was already 2913 * decreased by the requested amount. So, we need to release 2914 * this page, and let page_create() have it. 2915 * 2916 * Since `freemem' being zero is not supposed to happen, just 2917 * use the usual hash stuff as a starting point. If that bucket 2918 * is empty, then assume the worst, and start at the beginning 2919 * of the pcf array. If we always start at the beginning 2920 * when acquiring more than one pcf lock, there won't be any 2921 * deadlock problems. 2922 */ 2923 2924 /* TODO: Do we need to test kcage_freemem if PG_NORELOC(pp)? */ 2925 2926 if (freemem <= throttlefree && !page_create_throttle(1l, 0)) { 2927 pcf_acquire_all(); 2928 goto page_reclaim_nomem; 2929 } 2930 2931 enough = pcf_decrement_bucket(1); 2932 2933 if (!enough) { 2934 VM_STAT_ADD(page_reclaim_zero); 2935 /* 2936 * Check again. It's possible that some other thread 2937 * could have been right behind us, and added one 2938 * to a list somewhere. Acquire each of the pcf locks 2939 * until we find a page.
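 * (The scan below starts at pcf[0] and walks forward, which is the * required order whenever more than one pcf lock may end up held.)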
2940 */ 2941 p = pcf; 2942 for (i = 0; i < pcf_fanout; i++) { 2943 mutex_enter(&p->pcf_lock); 2944 if (p->pcf_count >= 1) { 2945 p->pcf_count -= 1; 2946 enough = 1; 2947 break; 2948 } 2949 p++; 2950 } 2951 2952 if (!enough) { 2953 page_reclaim_nomem: 2954 /* 2955 * We really can't have page `pp'. 2956 * Time for the no-memory dance with 2957 * page_free(). This is just like 2958 * page_create_wait(). Plus the added 2959 * attraction of releasing whatever mutex 2960 * we held when we were called with in `lock'. 2961 * Page_unlock() will wakeup any thread 2962 * waiting around for this page. 2963 */ 2964 if (lock) { 2965 VM_STAT_ADD(page_reclaim_zero_locked); 2966 mutex_exit(lock); 2967 } 2968 page_unlock(pp); 2969 2970 /* 2971 * get this before we drop all the pcf locks. 2972 */ 2973 mutex_enter(&new_freemem_lock); 2974 2975 p = pcf; 2976 for (i = 0; i < pcf_fanout; i++) { 2977 p->pcf_wait++; 2978 mutex_exit(&p->pcf_lock); 2979 p++; 2980 } 2981 2982 freemem_wait++; 2983 cv_wait(&freemem_cv, &new_freemem_lock); 2984 freemem_wait--; 2985 2986 mutex_exit(&new_freemem_lock); 2987 2988 if (lock) { 2989 mutex_enter(lock); 2990 } 2991 return (0); 2992 } 2993 2994 /* 2995 * The pcf accounting has been done, 2996 * though none of the pcf_wait flags have been set, 2997 * drop the locks and continue on. 2998 */ 2999 while (p >= pcf) { 3000 mutex_exit(&p->pcf_lock); 3001 p--; 3002 } 3003 } 3004 3005 /* 3006 * freemem is not protected by any lock. Thus, we cannot 3007 * have any assertion containing freemem here. 3008 */ 3009 freemem -= 1; 3010 3011 VM_STAT_ADD(pagecnt.pc_reclaim); 3012 3013 /* 3014 * page_list_sub will handle the case where pp is a large page. 3015 * It's possible that the page was promoted while on the freelist 3016 */ 3017 if (PP_ISAGED(pp)) { 3018 page_list_sub(pp, PG_FREE_LIST); 3019 TRACE_1(TR_FAC_VM, TR_PAGE_UNFREE_FREE, 3020 "page_reclaim_free:pp %p", pp); 3021 } else { 3022 page_list_sub(pp, PG_CACHE_LIST); 3023 TRACE_1(TR_FAC_VM, TR_PAGE_UNFREE_CACHE, 3024 "page_reclaim_cache:pp %p", pp); 3025 } 3026 3027 /* 3028 * clear the p_free & p_age bits since this page is no longer 3029 * on the free list. Notice that there was a brief time where 3030 * a page is marked as free, but is not on the list. 3031 * 3032 * Set the reference bit to protect against immediate pageout. 3033 */ 3034 PP_CLRFREE(pp); 3035 PP_CLRAGED(pp); 3036 page_set_props(pp, P_REF); 3037 3038 CPU_STATS_ENTER_K(); 3039 cpup = CPU; /* get cpup now that CPU cannot change */ 3040 CPU_STATS_ADDQ(cpup, vm, pgrec, 1); 3041 CPU_STATS_ADDQ(cpup, vm, pgfrec, 1); 3042 CPU_STATS_EXIT_K(); 3043 ASSERT(pp->p_szc == 0); 3044 3045 return (1); 3046 } 3047 3048 /* 3049 * Destroy identity of the page and put it back on 3050 * the page free list. Assumes that the caller has 3051 * acquired the "exclusive" lock on the page. 3052 */ 3053 void 3054 page_destroy(page_t *pp, int dontfree) 3055 { 3056 ASSERT((PAGE_EXCL(pp) && 3057 !page_iolock_assert(pp)) || panicstr); 3058 ASSERT(pp->p_slckcnt == 0 || panicstr); 3059 3060 if (pp->p_szc != 0) { 3061 if (pp->p_vnode == NULL || IS_SWAPFSVP(pp->p_vnode) || 3062 PP_ISKAS(pp)) { 3063 panic("page_destroy: anon or kernel or no vnode " 3064 "large page %p", (void *)pp); 3065 } 3066 page_demote_vp_pages(pp); 3067 ASSERT(pp->p_szc == 0); 3068 } 3069 3070 TRACE_1(TR_FAC_VM, TR_PAGE_DESTROY, "page_destroy:pp %p", pp); 3071 3072 /* 3073 * Unload translations, if any, then hash out the 3074 * page to erase its identity. 
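 * When `dontfree' is set the translations are still unloaded and the * identity destroyed, but the page is not freed; it stays exclusively * locked and the caller remains responsible for disposing of it.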
3075 */ 3076 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 3077 page_hashout(pp, NULL); 3078 3079 if (!dontfree) { 3080 /* 3081 * Acquire the "freemem_lock" for availrmem. 3082 * The page_struct_lock need not be acquired for lckcnt 3083 * and cowcnt since the page has an "exclusive" lock. 3084 */ 3085 if ((pp->p_lckcnt != 0) || (pp->p_cowcnt != 0)) { 3086 mutex_enter(&freemem_lock); 3087 if (pp->p_lckcnt != 0) { 3088 availrmem++; 3089 pp->p_lckcnt = 0; 3090 } 3091 if (pp->p_cowcnt != 0) { 3092 availrmem += pp->p_cowcnt; 3093 pp->p_cowcnt = 0; 3094 } 3095 mutex_exit(&freemem_lock); 3096 } 3097 /* 3098 * Put the page on the "free" list. 3099 */ 3100 page_free(pp, 0); 3101 } 3102 } 3103 3104 void 3105 page_destroy_pages(page_t *pp) 3106 { 3107 3108 page_t *tpp, *rootpp = NULL; 3109 pgcnt_t pgcnt = page_get_pagecnt(pp->p_szc); 3110 pgcnt_t i, pglcks = 0; 3111 uint_t szc = pp->p_szc; 3112 3113 ASSERT(pp->p_szc != 0 && pp->p_szc < page_num_pagesizes()); 3114 3115 VM_STAT_ADD(pagecnt.pc_destroy_pages); 3116 3117 TRACE_1(TR_FAC_VM, TR_PAGE_DESTROY, "page_destroy_pages:pp %p", pp); 3118 3119 if ((page_pptonum(pp) & (pgcnt - 1)) != 0) { 3120 panic("page_destroy_pages: not root page %p", (void *)pp); 3121 /*NOTREACHED*/ 3122 } 3123 3124 for (i = 0, tpp = pp; i < pgcnt; i++, tpp++) { 3125 ASSERT((PAGE_EXCL(tpp) && 3126 !page_iolock_assert(tpp)) || panicstr); 3127 ASSERT(tpp->p_slckcnt == 0 || panicstr); 3128 (void) hat_pageunload(tpp, HAT_FORCE_PGUNLOAD); 3129 page_hashout(tpp, NULL); 3130 ASSERT(tpp->p_offset == (u_offset_t)-1); 3131 if (tpp->p_lckcnt != 0) { 3132 pglcks++; 3133 tpp->p_lckcnt = 0; 3134 } else if (tpp->p_cowcnt != 0) { 3135 pglcks += tpp->p_cowcnt; 3136 tpp->p_cowcnt = 0; 3137 } 3138 ASSERT(!hat_page_getshare(tpp)); 3139 ASSERT(tpp->p_vnode == NULL); 3140 ASSERT(tpp->p_szc == szc); 3141 3142 PP_SETFREE(tpp); 3143 page_clr_all_props(tpp); 3144 PP_SETAGED(tpp); 3145 ASSERT(tpp->p_next == tpp); 3146 ASSERT(tpp->p_prev == tpp); 3147 page_list_concat(&rootpp, &tpp); 3148 } 3149 3150 ASSERT(rootpp == pp); 3151 if (pglcks != 0) { 3152 mutex_enter(&freemem_lock); 3153 availrmem += pglcks; 3154 mutex_exit(&freemem_lock); 3155 } 3156 3157 page_list_add_pages(rootpp, 0); 3158 page_create_putback(pgcnt); 3159 } 3160 3161 /* 3162 * Similar to page_destroy(), but destroys pages which are 3163 * locked and known to be on the page free list. Since 3164 * the page is known to be free and locked, no one can access 3165 * it. 3166 * 3167 * Also, the number of free pages does not change. 3168 */ 3169 void 3170 page_destroy_free(page_t *pp) 3171 { 3172 ASSERT(PAGE_EXCL(pp)); 3173 ASSERT(PP_ISFREE(pp)); 3174 ASSERT(pp->p_vnode); 3175 ASSERT(hat_page_getattr(pp, P_MOD | P_REF | P_RO) == 0); 3176 ASSERT(!hat_page_is_mapped(pp)); 3177 ASSERT(PP_ISAGED(pp) == 0); 3178 ASSERT(pp->p_szc == 0); 3179 3180 VM_STAT_ADD(pagecnt.pc_destroy_free); 3181 page_list_sub(pp, PG_CACHE_LIST); 3182 3183 page_hashout(pp, NULL); 3184 ASSERT(pp->p_vnode == NULL); 3185 ASSERT(pp->p_offset == (u_offset_t)-1); 3186 ASSERT(pp->p_hash == NULL); 3187 3188 PP_SETAGED(pp); 3189 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL); 3190 page_unlock(pp); 3191 3192 mutex_enter(&new_freemem_lock); 3193 if (freemem_wait) { 3194 cv_signal(&freemem_cv); 3195 } 3196 mutex_exit(&new_freemem_lock); 3197 } 3198 3199 /* 3200 * Rename the page "opp" to have an identity specified 3201 * by [vp, off]. If a page already exists with this name 3202 * it is locked and destroyed. Note that the page's 3203 * translations are not unloaded during the rename. 
3204 * 3205 * This routine is used by the anon layer to "steal" the 3206 * original page and is not unlike destroying a page and 3207 * creating a new page using the same page frame. 3208 * 3209 * XXX -- Could deadlock if caller 1 tries to rename A to B while 3210 * caller 2 tries to rename B to A. 3211 */ 3212 void 3213 page_rename(page_t *opp, vnode_t *vp, u_offset_t off) 3214 { 3215 page_t *pp; 3216 int olckcnt = 0; 3217 int ocowcnt = 0; 3218 kmutex_t *phm; 3219 ulong_t index; 3220 3221 ASSERT(PAGE_EXCL(opp) && !page_iolock_assert(opp)); 3222 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); 3223 ASSERT(PP_ISFREE(opp) == 0); 3224 3225 VM_STAT_ADD(page_rename_count); 3226 3227 TRACE_3(TR_FAC_VM, TR_PAGE_RENAME, 3228 "page rename:pp %p vp %p off %llx", opp, vp, off); 3229 3230 /* 3231 * CacheFS may call page_rename for a large NFS page 3232 * when both CacheFS and NFS mount points are used 3233 * by applications. Demote this large page before 3234 * renaming it, to ensure that there are no "partial" 3235 * large pages left lying around. 3236 */ 3237 if (opp->p_szc != 0) { 3238 vnode_t *ovp = opp->p_vnode; 3239 ASSERT(ovp != NULL); 3240 ASSERT(!IS_SWAPFSVP(ovp)); 3241 ASSERT(!VN_ISKAS(ovp)); 3242 page_demote_vp_pages(opp); 3243 ASSERT(opp->p_szc == 0); 3244 } 3245 3246 page_hashout(opp, NULL); 3247 PP_CLRAGED(opp); 3248 3249 /* 3250 * Acquire the appropriate page hash lock, since 3251 * we're going to rename the page. 3252 */ 3253 index = PAGE_HASH_FUNC(vp, off); 3254 phm = PAGE_HASH_MUTEX(index); 3255 mutex_enter(phm); 3256 top: 3257 /* 3258 * Look for an existing page with this name and destroy it if found. 3259 * By holding the page hash lock all the way to the page_hashin() 3260 * call, we are assured that no page can be created with this 3261 * identity. In the case when the phm lock is dropped to undo any 3262 * hat layer mappings, the existing page is held with an "exclusive" 3263 * lock, again preventing another page from being created with 3264 * this identity. 3265 */ 3266 PAGE_HASH_SEARCH(index, pp, vp, off); 3267 if (pp != NULL) { 3268 VM_STAT_ADD(page_rename_exists); 3269 3270 /* 3271 * As it turns out, this is one of only two places where 3272 * page_lock() needs to hold the passed in lock in the 3273 * successful case. In all of the others, the lock could 3274 * be dropped as soon as the attempt is made to lock 3275 * the page. It is tempting to add yet another arguement, 3276 * PL_KEEP or PL_DROP, to let page_lock know what to do. 3277 */ 3278 if (!page_lock(pp, SE_EXCL, phm, P_RECLAIM)) { 3279 /* 3280 * Went to sleep because the page could not 3281 * be locked. We were woken up when the page 3282 * was unlocked, or when the page was destroyed. 3283 * In either case, `phm' was dropped while we 3284 * slept. Hence we should not just roar through 3285 * this loop. 3286 */ 3287 goto top; 3288 } 3289 3290 /* 3291 * If an existing page is a large page, then demote 3292 * it to ensure that no "partial" large pages are 3293 * "created" after page_rename. An existing page 3294 * can be a CacheFS page, and can't belong to swapfs. 3295 */ 3296 if (hat_page_is_mapped(pp)) { 3297 /* 3298 * Unload translations. Since we hold the 3299 * exclusive lock on this page, the page 3300 * can not be changed while we drop phm. 3301 * This is also not a lock protocol violation, 3302 * but rather the proper way to do things. 
3303 */ 3304 mutex_exit(phm); 3305 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 3306 if (pp->p_szc != 0) { 3307 ASSERT(!IS_SWAPFSVP(vp)); 3308 ASSERT(!VN_ISKAS(vp)); 3309 page_demote_vp_pages(pp); 3310 ASSERT(pp->p_szc == 0); 3311 } 3312 mutex_enter(phm); 3313 } else if (pp->p_szc != 0) { 3314 ASSERT(!IS_SWAPFSVP(vp)); 3315 ASSERT(!VN_ISKAS(vp)); 3316 mutex_exit(phm); 3317 page_demote_vp_pages(pp); 3318 ASSERT(pp->p_szc == 0); 3319 mutex_enter(phm); 3320 } 3321 page_hashout(pp, phm); 3322 } 3323 /* 3324 * Hash in the page with the new identity. 3325 */ 3326 if (!page_hashin(opp, vp, off, phm)) { 3327 /* 3328 * We were holding phm while we searched for [vp, off] 3329 * and only dropped phm if we found and locked a page. 3330 * If we can't create this page now, then some thing 3331 * is really broken. 3332 */ 3333 panic("page_rename: Can't hash in page: %p", (void *)pp); 3334 /*NOTREACHED*/ 3335 } 3336 3337 ASSERT(MUTEX_HELD(phm)); 3338 mutex_exit(phm); 3339 3340 /* 3341 * Now that we have dropped phm, lets get around to finishing up 3342 * with pp. 3343 */ 3344 if (pp != NULL) { 3345 ASSERT(!hat_page_is_mapped(pp)); 3346 /* for now large pages should not end up here */ 3347 ASSERT(pp->p_szc == 0); 3348 /* 3349 * Save the locks for transfer to the new page and then 3350 * clear them so page_free doesn't think they're important. 3351 * The page_struct_lock need not be acquired for lckcnt and 3352 * cowcnt since the page has an "exclusive" lock. 3353 */ 3354 olckcnt = pp->p_lckcnt; 3355 ocowcnt = pp->p_cowcnt; 3356 pp->p_lckcnt = pp->p_cowcnt = 0; 3357 3358 /* 3359 * Put the page on the "free" list after we drop 3360 * the lock. The less work under the lock the better. 3361 */ 3362 /*LINTED: constant in conditional context*/ 3363 VN_DISPOSE(pp, B_FREE, 0, kcred); 3364 } 3365 3366 /* 3367 * Transfer the lock count from the old page (if any). 3368 * The page_struct_lock need not be acquired for lckcnt and 3369 * cowcnt since the page has an "exclusive" lock. 3370 */ 3371 opp->p_lckcnt += olckcnt; 3372 opp->p_cowcnt += ocowcnt; 3373 } 3374 3375 /* 3376 * low level routine to add page `pp' to the hash and vp chains for [vp, offset] 3377 * 3378 * Pages are normally inserted at the start of a vnode's v_pages list. 3379 * If the vnode is VMODSORT and the page is modified, it goes at the end. 3380 * This can happen when a modified page is relocated for DR. 3381 * 3382 * Returns 1 on success and 0 on failure. 3383 */ 3384 static int 3385 page_do_hashin(page_t *pp, vnode_t *vp, u_offset_t offset) 3386 { 3387 page_t **listp; 3388 page_t *tp; 3389 ulong_t index; 3390 3391 ASSERT(PAGE_EXCL(pp)); 3392 ASSERT(vp != NULL); 3393 ASSERT(MUTEX_HELD(page_vnode_mutex(vp))); 3394 3395 /* 3396 * Be sure to set these up before the page is inserted on the hash 3397 * list. As soon as the page is placed on the list some other 3398 * thread might get confused and wonder how this page could 3399 * possibly hash to this list. 3400 */ 3401 pp->p_vnode = vp; 3402 pp->p_offset = offset; 3403 3404 /* 3405 * record if this page is on a swap vnode 3406 */ 3407 if ((vp->v_flag & VISSWAP) != 0) 3408 PP_SETSWAP(pp); 3409 3410 index = PAGE_HASH_FUNC(vp, offset); 3411 ASSERT(MUTEX_HELD(PAGE_HASH_MUTEX(index))); 3412 listp = &page_hash[index]; 3413 3414 /* 3415 * If this page is already hashed in, fail this attempt to add it. 
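 * (A 0 return here is passed straight back by page_hashin(), which also * counts the collision in hashin_already.)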
3416 */ 3417 for (tp = *listp; tp != NULL; tp = tp->p_hash) { 3418 if (tp->p_vnode == vp && tp->p_offset == offset) { 3419 pp->p_vnode = NULL; 3420 pp->p_offset = (u_offset_t)(-1); 3421 return (0); 3422 } 3423 } 3424 pp->p_hash = *listp; 3425 *listp = pp; 3426 3427 /* 3428 * Add the page to the vnode's list of pages 3429 */ 3430 if (vp->v_pages != NULL && IS_VMODSORT(vp) && hat_ismod(pp)) 3431 listp = &vp->v_pages->p_vpprev->p_vpnext; 3432 else 3433 listp = &vp->v_pages; 3434 3435 page_vpadd(listp, pp); 3436 3437 return (1); 3438 } 3439 3440 /* 3441 * Add page `pp' to both the hash and vp chains for [vp, offset]. 3442 * 3443 * Returns 1 on success and 0 on failure. 3444 * If hold is passed in, it is not dropped. 3445 */ 3446 int 3447 page_hashin(page_t *pp, vnode_t *vp, u_offset_t offset, kmutex_t *hold) 3448 { 3449 kmutex_t *phm = NULL; 3450 kmutex_t *vphm; 3451 int rc; 3452 3453 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); 3454 3455 TRACE_3(TR_FAC_VM, TR_PAGE_HASHIN, 3456 "page_hashin:pp %p vp %p offset %llx", 3457 pp, vp, offset); 3458 3459 VM_STAT_ADD(hashin_count); 3460 3461 if (hold != NULL) 3462 phm = hold; 3463 else { 3464 VM_STAT_ADD(hashin_not_held); 3465 phm = PAGE_HASH_MUTEX(PAGE_HASH_FUNC(vp, offset)); 3466 mutex_enter(phm); 3467 } 3468 3469 vphm = page_vnode_mutex(vp); 3470 mutex_enter(vphm); 3471 rc = page_do_hashin(pp, vp, offset); 3472 mutex_exit(vphm); 3473 if (hold == NULL) 3474 mutex_exit(phm); 3475 if (rc == 0) 3476 VM_STAT_ADD(hashin_already); 3477 return (rc); 3478 } 3479 3480 /* 3481 * Remove page ``pp'' from the hash and vp chains and remove vp association. 3482 * All mutexes must be held 3483 */ 3484 static void 3485 page_do_hashout(page_t *pp) 3486 { 3487 page_t **hpp; 3488 page_t *hp; 3489 vnode_t *vp = pp->p_vnode; 3490 3491 ASSERT(vp != NULL); 3492 ASSERT(MUTEX_HELD(page_vnode_mutex(vp))); 3493 3494 /* 3495 * First, take pp off of its hash chain. 3496 */ 3497 hpp = &page_hash[PAGE_HASH_FUNC(vp, pp->p_offset)]; 3498 3499 for (;;) { 3500 hp = *hpp; 3501 if (hp == pp) 3502 break; 3503 if (hp == NULL) { 3504 panic("page_do_hashout"); 3505 /*NOTREACHED*/ 3506 } 3507 hpp = &hp->p_hash; 3508 } 3509 *hpp = pp->p_hash; 3510 3511 /* 3512 * Now remove it from its associated vnode. 3513 */ 3514 if (vp->v_pages) 3515 page_vpsub(&vp->v_pages, pp); 3516 3517 pp->p_hash = NULL; 3518 page_clr_all_props(pp); 3519 PP_CLRSWAP(pp); 3520 pp->p_vnode = NULL; 3521 pp->p_offset = (u_offset_t)-1; 3522 } 3523 3524 /* 3525 * Remove page ``pp'' from the hash and vp chains and remove vp association. 3526 * 3527 * When `phm' is non-NULL it contains the address of the mutex protecting the 3528 * hash list pp is on. It is not dropped. 3529 */ 3530 void 3531 page_hashout(page_t *pp, kmutex_t *phm) 3532 { 3533 vnode_t *vp; 3534 ulong_t index; 3535 kmutex_t *nphm; 3536 kmutex_t *vphm; 3537 kmutex_t *sep; 3538 3539 ASSERT(phm != NULL ? 
MUTEX_HELD(phm) : 1); 3540 ASSERT(pp->p_vnode != NULL); 3541 ASSERT((PAGE_EXCL(pp) && !page_iolock_assert(pp)) || panicstr); 3542 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(pp->p_vnode))); 3543 3544 vp = pp->p_vnode; 3545 3546 TRACE_2(TR_FAC_VM, TR_PAGE_HASHOUT, 3547 "page_hashout:pp %p vp %p", pp, vp); 3548 3549 /* Kernel probe */ 3550 TNF_PROBE_2(page_unmap, "vm pagefault", /* CSTYLED */, 3551 tnf_opaque, vnode, vp, 3552 tnf_offset, offset, pp->p_offset); 3553 3554 /* 3555 * 3556 */ 3557 VM_STAT_ADD(hashout_count); 3558 index = PAGE_HASH_FUNC(vp, pp->p_offset); 3559 if (phm == NULL) { 3560 VM_STAT_ADD(hashout_not_held); 3561 nphm = PAGE_HASH_MUTEX(index); 3562 mutex_enter(nphm); 3563 } 3564 ASSERT(phm ? phm == PAGE_HASH_MUTEX(index) : 1); 3565 3566 3567 /* 3568 * grab page vnode mutex and remove it... 3569 */ 3570 vphm = page_vnode_mutex(vp); 3571 mutex_enter(vphm); 3572 3573 page_do_hashout(pp); 3574 3575 mutex_exit(vphm); 3576 if (phm == NULL) 3577 mutex_exit(nphm); 3578 3579 /* 3580 * Wake up processes waiting for this page. The page's 3581 * identity has been changed, and is probably not the 3582 * desired page any longer. 3583 */ 3584 sep = page_se_mutex(pp); 3585 mutex_enter(sep); 3586 pp->p_selock &= ~SE_EWANTED; 3587 if (CV_HAS_WAITERS(&pp->p_cv)) 3588 cv_broadcast(&pp->p_cv); 3589 mutex_exit(sep); 3590 } 3591 3592 /* 3593 * Add the page to the front of a linked list of pages 3594 * using the p_next & p_prev pointers for the list. 3595 * The caller is responsible for protecting the list pointers. 3596 */ 3597 void 3598 page_add(page_t **ppp, page_t *pp) 3599 { 3600 ASSERT(PAGE_EXCL(pp) || (PAGE_SHARED(pp) && page_iolock_assert(pp))); 3601 3602 page_add_common(ppp, pp); 3603 } 3604 3605 3606 3607 /* 3608 * Common code for page_add() and mach_page_add() 3609 */ 3610 void 3611 page_add_common(page_t **ppp, page_t *pp) 3612 { 3613 if (*ppp == NULL) { 3614 pp->p_next = pp->p_prev = pp; 3615 } else { 3616 pp->p_next = *ppp; 3617 pp->p_prev = (*ppp)->p_prev; 3618 (*ppp)->p_prev = pp; 3619 pp->p_prev->p_next = pp; 3620 } 3621 *ppp = pp; 3622 } 3623 3624 3625 /* 3626 * Remove this page from a linked list of pages 3627 * using the p_next & p_prev pointers for the list. 3628 * 3629 * The caller is responsible for protecting the list pointers. 3630 */ 3631 void 3632 page_sub(page_t **ppp, page_t *pp) 3633 { 3634 ASSERT((PP_ISFREE(pp)) ? 1 : 3635 (PAGE_EXCL(pp)) || (PAGE_SHARED(pp) && page_iolock_assert(pp))); 3636 3637 if (*ppp == NULL || pp == NULL) { 3638 panic("page_sub: bad arg(s): pp %p, *ppp %p", 3639 (void *)pp, (void *)(*ppp)); 3640 /*NOTREACHED*/ 3641 } 3642 3643 page_sub_common(ppp, pp); 3644 } 3645 3646 3647 /* 3648 * Common code for page_sub() and mach_page_sub() 3649 */ 3650 void 3651 page_sub_common(page_t **ppp, page_t *pp) 3652 { 3653 if (*ppp == pp) 3654 *ppp = pp->p_next; /* go to next page */ 3655 3656 if (*ppp == pp) 3657 *ppp = NULL; /* page list is gone */ 3658 else { 3659 pp->p_prev->p_next = pp->p_next; 3660 pp->p_next->p_prev = pp->p_prev; 3661 } 3662 pp->p_prev = pp->p_next = pp; /* make pp a list of one */ 3663 } 3664 3665 3666 /* 3667 * Break page list oppp into two lists with npages in the first list. 3668 * The tail is returned in nppp.
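 * * For example (an illustrative sketch, not a caller in this file), given * a list `plist' of eight pages: * * page_t *tail; * page_list_break(&plist, &tail, 3); * * leaves the first three pages on `plist' and returns the remaining five, * still a circular doubly linked list, on `tail'.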
3669 */ 3670 void 3671 page_list_break(page_t **oppp, page_t **nppp, pgcnt_t npages) 3672 { 3673 page_t *s1pp = *oppp; 3674 page_t *s2pp; 3675 page_t *e1pp, *e2pp; 3676 long n = 0; 3677 3678 if (s1pp == NULL) { 3679 *nppp = NULL; 3680 return; 3681 } 3682 if (npages == 0) { 3683 *nppp = s1pp; 3684 *oppp = NULL; 3685 return; 3686 } 3687 for (n = 0, s2pp = *oppp; n < npages; n++) { 3688 s2pp = s2pp->p_next; 3689 } 3690 /* Fix head and tail of new lists */ 3691 e1pp = s2pp->p_prev; 3692 e2pp = s1pp->p_prev; 3693 s1pp->p_prev = e1pp; 3694 e1pp->p_next = s1pp; 3695 s2pp->p_prev = e2pp; 3696 e2pp->p_next = s2pp; 3697 3698 /* second list empty */ 3699 if (s2pp == s1pp) { 3700 *oppp = s1pp; 3701 *nppp = NULL; 3702 } else { 3703 *oppp = s1pp; 3704 *nppp = s2pp; 3705 } 3706 } 3707 3708 /* 3709 * Concatenate page list nppp onto the end of list ppp. 3710 */ 3711 void 3712 page_list_concat(page_t **ppp, page_t **nppp) 3713 { 3714 page_t *s1pp, *s2pp, *e1pp, *e2pp; 3715 3716 if (*nppp == NULL) { 3717 return; 3718 } 3719 if (*ppp == NULL) { 3720 *ppp = *nppp; 3721 return; 3722 } 3723 s1pp = *ppp; 3724 e1pp = s1pp->p_prev; 3725 s2pp = *nppp; 3726 e2pp = s2pp->p_prev; 3727 s1pp->p_prev = e2pp; 3728 e2pp->p_next = s1pp; 3729 e1pp->p_next = s2pp; 3730 s2pp->p_prev = e1pp; 3731 } 3732 3733 /* 3734 * return the next page in the page list 3735 */ 3736 page_t * 3737 page_list_next(page_t *pp) 3738 { 3739 return (pp->p_next); 3740 } 3741 3742 3743 /* 3744 * Add the page to the front of the linked list of pages 3745 * using p_vpnext/p_vpprev pointers for the list. 3746 * 3747 * The caller is responsible for protecting the lists. 3748 */ 3749 void 3750 page_vpadd(page_t **ppp, page_t *pp) 3751 { 3752 if (*ppp == NULL) { 3753 pp->p_vpnext = pp->p_vpprev = pp; 3754 } else { 3755 pp->p_vpnext = *ppp; 3756 pp->p_vpprev = (*ppp)->p_vpprev; 3757 (*ppp)->p_vpprev = pp; 3758 pp->p_vpprev->p_vpnext = pp; 3759 } 3760 *ppp = pp; 3761 } 3762 3763 /* 3764 * Remove this page from the linked list of pages 3765 * using p_vpnext/p_vpprev pointers for the list. 3766 * 3767 * The caller is responsible for protecting the lists. 3768 */ 3769 void 3770 page_vpsub(page_t **ppp, page_t *pp) 3771 { 3772 if (*ppp == NULL || pp == NULL) { 3773 panic("page_vpsub: bad arg(s): pp %p, *ppp %p", 3774 (void *)pp, (void *)(*ppp)); 3775 /*NOTREACHED*/ 3776 } 3777 3778 if (*ppp == pp) 3779 *ppp = pp->p_vpnext; /* go to next page */ 3780 3781 if (*ppp == pp) 3782 *ppp = NULL; /* page list is gone */ 3783 else { 3784 pp->p_vpprev->p_vpnext = pp->p_vpnext; 3785 pp->p_vpnext->p_vpprev = pp->p_vpprev; 3786 } 3787 pp->p_vpprev = pp->p_vpnext = pp; /* make pp a list of one */ 3788 } 3789 3790 /* 3791 * Lock a physical page into memory "long term". Used to support "lock 3792 * in memory" functions. Accepts the page to be locked, and a cow variable 3793 * to indicate whether a the lock will travel to the new page during 3794 * a potential copy-on-write. 3795 */ 3796 int 3797 page_pp_lock( 3798 page_t *pp, /* page to be locked */ 3799 int cow, /* cow lock */ 3800 int kernel) /* must succeed -- ignore checking */ 3801 { 3802 int r = 0; /* result -- assume failure */ 3803 3804 ASSERT(PAGE_LOCKED(pp)); 3805 3806 page_struct_lock(pp); 3807 /* 3808 * Acquire the "freemem_lock" for availrmem. 
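 * A cow lock charges p_cowcnt and one page of availrmem; a plain lock * charges p_lckcnt, with only the first such lock on the page charged * against availrmem (kernel callers do their own availrmem accounting).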
3809 */ 3810 if (cow) { 3811 mutex_enter(&freemem_lock); 3812 if ((availrmem > pages_pp_maximum) && 3813 (pp->p_cowcnt < (ushort_t)PAGE_LOCK_MAXIMUM)) { 3814 availrmem--; 3815 pages_locked++; 3816 mutex_exit(&freemem_lock); 3817 r = 1; 3818 if (++pp->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) { 3819 cmn_err(CE_WARN, 3820 "COW lock limit reached on pfn 0x%lx", 3821 page_pptonum(pp)); 3822 } 3823 } else 3824 mutex_exit(&freemem_lock); 3825 } else { 3826 if (pp->p_lckcnt) { 3827 if (pp->p_lckcnt < (ushort_t)PAGE_LOCK_MAXIMUM) { 3828 r = 1; 3829 if (++pp->p_lckcnt == 3830 (ushort_t)PAGE_LOCK_MAXIMUM) { 3831 cmn_err(CE_WARN, "Page lock limit " 3832 "reached on pfn 0x%lx", 3833 page_pptonum(pp)); 3834 } 3835 } 3836 } else { 3837 if (kernel) { 3838 /* availrmem accounting done by caller */ 3839 ++pp->p_lckcnt; 3840 r = 1; 3841 } else { 3842 mutex_enter(&freemem_lock); 3843 if (availrmem > pages_pp_maximum) { 3844 availrmem--; 3845 pages_locked++; 3846 ++pp->p_lckcnt; 3847 r = 1; 3848 } 3849 mutex_exit(&freemem_lock); 3850 } 3851 } 3852 } 3853 page_struct_unlock(pp); 3854 return (r); 3855 } 3856 3857 /* 3858 * Decommit a lock on a physical page frame. Account for cow locks if 3859 * appropriate. 3860 */ 3861 void 3862 page_pp_unlock( 3863 page_t *pp, /* page to be unlocked */ 3864 int cow, /* expect cow lock */ 3865 int kernel) /* this was a kernel lock */ 3866 { 3867 ASSERT(PAGE_LOCKED(pp)); 3868 3869 page_struct_lock(pp); 3870 /* 3871 * Acquire the "freemem_lock" for availrmem. 3872 * If cowcnt or lcknt is already 0 do nothing; i.e., we 3873 * could be called to unlock even if nothing is locked. This could 3874 * happen if locked file pages were truncated (removing the lock) 3875 * and the file was grown again and new pages faulted in; the new 3876 * pages are unlocked but the segment still thinks they're locked. 3877 */ 3878 if (cow) { 3879 if (pp->p_cowcnt) { 3880 mutex_enter(&freemem_lock); 3881 pp->p_cowcnt--; 3882 availrmem++; 3883 pages_locked--; 3884 mutex_exit(&freemem_lock); 3885 } 3886 } else { 3887 if (pp->p_lckcnt && --pp->p_lckcnt == 0) { 3888 if (!kernel) { 3889 mutex_enter(&freemem_lock); 3890 availrmem++; 3891 pages_locked--; 3892 mutex_exit(&freemem_lock); 3893 } 3894 } 3895 } 3896 page_struct_unlock(pp); 3897 } 3898 3899 /* 3900 * This routine reserves availrmem for npages; 3901 * flags: KM_NOSLEEP or KM_SLEEP 3902 * returns 1 on success or 0 on failure 3903 */ 3904 int 3905 page_resv(pgcnt_t npages, uint_t flags) 3906 { 3907 mutex_enter(&freemem_lock); 3908 while (availrmem < tune.t_minarmem + npages) { 3909 if (flags & KM_NOSLEEP) { 3910 mutex_exit(&freemem_lock); 3911 return (0); 3912 } 3913 mutex_exit(&freemem_lock); 3914 page_needfree(npages); 3915 kmem_reap(); 3916 delay(hz >> 2); 3917 page_needfree(-(spgcnt_t)npages); 3918 mutex_enter(&freemem_lock); 3919 } 3920 availrmem -= npages; 3921 mutex_exit(&freemem_lock); 3922 return (1); 3923 } 3924 3925 /* 3926 * This routine unreserves availrmem for npages; 3927 */ 3928 void 3929 page_unresv(pgcnt_t npages) 3930 { 3931 mutex_enter(&freemem_lock); 3932 availrmem += npages; 3933 mutex_exit(&freemem_lock); 3934 } 3935 3936 /* 3937 * See Statement at the beginning of segvn_lockop() regarding 3938 * the way we handle cowcnts and lckcnts. 3939 * 3940 * Transfer cowcnt on 'opp' to cowcnt on 'npp' if the vpage 3941 * that breaks COW has PROT_WRITE. 3942 * 3943 * Note that, we may also break COW in case we are softlocking 3944 * on read access during physio; 3945 * in this softlock case, the vpage may not have PROT_WRITE. 
3946 * So, we need to transfer lckcnt on 'opp' to lckcnt on 'npp' 3947 * if the vpage doesn't have PROT_WRITE. 3948 * 3949 * This routine is never called if we are stealing a page 3950 * in anon_private. 3951 * 3952 * The caller subtracted from availrmem for read only mapping. 3953 * if lckcnt is 1 increment availrmem. 3954 */ 3955 void 3956 page_pp_useclaim( 3957 page_t *opp, /* original page frame losing lock */ 3958 page_t *npp, /* new page frame gaining lock */ 3959 uint_t write_perm) /* set if vpage has PROT_WRITE */ 3960 { 3961 int payback = 0; 3962 3963 ASSERT(PAGE_LOCKED(opp)); 3964 ASSERT(PAGE_LOCKED(npp)); 3965 3966 page_struct_lock(opp); 3967 3968 ASSERT(npp->p_cowcnt == 0); 3969 ASSERT(npp->p_lckcnt == 0); 3970 3971 /* Don't use claim if nothing is locked (see page_pp_unlock above) */ 3972 if ((write_perm && opp->p_cowcnt != 0) || 3973 (!write_perm && opp->p_lckcnt != 0)) { 3974 3975 if (write_perm) { 3976 npp->p_cowcnt++; 3977 ASSERT(opp->p_cowcnt != 0); 3978 opp->p_cowcnt--; 3979 } else { 3980 3981 ASSERT(opp->p_lckcnt != 0); 3982 3983 /* 3984 * We didn't need availrmem decremented if p_lckcnt on 3985 * original page is 1. Here, we are unlocking 3986 * read-only copy belonging to original page and 3987 * are locking a copy belonging to new page. 3988 */ 3989 if (opp->p_lckcnt == 1) 3990 payback = 1; 3991 3992 npp->p_lckcnt++; 3993 opp->p_lckcnt--; 3994 } 3995 } 3996 if (payback) { 3997 mutex_enter(&freemem_lock); 3998 availrmem++; 3999 pages_useclaim--; 4000 mutex_exit(&freemem_lock); 4001 } 4002 page_struct_unlock(opp); 4003 } 4004 4005 /* 4006 * Simple claim adjust functions -- used to support changes in 4007 * claims due to changes in access permissions. Used by segvn_setprot(). 4008 */ 4009 int 4010 page_addclaim(page_t *pp) 4011 { 4012 int r = 0; /* result */ 4013 4014 ASSERT(PAGE_LOCKED(pp)); 4015 4016 page_struct_lock(pp); 4017 ASSERT(pp->p_lckcnt != 0); 4018 4019 if (pp->p_lckcnt == 1) { 4020 if (pp->p_cowcnt < (ushort_t)PAGE_LOCK_MAXIMUM) { 4021 --pp->p_lckcnt; 4022 r = 1; 4023 if (++pp->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) { 4024 cmn_err(CE_WARN, 4025 "COW lock limit reached on pfn 0x%lx", 4026 page_pptonum(pp)); 4027 } 4028 } 4029 } else { 4030 mutex_enter(&freemem_lock); 4031 if ((availrmem > pages_pp_maximum) && 4032 (pp->p_cowcnt < (ushort_t)PAGE_LOCK_MAXIMUM)) { 4033 --availrmem; 4034 ++pages_claimed; 4035 mutex_exit(&freemem_lock); 4036 --pp->p_lckcnt; 4037 r = 1; 4038 if (++pp->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) { 4039 cmn_err(CE_WARN, 4040 "COW lock limit reached on pfn 0x%lx", 4041 page_pptonum(pp)); 4042 } 4043 } else 4044 mutex_exit(&freemem_lock); 4045 } 4046 page_struct_unlock(pp); 4047 return (r); 4048 } 4049 4050 int 4051 page_subclaim(page_t *pp) 4052 { 4053 int r = 0; 4054 4055 ASSERT(PAGE_LOCKED(pp)); 4056 4057 page_struct_lock(pp); 4058 ASSERT(pp->p_cowcnt != 0); 4059 4060 if (pp->p_lckcnt) { 4061 if (pp->p_lckcnt < (ushort_t)PAGE_LOCK_MAXIMUM) { 4062 r = 1; 4063 /* 4064 * for availrmem 4065 */ 4066 mutex_enter(&freemem_lock); 4067 availrmem++; 4068 pages_claimed--; 4069 mutex_exit(&freemem_lock); 4070 4071 pp->p_cowcnt--; 4072 4073 if (++pp->p_lckcnt == (ushort_t)PAGE_LOCK_MAXIMUM) { 4074 cmn_err(CE_WARN, 4075 "Page lock limit reached on pfn 0x%lx", 4076 page_pptonum(pp)); 4077 } 4078 } 4079 } else { 4080 r = 1; 4081 pp->p_cowcnt--; 4082 pp->p_lckcnt++; 4083 } 4084 page_struct_unlock(pp); 4085 return (r); 4086 } 4087 4088 int 4089 page_addclaim_pages(page_t **ppa) 4090 { 4091 4092 pgcnt_t lckpgs = 0, pg_idx; 4093 4094 
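 /* * Two passes under page_llock: the first verifies that every page in * ppa can take another cow lock and counts the pages with p_lckcnt > 1, * which are the ones that need an availrmem charge (as in * page_addclaim()); the second pass commits the lckcnt to cowcnt * transfers. */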
VM_STAT_ADD(pagecnt.pc_addclaim_pages); 4095 4096 mutex_enter(&page_llock); 4097 for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) { 4098 4099 ASSERT(PAGE_LOCKED(ppa[pg_idx])); 4100 ASSERT(ppa[pg_idx]->p_lckcnt != 0); 4101 if (ppa[pg_idx]->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) { 4102 mutex_exit(&page_llock); 4103 return (0); 4104 } 4105 if (ppa[pg_idx]->p_lckcnt > 1) 4106 lckpgs++; 4107 } 4108 4109 if (lckpgs != 0) { 4110 mutex_enter(&freemem_lock); 4111 if (availrmem >= pages_pp_maximum + lckpgs) { 4112 availrmem -= lckpgs; 4113 pages_claimed += lckpgs; 4114 } else { 4115 mutex_exit(&freemem_lock); 4116 mutex_exit(&page_llock); 4117 return (0); 4118 } 4119 mutex_exit(&freemem_lock); 4120 } 4121 4122 for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) { 4123 ppa[pg_idx]->p_lckcnt--; 4124 ppa[pg_idx]->p_cowcnt++; 4125 } 4126 mutex_exit(&page_llock); 4127 return (1); 4128 } 4129 4130 int 4131 page_subclaim_pages(page_t **ppa) 4132 { 4133 pgcnt_t ulckpgs = 0, pg_idx; 4134 4135 VM_STAT_ADD(pagecnt.pc_subclaim_pages); 4136 4137 mutex_enter(&page_llock); 4138 for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) { 4139 4140 ASSERT(PAGE_LOCKED(ppa[pg_idx])); 4141 ASSERT(ppa[pg_idx]->p_cowcnt != 0); 4142 if (ppa[pg_idx]->p_lckcnt == (ushort_t)PAGE_LOCK_MAXIMUM) { 4143 mutex_exit(&page_llock); 4144 return (0); 4145 } 4146 if (ppa[pg_idx]->p_lckcnt != 0) 4147 ulckpgs++; 4148 } 4149 4150 if (ulckpgs != 0) { 4151 mutex_enter(&freemem_lock); 4152 availrmem += ulckpgs; 4153 pages_claimed -= ulckpgs; 4154 mutex_exit(&freemem_lock); 4155 } 4156 4157 for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) { 4158 ppa[pg_idx]->p_cowcnt--; 4159 ppa[pg_idx]->p_lckcnt++; 4160 4161 } 4162 mutex_exit(&page_llock); 4163 return (1); 4164 } 4165 4166 page_t * 4167 page_numtopp(pfn_t pfnum, se_t se) 4168 { 4169 page_t *pp; 4170 4171 retry: 4172 pp = page_numtopp_nolock(pfnum); 4173 if (pp == NULL) { 4174 return ((page_t *)NULL); 4175 } 4176 4177 /* 4178 * Acquire the appropriate lock on the page. 4179 */ 4180 while (!page_lock(pp, se, (kmutex_t *)NULL, P_RECLAIM)) { 4181 if (page_pptonum(pp) != pfnum) 4182 goto retry; 4183 continue; 4184 } 4185 4186 if (page_pptonum(pp) != pfnum) { 4187 page_unlock(pp); 4188 goto retry; 4189 } 4190 4191 return (pp); 4192 } 4193 4194 page_t * 4195 page_numtopp_noreclaim(pfn_t pfnum, se_t se) 4196 { 4197 page_t *pp; 4198 4199 retry: 4200 pp = page_numtopp_nolock(pfnum); 4201 if (pp == NULL) { 4202 return ((page_t *)NULL); 4203 } 4204 4205 /* 4206 * Acquire the appropriate lock on the page. 4207 */ 4208 while (!page_lock(pp, se, (kmutex_t *)NULL, P_NO_RECLAIM)) { 4209 if (page_pptonum(pp) != pfnum) 4210 goto retry; 4211 continue; 4212 } 4213 4214 if (page_pptonum(pp) != pfnum) { 4215 page_unlock(pp); 4216 goto retry; 4217 } 4218 4219 return (pp); 4220 } 4221 4222 /* 4223 * This routine is like page_numtopp, but will only return page structs 4224 * for pages which are ok for loading into hardware using the page struct. 4225 */ 4226 page_t * 4227 page_numtopp_nowait(pfn_t pfnum, se_t se) 4228 { 4229 page_t *pp; 4230 4231 retry: 4232 pp = page_numtopp_nolock(pfnum); 4233 if (pp == NULL) { 4234 return ((page_t *)NULL); 4235 } 4236 4237 /* 4238 * Try to acquire the appropriate lock on the page. 
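 * Unlike page_numtopp(), this variant never blocks: a free page or a * failed trylock simply results in a NULL return, and only an identity * change (pfn mismatch) causes a retry.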
4239 */ 4240 if (PP_ISFREE(pp)) 4241 pp = NULL; 4242 else { 4243 if (!page_trylock(pp, se)) 4244 pp = NULL; 4245 else { 4246 if (page_pptonum(pp) != pfnum) { 4247 page_unlock(pp); 4248 goto retry; 4249 } 4250 if (PP_ISFREE(pp)) { 4251 page_unlock(pp); 4252 pp = NULL; 4253 } 4254 } 4255 } 4256 return (pp); 4257 } 4258 4259 /* 4260 * Returns a count of dirty pages that are in the process 4261 * of being written out. If 'cleanit' is set, try to push the page. 4262 */ 4263 pgcnt_t 4264 page_busy(int cleanit) 4265 { 4266 page_t *page0 = page_first(); 4267 page_t *pp = page0; 4268 pgcnt_t nppbusy = 0; 4269 u_offset_t off; 4270 4271 do { 4272 vnode_t *vp = pp->p_vnode; 4273 4274 /* 4275 * A page is a candidate for syncing if it is: 4276 * 4277 * (a) On neither the freelist nor the cachelist 4278 * (b) Hashed onto a vnode 4279 * (c) Not a kernel page 4280 * (d) Dirty 4281 * (e) Not part of a swapfile 4282 * (f) a page which belongs to a real vnode; eg has a non-null 4283 * v_vfsp pointer. 4284 * (g) Backed by a filesystem which doesn't have a 4285 * stubbed-out sync operation 4286 */ 4287 if (!PP_ISFREE(pp) && vp != NULL && !VN_ISKAS(vp) && 4288 hat_ismod(pp) && !IS_SWAPVP(vp) && vp->v_vfsp != NULL && 4289 vfs_can_sync(vp->v_vfsp)) { 4290 nppbusy++; 4291 vfs_syncprogress(); 4292 4293 if (!cleanit) 4294 continue; 4295 if (!page_trylock(pp, SE_EXCL)) 4296 continue; 4297 4298 if (PP_ISFREE(pp) || vp == NULL || IS_SWAPVP(vp) || 4299 pp->p_lckcnt != 0 || pp->p_cowcnt != 0 || 4300 !(hat_pagesync(pp, 4301 HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD) & P_MOD)) { 4302 page_unlock(pp); 4303 continue; 4304 } 4305 off = pp->p_offset; 4306 VN_HOLD(vp); 4307 page_unlock(pp); 4308 (void) VOP_PUTPAGE(vp, off, PAGESIZE, 4309 B_ASYNC | B_FREE, kcred, NULL); 4310 VN_RELE(vp); 4311 } 4312 } while ((pp = page_next(pp)) != page0); 4313 4314 return (nppbusy); 4315 } 4316 4317 void page_invalidate_pages(void); 4318 4319 /* 4320 * callback handler to vm sub-system 4321 * 4322 * callers make sure no recursive entries to this func. 4323 */ 4324 /*ARGSUSED*/ 4325 boolean_t 4326 callb_vm_cpr(void *arg, int code) 4327 { 4328 if (code == CB_CODE_CPR_CHKPT) 4329 page_invalidate_pages(); 4330 return (B_TRUE); 4331 } 4332 4333 /* 4334 * Invalidate all pages of the system. 4335 * It shouldn't be called until all user page activities are all stopped. 4336 */ 4337 void 4338 page_invalidate_pages() 4339 { 4340 page_t *pp; 4341 page_t *page0; 4342 pgcnt_t nbusypages; 4343 int retry = 0; 4344 const int MAXRETRIES = 4; 4345 #if defined(__sparc) 4346 extern struct vnode prom_ppages; 4347 #endif /* __sparc */ 4348 4349 top: 4350 /* 4351 * Flush dirty pages and destroy the clean ones. 4352 */ 4353 nbusypages = 0; 4354 4355 pp = page0 = page_first(); 4356 do { 4357 struct vnode *vp; 4358 u_offset_t offset; 4359 int mod; 4360 4361 /* 4362 * skip the page if it has no vnode or the page associated 4363 * with the kernel vnode or prom allocated kernel mem. 4364 */ 4365 #if defined(__sparc) 4366 if ((vp = pp->p_vnode) == NULL || VN_ISKAS(vp) || 4367 vp == &prom_ppages) 4368 #else /* x86 doesn't have prom or prom_ppage */ 4369 if ((vp = pp->p_vnode) == NULL || VN_ISKAS(vp)) 4370 #endif /* __sparc */ 4371 continue; 4372 4373 /* 4374 * skip the page which is already free invalidated. 4375 */ 4376 if (PP_ISFREE(pp) && PP_ISAGED(pp)) 4377 continue; 4378 4379 /* 4380 * skip pages that are already locked or can't be "exclusively" 4381 * locked or are already free. 
After we lock the page, check
4382 * the free and age bits again to be sure it's not destroyed
4383 * yet.
4384 * To achieve max. parallelization, we use page_trylock instead
4385 * of page_lock so that we don't get blocked on individual pages
4386 * while we have thousands of other pages to process.
4387 */
4388 if (!page_trylock(pp, SE_EXCL)) {
4389 nbusypages++;
4390 continue;
4391 } else if (PP_ISFREE(pp)) {
4392 if (!PP_ISAGED(pp)) {
4393 page_destroy_free(pp);
4394 } else {
4395 page_unlock(pp);
4396 }
4397 continue;
4398 }
4399 /*
4400 * Is this page involved in some I/O? shared?
4401 *
4402 * The page_struct_lock need not be acquired to
4403 * examine these fields since the page has an
4404 * "exclusive" lock.
4405 */
4406 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
4407 page_unlock(pp);
4408 continue;
4409 }
4410
4411 if (vp->v_type == VCHR) {
4412 panic("vp->v_type == VCHR");
4413 /*NOTREACHED*/
4414 }
4415
4416 if (!page_try_demote_pages(pp)) {
4417 page_unlock(pp);
4418 continue;
4419 }
4420
4421 /*
4422 * Check the modified bit. Leave the bits alone in hardware
4423 * (they will be modified if we do the putpage).
4424 */
4425 mod = (hat_pagesync(pp, HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD)
4426 & P_MOD);
4427 if (mod) {
4428 offset = pp->p_offset;
4429 /*
4430 * Hold the vnode before releasing the page lock
4431 * to prevent it from being freed and re-used by
4432 * some other thread.
4433 */
4434 VN_HOLD(vp);
4435 page_unlock(pp);
4436 /*
4437 * No error return is checked here. Callers such as
4438 * cpr deal with the dirty pages at dump time
4439 * if this putpage fails.
4440 */
4441 (void) VOP_PUTPAGE(vp, offset, PAGESIZE, B_INVAL,
4442 kcred, NULL);
4443 VN_RELE(vp);
4444 } else {
4445 page_destroy(pp, 0);
4446 }
4447 } while ((pp = page_next(pp)) != page0);
4448 if (nbusypages && retry++ < MAXRETRIES) {
4449 delay(1);
4450 goto top;
4451 }
4452 }
4453
4454 /*
4455 * Replace the page "old" with the page "new" on the page hash and vnode lists
4456 *
4457 * The replacement must be done in place, i.e. the equivalent sequence:
4458 *
4459 * vp = old->p_vnode;
4460 * off = old->p_offset;
4461 * page_do_hashout(old)
4462 * page_do_hashin(new, vp, off)
4463 *
4464 * doesn't work, since
4465 * 1) if old is the only page on the vnode, the v_pages list has a window
4466 * where it looks empty. This will break file system assumptions.
4467 * and
4468 * 2) pvn_vplist_dirty() can't deal with pages moving on the v_pages list.
4469 */ 4470 static void 4471 page_do_relocate_hash(page_t *new, page_t *old) 4472 { 4473 page_t **hash_list; 4474 vnode_t *vp = old->p_vnode; 4475 kmutex_t *sep; 4476 4477 ASSERT(PAGE_EXCL(old)); 4478 ASSERT(PAGE_EXCL(new)); 4479 ASSERT(vp != NULL); 4480 ASSERT(MUTEX_HELD(page_vnode_mutex(vp))); 4481 ASSERT(MUTEX_HELD(PAGE_HASH_MUTEX(PAGE_HASH_FUNC(vp, old->p_offset)))); 4482 4483 /* 4484 * First find old page on the page hash list 4485 */ 4486 hash_list = &page_hash[PAGE_HASH_FUNC(vp, old->p_offset)]; 4487 4488 for (;;) { 4489 if (*hash_list == old) 4490 break; 4491 if (*hash_list == NULL) { 4492 panic("page_do_hashout"); 4493 /*NOTREACHED*/ 4494 } 4495 hash_list = &(*hash_list)->p_hash; 4496 } 4497 4498 /* 4499 * update new and replace old with new on the page hash list 4500 */ 4501 new->p_vnode = old->p_vnode; 4502 new->p_offset = old->p_offset; 4503 new->p_hash = old->p_hash; 4504 *hash_list = new; 4505 4506 if ((new->p_vnode->v_flag & VISSWAP) != 0) 4507 PP_SETSWAP(new); 4508 4509 /* 4510 * replace old with new on the vnode's page list 4511 */ 4512 if (old->p_vpnext == old) { 4513 new->p_vpnext = new; 4514 new->p_vpprev = new; 4515 } else { 4516 new->p_vpnext = old->p_vpnext; 4517 new->p_vpprev = old->p_vpprev; 4518 new->p_vpnext->p_vpprev = new; 4519 new->p_vpprev->p_vpnext = new; 4520 } 4521 if (vp->v_pages == old) 4522 vp->v_pages = new; 4523 4524 /* 4525 * clear out the old page 4526 */ 4527 old->p_hash = NULL; 4528 old->p_vpnext = NULL; 4529 old->p_vpprev = NULL; 4530 old->p_vnode = NULL; 4531 PP_CLRSWAP(old); 4532 old->p_offset = (u_offset_t)-1; 4533 page_clr_all_props(old); 4534 4535 /* 4536 * Wake up processes waiting for this page. The page's 4537 * identity has been changed, and is probably not the 4538 * desired page any longer. 4539 */ 4540 sep = page_se_mutex(old); 4541 mutex_enter(sep); 4542 old->p_selock &= ~SE_EWANTED; 4543 if (CV_HAS_WAITERS(&old->p_cv)) 4544 cv_broadcast(&old->p_cv); 4545 mutex_exit(sep); 4546 } 4547 4548 /* 4549 * This function moves the identity of page "pp_old" to page "pp_new". 4550 * Both pages must be locked on entry. "pp_new" is free, has no identity, 4551 * and need not be hashed out from anywhere. 4552 */ 4553 void 4554 page_relocate_hash(page_t *pp_new, page_t *pp_old) 4555 { 4556 vnode_t *vp = pp_old->p_vnode; 4557 u_offset_t off = pp_old->p_offset; 4558 kmutex_t *phm, *vphm; 4559 4560 /* 4561 * Rehash two pages 4562 */ 4563 ASSERT(PAGE_EXCL(pp_old)); 4564 ASSERT(PAGE_EXCL(pp_new)); 4565 ASSERT(vp != NULL); 4566 ASSERT(pp_new->p_vnode == NULL); 4567 4568 /* 4569 * hashout then hashin while holding the mutexes 4570 */ 4571 phm = PAGE_HASH_MUTEX(PAGE_HASH_FUNC(vp, off)); 4572 mutex_enter(phm); 4573 vphm = page_vnode_mutex(vp); 4574 mutex_enter(vphm); 4575 4576 page_do_relocate_hash(pp_new, pp_old); 4577 4578 mutex_exit(vphm); 4579 mutex_exit(phm); 4580 4581 /* 4582 * The page_struct_lock need not be acquired for lckcnt and 4583 * cowcnt since the page has an "exclusive" lock. 4584 */ 4585 ASSERT(pp_new->p_lckcnt == 0); 4586 ASSERT(pp_new->p_cowcnt == 0); 4587 pp_new->p_lckcnt = pp_old->p_lckcnt; 4588 pp_new->p_cowcnt = pp_old->p_cowcnt; 4589 pp_old->p_lckcnt = pp_old->p_cowcnt = 0; 4590 4591 /* The following comment preserved from page_flip(). */ 4592 /* XXX - Do we need to protect fsdata? */ 4593 pp_new->p_fsdata = pp_old->p_fsdata; 4594 } 4595 4596 /* 4597 * Helper routine used to lock all remaining members of a 4598 * large page. The caller is responsible for passing in a locked 4599 * pp. 
If pp is a large page, then it succeeds in locking all the 4600 * remaining constituent pages or it returns with only the 4601 * original page locked. 4602 * 4603 * Returns 1 on success, 0 on failure. 4604 * 4605 * If success is returned this routine guarantees p_szc for all constituent 4606 * pages of a large page pp belongs to can't change. To achieve this we 4607 * recheck szc of pp after locking all constituent pages and retry if szc 4608 * changed (it could only decrease). Since hat_page_demote() needs an EXCL 4609 * lock on one of constituent pages it can't be running after all constituent 4610 * pages are locked. hat_page_demote() with a lock on a constituent page 4611 * outside of this large page (i.e. pp belonged to a larger large page) is 4612 * already done with all constituent pages of pp since the root's p_szc is 4613 * changed last. Therefore no need to synchronize with hat_page_demote() that 4614 * locked a constituent page outside of pp's current large page. 4615 */ 4616 #ifdef DEBUG 4617 uint32_t gpg_trylock_mtbf = 0; 4618 #endif 4619 4620 int 4621 group_page_trylock(page_t *pp, se_t se) 4622 { 4623 page_t *tpp; 4624 pgcnt_t npgs, i, j; 4625 uint_t pszc = pp->p_szc; 4626 4627 #ifdef DEBUG 4628 if (gpg_trylock_mtbf && !(gethrtime() % gpg_trylock_mtbf)) { 4629 return (0); 4630 } 4631 #endif 4632 4633 if (pp != PP_GROUPLEADER(pp, pszc)) { 4634 return (0); 4635 } 4636 4637 retry: 4638 ASSERT(PAGE_LOCKED_SE(pp, se)); 4639 ASSERT(!PP_ISFREE(pp)); 4640 if (pszc == 0) { 4641 return (1); 4642 } 4643 npgs = page_get_pagecnt(pszc); 4644 tpp = pp + 1; 4645 for (i = 1; i < npgs; i++, tpp++) { 4646 if (!page_trylock(tpp, se)) { 4647 tpp = pp + 1; 4648 for (j = 1; j < i; j++, tpp++) { 4649 page_unlock(tpp); 4650 } 4651 return (0); 4652 } 4653 } 4654 if (pp->p_szc != pszc) { 4655 ASSERT(pp->p_szc < pszc); 4656 ASSERT(pp->p_vnode != NULL && !PP_ISKAS(pp) && 4657 !IS_SWAPFSVP(pp->p_vnode)); 4658 tpp = pp + 1; 4659 for (i = 1; i < npgs; i++, tpp++) { 4660 page_unlock(tpp); 4661 } 4662 pszc = pp->p_szc; 4663 goto retry; 4664 } 4665 return (1); 4666 } 4667 4668 void 4669 group_page_unlock(page_t *pp) 4670 { 4671 page_t *tpp; 4672 pgcnt_t npgs, i; 4673 4674 ASSERT(PAGE_LOCKED(pp)); 4675 ASSERT(!PP_ISFREE(pp)); 4676 ASSERT(pp == PP_PAGEROOT(pp)); 4677 npgs = page_get_pagecnt(pp->p_szc); 4678 for (i = 1, tpp = pp + 1; i < npgs; i++, tpp++) { 4679 page_unlock(tpp); 4680 } 4681 } 4682 4683 /* 4684 * returns 4685 * 0 : on success and *nrelocp is number of relocated PAGESIZE pages 4686 * ERANGE : this is not a base page 4687 * EBUSY : failure to get locks on the page/pages 4688 * ENOMEM : failure to obtain replacement pages 4689 * EAGAIN : OBP has not yet completed its boot-time handoff to the kernel 4690 * EIO : An error occurred while trying to copy the page data 4691 * 4692 * Return with all constituent members of target and replacement 4693 * SE_EXCL locked. It is the callers responsibility to drop the 4694 * locks. 
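 *
 * For example, when grouplock is nonzero the routine itself does the
 * equivalent of
 *
 *	if (!group_page_trylock(targ, SE_EXCL))
 *		return (EBUSY);
 *
 * so a caller that already holds every constituent page SE_EXCL can pass
 * grouplock as 0 and skip that extra locking pass.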
4695 */ 4696 int 4697 do_page_relocate( 4698 page_t **target, 4699 page_t **replacement, 4700 int grouplock, 4701 spgcnt_t *nrelocp, 4702 lgrp_t *lgrp) 4703 { 4704 page_t *first_repl; 4705 page_t *repl; 4706 page_t *targ; 4707 page_t *pl = NULL; 4708 uint_t ppattr; 4709 pfn_t pfn, repl_pfn; 4710 uint_t szc; 4711 spgcnt_t npgs, i; 4712 int repl_contig = 0; 4713 uint_t flags = 0; 4714 spgcnt_t dofree = 0; 4715 4716 *nrelocp = 0; 4717 4718 #if defined(__sparc) 4719 /* 4720 * We need to wait till OBP has completed 4721 * its boot-time handoff of its resources to the kernel 4722 * before we allow page relocation 4723 */ 4724 if (page_relocate_ready == 0) { 4725 return (EAGAIN); 4726 } 4727 #endif 4728 4729 /* 4730 * If this is not a base page, 4731 * just return with 0x0 pages relocated. 4732 */ 4733 targ = *target; 4734 ASSERT(PAGE_EXCL(targ)); 4735 ASSERT(!PP_ISFREE(targ)); 4736 szc = targ->p_szc; 4737 ASSERT(szc < mmu_page_sizes); 4738 VM_STAT_ADD(vmm_vmstats.ppr_reloc[szc]); 4739 pfn = targ->p_pagenum; 4740 if (pfn != PFN_BASE(pfn, szc)) { 4741 VM_STAT_ADD(vmm_vmstats.ppr_relocnoroot[szc]); 4742 return (ERANGE); 4743 } 4744 4745 if ((repl = *replacement) != NULL && repl->p_szc >= szc) { 4746 repl_pfn = repl->p_pagenum; 4747 if (repl_pfn != PFN_BASE(repl_pfn, szc)) { 4748 VM_STAT_ADD(vmm_vmstats.ppr_reloc_replnoroot[szc]); 4749 return (ERANGE); 4750 } 4751 repl_contig = 1; 4752 } 4753 4754 /* 4755 * We must lock all members of this large page or we cannot 4756 * relocate any part of it. 4757 */ 4758 if (grouplock != 0 && !group_page_trylock(targ, SE_EXCL)) { 4759 VM_STAT_ADD(vmm_vmstats.ppr_relocnolock[targ->p_szc]); 4760 return (EBUSY); 4761 } 4762 4763 /* 4764 * reread szc it could have been decreased before 4765 * group_page_trylock() was done. 4766 */ 4767 szc = targ->p_szc; 4768 ASSERT(szc < mmu_page_sizes); 4769 VM_STAT_ADD(vmm_vmstats.ppr_reloc[szc]); 4770 ASSERT(pfn == PFN_BASE(pfn, szc)); 4771 4772 npgs = page_get_pagecnt(targ->p_szc); 4773 4774 if (repl == NULL) { 4775 dofree = npgs; /* Size of target page in MMU pages */ 4776 if (!page_create_wait(dofree, 0)) { 4777 if (grouplock != 0) { 4778 group_page_unlock(targ); 4779 } 4780 VM_STAT_ADD(vmm_vmstats.ppr_relocnomem[szc]); 4781 return (ENOMEM); 4782 } 4783 4784 /* 4785 * seg kmem pages require that the target and replacement 4786 * page be the same pagesize. 4787 */ 4788 flags = (VN_ISKAS(targ->p_vnode)) ? 
PGR_SAMESZC : 0; 4789 repl = page_get_replacement_page(targ, lgrp, flags); 4790 if (repl == NULL) { 4791 if (grouplock != 0) { 4792 group_page_unlock(targ); 4793 } 4794 page_create_putback(dofree); 4795 VM_STAT_ADD(vmm_vmstats.ppr_relocnomem[szc]); 4796 return (ENOMEM); 4797 } 4798 } 4799 #ifdef DEBUG 4800 else { 4801 ASSERT(PAGE_LOCKED(repl)); 4802 } 4803 #endif /* DEBUG */ 4804 4805 #if defined(__sparc) 4806 /* 4807 * Let hat_page_relocate() complete the relocation if it's kernel page 4808 */ 4809 if (VN_ISKAS(targ->p_vnode)) { 4810 *replacement = repl; 4811 if (hat_page_relocate(target, replacement, nrelocp) != 0) { 4812 if (grouplock != 0) { 4813 group_page_unlock(targ); 4814 } 4815 if (dofree) { 4816 *replacement = NULL; 4817 page_free_replacement_page(repl); 4818 page_create_putback(dofree); 4819 } 4820 VM_STAT_ADD(vmm_vmstats.ppr_krelocfail[szc]); 4821 return (EAGAIN); 4822 } 4823 VM_STAT_ADD(vmm_vmstats.ppr_relocok[szc]); 4824 return (0); 4825 } 4826 #else 4827 #if defined(lint) 4828 dofree = dofree; 4829 #endif 4830 #endif 4831 4832 first_repl = repl; 4833 4834 for (i = 0; i < npgs; i++) { 4835 ASSERT(PAGE_EXCL(targ)); 4836 ASSERT(targ->p_slckcnt == 0); 4837 ASSERT(repl->p_slckcnt == 0); 4838 4839 (void) hat_pageunload(targ, HAT_FORCE_PGUNLOAD); 4840 4841 ASSERT(hat_page_getshare(targ) == 0); 4842 ASSERT(!PP_ISFREE(targ)); 4843 ASSERT(targ->p_pagenum == (pfn + i)); 4844 ASSERT(repl_contig == 0 || 4845 repl->p_pagenum == (repl_pfn + i)); 4846 4847 /* 4848 * Copy the page contents and attributes then 4849 * relocate the page in the page hash. 4850 */ 4851 if (ppcopy(targ, repl) == 0) { 4852 targ = *target; 4853 repl = first_repl; 4854 VM_STAT_ADD(vmm_vmstats.ppr_copyfail); 4855 if (grouplock != 0) { 4856 group_page_unlock(targ); 4857 } 4858 if (dofree) { 4859 *replacement = NULL; 4860 page_free_replacement_page(repl); 4861 page_create_putback(dofree); 4862 } 4863 return (EIO); 4864 } 4865 4866 targ++; 4867 if (repl_contig != 0) { 4868 repl++; 4869 } else { 4870 repl = repl->p_next; 4871 } 4872 } 4873 4874 repl = first_repl; 4875 targ = *target; 4876 4877 for (i = 0; i < npgs; i++) { 4878 ppattr = hat_page_getattr(targ, (P_MOD | P_REF | P_RO)); 4879 page_clr_all_props(repl); 4880 page_set_props(repl, ppattr); 4881 page_relocate_hash(repl, targ); 4882 4883 ASSERT(hat_page_getshare(targ) == 0); 4884 ASSERT(hat_page_getshare(repl) == 0); 4885 /* 4886 * Now clear the props on targ, after the 4887 * page_relocate_hash(), they no longer 4888 * have any meaning. 4889 */ 4890 page_clr_all_props(targ); 4891 ASSERT(targ->p_next == targ); 4892 ASSERT(targ->p_prev == targ); 4893 page_list_concat(&pl, &targ); 4894 4895 targ++; 4896 if (repl_contig != 0) { 4897 repl++; 4898 } else { 4899 repl = repl->p_next; 4900 } 4901 } 4902 /* assert that we have come full circle with repl */ 4903 ASSERT(repl_contig == 1 || first_repl == repl); 4904 4905 *target = pl; 4906 if (*replacement == NULL) { 4907 ASSERT(first_repl == repl); 4908 *replacement = repl; 4909 } 4910 VM_STAT_ADD(vmm_vmstats.ppr_relocok[szc]); 4911 *nrelocp = npgs; 4912 return (0); 4913 } 4914 /* 4915 * On success returns 0 and *nrelocp the number of PAGESIZE pages relocated. 
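 *
 * A minimal usage sketch (tpp is assumed to be an SE_EXCL locked, non-free
 * szc 0 page, rpp may be passed in as NULL to let the routine pick the
 * replacement, and the names are illustrative only):
 *
 *	spgcnt_t npgs;
 *
 *	if (page_relocate(&tpp, &rpp, 0, 1, &npgs, NULL) == 0)
 *		page_unlock(rpp);
 *
 * With freetarget set the old frame behind tpp is freed for the caller,
 * while the replacement page headed by rpp keeps the original identity and
 * comes back SE_EXCL locked, which is the pattern page_relocate_cage()
 * below relies on.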
4916 */ 4917 int 4918 page_relocate( 4919 page_t **target, 4920 page_t **replacement, 4921 int grouplock, 4922 int freetarget, 4923 spgcnt_t *nrelocp, 4924 lgrp_t *lgrp) 4925 { 4926 spgcnt_t ret; 4927 4928 /* do_page_relocate returns 0 on success or errno value */ 4929 ret = do_page_relocate(target, replacement, grouplock, nrelocp, lgrp); 4930 4931 if (ret != 0 || freetarget == 0) { 4932 return (ret); 4933 } 4934 if (*nrelocp == 1) { 4935 ASSERT(*target != NULL); 4936 page_free(*target, 1); 4937 } else { 4938 page_t *tpp = *target; 4939 uint_t szc = tpp->p_szc; 4940 pgcnt_t npgs = page_get_pagecnt(szc); 4941 ASSERT(npgs > 1); 4942 ASSERT(szc != 0); 4943 do { 4944 ASSERT(PAGE_EXCL(tpp)); 4945 ASSERT(!hat_page_is_mapped(tpp)); 4946 ASSERT(tpp->p_szc == szc); 4947 PP_SETFREE(tpp); 4948 PP_SETAGED(tpp); 4949 npgs--; 4950 } while ((tpp = tpp->p_next) != *target); 4951 ASSERT(npgs == 0); 4952 page_list_add_pages(*target, 0); 4953 npgs = page_get_pagecnt(szc); 4954 page_create_putback(npgs); 4955 } 4956 return (ret); 4957 } 4958 4959 /* 4960 * it is up to the caller to deal with pcf accounting. 4961 */ 4962 void 4963 page_free_replacement_page(page_t *pplist) 4964 { 4965 page_t *pp; 4966 4967 while (pplist != NULL) { 4968 /* 4969 * pp_targ is a linked list. 4970 */ 4971 pp = pplist; 4972 if (pp->p_szc == 0) { 4973 page_sub(&pplist, pp); 4974 page_clr_all_props(pp); 4975 PP_SETFREE(pp); 4976 PP_SETAGED(pp); 4977 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL); 4978 page_unlock(pp); 4979 VM_STAT_ADD(pagecnt.pc_free_replacement_page[0]); 4980 } else { 4981 spgcnt_t curnpgs = page_get_pagecnt(pp->p_szc); 4982 page_t *tpp; 4983 page_list_break(&pp, &pplist, curnpgs); 4984 tpp = pp; 4985 do { 4986 ASSERT(PAGE_EXCL(tpp)); 4987 ASSERT(!hat_page_is_mapped(tpp)); 4988 page_clr_all_props(pp); 4989 PP_SETFREE(tpp); 4990 PP_SETAGED(tpp); 4991 } while ((tpp = tpp->p_next) != pp); 4992 page_list_add_pages(pp, 0); 4993 VM_STAT_ADD(pagecnt.pc_free_replacement_page[1]); 4994 } 4995 } 4996 } 4997 4998 /* 4999 * Relocate target to non-relocatable replacement page. 5000 */ 5001 int 5002 page_relocate_cage(page_t **target, page_t **replacement) 5003 { 5004 page_t *tpp, *rpp; 5005 spgcnt_t pgcnt, npgs; 5006 int result; 5007 5008 tpp = *target; 5009 5010 ASSERT(PAGE_EXCL(tpp)); 5011 ASSERT(tpp->p_szc == 0); 5012 5013 pgcnt = btop(page_get_pagesize(tpp->p_szc)); 5014 5015 do { 5016 (void) page_create_wait(pgcnt, PG_WAIT | PG_NORELOC); 5017 rpp = page_get_replacement_page(tpp, NULL, PGR_NORELOC); 5018 if (rpp == NULL) { 5019 page_create_putback(pgcnt); 5020 kcage_cageout_wakeup(); 5021 } 5022 } while (rpp == NULL); 5023 5024 ASSERT(PP_ISNORELOC(rpp)); 5025 5026 result = page_relocate(&tpp, &rpp, 0, 1, &npgs, NULL); 5027 5028 if (result == 0) { 5029 *replacement = rpp; 5030 if (pgcnt != npgs) 5031 panic("page_relocate_cage: partial relocation"); 5032 } 5033 5034 return (result); 5035 } 5036 5037 /* 5038 * Release the page lock on a page, place on cachelist 5039 * tail if no longer mapped. Caller can let us know if 5040 * the page is known to be clean. 
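 *
 * A hypothetical caller that wants dirty pages pushed rather than kept
 * could act on the return value as follows (it is assumed the caller holds
 * its own reference on vp and captured off beforehand, since the page lock
 * is always gone by the time page_release() returns):
 *
 *	switch (page_release(pp, 1)) {
 *	case PGREL_MOD:
 *		(void) VOP_PUTPAGE(vp, off, PAGESIZE, B_ASYNC | B_FREE,
 *		    kcred, NULL);
 *		break;
 *	case PGREL_CLEAN:
 *	case PGREL_NOTREL:
 *		break;
 *	}
 *
 * This is only a sketch; the exact policy belongs to the caller.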
5041 */ 5042 int 5043 page_release(page_t *pp, int checkmod) 5044 { 5045 int status; 5046 5047 ASSERT(PAGE_LOCKED(pp) && !PP_ISFREE(pp) && 5048 (pp->p_vnode != NULL)); 5049 5050 if (!hat_page_is_mapped(pp) && !IS_SWAPVP(pp->p_vnode) && 5051 ((PAGE_SHARED(pp) && page_tryupgrade(pp)) || PAGE_EXCL(pp)) && 5052 pp->p_lckcnt == 0 && pp->p_cowcnt == 0 && 5053 !hat_page_is_mapped(pp)) { 5054 5055 /* 5056 * If page is modified, unlock it 5057 * 5058 * (p_nrm & P_MOD) bit has the latest stuff because: 5059 * (1) We found that this page doesn't have any mappings 5060 * _after_ holding SE_EXCL and 5061 * (2) We didn't drop SE_EXCL lock after the check in (1) 5062 */ 5063 if (checkmod && hat_ismod(pp)) { 5064 page_unlock(pp); 5065 status = PGREL_MOD; 5066 } else { 5067 /*LINTED: constant in conditional context*/ 5068 VN_DISPOSE(pp, B_FREE, 0, kcred); 5069 status = PGREL_CLEAN; 5070 } 5071 } else { 5072 page_unlock(pp); 5073 status = PGREL_NOTREL; 5074 } 5075 return (status); 5076 } 5077 5078 /* 5079 * Given a constituent page, try to demote the large page on the freelist. 5080 * 5081 * Returns nonzero if the page could be demoted successfully. Returns with 5082 * the constituent page still locked. 5083 */ 5084 int 5085 page_try_demote_free_pages(page_t *pp) 5086 { 5087 page_t *rootpp = pp; 5088 pfn_t pfn = page_pptonum(pp); 5089 spgcnt_t npgs; 5090 uint_t szc = pp->p_szc; 5091 5092 ASSERT(PP_ISFREE(pp)); 5093 ASSERT(PAGE_EXCL(pp)); 5094 5095 /* 5096 * Adjust rootpp and lock it, if `pp' is not the base 5097 * constituent page. 5098 */ 5099 npgs = page_get_pagecnt(pp->p_szc); 5100 if (npgs == 1) { 5101 return (0); 5102 } 5103 5104 if (!IS_P2ALIGNED(pfn, npgs)) { 5105 pfn = P2ALIGN(pfn, npgs); 5106 rootpp = page_numtopp_nolock(pfn); 5107 } 5108 5109 if (pp != rootpp && !page_trylock(rootpp, SE_EXCL)) { 5110 return (0); 5111 } 5112 5113 if (rootpp->p_szc != szc) { 5114 if (pp != rootpp) 5115 page_unlock(rootpp); 5116 return (0); 5117 } 5118 5119 page_demote_free_pages(rootpp); 5120 5121 if (pp != rootpp) 5122 page_unlock(rootpp); 5123 5124 ASSERT(PP_ISFREE(pp)); 5125 ASSERT(PAGE_EXCL(pp)); 5126 return (1); 5127 } 5128 5129 /* 5130 * Given a constituent page, try to demote the large page. 5131 * 5132 * Returns nonzero if the page could be demoted successfully. Returns with 5133 * the constituent page still locked. 5134 */ 5135 int 5136 page_try_demote_pages(page_t *pp) 5137 { 5138 page_t *tpp, *rootpp = pp; 5139 pfn_t pfn = page_pptonum(pp); 5140 spgcnt_t i, npgs; 5141 uint_t szc = pp->p_szc; 5142 vnode_t *vp = pp->p_vnode; 5143 5144 ASSERT(PAGE_EXCL(pp)); 5145 5146 VM_STAT_ADD(pagecnt.pc_try_demote_pages[0]); 5147 5148 if (pp->p_szc == 0) { 5149 VM_STAT_ADD(pagecnt.pc_try_demote_pages[1]); 5150 return (1); 5151 } 5152 5153 if (vp != NULL && !IS_SWAPFSVP(vp) && !VN_ISKAS(vp)) { 5154 VM_STAT_ADD(pagecnt.pc_try_demote_pages[2]); 5155 page_demote_vp_pages(pp); 5156 ASSERT(pp->p_szc == 0); 5157 return (1); 5158 } 5159 5160 /* 5161 * Adjust rootpp if passed in is not the base 5162 * constituent page. 5163 */ 5164 npgs = page_get_pagecnt(pp->p_szc); 5165 ASSERT(npgs > 1); 5166 if (!IS_P2ALIGNED(pfn, npgs)) { 5167 pfn = P2ALIGN(pfn, npgs); 5168 rootpp = page_numtopp_nolock(pfn); 5169 VM_STAT_ADD(pagecnt.pc_try_demote_pages[3]); 5170 ASSERT(rootpp->p_vnode != NULL); 5171 ASSERT(rootpp->p_szc == szc); 5172 } 5173 5174 /* 5175 * We can't demote kernel pages since we can't hat_unload() 5176 * the mappings. 
5177 */ 5178 if (VN_ISKAS(rootpp->p_vnode)) 5179 return (0); 5180 5181 /* 5182 * Attempt to lock all constituent pages except the page passed 5183 * in since it's already locked. 5184 */ 5185 for (tpp = rootpp, i = 0; i < npgs; i++, tpp++) { 5186 ASSERT(!PP_ISFREE(tpp)); 5187 ASSERT(tpp->p_vnode != NULL); 5188 5189 if (tpp != pp && !page_trylock(tpp, SE_EXCL)) 5190 break; 5191 ASSERT(tpp->p_szc == rootpp->p_szc); 5192 ASSERT(page_pptonum(tpp) == page_pptonum(rootpp) + i); 5193 } 5194 5195 /* 5196 * If we failed to lock them all then unlock what we have 5197 * locked so far and bail. 5198 */ 5199 if (i < npgs) { 5200 tpp = rootpp; 5201 while (i-- > 0) { 5202 if (tpp != pp) 5203 page_unlock(tpp); 5204 tpp++; 5205 } 5206 VM_STAT_ADD(pagecnt.pc_try_demote_pages[4]); 5207 return (0); 5208 } 5209 5210 for (tpp = rootpp, i = 0; i < npgs; i++, tpp++) { 5211 ASSERT(PAGE_EXCL(tpp)); 5212 ASSERT(tpp->p_slckcnt == 0); 5213 (void) hat_pageunload(tpp, HAT_FORCE_PGUNLOAD); 5214 tpp->p_szc = 0; 5215 } 5216 5217 /* 5218 * Unlock all pages except the page passed in. 5219 */ 5220 for (tpp = rootpp, i = 0; i < npgs; i++, tpp++) { 5221 ASSERT(!hat_page_is_mapped(tpp)); 5222 if (tpp != pp) 5223 page_unlock(tpp); 5224 } 5225 5226 VM_STAT_ADD(pagecnt.pc_try_demote_pages[5]); 5227 return (1); 5228 } 5229 5230 /* 5231 * Called by page_free() and page_destroy() to demote the page size code 5232 * (p_szc) to 0 (since we can't just put a single PAGESIZE page with non zero 5233 * p_szc on free list, neither can we just clear p_szc of a single page_t 5234 * within a large page since it will break other code that relies on p_szc 5235 * being the same for all page_t's of a large page). Anonymous pages should 5236 * never end up here because anon_map_getpages() cannot deal with p_szc 5237 * changes after a single constituent page is locked. While anonymous or 5238 * kernel large pages are demoted or freed the entire large page at a time 5239 * with all constituent pages locked EXCL for the file system pages we 5240 * have to be able to demote a large page (i.e. decrease all constituent pages 5241 * p_szc) with only just an EXCL lock on one of constituent pages. The reason 5242 * we can easily deal with anonymous page demotion the entire large page at a 5243 * time is that those operation originate at address space level and concern 5244 * the entire large page region with actual demotion only done when pages are 5245 * not shared with any other processes (therefore we can always get EXCL lock 5246 * on all anonymous constituent pages after clearing segment page 5247 * cache). However file system pages can be truncated or invalidated at a 5248 * PAGESIZE level from the file system side and end up in page_free() or 5249 * page_destroy() (we also allow only part of the large page to be SOFTLOCKed 5250 * and therefore pageout should be able to demote a large page by EXCL locking 5251 * any constituent page that is not under SOFTLOCK). In those cases we cannot 5252 * rely on being able to lock EXCL all constituent pages. 5253 * 5254 * To prevent szc changes on file system pages one has to lock all constituent 5255 * pages at least SHARED (or call page_szc_lock()). The only subsystem that 5256 * doesn't rely on locking all constituent pages (or using page_szc_lock()) to 5257 * prevent szc changes is hat layer that uses its own page level mlist 5258 * locks. hat assumes that szc doesn't change after mlist lock for a page is 5259 * taken. 
Therefore we need to change szc under hat level locks if we only
5260 * have an EXCL lock on a single constituent page and hat still references any
5261 * of constituent pages. (Note we can't "ignore" the hat layer by simply
5262 * hat_pageunload() all constituent pages without having EXCL locks on all of
5263 * constituent pages). We use the hat_page_demote() call to safely demote szc of
5264 * all constituent pages under hat locks when we only have an EXCL lock on one
5265 * of constituent pages.
5266 *
5267 * This routine calls page_szc_lock() before calling hat_page_demote() to
5268 * allow segvn in one special case not to lock all constituent pages SHARED
5269 * before calling hat_memload_array() that relies on p_szc not changing even
5270 * before hat level mlist lock is taken. In that case segvn uses
5271 * page_szc_lock() to prevent hat_page_demote() changing p_szc values.
5272 *
5273 * Anonymous or kernel page demotion still has to lock all pages exclusively
5274 * and do hat_pageunload() on all constituent pages before demoting the page;
5275 * therefore there's no need for anonymous or kernel page demotion to use
5276 * the hat_page_demote() mechanism.
5277 *
5278 * hat_page_demote() removes all large mappings that map pp and then decreases
5279 * p_szc starting from the last constituent page of the large page. Working
5280 * from the tail of a large page in decreasing pfn order allows one looking at
5281 * the root page to know that hat_page_demote() is done for the root's szc area.
5282 * e.g. if a root page has szc 1, one knows it only has to lock all constituent
5283 * pages within the szc 1 area to prevent szc changes because hat_page_demote()
5284 * that started on this page when it had szc > 1 is done for this szc 1 area.
5285 *
5286 * We are guaranteed that all constituent pages of pp's large page belong to
5287 * the same vnode, with consecutive offsets increasing in the direction of
5288 * the pfn, i.e. the identity of constituent pages can't change until their
5289 * p_szc is decreased. Therefore it's safe for hat_page_demote() to remove
5290 * large mappings to pp even though we don't lock any constituent page except
5291 * pp (i.e. we won't unload e.g. a kernel locked page).
5292 */ 5293 static void 5294 page_demote_vp_pages(page_t *pp) 5295 { 5296 kmutex_t *mtx; 5297 5298 ASSERT(PAGE_EXCL(pp)); 5299 ASSERT(!PP_ISFREE(pp)); 5300 ASSERT(pp->p_vnode != NULL); 5301 ASSERT(!IS_SWAPFSVP(pp->p_vnode)); 5302 ASSERT(!PP_ISKAS(pp)); 5303 5304 VM_STAT_ADD(pagecnt.pc_demote_pages[0]); 5305 5306 mtx = page_szc_lock(pp); 5307 if (mtx != NULL) { 5308 hat_page_demote(pp); 5309 mutex_exit(mtx); 5310 } 5311 ASSERT(pp->p_szc == 0); 5312 } 5313 5314 /* 5315 * Mark any existing pages for migration in the given range 5316 */ 5317 void 5318 page_mark_migrate(struct seg *seg, caddr_t addr, size_t len, 5319 struct anon_map *amp, ulong_t anon_index, vnode_t *vp, 5320 u_offset_t vnoff, int rflag) 5321 { 5322 struct anon *ap; 5323 vnode_t *curvp; 5324 lgrp_t *from; 5325 pgcnt_t i; 5326 pgcnt_t nlocked; 5327 u_offset_t off; 5328 pfn_t pfn; 5329 size_t pgsz; 5330 size_t segpgsz; 5331 pgcnt_t pages; 5332 uint_t pszc; 5333 page_t **ppa; 5334 pgcnt_t ppa_nentries; 5335 page_t *pp; 5336 caddr_t va; 5337 ulong_t an_idx; 5338 anon_sync_obj_t cookie; 5339 5340 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 5341 5342 /* 5343 * Don't do anything if don't need to do lgroup optimizations 5344 * on this system 5345 */ 5346 if (!lgrp_optimizations()) 5347 return; 5348 5349 /* 5350 * Align address and length to (potentially large) page boundary 5351 */ 5352 segpgsz = page_get_pagesize(seg->s_szc); 5353 addr = (caddr_t)P2ALIGN((uintptr_t)addr, segpgsz); 5354 if (rflag) 5355 len = P2ROUNDUP(len, segpgsz); 5356 5357 /* 5358 * Allocate page array to accommodate largest page size 5359 */ 5360 pgsz = page_get_pagesize(page_num_pagesizes() - 1); 5361 ppa_nentries = btop(pgsz); 5362 ppa = kmem_zalloc(ppa_nentries * sizeof (page_t *), KM_SLEEP); 5363 5364 /* 5365 * Do one (large) page at a time 5366 */ 5367 va = addr; 5368 while (va < addr + len) { 5369 /* 5370 * Lookup (root) page for vnode and offset corresponding to 5371 * this virtual address 5372 * Try anonmap first since there may be copy-on-write 5373 * pages, but initialize vnode pointer and offset using 5374 * vnode arguments just in case there isn't an amp. 
5375 */ 5376 curvp = vp; 5377 off = vnoff + va - seg->s_base; 5378 if (amp) { 5379 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 5380 an_idx = anon_index + seg_page(seg, va); 5381 anon_array_enter(amp, an_idx, &cookie); 5382 ap = anon_get_ptr(amp->ahp, an_idx); 5383 if (ap) 5384 swap_xlate(ap, &curvp, &off); 5385 anon_array_exit(&cookie); 5386 ANON_LOCK_EXIT(&->a_rwlock); 5387 } 5388 5389 pp = NULL; 5390 if (curvp) 5391 pp = page_lookup(curvp, off, SE_SHARED); 5392 5393 /* 5394 * If there isn't a page at this virtual address, 5395 * skip to next page 5396 */ 5397 if (pp == NULL) { 5398 va += PAGESIZE; 5399 continue; 5400 } 5401 5402 /* 5403 * Figure out which lgroup this page is in for kstats 5404 */ 5405 pfn = page_pptonum(pp); 5406 from = lgrp_pfn_to_lgrp(pfn); 5407 5408 /* 5409 * Get page size, and round up and skip to next page boundary 5410 * if unaligned address 5411 */ 5412 pszc = pp->p_szc; 5413 pgsz = page_get_pagesize(pszc); 5414 pages = btop(pgsz); 5415 if (!IS_P2ALIGNED(va, pgsz) || 5416 !IS_P2ALIGNED(pfn, pages) || 5417 pgsz > segpgsz) { 5418 pgsz = MIN(pgsz, segpgsz); 5419 page_unlock(pp); 5420 i = btop(P2END((uintptr_t)va, pgsz) - 5421 (uintptr_t)va); 5422 va = (caddr_t)P2END((uintptr_t)va, pgsz); 5423 lgrp_stat_add(from->lgrp_id, LGRP_PMM_FAIL_PGS, i); 5424 continue; 5425 } 5426 5427 /* 5428 * Upgrade to exclusive lock on page 5429 */ 5430 if (!page_tryupgrade(pp)) { 5431 page_unlock(pp); 5432 va += pgsz; 5433 lgrp_stat_add(from->lgrp_id, LGRP_PMM_FAIL_PGS, 5434 btop(pgsz)); 5435 continue; 5436 } 5437 5438 /* 5439 * Remember pages locked exclusively and how many 5440 */ 5441 ppa[0] = pp; 5442 nlocked = 1; 5443 5444 /* 5445 * Lock constituent pages if this is large page 5446 */ 5447 if (pages > 1) { 5448 /* 5449 * Lock all constituents except root page, since it 5450 * should be locked already. 5451 */ 5452 for (i = 1; i < pages; i++) { 5453 pp++; 5454 if (!page_trylock(pp, SE_EXCL)) { 5455 break; 5456 } 5457 if (PP_ISFREE(pp) || 5458 pp->p_szc != pszc) { 5459 /* 5460 * hat_page_demote() raced in with us. 5461 */ 5462 ASSERT(!IS_SWAPFSVP(curvp)); 5463 page_unlock(pp); 5464 break; 5465 } 5466 ppa[nlocked] = pp; 5467 nlocked++; 5468 } 5469 } 5470 5471 /* 5472 * If all constituent pages couldn't be locked, 5473 * unlock pages locked so far and skip to next page. 5474 */ 5475 if (nlocked != pages) { 5476 for (i = 0; i < nlocked; i++) 5477 page_unlock(ppa[i]); 5478 va += pgsz; 5479 lgrp_stat_add(from->lgrp_id, LGRP_PMM_FAIL_PGS, 5480 btop(pgsz)); 5481 continue; 5482 } 5483 5484 /* 5485 * hat_page_demote() can no longer happen 5486 * since last cons page had the right p_szc after 5487 * all cons pages were locked. all cons pages 5488 * should now have the same p_szc. 
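 *
 * (This is the tail-first ordering guarantee described above
 * page_demote_vp_pages(): hat_page_demote() lowers p_szc starting from the
 * last constituent page, so once that page has been locked and still shows
 * the expected p_szc, no demotion of this large page can still be in
 * flight.)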
5489 */ 5490 5491 /* 5492 * All constituent pages locked successfully, so mark 5493 * large page for migration and unload the mappings of 5494 * constituent pages, so a fault will occur on any part of the 5495 * large page 5496 */ 5497 PP_SETMIGRATE(ppa[0]); 5498 for (i = 0; i < nlocked; i++) { 5499 pp = ppa[i]; 5500 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 5501 ASSERT(hat_page_getshare(pp) == 0); 5502 page_unlock(pp); 5503 } 5504 lgrp_stat_add(from->lgrp_id, LGRP_PMM_PGS, nlocked); 5505 5506 va += pgsz; 5507 } 5508 kmem_free(ppa, ppa_nentries * sizeof (page_t *)); 5509 } 5510 5511 /* 5512 * Migrate any pages that have been marked for migration in the given range 5513 */ 5514 void 5515 page_migrate( 5516 struct seg *seg, 5517 caddr_t addr, 5518 page_t **ppa, 5519 pgcnt_t npages) 5520 { 5521 lgrp_t *from; 5522 lgrp_t *to; 5523 page_t *newpp; 5524 page_t *pp; 5525 pfn_t pfn; 5526 size_t pgsz; 5527 spgcnt_t page_cnt; 5528 spgcnt_t i; 5529 uint_t pszc; 5530 5531 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 5532 5533 while (npages > 0) { 5534 pp = *ppa; 5535 pszc = pp->p_szc; 5536 pgsz = page_get_pagesize(pszc); 5537 page_cnt = btop(pgsz); 5538 5539 /* 5540 * Check to see whether this page is marked for migration 5541 * 5542 * Assume that root page of large page is marked for 5543 * migration and none of the other constituent pages 5544 * are marked. This really simplifies clearing the 5545 * migrate bit by not having to clear it from each 5546 * constituent page. 5547 * 5548 * note we don't want to relocate an entire large page if 5549 * someone is only using one subpage. 5550 */ 5551 if (npages < page_cnt) 5552 break; 5553 5554 /* 5555 * Is it marked for migration? 5556 */ 5557 if (!PP_ISMIGRATE(pp)) 5558 goto next; 5559 5560 /* 5561 * Determine lgroups that page is being migrated between 5562 */ 5563 pfn = page_pptonum(pp); 5564 if (!IS_P2ALIGNED(pfn, page_cnt)) { 5565 break; 5566 } 5567 from = lgrp_pfn_to_lgrp(pfn); 5568 to = lgrp_mem_choose(seg, addr, pgsz); 5569 5570 /* 5571 * Need to get exclusive lock's to migrate 5572 */ 5573 for (i = 0; i < page_cnt; i++) { 5574 ASSERT(PAGE_LOCKED(ppa[i])); 5575 if (page_pptonum(ppa[i]) != pfn + i || 5576 ppa[i]->p_szc != pszc) { 5577 break; 5578 } 5579 if (!page_tryupgrade(ppa[i])) { 5580 lgrp_stat_add(from->lgrp_id, 5581 LGRP_PM_FAIL_LOCK_PGS, 5582 page_cnt); 5583 break; 5584 } 5585 5586 /* 5587 * Check to see whether we are trying to migrate 5588 * page to lgroup where it is allocated already. 5589 * If so, clear the migrate bit and skip to next 5590 * page. 5591 */ 5592 if (i == 0 && to == from) { 5593 PP_CLRMIGRATE(ppa[0]); 5594 page_downgrade(ppa[0]); 5595 goto next; 5596 } 5597 } 5598 5599 /* 5600 * If all constituent pages couldn't be locked, 5601 * unlock pages locked so far and skip to next page. 
5602 */ 5603 if (i != page_cnt) { 5604 while (--i != -1) { 5605 page_downgrade(ppa[i]); 5606 } 5607 goto next; 5608 } 5609 5610 (void) page_create_wait(page_cnt, PG_WAIT); 5611 newpp = page_get_replacement_page(pp, to, PGR_SAMESZC); 5612 if (newpp == NULL) { 5613 page_create_putback(page_cnt); 5614 for (i = 0; i < page_cnt; i++) { 5615 page_downgrade(ppa[i]); 5616 } 5617 lgrp_stat_add(to->lgrp_id, LGRP_PM_FAIL_ALLOC_PGS, 5618 page_cnt); 5619 goto next; 5620 } 5621 ASSERT(newpp->p_szc == pszc); 5622 /* 5623 * Clear migrate bit and relocate page 5624 */ 5625 PP_CLRMIGRATE(pp); 5626 if (page_relocate(&pp, &newpp, 0, 1, &page_cnt, to)) { 5627 panic("page_migrate: page_relocate failed"); 5628 } 5629 ASSERT(page_cnt * PAGESIZE == pgsz); 5630 5631 /* 5632 * Keep stats for number of pages migrated from and to 5633 * each lgroup 5634 */ 5635 lgrp_stat_add(from->lgrp_id, LGRP_PM_SRC_PGS, page_cnt); 5636 lgrp_stat_add(to->lgrp_id, LGRP_PM_DEST_PGS, page_cnt); 5637 /* 5638 * update the page_t array we were passed in and 5639 * unlink constituent pages of a large page. 5640 */ 5641 for (i = 0; i < page_cnt; ++i, ++pp) { 5642 ASSERT(PAGE_EXCL(newpp)); 5643 ASSERT(newpp->p_szc == pszc); 5644 ppa[i] = newpp; 5645 pp = newpp; 5646 page_sub(&newpp, pp); 5647 page_downgrade(pp); 5648 } 5649 ASSERT(newpp == NULL); 5650 next: 5651 addr += pgsz; 5652 ppa += page_cnt; 5653 npages -= page_cnt; 5654 } 5655 } 5656 5657 ulong_t mem_waiters = 0; 5658 ulong_t max_count = 20; 5659 #define MAX_DELAY 0x1ff 5660 5661 /* 5662 * Check if enough memory is available to proceed. 5663 * Depending on system configuration and how much memory is 5664 * reserved for swap we need to check against two variables. 5665 * e.g. on systems with little physical swap availrmem can be 5666 * more reliable indicator of how much memory is available. 5667 * On systems with large phys swap freemem can be better indicator. 5668 * If freemem drops below threshold level don't return an error 5669 * immediately but wake up pageout to free memory and block. 5670 * This is done number of times. If pageout is not able to free 5671 * memory within certain time return an error. 5672 * The same applies for availrmem but kmem_reap is used to 5673 * free memory. 
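 *
 * A typical (hypothetical) caller simply converts its request to pages and
 * bails out early when the check fails, e.g.
 *
 *	if (!page_mem_avail(btopr(len)))
 *		return (ENOMEM);
 *
 * where len is the size of the pending allocation in bytes.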
5674 */ 5675 int 5676 page_mem_avail(pgcnt_t npages) 5677 { 5678 ulong_t count; 5679 5680 #if defined(__i386) 5681 if (freemem > desfree + npages && 5682 availrmem > swapfs_reserve + npages && 5683 btop(vmem_size(heap_arena, VMEM_FREE)) > tune.t_minarmem + 5684 npages) 5685 return (1); 5686 #else 5687 if (freemem > desfree + npages && 5688 availrmem > swapfs_reserve + npages) 5689 return (1); 5690 #endif 5691 5692 count = max_count; 5693 atomic_add_long(&mem_waiters, 1); 5694 5695 while (freemem < desfree + npages && --count) { 5696 cv_signal(&proc_pageout->p_cv); 5697 if (delay_sig(hz + (mem_waiters & MAX_DELAY))) { 5698 atomic_add_long(&mem_waiters, -1); 5699 return (0); 5700 } 5701 } 5702 if (count == 0) { 5703 atomic_add_long(&mem_waiters, -1); 5704 return (0); 5705 } 5706 5707 count = max_count; 5708 while (availrmem < swapfs_reserve + npages && --count) { 5709 kmem_reap(); 5710 if (delay_sig(hz + (mem_waiters & MAX_DELAY))) { 5711 atomic_add_long(&mem_waiters, -1); 5712 return (0); 5713 } 5714 } 5715 atomic_add_long(&mem_waiters, -1); 5716 if (count == 0) 5717 return (0); 5718 5719 #if defined(__i386) 5720 if (btop(vmem_size(heap_arena, VMEM_FREE)) < 5721 tune.t_minarmem + npages) 5722 return (0); 5723 #endif 5724 return (1); 5725 } 5726 5727 #define MAX_CNT 60 /* max num of iterations */ 5728 /* 5729 * Reclaim/reserve availrmem for npages. 5730 * If there is not enough memory start reaping seg, kmem caches. 5731 * Start pageout scanner (via page_needfree()). 5732 * Exit after ~ MAX_CNT s regardless of how much memory has been released. 5733 * Note: There is no guarantee that any availrmem will be freed as 5734 * this memory typically is locked (kernel heap) or reserved for swap. 5735 * Also due to memory fragmentation kmem allocator may not be able 5736 * to free any memory (single user allocated buffer will prevent 5737 * freeing slab or a page). 5738 */ 5739 int 5740 page_reclaim_mem(pgcnt_t npages, pgcnt_t epages, int adjust) 5741 { 5742 int i = 0; 5743 int ret = 0; 5744 pgcnt_t deficit; 5745 pgcnt_t old_availrmem; 5746 5747 mutex_enter(&freemem_lock); 5748 old_availrmem = availrmem - 1; 5749 while ((availrmem < tune.t_minarmem + npages + epages) && 5750 (old_availrmem < availrmem) && (i++ < MAX_CNT)) { 5751 old_availrmem = availrmem; 5752 deficit = tune.t_minarmem + npages + epages - availrmem; 5753 mutex_exit(&freemem_lock); 5754 page_needfree(deficit); 5755 kmem_reap(); 5756 delay(hz); 5757 page_needfree(-(spgcnt_t)deficit); 5758 mutex_enter(&freemem_lock); 5759 } 5760 5761 if (adjust && (availrmem >= tune.t_minarmem + npages + epages)) { 5762 availrmem -= npages; 5763 ret = 1; 5764 } 5765 5766 mutex_exit(&freemem_lock); 5767 5768 return (ret); 5769 } 5770 5771 /* 5772 * Search the memory segments to locate the desired page. Within a 5773 * segment, pages increase linearly with one page structure per 5774 * physical page frame (size PAGESIZE). The search begins 5775 * with the segment that was accessed last, to take advantage of locality. 5776 * If the hint misses, we start from the beginning of the sorted memseg list 5777 */ 5778 5779 5780 /* 5781 * Some data structures for pfn to pp lookup. 
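 *
 * Each of the N_MEM_SLOTS hash slots covers a fixed span of mhash_per_slot
 * pfns (mhash_per_slot is computed in build_pfn_hash() below). Assuming
 * MEMSEG_PFN_HASH() simply maps a pfn onto its slot (its definition lives
 * in the platform headers), the fast path of a lookup amounts to
 *
 *	seg = memseg_hash[MEMSEG_PFN_HASH(pfnum)];
 *	if (seg != NULL && pfnum >= seg->pages_base && pfnum < seg->pages_end)
 *		pp = seg->pages + (pfnum - seg->pages_base);
 *
 * which is what page_numtopp_nolock() tries before falling back to a
 * linear walk of the memseg list.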
5782 */ 5783 ulong_t mhash_per_slot; 5784 struct memseg *memseg_hash[N_MEM_SLOTS]; 5785 5786 page_t * 5787 page_numtopp_nolock(pfn_t pfnum) 5788 { 5789 struct memseg *seg; 5790 page_t *pp; 5791 vm_cpu_data_t *vc = CPU->cpu_vm_data; 5792 5793 ASSERT(vc != NULL); 5794 5795 MEMSEG_STAT_INCR(nsearch); 5796 5797 /* Try last winner first */ 5798 if (((seg = vc->vc_pnum_memseg) != NULL) && 5799 (pfnum >= seg->pages_base) && (pfnum < seg->pages_end)) { 5800 MEMSEG_STAT_INCR(nlastwon); 5801 pp = seg->pages + (pfnum - seg->pages_base); 5802 if (pp->p_pagenum == pfnum) 5803 return ((page_t *)pp); 5804 } 5805 5806 /* Else Try hash */ 5807 if (((seg = memseg_hash[MEMSEG_PFN_HASH(pfnum)]) != NULL) && 5808 (pfnum >= seg->pages_base) && (pfnum < seg->pages_end)) { 5809 MEMSEG_STAT_INCR(nhashwon); 5810 vc->vc_pnum_memseg = seg; 5811 pp = seg->pages + (pfnum - seg->pages_base); 5812 if (pp->p_pagenum == pfnum) 5813 return ((page_t *)pp); 5814 } 5815 5816 /* Else Brute force */ 5817 for (seg = memsegs; seg != NULL; seg = seg->next) { 5818 if (pfnum >= seg->pages_base && pfnum < seg->pages_end) { 5819 vc->vc_pnum_memseg = seg; 5820 pp = seg->pages + (pfnum - seg->pages_base); 5821 return ((page_t *)pp); 5822 } 5823 } 5824 vc->vc_pnum_memseg = NULL; 5825 MEMSEG_STAT_INCR(nnotfound); 5826 return ((page_t *)NULL); 5827 5828 } 5829 5830 struct memseg * 5831 page_numtomemseg_nolock(pfn_t pfnum) 5832 { 5833 struct memseg *seg; 5834 page_t *pp; 5835 5836 /* Try hash */ 5837 if (((seg = memseg_hash[MEMSEG_PFN_HASH(pfnum)]) != NULL) && 5838 (pfnum >= seg->pages_base) && (pfnum < seg->pages_end)) { 5839 pp = seg->pages + (pfnum - seg->pages_base); 5840 if (pp->p_pagenum == pfnum) 5841 return (seg); 5842 } 5843 5844 /* Else Brute force */ 5845 for (seg = memsegs; seg != NULL; seg = seg->next) { 5846 if (pfnum >= seg->pages_base && pfnum < seg->pages_end) { 5847 return (seg); 5848 } 5849 } 5850 return ((struct memseg *)NULL); 5851 } 5852 5853 /* 5854 * Given a page and a count return the page struct that is 5855 * n structs away from the current one in the global page 5856 * list. 5857 * 5858 * This function wraps to the first page upon 5859 * reaching the end of the memseg list. 5860 */ 5861 page_t * 5862 page_nextn(page_t *pp, ulong_t n) 5863 { 5864 struct memseg *seg; 5865 page_t *ppn; 5866 vm_cpu_data_t *vc = (vm_cpu_data_t *)CPU->cpu_vm_data; 5867 5868 ASSERT(vc != NULL); 5869 5870 if (((seg = vc->vc_pnext_memseg) == NULL) || 5871 (seg->pages_base == seg->pages_end) || 5872 !(pp >= seg->pages && pp < seg->epages)) { 5873 5874 for (seg = memsegs; seg; seg = seg->next) { 5875 if (pp >= seg->pages && pp < seg->epages) 5876 break; 5877 } 5878 5879 if (seg == NULL) { 5880 /* Memory delete got in, return something valid. */ 5881 /* TODO: fix me. */ 5882 seg = memsegs; 5883 pp = seg->pages; 5884 } 5885 } 5886 5887 /* check for wraparound - possible if n is large */ 5888 while ((ppn = (pp + n)) >= seg->epages || ppn < pp) { 5889 n -= seg->epages - pp; 5890 seg = seg->next; 5891 if (seg == NULL) 5892 seg = memsegs; 5893 pp = seg->pages; 5894 } 5895 vc->vc_pnext_memseg = seg; 5896 return (ppn); 5897 } 5898 5899 /* 5900 * Initialize for a loop using page_next_scan_large(). 5901 */ 5902 page_t * 5903 page_next_scan_init(void **cookie) 5904 { 5905 ASSERT(cookie != NULL); 5906 *cookie = (void *)memsegs; 5907 return ((page_t *)memsegs->pages); 5908 } 5909 5910 /* 5911 * Return the next page in a scan of page_t's, assuming we want 5912 * to skip over sub-pages within larger page sizes. 
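 *
 * For example, on a hypothetical platform where szc 1 spans 16 PAGESIZE
 * pages, a pp sitting 5 constituent pages into such a large page has
 * (pfn & 15) == 5, so the skip count computed below works out to
 * 16 - 5 = 11 and the scan resumes at the next large-page boundary.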
5913 * 5914 * The cookie is used to keep track of the current memseg. 5915 */ 5916 page_t * 5917 page_next_scan_large( 5918 page_t *pp, 5919 ulong_t *n, 5920 void **cookie) 5921 { 5922 struct memseg *seg = (struct memseg *)*cookie; 5923 page_t *new_pp; 5924 ulong_t cnt; 5925 pfn_t pfn; 5926 5927 5928 /* 5929 * get the count of page_t's to skip based on the page size 5930 */ 5931 ASSERT(pp != NULL); 5932 if (pp->p_szc == 0) { 5933 cnt = 1; 5934 } else { 5935 pfn = page_pptonum(pp); 5936 cnt = page_get_pagecnt(pp->p_szc); 5937 cnt -= pfn & (cnt - 1); 5938 } 5939 *n += cnt; 5940 new_pp = pp + cnt; 5941 5942 /* 5943 * Catch if we went past the end of the current memory segment. If so, 5944 * just move to the next segment with pages. 5945 */ 5946 if (new_pp >= seg->epages) { 5947 do { 5948 seg = seg->next; 5949 if (seg == NULL) 5950 seg = memsegs; 5951 } while (seg->pages == seg->epages); 5952 new_pp = seg->pages; 5953 *cookie = (void *)seg; 5954 } 5955 5956 return (new_pp); 5957 } 5958 5959 5960 /* 5961 * Returns next page in list. Note: this function wraps 5962 * to the first page in the list upon reaching the end 5963 * of the list. Callers should be aware of this fact. 5964 */ 5965 5966 /* We should change this be a #define */ 5967 5968 page_t * 5969 page_next(page_t *pp) 5970 { 5971 return (page_nextn(pp, 1)); 5972 } 5973 5974 page_t * 5975 page_first() 5976 { 5977 return ((page_t *)memsegs->pages); 5978 } 5979 5980 5981 /* 5982 * This routine is called at boot with the initial memory configuration 5983 * and when memory is added or removed. 5984 */ 5985 void 5986 build_pfn_hash() 5987 { 5988 pfn_t cur; 5989 pgcnt_t index; 5990 struct memseg *pseg; 5991 int i; 5992 5993 /* 5994 * Clear memseg_hash array. 5995 * Since memory add/delete is designed to operate concurrently 5996 * with normal operation, the hash rebuild must be able to run 5997 * concurrently with page_numtopp_nolock(). To support this 5998 * functionality, assignments to memseg_hash array members must 5999 * be done atomically. 6000 * 6001 * NOTE: bzero() does not currently guarantee this for kernel 6002 * threads, and cannot be used here. 6003 */ 6004 for (i = 0; i < N_MEM_SLOTS; i++) 6005 memseg_hash[i] = NULL; 6006 6007 hat_kpm_mseghash_clear(N_MEM_SLOTS); 6008 6009 /* 6010 * Physmax is the last valid pfn. 6011 */ 6012 mhash_per_slot = (physmax + 1) >> MEM_HASH_SHIFT; 6013 for (pseg = memsegs; pseg != NULL; pseg = pseg->next) { 6014 index = MEMSEG_PFN_HASH(pseg->pages_base); 6015 cur = pseg->pages_base; 6016 do { 6017 if (index >= N_MEM_SLOTS) 6018 index = MEMSEG_PFN_HASH(cur); 6019 6020 if (memseg_hash[index] == NULL || 6021 memseg_hash[index]->pages_base > pseg->pages_base) { 6022 memseg_hash[index] = pseg; 6023 hat_kpm_mseghash_update(index, pseg); 6024 } 6025 cur += mhash_per_slot; 6026 index++; 6027 } while (cur < pseg->pages_end); 6028 } 6029 } 6030 6031 /* 6032 * Return the pagenum for the pp 6033 */ 6034 pfn_t 6035 page_pptonum(page_t *pp) 6036 { 6037 return (pp->p_pagenum); 6038 } 6039 6040 /* 6041 * interface to the referenced and modified etc bits 6042 * in the PSM part of the page struct 6043 * when no locking is desired. 6044 */ 6045 void 6046 page_set_props(page_t *pp, uint_t flags) 6047 { 6048 ASSERT((flags & ~(P_MOD | P_REF | P_RO)) == 0); 6049 pp->p_nrm |= (uchar_t)flags; 6050 } 6051 6052 void 6053 page_clr_all_props(page_t *pp) 6054 { 6055 pp->p_nrm = 0; 6056 } 6057 6058 /* 6059 * Clear p_lckcnt and p_cowcnt, adjusting freemem if required. 
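 *
 * (The adjustment, when requested via adjust, is made to availrmem under
 * freemem_lock; the computed amount is returned to the caller either way.)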
6060 */ 6061 int 6062 page_clear_lck_cow(page_t *pp, int adjust) 6063 { 6064 int f_amount; 6065 6066 ASSERT(PAGE_EXCL(pp)); 6067 6068 /* 6069 * The page_struct_lock need not be acquired here since 6070 * we require the caller hold the page exclusively locked. 6071 */ 6072 f_amount = 0; 6073 if (pp->p_lckcnt) { 6074 f_amount = 1; 6075 pp->p_lckcnt = 0; 6076 } 6077 if (pp->p_cowcnt) { 6078 f_amount += pp->p_cowcnt; 6079 pp->p_cowcnt = 0; 6080 } 6081 6082 if (adjust && f_amount) { 6083 mutex_enter(&freemem_lock); 6084 availrmem += f_amount; 6085 mutex_exit(&freemem_lock); 6086 } 6087 6088 return (f_amount); 6089 } 6090 6091 /* 6092 * The following functions is called from free_vp_pages() 6093 * for an inexact estimate of a newly free'd page... 6094 */ 6095 ulong_t 6096 page_share_cnt(page_t *pp) 6097 { 6098 return (hat_page_getshare(pp)); 6099 } 6100 6101 int 6102 page_isshared(page_t *pp) 6103 { 6104 return (hat_page_checkshare(pp, 1)); 6105 } 6106 6107 int 6108 page_isfree(page_t *pp) 6109 { 6110 return (PP_ISFREE(pp)); 6111 } 6112 6113 int 6114 page_isref(page_t *pp) 6115 { 6116 return (hat_page_getattr(pp, P_REF)); 6117 } 6118 6119 int 6120 page_ismod(page_t *pp) 6121 { 6122 return (hat_page_getattr(pp, P_MOD)); 6123 } 6124 6125 /* 6126 * The following code all currently relates to the page capture logic: 6127 * 6128 * This logic is used for cases where there is a desire to claim a certain 6129 * physical page in the system for the caller. As it may not be possible 6130 * to capture the page immediately, the p_toxic bits are used in the page 6131 * structure to indicate that someone wants to capture this page. When the 6132 * page gets unlocked, the toxic flag will be noted and an attempt to capture 6133 * the page will be made. If it is successful, the original callers callback 6134 * will be called with the page to do with it what they please. 6135 * 6136 * There is also an async thread which wakes up to attempt to capture 6137 * pages occasionally which have the capture bit set. All of the pages which 6138 * need to be captured asynchronously have been inserted into the 6139 * page_capture_hash and thus this thread walks that hash list. Items in the 6140 * hash have an expiration time so this thread handles that as well by removing 6141 * the item from the hash if it has expired. 6142 * 6143 * Some important things to note are: 6144 * - if the PR_CAPTURE bit is set on a page, then the page is in the 6145 * page_capture_hash. The page_capture_hash_head.pchh_mutex is needed 6146 * to set and clear this bit, and while the lock is held is the only time 6147 * you can add or remove an entry from the hash. 6148 * - the PR_CAPTURE bit can only be set and cleared while holding the 6149 * page_capture_hash_head.pchh_mutex 6150 * - the t_flag field of the thread struct is used with the T_CAPTURING 6151 * flag to prevent recursion while dealing with large pages. 6152 * - pages which need to be retired never expire on the page_capture_hash. 6153 */ 6154 6155 static void page_capture_thread(void); 6156 static kthread_t *pc_thread_id; 6157 kcondvar_t pc_cv; 6158 static kmutex_t pc_thread_mutex; 6159 static clock_t pc_thread_shortwait; 6160 static clock_t pc_thread_longwait; 6161 static int pc_thread_retry; 6162 6163 struct page_capture_callback pc_cb[PC_NUM_CALLBACKS]; 6164 6165 /* Note that this is a circular linked list */ 6166 typedef struct page_capture_hash_bucket { 6167 page_t *pp; 6168 uint_t szc; 6169 uint_t flags; 6170 clock_t expires; /* lbolt at which this request expires. 
*/ 6171 void *datap; /* Cached data passed in for callback */ 6172 struct page_capture_hash_bucket *next; 6173 struct page_capture_hash_bucket *prev; 6174 } page_capture_hash_bucket_t; 6175 6176 /* 6177 * Each hash bucket will have it's own mutex and two lists which are: 6178 * active (0): represents requests which have not been processed by 6179 * the page_capture async thread yet. 6180 * walked (1): represents requests which have been processed by the 6181 * page_capture async thread within it's given walk of this bucket. 6182 * 6183 * These are all needed so that we can synchronize all async page_capture 6184 * events. When the async thread moves to a new bucket, it will append the 6185 * walked list to the active list and walk each item one at a time, moving it 6186 * from the active list to the walked list. Thus if there is an async request 6187 * outstanding for a given page, it will always be in one of the two lists. 6188 * New requests will always be added to the active list. 6189 * If we were not able to capture a page before the request expired, we'd free 6190 * up the request structure which would indicate to page_capture that there is 6191 * no longer a need for the given page, and clear the PR_CAPTURE flag if 6192 * possible. 6193 */ 6194 typedef struct page_capture_hash_head { 6195 kmutex_t pchh_mutex; 6196 uint_t num_pages; 6197 page_capture_hash_bucket_t lists[2]; /* sentinel nodes */ 6198 } page_capture_hash_head_t; 6199 6200 #ifdef DEBUG 6201 #define NUM_PAGE_CAPTURE_BUCKETS 4 6202 #else 6203 #define NUM_PAGE_CAPTURE_BUCKETS 64 6204 #endif 6205 6206 page_capture_hash_head_t page_capture_hash[NUM_PAGE_CAPTURE_BUCKETS]; 6207 6208 /* for now use a very simple hash based upon the size of a page struct */ 6209 #define PAGE_CAPTURE_HASH(pp) \ 6210 ((int)(((uintptr_t)pp >> 7) & (NUM_PAGE_CAPTURE_BUCKETS - 1))) 6211 6212 extern pgcnt_t swapfs_minfree; 6213 6214 int page_trycapture(page_t *pp, uint_t szc, uint_t flags, void *datap); 6215 6216 /* 6217 * a callback function is required for page capture requests. 6218 */ 6219 void 6220 page_capture_register_callback(uint_t index, clock_t duration, 6221 int (*cb_func)(page_t *, void *, uint_t)) 6222 { 6223 ASSERT(pc_cb[index].cb_active == 0); 6224 ASSERT(cb_func != NULL); 6225 rw_enter(&pc_cb[index].cb_rwlock, RW_WRITER); 6226 pc_cb[index].duration = duration; 6227 pc_cb[index].cb_func = cb_func; 6228 pc_cb[index].cb_active = 1; 6229 rw_exit(&pc_cb[index].cb_rwlock); 6230 } 6231 6232 void 6233 page_capture_unregister_callback(uint_t index) 6234 { 6235 int i, j; 6236 struct page_capture_hash_bucket *bp1; 6237 struct page_capture_hash_bucket *bp2; 6238 struct page_capture_hash_bucket *head = NULL; 6239 uint_t flags = (1 << index); 6240 6241 rw_enter(&pc_cb[index].cb_rwlock, RW_WRITER); 6242 ASSERT(pc_cb[index].cb_active == 1); 6243 pc_cb[index].duration = 0; /* Paranoia */ 6244 pc_cb[index].cb_func = NULL; /* Paranoia */ 6245 pc_cb[index].cb_active = 0; 6246 rw_exit(&pc_cb[index].cb_rwlock); 6247 6248 /* 6249 * Just move all the entries to a private list which we can walk 6250 * through without the need to hold any locks. 6251 * No more requests can get added to the hash lists for this consumer 6252 * as the cb_active field for the callback has been cleared. 
6253 */ 6254 for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) { 6255 mutex_enter(&page_capture_hash[i].pchh_mutex); 6256 for (j = 0; j < 2; j++) { 6257 bp1 = page_capture_hash[i].lists[j].next; 6258 /* walk through all but first (sentinel) element */ 6259 while (bp1 != &page_capture_hash[i].lists[j]) { 6260 bp2 = bp1; 6261 if (bp2->flags & flags) { 6262 bp1 = bp2->next; 6263 bp1->prev = bp2->prev; 6264 bp2->prev->next = bp1; 6265 bp2->next = head; 6266 head = bp2; 6267 /* 6268 * Clear the PR_CAPTURE bit as we 6269 * hold appropriate locks here. 6270 */ 6271 page_clrtoxic(head->pp, PR_CAPTURE); 6272 page_capture_hash[i].num_pages--; 6273 continue; 6274 } 6275 bp1 = bp1->next; 6276 } 6277 } 6278 mutex_exit(&page_capture_hash[i].pchh_mutex); 6279 } 6280 6281 while (head != NULL) { 6282 bp1 = head; 6283 head = head->next; 6284 kmem_free(bp1, sizeof (*bp1)); 6285 } 6286 } 6287 6288 6289 /* 6290 * Find pp in the active list and move it to the walked list if it 6291 * exists. 6292 * Note that most often pp should be at the front of the active list 6293 * as it is currently used and thus there is no other sort of optimization 6294 * being done here as this is a linked list data structure. 6295 * Returns 1 on successful move or 0 if page could not be found. 6296 */ 6297 static int 6298 page_capture_move_to_walked(page_t *pp) 6299 { 6300 page_capture_hash_bucket_t *bp; 6301 int index; 6302 6303 index = PAGE_CAPTURE_HASH(pp); 6304 6305 mutex_enter(&page_capture_hash[index].pchh_mutex); 6306 bp = page_capture_hash[index].lists[0].next; 6307 while (bp != &page_capture_hash[index].lists[0]) { 6308 if (bp->pp == pp) { 6309 /* Remove from old list */ 6310 bp->next->prev = bp->prev; 6311 bp->prev->next = bp->next; 6312 6313 /* Add to new list */ 6314 bp->next = page_capture_hash[index].lists[1].next; 6315 bp->prev = &page_capture_hash[index].lists[1]; 6316 page_capture_hash[index].lists[1].next = bp; 6317 bp->next->prev = bp; 6318 mutex_exit(&page_capture_hash[index].pchh_mutex); 6319 6320 return (1); 6321 } 6322 bp = bp->next; 6323 } 6324 mutex_exit(&page_capture_hash[index].pchh_mutex); 6325 return (0); 6326 } 6327 6328 /* 6329 * Add a new entry to the page capture hash. The only case where a new 6330 * entry is not added is when the page capture consumer is no longer registered. 6331 * In this case, we'll silently not add the page to the hash. We know that 6332 * page retire will always be registered for the case where we are currently 6333 * unretiring a page and thus there are no conflicts. 
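* New entries are placed at the head of the bucket's active list; if the page is already on the hash, the existing entry is updated in place below instead.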
6334 */ 6335 static void 6336 page_capture_add_hash(page_t *pp, uint_t szc, uint_t flags, void *datap) 6337 { 6338 page_capture_hash_bucket_t *bp1; 6339 page_capture_hash_bucket_t *bp2; 6340 int index; 6341 int cb_index; 6342 int i; 6343 #ifdef DEBUG 6344 page_capture_hash_bucket_t *tp1; 6345 int l; 6346 #endif 6347 6348 ASSERT(!(flags & CAPTURE_ASYNC)); 6349 6350 bp1 = kmem_alloc(sizeof (struct page_capture_hash_bucket), KM_SLEEP); 6351 6352 bp1->pp = pp; 6353 bp1->szc = szc; 6354 bp1->flags = flags; 6355 bp1->datap = datap; 6356 6357 for (cb_index = 0; cb_index < PC_NUM_CALLBACKS; cb_index++) { 6358 if ((flags >> cb_index) & 1) { 6359 break; 6360 } 6361 } 6362 6363 ASSERT(cb_index != PC_NUM_CALLBACKS); 6364 6365 rw_enter(&pc_cb[cb_index].cb_rwlock, RW_READER); 6366 if (pc_cb[cb_index].cb_active) { 6367 if (pc_cb[cb_index].duration == -1) { 6368 bp1->expires = (clock_t)-1; 6369 } else { 6370 bp1->expires = lbolt + pc_cb[cb_index].duration; 6371 } 6372 } else { 6373 /* There's no callback registered so don't add to the hash */ 6374 rw_exit(&pc_cb[cb_index].cb_rwlock); 6375 kmem_free(bp1, sizeof (*bp1)); 6376 return; 6377 } 6378 6379 index = PAGE_CAPTURE_HASH(pp); 6380 6381 /* 6382 * Only allow capture flag to be modified under this mutex. 6383 * Prevents multiple entries for same page getting added. 6384 */ 6385 mutex_enter(&page_capture_hash[index].pchh_mutex); 6386 6387 /* 6388 * if not already on the hash, set capture bit and add to the hash 6389 */ 6390 if (!(pp->p_toxic & PR_CAPTURE)) { 6391 #ifdef DEBUG 6392 /* Check for duplicate entries */ 6393 for (l = 0; l < 2; l++) { 6394 tp1 = page_capture_hash[index].lists[l].next; 6395 while (tp1 != &page_capture_hash[index].lists[l]) { 6396 if (tp1->pp == pp) { 6397 panic("page pp 0x%p already on hash " 6398 "at 0x%p\n", pp, tp1); 6399 } 6400 tp1 = tp1->next; 6401 } 6402 } 6403 6404 #endif 6405 page_settoxic(pp, PR_CAPTURE); 6406 bp1->next = page_capture_hash[index].lists[0].next; 6407 bp1->prev = &page_capture_hash[index].lists[0]; 6408 bp1->next->prev = bp1; 6409 page_capture_hash[index].lists[0].next = bp1; 6410 page_capture_hash[index].num_pages++; 6411 if (flags & CAPTURE_RETIRE) { 6412 page_retire_incr_pend_count(); 6413 } 6414 mutex_exit(&page_capture_hash[index].pchh_mutex); 6415 rw_exit(&pc_cb[cb_index].cb_rwlock); 6416 cv_signal(&pc_cv); 6417 return; 6418 } 6419 6420 /* 6421 * A page retire request will replace any other request. 6422 * A second physmem request which is for a different process than 6423 * the currently registered one will be dropped as there is 6424 * no way to hold the private data for both calls. 6425 * In the future, once there are more callers, this will have to 6426 * be worked out better as there needs to be private storage for 6427 * at least each type of caller (maybe have datap be an array of 6428 * *void's so that we can index based upon callers index). 6429 */ 6430 6431 /* walk hash list to update expire time */ 6432 for (i = 0; i < 2; i++) { 6433 bp2 = page_capture_hash[index].lists[i].next; 6434 while (bp2 != &page_capture_hash[index].lists[i]) { 6435 if (bp2->pp == pp) { 6436 if (flags & CAPTURE_RETIRE) { 6437 if (!(bp2->flags & CAPTURE_RETIRE)) { 6438 page_retire_incr_pend_count(); 6439 bp2->flags = flags; 6440 bp2->expires = bp1->expires; 6441 bp2->datap = datap; 6442 } 6443 } else { 6444 ASSERT(flags & CAPTURE_PHYSMEM); 6445 if (!(bp2->flags & CAPTURE_RETIRE) && 6446 (datap == bp2->datap)) { 6447 bp2->expires = bp1->expires; 6448 } 6449 } 6450 mutex_exit(&page_capture_hash[index]. 
6451 pchh_mutex); 6452 rw_exit(&pc_cb[cb_index].cb_rwlock); 6453 kmem_free(bp1, sizeof (*bp1)); 6454 return; 6455 } 6456 bp2 = bp2->next; 6457 } 6458 } 6459 6460 /* 6461 * the PR_CAPTURE flag is protected by the page_capture_hash mutexes 6462 * and thus it either has to be set or not set and can't change 6463 * while holding the mutex above. 6464 */ 6465 panic("page_capture_add_hash, PR_CAPTURE flag set on pp %p\n", pp); 6466 } 6467 6468 /* 6469 * We have a page in our hands, lets try and make it ours by turning 6470 * it into a clean page like it had just come off the freelists. 6471 * 6472 * Returns 0 on success, with the page still EXCL locked. 6473 * On failure, the page will be unlocked, and returns EAGAIN 6474 */ 6475 static int 6476 page_capture_clean_page(page_t *pp) 6477 { 6478 page_t *newpp; 6479 int skip_unlock = 0; 6480 spgcnt_t count; 6481 page_t *tpp; 6482 int ret = 0; 6483 int extra; 6484 6485 ASSERT(PAGE_EXCL(pp)); 6486 ASSERT(!PP_RETIRED(pp)); 6487 ASSERT(curthread->t_flag & T_CAPTURING); 6488 6489 if (PP_ISFREE(pp)) { 6490 if (!page_reclaim(pp, NULL)) { 6491 skip_unlock = 1; 6492 ret = EAGAIN; 6493 goto cleanup; 6494 } 6495 ASSERT(pp->p_szc == 0); 6496 if (pp->p_vnode != NULL) { 6497 /* 6498 * Since this page came from the 6499 * cachelist, we must destroy the 6500 * old vnode association. 6501 */ 6502 page_hashout(pp, NULL); 6503 } 6504 goto cleanup; 6505 } 6506 6507 /* 6508 * If we know page_relocate will fail, skip it 6509 * It could still fail due to a UE on another page but we 6510 * can't do anything about that. 6511 */ 6512 if (pp->p_toxic & PR_UE) { 6513 goto skip_relocate; 6514 } 6515 6516 /* 6517 * It's possible that pages can not have a vnode as fsflush comes 6518 * through and cleans up these pages. It's ugly but that's how it is. 6519 */ 6520 if (pp->p_vnode == NULL) { 6521 goto skip_relocate; 6522 } 6523 6524 /* 6525 * Page was not free, so lets try to relocate it. 6526 * page_relocate only works with root pages, so if this is not a root 6527 * page, we need to demote it to try and relocate it. 6528 * Unfortunately this is the best we can do right now. 6529 */ 6530 newpp = NULL; 6531 if ((pp->p_szc > 0) && (pp != PP_PAGEROOT(pp))) { 6532 if (page_try_demote_pages(pp) == 0) { 6533 ret = EAGAIN; 6534 goto cleanup; 6535 } 6536 } 6537 ret = page_relocate(&pp, &newpp, 1, 0, &count, NULL); 6538 if (ret == 0) { 6539 page_t *npp; 6540 /* unlock the new page(s) */ 6541 while (count-- > 0) { 6542 ASSERT(newpp != NULL); 6543 npp = newpp; 6544 page_sub(&newpp, npp); 6545 page_unlock(npp); 6546 } 6547 ASSERT(newpp == NULL); 6548 /* 6549 * Check to see if the page we have is too large. 6550 * If so, demote it freeing up the extra pages. 6551 */ 6552 if (pp->p_szc > 0) { 6553 /* For now demote extra pages to szc == 0 */ 6554 extra = page_get_pagecnt(pp->p_szc) - 1; 6555 while (extra > 0) { 6556 tpp = pp->p_next; 6557 page_sub(&pp, tpp); 6558 tpp->p_szc = 0; 6559 page_free(tpp, 1); 6560 extra--; 6561 } 6562 /* Make sure to set our page to szc 0 as well */ 6563 ASSERT(pp->p_next == pp && pp->p_prev == pp); 6564 pp->p_szc = 0; 6565 } 6566 goto cleanup; 6567 } else if (ret == EIO) { 6568 ret = EAGAIN; 6569 goto cleanup; 6570 } else { 6571 /* 6572 * Need to reset return type as we failed to relocate the page 6573 * but that does not mean that some of the next steps will not 6574 * work. 
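* We fall through to skip_relocate and try to clean the page in place by demoting it and unloading its mappings.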
6575 */ 6576 ret = 0; 6577 } 6578 6579 skip_relocate: 6580 6581 if (pp->p_szc > 0) { 6582 if (page_try_demote_pages(pp) == 0) { 6583 ret = EAGAIN; 6584 goto cleanup; 6585 } 6586 } 6587 6588 ASSERT(pp->p_szc == 0); 6589 6590 if (hat_ismod(pp)) { 6591 ret = EAGAIN; 6592 goto cleanup; 6593 } 6594 if (PP_ISKAS(pp)) { 6595 ret = EAGAIN; 6596 goto cleanup; 6597 } 6598 if (pp->p_lckcnt || pp->p_cowcnt) { 6599 ret = EAGAIN; 6600 goto cleanup; 6601 } 6602 6603 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 6604 ASSERT(!hat_page_is_mapped(pp)); 6605 6606 if (hat_ismod(pp)) { 6607 /* 6608 * This is a semi-odd case as the page is now modified but not 6609 * mapped as we just unloaded the mappings above. 6610 */ 6611 ret = EAGAIN; 6612 goto cleanup; 6613 } 6614 if (pp->p_vnode != NULL) { 6615 page_hashout(pp, NULL); 6616 } 6617 6618 /* 6619 * At this point, the page should be in a clean state and 6620 * we can do whatever we want with it. 6621 */ 6622 6623 cleanup: 6624 if (ret != 0) { 6625 if (!skip_unlock) { 6626 page_unlock(pp); 6627 } 6628 } else { 6629 ASSERT(pp->p_szc == 0); 6630 ASSERT(PAGE_EXCL(pp)); 6631 6632 pp->p_next = pp; 6633 pp->p_prev = pp; 6634 } 6635 return (ret); 6636 } 6637 6638 /* 6639 * Various callers of page_trycapture() can have different restrictions upon 6640 * what memory they have access to. 6641 * Returns 0 on success, with the following error codes on failure: 6642 * EPERM - The requested page is long term locked, and thus repeated 6643 * requests to capture this page will likely fail. 6644 * ENOMEM - There was not enough free memory in the system to safely 6645 * map the requested page. 6646 * ENOENT - The requested page was inside the kernel cage, and the 6647 * PHYSMEM_CAGE flag was not set. 6648 */ 6649 int 6650 page_capture_pre_checks(page_t *pp, uint_t flags) 6651 { 6652 #if defined(__sparc) 6653 extern struct vnode prom_ppages; 6654 #endif /* __sparc */ 6655 6656 ASSERT(pp != NULL); 6657 6658 #if defined(__sparc) 6659 if (pp->p_vnode == &prom_ppages) { 6660 return (EPERM); 6661 } 6662 6663 if (PP_ISNORELOC(pp) && !(flags & CAPTURE_GET_CAGE) && 6664 (flags & CAPTURE_PHYSMEM)) { 6665 return (ENOENT); 6666 } 6667 6668 if (PP_ISNORELOCKERNEL(pp)) { 6669 return (EPERM); 6670 } 6671 #else 6672 if (PP_ISKAS(pp)) { 6673 return (EPERM); 6674 } 6675 #endif /* __sparc */ 6676 6677 /* only physmem currently has the restrictions checked below */ 6678 if (!(flags & CAPTURE_PHYSMEM)) { 6679 return (0); 6680 } 6681 6682 if (availrmem < swapfs_minfree) { 6683 /* 6684 * We won't try to capture this page as we are 6685 * running low on memory. 6686 */ 6687 return (ENOMEM); 6688 } 6689 return (0); 6690 } 6691 6692 /* 6693 * Once we have a page in our mits, go ahead and complete the capture 6694 * operation. 6695 * Returns 1 on failure where page is no longer needed 6696 * Returns 0 on success 6697 * Returns -1 if there was a transient failure. 6698 * Failure cases must release the SE_EXCL lock on pp (usually via page_free). 
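* Once the registered callback has been invoked, it is responsible for the page and its lock.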
6699 */ 6700 int 6701 page_capture_take_action(page_t *pp, uint_t flags, void *datap) 6702 { 6703 int cb_index; 6704 int ret = 0; 6705 page_capture_hash_bucket_t *bp1; 6706 page_capture_hash_bucket_t *bp2; 6707 int index; 6708 int found = 0; 6709 int i; 6710 6711 ASSERT(PAGE_EXCL(pp)); 6712 ASSERT(curthread->t_flag & T_CAPTURING); 6713 6714 for (cb_index = 0; cb_index < PC_NUM_CALLBACKS; cb_index++) { 6715 if ((flags >> cb_index) & 1) { 6716 break; 6717 } 6718 } 6719 ASSERT(cb_index < PC_NUM_CALLBACKS); 6720 6721 /* 6722 * Remove the entry from the page_capture hash, but don't free it yet 6723 * as we may need to put it back. 6724 * Since we own the page at this point in time, we should find it 6725 * in the hash if this is an ASYNC call. If we don't it's likely 6726 * that the page_capture_async() thread decided that this request 6727 * had expired, in which case we just continue on. 6728 */ 6729 if (flags & CAPTURE_ASYNC) { 6730 6731 index = PAGE_CAPTURE_HASH(pp); 6732 6733 mutex_enter(&page_capture_hash[index].pchh_mutex); 6734 for (i = 0; i < 2 && !found; i++) { 6735 bp1 = page_capture_hash[index].lists[i].next; 6736 while (bp1 != &page_capture_hash[index].lists[i]) { 6737 if (bp1->pp == pp) { 6738 bp1->next->prev = bp1->prev; 6739 bp1->prev->next = bp1->next; 6740 page_capture_hash[index].num_pages--; 6741 page_clrtoxic(pp, PR_CAPTURE); 6742 found = 1; 6743 break; 6744 } 6745 bp1 = bp1->next; 6746 } 6747 } 6748 mutex_exit(&page_capture_hash[index].pchh_mutex); 6749 } 6750 6751 /* Synchronize with the unregister func. */ 6752 rw_enter(&pc_cb[cb_index].cb_rwlock, RW_READER); 6753 if (!pc_cb[cb_index].cb_active) { 6754 page_free(pp, 1); 6755 rw_exit(&pc_cb[cb_index].cb_rwlock); 6756 if (found) { 6757 kmem_free(bp1, sizeof (*bp1)); 6758 } 6759 return (1); 6760 } 6761 6762 /* 6763 * We need to remove the entry from the page capture hash and turn off 6764 * the PR_CAPTURE bit before calling the callback. We'll need to cache 6765 * the entry here, and then based upon the return value, cleanup 6766 * appropriately or re-add it to the hash, making sure that someone else 6767 * hasn't already done so. 6768 * It should be rare for the callback to fail and thus it's ok for 6769 * the failure path to be a bit complicated as the success path is 6770 * cleaner and the locking rules are easier to follow. 6771 */ 6772 6773 ret = pc_cb[cb_index].cb_func(pp, datap, flags); 6774 6775 rw_exit(&pc_cb[cb_index].cb_rwlock); 6776 6777 /* 6778 * If this was an ASYNC request, we need to cleanup the hash if the 6779 * callback was successful or if the request was no longer valid. 6780 * For non-ASYNC requests, we return failure to map and the caller 6781 * will take care of adding the request to the hash. 6782 * Note also that the callback itself is responsible for the page 6783 * at this point in time in terms of locking ... The most common 6784 * case for the failure path should just be a page_free. 6785 */ 6786 if (ret >= 0) { 6787 if (found) { 6788 if (bp1->flags & CAPTURE_RETIRE) { 6789 page_retire_decr_pend_count(); 6790 } 6791 kmem_free(bp1, sizeof (*bp1)); 6792 } 6793 return (ret); 6794 } 6795 if (!found) { 6796 return (ret); 6797 } 6798 6799 ASSERT(flags & CAPTURE_ASYNC); 6800 6801 /* 6802 * Check for expiration time first as we can just free it up if it's 6803 * expired. 
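* (requests for pages which need to be retired never expire, so they are not discarded here).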
6804 */ 6805 if (lbolt > bp1->expires && bp1->expires != -1) { 6806 kmem_free(bp1, sizeof (*bp1)); 6807 return (ret); 6808 } 6809 6810 /* 6811 * The callback failed and there used to be an entry in the hash for 6812 * this page, so we need to add it back to the hash. 6813 */ 6814 mutex_enter(&page_capture_hash[index].pchh_mutex); 6815 if (!(pp->p_toxic & PR_CAPTURE)) { 6816 /* just add bp1 back to head of walked list */ 6817 page_settoxic(pp, PR_CAPTURE); 6818 bp1->next = page_capture_hash[index].lists[1].next; 6819 bp1->prev = &page_capture_hash[index].lists[1]; 6820 bp1->next->prev = bp1; 6821 page_capture_hash[index].lists[1].next = bp1; 6822 page_capture_hash[index].num_pages++; 6823 mutex_exit(&page_capture_hash[index].pchh_mutex); 6824 return (ret); 6825 } 6826 6827 /* 6828 * Otherwise there was a new capture request added to list 6829 * Need to make sure that our original data is represented if 6830 * appropriate. 6831 */ 6832 for (i = 0; i < 2; i++) { 6833 bp2 = page_capture_hash[index].lists[i].next; 6834 while (bp2 != &page_capture_hash[index].lists[i]) { 6835 if (bp2->pp == pp) { 6836 if (bp1->flags & CAPTURE_RETIRE) { 6837 if (!(bp2->flags & CAPTURE_RETIRE)) { 6838 bp2->szc = bp1->szc; 6839 bp2->flags = bp1->flags; 6840 bp2->expires = bp1->expires; 6841 bp2->datap = bp1->datap; 6842 } 6843 } else { 6844 ASSERT(bp1->flags & CAPTURE_PHYSMEM); 6845 if (!(bp2->flags & CAPTURE_RETIRE)) { 6846 bp2->szc = bp1->szc; 6847 bp2->flags = bp1->flags; 6848 bp2->expires = bp1->expires; 6849 bp2->datap = bp1->datap; 6850 } 6851 } 6852 mutex_exit(&page_capture_hash[index]. 6853 pchh_mutex); 6854 kmem_free(bp1, sizeof (*bp1)); 6855 return (ret); 6856 } 6857 bp2 = bp2->next; 6858 } 6859 } 6860 panic("PR_CAPTURE set but not on hash for pp 0x%p\n", pp); 6861 /*NOTREACHED*/ 6862 } 6863 6864 /* 6865 * Try to capture the given page for the caller specified in the flags 6866 * parameter. The page will either be captured and handed over to the 6867 * appropriate callback, or will be queued up in the page capture hash 6868 * to be captured asynchronously. 6869 * If the current request is due to an async capture, the page must be 6870 * exclusively locked before calling this function. 6871 * Currently szc must be 0 but in the future this should be expandable to 6872 * other page sizes. 6873 * Returns 0 on success, with the following error codes on failure: 6874 * EPERM - The requested page is long term locked, and thus repeated 6875 * requests to capture this page will likely fail. 6876 * ENOMEM - There was not enough free memory in the system to safely 6877 * map the requested page. 6878 * ENOENT - The requested page was inside the kernel cage, and the 6879 * CAPTURE_GET_CAGE flag was not set. 6880 * EAGAIN - The requested page could not be capturead at this point in 6881 * time but future requests will likely work. 6882 * EBUSY - The requested page is retired and the CAPTURE_GET_RETIRED flag 6883 * was not set. 6884 */ 6885 int 6886 page_itrycapture(page_t *pp, uint_t szc, uint_t flags, void *datap) 6887 { 6888 int ret; 6889 int cb_index; 6890 6891 if (flags & CAPTURE_ASYNC) { 6892 ASSERT(PAGE_EXCL(pp)); 6893 goto async; 6894 } 6895 6896 /* Make sure there's enough availrmem ... 
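and that this caller is allowed to capture the page; see page_capture_pre_checks().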
*/ 6897 ret = page_capture_pre_checks(pp, flags); 6898 if (ret != 0) { 6899 return (ret); 6900 } 6901 6902 if (!page_trylock(pp, SE_EXCL)) { 6903 for (cb_index = 0; cb_index < PC_NUM_CALLBACKS; cb_index++) { 6904 if ((flags >> cb_index) & 1) { 6905 break; 6906 } 6907 } 6908 ASSERT(cb_index < PC_NUM_CALLBACKS); 6909 ret = EAGAIN; 6910 /* Special case for retired pages */ 6911 if (PP_RETIRED(pp)) { 6912 if (flags & CAPTURE_GET_RETIRED) { 6913 if (!page_unretire_pp(pp, PR_UNR_TEMP)) { 6914 /* 6915 * Need to set capture bit and add to 6916 * hash so that the page will be 6917 * retired when freed. 6918 */ 6919 page_capture_add_hash(pp, szc, 6920 CAPTURE_RETIRE, NULL); 6921 ret = 0; 6922 goto own_page; 6923 } 6924 } else { 6925 return (EBUSY); 6926 } 6927 } 6928 page_capture_add_hash(pp, szc, flags, datap); 6929 return (ret); 6930 } 6931 6932 async: 6933 ASSERT(PAGE_EXCL(pp)); 6934 6935 /* Need to check for physmem async requests that availrmem is sane */ 6936 if ((flags & (CAPTURE_ASYNC | CAPTURE_PHYSMEM)) == 6937 (CAPTURE_ASYNC | CAPTURE_PHYSMEM) && 6938 (availrmem < swapfs_minfree)) { 6939 page_unlock(pp); 6940 return (ENOMEM); 6941 } 6942 6943 ret = page_capture_clean_page(pp); 6944 6945 if (ret != 0) { 6946 /* We failed to get the page, so lets add it to the hash */ 6947 if (!(flags & CAPTURE_ASYNC)) { 6948 page_capture_add_hash(pp, szc, flags, datap); 6949 } 6950 return (ret); 6951 } 6952 6953 own_page: 6954 ASSERT(PAGE_EXCL(pp)); 6955 ASSERT(pp->p_szc == 0); 6956 6957 /* Call the callback */ 6958 ret = page_capture_take_action(pp, flags, datap); 6959 6960 if (ret == 0) { 6961 return (0); 6962 } 6963 6964 /* 6965 * Note that in the failure cases from page_capture_take_action, the 6966 * EXCL lock will have already been dropped. 6967 */ 6968 if ((ret == -1) && (!(flags & CAPTURE_ASYNC))) { 6969 page_capture_add_hash(pp, szc, flags, datap); 6970 } 6971 return (EAGAIN); 6972 } 6973 6974 int 6975 page_trycapture(page_t *pp, uint_t szc, uint_t flags, void *datap) 6976 { 6977 int ret; 6978 6979 curthread->t_flag |= T_CAPTURING; 6980 ret = page_itrycapture(pp, szc, flags, datap); 6981 curthread->t_flag &= ~T_CAPTURING; /* xor works as we know its set */ 6982 return (ret); 6983 } 6984 6985 /* 6986 * When unlocking a page which has the PR_CAPTURE bit set, this routine 6987 * gets called to try and capture the page. 6988 */ 6989 void 6990 page_unlock_capture(page_t *pp) 6991 { 6992 page_capture_hash_bucket_t *bp; 6993 int index; 6994 int i; 6995 uint_t szc; 6996 uint_t flags = 0; 6997 void *datap; 6998 kmutex_t *mp; 6999 extern vnode_t retired_pages; 7000 7001 /* 7002 * We need to protect against a possible deadlock here where we own 7003 * the vnode page hash mutex and want to acquire it again as there 7004 * are locations in the code, where we unlock a page while holding 7005 * the mutex which can lead to the page being captured and eventually 7006 * end up here. As we may be hashing out the old page and hashing into 7007 * the retire vnode, we need to make sure we don't own them. 7008 * Other callbacks who do hash operations also need to make sure that 7009 * before they hashin to a vnode that they do not currently own the 7010 * vphm mutex otherwise there will be a panic. 
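* In those cases the page is simply released via page_unlock_nocapture() and no capture attempt is made.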
7011 */ 7012 if (mutex_owned(page_vnode_mutex(&retired_pages))) { 7013 page_unlock_nocapture(pp); 7014 return; 7015 } 7016 if (pp->p_vnode != NULL && mutex_owned(page_vnode_mutex(pp->p_vnode))) { 7017 page_unlock_nocapture(pp); 7018 return; 7019 } 7020 7021 index = PAGE_CAPTURE_HASH(pp); 7022 7023 mp = &page_capture_hash[index].pchh_mutex; 7024 mutex_enter(mp); 7025 for (i = 0; i < 2; i++) { 7026 bp = page_capture_hash[index].lists[i].next; 7027 while (bp != &page_capture_hash[index].lists[i]) { 7028 if (bp->pp == pp) { 7029 szc = bp->szc; 7030 flags = bp->flags | CAPTURE_ASYNC; 7031 datap = bp->datap; 7032 mutex_exit(mp); 7033 (void) page_trycapture(pp, szc, flags, datap); 7034 return; 7035 } 7036 bp = bp->next; 7037 } 7038 } 7039 7040 /* Failed to find page in hash so clear flags and unlock it. */ 7041 page_clrtoxic(pp, PR_CAPTURE); 7042 page_unlock(pp); 7043 7044 mutex_exit(mp); 7045 } 7046 7047 void 7048 page_capture_init() 7049 { 7050 int i; 7051 for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) { 7052 page_capture_hash[i].lists[0].next = 7053 &page_capture_hash[i].lists[0]; 7054 page_capture_hash[i].lists[0].prev = 7055 &page_capture_hash[i].lists[0]; 7056 page_capture_hash[i].lists[1].next = 7057 &page_capture_hash[i].lists[1]; 7058 page_capture_hash[i].lists[1].prev = 7059 &page_capture_hash[i].lists[1]; 7060 } 7061 7062 pc_thread_shortwait = 23 * hz; 7063 pc_thread_longwait = 1201 * hz; 7064 pc_thread_retry = 3; 7065 mutex_init(&pc_thread_mutex, NULL, MUTEX_DEFAULT, NULL); 7066 cv_init(&pc_cv, NULL, CV_DEFAULT, NULL); 7067 pc_thread_id = thread_create(NULL, 0, page_capture_thread, NULL, 0, &p0, 7068 TS_RUN, minclsyspri); 7069 } 7070 7071 /* 7072 * It is necessary to scrub any failing pages prior to reboot in order to 7073 * prevent a latent error trap from occurring on the next boot. 7074 */ 7075 void 7076 page_retire_mdboot() 7077 { 7078 page_t *pp; 7079 int i, j; 7080 page_capture_hash_bucket_t *bp; 7081 7082 /* walk lists looking for pages to scrub */ 7083 for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) { 7084 if (page_capture_hash[i].num_pages == 0) 7085 continue; 7086 7087 mutex_enter(&page_capture_hash[i].pchh_mutex); 7088 7089 for (j = 0; j < 2; j++) { 7090 bp = page_capture_hash[i].lists[j].next; 7091 while (bp != &page_capture_hash[i].lists[j]) { 7092 pp = bp->pp; 7093 if (!PP_ISKAS(pp) && PP_TOXIC(pp)) { 7094 pp->p_selock = -1; /* pacify ASSERTs */ 7095 PP_CLRFREE(pp); 7096 pagescrub(pp, 0, PAGESIZE); 7097 pp->p_selock = 0; 7098 } 7099 bp = bp->next; 7100 } 7101 } 7102 mutex_exit(&page_capture_hash[i].pchh_mutex); 7103 } 7104 } 7105 7106 /* 7107 * Walk the page_capture_hash trying to capture pages and also cleanup old 7108 * entries which have expired. 
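* Expired entries are freed and their PR_CAPTURE bit cleared; this is safe without the page's EXCL lock because the bit is protected by the bucket's pchh_mutex.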
7109 */ 7110 void 7111 page_capture_async() 7112 { 7113 page_t *pp; 7114 int i; 7115 int ret; 7116 page_capture_hash_bucket_t *bp1, *bp2; 7117 uint_t szc; 7118 uint_t flags; 7119 void *datap; 7120 7121 /* If there are outstanding pages to be captured, get to work */ 7122 for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) { 7123 if (page_capture_hash[i].num_pages == 0) 7124 continue; 7125 /* Append list 1 to list 0 and then walk through list 0 */ 7126 mutex_enter(&page_capture_hash[i].pchh_mutex); 7127 bp1 = &page_capture_hash[i].lists[1]; 7128 bp2 = bp1->next; 7129 if (bp1 != bp2) { 7130 bp1->prev->next = page_capture_hash[i].lists[0].next; 7131 bp2->prev = &page_capture_hash[i].lists[0]; 7132 page_capture_hash[i].lists[0].next->prev = bp1->prev; 7133 page_capture_hash[i].lists[0].next = bp2; 7134 bp1->next = bp1; 7135 bp1->prev = bp1; 7136 } 7137 7138 /* list[1] will be empty now */ 7139 7140 bp1 = page_capture_hash[i].lists[0].next; 7141 while (bp1 != &page_capture_hash[i].lists[0]) { 7142 /* Check expiration time */ 7143 if ((lbolt > bp1->expires && bp1->expires != -1) || 7144 page_deleted(bp1->pp)) { 7145 page_capture_hash[i].lists[0].next = bp1->next; 7146 bp1->next->prev = 7147 &page_capture_hash[i].lists[0]; 7148 page_capture_hash[i].num_pages--; 7149 7150 /* 7151 * We can safely remove the PR_CAPTURE bit 7152 * without holding the EXCL lock on the page 7153 * as the PR_CAPTURE bit requires that the 7154 * page_capture_hash[].pchh_mutex be held 7155 * to modify it. 7156 */ 7157 page_clrtoxic(bp1->pp, PR_CAPTURE); 7158 mutex_exit(&page_capture_hash[i].pchh_mutex); 7159 kmem_free(bp1, sizeof (*bp1)); 7160 mutex_enter(&page_capture_hash[i].pchh_mutex); 7161 bp1 = page_capture_hash[i].lists[0].next; 7162 continue; 7163 } 7164 pp = bp1->pp; 7165 szc = bp1->szc; 7166 flags = bp1->flags; 7167 datap = bp1->datap; 7168 mutex_exit(&page_capture_hash[i].pchh_mutex); 7169 if (page_trylock(pp, SE_EXCL)) { 7170 ret = page_trycapture(pp, szc, 7171 flags | CAPTURE_ASYNC, datap); 7172 } else { 7173 ret = 1; /* move to walked hash */ 7174 } 7175 7176 if (ret != 0) { 7177 /* Move to walked hash */ 7178 (void) page_capture_move_to_walked(pp); 7179 } 7180 mutex_enter(&page_capture_hash[i].pchh_mutex); 7181 bp1 = page_capture_hash[i].lists[0].next; 7182 } 7183 7184 mutex_exit(&page_capture_hash[i].pchh_mutex); 7185 } 7186 } 7187 7188 /* 7189 * This function is called by the page_capture_thread, and is needed 7190 * in order to initiate aio cleanup, so that pages used in aio 7191 * will be unlocked and subsequently retired by page_capture_thread. 7192 */ 7193 static int 7194 do_aio_cleanup(void) 7195 { 7196 proc_t *procp; 7197 int (*aio_cleanup_dr_delete_memory)(proc_t *); 7198 int cleaned = 0; 7199 7200 if (modload("sys", "kaio") == -1) { 7201 cmn_err(CE_WARN, "do_aio_cleanup: cannot load kaio"); 7202 return (0); 7203 } 7204 /* 7205 * We use the aio_cleanup_dr_delete_memory function to 7206 * initiate the actual clean up; this function will wake 7207 * up the per-process aio_cleanup_thread.
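* The symbol is resolved with modgetsymvalue() so that this file does not depend statically on the kaio module.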
7208 */ 7209 aio_cleanup_dr_delete_memory = (int (*)(proc_t *)) 7210 modgetsymvalue("aio_cleanup_dr_delete_memory", 0); 7211 if (aio_cleanup_dr_delete_memory == NULL) { 7212 cmn_err(CE_WARN, 7213 "aio_cleanup_dr_delete_memory not found in kaio"); 7214 return (0); 7215 } 7216 mutex_enter(&pidlock); 7217 for (procp = practive; (procp != NULL); procp = procp->p_next) { 7218 mutex_enter(&procp->p_lock); 7219 if (procp->p_aio != NULL) { 7220 /* cleanup proc's outstanding kaio */ 7221 cleaned += (*aio_cleanup_dr_delete_memory)(procp); 7222 } 7223 mutex_exit(&procp->p_lock); 7224 } 7225 mutex_exit(&pidlock); 7226 return (cleaned); 7227 } 7228 7229 /* 7230 * helper function for page_capture_thread 7231 */ 7232 static void 7233 page_capture_handle_outstanding(void) 7234 { 7235 int ntry; 7236 7237 if (!page_retire_pend_count()) { 7238 /* 7239 * Do we really want to be this aggressive 7240 * for things other than page_retire? 7241 * Maybe have a counter for each callback 7242 * type to guide how aggressive we should 7243 * be here. Thus if there's at least one 7244 * page for page_retire we go ahead and reap 7245 * like this. 7246 */ 7247 kmem_reap(); 7248 seg_preap(); 7249 page_capture_async(); 7250 } else { 7251 /* 7252 * There are pages pending retirement, so 7253 * we reap prior to attempting to capture. 7254 */ 7255 kmem_reap(); 7256 7257 /* disable and purge seg_pcache */ 7258 (void) seg_p_disable(); 7259 for (ntry = 0; ntry < pc_thread_retry; ntry++) { 7260 if (!page_retire_pend_count()) 7261 break; 7262 if (do_aio_cleanup()) { 7263 /* 7264 * allow the apps cleanup threads 7265 * to run 7266 */ 7267 delay(pc_thread_shortwait); 7268 } 7269 page_capture_async(); 7270 } 7271 /* reenable seg_pcache */ 7272 seg_p_enable(); 7273 } 7274 } 7275 7276 /* 7277 * The page_capture_thread loops forever, looking to see if there are 7278 * pages still waiting to be captured. 7279 */ 7280 static void 7281 page_capture_thread(void) 7282 { 7283 callb_cpr_t c; 7284 int outstanding; 7285 int i; 7286 7287 CALLB_CPR_INIT(&c, &pc_thread_mutex, callb_generic_cpr, "page_capture"); 7288 7289 mutex_enter(&pc_thread_mutex); 7290 for (;;) { 7291 outstanding = 0; 7292 for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) 7293 outstanding += page_capture_hash[i].num_pages; 7294 if (outstanding) { 7295 page_capture_handle_outstanding(); 7296 CALLB_CPR_SAFE_BEGIN(&c); 7297 (void) cv_timedwait(&pc_cv, &pc_thread_mutex, 7298 lbolt + pc_thread_shortwait); 7299 CALLB_CPR_SAFE_END(&c, &pc_thread_mutex); 7300 } else { 7301 CALLB_CPR_SAFE_BEGIN(&c); 7302 (void) cv_timedwait(&pc_cv, &pc_thread_mutex, 7303 lbolt + pc_thread_longwait); 7304 CALLB_CPR_SAFE_END(&c, &pc_thread_mutex); 7305 } 7306 } 7307 /*NOTREACHED*/ 7308 } 7309 /* 7310 * Attempt to locate a bucket that has enough pages to satisfy the request. 7311 * The initial check is done without the lock to avoid unneeded contention. 7312 * The function returns 1 if enough pages were found, else 0 if it could not 7313 * find enough pages in a bucket. 7314 */ 7315 static int 7316 pcf_decrement_bucket(pgcnt_t npages) 7317 { 7318 struct pcf *p; 7319 struct pcf *q; 7320 int i; 7321 7322 p = &pcf[PCF_INDEX()]; 7323 q = &pcf[pcf_fanout]; 7324 for (i = 0; i < pcf_fanout; i++) { 7325 if (p->pcf_count > npages) { 7326 /* 7327 * a good one to try. 7328 */ 7329 mutex_enter(&p->pcf_lock); 7330 if (p->pcf_count > npages) { 7331 p->pcf_count -= (uint_t)npages; 7332 /* 7333 * freemem is not protected by any lock. 7334 * Thus, we cannot have any assertion 7335 * containing freemem here. 
7336 */ 7337 freemem -= npages; 7338 mutex_exit(&p->pcf_lock); 7339 return (1); 7340 } 7341 mutex_exit(&p->pcf_lock); 7342 } 7343 p++; 7344 if (p >= q) { 7345 p = pcf; 7346 } 7347 } 7348 return (0); 7349 } 7350 7351 /* 7352 * Arguments: 7353 * pcftotal_ret: If the value is not NULL and we have walked all the 7354 * buckets but did not find enough pages then it will 7355 * be set to the total number of pages in all the pcf 7356 * buckets. 7357 * npages: Is the number of pages we have been requested to 7358 * find. 7359 * unlock: If set to 0 we will leave the buckets locked if the 7360 * requested number of pages are not found. 7361 * 7362 * Go and try to satisfy the page request from any number of buckets. 7363 * This can be a very expensive operation as we have to lock the buckets 7364 * we are checking (and keep them locked), starting at bucket 0. 7365 * 7366 * The function returns 1 if enough pages were found, else 0 if it could not 7367 * find enough pages in the buckets. 7368 * 7369 */ 7370 static int 7371 pcf_decrement_multiple(pgcnt_t *pcftotal_ret, pgcnt_t npages, int unlock) 7372 { 7373 struct pcf *p; 7374 pgcnt_t pcftotal; 7375 int i; 7376 7377 p = pcf; 7378 /* try to collect pages from several pcf bins */ 7379 for (pcftotal = 0, i = 0; i < pcf_fanout; i++) { 7380 mutex_enter(&p->pcf_lock); 7381 pcftotal += p->pcf_count; 7382 if (pcftotal >= npages) { 7383 /* 7384 * Wow! There are enough pages laying around 7385 * to satisfy the request. Do the accounting, 7386 * drop the locks we acquired, and go back. 7387 * 7388 * freemem is not protected by any lock. So, 7389 * we cannot have any assertion containing 7390 * freemem. 7391 */ 7392 freemem -= npages; 7393 while (p >= pcf) { 7394 if (p->pcf_count <= npages) { 7395 npages -= p->pcf_count; 7396 p->pcf_count = 0; 7397 } else { 7398 p->pcf_count -= (uint_t)npages; 7399 npages = 0; 7400 } 7401 mutex_exit(&p->pcf_lock); 7402 p--; 7403 } 7404 ASSERT(npages == 0); 7405 return (1); 7406 } 7407 p++; 7408 } 7409 if (unlock) { 7410 /* failed to collect pages - release the locks */ 7411 while (--p >= pcf) { 7412 mutex_exit(&p->pcf_lock); 7413 } 7414 } 7415 if (pcftotal_ret != NULL) 7416 *pcftotal_ret = pcftotal; 7417 return (0); 7418 } 7419
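/*
 * Illustrative sketch only (hypothetical helper, not part of this file's
 * interfaces): the intended calling pattern for the two routines above is to
 * try the cheap single-bucket path first and only fall back to draining
 * several buckets, which locks them front-to-back, when that fails.
 *
 *	static int
 *	pcf_try_reserve(pgcnt_t npages)
 *	{
 *		pgcnt_t total;
 *
 *		if (pcf_decrement_bucket(npages))
 *			return (1);	(fast path: one bucket had enough)
 *		(slow path: walk all buckets, dropping the locks on failure)
 *		return (pcf_decrement_multiple(&total, npages, 1));
 *	}
 */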