/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989  AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

/*
 * VM - physical page management.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/vnode.h>
#include <sys/vm.h>
#include <sys/vtrace.h>
#include <sys/swap.h>
#include <sys/cmn_err.h>
#include <sys/tuneable.h>
#include <sys/sysmacros.h>
#include <sys/cpuvar.h>
#include <sys/callb.h>
#include <sys/debug.h>
#include <sys/tnf_probe.h>
#include <sys/condvar_impl.h>
#include <sys/mem_config.h>
#include <sys/mem_cage.h>
#include <sys/kmem.h>
#include <sys/atomic.h>
#include <sys/strlog.h>
#include <sys/mman.h>
#include <sys/ontrap.h>
#include <sys/lgrp.h>
#include <sys/vfs.h>

#include <vm/hat.h>
#include <vm/anon.h>
#include <vm/page.h>
#include <vm/seg.h>
#include <vm/pvn.h>
#include <vm/seg_kmem.h>
#include <vm/vm_dep.h>
#include <sys/vm_usage.h>
#include <fs/fs_subr.h>
#include <sys/ddi.h>
#include <sys/modctl.h>

static pgcnt_t max_page_get;	/* max page_get request size in pages */
pgcnt_t total_pages = 0;	/* total number of pages (used by /proc) */

/*
 * freemem_lock protects all freemem variables:
 * availrmem.  Also this lock protects the globals which track the
 * availrmem changes for accurate kernel footprint calculation.
 * See below for an explanation of these globals.
 */
kmutex_t freemem_lock;
pgcnt_t availrmem;
pgcnt_t availrmem_initial;

/*
 * These globals track availrmem changes to get a more accurate
 * estimate of the kernel size.  Historically pp_kernel is used for
 * kernel size and is based on availrmem.  But availrmem is adjusted for
 * locked pages in the system not just for kernel locked pages.
 * These new counters will track the pages locked through segvn and
 * by explicit user locking.
 *
 * pages_locked : How many pages are locked because of user specified
 * locking through mlock or plock.
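 *
 * For example, a caller that locks down npages pages on behalf of
 * mlock() is expected to update the accounting roughly as follows
 * (a sketch of the locking pattern, not a quote of any particular
 * caller):
 *
 *	mutex_enter(&freemem_lock);
 *	availrmem -= npages;
 *	pages_locked += npages;
 *	mutex_exit(&freemem_lock);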
 *
 * pages_useclaim, pages_claimed : These two variables track the
 * claim adjustments because of the protection changes on a segvn segment.
 *
 * All these globals are protected by the same lock which protects availrmem.
 */
pgcnt_t pages_locked = 0;
pgcnt_t pages_useclaim = 0;
pgcnt_t pages_claimed = 0;


/*
 * new_freemem_lock protects freemem, freemem_wait & freemem_cv.
 */
static kmutex_t	new_freemem_lock;
static uint_t	freemem_wait;	/* someone waiting for freemem */
static kcondvar_t freemem_cv;

/*
 * The logical page free list is maintained as two lists, the 'free'
 * and the 'cache' lists.
 * The free list contains those pages that should be reused first.
 *
 * The implementation of the lists is machine dependent.
 * page_get_freelist(), page_get_cachelist(),
 * page_list_sub(), and page_list_add()
 * form the interface to the machine dependent implementation.
 *
 * Pages with p_free set are on the cache list.
 * Pages with p_free and p_age set are on the free list.
 *
 * A page may be locked while on either list.
 */

/*
 * free list accounting stuff.
 *
 *
 * Spread out the value for the number of pages on the
 * page free and page cache lists.  If there is just one
 * value, then it must be under just one lock.
 * The lock contention and cache traffic are a real bother.
 *
 * When we acquire and then drop a single pcf lock
 * we can start in the middle of the array of pcf structures.
 * If we acquire more than one pcf lock at a time, we need to
 * start at the front to avoid deadlocking.
 *
 * pcf_count holds the number of pages in each pool.
 *
 * pcf_block is set when page_create_get_something() has asked the
 * PSM page freelist and page cachelist routines without specifying
 * a color and nothing came back.  This is used to block anything
 * else from moving pages from one list to the other while the
 * lists are searched again.  If a page is freed while pcf_block is
 * set, then pcf_reserve is incremented.  pcgs_unblock() takes care
 * of clearing pcf_block, doing the wakeups, etc.
 */

#define	MAX_PCF_FANOUT NCPU
static uint_t	pcf_fanout = 1; /* Will get changed at boot time */
static uint_t	pcf_fanout_mask = 0;

struct pcf {
	kmutex_t	pcf_lock;	/* protects the structure */
	uint_t		pcf_count;	/* page count */
	uint_t		pcf_wait;	/* number of waiters */
	uint_t		pcf_block;	/* pcgs flag to page_free() */
	uint_t		pcf_reserve;	/* pages freed after pcf_block set */
	uint_t		pcf_fill[10];	/* to line up on the caches */
};

/*
 * PCF_INDEX hash needs to be dynamic (every so often the hash changes where
 * it will hash the cpu to).  This is done to prevent a drain condition
 * from happening.  This drain condition will occur when pcf_count decrement
 * occurs on cpu A and the increment of pcf_count always occurs on cpu B.  An
 * example of this shows up with device interrupts.  The dma buffer is
 * allocated by the cpu requesting the IO thus the pcf_count is decremented
 * based on that.  When the memory is returned by the interrupt thread, the
 * pcf_count will be incremented based on the cpu servicing the interrupt.
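 *
 * In that scenario the bucket tied to cpu A drains steadily while the
 * bucket tied to cpu B only ever grows, even though the system as a
 * whole has plenty of free pages.  Mixing a few bits of randtick() into
 * the index keeps the bucket choice from being a fixed function of the
 * cpu.  For illustration, the PCF_INDEX() computation below reduces to
 * (assuming pcf_fanout_mask == 7):
 *
 *	bucket = (CPU->cpu_seqid + (randtick() >> 24)) & 7;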
187 */ 188 static struct pcf pcf[MAX_PCF_FANOUT]; 189 #define PCF_INDEX() ((int)(((long)CPU->cpu_seqid) + \ 190 (randtick() >> 24)) & (pcf_fanout_mask)) 191 192 static int pcf_decrement_bucket(pgcnt_t); 193 static int pcf_decrement_multiple(pgcnt_t *, pgcnt_t, int); 194 195 kmutex_t pcgs_lock; /* serializes page_create_get_ */ 196 kmutex_t pcgs_cagelock; /* serializes NOSLEEP cage allocs */ 197 kmutex_t pcgs_wait_lock; /* used for delay in pcgs */ 198 static kcondvar_t pcgs_cv; /* cv for delay in pcgs */ 199 200 #ifdef VM_STATS 201 202 /* 203 * No locks, but so what, they are only statistics. 204 */ 205 206 static struct page_tcnt { 207 int pc_free_cache; /* free's into cache list */ 208 int pc_free_dontneed; /* free's with dontneed */ 209 int pc_free_pageout; /* free's from pageout */ 210 int pc_free_free; /* free's into free list */ 211 int pc_free_pages; /* free's into large page free list */ 212 int pc_destroy_pages; /* large page destroy's */ 213 int pc_get_cache; /* get's from cache list */ 214 int pc_get_free; /* get's from free list */ 215 int pc_reclaim; /* reclaim's */ 216 int pc_abortfree; /* abort's of free pages */ 217 int pc_find_hit; /* find's that find page */ 218 int pc_find_miss; /* find's that don't find page */ 219 int pc_destroy_free; /* # of free pages destroyed */ 220 #define PC_HASH_CNT (4*PAGE_HASHAVELEN) 221 int pc_find_hashlen[PC_HASH_CNT+1]; 222 int pc_addclaim_pages; 223 int pc_subclaim_pages; 224 int pc_free_replacement_page[2]; 225 int pc_try_demote_pages[6]; 226 int pc_demote_pages[2]; 227 } pagecnt; 228 229 uint_t hashin_count; 230 uint_t hashin_not_held; 231 uint_t hashin_already; 232 233 uint_t hashout_count; 234 uint_t hashout_not_held; 235 236 uint_t page_create_count; 237 uint_t page_create_not_enough; 238 uint_t page_create_not_enough_again; 239 uint_t page_create_zero; 240 uint_t page_create_hashout; 241 uint_t page_create_page_lock_failed; 242 uint_t page_create_trylock_failed; 243 uint_t page_create_found_one; 244 uint_t page_create_hashin_failed; 245 uint_t page_create_dropped_phm; 246 247 uint_t page_create_new; 248 uint_t page_create_exists; 249 uint_t page_create_putbacks; 250 uint_t page_create_overshoot; 251 252 uint_t page_reclaim_zero; 253 uint_t page_reclaim_zero_locked; 254 255 uint_t page_rename_exists; 256 uint_t page_rename_count; 257 258 uint_t page_lookup_cnt[20]; 259 uint_t page_lookup_nowait_cnt[10]; 260 uint_t page_find_cnt; 261 uint_t page_exists_cnt; 262 uint_t page_exists_forreal_cnt; 263 uint_t page_lookup_dev_cnt; 264 uint_t get_cachelist_cnt; 265 uint_t page_create_cnt[10]; 266 uint_t alloc_pages[9]; 267 uint_t page_exphcontg[19]; 268 uint_t page_create_large_cnt[10]; 269 270 /* 271 * Collects statistics. 
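 *
 * PAGE_HASH_SEARCH() walks the page_hash[] chain for [vp, off] and, in
 * this VM_STATS variant, also records hit/miss counts and the chain
 * length in pagecnt.  The typical calling pattern, e.g. as used by
 * page_exists() below:
 *
 *	index = PAGE_HASH_FUNC(vp, off);
 *	PAGE_HASH_SEARCH(index, pp, vp, off);
 *	(pp is now the page found, or NULL)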
 */
#define	PAGE_HASH_SEARCH(index, pp, vp, off) { \
	uint_t	mylen = 0; \
			\
	for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash, mylen++) { \
		if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \
			break; \
	} \
	if ((pp) != NULL) \
		pagecnt.pc_find_hit++; \
	else \
		pagecnt.pc_find_miss++; \
	if (mylen > PC_HASH_CNT) \
		mylen = PC_HASH_CNT; \
	pagecnt.pc_find_hashlen[mylen]++; \
}

#else	/* VM_STATS */

/*
 * Don't collect statistics
 */
#define	PAGE_HASH_SEARCH(index, pp, vp, off) { \
	for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash) { \
		if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \
			break; \
	} \
}

#endif	/* VM_STATS */



#ifdef DEBUG
#define	MEMSEG_SEARCH_STATS
#endif

#ifdef MEMSEG_SEARCH_STATS
struct memseg_stats {
	uint_t nsearch;
	uint_t nlastwon;
	uint_t nhashwon;
	uint_t nnotfound;
} memseg_stats;

#define	MEMSEG_STAT_INCR(v) \
	atomic_inc_32(&memseg_stats.v)
#else
#define	MEMSEG_STAT_INCR(x)
#endif

struct memseg *memsegs;		/* list of memory segments */

/*
 * /etc/system tunable to control large page allocation heuristic.
 *
 * Setting to LPAP_LOCAL will heavily prefer the local lgroup over remote lgroup
 * for large page allocation requests.  If a large page is not readily
 * available on the local freelists we will go through additional effort
 * to create a large page, potentially moving smaller pages around to coalesce
 * larger pages in the local lgroup.
 * Default value of LPAP_DEFAULT will go to remote freelists if large pages
 * are not readily available in the local lgroup.
 */
enum lpap {
	LPAP_DEFAULT,	/* default large page allocation policy */
	LPAP_LOCAL	/* local large page allocation policy */
};

enum lpap lpg_alloc_prefer = LPAP_DEFAULT;

static void page_init_mem_config(void);
static int page_do_hashin(page_t *, vnode_t *, u_offset_t);
static void page_do_hashout(page_t *);
static void page_capture_init();
int page_capture_take_action(page_t *, uint_t, void *);

static void page_demote_vp_pages(page_t *);


void
pcf_init(void)
{
	if (boot_ncpus != -1) {
		pcf_fanout = boot_ncpus;
	} else {
		pcf_fanout = max_ncpus;
	}
#ifdef	sun4v
	/*
	 * Force at least 4 buckets if possible for sun4v.
	 */
	pcf_fanout = MAX(pcf_fanout, 4);
#endif /* sun4v */

	/*
	 * Round up to the nearest power of 2.
	 */
	pcf_fanout = MIN(pcf_fanout, MAX_PCF_FANOUT);
	if (!ISP2(pcf_fanout)) {
		pcf_fanout = 1 << highbit(pcf_fanout);

		if (pcf_fanout > MAX_PCF_FANOUT) {
			pcf_fanout = 1 << (highbit(MAX_PCF_FANOUT) - 1);
		}
	}
	pcf_fanout_mask = pcf_fanout - 1;
}

/*
 * vm subsystem related initialization
 */
void
vm_init(void)
{
	boolean_t callb_vm_cpr(void *, int);

	(void) callb_add(callb_vm_cpr, 0, CB_CL_CPR_VM, "vm");
	page_init_mem_config();
	page_retire_init();
	vm_usage_init();
	page_capture_init();
}

/*
 * This function is called at startup and when memory is added or deleted.
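 *
 * Unless preserved from an /etc/system setting, pages_pp_maximum ends up
 * as roughly 4% of the memory available after startup plus 4MB, with a
 * floor of tune.t_minarmem + 100 pages.  For example (illustrative
 * arithmetic only, assuming 4K pages and 1,000,000 pages of availrmem
 * just after startup):
 *
 *	pages_pp_maximum = 1000000 / 25 + btop(4MB)
 *			 = 40000 + 1024 = 41024 pages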
399 */ 400 void 401 init_pages_pp_maximum() 402 { 403 static pgcnt_t p_min; 404 static pgcnt_t pages_pp_maximum_startup; 405 static pgcnt_t avrmem_delta; 406 static int init_done; 407 static int user_set; /* true if set in /etc/system */ 408 409 if (init_done == 0) { 410 411 /* If the user specified a value, save it */ 412 if (pages_pp_maximum != 0) { 413 user_set = 1; 414 pages_pp_maximum_startup = pages_pp_maximum; 415 } 416 417 /* 418 * Setting of pages_pp_maximum is based first time 419 * on the value of availrmem just after the start-up 420 * allocations. To preserve this relationship at run 421 * time, use a delta from availrmem_initial. 422 */ 423 ASSERT(availrmem_initial >= availrmem); 424 avrmem_delta = availrmem_initial - availrmem; 425 426 /* The allowable floor of pages_pp_maximum */ 427 p_min = tune.t_minarmem + 100; 428 429 /* Make sure we don't come through here again. */ 430 init_done = 1; 431 } 432 /* 433 * Determine pages_pp_maximum, the number of currently available 434 * pages (availrmem) that can't be `locked'. If not set by 435 * the user, we set it to 4% of the currently available memory 436 * plus 4MB. 437 * But we also insist that it be greater than tune.t_minarmem; 438 * otherwise a process could lock down a lot of memory, get swapped 439 * out, and never have enough to get swapped back in. 440 */ 441 if (user_set) 442 pages_pp_maximum = pages_pp_maximum_startup; 443 else 444 pages_pp_maximum = ((availrmem_initial - avrmem_delta) / 25) 445 + btop(4 * 1024 * 1024); 446 447 if (pages_pp_maximum <= p_min) { 448 pages_pp_maximum = p_min; 449 } 450 } 451 452 void 453 set_max_page_get(pgcnt_t target_total_pages) 454 { 455 max_page_get = target_total_pages / 2; 456 } 457 458 static pgcnt_t pending_delete; 459 460 /*ARGSUSED*/ 461 static void 462 page_mem_config_post_add( 463 void *arg, 464 pgcnt_t delta_pages) 465 { 466 set_max_page_get(total_pages - pending_delete); 467 init_pages_pp_maximum(); 468 } 469 470 /*ARGSUSED*/ 471 static int 472 page_mem_config_pre_del( 473 void *arg, 474 pgcnt_t delta_pages) 475 { 476 pgcnt_t nv; 477 478 nv = atomic_add_long_nv(&pending_delete, (spgcnt_t)delta_pages); 479 set_max_page_get(total_pages - nv); 480 return (0); 481 } 482 483 /*ARGSUSED*/ 484 static void 485 page_mem_config_post_del( 486 void *arg, 487 pgcnt_t delta_pages, 488 int cancelled) 489 { 490 pgcnt_t nv; 491 492 nv = atomic_add_long_nv(&pending_delete, -(spgcnt_t)delta_pages); 493 set_max_page_get(total_pages - nv); 494 if (!cancelled) 495 init_pages_pp_maximum(); 496 } 497 498 static kphysm_setup_vector_t page_mem_config_vec = { 499 KPHYSM_SETUP_VECTOR_VERSION, 500 page_mem_config_post_add, 501 page_mem_config_pre_del, 502 page_mem_config_post_del, 503 }; 504 505 static void 506 page_init_mem_config(void) 507 { 508 int ret; 509 510 ret = kphysm_setup_func_register(&page_mem_config_vec, (void *)NULL); 511 ASSERT(ret == 0); 512 } 513 514 /* 515 * Evenly spread out the PCF counters for large free pages 516 */ 517 static void 518 page_free_large_ctr(pgcnt_t npages) 519 { 520 static struct pcf *p = pcf; 521 pgcnt_t lump; 522 523 freemem += npages; 524 525 lump = roundup(npages, pcf_fanout) / pcf_fanout; 526 527 while (npages > 0) { 528 529 ASSERT(!p->pcf_block); 530 531 if (lump < npages) { 532 p->pcf_count += (uint_t)lump; 533 npages -= lump; 534 } else { 535 p->pcf_count += (uint_t)npages; 536 npages = 0; 537 } 538 539 ASSERT(!p->pcf_wait); 540 541 if (++p > &pcf[pcf_fanout - 1]) 542 p = pcf; 543 } 544 545 ASSERT(npages == 0); 546 } 547 548 /* 549 * Add a physical chunk of 
memory to the system free lists during startup. 550 * Platform specific startup() allocates the memory for the page structs. 551 * 552 * num - number of page structures 553 * base - page number (pfn) to be associated with the first page. 554 * 555 * Since we are doing this during startup (ie. single threaded), we will 556 * use shortcut routines to avoid any locking overhead while putting all 557 * these pages on the freelists. 558 * 559 * NOTE: Any changes performed to page_free(), must also be performed to 560 * add_physmem() since this is how we initialize all page_t's at 561 * boot time. 562 */ 563 void 564 add_physmem( 565 page_t *pp, 566 pgcnt_t num, 567 pfn_t pnum) 568 { 569 page_t *root = NULL; 570 uint_t szc = page_num_pagesizes() - 1; 571 pgcnt_t large = page_get_pagecnt(szc); 572 pgcnt_t cnt = 0; 573 574 TRACE_2(TR_FAC_VM, TR_PAGE_INIT, 575 "add_physmem:pp %p num %lu", pp, num); 576 577 /* 578 * Arbitrarily limit the max page_get request 579 * to 1/2 of the page structs we have. 580 */ 581 total_pages += num; 582 set_max_page_get(total_pages); 583 584 PLCNT_MODIFY_MAX(pnum, (long)num); 585 586 /* 587 * The physical space for the pages array 588 * representing ram pages has already been 589 * allocated. Here we initialize each lock 590 * in the page structure, and put each on 591 * the free list 592 */ 593 for (; num; pp++, pnum++, num--) { 594 595 /* 596 * this needs to fill in the page number 597 * and do any other arch specific initialization 598 */ 599 add_physmem_cb(pp, pnum); 600 601 pp->p_lckcnt = 0; 602 pp->p_cowcnt = 0; 603 pp->p_slckcnt = 0; 604 605 /* 606 * Initialize the page lock as unlocked, since nobody 607 * can see or access this page yet. 608 */ 609 pp->p_selock = 0; 610 611 /* 612 * Initialize IO lock 613 */ 614 page_iolock_init(pp); 615 616 /* 617 * initialize other fields in the page_t 618 */ 619 PP_SETFREE(pp); 620 page_clr_all_props(pp); 621 PP_SETAGED(pp); 622 pp->p_offset = (u_offset_t)-1; 623 pp->p_next = pp; 624 pp->p_prev = pp; 625 626 /* 627 * Simple case: System doesn't support large pages. 628 */ 629 if (szc == 0) { 630 pp->p_szc = 0; 631 page_free_at_startup(pp); 632 continue; 633 } 634 635 /* 636 * Handle unaligned pages, we collect them up onto 637 * the root page until we have a full large page. 638 */ 639 if (!IS_P2ALIGNED(pnum, large)) { 640 641 /* 642 * If not in a large page, 643 * just free as small page. 644 */ 645 if (root == NULL) { 646 pp->p_szc = 0; 647 page_free_at_startup(pp); 648 continue; 649 } 650 651 /* 652 * Link a constituent page into the large page. 653 */ 654 pp->p_szc = szc; 655 page_list_concat(&root, &pp); 656 657 /* 658 * When large page is fully formed, free it. 659 */ 660 if (++cnt == large) { 661 page_free_large_ctr(cnt); 662 page_list_add_pages(root, PG_LIST_ISINIT); 663 root = NULL; 664 cnt = 0; 665 } 666 continue; 667 } 668 669 /* 670 * At this point we have a page number which 671 * is aligned. We assert that we aren't already 672 * in a different large page. 673 */ 674 ASSERT(IS_P2ALIGNED(pnum, large)); 675 ASSERT(root == NULL && cnt == 0); 676 677 /* 678 * If insufficient number of pages left to form 679 * a large page, just free the small page. 680 */ 681 if (num < large) { 682 pp->p_szc = 0; 683 page_free_at_startup(pp); 684 continue; 685 } 686 687 /* 688 * Otherwise start a new large page. 689 */ 690 pp->p_szc = szc; 691 cnt++; 692 root = pp; 693 } 694 ASSERT(root == NULL && cnt == 0); 695 } 696 697 /* 698 * Find a page representing the specified [vp, offset]. 
699 * If we find the page but it is intransit coming in, 700 * it will have an "exclusive" lock and we wait for 701 * the i/o to complete. A page found on the free list 702 * is always reclaimed and then locked. On success, the page 703 * is locked, its data is valid and it isn't on the free 704 * list, while a NULL is returned if the page doesn't exist. 705 */ 706 page_t * 707 page_lookup(vnode_t *vp, u_offset_t off, se_t se) 708 { 709 return (page_lookup_create(vp, off, se, NULL, NULL, 0)); 710 } 711 712 /* 713 * Find a page representing the specified [vp, offset]. 714 * We either return the one we found or, if passed in, 715 * create one with identity of [vp, offset] of the 716 * pre-allocated page. If we find existing page but it is 717 * intransit coming in, it will have an "exclusive" lock 718 * and we wait for the i/o to complete. A page found on 719 * the free list is always reclaimed and then locked. 720 * On success, the page is locked, its data is valid and 721 * it isn't on the free list, while a NULL is returned 722 * if the page doesn't exist and newpp is NULL; 723 */ 724 page_t * 725 page_lookup_create( 726 vnode_t *vp, 727 u_offset_t off, 728 se_t se, 729 page_t *newpp, 730 spgcnt_t *nrelocp, 731 int flags) 732 { 733 page_t *pp; 734 kmutex_t *phm; 735 ulong_t index; 736 uint_t hash_locked; 737 uint_t es; 738 739 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); 740 VM_STAT_ADD(page_lookup_cnt[0]); 741 ASSERT(newpp ? PAGE_EXCL(newpp) : 1); 742 743 /* 744 * Acquire the appropriate page hash lock since 745 * we have to search the hash list. Pages that 746 * hash to this list can't change identity while 747 * this lock is held. 748 */ 749 hash_locked = 0; 750 index = PAGE_HASH_FUNC(vp, off); 751 phm = NULL; 752 top: 753 PAGE_HASH_SEARCH(index, pp, vp, off); 754 if (pp != NULL) { 755 VM_STAT_ADD(page_lookup_cnt[1]); 756 es = (newpp != NULL) ? 1 : 0; 757 es |= flags; 758 if (!hash_locked) { 759 VM_STAT_ADD(page_lookup_cnt[2]); 760 if (!page_try_reclaim_lock(pp, se, es)) { 761 /* 762 * On a miss, acquire the phm. Then 763 * next time, page_lock() will be called, 764 * causing a wait if the page is busy. 765 * just looping with page_trylock() would 766 * get pretty boring. 767 */ 768 VM_STAT_ADD(page_lookup_cnt[3]); 769 phm = PAGE_HASH_MUTEX(index); 770 mutex_enter(phm); 771 hash_locked = 1; 772 goto top; 773 } 774 } else { 775 VM_STAT_ADD(page_lookup_cnt[4]); 776 if (!page_lock_es(pp, se, phm, P_RECLAIM, es)) { 777 VM_STAT_ADD(page_lookup_cnt[5]); 778 goto top; 779 } 780 } 781 782 /* 783 * Since `pp' is locked it can not change identity now. 784 * Reconfirm we locked the correct page. 785 * 786 * Both the p_vnode and p_offset *must* be cast volatile 787 * to force a reload of their values: The PAGE_HASH_SEARCH 788 * macro will have stuffed p_vnode and p_offset into 789 * registers before calling page_trylock(); another thread, 790 * actually holding the hash lock, could have changed the 791 * page's identity in memory, but our registers would not 792 * be changed, fooling the reconfirmation. If the hash 793 * lock was held during the search, the casting would 794 * not be needed. 
		 */
		VM_STAT_ADD(page_lookup_cnt[6]);
		if (((volatile struct vnode *)(pp->p_vnode) != vp) ||
		    ((volatile u_offset_t)(pp->p_offset) != off)) {
			VM_STAT_ADD(page_lookup_cnt[7]);
			if (hash_locked) {
				panic("page_lookup_create: lost page %p",
				    (void *)pp);
				/*NOTREACHED*/
			}
			page_unlock(pp);
			phm = PAGE_HASH_MUTEX(index);
			mutex_enter(phm);
			hash_locked = 1;
			goto top;
		}

		/*
		 * If page_trylock() was called, then pp may still be on
		 * the cachelist (can't be on the free list, it would not
		 * have been found in the search).  If it is on the
		 * cachelist it must be pulled now.  To pull the page from
		 * the cachelist, it must be exclusively locked.
		 *
		 * The other big difference between page_trylock() and
		 * page_lock(), is that page_lock() will pull the
		 * page from whatever free list (the cache list in this
		 * case) the page is on.  If page_trylock() was used
		 * above, then we have to do the reclaim ourselves.
		 */
		if ((!hash_locked) && (PP_ISFREE(pp))) {
			ASSERT(PP_ISAGED(pp) == 0);
			VM_STAT_ADD(page_lookup_cnt[8]);

			/*
			 * page_reclaim() will ensure that we
			 * have this page exclusively
			 */

			if (!page_reclaim(pp, NULL)) {
				/*
				 * Page_reclaim dropped whatever lock
				 * we held.
				 */
				VM_STAT_ADD(page_lookup_cnt[9]);
				phm = PAGE_HASH_MUTEX(index);
				mutex_enter(phm);
				hash_locked = 1;
				goto top;
			} else if (se == SE_SHARED && newpp == NULL) {
				VM_STAT_ADD(page_lookup_cnt[10]);
				page_downgrade(pp);
			}
		}

		if (hash_locked) {
			mutex_exit(phm);
		}

		if (newpp != NULL && pp->p_szc < newpp->p_szc &&
		    PAGE_EXCL(pp) && nrelocp != NULL) {
			ASSERT(nrelocp != NULL);
			(void) page_relocate(&pp, &newpp, 1, 1, nrelocp,
			    NULL);
			if (*nrelocp > 0) {
				VM_STAT_COND_ADD(*nrelocp == 1,
				    page_lookup_cnt[11]);
				VM_STAT_COND_ADD(*nrelocp > 1,
				    page_lookup_cnt[12]);
				pp = newpp;
				se = SE_EXCL;
			} else {
				if (se == SE_SHARED) {
					page_downgrade(pp);
				}
				VM_STAT_ADD(page_lookup_cnt[13]);
			}
		} else if (newpp != NULL && nrelocp != NULL) {
			if (PAGE_EXCL(pp) && se == SE_SHARED) {
				page_downgrade(pp);
			}
			VM_STAT_COND_ADD(pp->p_szc < newpp->p_szc,
			    page_lookup_cnt[14]);
			VM_STAT_COND_ADD(pp->p_szc == newpp->p_szc,
			    page_lookup_cnt[15]);
			VM_STAT_COND_ADD(pp->p_szc > newpp->p_szc,
			    page_lookup_cnt[16]);
		} else if (newpp != NULL && PAGE_EXCL(pp)) {
			se = SE_EXCL;
		}
	} else if (!hash_locked) {
		VM_STAT_ADD(page_lookup_cnt[17]);
		phm = PAGE_HASH_MUTEX(index);
		mutex_enter(phm);
		hash_locked = 1;
		goto top;
	} else if (newpp != NULL) {
		/*
		 * If we have a preallocated page then
		 * insert it now and basically behave like
		 * page_create.
		 */
		VM_STAT_ADD(page_lookup_cnt[18]);
		/*
		 * Since we hold the page hash mutex and
		 * just searched for this page, page_hashin
		 * had better not fail.  If it does, that
		 * means some thread did not follow the
		 * page hash mutex rules.  Panic now and
		 * get it over with.  As usual, go down
		 * holding all the locks.
906 */ 907 ASSERT(MUTEX_HELD(phm)); 908 if (!page_hashin(newpp, vp, off, phm)) { 909 ASSERT(MUTEX_HELD(phm)); 910 panic("page_lookup_create: hashin failed %p %p %llx %p", 911 (void *)newpp, (void *)vp, off, (void *)phm); 912 /*NOTREACHED*/ 913 } 914 ASSERT(MUTEX_HELD(phm)); 915 mutex_exit(phm); 916 phm = NULL; 917 page_set_props(newpp, P_REF); 918 page_io_lock(newpp); 919 pp = newpp; 920 se = SE_EXCL; 921 } else { 922 VM_STAT_ADD(page_lookup_cnt[19]); 923 mutex_exit(phm); 924 } 925 926 ASSERT(pp ? PAGE_LOCKED_SE(pp, se) : 1); 927 928 ASSERT(pp ? ((PP_ISFREE(pp) == 0) && (PP_ISAGED(pp) == 0)) : 1); 929 930 return (pp); 931 } 932 933 /* 934 * Search the hash list for the page representing the 935 * specified [vp, offset] and return it locked. Skip 936 * free pages and pages that cannot be locked as requested. 937 * Used while attempting to kluster pages. 938 */ 939 page_t * 940 page_lookup_nowait(vnode_t *vp, u_offset_t off, se_t se) 941 { 942 page_t *pp; 943 kmutex_t *phm; 944 ulong_t index; 945 uint_t locked; 946 947 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); 948 VM_STAT_ADD(page_lookup_nowait_cnt[0]); 949 950 index = PAGE_HASH_FUNC(vp, off); 951 PAGE_HASH_SEARCH(index, pp, vp, off); 952 locked = 0; 953 if (pp == NULL) { 954 top: 955 VM_STAT_ADD(page_lookup_nowait_cnt[1]); 956 locked = 1; 957 phm = PAGE_HASH_MUTEX(index); 958 mutex_enter(phm); 959 PAGE_HASH_SEARCH(index, pp, vp, off); 960 } 961 962 if (pp == NULL || PP_ISFREE(pp)) { 963 VM_STAT_ADD(page_lookup_nowait_cnt[2]); 964 pp = NULL; 965 } else { 966 if (!page_trylock(pp, se)) { 967 VM_STAT_ADD(page_lookup_nowait_cnt[3]); 968 pp = NULL; 969 } else { 970 VM_STAT_ADD(page_lookup_nowait_cnt[4]); 971 /* 972 * See the comment in page_lookup() 973 */ 974 if (((volatile struct vnode *)(pp->p_vnode) != vp) || 975 ((u_offset_t)(pp->p_offset) != off)) { 976 VM_STAT_ADD(page_lookup_nowait_cnt[5]); 977 if (locked) { 978 panic("page_lookup_nowait %p", 979 (void *)pp); 980 /*NOTREACHED*/ 981 } 982 page_unlock(pp); 983 goto top; 984 } 985 if (PP_ISFREE(pp)) { 986 VM_STAT_ADD(page_lookup_nowait_cnt[6]); 987 page_unlock(pp); 988 pp = NULL; 989 } 990 } 991 } 992 if (locked) { 993 VM_STAT_ADD(page_lookup_nowait_cnt[7]); 994 mutex_exit(phm); 995 } 996 997 ASSERT(pp ? PAGE_LOCKED_SE(pp, se) : 1); 998 999 return (pp); 1000 } 1001 1002 /* 1003 * Search the hash list for a page with the specified [vp, off] 1004 * that is known to exist and is already locked. This routine 1005 * is typically used by segment SOFTUNLOCK routines. 1006 */ 1007 page_t * 1008 page_find(vnode_t *vp, u_offset_t off) 1009 { 1010 page_t *pp; 1011 kmutex_t *phm; 1012 ulong_t index; 1013 1014 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); 1015 VM_STAT_ADD(page_find_cnt); 1016 1017 index = PAGE_HASH_FUNC(vp, off); 1018 phm = PAGE_HASH_MUTEX(index); 1019 1020 mutex_enter(phm); 1021 PAGE_HASH_SEARCH(index, pp, vp, off); 1022 mutex_exit(phm); 1023 1024 ASSERT(pp == NULL || PAGE_LOCKED(pp) || panicstr); 1025 return (pp); 1026 } 1027 1028 /* 1029 * Determine whether a page with the specified [vp, off] 1030 * currently exists in the system. Obviously this should 1031 * only be considered as a hint since nothing prevents the 1032 * page from disappearing or appearing immediately after 1033 * the return from this routine. Subsequently, we don't 1034 * even bother to lock the list. 
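 *
 * A caller that needs a stable answer must take a page lock instead,
 * e.g. (a sketch):
 *
 *	if ((pp = page_lookup(vp, off, SE_SHARED)) != NULL) {
 *		... pp is locked and not on the free list ...
 *		page_unlock(pp);
 *	}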
1035 */ 1036 page_t * 1037 page_exists(vnode_t *vp, u_offset_t off) 1038 { 1039 page_t *pp; 1040 ulong_t index; 1041 1042 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); 1043 VM_STAT_ADD(page_exists_cnt); 1044 1045 index = PAGE_HASH_FUNC(vp, off); 1046 PAGE_HASH_SEARCH(index, pp, vp, off); 1047 1048 return (pp); 1049 } 1050 1051 /* 1052 * Determine if physically contiguous pages exist for [vp, off] - [vp, off + 1053 * page_size(szc)) range. if they exist and ppa is not NULL fill ppa array 1054 * with these pages locked SHARED. If necessary reclaim pages from 1055 * freelist. Return 1 if contiguous pages exist and 0 otherwise. 1056 * 1057 * If we fail to lock pages still return 1 if pages exist and contiguous. 1058 * But in this case return value is just a hint. ppa array won't be filled. 1059 * Caller should initialize ppa[0] as NULL to distinguish return value. 1060 * 1061 * Returns 0 if pages don't exist or not physically contiguous. 1062 * 1063 * This routine doesn't work for anonymous(swapfs) pages. 1064 */ 1065 int 1066 page_exists_physcontig(vnode_t *vp, u_offset_t off, uint_t szc, page_t *ppa[]) 1067 { 1068 pgcnt_t pages; 1069 pfn_t pfn; 1070 page_t *rootpp; 1071 pgcnt_t i; 1072 pgcnt_t j; 1073 u_offset_t save_off = off; 1074 ulong_t index; 1075 kmutex_t *phm; 1076 page_t *pp; 1077 uint_t pszc; 1078 int loopcnt = 0; 1079 1080 ASSERT(szc != 0); 1081 ASSERT(vp != NULL); 1082 ASSERT(!IS_SWAPFSVP(vp)); 1083 ASSERT(!VN_ISKAS(vp)); 1084 1085 again: 1086 if (++loopcnt > 3) { 1087 VM_STAT_ADD(page_exphcontg[0]); 1088 return (0); 1089 } 1090 1091 index = PAGE_HASH_FUNC(vp, off); 1092 phm = PAGE_HASH_MUTEX(index); 1093 1094 mutex_enter(phm); 1095 PAGE_HASH_SEARCH(index, pp, vp, off); 1096 mutex_exit(phm); 1097 1098 VM_STAT_ADD(page_exphcontg[1]); 1099 1100 if (pp == NULL) { 1101 VM_STAT_ADD(page_exphcontg[2]); 1102 return (0); 1103 } 1104 1105 pages = page_get_pagecnt(szc); 1106 rootpp = pp; 1107 pfn = rootpp->p_pagenum; 1108 1109 if ((pszc = pp->p_szc) >= szc && ppa != NULL) { 1110 VM_STAT_ADD(page_exphcontg[3]); 1111 if (!page_trylock(pp, SE_SHARED)) { 1112 VM_STAT_ADD(page_exphcontg[4]); 1113 return (1); 1114 } 1115 /* 1116 * Also check whether p_pagenum was modified by DR. 1117 */ 1118 if (pp->p_szc != pszc || pp->p_vnode != vp || 1119 pp->p_offset != off || pp->p_pagenum != pfn) { 1120 VM_STAT_ADD(page_exphcontg[5]); 1121 page_unlock(pp); 1122 off = save_off; 1123 goto again; 1124 } 1125 /* 1126 * szc was non zero and vnode and offset matched after we 1127 * locked the page it means it can't become free on us. 1128 */ 1129 ASSERT(!PP_ISFREE(pp)); 1130 if (!IS_P2ALIGNED(pfn, pages)) { 1131 page_unlock(pp); 1132 return (0); 1133 } 1134 ppa[0] = pp; 1135 pp++; 1136 off += PAGESIZE; 1137 pfn++; 1138 for (i = 1; i < pages; i++, pp++, off += PAGESIZE, pfn++) { 1139 if (!page_trylock(pp, SE_SHARED)) { 1140 VM_STAT_ADD(page_exphcontg[6]); 1141 pp--; 1142 while (i-- > 0) { 1143 page_unlock(pp); 1144 pp--; 1145 } 1146 ppa[0] = NULL; 1147 return (1); 1148 } 1149 if (pp->p_szc != pszc) { 1150 VM_STAT_ADD(page_exphcontg[7]); 1151 page_unlock(pp); 1152 pp--; 1153 while (i-- > 0) { 1154 page_unlock(pp); 1155 pp--; 1156 } 1157 ppa[0] = NULL; 1158 off = save_off; 1159 goto again; 1160 } 1161 /* 1162 * szc the same as for previous already locked pages 1163 * with right identity. Since this page had correct 1164 * szc after we locked it can't get freed or destroyed 1165 * and therefore must have the expected identity. 
1166 */ 1167 ASSERT(!PP_ISFREE(pp)); 1168 if (pp->p_vnode != vp || 1169 pp->p_offset != off) { 1170 panic("page_exists_physcontig: " 1171 "large page identity doesn't match"); 1172 } 1173 ppa[i] = pp; 1174 ASSERT(pp->p_pagenum == pfn); 1175 } 1176 VM_STAT_ADD(page_exphcontg[8]); 1177 ppa[pages] = NULL; 1178 return (1); 1179 } else if (pszc >= szc) { 1180 VM_STAT_ADD(page_exphcontg[9]); 1181 if (!IS_P2ALIGNED(pfn, pages)) { 1182 return (0); 1183 } 1184 return (1); 1185 } 1186 1187 if (!IS_P2ALIGNED(pfn, pages)) { 1188 VM_STAT_ADD(page_exphcontg[10]); 1189 return (0); 1190 } 1191 1192 if (page_numtomemseg_nolock(pfn) != 1193 page_numtomemseg_nolock(pfn + pages - 1)) { 1194 VM_STAT_ADD(page_exphcontg[11]); 1195 return (0); 1196 } 1197 1198 /* 1199 * We loop up 4 times across pages to promote page size. 1200 * We're extra cautious to promote page size atomically with respect 1201 * to everybody else. But we can probably optimize into 1 loop if 1202 * this becomes an issue. 1203 */ 1204 1205 for (i = 0; i < pages; i++, pp++, off += PAGESIZE, pfn++) { 1206 if (!page_trylock(pp, SE_EXCL)) { 1207 VM_STAT_ADD(page_exphcontg[12]); 1208 break; 1209 } 1210 /* 1211 * Check whether p_pagenum was modified by DR. 1212 */ 1213 if (pp->p_pagenum != pfn) { 1214 page_unlock(pp); 1215 break; 1216 } 1217 if (pp->p_vnode != vp || 1218 pp->p_offset != off) { 1219 VM_STAT_ADD(page_exphcontg[13]); 1220 page_unlock(pp); 1221 break; 1222 } 1223 if (pp->p_szc >= szc) { 1224 ASSERT(i == 0); 1225 page_unlock(pp); 1226 off = save_off; 1227 goto again; 1228 } 1229 } 1230 1231 if (i != pages) { 1232 VM_STAT_ADD(page_exphcontg[14]); 1233 --pp; 1234 while (i-- > 0) { 1235 page_unlock(pp); 1236 --pp; 1237 } 1238 return (0); 1239 } 1240 1241 pp = rootpp; 1242 for (i = 0; i < pages; i++, pp++) { 1243 if (PP_ISFREE(pp)) { 1244 VM_STAT_ADD(page_exphcontg[15]); 1245 ASSERT(!PP_ISAGED(pp)); 1246 ASSERT(pp->p_szc == 0); 1247 if (!page_reclaim(pp, NULL)) { 1248 break; 1249 } 1250 } else { 1251 ASSERT(pp->p_szc < szc); 1252 VM_STAT_ADD(page_exphcontg[16]); 1253 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 1254 } 1255 } 1256 if (i < pages) { 1257 VM_STAT_ADD(page_exphcontg[17]); 1258 /* 1259 * page_reclaim failed because we were out of memory. 1260 * drop the rest of the locks and return because this page 1261 * must be already reallocated anyway. 1262 */ 1263 pp = rootpp; 1264 for (j = 0; j < pages; j++, pp++) { 1265 if (j != i) { 1266 page_unlock(pp); 1267 } 1268 } 1269 return (0); 1270 } 1271 1272 off = save_off; 1273 pp = rootpp; 1274 for (i = 0; i < pages; i++, pp++, off += PAGESIZE) { 1275 ASSERT(PAGE_EXCL(pp)); 1276 ASSERT(!PP_ISFREE(pp)); 1277 ASSERT(!hat_page_is_mapped(pp)); 1278 ASSERT(pp->p_vnode == vp); 1279 ASSERT(pp->p_offset == off); 1280 pp->p_szc = szc; 1281 } 1282 pp = rootpp; 1283 for (i = 0; i < pages; i++, pp++) { 1284 if (ppa == NULL) { 1285 page_unlock(pp); 1286 } else { 1287 ppa[i] = pp; 1288 page_downgrade(ppa[i]); 1289 } 1290 } 1291 if (ppa != NULL) { 1292 ppa[pages] = NULL; 1293 } 1294 VM_STAT_ADD(page_exphcontg[18]); 1295 ASSERT(vp->v_pages != NULL); 1296 return (1); 1297 } 1298 1299 /* 1300 * Determine whether a page with the specified [vp, off] 1301 * currently exists in the system and if so return its 1302 * size code. Obviously this should only be considered as 1303 * a hint since nothing prevents the page from disappearing 1304 * or appearing immediately after the return from this routine. 
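 *
 * Illustrative use (a sketch; the result is only a hint):
 *
 *	uint_t szc;
 *
 *	if (page_exists_forreal(vp, off, &szc))
 *		... a page existed a moment ago, with size code szc ...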
 */
int
page_exists_forreal(vnode_t *vp, u_offset_t off, uint_t *szc)
{
	page_t		*pp;
	kmutex_t	*phm;
	ulong_t		index;
	int		rc = 0;

	ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
	ASSERT(szc != NULL);
	VM_STAT_ADD(page_exists_forreal_cnt);

	index = PAGE_HASH_FUNC(vp, off);
	phm = PAGE_HASH_MUTEX(index);

	mutex_enter(phm);
	PAGE_HASH_SEARCH(index, pp, vp, off);
	if (pp != NULL) {
		*szc = pp->p_szc;
		rc = 1;
	}
	mutex_exit(phm);
	return (rc);
}

/* wakeup threads waiting for pages in page_create_get_something() */
void
wakeup_pcgs(void)
{
	if (!CV_HAS_WAITERS(&pcgs_cv))
		return;
	cv_broadcast(&pcgs_cv);
}

/*
 * 'freemem' is used all over the kernel as an indication of how many
 * pages are free (either on the cache list or on the free page list)
 * in the system.  In very few places is a really accurate 'freemem'
 * needed.  To avoid contention of the lock protecting the single
 * freemem, it was spread out into NCPU buckets.  set_freemem() sets
 * freemem to the total of all NCPU buckets.  It is called from
 * clock() on each TICK.
 */
void
set_freemem()
{
	struct pcf	*p;
	ulong_t		t;
	uint_t		i;

	t = 0;
	p = pcf;
	for (i = 0; i < pcf_fanout; i++) {
		t += p->pcf_count;
		p++;
	}
	freemem = t;

	/*
	 * Don't worry about grabbing mutex.  It's not that
	 * critical if we miss a tick or two.  This is
	 * where we wakeup possible delayers in
	 * page_create_get_something().
	 */
	wakeup_pcgs();
}

ulong_t
get_freemem()
{
	struct pcf	*p;
	ulong_t		t;
	uint_t		i;

	t = 0;
	p = pcf;
	for (i = 0; i < pcf_fanout; i++) {
		t += p->pcf_count;
		p++;
	}
	/*
	 * We just calculated it, might as well set it.
	 */
	freemem = t;
	return (t);
}

/*
 * Acquire all of the page cache & free (pcf) locks.
 */
void
pcf_acquire_all()
{
	struct pcf	*p;
	uint_t		i;

	p = pcf;
	for (i = 0; i < pcf_fanout; i++) {
		mutex_enter(&p->pcf_lock);
		p++;
	}
}

/*
 * Release all the pcf_locks.
 */
void
pcf_release_all()
{
	struct pcf	*p;
	uint_t		i;

	p = pcf;
	for (i = 0; i < pcf_fanout; i++) {
		mutex_exit(&p->pcf_lock);
		p++;
	}
}

/*
 * Inform the VM system that we need some pages freed up.
 * Calls must be symmetric, e.g.:
 *
 *	page_needfree(100);
 *	wait a bit;
 *	page_needfree(-100);
 */
void
page_needfree(spgcnt_t npages)
{
	mutex_enter(&new_freemem_lock);
	needfree += npages;
	mutex_exit(&new_freemem_lock);
}

/*
 * Throttle for page_create(): try to prevent freemem from dropping
 * below throttlefree.  We can't provide a 100% guarantee because
 * KM_NOSLEEP allocations, page_reclaim(), and various other things
 * nibble away at the freelist.  However, we can block all PG_WAIT
 * allocations until memory becomes available.  The motivation is
 * that several things can fall apart when there's no free memory:
 *
 * (1) If pageout() needs memory to push a page, the system deadlocks.
1450 * 1451 * (2) By (broken) specification, timeout(9F) can neither fail nor 1452 * block, so it has no choice but to panic the system if it 1453 * cannot allocate a callout structure. 1454 * 1455 * (3) Like timeout(), ddi_set_callback() cannot fail and cannot block; 1456 * it panics if it cannot allocate a callback structure. 1457 * 1458 * (4) Untold numbers of third-party drivers have not yet been hardened 1459 * against KM_NOSLEEP and/or allocb() failures; they simply assume 1460 * success and panic the system with a data fault on failure. 1461 * (The long-term solution to this particular problem is to ship 1462 * hostile fault-injecting DEBUG kernels with the DDK.) 1463 * 1464 * It is theoretically impossible to guarantee success of non-blocking 1465 * allocations, but in practice, this throttle is very hard to break. 1466 */ 1467 static int 1468 page_create_throttle(pgcnt_t npages, int flags) 1469 { 1470 ulong_t fm; 1471 uint_t i; 1472 pgcnt_t tf; /* effective value of throttlefree */ 1473 1474 /* 1475 * Normal priority allocations. 1476 */ 1477 if ((flags & (PG_WAIT | PG_NORMALPRI)) == PG_NORMALPRI) { 1478 ASSERT(!(flags & (PG_PANIC | PG_PUSHPAGE))); 1479 return (freemem >= npages + throttlefree); 1480 } 1481 1482 /* 1483 * Never deny pages when: 1484 * - it's a thread that cannot block [NOMEMWAIT()] 1485 * - the allocation cannot block and must not fail 1486 * - the allocation cannot block and is pageout dispensated 1487 */ 1488 if (NOMEMWAIT() || 1489 ((flags & (PG_WAIT | PG_PANIC)) == PG_PANIC) || 1490 ((flags & (PG_WAIT | PG_PUSHPAGE)) == PG_PUSHPAGE)) 1491 return (1); 1492 1493 /* 1494 * If the allocation can't block, we look favorably upon it 1495 * unless we're below pageout_reserve. In that case we fail 1496 * the allocation because we want to make sure there are a few 1497 * pages available for pageout. 1498 */ 1499 if ((flags & PG_WAIT) == 0) 1500 return (freemem >= npages + pageout_reserve); 1501 1502 /* Calculate the effective throttlefree value */ 1503 tf = throttlefree - 1504 ((flags & PG_PUSHPAGE) ? pageout_reserve : 0); 1505 1506 cv_signal(&proc_pageout->p_cv); 1507 1508 for (;;) { 1509 fm = 0; 1510 pcf_acquire_all(); 1511 mutex_enter(&new_freemem_lock); 1512 for (i = 0; i < pcf_fanout; i++) { 1513 fm += pcf[i].pcf_count; 1514 pcf[i].pcf_wait++; 1515 mutex_exit(&pcf[i].pcf_lock); 1516 } 1517 freemem = fm; 1518 if (freemem >= npages + tf) { 1519 mutex_exit(&new_freemem_lock); 1520 break; 1521 } 1522 needfree += npages; 1523 freemem_wait++; 1524 cv_wait(&freemem_cv, &new_freemem_lock); 1525 freemem_wait--; 1526 needfree -= npages; 1527 mutex_exit(&new_freemem_lock); 1528 } 1529 return (1); 1530 } 1531 1532 /* 1533 * page_create_wait() is called to either coalesce pages from the 1534 * different pcf buckets or to wait because there simply are not 1535 * enough pages to satisfy the caller's request. 1536 * 1537 * Sadly, this is called from platform/vm/vm_machdep.c 1538 */ 1539 int 1540 page_create_wait(pgcnt_t npages, uint_t flags) 1541 { 1542 pgcnt_t total; 1543 uint_t i; 1544 struct pcf *p; 1545 1546 /* 1547 * Wait until there are enough free pages to satisfy our 1548 * entire request. 1549 * We set needfree += npages before prodding pageout, to make sure 1550 * it does real work when npages > lotsfree > freemem. 1551 */ 1552 VM_STAT_ADD(page_create_not_enough); 1553 1554 ASSERT(!kcage_on ? 
!(flags & PG_NORELOC) : 1); 1555 checkagain: 1556 if ((flags & PG_NORELOC) && 1557 kcage_freemem < kcage_throttlefree + npages) 1558 (void) kcage_create_throttle(npages, flags); 1559 1560 if (freemem < npages + throttlefree) 1561 if (!page_create_throttle(npages, flags)) 1562 return (0); 1563 1564 if (pcf_decrement_bucket(npages) || 1565 pcf_decrement_multiple(&total, npages, 0)) 1566 return (1); 1567 1568 /* 1569 * All of the pcf locks are held, there are not enough pages 1570 * to satisfy the request (npages < total). 1571 * Be sure to acquire the new_freemem_lock before dropping 1572 * the pcf locks. This prevents dropping wakeups in page_free(). 1573 * The order is always pcf_lock then new_freemem_lock. 1574 * 1575 * Since we hold all the pcf locks, it is a good time to set freemem. 1576 * 1577 * If the caller does not want to wait, return now. 1578 * Else turn the pageout daemon loose to find something 1579 * and wait till it does. 1580 * 1581 */ 1582 freemem = total; 1583 1584 if ((flags & PG_WAIT) == 0) { 1585 pcf_release_all(); 1586 1587 TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_NOMEM, 1588 "page_create_nomem:npages %ld freemem %ld", npages, freemem); 1589 return (0); 1590 } 1591 1592 ASSERT(proc_pageout != NULL); 1593 cv_signal(&proc_pageout->p_cv); 1594 1595 TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SLEEP_START, 1596 "page_create_sleep_start: freemem %ld needfree %ld", 1597 freemem, needfree); 1598 1599 /* 1600 * We are going to wait. 1601 * We currently hold all of the pcf_locks, 1602 * get the new_freemem_lock (it protects freemem_wait), 1603 * before dropping the pcf_locks. 1604 */ 1605 mutex_enter(&new_freemem_lock); 1606 1607 p = pcf; 1608 for (i = 0; i < pcf_fanout; i++) { 1609 p->pcf_wait++; 1610 mutex_exit(&p->pcf_lock); 1611 p++; 1612 } 1613 1614 needfree += npages; 1615 freemem_wait++; 1616 1617 cv_wait(&freemem_cv, &new_freemem_lock); 1618 1619 freemem_wait--; 1620 needfree -= npages; 1621 1622 mutex_exit(&new_freemem_lock); 1623 1624 TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SLEEP_END, 1625 "page_create_sleep_end: freemem %ld needfree %ld", 1626 freemem, needfree); 1627 1628 VM_STAT_ADD(page_create_not_enough_again); 1629 goto checkagain; 1630 } 1631 /* 1632 * A routine to do the opposite of page_create_wait(). 1633 */ 1634 void 1635 page_create_putback(spgcnt_t npages) 1636 { 1637 struct pcf *p; 1638 pgcnt_t lump; 1639 uint_t *which; 1640 1641 /* 1642 * When a contiguous lump is broken up, we have to 1643 * deal with lots of pages (min 64) so lets spread 1644 * the wealth around. 1645 */ 1646 lump = roundup(npages, pcf_fanout) / pcf_fanout; 1647 freemem += npages; 1648 1649 for (p = pcf; (npages > 0) && (p < &pcf[pcf_fanout]); p++) { 1650 which = &p->pcf_count; 1651 1652 mutex_enter(&p->pcf_lock); 1653 1654 if (p->pcf_block) { 1655 which = &p->pcf_reserve; 1656 } 1657 1658 if (lump < npages) { 1659 *which += (uint_t)lump; 1660 npages -= lump; 1661 } else { 1662 *which += (uint_t)npages; 1663 npages = 0; 1664 } 1665 1666 if (p->pcf_wait) { 1667 mutex_enter(&new_freemem_lock); 1668 /* 1669 * Check to see if some other thread 1670 * is actually waiting. Another bucket 1671 * may have woken it up by now. If there 1672 * are no waiters, then set our pcf_wait 1673 * count to zero to avoid coming in here 1674 * next time. 
			 */
			if (freemem_wait) {
				if (npages > 1) {
					cv_broadcast(&freemem_cv);
				} else {
					cv_signal(&freemem_cv);
				}
				p->pcf_wait--;
			} else {
				p->pcf_wait = 0;
			}
			mutex_exit(&new_freemem_lock);
		}
		mutex_exit(&p->pcf_lock);
	}
	ASSERT(npages == 0);
}

/*
 * A helper routine for page_create_get_something.
 * The indenting got too deep down there.
 * Unblock the pcf counters.  Any pages freed after
 * pcf_block got set are moved to pcf_count and
 * wakeups (cv_broadcast() or cv_signal()) are done as needed.
 */
static void
pcgs_unblock(void)
{
	int		i;
	struct pcf	*p;

	/* Update freemem while we're here. */
	freemem = 0;
	p = pcf;
	for (i = 0; i < pcf_fanout; i++) {
		mutex_enter(&p->pcf_lock);
		ASSERT(p->pcf_count == 0);
		p->pcf_count = p->pcf_reserve;
		p->pcf_block = 0;
		freemem += p->pcf_count;
		if (p->pcf_wait) {
			mutex_enter(&new_freemem_lock);
			if (freemem_wait) {
				if (p->pcf_reserve > 1) {
					cv_broadcast(&freemem_cv);
					p->pcf_wait = 0;
				} else {
					cv_signal(&freemem_cv);
					p->pcf_wait--;
				}
			} else {
				p->pcf_wait = 0;
			}
			mutex_exit(&new_freemem_lock);
		}
		p->pcf_reserve = 0;
		mutex_exit(&p->pcf_lock);
		p++;
	}
}

/*
 * Called from page_create_va() when both the cache and free lists
 * have been checked once.
 *
 * Either returns a page or panics since the accounting was done
 * way before we got here.
 *
 * We don't come here often, so leave the accounting on permanently.
 */

#define	MAX_PCGS	100

#ifdef	DEBUG
#define	PCGS_TRIES	100
#else	/* DEBUG */
#define	PCGS_TRIES	10
#endif	/* DEBUG */

#ifdef	VM_STATS
uint_t	pcgs_counts[PCGS_TRIES];
uint_t	pcgs_too_many;
uint_t	pcgs_entered;
uint_t	pcgs_entered_noreloc;
uint_t	pcgs_locked;
uint_t	pcgs_cagelocked;
#endif	/* VM_STATS */

static page_t *
page_create_get_something(vnode_t *vp, u_offset_t off, struct seg *seg,
    caddr_t vaddr, uint_t flags)
{
	uint_t		count;
	page_t		*pp;
	uint_t		locked, i;
	struct pcf	*p;
	lgrp_t		*lgrp;
	int		cagelocked = 0;

	VM_STAT_ADD(pcgs_entered);

	/*
	 * Tap any reserve freelists: if we fail now, we'll die
	 * since the page(s) we're looking for have already been
	 * accounted for.
	 */
	flags |= PG_PANIC;

	if ((flags & PG_NORELOC) != 0) {
		VM_STAT_ADD(pcgs_entered_noreloc);
		/*
		 * Requests for free pages from critical threads
		 * such as pageout still won't throttle here, but
		 * we must try again, to give the cageout thread
		 * another chance to catch up.  Since we already
		 * accounted for the pages, we had better get them
		 * this time.
		 *
		 * N.B. All non-critical threads acquire the pcgs_cagelock
		 * to serialize access to the freelists.  This implements a
		 * turnstile-type synchronization to avoid starvation of
		 * critical requests for PG_NORELOC memory by non-critical
		 * threads: all non-critical threads must acquire a 'ticket'
		 * before passing through, which entails making sure
		 * kcage_freemem won't fall below minfree prior to grabbing
		 * pages from the freelists.
1801 */ 1802 if (kcage_create_throttle(1, flags) == KCT_NONCRIT) { 1803 mutex_enter(&pcgs_cagelock); 1804 cagelocked = 1; 1805 VM_STAT_ADD(pcgs_cagelocked); 1806 } 1807 } 1808 1809 /* 1810 * Time to get serious. 1811 * We failed to get a `correctly colored' page from both the 1812 * free and cache lists. 1813 * We escalate in stage. 1814 * 1815 * First try both lists without worring about color. 1816 * 1817 * Then, grab all page accounting locks (ie. pcf[]) and 1818 * steal any pages that they have and set the pcf_block flag to 1819 * stop deletions from the lists. This will help because 1820 * a page can get added to the free list while we are looking 1821 * at the cache list, then another page could be added to the cache 1822 * list allowing the page on the free list to be removed as we 1823 * move from looking at the cache list to the free list. This 1824 * could happen over and over. We would never find the page 1825 * we have accounted for. 1826 * 1827 * Noreloc pages are a subset of the global (relocatable) page pool. 1828 * They are not tracked separately in the pcf bins, so it is 1829 * impossible to know when doing pcf accounting if the available 1830 * page(s) are noreloc pages or not. When looking for a noreloc page 1831 * it is quite easy to end up here even if the global (relocatable) 1832 * page pool has plenty of free pages but the noreloc pool is empty. 1833 * 1834 * When the noreloc pool is empty (or low), additional noreloc pages 1835 * are created by converting pages from the global page pool. This 1836 * process will stall during pcf accounting if the pcf bins are 1837 * already locked. Such is the case when a noreloc allocation is 1838 * looping here in page_create_get_something waiting for more noreloc 1839 * pages to appear. 1840 * 1841 * Short of adding a new field to the pcf bins to accurately track 1842 * the number of free noreloc pages, we instead do not grab the 1843 * pcgs_lock, do not set the pcf blocks and do not timeout when 1844 * allocating a noreloc page. This allows noreloc allocations to 1845 * loop without blocking global page pool allocations. 1846 * 1847 * NOTE: the behaviour of page_create_get_something has not changed 1848 * for the case of global page pool allocations. 1849 */ 1850 1851 flags &= ~PG_MATCH_COLOR; 1852 locked = 0; 1853 #if defined(__i386) || defined(__amd64) 1854 flags = page_create_update_flags_x86(flags); 1855 #endif 1856 1857 lgrp = lgrp_mem_choose(seg, vaddr, PAGESIZE); 1858 1859 for (count = 0; kcage_on || count < MAX_PCGS; count++) { 1860 pp = page_get_freelist(vp, off, seg, vaddr, PAGESIZE, 1861 flags, lgrp); 1862 if (pp == NULL) { 1863 pp = page_get_cachelist(vp, off, seg, vaddr, 1864 flags, lgrp); 1865 } 1866 if (pp == NULL) { 1867 /* 1868 * Serialize. Don't fight with other pcgs(). 1869 */ 1870 if (!locked && (!kcage_on || !(flags & PG_NORELOC))) { 1871 mutex_enter(&pcgs_lock); 1872 VM_STAT_ADD(pcgs_locked); 1873 locked = 1; 1874 p = pcf; 1875 for (i = 0; i < pcf_fanout; i++) { 1876 mutex_enter(&p->pcf_lock); 1877 ASSERT(p->pcf_block == 0); 1878 p->pcf_block = 1; 1879 p->pcf_reserve = p->pcf_count; 1880 p->pcf_count = 0; 1881 mutex_exit(&p->pcf_lock); 1882 p++; 1883 } 1884 freemem = 0; 1885 } 1886 1887 if (count) { 1888 /* 1889 * Since page_free() puts pages on 1890 * a list then accounts for it, we 1891 * just have to wait for page_free() 1892 * to unlock any page it was working 1893 * with. The page_lock()-page_reclaim() 1894 * path falls in the same boat. 
1895 * 1896 * We don't need to check on the 1897 * PG_WAIT flag, we have already 1898 * accounted for the page we are 1899 * looking for in page_create_va(). 1900 * 1901 * We just wait a moment to let any 1902 * locked pages on the lists free up, 1903 * then continue around and try again. 1904 * 1905 * Will be awakened by set_freemem(). 1906 */ 1907 mutex_enter(&pcgs_wait_lock); 1908 cv_wait(&pcgs_cv, &pcgs_wait_lock); 1909 mutex_exit(&pcgs_wait_lock); 1910 } 1911 } else { 1912 #ifdef VM_STATS 1913 if (count >= PCGS_TRIES) { 1914 VM_STAT_ADD(pcgs_too_many); 1915 } else { 1916 VM_STAT_ADD(pcgs_counts[count]); 1917 } 1918 #endif 1919 if (locked) { 1920 pcgs_unblock(); 1921 mutex_exit(&pcgs_lock); 1922 } 1923 if (cagelocked) 1924 mutex_exit(&pcgs_cagelock); 1925 return (pp); 1926 } 1927 } 1928 /* 1929 * we go down holding the pcf locks. 1930 */ 1931 panic("no %spage found %d", 1932 ((flags & PG_NORELOC) ? "non-reloc " : ""), count); 1933 /*NOTREACHED*/ 1934 } 1935 1936 /* 1937 * Create enough pages for "bytes" worth of data starting at 1938 * "off" in "vp". 1939 * 1940 * Where flag must be one of: 1941 * 1942 * PG_EXCL: Exclusive create (fail if any page already 1943 * exists in the page cache) which does not 1944 * wait for memory to become available. 1945 * 1946 * PG_WAIT: Non-exclusive create which can wait for 1947 * memory to become available. 1948 * 1949 * PG_PHYSCONTIG: Allocate physically contiguous pages. 1950 * (Not Supported) 1951 * 1952 * A doubly linked list of pages is returned to the caller. Each page 1953 * on the list has the "exclusive" (p_selock) lock and "iolock" (p_iolock) 1954 * lock. 1955 * 1956 * Unable to change the parameters to page_create() in a minor release, 1957 * we renamed page_create() to page_create_va(), changed all known calls 1958 * from page_create() to page_create_va(), and created this wrapper. 1959 * 1960 * Upon a major release, we should break compatibility by deleting this 1961 * wrapper, and replacing all the strings "page_create_va", with "page_create". 1962 * 1963 * NOTE: There is a copy of this interface as page_create_io() in 1964 * i86/vm/vm_machdep.c. Any bugs fixed here should be applied 1965 * there. 1966 */ 1967 page_t * 1968 page_create(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags) 1969 { 1970 caddr_t random_vaddr; 1971 struct seg kseg; 1972 1973 #ifdef DEBUG 1974 cmn_err(CE_WARN, "Using deprecated interface page_create: caller %p", 1975 (void *)caller()); 1976 #endif 1977 1978 random_vaddr = (caddr_t)(((uintptr_t)vp >> 7) ^ 1979 (uintptr_t)(off >> PAGESHIFT)); 1980 kseg.s_as = &kas; 1981 1982 return (page_create_va(vp, off, bytes, flags, &kseg, random_vaddr)); 1983 } 1984 1985 #ifdef DEBUG 1986 uint32_t pg_alloc_pgs_mtbf = 0; 1987 #endif 1988 1989 /* 1990 * Used for large page support. It will attempt to allocate 1991 * a large page(s) off the freelist. 1992 * 1993 * Returns non zero on failure. 1994 */ 1995 int 1996 page_alloc_pages(struct vnode *vp, struct seg *seg, caddr_t addr, 1997 page_t **basepp, page_t *ppa[], uint_t szc, int anypgsz, int pgflags) 1998 { 1999 pgcnt_t npgs, curnpgs, totpgs; 2000 size_t pgsz; 2001 page_t *pplist = NULL, *pp; 2002 int err = 0; 2003 lgrp_t *lgrp; 2004 2005 ASSERT(szc != 0 && szc <= (page_num_pagesizes() - 1)); 2006 ASSERT(pgflags == 0 || pgflags == PG_LOCAL); 2007 2008 /* 2009 * Check if system heavily prefers local large pages over remote 2010 * on systems with multiple lgroups. 
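	 *
	 * The preference comes from the /etc/system tunable described
	 * above, e.g. (LPAP_LOCAL has the numeric value 1):
	 *
	 *	set lpg_alloc_prefer = 1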
2011 */ 2012 if (lpg_alloc_prefer == LPAP_LOCAL && nlgrps > 1) { 2013 pgflags = PG_LOCAL; 2014 } 2015 2016 VM_STAT_ADD(alloc_pages[0]); 2017 2018 #ifdef DEBUG 2019 if (pg_alloc_pgs_mtbf && !(gethrtime() % pg_alloc_pgs_mtbf)) { 2020 return (ENOMEM); 2021 } 2022 #endif 2023 2024 /* 2025 * One must be NULL but not both. 2026 * And one must be non NULL but not both. 2027 */ 2028 ASSERT(basepp != NULL || ppa != NULL); 2029 ASSERT(basepp == NULL || ppa == NULL); 2030 2031 #if defined(__i386) || defined(__amd64) 2032 while (page_chk_freelist(szc) == 0) { 2033 VM_STAT_ADD(alloc_pages[8]); 2034 if (anypgsz == 0 || --szc == 0) 2035 return (ENOMEM); 2036 } 2037 #endif 2038 2039 pgsz = page_get_pagesize(szc); 2040 totpgs = curnpgs = npgs = pgsz >> PAGESHIFT; 2041 2042 ASSERT(((uintptr_t)addr & (pgsz - 1)) == 0); 2043 2044 (void) page_create_wait(npgs, PG_WAIT); 2045 2046 while (npgs && szc) { 2047 lgrp = lgrp_mem_choose(seg, addr, pgsz); 2048 if (pgflags == PG_LOCAL) { 2049 pp = page_get_freelist(vp, 0, seg, addr, pgsz, 2050 pgflags, lgrp); 2051 if (pp == NULL) { 2052 pp = page_get_freelist(vp, 0, seg, addr, pgsz, 2053 0, lgrp); 2054 } 2055 } else { 2056 pp = page_get_freelist(vp, 0, seg, addr, pgsz, 2057 0, lgrp); 2058 } 2059 if (pp != NULL) { 2060 VM_STAT_ADD(alloc_pages[1]); 2061 page_list_concat(&pplist, &pp); 2062 ASSERT(npgs >= curnpgs); 2063 npgs -= curnpgs; 2064 } else if (anypgsz) { 2065 VM_STAT_ADD(alloc_pages[2]); 2066 szc--; 2067 pgsz = page_get_pagesize(szc); 2068 curnpgs = pgsz >> PAGESHIFT; 2069 } else { 2070 VM_STAT_ADD(alloc_pages[3]); 2071 ASSERT(npgs == totpgs); 2072 page_create_putback(npgs); 2073 return (ENOMEM); 2074 } 2075 } 2076 if (szc == 0) { 2077 VM_STAT_ADD(alloc_pages[4]); 2078 ASSERT(npgs != 0); 2079 page_create_putback(npgs); 2080 err = ENOMEM; 2081 } else if (basepp != NULL) { 2082 ASSERT(npgs == 0); 2083 ASSERT(ppa == NULL); 2084 *basepp = pplist; 2085 } 2086 2087 npgs = totpgs - npgs; 2088 pp = pplist; 2089 2090 /* 2091 * Clear the free and age bits. Also if we were passed in a ppa then 2092 * fill it in with all the constituent pages from the large page. But 2093 * if we failed to allocate all the pages just free what we got. 2094 */ 2095 while (npgs != 0) { 2096 ASSERT(PP_ISFREE(pp)); 2097 ASSERT(PP_ISAGED(pp)); 2098 if (ppa != NULL || err != 0) { 2099 if (err == 0) { 2100 VM_STAT_ADD(alloc_pages[5]); 2101 PP_CLRFREE(pp); 2102 PP_CLRAGED(pp); 2103 page_sub(&pplist, pp); 2104 *ppa++ = pp; 2105 npgs--; 2106 } else { 2107 VM_STAT_ADD(alloc_pages[6]); 2108 ASSERT(pp->p_szc != 0); 2109 curnpgs = page_get_pagecnt(pp->p_szc); 2110 page_list_break(&pp, &pplist, curnpgs); 2111 page_list_add_pages(pp, 0); 2112 page_create_putback(curnpgs); 2113 ASSERT(npgs >= curnpgs); 2114 npgs -= curnpgs; 2115 } 2116 pp = pplist; 2117 } else { 2118 VM_STAT_ADD(alloc_pages[7]); 2119 PP_CLRFREE(pp); 2120 PP_CLRAGED(pp); 2121 pp = pp->p_next; 2122 npgs--; 2123 } 2124 } 2125 return (err); 2126 } 2127 2128 /* 2129 * Get a single large page off of the freelists, and set it up for use. 2130 * Number of bytes requested must be a supported page size. 2131 * 2132 * Note that this call may fail even if there is sufficient 2133 * memory available or PG_WAIT is set, so the caller must 2134 * be willing to fallback on page_create_va(), block and retry, 2135 * or fail the requester. 
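 *
 * A sketch of the fallback pattern a caller might use (illustrative
 * only, not a quote of any existing caller):
 *
 *	if ((pp = page_create_va_large(vp, off, bytes, flags, seg, vaddr,
 *	    NULL)) == NULL)
 *		pp = page_create_va(vp, off, bytes, flags, seg, vaddr);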
2136 */ 2137 page_t * 2138 page_create_va_large(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags, 2139 struct seg *seg, caddr_t vaddr, void *arg) 2140 { 2141 pgcnt_t npages; 2142 page_t *pp; 2143 page_t *rootpp; 2144 lgrp_t *lgrp; 2145 lgrp_id_t *lgrpid = (lgrp_id_t *)arg; 2146 2147 ASSERT(vp != NULL); 2148 2149 ASSERT((flags & ~(PG_EXCL | PG_WAIT | 2150 PG_NORELOC | PG_PANIC | PG_PUSHPAGE | PG_NORMALPRI)) == 0); 2151 /* but no others */ 2152 2153 ASSERT((flags & PG_EXCL) == PG_EXCL); 2154 2155 npages = btop(bytes); 2156 2157 if (!kcage_on || panicstr) { 2158 /* 2159 * Cage is OFF, or we are single threaded in 2160 * panic, so make everything a RELOC request. 2161 */ 2162 flags &= ~PG_NORELOC; 2163 } 2164 2165 /* 2166 * Make sure there's adequate physical memory available. 2167 * Note: PG_WAIT is ignored here. 2168 */ 2169 if (freemem <= throttlefree + npages) { 2170 VM_STAT_ADD(page_create_large_cnt[1]); 2171 return (NULL); 2172 } 2173 2174 /* 2175 * If cage is on, dampen draw from cage when available 2176 * cage space is low. 2177 */ 2178 if ((flags & (PG_NORELOC | PG_WAIT)) == (PG_NORELOC | PG_WAIT) && 2179 kcage_freemem < kcage_throttlefree + npages) { 2180 2181 /* 2182 * The cage is on, the caller wants PG_NORELOC 2183 * pages and available cage memory is very low. 2184 * Call kcage_create_throttle() to attempt to 2185 * control demand on the cage. 2186 */ 2187 if (kcage_create_throttle(npages, flags) == KCT_FAILURE) { 2188 VM_STAT_ADD(page_create_large_cnt[2]); 2189 return (NULL); 2190 } 2191 } 2192 2193 if (!pcf_decrement_bucket(npages) && 2194 !pcf_decrement_multiple(NULL, npages, 1)) { 2195 VM_STAT_ADD(page_create_large_cnt[4]); 2196 return (NULL); 2197 } 2198 2199 /* 2200 * This is where this function behaves fundamentally differently 2201 * than page_create_va(); since we're intending to map the page 2202 * with a single TTE, we have to get it as a physically contiguous 2203 * hardware pagesize chunk. If we can't, we fail. 2204 */ 2205 if (lgrpid != NULL && *lgrpid >= 0 && *lgrpid <= lgrp_alloc_max && 2206 LGRP_EXISTS(lgrp_table[*lgrpid])) 2207 lgrp = lgrp_table[*lgrpid]; 2208 else 2209 lgrp = lgrp_mem_choose(seg, vaddr, bytes); 2210 2211 if ((rootpp = page_get_freelist(&kvp, off, seg, vaddr, 2212 bytes, flags & ~PG_MATCH_COLOR, lgrp)) == NULL) { 2213 page_create_putback(npages); 2214 VM_STAT_ADD(page_create_large_cnt[5]); 2215 return (NULL); 2216 } 2217 2218 /* 2219 * if we got the page with the wrong mtype give it back this is a 2220 * workaround for CR 6249718. When CR 6249718 is fixed we never get 2221 * inside "if" and the workaround becomes just a nop 2222 */ 2223 if (kcage_on && (flags & PG_NORELOC) && !PP_ISNORELOC(rootpp)) { 2224 page_list_add_pages(rootpp, 0); 2225 page_create_putback(npages); 2226 VM_STAT_ADD(page_create_large_cnt[6]); 2227 return (NULL); 2228 } 2229 2230 /* 2231 * If satisfying this request has left us with too little 2232 * memory, start the wheels turning to get some back. The 2233 * first clause of the test prevents waking up the pageout 2234 * daemon in situations where it would decide that there's 2235 * nothing to do. 
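 * (Roughly speaking, desscan is the number of pages the pageout
 * scanner aims to examine in the current interval and nscan is how
 * many it has examined so far, so once the scanner is already meeting
 * its goal there is nothing to gain by signalling it again.)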
2236 */ 2237 if (nscan < desscan && freemem < minfree) { 2238 TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL, 2239 "pageout_cv_signal:freemem %ld", freemem); 2240 cv_signal(&proc_pageout->p_cv); 2241 } 2242 2243 pp = rootpp; 2244 while (npages--) { 2245 ASSERT(PAGE_EXCL(pp)); 2246 ASSERT(pp->p_vnode == NULL); 2247 ASSERT(!hat_page_is_mapped(pp)); 2248 PP_CLRFREE(pp); 2249 PP_CLRAGED(pp); 2250 if (!page_hashin(pp, vp, off, NULL)) 2251 panic("page_create_large: hashin failed: page %p", 2252 (void *)pp); 2253 page_io_lock(pp); 2254 off += PAGESIZE; 2255 pp = pp->p_next; 2256 } 2257 2258 VM_STAT_ADD(page_create_large_cnt[0]); 2259 return (rootpp); 2260 } 2261 2262 page_t * 2263 page_create_va(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags, 2264 struct seg *seg, caddr_t vaddr) 2265 { 2266 page_t *plist = NULL; 2267 pgcnt_t npages; 2268 pgcnt_t found_on_free = 0; 2269 pgcnt_t pages_req; 2270 page_t *npp = NULL; 2271 struct pcf *p; 2272 lgrp_t *lgrp; 2273 2274 TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_START, 2275 "page_create_start:vp %p off %llx bytes %lu flags %x", 2276 vp, off, bytes, flags); 2277 2278 ASSERT(bytes != 0 && vp != NULL); 2279 2280 if ((flags & PG_EXCL) == 0 && (flags & PG_WAIT) == 0) { 2281 panic("page_create: invalid flags"); 2282 /*NOTREACHED*/ 2283 } 2284 ASSERT((flags & ~(PG_EXCL | PG_WAIT | 2285 PG_NORELOC | PG_PANIC | PG_PUSHPAGE | PG_NORMALPRI)) == 0); 2286 /* but no others */ 2287 2288 pages_req = npages = btopr(bytes); 2289 /* 2290 * Try to see whether request is too large to *ever* be 2291 * satisfied, in order to prevent deadlock. We arbitrarily 2292 * decide to limit maximum size requests to max_page_get. 2293 */ 2294 if (npages >= max_page_get) { 2295 if ((flags & PG_WAIT) == 0) { 2296 TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_TOOBIG, 2297 "page_create_toobig:vp %p off %llx npages " 2298 "%lu max_page_get %lu", 2299 vp, off, npages, max_page_get); 2300 return (NULL); 2301 } else { 2302 cmn_err(CE_WARN, 2303 "Request for too much kernel memory " 2304 "(%lu bytes), will hang forever", bytes); 2305 for (;;) 2306 delay(1000000000); 2307 } 2308 } 2309 2310 if (!kcage_on || panicstr) { 2311 /* 2312 * Cage is OFF, or we are single threaded in 2313 * panic, so make everything a RELOC request. 2314 */ 2315 flags &= ~PG_NORELOC; 2316 } 2317 2318 if (freemem <= throttlefree + npages) 2319 if (!page_create_throttle(npages, flags)) 2320 return (NULL); 2321 2322 /* 2323 * If cage is on, dampen draw from cage when available 2324 * cage space is low. 2325 */ 2326 if ((flags & PG_NORELOC) && 2327 kcage_freemem < kcage_throttlefree + npages) { 2328 2329 /* 2330 * The cage is on, the caller wants PG_NORELOC 2331 * pages and available cage memory is very low. 2332 * Call kcage_create_throttle() to attempt to 2333 * control demand on the cage. 2334 */ 2335 if (kcage_create_throttle(npages, flags) == KCT_FAILURE) 2336 return (NULL); 2337 } 2338 2339 VM_STAT_ADD(page_create_cnt[0]); 2340 2341 if (!pcf_decrement_bucket(npages)) { 2342 /* 2343 * Have to look harder. If npages is greater than 2344 * one, then we might have to coalesce the counters. 2345 * 2346 * Go wait. We come back having accounted 2347 * for the memory. 
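 * If page_create_wait() below succeeds (and likewise when
 * pcf_decrement_bucket() already succeeded), the whole request has
 * been debited from the pcf counters. Any page we end up not
 * consuming must be credited back, which is what the "overshoot"
 * handling at the bottom of this routine does; the large page paths
 * use page_create_putback() for the same purpose.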
2348 */ 2349 VM_STAT_ADD(page_create_cnt[1]); 2350 if (!page_create_wait(npages, flags)) { 2351 VM_STAT_ADD(page_create_cnt[2]); 2352 return (NULL); 2353 } 2354 } 2355 2356 TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SUCCESS, 2357 "page_create_success:vp %p off %llx", vp, off); 2358 2359 /* 2360 * If satisfying this request has left us with too little 2361 * memory, start the wheels turning to get some back. The 2362 * first clause of the test prevents waking up the pageout 2363 * daemon in situations where it would decide that there's 2364 * nothing to do. 2365 */ 2366 if (nscan < desscan && freemem < minfree) { 2367 TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL, 2368 "pageout_cv_signal:freemem %ld", freemem); 2369 cv_signal(&proc_pageout->p_cv); 2370 } 2371 2372 /* 2373 * Loop around collecting the requested number of pages. 2374 * Most of the time, we have to `create' a new page. With 2375 * this in mind, pull the page off the free list before 2376 * getting the hash lock. This will minimize the hash 2377 * lock hold time, nesting, and the like. If it turns 2378 * out we don't need the page, we put it back at the end. 2379 */ 2380 while (npages--) { 2381 page_t *pp; 2382 kmutex_t *phm = NULL; 2383 ulong_t index; 2384 2385 index = PAGE_HASH_FUNC(vp, off); 2386 top: 2387 ASSERT(phm == NULL); 2388 ASSERT(index == PAGE_HASH_FUNC(vp, off)); 2389 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); 2390 2391 if (npp == NULL) { 2392 /* 2393 * Try to get a page from the freelist (ie, 2394 * a page with no [vp, off] tag). If that 2395 * fails, use the cachelist. 2396 * 2397 * During the first attempt at both the free 2398 * and cache lists we try for the correct color. 2399 */ 2400 /* 2401 * XXXX-how do we deal with virtual indexed 2402 * caches and and colors? 2403 */ 2404 VM_STAT_ADD(page_create_cnt[4]); 2405 /* 2406 * Get lgroup to allocate next page of shared memory 2407 * from and use it to specify where to allocate 2408 * the physical memory 2409 */ 2410 lgrp = lgrp_mem_choose(seg, vaddr, PAGESIZE); 2411 npp = page_get_freelist(vp, off, seg, vaddr, PAGESIZE, 2412 flags | PG_MATCH_COLOR, lgrp); 2413 if (npp == NULL) { 2414 npp = page_get_cachelist(vp, off, seg, 2415 vaddr, flags | PG_MATCH_COLOR, lgrp); 2416 if (npp == NULL) { 2417 npp = page_create_get_something(vp, 2418 off, seg, vaddr, 2419 flags & ~PG_MATCH_COLOR); 2420 } 2421 2422 if (PP_ISAGED(npp) == 0) { 2423 /* 2424 * Since this page came from the 2425 * cachelist, we must destroy the 2426 * old vnode association. 2427 */ 2428 page_hashout(npp, NULL); 2429 } 2430 } 2431 } 2432 2433 /* 2434 * We own this page! 2435 */ 2436 ASSERT(PAGE_EXCL(npp)); 2437 ASSERT(npp->p_vnode == NULL); 2438 ASSERT(!hat_page_is_mapped(npp)); 2439 PP_CLRFREE(npp); 2440 PP_CLRAGED(npp); 2441 2442 /* 2443 * Here we have a page in our hot little mits and are 2444 * just waiting to stuff it on the appropriate lists. 2445 * Get the mutex and check to see if it really does 2446 * not exist. 2447 */ 2448 phm = PAGE_HASH_MUTEX(index); 2449 mutex_enter(phm); 2450 PAGE_HASH_SEARCH(index, pp, vp, off); 2451 if (pp == NULL) { 2452 VM_STAT_ADD(page_create_new); 2453 pp = npp; 2454 npp = NULL; 2455 if (!page_hashin(pp, vp, off, phm)) { 2456 /* 2457 * Since we hold the page hash mutex and 2458 * just searched for this page, page_hashin 2459 * had better not fail. If it does, that 2460 * means somethread did not follow the 2461 * page hash mutex rules. Panic now and 2462 * get it over with. As usual, go down 2463 * holding all the locks. 
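 * The rule being relied upon is simply that a bucket's hash mutex be
 * held across both the lookup and the insert, i.e. (a sketch; "newpp"
 * is whatever page is being given this identity):
 *
 *	index = PAGE_HASH_FUNC(vp, off);
 *	phm = PAGE_HASH_MUTEX(index);
 *	mutex_enter(phm);
 *	PAGE_HASH_SEARCH(index, pp, vp, off);
 *	if (pp == NULL)
 *		(void) page_hashin(newpp, vp, off, phm);
 *	mutex_exit(phm);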
2464 */ 2465 ASSERT(MUTEX_HELD(phm)); 2466 panic("page_create: " 2467 "hashin failed %p %p %llx %p", 2468 (void *)pp, (void *)vp, off, (void *)phm); 2469 /*NOTREACHED*/ 2470 } 2471 ASSERT(MUTEX_HELD(phm)); 2472 mutex_exit(phm); 2473 phm = NULL; 2474 2475 /* 2476 * Hat layer locking need not be done to set 2477 * the following bits since the page is not hashed 2478 * and was on the free list (i.e., had no mappings). 2479 * 2480 * Set the reference bit to protect 2481 * against immediate pageout 2482 * 2483 * XXXmh modify freelist code to set reference 2484 * bit so we don't have to do it here. 2485 */ 2486 page_set_props(pp, P_REF); 2487 found_on_free++; 2488 } else { 2489 VM_STAT_ADD(page_create_exists); 2490 if (flags & PG_EXCL) { 2491 /* 2492 * Found an existing page, and the caller 2493 * wanted all new pages. Undo all of the work 2494 * we have done. 2495 */ 2496 mutex_exit(phm); 2497 phm = NULL; 2498 while (plist != NULL) { 2499 pp = plist; 2500 page_sub(&plist, pp); 2501 page_io_unlock(pp); 2502 /* large pages should not end up here */ 2503 ASSERT(pp->p_szc == 0); 2504 /*LINTED: constant in conditional ctx*/ 2505 VN_DISPOSE(pp, B_INVAL, 0, kcred); 2506 } 2507 VM_STAT_ADD(page_create_found_one); 2508 goto fail; 2509 } 2510 ASSERT(flags & PG_WAIT); 2511 if (!page_lock(pp, SE_EXCL, phm, P_NO_RECLAIM)) { 2512 /* 2513 * Start all over again if we blocked trying 2514 * to lock the page. 2515 */ 2516 mutex_exit(phm); 2517 VM_STAT_ADD(page_create_page_lock_failed); 2518 phm = NULL; 2519 goto top; 2520 } 2521 mutex_exit(phm); 2522 phm = NULL; 2523 2524 if (PP_ISFREE(pp)) { 2525 ASSERT(PP_ISAGED(pp) == 0); 2526 VM_STAT_ADD(pagecnt.pc_get_cache); 2527 page_list_sub(pp, PG_CACHE_LIST); 2528 PP_CLRFREE(pp); 2529 found_on_free++; 2530 } 2531 } 2532 2533 /* 2534 * Got a page! It is locked. Acquire the i/o 2535 * lock since we are going to use the p_next and 2536 * p_prev fields to link the requested pages together. 2537 */ 2538 page_io_lock(pp); 2539 page_add(&plist, pp); 2540 plist = plist->p_next; 2541 off += PAGESIZE; 2542 vaddr += PAGESIZE; 2543 } 2544 2545 ASSERT((flags & PG_EXCL) ? (found_on_free == pages_req) : 1); 2546 fail: 2547 if (npp != NULL) { 2548 /* 2549 * Did not need this page after all. 2550 * Put it back on the free list. 2551 */ 2552 VM_STAT_ADD(page_create_putbacks); 2553 PP_SETFREE(npp); 2554 PP_SETAGED(npp); 2555 npp->p_offset = (u_offset_t)-1; 2556 page_list_add(npp, PG_FREE_LIST | PG_LIST_TAIL); 2557 page_unlock(npp); 2558 2559 } 2560 2561 ASSERT(pages_req >= found_on_free); 2562 2563 { 2564 uint_t overshoot = (uint_t)(pages_req - found_on_free); 2565 2566 if (overshoot) { 2567 VM_STAT_ADD(page_create_overshoot); 2568 p = &pcf[PCF_INDEX()]; 2569 mutex_enter(&p->pcf_lock); 2570 if (p->pcf_block) { 2571 p->pcf_reserve += overshoot; 2572 } else { 2573 p->pcf_count += overshoot; 2574 if (p->pcf_wait) { 2575 mutex_enter(&new_freemem_lock); 2576 if (freemem_wait) { 2577 cv_signal(&freemem_cv); 2578 p->pcf_wait--; 2579 } else { 2580 p->pcf_wait = 0; 2581 } 2582 mutex_exit(&new_freemem_lock); 2583 } 2584 } 2585 mutex_exit(&p->pcf_lock); 2586 /* freemem is approximate, so this test OK */ 2587 if (!p->pcf_block) 2588 freemem += overshoot; 2589 } 2590 } 2591 2592 return (plist); 2593 } 2594 2595 /* 2596 * One or more constituent pages of this large page has been marked 2597 * toxic. Simply demote the large page to PAGESIZE pages and let 2598 * page_free() handle it. This routine should only be called by 2599 * large page free routines (page_free_pages() and page_destroy_pages(). 
2600 * All pages are locked SE_EXCL and have already been marked free. 2601 */ 2602 static void 2603 page_free_toxic_pages(page_t *rootpp) 2604 { 2605 page_t *tpp; 2606 pgcnt_t i, pgcnt = page_get_pagecnt(rootpp->p_szc); 2607 uint_t szc = rootpp->p_szc; 2608 2609 for (i = 0, tpp = rootpp; i < pgcnt; i++, tpp = tpp->p_next) { 2610 ASSERT(tpp->p_szc == szc); 2611 ASSERT((PAGE_EXCL(tpp) && 2612 !page_iolock_assert(tpp)) || panicstr); 2613 tpp->p_szc = 0; 2614 } 2615 2616 while (rootpp != NULL) { 2617 tpp = rootpp; 2618 page_sub(&rootpp, tpp); 2619 ASSERT(PP_ISFREE(tpp)); 2620 PP_CLRFREE(tpp); 2621 page_free(tpp, 1); 2622 } 2623 } 2624 2625 /* 2626 * Put page on the "free" list. 2627 * The free list is really two lists maintained by 2628 * the PSM of whatever machine we happen to be on. 2629 */ 2630 void 2631 page_free(page_t *pp, int dontneed) 2632 { 2633 struct pcf *p; 2634 uint_t pcf_index; 2635 2636 ASSERT((PAGE_EXCL(pp) && 2637 !page_iolock_assert(pp)) || panicstr); 2638 2639 if (PP_ISFREE(pp)) { 2640 panic("page_free: page %p is free", (void *)pp); 2641 } 2642 2643 if (pp->p_szc != 0) { 2644 if (pp->p_vnode == NULL || IS_SWAPFSVP(pp->p_vnode) || 2645 PP_ISKAS(pp)) { 2646 panic("page_free: anon or kernel " 2647 "or no vnode large page %p", (void *)pp); 2648 } 2649 page_demote_vp_pages(pp); 2650 ASSERT(pp->p_szc == 0); 2651 } 2652 2653 /* 2654 * The page_struct_lock need not be acquired to examine these 2655 * fields since the page has an "exclusive" lock. 2656 */ 2657 if (hat_page_is_mapped(pp) || pp->p_lckcnt != 0 || pp->p_cowcnt != 0 || 2658 pp->p_slckcnt != 0) { 2659 panic("page_free pp=%p, pfn=%lx, lckcnt=%d, cowcnt=%d " 2660 "slckcnt = %d", (void *)pp, page_pptonum(pp), pp->p_lckcnt, 2661 pp->p_cowcnt, pp->p_slckcnt); 2662 /*NOTREACHED*/ 2663 } 2664 2665 ASSERT(!hat_page_getshare(pp)); 2666 2667 PP_SETFREE(pp); 2668 ASSERT(pp->p_vnode == NULL || !IS_VMODSORT(pp->p_vnode) || 2669 !hat_ismod(pp)); 2670 page_clr_all_props(pp); 2671 ASSERT(!hat_page_getshare(pp)); 2672 2673 /* 2674 * Now we add the page to the head of the free list. 2675 * But if this page is associated with a paged vnode 2676 * then we adjust the head forward so that the page is 2677 * effectively at the end of the list. 2678 */ 2679 if (pp->p_vnode == NULL) { 2680 /* 2681 * Page has no identity, put it on the free list. 2682 */ 2683 PP_SETAGED(pp); 2684 pp->p_offset = (u_offset_t)-1; 2685 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL); 2686 VM_STAT_ADD(pagecnt.pc_free_free); 2687 TRACE_1(TR_FAC_VM, TR_PAGE_FREE_FREE, 2688 "page_free_free:pp %p", pp); 2689 } else { 2690 PP_CLRAGED(pp); 2691 2692 if (!dontneed) { 2693 /* move it to the tail of the list */ 2694 page_list_add(pp, PG_CACHE_LIST | PG_LIST_TAIL); 2695 2696 VM_STAT_ADD(pagecnt.pc_free_cache); 2697 TRACE_1(TR_FAC_VM, TR_PAGE_FREE_CACHE_TAIL, 2698 "page_free_cache_tail:pp %p", pp); 2699 } else { 2700 page_list_add(pp, PG_CACHE_LIST | PG_LIST_HEAD); 2701 2702 VM_STAT_ADD(pagecnt.pc_free_dontneed); 2703 TRACE_1(TR_FAC_VM, TR_PAGE_FREE_CACHE_HEAD, 2704 "page_free_cache_head:pp %p", pp); 2705 } 2706 } 2707 page_unlock(pp); 2708 2709 /* 2710 * Now do the `freemem' accounting. 2711 */ 2712 pcf_index = PCF_INDEX(); 2713 p = &pcf[pcf_index]; 2714 2715 mutex_enter(&p->pcf_lock); 2716 if (p->pcf_block) { 2717 p->pcf_reserve += 1; 2718 } else { 2719 p->pcf_count += 1; 2720 if (p->pcf_wait) { 2721 mutex_enter(&new_freemem_lock); 2722 /* 2723 * Check to see if some other thread 2724 * is actually waiting. Another bucket 2725 * may have woken it up by now. 
If there 2726 * are no waiters, then set our pcf_wait 2727 * count to zero to avoid coming in here 2728 * next time. Also, since only one page 2729 * was put on the free list, just wake 2730 * up one waiter. 2731 */ 2732 if (freemem_wait) { 2733 cv_signal(&freemem_cv); 2734 p->pcf_wait--; 2735 } else { 2736 p->pcf_wait = 0; 2737 } 2738 mutex_exit(&new_freemem_lock); 2739 } 2740 } 2741 mutex_exit(&p->pcf_lock); 2742 2743 /* freemem is approximate, so this test OK */ 2744 if (!p->pcf_block) 2745 freemem += 1; 2746 } 2747 2748 /* 2749 * Put page on the "free" list during intial startup. 2750 * This happens during initial single threaded execution. 2751 */ 2752 void 2753 page_free_at_startup(page_t *pp) 2754 { 2755 struct pcf *p; 2756 uint_t pcf_index; 2757 2758 page_list_add(pp, PG_FREE_LIST | PG_LIST_HEAD | PG_LIST_ISINIT); 2759 VM_STAT_ADD(pagecnt.pc_free_free); 2760 2761 /* 2762 * Now do the `freemem' accounting. 2763 */ 2764 pcf_index = PCF_INDEX(); 2765 p = &pcf[pcf_index]; 2766 2767 ASSERT(p->pcf_block == 0); 2768 ASSERT(p->pcf_wait == 0); 2769 p->pcf_count += 1; 2770 2771 /* freemem is approximate, so this is OK */ 2772 freemem += 1; 2773 } 2774 2775 void 2776 page_free_pages(page_t *pp) 2777 { 2778 page_t *tpp, *rootpp = NULL; 2779 pgcnt_t pgcnt = page_get_pagecnt(pp->p_szc); 2780 pgcnt_t i; 2781 uint_t szc = pp->p_szc; 2782 2783 VM_STAT_ADD(pagecnt.pc_free_pages); 2784 TRACE_1(TR_FAC_VM, TR_PAGE_FREE_FREE, 2785 "page_free_free:pp %p", pp); 2786 2787 ASSERT(pp->p_szc != 0 && pp->p_szc < page_num_pagesizes()); 2788 if ((page_pptonum(pp) & (pgcnt - 1)) != 0) { 2789 panic("page_free_pages: not root page %p", (void *)pp); 2790 /*NOTREACHED*/ 2791 } 2792 2793 for (i = 0, tpp = pp; i < pgcnt; i++, tpp++) { 2794 ASSERT((PAGE_EXCL(tpp) && 2795 !page_iolock_assert(tpp)) || panicstr); 2796 if (PP_ISFREE(tpp)) { 2797 panic("page_free_pages: page %p is free", (void *)tpp); 2798 /*NOTREACHED*/ 2799 } 2800 if (hat_page_is_mapped(tpp) || tpp->p_lckcnt != 0 || 2801 tpp->p_cowcnt != 0 || tpp->p_slckcnt != 0) { 2802 panic("page_free_pages %p", (void *)tpp); 2803 /*NOTREACHED*/ 2804 } 2805 2806 ASSERT(!hat_page_getshare(tpp)); 2807 ASSERT(tpp->p_vnode == NULL); 2808 ASSERT(tpp->p_szc == szc); 2809 2810 PP_SETFREE(tpp); 2811 page_clr_all_props(tpp); 2812 PP_SETAGED(tpp); 2813 tpp->p_offset = (u_offset_t)-1; 2814 ASSERT(tpp->p_next == tpp); 2815 ASSERT(tpp->p_prev == tpp); 2816 page_list_concat(&rootpp, &tpp); 2817 } 2818 ASSERT(rootpp == pp); 2819 2820 page_list_add_pages(rootpp, 0); 2821 page_create_putback(pgcnt); 2822 } 2823 2824 int free_pages = 1; 2825 2826 /* 2827 * This routine attempts to return pages to the cachelist via page_release(). 2828 * It does not *have* to be successful in all cases, since the pageout scanner 2829 * will catch any pages it misses. It does need to be fast and not introduce 2830 * too much overhead. 2831 * 2832 * If a page isn't found on the unlocked sweep of the page_hash bucket, we 2833 * don't lock and retry. This is ok, since the page scanner will eventually 2834 * find any page we miss in free_vp_pages(). 2835 */ 2836 void 2837 free_vp_pages(vnode_t *vp, u_offset_t off, size_t len) 2838 { 2839 page_t *pp; 2840 u_offset_t eoff; 2841 extern int swap_in_range(vnode_t *, u_offset_t, size_t); 2842 2843 eoff = off + len; 2844 2845 if (free_pages == 0) 2846 return; 2847 if (swap_in_range(vp, off, len)) 2848 return; 2849 2850 for (; off < eoff; off += PAGESIZE) { 2851 2852 /* 2853 * find the page using a fast, but inexact search. 
It'll be OK 2854 * if a few pages slip through the cracks here. 2855 */ 2856 pp = page_exists(vp, off); 2857 2858 /* 2859 * If we didn't find the page (it may not exist), the page 2860 * is free, looks still in use (shared), or we can't lock it, 2861 * just give up. 2862 */ 2863 if (pp == NULL || 2864 PP_ISFREE(pp) || 2865 page_share_cnt(pp) > 0 || 2866 !page_trylock(pp, SE_EXCL)) 2867 continue; 2868 2869 /* 2870 * Once we have locked pp, verify that it's still the 2871 * correct page and not already free 2872 */ 2873 ASSERT(PAGE_LOCKED_SE(pp, SE_EXCL)); 2874 if (pp->p_vnode != vp || pp->p_offset != off || PP_ISFREE(pp)) { 2875 page_unlock(pp); 2876 continue; 2877 } 2878 2879 /* 2880 * try to release the page... 2881 */ 2882 (void) page_release(pp, 1); 2883 } 2884 } 2885 2886 /* 2887 * Reclaim the given page from the free list. 2888 * If pp is part of a large pages, only the given constituent page is reclaimed 2889 * and the large page it belonged to will be demoted. This can only happen 2890 * if the page is not on the cachelist. 2891 * 2892 * Returns 1 on success or 0 on failure. 2893 * 2894 * The page is unlocked if it can't be reclaimed (when freemem == 0). 2895 * If `lock' is non-null, it will be dropped and re-acquired if 2896 * the routine must wait while freemem is 0. 2897 * 2898 * As it turns out, boot_getpages() does this. It picks a page, 2899 * based on where OBP mapped in some address, gets its pfn, searches 2900 * the memsegs, locks the page, then pulls it off the free list! 2901 */ 2902 int 2903 page_reclaim(page_t *pp, kmutex_t *lock) 2904 { 2905 struct pcf *p; 2906 struct cpu *cpup; 2907 int enough; 2908 uint_t i; 2909 2910 ASSERT(lock != NULL ? MUTEX_HELD(lock) : 1); 2911 ASSERT(PAGE_EXCL(pp) && PP_ISFREE(pp)); 2912 2913 /* 2914 * If `freemem' is 0, we cannot reclaim this page from the 2915 * freelist, so release every lock we might hold: the page, 2916 * and the `lock' before blocking. 2917 * 2918 * The only way `freemem' can become 0 while there are pages 2919 * marked free (have their p->p_free bit set) is when the 2920 * system is low on memory and doing a page_create(). In 2921 * order to guarantee that once page_create() starts acquiring 2922 * pages it will be able to get all that it needs since `freemem' 2923 * was decreased by the requested amount. So, we need to release 2924 * this page, and let page_create() have it. 2925 * 2926 * Since `freemem' being zero is not supposed to happen, just 2927 * use the usual hash stuff as a starting point. If that bucket 2928 * is empty, then assume the worst, and start at the beginning 2929 * of the pcf array. If we always start at the beginning 2930 * when acquiring more than one pcf lock, there won't be any 2931 * deadlock problems. 2932 */ 2933 2934 /* TODO: Do we need to test kcage_freemem if PG_NORELOC(pp)? */ 2935 2936 if (freemem <= throttlefree && !page_create_throttle(1l, 0)) { 2937 pcf_acquire_all(); 2938 goto page_reclaim_nomem; 2939 } 2940 2941 enough = pcf_decrement_bucket(1); 2942 2943 if (!enough) { 2944 VM_STAT_ADD(page_reclaim_zero); 2945 /* 2946 * Check again. Its possible that some other thread 2947 * could have been right behind us, and added one 2948 * to a list somewhere. Acquire each of the pcf locks 2949 * until we find a page. 2950 */ 2951 p = pcf; 2952 for (i = 0; i < pcf_fanout; i++) { 2953 mutex_enter(&p->pcf_lock); 2954 if (p->pcf_count >= 1) { 2955 p->pcf_count -= 1; 2956 /* 2957 * freemem is not protected by any lock. Thus, 2958 * we cannot have any assertion containing 2959 * freemem here. 
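 * (freemem is only a loosely maintained sum; the per-bucket pcf_count
 * values, each under its own pcf_lock, are the authoritative counts,
 * which is also why the code here and elsewhere notes that tests on
 * freemem are merely approximate.)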
2960 */ 2961 freemem -= 1; 2962 enough = 1; 2963 break; 2964 } 2965 p++; 2966 } 2967 2968 if (!enough) { 2969 page_reclaim_nomem: 2970 /* 2971 * We really can't have page `pp'. 2972 * Time for the no-memory dance with 2973 * page_free(). This is just like 2974 * page_create_wait(). Plus the added 2975 * attraction of releasing whatever mutex 2976 * we held when we were called with in `lock'. 2977 * Page_unlock() will wakeup any thread 2978 * waiting around for this page. 2979 */ 2980 if (lock) { 2981 VM_STAT_ADD(page_reclaim_zero_locked); 2982 mutex_exit(lock); 2983 } 2984 page_unlock(pp); 2985 2986 /* 2987 * get this before we drop all the pcf locks. 2988 */ 2989 mutex_enter(&new_freemem_lock); 2990 2991 p = pcf; 2992 for (i = 0; i < pcf_fanout; i++) { 2993 p->pcf_wait++; 2994 mutex_exit(&p->pcf_lock); 2995 p++; 2996 } 2997 2998 freemem_wait++; 2999 cv_wait(&freemem_cv, &new_freemem_lock); 3000 freemem_wait--; 3001 3002 mutex_exit(&new_freemem_lock); 3003 3004 if (lock) { 3005 mutex_enter(lock); 3006 } 3007 return (0); 3008 } 3009 3010 /* 3011 * The pcf accounting has been done, 3012 * though none of the pcf_wait flags have been set, 3013 * drop the locks and continue on. 3014 */ 3015 while (p >= pcf) { 3016 mutex_exit(&p->pcf_lock); 3017 p--; 3018 } 3019 } 3020 3021 3022 VM_STAT_ADD(pagecnt.pc_reclaim); 3023 3024 /* 3025 * page_list_sub will handle the case where pp is a large page. 3026 * It's possible that the page was promoted while on the freelist 3027 */ 3028 if (PP_ISAGED(pp)) { 3029 page_list_sub(pp, PG_FREE_LIST); 3030 TRACE_1(TR_FAC_VM, TR_PAGE_UNFREE_FREE, 3031 "page_reclaim_free:pp %p", pp); 3032 } else { 3033 page_list_sub(pp, PG_CACHE_LIST); 3034 TRACE_1(TR_FAC_VM, TR_PAGE_UNFREE_CACHE, 3035 "page_reclaim_cache:pp %p", pp); 3036 } 3037 3038 /* 3039 * clear the p_free & p_age bits since this page is no longer 3040 * on the free list. Notice that there was a brief time where 3041 * a page is marked as free, but is not on the list. 3042 * 3043 * Set the reference bit to protect against immediate pageout. 3044 */ 3045 PP_CLRFREE(pp); 3046 PP_CLRAGED(pp); 3047 page_set_props(pp, P_REF); 3048 3049 CPU_STATS_ENTER_K(); 3050 cpup = CPU; /* get cpup now that CPU cannot change */ 3051 CPU_STATS_ADDQ(cpup, vm, pgrec, 1); 3052 CPU_STATS_ADDQ(cpup, vm, pgfrec, 1); 3053 CPU_STATS_EXIT_K(); 3054 ASSERT(pp->p_szc == 0); 3055 3056 return (1); 3057 } 3058 3059 /* 3060 * Destroy identity of the page and put it back on 3061 * the page free list. Assumes that the caller has 3062 * acquired the "exclusive" lock on the page. 3063 */ 3064 void 3065 page_destroy(page_t *pp, int dontfree) 3066 { 3067 ASSERT((PAGE_EXCL(pp) && 3068 !page_iolock_assert(pp)) || panicstr); 3069 ASSERT(pp->p_slckcnt == 0 || panicstr); 3070 3071 if (pp->p_szc != 0) { 3072 if (pp->p_vnode == NULL || IS_SWAPFSVP(pp->p_vnode) || 3073 PP_ISKAS(pp)) { 3074 panic("page_destroy: anon or kernel or no vnode " 3075 "large page %p", (void *)pp); 3076 } 3077 page_demote_vp_pages(pp); 3078 ASSERT(pp->p_szc == 0); 3079 } 3080 3081 TRACE_1(TR_FAC_VM, TR_PAGE_DESTROY, "page_destroy:pp %p", pp); 3082 3083 /* 3084 * Unload translations, if any, then hash out the 3085 * page to erase its identity. 3086 */ 3087 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 3088 page_hashout(pp, NULL); 3089 3090 if (!dontfree) { 3091 /* 3092 * Acquire the "freemem_lock" for availrmem. 3093 * The page_struct_lock need not be acquired for lckcnt 3094 * and cowcnt since the page has an "exclusive" lock. 
3095 * We are doing a modified version of page_pp_unlock here. 3096 */ 3097 if ((pp->p_lckcnt != 0) || (pp->p_cowcnt != 0)) { 3098 mutex_enter(&freemem_lock); 3099 if (pp->p_lckcnt != 0) { 3100 availrmem++; 3101 pages_locked--; 3102 pp->p_lckcnt = 0; 3103 } 3104 if (pp->p_cowcnt != 0) { 3105 availrmem += pp->p_cowcnt; 3106 pages_locked -= pp->p_cowcnt; 3107 pp->p_cowcnt = 0; 3108 } 3109 mutex_exit(&freemem_lock); 3110 } 3111 /* 3112 * Put the page on the "free" list. 3113 */ 3114 page_free(pp, 0); 3115 } 3116 } 3117 3118 void 3119 page_destroy_pages(page_t *pp) 3120 { 3121 3122 page_t *tpp, *rootpp = NULL; 3123 pgcnt_t pgcnt = page_get_pagecnt(pp->p_szc); 3124 pgcnt_t i, pglcks = 0; 3125 uint_t szc = pp->p_szc; 3126 3127 ASSERT(pp->p_szc != 0 && pp->p_szc < page_num_pagesizes()); 3128 3129 VM_STAT_ADD(pagecnt.pc_destroy_pages); 3130 3131 TRACE_1(TR_FAC_VM, TR_PAGE_DESTROY, "page_destroy_pages:pp %p", pp); 3132 3133 if ((page_pptonum(pp) & (pgcnt - 1)) != 0) { 3134 panic("page_destroy_pages: not root page %p", (void *)pp); 3135 /*NOTREACHED*/ 3136 } 3137 3138 for (i = 0, tpp = pp; i < pgcnt; i++, tpp++) { 3139 ASSERT((PAGE_EXCL(tpp) && 3140 !page_iolock_assert(tpp)) || panicstr); 3141 ASSERT(tpp->p_slckcnt == 0 || panicstr); 3142 (void) hat_pageunload(tpp, HAT_FORCE_PGUNLOAD); 3143 page_hashout(tpp, NULL); 3144 ASSERT(tpp->p_offset == (u_offset_t)-1); 3145 if (tpp->p_lckcnt != 0) { 3146 pglcks++; 3147 tpp->p_lckcnt = 0; 3148 } else if (tpp->p_cowcnt != 0) { 3149 pglcks += tpp->p_cowcnt; 3150 tpp->p_cowcnt = 0; 3151 } 3152 ASSERT(!hat_page_getshare(tpp)); 3153 ASSERT(tpp->p_vnode == NULL); 3154 ASSERT(tpp->p_szc == szc); 3155 3156 PP_SETFREE(tpp); 3157 page_clr_all_props(tpp); 3158 PP_SETAGED(tpp); 3159 ASSERT(tpp->p_next == tpp); 3160 ASSERT(tpp->p_prev == tpp); 3161 page_list_concat(&rootpp, &tpp); 3162 } 3163 3164 ASSERT(rootpp == pp); 3165 if (pglcks != 0) { 3166 mutex_enter(&freemem_lock); 3167 availrmem += pglcks; 3168 mutex_exit(&freemem_lock); 3169 } 3170 3171 page_list_add_pages(rootpp, 0); 3172 page_create_putback(pgcnt); 3173 } 3174 3175 /* 3176 * Similar to page_destroy(), but destroys pages which are 3177 * locked and known to be on the page free list. Since 3178 * the page is known to be free and locked, no one can access 3179 * it. 3180 * 3181 * Also, the number of free pages does not change. 3182 */ 3183 void 3184 page_destroy_free(page_t *pp) 3185 { 3186 ASSERT(PAGE_EXCL(pp)); 3187 ASSERT(PP_ISFREE(pp)); 3188 ASSERT(pp->p_vnode); 3189 ASSERT(hat_page_getattr(pp, P_MOD | P_REF | P_RO) == 0); 3190 ASSERT(!hat_page_is_mapped(pp)); 3191 ASSERT(PP_ISAGED(pp) == 0); 3192 ASSERT(pp->p_szc == 0); 3193 3194 VM_STAT_ADD(pagecnt.pc_destroy_free); 3195 page_list_sub(pp, PG_CACHE_LIST); 3196 3197 page_hashout(pp, NULL); 3198 ASSERT(pp->p_vnode == NULL); 3199 ASSERT(pp->p_offset == (u_offset_t)-1); 3200 ASSERT(pp->p_hash == NULL); 3201 3202 PP_SETAGED(pp); 3203 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL); 3204 page_unlock(pp); 3205 3206 mutex_enter(&new_freemem_lock); 3207 if (freemem_wait) { 3208 cv_signal(&freemem_cv); 3209 } 3210 mutex_exit(&new_freemem_lock); 3211 } 3212 3213 /* 3214 * Rename the page "opp" to have an identity specified 3215 * by [vp, off]. If a page already exists with this name 3216 * it is locked and destroyed. Note that the page's 3217 * translations are not unloaded during the rename. 
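 * Because the translations stay loaded, any existing mappings keep
 * referencing the same physical page; only the page's name in the
 * page hash (and its place on the vnode's page list) changes.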
3218 * 3219 * This routine is used by the anon layer to "steal" the 3220 * original page and is not unlike destroying a page and 3221 * creating a new page using the same page frame. 3222 * 3223 * XXX -- Could deadlock if caller 1 tries to rename A to B while 3224 * caller 2 tries to rename B to A. 3225 */ 3226 void 3227 page_rename(page_t *opp, vnode_t *vp, u_offset_t off) 3228 { 3229 page_t *pp; 3230 int olckcnt = 0; 3231 int ocowcnt = 0; 3232 kmutex_t *phm; 3233 ulong_t index; 3234 3235 ASSERT(PAGE_EXCL(opp) && !page_iolock_assert(opp)); 3236 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); 3237 ASSERT(PP_ISFREE(opp) == 0); 3238 3239 VM_STAT_ADD(page_rename_count); 3240 3241 TRACE_3(TR_FAC_VM, TR_PAGE_RENAME, 3242 "page rename:pp %p vp %p off %llx", opp, vp, off); 3243 3244 /* 3245 * CacheFS may call page_rename for a large NFS page 3246 * when both CacheFS and NFS mount points are used 3247 * by applications. Demote this large page before 3248 * renaming it, to ensure that there are no "partial" 3249 * large pages left lying around. 3250 */ 3251 if (opp->p_szc != 0) { 3252 vnode_t *ovp = opp->p_vnode; 3253 ASSERT(ovp != NULL); 3254 ASSERT(!IS_SWAPFSVP(ovp)); 3255 ASSERT(!VN_ISKAS(ovp)); 3256 page_demote_vp_pages(opp); 3257 ASSERT(opp->p_szc == 0); 3258 } 3259 3260 page_hashout(opp, NULL); 3261 PP_CLRAGED(opp); 3262 3263 /* 3264 * Acquire the appropriate page hash lock, since 3265 * we're going to rename the page. 3266 */ 3267 index = PAGE_HASH_FUNC(vp, off); 3268 phm = PAGE_HASH_MUTEX(index); 3269 mutex_enter(phm); 3270 top: 3271 /* 3272 * Look for an existing page with this name and destroy it if found. 3273 * By holding the page hash lock all the way to the page_hashin() 3274 * call, we are assured that no page can be created with this 3275 * identity. In the case when the phm lock is dropped to undo any 3276 * hat layer mappings, the existing page is held with an "exclusive" 3277 * lock, again preventing another page from being created with 3278 * this identity. 3279 */ 3280 PAGE_HASH_SEARCH(index, pp, vp, off); 3281 if (pp != NULL) { 3282 VM_STAT_ADD(page_rename_exists); 3283 3284 /* 3285 * As it turns out, this is one of only two places where 3286 * page_lock() needs to hold the passed in lock in the 3287 * successful case. In all of the others, the lock could 3288 * be dropped as soon as the attempt is made to lock 3289 * the page. It is tempting to add yet another arguement, 3290 * PL_KEEP or PL_DROP, to let page_lock know what to do. 3291 */ 3292 if (!page_lock(pp, SE_EXCL, phm, P_RECLAIM)) { 3293 /* 3294 * Went to sleep because the page could not 3295 * be locked. We were woken up when the page 3296 * was unlocked, or when the page was destroyed. 3297 * In either case, `phm' was dropped while we 3298 * slept. Hence we should not just roar through 3299 * this loop. 3300 */ 3301 goto top; 3302 } 3303 3304 /* 3305 * If an existing page is a large page, then demote 3306 * it to ensure that no "partial" large pages are 3307 * "created" after page_rename. An existing page 3308 * can be a CacheFS page, and can't belong to swapfs. 3309 */ 3310 if (hat_page_is_mapped(pp)) { 3311 /* 3312 * Unload translations. Since we hold the 3313 * exclusive lock on this page, the page 3314 * can not be changed while we drop phm. 3315 * This is also not a lock protocol violation, 3316 * but rather the proper way to do things. 
3317 */ 3318 mutex_exit(phm); 3319 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 3320 if (pp->p_szc != 0) { 3321 ASSERT(!IS_SWAPFSVP(vp)); 3322 ASSERT(!VN_ISKAS(vp)); 3323 page_demote_vp_pages(pp); 3324 ASSERT(pp->p_szc == 0); 3325 } 3326 mutex_enter(phm); 3327 } else if (pp->p_szc != 0) { 3328 ASSERT(!IS_SWAPFSVP(vp)); 3329 ASSERT(!VN_ISKAS(vp)); 3330 mutex_exit(phm); 3331 page_demote_vp_pages(pp); 3332 ASSERT(pp->p_szc == 0); 3333 mutex_enter(phm); 3334 } 3335 page_hashout(pp, phm); 3336 } 3337 /* 3338 * Hash in the page with the new identity. 3339 */ 3340 if (!page_hashin(opp, vp, off, phm)) { 3341 /* 3342 * We were holding phm while we searched for [vp, off] 3343 * and only dropped phm if we found and locked a page. 3344 * If we can't create this page now, then some thing 3345 * is really broken. 3346 */ 3347 panic("page_rename: Can't hash in page: %p", (void *)pp); 3348 /*NOTREACHED*/ 3349 } 3350 3351 ASSERT(MUTEX_HELD(phm)); 3352 mutex_exit(phm); 3353 3354 /* 3355 * Now that we have dropped phm, lets get around to finishing up 3356 * with pp. 3357 */ 3358 if (pp != NULL) { 3359 ASSERT(!hat_page_is_mapped(pp)); 3360 /* for now large pages should not end up here */ 3361 ASSERT(pp->p_szc == 0); 3362 /* 3363 * Save the locks for transfer to the new page and then 3364 * clear them so page_free doesn't think they're important. 3365 * The page_struct_lock need not be acquired for lckcnt and 3366 * cowcnt since the page has an "exclusive" lock. 3367 */ 3368 olckcnt = pp->p_lckcnt; 3369 ocowcnt = pp->p_cowcnt; 3370 pp->p_lckcnt = pp->p_cowcnt = 0; 3371 3372 /* 3373 * Put the page on the "free" list after we drop 3374 * the lock. The less work under the lock the better. 3375 */ 3376 /*LINTED: constant in conditional context*/ 3377 VN_DISPOSE(pp, B_FREE, 0, kcred); 3378 } 3379 3380 /* 3381 * Transfer the lock count from the old page (if any). 3382 * The page_struct_lock need not be acquired for lckcnt and 3383 * cowcnt since the page has an "exclusive" lock. 3384 */ 3385 opp->p_lckcnt += olckcnt; 3386 opp->p_cowcnt += ocowcnt; 3387 } 3388 3389 /* 3390 * low level routine to add page `pp' to the hash and vp chains for [vp, offset] 3391 * 3392 * Pages are normally inserted at the start of a vnode's v_pages list. 3393 * If the vnode is VMODSORT and the page is modified, it goes at the end. 3394 * This can happen when a modified page is relocated for DR. 3395 * 3396 * Returns 1 on success and 0 on failure. 3397 */ 3398 static int 3399 page_do_hashin(page_t *pp, vnode_t *vp, u_offset_t offset) 3400 { 3401 page_t **listp; 3402 page_t *tp; 3403 ulong_t index; 3404 3405 ASSERT(PAGE_EXCL(pp)); 3406 ASSERT(vp != NULL); 3407 ASSERT(MUTEX_HELD(page_vnode_mutex(vp))); 3408 3409 /* 3410 * Be sure to set these up before the page is inserted on the hash 3411 * list. As soon as the page is placed on the list some other 3412 * thread might get confused and wonder how this page could 3413 * possibly hash to this list. 3414 */ 3415 pp->p_vnode = vp; 3416 pp->p_offset = offset; 3417 3418 /* 3419 * record if this page is on a swap vnode 3420 */ 3421 if ((vp->v_flag & VISSWAP) != 0) 3422 PP_SETSWAP(pp); 3423 3424 index = PAGE_HASH_FUNC(vp, offset); 3425 ASSERT(MUTEX_HELD(PAGE_HASH_MUTEX(index))); 3426 listp = &page_hash[index]; 3427 3428 /* 3429 * If this page is already hashed in, fail this attempt to add it. 
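 * (If a duplicate is found, the p_vnode and p_offset settings made
 * above are undone so the page is left looking unnamed, and the
 * caller sees a zero return.)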
3430 */ 3431 for (tp = *listp; tp != NULL; tp = tp->p_hash) { 3432 if (tp->p_vnode == vp && tp->p_offset == offset) { 3433 pp->p_vnode = NULL; 3434 pp->p_offset = (u_offset_t)(-1); 3435 return (0); 3436 } 3437 } 3438 pp->p_hash = *listp; 3439 *listp = pp; 3440 3441 /* 3442 * Add the page to the vnode's list of pages 3443 */ 3444 if (vp->v_pages != NULL && IS_VMODSORT(vp) && hat_ismod(pp)) 3445 listp = &vp->v_pages->p_vpprev->p_vpnext; 3446 else 3447 listp = &vp->v_pages; 3448 3449 page_vpadd(listp, pp); 3450 3451 return (1); 3452 } 3453 3454 /* 3455 * Add page `pp' to both the hash and vp chains for [vp, offset]. 3456 * 3457 * Returns 1 on success and 0 on failure. 3458 * If hold is passed in, it is not dropped. 3459 */ 3460 int 3461 page_hashin(page_t *pp, vnode_t *vp, u_offset_t offset, kmutex_t *hold) 3462 { 3463 kmutex_t *phm = NULL; 3464 kmutex_t *vphm; 3465 int rc; 3466 3467 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); 3468 ASSERT(pp->p_fsdata == 0 || panicstr); 3469 3470 TRACE_3(TR_FAC_VM, TR_PAGE_HASHIN, 3471 "page_hashin:pp %p vp %p offset %llx", 3472 pp, vp, offset); 3473 3474 VM_STAT_ADD(hashin_count); 3475 3476 if (hold != NULL) 3477 phm = hold; 3478 else { 3479 VM_STAT_ADD(hashin_not_held); 3480 phm = PAGE_HASH_MUTEX(PAGE_HASH_FUNC(vp, offset)); 3481 mutex_enter(phm); 3482 } 3483 3484 vphm = page_vnode_mutex(vp); 3485 mutex_enter(vphm); 3486 rc = page_do_hashin(pp, vp, offset); 3487 mutex_exit(vphm); 3488 if (hold == NULL) 3489 mutex_exit(phm); 3490 if (rc == 0) 3491 VM_STAT_ADD(hashin_already); 3492 return (rc); 3493 } 3494 3495 /* 3496 * Remove page ``pp'' from the hash and vp chains and remove vp association. 3497 * All mutexes must be held 3498 */ 3499 static void 3500 page_do_hashout(page_t *pp) 3501 { 3502 page_t **hpp; 3503 page_t *hp; 3504 vnode_t *vp = pp->p_vnode; 3505 3506 ASSERT(vp != NULL); 3507 ASSERT(MUTEX_HELD(page_vnode_mutex(vp))); 3508 3509 /* 3510 * First, take pp off of its hash chain. 3511 */ 3512 hpp = &page_hash[PAGE_HASH_FUNC(vp, pp->p_offset)]; 3513 3514 for (;;) { 3515 hp = *hpp; 3516 if (hp == pp) 3517 break; 3518 if (hp == NULL) { 3519 panic("page_do_hashout"); 3520 /*NOTREACHED*/ 3521 } 3522 hpp = &hp->p_hash; 3523 } 3524 *hpp = pp->p_hash; 3525 3526 /* 3527 * Now remove it from its associated vnode. 3528 */ 3529 if (vp->v_pages) 3530 page_vpsub(&vp->v_pages, pp); 3531 3532 pp->p_hash = NULL; 3533 page_clr_all_props(pp); 3534 PP_CLRSWAP(pp); 3535 pp->p_vnode = NULL; 3536 pp->p_offset = (u_offset_t)-1; 3537 pp->p_fsdata = 0; 3538 } 3539 3540 /* 3541 * Remove page ``pp'' from the hash and vp chains and remove vp association. 3542 * 3543 * When `phm' is non-NULL it contains the address of the mutex protecting the 3544 * hash list pp is on. It is not dropped. 3545 */ 3546 void 3547 page_hashout(page_t *pp, kmutex_t *phm) 3548 { 3549 vnode_t *vp; 3550 ulong_t index; 3551 kmutex_t *nphm; 3552 kmutex_t *vphm; 3553 kmutex_t *sep; 3554 3555 ASSERT(phm != NULL ? 
MUTEX_HELD(phm) : 1); 3556 ASSERT(pp->p_vnode != NULL); 3557 ASSERT((PAGE_EXCL(pp) && !page_iolock_assert(pp)) || panicstr); 3558 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(pp->p_vnode))); 3559 3560 vp = pp->p_vnode; 3561 3562 TRACE_2(TR_FAC_VM, TR_PAGE_HASHOUT, 3563 "page_hashout:pp %p vp %p", pp, vp); 3564 3565 /* Kernel probe */ 3566 TNF_PROBE_2(page_unmap, "vm pagefault", /* CSTYLED */, 3567 tnf_opaque, vnode, vp, 3568 tnf_offset, offset, pp->p_offset); 3569 3570 /* 3571 * 3572 */ 3573 VM_STAT_ADD(hashout_count); 3574 index = PAGE_HASH_FUNC(vp, pp->p_offset); 3575 if (phm == NULL) { 3576 VM_STAT_ADD(hashout_not_held); 3577 nphm = PAGE_HASH_MUTEX(index); 3578 mutex_enter(nphm); 3579 } 3580 ASSERT(phm ? phm == PAGE_HASH_MUTEX(index) : 1); 3581 3582 3583 /* 3584 * grab page vnode mutex and remove it... 3585 */ 3586 vphm = page_vnode_mutex(vp); 3587 mutex_enter(vphm); 3588 3589 page_do_hashout(pp); 3590 3591 mutex_exit(vphm); 3592 if (phm == NULL) 3593 mutex_exit(nphm); 3594 3595 /* 3596 * Wake up processes waiting for this page. The page's 3597 * identity has been changed, and is probably not the 3598 * desired page any longer. 3599 */ 3600 sep = page_se_mutex(pp); 3601 mutex_enter(sep); 3602 pp->p_selock &= ~SE_EWANTED; 3603 if (CV_HAS_WAITERS(&pp->p_cv)) 3604 cv_broadcast(&pp->p_cv); 3605 mutex_exit(sep); 3606 } 3607 3608 /* 3609 * Add the page to the front of a linked list of pages 3610 * using the p_next & p_prev pointers for the list. 3611 * The caller is responsible for protecting the list pointers. 3612 */ 3613 void 3614 page_add(page_t **ppp, page_t *pp) 3615 { 3616 ASSERT(PAGE_EXCL(pp) || (PAGE_SHARED(pp) && page_iolock_assert(pp))); 3617 3618 page_add_common(ppp, pp); 3619 } 3620 3621 3622 3623 /* 3624 * Common code for page_add() and mach_page_add() 3625 */ 3626 void 3627 page_add_common(page_t **ppp, page_t *pp) 3628 { 3629 if (*ppp == NULL) { 3630 pp->p_next = pp->p_prev = pp; 3631 } else { 3632 pp->p_next = *ppp; 3633 pp->p_prev = (*ppp)->p_prev; 3634 (*ppp)->p_prev = pp; 3635 pp->p_prev->p_next = pp; 3636 } 3637 *ppp = pp; 3638 } 3639 3640 3641 /* 3642 * Remove this page from a linked list of pages 3643 * using the p_next & p_prev pointers for the list. 3644 * 3645 * The caller is responsible for protecting the list pointers. 3646 */ 3647 void 3648 page_sub(page_t **ppp, page_t *pp) 3649 { 3650 ASSERT((PP_ISFREE(pp)) ? 1 : 3651 (PAGE_EXCL(pp)) || (PAGE_SHARED(pp) && page_iolock_assert(pp))); 3652 3653 if (*ppp == NULL || pp == NULL) { 3654 panic("page_sub: bad arg(s): pp %p, *ppp %p", 3655 (void *)pp, (void *)(*ppp)); 3656 /*NOTREACHED*/ 3657 } 3658 3659 page_sub_common(ppp, pp); 3660 } 3661 3662 3663 /* 3664 * Common code for page_sub() and mach_page_sub() 3665 */ 3666 void 3667 page_sub_common(page_t **ppp, page_t *pp) 3668 { 3669 if (*ppp == pp) 3670 *ppp = pp->p_next; /* go to next page */ 3671 3672 if (*ppp == pp) 3673 *ppp = NULL; /* page list is gone */ 3674 else { 3675 pp->p_prev->p_next = pp->p_next; 3676 pp->p_next->p_prev = pp->p_prev; 3677 } 3678 pp->p_prev = pp->p_next = pp; /* make pp a list of one */ 3679 } 3680 3681 3682 /* 3683 * Break page list cppp into two lists with npages in the first list. 3684 * The tail is returned in nppp. 
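 * For example, if "head" points at a circular list of four pages
 * A-B-C-D, then after
 *
 *	page_list_break(&head, &tail, 2);
 *
 * "head" is the two page list A-B and "tail" is the two page list C-D
 * ("head" and "tail" are illustrative names for page_t pointers).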
3685 */ 3686 void 3687 page_list_break(page_t **oppp, page_t **nppp, pgcnt_t npages) 3688 { 3689 page_t *s1pp = *oppp; 3690 page_t *s2pp; 3691 page_t *e1pp, *e2pp; 3692 long n = 0; 3693 3694 if (s1pp == NULL) { 3695 *nppp = NULL; 3696 return; 3697 } 3698 if (npages == 0) { 3699 *nppp = s1pp; 3700 *oppp = NULL; 3701 return; 3702 } 3703 for (n = 0, s2pp = *oppp; n < npages; n++) { 3704 s2pp = s2pp->p_next; 3705 } 3706 /* Fix head and tail of new lists */ 3707 e1pp = s2pp->p_prev; 3708 e2pp = s1pp->p_prev; 3709 s1pp->p_prev = e1pp; 3710 e1pp->p_next = s1pp; 3711 s2pp->p_prev = e2pp; 3712 e2pp->p_next = s2pp; 3713 3714 /* second list empty */ 3715 if (s2pp == s1pp) { 3716 *oppp = s1pp; 3717 *nppp = NULL; 3718 } else { 3719 *oppp = s1pp; 3720 *nppp = s2pp; 3721 } 3722 } 3723 3724 /* 3725 * Concatenate page list nppp onto the end of list ppp. 3726 */ 3727 void 3728 page_list_concat(page_t **ppp, page_t **nppp) 3729 { 3730 page_t *s1pp, *s2pp, *e1pp, *e2pp; 3731 3732 if (*nppp == NULL) { 3733 return; 3734 } 3735 if (*ppp == NULL) { 3736 *ppp = *nppp; 3737 return; 3738 } 3739 s1pp = *ppp; 3740 e1pp = s1pp->p_prev; 3741 s2pp = *nppp; 3742 e2pp = s2pp->p_prev; 3743 s1pp->p_prev = e2pp; 3744 e2pp->p_next = s1pp; 3745 e1pp->p_next = s2pp; 3746 s2pp->p_prev = e1pp; 3747 } 3748 3749 /* 3750 * return the next page in the page list 3751 */ 3752 page_t * 3753 page_list_next(page_t *pp) 3754 { 3755 return (pp->p_next); 3756 } 3757 3758 3759 /* 3760 * Add the page to the front of the linked list of pages 3761 * using p_vpnext/p_vpprev pointers for the list. 3762 * 3763 * The caller is responsible for protecting the lists. 3764 */ 3765 void 3766 page_vpadd(page_t **ppp, page_t *pp) 3767 { 3768 if (*ppp == NULL) { 3769 pp->p_vpnext = pp->p_vpprev = pp; 3770 } else { 3771 pp->p_vpnext = *ppp; 3772 pp->p_vpprev = (*ppp)->p_vpprev; 3773 (*ppp)->p_vpprev = pp; 3774 pp->p_vpprev->p_vpnext = pp; 3775 } 3776 *ppp = pp; 3777 } 3778 3779 /* 3780 * Remove this page from the linked list of pages 3781 * using p_vpnext/p_vpprev pointers for the list. 3782 * 3783 * The caller is responsible for protecting the lists. 3784 */ 3785 void 3786 page_vpsub(page_t **ppp, page_t *pp) 3787 { 3788 if (*ppp == NULL || pp == NULL) { 3789 panic("page_vpsub: bad arg(s): pp %p, *ppp %p", 3790 (void *)pp, (void *)(*ppp)); 3791 /*NOTREACHED*/ 3792 } 3793 3794 if (*ppp == pp) 3795 *ppp = pp->p_vpnext; /* go to next page */ 3796 3797 if (*ppp == pp) 3798 *ppp = NULL; /* page list is gone */ 3799 else { 3800 pp->p_vpprev->p_vpnext = pp->p_vpnext; 3801 pp->p_vpnext->p_vpprev = pp->p_vpprev; 3802 } 3803 pp->p_vpprev = pp->p_vpnext = pp; /* make pp a list of one */ 3804 } 3805 3806 /* 3807 * Lock a physical page into memory "long term". Used to support "lock 3808 * in memory" functions. Accepts the page to be locked, and a cow variable 3809 * to indicate whether a the lock will travel to the new page during 3810 * a potential copy-on-write. 3811 */ 3812 int 3813 page_pp_lock( 3814 page_t *pp, /* page to be locked */ 3815 int cow, /* cow lock */ 3816 int kernel) /* must succeed -- ignore checking */ 3817 { 3818 int r = 0; /* result -- assume failure */ 3819 3820 ASSERT(PAGE_LOCKED(pp)); 3821 3822 page_struct_lock(pp); 3823 /* 3824 * Acquire the "freemem_lock" for availrmem. 
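 * A cow lock charges one page against availrmem on every successful
 * call (bounded by pages_pp_maximum and PAGE_LOCK_MAXIMUM), while a
 * plain lock is charged only when it is the first hold on the page,
 * and not at all when `kernel' is set since the caller has already
 * done the availrmem accounting.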
3825 */ 3826 if (cow) { 3827 mutex_enter(&freemem_lock); 3828 if ((availrmem > pages_pp_maximum) && 3829 (pp->p_cowcnt < (ushort_t)PAGE_LOCK_MAXIMUM)) { 3830 availrmem--; 3831 pages_locked++; 3832 mutex_exit(&freemem_lock); 3833 r = 1; 3834 if (++pp->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) { 3835 cmn_err(CE_WARN, 3836 "COW lock limit reached on pfn 0x%lx", 3837 page_pptonum(pp)); 3838 } 3839 } else 3840 mutex_exit(&freemem_lock); 3841 } else { 3842 if (pp->p_lckcnt) { 3843 if (pp->p_lckcnt < (ushort_t)PAGE_LOCK_MAXIMUM) { 3844 r = 1; 3845 if (++pp->p_lckcnt == 3846 (ushort_t)PAGE_LOCK_MAXIMUM) { 3847 cmn_err(CE_WARN, "Page lock limit " 3848 "reached on pfn 0x%lx", 3849 page_pptonum(pp)); 3850 } 3851 } 3852 } else { 3853 if (kernel) { 3854 /* availrmem accounting done by caller */ 3855 ++pp->p_lckcnt; 3856 r = 1; 3857 } else { 3858 mutex_enter(&freemem_lock); 3859 if (availrmem > pages_pp_maximum) { 3860 availrmem--; 3861 pages_locked++; 3862 ++pp->p_lckcnt; 3863 r = 1; 3864 } 3865 mutex_exit(&freemem_lock); 3866 } 3867 } 3868 } 3869 page_struct_unlock(pp); 3870 return (r); 3871 } 3872 3873 /* 3874 * Decommit a lock on a physical page frame. Account for cow locks if 3875 * appropriate. 3876 */ 3877 void 3878 page_pp_unlock( 3879 page_t *pp, /* page to be unlocked */ 3880 int cow, /* expect cow lock */ 3881 int kernel) /* this was a kernel lock */ 3882 { 3883 ASSERT(PAGE_LOCKED(pp)); 3884 3885 page_struct_lock(pp); 3886 /* 3887 * Acquire the "freemem_lock" for availrmem. 3888 * If cowcnt or lcknt is already 0 do nothing; i.e., we 3889 * could be called to unlock even if nothing is locked. This could 3890 * happen if locked file pages were truncated (removing the lock) 3891 * and the file was grown again and new pages faulted in; the new 3892 * pages are unlocked but the segment still thinks they're locked. 3893 */ 3894 if (cow) { 3895 if (pp->p_cowcnt) { 3896 mutex_enter(&freemem_lock); 3897 pp->p_cowcnt--; 3898 availrmem++; 3899 pages_locked--; 3900 mutex_exit(&freemem_lock); 3901 } 3902 } else { 3903 if (pp->p_lckcnt && --pp->p_lckcnt == 0) { 3904 if (!kernel) { 3905 mutex_enter(&freemem_lock); 3906 availrmem++; 3907 pages_locked--; 3908 mutex_exit(&freemem_lock); 3909 } 3910 } 3911 } 3912 page_struct_unlock(pp); 3913 } 3914 3915 /* 3916 * This routine reserves availrmem for npages; 3917 * flags: KM_NOSLEEP or KM_SLEEP 3918 * returns 1 on success or 0 on failure 3919 */ 3920 int 3921 page_resv(pgcnt_t npages, uint_t flags) 3922 { 3923 mutex_enter(&freemem_lock); 3924 while (availrmem < tune.t_minarmem + npages) { 3925 if (flags & KM_NOSLEEP) { 3926 mutex_exit(&freemem_lock); 3927 return (0); 3928 } 3929 mutex_exit(&freemem_lock); 3930 page_needfree(npages); 3931 kmem_reap(); 3932 delay(hz >> 2); 3933 page_needfree(-(spgcnt_t)npages); 3934 mutex_enter(&freemem_lock); 3935 } 3936 availrmem -= npages; 3937 mutex_exit(&freemem_lock); 3938 return (1); 3939 } 3940 3941 /* 3942 * This routine unreserves availrmem for npages; 3943 */ 3944 void 3945 page_unresv(pgcnt_t npages) 3946 { 3947 mutex_enter(&freemem_lock); 3948 availrmem += npages; 3949 mutex_exit(&freemem_lock); 3950 } 3951 3952 /* 3953 * See Statement at the beginning of segvn_lockop() regarding 3954 * the way we handle cowcnts and lckcnts. 3955 * 3956 * Transfer cowcnt on 'opp' to cowcnt on 'npp' if the vpage 3957 * that breaks COW has PROT_WRITE. 3958 * 3959 * Note that, we may also break COW in case we are softlocking 3960 * on read access during physio; 3961 * in this softlock case, the vpage may not have PROT_WRITE. 
3962 * So, we need to transfer lckcnt on 'opp' to lckcnt on 'npp' 3963 * if the vpage doesn't have PROT_WRITE. 3964 * 3965 * This routine is never called if we are stealing a page 3966 * in anon_private. 3967 * 3968 * The caller subtracted from availrmem for read only mapping. 3969 * if lckcnt is 1 increment availrmem. 3970 */ 3971 void 3972 page_pp_useclaim( 3973 page_t *opp, /* original page frame losing lock */ 3974 page_t *npp, /* new page frame gaining lock */ 3975 uint_t write_perm) /* set if vpage has PROT_WRITE */ 3976 { 3977 int payback = 0; 3978 int nidx, oidx; 3979 3980 ASSERT(PAGE_LOCKED(opp)); 3981 ASSERT(PAGE_LOCKED(npp)); 3982 3983 /* 3984 * Since we have two pages we probably have two locks. We need to take 3985 * them in a defined order to avoid deadlocks. It's also possible they 3986 * both hash to the same lock in which case this is a non-issue. 3987 */ 3988 nidx = PAGE_LLOCK_HASH(PP_PAGEROOT(npp)); 3989 oidx = PAGE_LLOCK_HASH(PP_PAGEROOT(opp)); 3990 if (nidx < oidx) { 3991 page_struct_lock(npp); 3992 page_struct_lock(opp); 3993 } else if (oidx < nidx) { 3994 page_struct_lock(opp); 3995 page_struct_lock(npp); 3996 } else { /* The pages hash to the same lock */ 3997 page_struct_lock(npp); 3998 } 3999 4000 ASSERT(npp->p_cowcnt == 0); 4001 ASSERT(npp->p_lckcnt == 0); 4002 4003 /* Don't use claim if nothing is locked (see page_pp_unlock above) */ 4004 if ((write_perm && opp->p_cowcnt != 0) || 4005 (!write_perm && opp->p_lckcnt != 0)) { 4006 4007 if (write_perm) { 4008 npp->p_cowcnt++; 4009 ASSERT(opp->p_cowcnt != 0); 4010 opp->p_cowcnt--; 4011 } else { 4012 4013 ASSERT(opp->p_lckcnt != 0); 4014 4015 /* 4016 * We didn't need availrmem decremented if p_lckcnt on 4017 * original page is 1. Here, we are unlocking 4018 * read-only copy belonging to original page and 4019 * are locking a copy belonging to new page. 4020 */ 4021 if (opp->p_lckcnt == 1) 4022 payback = 1; 4023 4024 npp->p_lckcnt++; 4025 opp->p_lckcnt--; 4026 } 4027 } 4028 if (payback) { 4029 mutex_enter(&freemem_lock); 4030 availrmem++; 4031 pages_useclaim--; 4032 mutex_exit(&freemem_lock); 4033 } 4034 4035 if (nidx < oidx) { 4036 page_struct_unlock(opp); 4037 page_struct_unlock(npp); 4038 } else if (oidx < nidx) { 4039 page_struct_unlock(npp); 4040 page_struct_unlock(opp); 4041 } else { /* The pages hash to the same lock */ 4042 page_struct_unlock(npp); 4043 } 4044 } 4045 4046 /* 4047 * Simple claim adjust functions -- used to support changes in 4048 * claims due to changes in access permissions. Used by segvn_setprot(). 
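 * page_addclaim() converts one existing lckcnt hold on the page into
 * a cowcnt claim (charging availrmem unless that hold was the page's
 * only one), and page_subclaim() performs the reverse conversion.
 * Both return 0 when the resource limits would be exceeded, so a
 * segvn_setprot()-style caller looks roughly like this (a sketch;
 * "oprot" and "nprot" are illustrative names):
 *
 *	if ((nprot & PROT_WRITE) && !(oprot & PROT_WRITE))
 *		r = page_addclaim(pp);
 *	else if (!(nprot & PROT_WRITE) && (oprot & PROT_WRITE))
 *		r = page_subclaim(pp);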
4049 */ 4050 int 4051 page_addclaim(page_t *pp) 4052 { 4053 int r = 0; /* result */ 4054 4055 ASSERT(PAGE_LOCKED(pp)); 4056 4057 page_struct_lock(pp); 4058 ASSERT(pp->p_lckcnt != 0); 4059 4060 if (pp->p_lckcnt == 1) { 4061 if (pp->p_cowcnt < (ushort_t)PAGE_LOCK_MAXIMUM) { 4062 --pp->p_lckcnt; 4063 r = 1; 4064 if (++pp->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) { 4065 cmn_err(CE_WARN, 4066 "COW lock limit reached on pfn 0x%lx", 4067 page_pptonum(pp)); 4068 } 4069 } 4070 } else { 4071 mutex_enter(&freemem_lock); 4072 if ((availrmem > pages_pp_maximum) && 4073 (pp->p_cowcnt < (ushort_t)PAGE_LOCK_MAXIMUM)) { 4074 --availrmem; 4075 ++pages_claimed; 4076 mutex_exit(&freemem_lock); 4077 --pp->p_lckcnt; 4078 r = 1; 4079 if (++pp->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) { 4080 cmn_err(CE_WARN, 4081 "COW lock limit reached on pfn 0x%lx", 4082 page_pptonum(pp)); 4083 } 4084 } else 4085 mutex_exit(&freemem_lock); 4086 } 4087 page_struct_unlock(pp); 4088 return (r); 4089 } 4090 4091 int 4092 page_subclaim(page_t *pp) 4093 { 4094 int r = 0; 4095 4096 ASSERT(PAGE_LOCKED(pp)); 4097 4098 page_struct_lock(pp); 4099 ASSERT(pp->p_cowcnt != 0); 4100 4101 if (pp->p_lckcnt) { 4102 if (pp->p_lckcnt < (ushort_t)PAGE_LOCK_MAXIMUM) { 4103 r = 1; 4104 /* 4105 * for availrmem 4106 */ 4107 mutex_enter(&freemem_lock); 4108 availrmem++; 4109 pages_claimed--; 4110 mutex_exit(&freemem_lock); 4111 4112 pp->p_cowcnt--; 4113 4114 if (++pp->p_lckcnt == (ushort_t)PAGE_LOCK_MAXIMUM) { 4115 cmn_err(CE_WARN, 4116 "Page lock limit reached on pfn 0x%lx", 4117 page_pptonum(pp)); 4118 } 4119 } 4120 } else { 4121 r = 1; 4122 pp->p_cowcnt--; 4123 pp->p_lckcnt++; 4124 } 4125 page_struct_unlock(pp); 4126 return (r); 4127 } 4128 4129 /* 4130 * Variant of page_addclaim(), where ppa[] contains the pages of a single large 4131 * page. 4132 */ 4133 int 4134 page_addclaim_pages(page_t **ppa) 4135 { 4136 pgcnt_t lckpgs = 0, pg_idx; 4137 4138 VM_STAT_ADD(pagecnt.pc_addclaim_pages); 4139 4140 /* 4141 * Only need to take the page struct lock on the large page root. 4142 */ 4143 page_struct_lock(ppa[0]); 4144 for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) { 4145 4146 ASSERT(PAGE_LOCKED(ppa[pg_idx])); 4147 ASSERT(ppa[pg_idx]->p_lckcnt != 0); 4148 if (ppa[pg_idx]->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) { 4149 page_struct_unlock(ppa[0]); 4150 return (0); 4151 } 4152 if (ppa[pg_idx]->p_lckcnt > 1) 4153 lckpgs++; 4154 } 4155 4156 if (lckpgs != 0) { 4157 mutex_enter(&freemem_lock); 4158 if (availrmem >= pages_pp_maximum + lckpgs) { 4159 availrmem -= lckpgs; 4160 pages_claimed += lckpgs; 4161 } else { 4162 mutex_exit(&freemem_lock); 4163 page_struct_unlock(ppa[0]); 4164 return (0); 4165 } 4166 mutex_exit(&freemem_lock); 4167 } 4168 4169 for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) { 4170 ppa[pg_idx]->p_lckcnt--; 4171 ppa[pg_idx]->p_cowcnt++; 4172 } 4173 page_struct_unlock(ppa[0]); 4174 return (1); 4175 } 4176 4177 /* 4178 * Variant of page_subclaim(), where ppa[] contains the pages of a single large 4179 * page. 4180 */ 4181 int 4182 page_subclaim_pages(page_t **ppa) 4183 { 4184 pgcnt_t ulckpgs = 0, pg_idx; 4185 4186 VM_STAT_ADD(pagecnt.pc_subclaim_pages); 4187 4188 /* 4189 * Only need to take the page struct lock on the large page root. 
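 * (The page struct lock is hashed through the large page root,
 * compare the PP_PAGEROOT() usage in page_pp_useclaim() above, so
 * taking it on ppa[0] covers every constituent page examined here.)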
4190 */ 4191 page_struct_lock(ppa[0]); 4192 for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) { 4193 4194 ASSERT(PAGE_LOCKED(ppa[pg_idx])); 4195 ASSERT(ppa[pg_idx]->p_cowcnt != 0); 4196 if (ppa[pg_idx]->p_lckcnt == (ushort_t)PAGE_LOCK_MAXIMUM) { 4197 page_struct_unlock(ppa[0]); 4198 return (0); 4199 } 4200 if (ppa[pg_idx]->p_lckcnt != 0) 4201 ulckpgs++; 4202 } 4203 4204 if (ulckpgs != 0) { 4205 mutex_enter(&freemem_lock); 4206 availrmem += ulckpgs; 4207 pages_claimed -= ulckpgs; 4208 mutex_exit(&freemem_lock); 4209 } 4210 4211 for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) { 4212 ppa[pg_idx]->p_cowcnt--; 4213 ppa[pg_idx]->p_lckcnt++; 4214 4215 } 4216 page_struct_unlock(ppa[0]); 4217 return (1); 4218 } 4219 4220 page_t * 4221 page_numtopp(pfn_t pfnum, se_t se) 4222 { 4223 page_t *pp; 4224 4225 retry: 4226 pp = page_numtopp_nolock(pfnum); 4227 if (pp == NULL) { 4228 return ((page_t *)NULL); 4229 } 4230 4231 /* 4232 * Acquire the appropriate lock on the page. 4233 */ 4234 while (!page_lock(pp, se, (kmutex_t *)NULL, P_RECLAIM)) { 4235 if (page_pptonum(pp) != pfnum) 4236 goto retry; 4237 continue; 4238 } 4239 4240 if (page_pptonum(pp) != pfnum) { 4241 page_unlock(pp); 4242 goto retry; 4243 } 4244 4245 return (pp); 4246 } 4247 4248 page_t * 4249 page_numtopp_noreclaim(pfn_t pfnum, se_t se) 4250 { 4251 page_t *pp; 4252 4253 retry: 4254 pp = page_numtopp_nolock(pfnum); 4255 if (pp == NULL) { 4256 return ((page_t *)NULL); 4257 } 4258 4259 /* 4260 * Acquire the appropriate lock on the page. 4261 */ 4262 while (!page_lock(pp, se, (kmutex_t *)NULL, P_NO_RECLAIM)) { 4263 if (page_pptonum(pp) != pfnum) 4264 goto retry; 4265 continue; 4266 } 4267 4268 if (page_pptonum(pp) != pfnum) { 4269 page_unlock(pp); 4270 goto retry; 4271 } 4272 4273 return (pp); 4274 } 4275 4276 /* 4277 * This routine is like page_numtopp, but will only return page structs 4278 * for pages which are ok for loading into hardware using the page struct. 4279 */ 4280 page_t * 4281 page_numtopp_nowait(pfn_t pfnum, se_t se) 4282 { 4283 page_t *pp; 4284 4285 retry: 4286 pp = page_numtopp_nolock(pfnum); 4287 if (pp == NULL) { 4288 return ((page_t *)NULL); 4289 } 4290 4291 /* 4292 * Try to acquire the appropriate lock on the page. 4293 */ 4294 if (PP_ISFREE(pp)) 4295 pp = NULL; 4296 else { 4297 if (!page_trylock(pp, se)) 4298 pp = NULL; 4299 else { 4300 if (page_pptonum(pp) != pfnum) { 4301 page_unlock(pp); 4302 goto retry; 4303 } 4304 if (PP_ISFREE(pp)) { 4305 page_unlock(pp); 4306 pp = NULL; 4307 } 4308 } 4309 } 4310 return (pp); 4311 } 4312 4313 #define SYNC_PROGRESS_NPAGES 1000 4314 4315 /* 4316 * Returns a count of dirty pages that are in the process 4317 * of being written out. If 'cleanit' is set, try to push the page. 4318 */ 4319 pgcnt_t 4320 page_busy(int cleanit) 4321 { 4322 page_t *page0 = page_first(); 4323 page_t *pp = page0; 4324 pgcnt_t nppbusy = 0; 4325 int counter = 0; 4326 u_offset_t off; 4327 4328 do { 4329 vnode_t *vp = pp->p_vnode; 4330 4331 /* 4332 * Reset the sync timeout. The page list is very long 4333 * on large memory systems. 4334 */ 4335 if (++counter > SYNC_PROGRESS_NPAGES) { 4336 counter = 0; 4337 vfs_syncprogress(); 4338 } 4339 4340 /* 4341 * A page is a candidate for syncing if it is: 4342 * 4343 * (a) On neither the freelist nor the cachelist 4344 * (b) Hashed onto a vnode 4345 * (c) Not a kernel page 4346 * (d) Dirty 4347 * (e) Not part of a swapfile 4348 * (f) a page which belongs to a real vnode; eg has a non-null 4349 * v_vfsp pointer. 
4350 * (g) Backed by a filesystem which doesn't have a 4351 * stubbed-out sync operation 4352 */ 4353 if (!PP_ISFREE(pp) && vp != NULL && !VN_ISKAS(vp) && 4354 hat_ismod(pp) && !IS_SWAPVP(vp) && vp->v_vfsp != NULL && 4355 vfs_can_sync(vp->v_vfsp)) { 4356 nppbusy++; 4357 4358 if (!cleanit) 4359 continue; 4360 if (!page_trylock(pp, SE_EXCL)) 4361 continue; 4362 4363 if (PP_ISFREE(pp) || vp == NULL || IS_SWAPVP(vp) || 4364 pp->p_lckcnt != 0 || pp->p_cowcnt != 0 || 4365 !(hat_pagesync(pp, 4366 HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD) & P_MOD)) { 4367 page_unlock(pp); 4368 continue; 4369 } 4370 off = pp->p_offset; 4371 VN_HOLD(vp); 4372 page_unlock(pp); 4373 (void) VOP_PUTPAGE(vp, off, PAGESIZE, 4374 B_ASYNC | B_FREE, kcred, NULL); 4375 VN_RELE(vp); 4376 } 4377 } while ((pp = page_next(pp)) != page0); 4378 4379 vfs_syncprogress(); 4380 return (nppbusy); 4381 } 4382 4383 void page_invalidate_pages(void); 4384 4385 /* 4386 * callback handler to vm sub-system 4387 * 4388 * callers make sure no recursive entries to this func. 4389 */ 4390 /*ARGSUSED*/ 4391 boolean_t 4392 callb_vm_cpr(void *arg, int code) 4393 { 4394 if (code == CB_CODE_CPR_CHKPT) 4395 page_invalidate_pages(); 4396 return (B_TRUE); 4397 } 4398 4399 /* 4400 * Invalidate all pages of the system. 4401 * It shouldn't be called until all user page activities are all stopped. 4402 */ 4403 void 4404 page_invalidate_pages() 4405 { 4406 page_t *pp; 4407 page_t *page0; 4408 pgcnt_t nbusypages; 4409 int retry = 0; 4410 const int MAXRETRIES = 4; 4411 top: 4412 /* 4413 * Flush dirty pages and destroy the clean ones. 4414 */ 4415 nbusypages = 0; 4416 4417 pp = page0 = page_first(); 4418 do { 4419 struct vnode *vp; 4420 u_offset_t offset; 4421 int mod; 4422 4423 /* 4424 * skip the page if it has no vnode or the page associated 4425 * with the kernel vnode or prom allocated kernel mem. 4426 */ 4427 if ((vp = pp->p_vnode) == NULL || VN_ISKAS(vp)) 4428 continue; 4429 4430 /* 4431 * skip the page which is already free invalidated. 4432 */ 4433 if (PP_ISFREE(pp) && PP_ISAGED(pp)) 4434 continue; 4435 4436 /* 4437 * skip pages that are already locked or can't be "exclusively" 4438 * locked or are already free. After we lock the page, check 4439 * the free and age bits again to be sure it's not destroyed 4440 * yet. 4441 * To achieve max. parallelization, we use page_trylock instead 4442 * of page_lock so that we don't get block on individual pages 4443 * while we have thousands of other pages to process. 4444 */ 4445 if (!page_trylock(pp, SE_EXCL)) { 4446 nbusypages++; 4447 continue; 4448 } else if (PP_ISFREE(pp)) { 4449 if (!PP_ISAGED(pp)) { 4450 page_destroy_free(pp); 4451 } else { 4452 page_unlock(pp); 4453 } 4454 continue; 4455 } 4456 /* 4457 * Is this page involved in some I/O? shared? 4458 * 4459 * The page_struct_lock need not be acquired to 4460 * examine these fields since the page has an 4461 * "exclusive" lock. 4462 */ 4463 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { 4464 page_unlock(pp); 4465 continue; 4466 } 4467 4468 if (vp->v_type == VCHR) { 4469 panic("vp->v_type == VCHR"); 4470 /*NOTREACHED*/ 4471 } 4472 4473 if (!page_try_demote_pages(pp)) { 4474 page_unlock(pp); 4475 continue; 4476 } 4477 4478 /* 4479 * Check the modified bit. Leave the bits alone in hardware 4480 * (they will be modified if we do the putpage). 
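 * hat_pagesync() with HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD returns the
 * accumulated ref/mod state without clearing it in the hardware and stops
 * scanning mappings as soon as a modified one is found, which is all we
 * need to decide between the putpage and VN_DISPOSE paths below.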
4481 */ 4482 mod = (hat_pagesync(pp, HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD) 4483 & P_MOD); 4484 if (mod) { 4485 offset = pp->p_offset; 4486 /* 4487 * Hold the vnode before releasing the page lock 4488 * to prevent it from being freed and re-used by 4489 * some other thread. 4490 */ 4491 VN_HOLD(vp); 4492 page_unlock(pp); 4493 /* 4494 * No error return is checked here. Callers such as 4495 * cpr deals with the dirty pages at the dump time 4496 * if this putpage fails. 4497 */ 4498 (void) VOP_PUTPAGE(vp, offset, PAGESIZE, B_INVAL, 4499 kcred, NULL); 4500 VN_RELE(vp); 4501 } else { 4502 /*LINTED: constant in conditional context*/ 4503 VN_DISPOSE(pp, B_INVAL, 0, kcred); 4504 } 4505 } while ((pp = page_next(pp)) != page0); 4506 if (nbusypages && retry++ < MAXRETRIES) { 4507 delay(1); 4508 goto top; 4509 } 4510 } 4511 4512 /* 4513 * Replace the page "old" with the page "new" on the page hash and vnode lists 4514 * 4515 * the replacement must be done in place, ie the equivalent sequence: 4516 * 4517 * vp = old->p_vnode; 4518 * off = old->p_offset; 4519 * page_do_hashout(old) 4520 * page_do_hashin(new, vp, off) 4521 * 4522 * doesn't work, since 4523 * 1) if old is the only page on the vnode, the v_pages list has a window 4524 * where it looks empty. This will break file system assumptions. 4525 * and 4526 * 2) pvn_vplist_dirty() can't deal with pages moving on the v_pages list. 4527 */ 4528 static void 4529 page_do_relocate_hash(page_t *new, page_t *old) 4530 { 4531 page_t **hash_list; 4532 vnode_t *vp = old->p_vnode; 4533 kmutex_t *sep; 4534 4535 ASSERT(PAGE_EXCL(old)); 4536 ASSERT(PAGE_EXCL(new)); 4537 ASSERT(vp != NULL); 4538 ASSERT(MUTEX_HELD(page_vnode_mutex(vp))); 4539 ASSERT(MUTEX_HELD(PAGE_HASH_MUTEX(PAGE_HASH_FUNC(vp, old->p_offset)))); 4540 4541 /* 4542 * First find old page on the page hash list 4543 */ 4544 hash_list = &page_hash[PAGE_HASH_FUNC(vp, old->p_offset)]; 4545 4546 for (;;) { 4547 if (*hash_list == old) 4548 break; 4549 if (*hash_list == NULL) { 4550 panic("page_do_hashout"); 4551 /*NOTREACHED*/ 4552 } 4553 hash_list = &(*hash_list)->p_hash; 4554 } 4555 4556 /* 4557 * update new and replace old with new on the page hash list 4558 */ 4559 new->p_vnode = old->p_vnode; 4560 new->p_offset = old->p_offset; 4561 new->p_hash = old->p_hash; 4562 *hash_list = new; 4563 4564 if ((new->p_vnode->v_flag & VISSWAP) != 0) 4565 PP_SETSWAP(new); 4566 4567 /* 4568 * replace old with new on the vnode's page list 4569 */ 4570 if (old->p_vpnext == old) { 4571 new->p_vpnext = new; 4572 new->p_vpprev = new; 4573 } else { 4574 new->p_vpnext = old->p_vpnext; 4575 new->p_vpprev = old->p_vpprev; 4576 new->p_vpnext->p_vpprev = new; 4577 new->p_vpprev->p_vpnext = new; 4578 } 4579 if (vp->v_pages == old) 4580 vp->v_pages = new; 4581 4582 /* 4583 * clear out the old page 4584 */ 4585 old->p_hash = NULL; 4586 old->p_vpnext = NULL; 4587 old->p_vpprev = NULL; 4588 old->p_vnode = NULL; 4589 PP_CLRSWAP(old); 4590 old->p_offset = (u_offset_t)-1; 4591 page_clr_all_props(old); 4592 4593 /* 4594 * Wake up processes waiting for this page. The page's 4595 * identity has been changed, and is probably not the 4596 * desired page any longer. 4597 */ 4598 sep = page_se_mutex(old); 4599 mutex_enter(sep); 4600 old->p_selock &= ~SE_EWANTED; 4601 if (CV_HAS_WAITERS(&old->p_cv)) 4602 cv_broadcast(&old->p_cv); 4603 mutex_exit(sep); 4604 } 4605 4606 /* 4607 * This function moves the identity of page "pp_old" to page "pp_new". 4608 * Both pages must be locked on entry. 
"pp_new" is free, has no identity, 4609 * and need not be hashed out from anywhere. 4610 */ 4611 void 4612 page_relocate_hash(page_t *pp_new, page_t *pp_old) 4613 { 4614 vnode_t *vp = pp_old->p_vnode; 4615 u_offset_t off = pp_old->p_offset; 4616 kmutex_t *phm, *vphm; 4617 4618 /* 4619 * Rehash two pages 4620 */ 4621 ASSERT(PAGE_EXCL(pp_old)); 4622 ASSERT(PAGE_EXCL(pp_new)); 4623 ASSERT(vp != NULL); 4624 ASSERT(pp_new->p_vnode == NULL); 4625 4626 /* 4627 * hashout then hashin while holding the mutexes 4628 */ 4629 phm = PAGE_HASH_MUTEX(PAGE_HASH_FUNC(vp, off)); 4630 mutex_enter(phm); 4631 vphm = page_vnode_mutex(vp); 4632 mutex_enter(vphm); 4633 4634 page_do_relocate_hash(pp_new, pp_old); 4635 4636 /* The following comment preserved from page_flip(). */ 4637 pp_new->p_fsdata = pp_old->p_fsdata; 4638 pp_old->p_fsdata = 0; 4639 mutex_exit(vphm); 4640 mutex_exit(phm); 4641 4642 /* 4643 * The page_struct_lock need not be acquired for lckcnt and 4644 * cowcnt since the page has an "exclusive" lock. 4645 */ 4646 ASSERT(pp_new->p_lckcnt == 0); 4647 ASSERT(pp_new->p_cowcnt == 0); 4648 pp_new->p_lckcnt = pp_old->p_lckcnt; 4649 pp_new->p_cowcnt = pp_old->p_cowcnt; 4650 pp_old->p_lckcnt = pp_old->p_cowcnt = 0; 4651 4652 } 4653 4654 /* 4655 * Helper routine used to lock all remaining members of a 4656 * large page. The caller is responsible for passing in a locked 4657 * pp. If pp is a large page, then it succeeds in locking all the 4658 * remaining constituent pages or it returns with only the 4659 * original page locked. 4660 * 4661 * Returns 1 on success, 0 on failure. 4662 * 4663 * If success is returned this routine guarantees p_szc for all constituent 4664 * pages of a large page pp belongs to can't change. To achieve this we 4665 * recheck szc of pp after locking all constituent pages and retry if szc 4666 * changed (it could only decrease). Since hat_page_demote() needs an EXCL 4667 * lock on one of constituent pages it can't be running after all constituent 4668 * pages are locked. hat_page_demote() with a lock on a constituent page 4669 * outside of this large page (i.e. pp belonged to a larger large page) is 4670 * already done with all constituent pages of pp since the root's p_szc is 4671 * changed last. Therefore no need to synchronize with hat_page_demote() that 4672 * locked a constituent page outside of pp's current large page. 
4673 */ 4674 #ifdef DEBUG 4675 uint32_t gpg_trylock_mtbf = 0; 4676 #endif 4677 4678 int 4679 group_page_trylock(page_t *pp, se_t se) 4680 { 4681 page_t *tpp; 4682 pgcnt_t npgs, i, j; 4683 uint_t pszc = pp->p_szc; 4684 4685 #ifdef DEBUG 4686 if (gpg_trylock_mtbf && !(gethrtime() % gpg_trylock_mtbf)) { 4687 return (0); 4688 } 4689 #endif 4690 4691 if (pp != PP_GROUPLEADER(pp, pszc)) { 4692 return (0); 4693 } 4694 4695 retry: 4696 ASSERT(PAGE_LOCKED_SE(pp, se)); 4697 ASSERT(!PP_ISFREE(pp)); 4698 if (pszc == 0) { 4699 return (1); 4700 } 4701 npgs = page_get_pagecnt(pszc); 4702 tpp = pp + 1; 4703 for (i = 1; i < npgs; i++, tpp++) { 4704 if (!page_trylock(tpp, se)) { 4705 tpp = pp + 1; 4706 for (j = 1; j < i; j++, tpp++) { 4707 page_unlock(tpp); 4708 } 4709 return (0); 4710 } 4711 } 4712 if (pp->p_szc != pszc) { 4713 ASSERT(pp->p_szc < pszc); 4714 ASSERT(pp->p_vnode != NULL && !PP_ISKAS(pp) && 4715 !IS_SWAPFSVP(pp->p_vnode)); 4716 tpp = pp + 1; 4717 for (i = 1; i < npgs; i++, tpp++) { 4718 page_unlock(tpp); 4719 } 4720 pszc = pp->p_szc; 4721 goto retry; 4722 } 4723 return (1); 4724 } 4725 4726 void 4727 group_page_unlock(page_t *pp) 4728 { 4729 page_t *tpp; 4730 pgcnt_t npgs, i; 4731 4732 ASSERT(PAGE_LOCKED(pp)); 4733 ASSERT(!PP_ISFREE(pp)); 4734 ASSERT(pp == PP_PAGEROOT(pp)); 4735 npgs = page_get_pagecnt(pp->p_szc); 4736 for (i = 1, tpp = pp + 1; i < npgs; i++, tpp++) { 4737 page_unlock(tpp); 4738 } 4739 } 4740 4741 /* 4742 * returns 4743 * 0 : on success and *nrelocp is number of relocated PAGESIZE pages 4744 * ERANGE : this is not a base page 4745 * EBUSY : failure to get locks on the page/pages 4746 * ENOMEM : failure to obtain replacement pages 4747 * EAGAIN : OBP has not yet completed its boot-time handoff to the kernel 4748 * EIO : An error occurred while trying to copy the page data 4749 * 4750 * Return with all constituent members of target and replacement 4751 * SE_EXCL locked. It is the callers responsibility to drop the 4752 * locks. 4753 */ 4754 int 4755 do_page_relocate( 4756 page_t **target, 4757 page_t **replacement, 4758 int grouplock, 4759 spgcnt_t *nrelocp, 4760 lgrp_t *lgrp) 4761 { 4762 page_t *first_repl; 4763 page_t *repl; 4764 page_t *targ; 4765 page_t *pl = NULL; 4766 uint_t ppattr; 4767 pfn_t pfn, repl_pfn; 4768 uint_t szc; 4769 spgcnt_t npgs, i; 4770 int repl_contig = 0; 4771 uint_t flags = 0; 4772 spgcnt_t dofree = 0; 4773 4774 *nrelocp = 0; 4775 4776 #if defined(__sparc) 4777 /* 4778 * We need to wait till OBP has completed 4779 * its boot-time handoff of its resources to the kernel 4780 * before we allow page relocation 4781 */ 4782 if (page_relocate_ready == 0) { 4783 return (EAGAIN); 4784 } 4785 #endif 4786 4787 /* 4788 * If this is not a base page, 4789 * just return with 0x0 pages relocated. 4790 */ 4791 targ = *target; 4792 ASSERT(PAGE_EXCL(targ)); 4793 ASSERT(!PP_ISFREE(targ)); 4794 szc = targ->p_szc; 4795 ASSERT(szc < mmu_page_sizes); 4796 VM_STAT_ADD(vmm_vmstats.ppr_reloc[szc]); 4797 pfn = targ->p_pagenum; 4798 if (pfn != PFN_BASE(pfn, szc)) { 4799 VM_STAT_ADD(vmm_vmstats.ppr_relocnoroot[szc]); 4800 return (ERANGE); 4801 } 4802 4803 if ((repl = *replacement) != NULL && repl->p_szc >= szc) { 4804 repl_pfn = repl->p_pagenum; 4805 if (repl_pfn != PFN_BASE(repl_pfn, szc)) { 4806 VM_STAT_ADD(vmm_vmstats.ppr_reloc_replnoroot[szc]); 4807 return (ERANGE); 4808 } 4809 repl_contig = 1; 4810 } 4811 4812 /* 4813 * We must lock all members of this large page or we cannot 4814 * relocate any part of it. 
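 * When grouplock is zero the caller is expected to already hold SE_EXCL
 * on every constituent page (the per-constituent PAGE_EXCL asserts below
 * depend on it); otherwise we take the remaining locks here.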
4815 */ 4816 if (grouplock != 0 && !group_page_trylock(targ, SE_EXCL)) { 4817 VM_STAT_ADD(vmm_vmstats.ppr_relocnolock[targ->p_szc]); 4818 return (EBUSY); 4819 } 4820 4821 /* 4822 * reread szc it could have been decreased before 4823 * group_page_trylock() was done. 4824 */ 4825 szc = targ->p_szc; 4826 ASSERT(szc < mmu_page_sizes); 4827 VM_STAT_ADD(vmm_vmstats.ppr_reloc[szc]); 4828 ASSERT(pfn == PFN_BASE(pfn, szc)); 4829 4830 npgs = page_get_pagecnt(targ->p_szc); 4831 4832 if (repl == NULL) { 4833 dofree = npgs; /* Size of target page in MMU pages */ 4834 if (!page_create_wait(dofree, 0)) { 4835 if (grouplock != 0) { 4836 group_page_unlock(targ); 4837 } 4838 VM_STAT_ADD(vmm_vmstats.ppr_relocnomem[szc]); 4839 return (ENOMEM); 4840 } 4841 4842 /* 4843 * seg kmem pages require that the target and replacement 4844 * page be the same pagesize. 4845 */ 4846 flags = (VN_ISKAS(targ->p_vnode)) ? PGR_SAMESZC : 0; 4847 repl = page_get_replacement_page(targ, lgrp, flags); 4848 if (repl == NULL) { 4849 if (grouplock != 0) { 4850 group_page_unlock(targ); 4851 } 4852 page_create_putback(dofree); 4853 VM_STAT_ADD(vmm_vmstats.ppr_relocnomem[szc]); 4854 return (ENOMEM); 4855 } 4856 } 4857 #ifdef DEBUG 4858 else { 4859 ASSERT(PAGE_LOCKED(repl)); 4860 } 4861 #endif /* DEBUG */ 4862 4863 #if defined(__sparc) 4864 /* 4865 * Let hat_page_relocate() complete the relocation if it's kernel page 4866 */ 4867 if (VN_ISKAS(targ->p_vnode)) { 4868 *replacement = repl; 4869 if (hat_page_relocate(target, replacement, nrelocp) != 0) { 4870 if (grouplock != 0) { 4871 group_page_unlock(targ); 4872 } 4873 if (dofree) { 4874 *replacement = NULL; 4875 page_free_replacement_page(repl); 4876 page_create_putback(dofree); 4877 } 4878 VM_STAT_ADD(vmm_vmstats.ppr_krelocfail[szc]); 4879 return (EAGAIN); 4880 } 4881 VM_STAT_ADD(vmm_vmstats.ppr_relocok[szc]); 4882 return (0); 4883 } 4884 #else 4885 #if defined(lint) 4886 dofree = dofree; 4887 #endif 4888 #endif 4889 4890 first_repl = repl; 4891 4892 for (i = 0; i < npgs; i++) { 4893 ASSERT(PAGE_EXCL(targ)); 4894 ASSERT(targ->p_slckcnt == 0); 4895 ASSERT(repl->p_slckcnt == 0); 4896 4897 (void) hat_pageunload(targ, HAT_FORCE_PGUNLOAD); 4898 4899 ASSERT(hat_page_getshare(targ) == 0); 4900 ASSERT(!PP_ISFREE(targ)); 4901 ASSERT(targ->p_pagenum == (pfn + i)); 4902 ASSERT(repl_contig == 0 || 4903 repl->p_pagenum == (repl_pfn + i)); 4904 4905 /* 4906 * Copy the page contents and attributes then 4907 * relocate the page in the page hash. 4908 */ 4909 if (ppcopy(targ, repl) == 0) { 4910 targ = *target; 4911 repl = first_repl; 4912 VM_STAT_ADD(vmm_vmstats.ppr_copyfail); 4913 if (grouplock != 0) { 4914 group_page_unlock(targ); 4915 } 4916 if (dofree) { 4917 *replacement = NULL; 4918 page_free_replacement_page(repl); 4919 page_create_putback(dofree); 4920 } 4921 return (EIO); 4922 } 4923 4924 targ++; 4925 if (repl_contig != 0) { 4926 repl++; 4927 } else { 4928 repl = repl->p_next; 4929 } 4930 } 4931 4932 repl = first_repl; 4933 targ = *target; 4934 4935 for (i = 0; i < npgs; i++) { 4936 ppattr = hat_page_getattr(targ, (P_MOD | P_REF | P_RO)); 4937 page_clr_all_props(repl); 4938 page_set_props(repl, ppattr); 4939 page_relocate_hash(repl, targ); 4940 4941 ASSERT(hat_page_getshare(targ) == 0); 4942 ASSERT(hat_page_getshare(repl) == 0); 4943 /* 4944 * Now clear the props on targ, after the 4945 * page_relocate_hash(), they no longer 4946 * have any meaning. 
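 * The old target pages are collected onto the local pl list via
 * page_list_concat() so that the caller gets them back as a single
 * linked list through *target once the loop completes.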
4947 */ 4948 page_clr_all_props(targ); 4949 ASSERT(targ->p_next == targ); 4950 ASSERT(targ->p_prev == targ); 4951 page_list_concat(&pl, &targ); 4952 4953 targ++; 4954 if (repl_contig != 0) { 4955 repl++; 4956 } else { 4957 repl = repl->p_next; 4958 } 4959 } 4960 /* assert that we have come full circle with repl */ 4961 ASSERT(repl_contig == 1 || first_repl == repl); 4962 4963 *target = pl; 4964 if (*replacement == NULL) { 4965 ASSERT(first_repl == repl); 4966 *replacement = repl; 4967 } 4968 VM_STAT_ADD(vmm_vmstats.ppr_relocok[szc]); 4969 *nrelocp = npgs; 4970 return (0); 4971 } 4972 /* 4973 * On success returns 0 and *nrelocp the number of PAGESIZE pages relocated. 4974 */ 4975 int 4976 page_relocate( 4977 page_t **target, 4978 page_t **replacement, 4979 int grouplock, 4980 int freetarget, 4981 spgcnt_t *nrelocp, 4982 lgrp_t *lgrp) 4983 { 4984 spgcnt_t ret; 4985 4986 /* do_page_relocate returns 0 on success or errno value */ 4987 ret = do_page_relocate(target, replacement, grouplock, nrelocp, lgrp); 4988 4989 if (ret != 0 || freetarget == 0) { 4990 return (ret); 4991 } 4992 if (*nrelocp == 1) { 4993 ASSERT(*target != NULL); 4994 page_free(*target, 1); 4995 } else { 4996 page_t *tpp = *target; 4997 uint_t szc = tpp->p_szc; 4998 pgcnt_t npgs = page_get_pagecnt(szc); 4999 ASSERT(npgs > 1); 5000 ASSERT(szc != 0); 5001 do { 5002 ASSERT(PAGE_EXCL(tpp)); 5003 ASSERT(!hat_page_is_mapped(tpp)); 5004 ASSERT(tpp->p_szc == szc); 5005 PP_SETFREE(tpp); 5006 PP_SETAGED(tpp); 5007 npgs--; 5008 } while ((tpp = tpp->p_next) != *target); 5009 ASSERT(npgs == 0); 5010 page_list_add_pages(*target, 0); 5011 npgs = page_get_pagecnt(szc); 5012 page_create_putback(npgs); 5013 } 5014 return (ret); 5015 } 5016 5017 /* 5018 * it is up to the caller to deal with pcf accounting. 5019 */ 5020 void 5021 page_free_replacement_page(page_t *pplist) 5022 { 5023 page_t *pp; 5024 5025 while (pplist != NULL) { 5026 /* 5027 * pp_targ is a linked list. 5028 */ 5029 pp = pplist; 5030 if (pp->p_szc == 0) { 5031 page_sub(&pplist, pp); 5032 page_clr_all_props(pp); 5033 PP_SETFREE(pp); 5034 PP_SETAGED(pp); 5035 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL); 5036 page_unlock(pp); 5037 VM_STAT_ADD(pagecnt.pc_free_replacement_page[0]); 5038 } else { 5039 spgcnt_t curnpgs = page_get_pagecnt(pp->p_szc); 5040 page_t *tpp; 5041 page_list_break(&pp, &pplist, curnpgs); 5042 tpp = pp; 5043 do { 5044 ASSERT(PAGE_EXCL(tpp)); 5045 ASSERT(!hat_page_is_mapped(tpp)); 5046 page_clr_all_props(tpp); 5047 PP_SETFREE(tpp); 5048 PP_SETAGED(tpp); 5049 } while ((tpp = tpp->p_next) != pp); 5050 page_list_add_pages(pp, 0); 5051 VM_STAT_ADD(pagecnt.pc_free_replacement_page[1]); 5052 } 5053 } 5054 } 5055 5056 /* 5057 * Relocate target to non-relocatable replacement page. 
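 * The replacement allocation below is retried until a PG_NORELOC page is
 * obtained; kcage_cageout_wakeup() nudges the cageout thread to free up
 * cage memory whenever page_get_replacement_page() comes back empty.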
5058 */ 5059 int 5060 page_relocate_cage(page_t **target, page_t **replacement) 5061 { 5062 page_t *tpp, *rpp; 5063 spgcnt_t pgcnt, npgs; 5064 int result; 5065 5066 tpp = *target; 5067 5068 ASSERT(PAGE_EXCL(tpp)); 5069 ASSERT(tpp->p_szc == 0); 5070 5071 pgcnt = btop(page_get_pagesize(tpp->p_szc)); 5072 5073 do { 5074 (void) page_create_wait(pgcnt, PG_WAIT | PG_NORELOC); 5075 rpp = page_get_replacement_page(tpp, NULL, PGR_NORELOC); 5076 if (rpp == NULL) { 5077 page_create_putback(pgcnt); 5078 kcage_cageout_wakeup(); 5079 } 5080 } while (rpp == NULL); 5081 5082 ASSERT(PP_ISNORELOC(rpp)); 5083 5084 result = page_relocate(&tpp, &rpp, 0, 1, &npgs, NULL); 5085 5086 if (result == 0) { 5087 *replacement = rpp; 5088 if (pgcnt != npgs) 5089 panic("page_relocate_cage: partial relocation"); 5090 } 5091 5092 return (result); 5093 } 5094 5095 /* 5096 * Release the page lock on a page, place on cachelist 5097 * tail if no longer mapped. Caller can let us know if 5098 * the page is known to be clean. 5099 */ 5100 int 5101 page_release(page_t *pp, int checkmod) 5102 { 5103 int status; 5104 5105 ASSERT(PAGE_LOCKED(pp) && !PP_ISFREE(pp) && 5106 (pp->p_vnode != NULL)); 5107 5108 if (!hat_page_is_mapped(pp) && !IS_SWAPVP(pp->p_vnode) && 5109 ((PAGE_SHARED(pp) && page_tryupgrade(pp)) || PAGE_EXCL(pp)) && 5110 pp->p_lckcnt == 0 && pp->p_cowcnt == 0 && 5111 !hat_page_is_mapped(pp)) { 5112 5113 /* 5114 * If page is modified, unlock it 5115 * 5116 * (p_nrm & P_MOD) bit has the latest stuff because: 5117 * (1) We found that this page doesn't have any mappings 5118 * _after_ holding SE_EXCL and 5119 * (2) We didn't drop SE_EXCL lock after the check in (1) 5120 */ 5121 if (checkmod && hat_ismod(pp)) { 5122 page_unlock(pp); 5123 status = PGREL_MOD; 5124 } else { 5125 /*LINTED: constant in conditional context*/ 5126 VN_DISPOSE(pp, B_FREE, 0, kcred); 5127 status = PGREL_CLEAN; 5128 } 5129 } else { 5130 page_unlock(pp); 5131 status = PGREL_NOTREL; 5132 } 5133 return (status); 5134 } 5135 5136 /* 5137 * Given a constituent page, try to demote the large page on the freelist. 5138 * 5139 * Returns nonzero if the page could be demoted successfully. Returns with 5140 * the constituent page still locked. 5141 */ 5142 int 5143 page_try_demote_free_pages(page_t *pp) 5144 { 5145 page_t *rootpp = pp; 5146 pfn_t pfn = page_pptonum(pp); 5147 spgcnt_t npgs; 5148 uint_t szc = pp->p_szc; 5149 5150 ASSERT(PP_ISFREE(pp)); 5151 ASSERT(PAGE_EXCL(pp)); 5152 5153 /* 5154 * Adjust rootpp and lock it, if `pp' is not the base 5155 * constituent page. 5156 */ 5157 npgs = page_get_pagecnt(pp->p_szc); 5158 if (npgs == 1) { 5159 return (0); 5160 } 5161 5162 if (!IS_P2ALIGNED(pfn, npgs)) { 5163 pfn = P2ALIGN(pfn, npgs); 5164 rootpp = page_numtopp_nolock(pfn); 5165 } 5166 5167 if (pp != rootpp && !page_trylock(rootpp, SE_EXCL)) { 5168 return (0); 5169 } 5170 5171 if (rootpp->p_szc != szc) { 5172 if (pp != rootpp) 5173 page_unlock(rootpp); 5174 return (0); 5175 } 5176 5177 page_demote_free_pages(rootpp); 5178 5179 if (pp != rootpp) 5180 page_unlock(rootpp); 5181 5182 ASSERT(PP_ISFREE(pp)); 5183 ASSERT(PAGE_EXCL(pp)); 5184 return (1); 5185 } 5186 5187 /* 5188 * Given a constituent page, try to demote the large page. 5189 * 5190 * Returns nonzero if the page could be demoted successfully. Returns with 5191 * the constituent page still locked. 
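 * Unlike page_try_demote_free_pages() above, this operates on in-use
 * pages: regular file system pages are handed to page_demote_vp_pages(),
 * while the remaining demotable cases are handled here directly by
 * locking every constituent EXCL, unloading its mappings and clearing
 * p_szc (kernel pages are never demoted this way).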
5192 */ 5193 int 5194 page_try_demote_pages(page_t *pp) 5195 { 5196 page_t *tpp, *rootpp = pp; 5197 pfn_t pfn = page_pptonum(pp); 5198 spgcnt_t i, npgs; 5199 uint_t szc = pp->p_szc; 5200 vnode_t *vp = pp->p_vnode; 5201 5202 ASSERT(PAGE_EXCL(pp)); 5203 5204 VM_STAT_ADD(pagecnt.pc_try_demote_pages[0]); 5205 5206 if (pp->p_szc == 0) { 5207 VM_STAT_ADD(pagecnt.pc_try_demote_pages[1]); 5208 return (1); 5209 } 5210 5211 if (vp != NULL && !IS_SWAPFSVP(vp) && !VN_ISKAS(vp)) { 5212 VM_STAT_ADD(pagecnt.pc_try_demote_pages[2]); 5213 page_demote_vp_pages(pp); 5214 ASSERT(pp->p_szc == 0); 5215 return (1); 5216 } 5217 5218 /* 5219 * Adjust rootpp if passed in is not the base 5220 * constituent page. 5221 */ 5222 npgs = page_get_pagecnt(pp->p_szc); 5223 ASSERT(npgs > 1); 5224 if (!IS_P2ALIGNED(pfn, npgs)) { 5225 pfn = P2ALIGN(pfn, npgs); 5226 rootpp = page_numtopp_nolock(pfn); 5227 VM_STAT_ADD(pagecnt.pc_try_demote_pages[3]); 5228 ASSERT(rootpp->p_vnode != NULL); 5229 ASSERT(rootpp->p_szc == szc); 5230 } 5231 5232 /* 5233 * We can't demote kernel pages since we can't hat_unload() 5234 * the mappings. 5235 */ 5236 if (VN_ISKAS(rootpp->p_vnode)) 5237 return (0); 5238 5239 /* 5240 * Attempt to lock all constituent pages except the page passed 5241 * in since it's already locked. 5242 */ 5243 for (tpp = rootpp, i = 0; i < npgs; i++, tpp++) { 5244 ASSERT(!PP_ISFREE(tpp)); 5245 ASSERT(tpp->p_vnode != NULL); 5246 5247 if (tpp != pp && !page_trylock(tpp, SE_EXCL)) 5248 break; 5249 ASSERT(tpp->p_szc == rootpp->p_szc); 5250 ASSERT(page_pptonum(tpp) == page_pptonum(rootpp) + i); 5251 } 5252 5253 /* 5254 * If we failed to lock them all then unlock what we have 5255 * locked so far and bail. 5256 */ 5257 if (i < npgs) { 5258 tpp = rootpp; 5259 while (i-- > 0) { 5260 if (tpp != pp) 5261 page_unlock(tpp); 5262 tpp++; 5263 } 5264 VM_STAT_ADD(pagecnt.pc_try_demote_pages[4]); 5265 return (0); 5266 } 5267 5268 for (tpp = rootpp, i = 0; i < npgs; i++, tpp++) { 5269 ASSERT(PAGE_EXCL(tpp)); 5270 ASSERT(tpp->p_slckcnt == 0); 5271 (void) hat_pageunload(tpp, HAT_FORCE_PGUNLOAD); 5272 tpp->p_szc = 0; 5273 } 5274 5275 /* 5276 * Unlock all pages except the page passed in. 5277 */ 5278 for (tpp = rootpp, i = 0; i < npgs; i++, tpp++) { 5279 ASSERT(!hat_page_is_mapped(tpp)); 5280 if (tpp != pp) 5281 page_unlock(tpp); 5282 } 5283 5284 VM_STAT_ADD(pagecnt.pc_try_demote_pages[5]); 5285 return (1); 5286 } 5287 5288 /* 5289 * Called by page_free() and page_destroy() to demote the page size code 5290 * (p_szc) to 0 (since we can't just put a single PAGESIZE page with non zero 5291 * p_szc on free list, neither can we just clear p_szc of a single page_t 5292 * within a large page since it will break other code that relies on p_szc 5293 * being the same for all page_t's of a large page). Anonymous pages should 5294 * never end up here because anon_map_getpages() cannot deal with p_szc 5295 * changes after a single constituent page is locked. While anonymous or 5296 * kernel large pages are demoted or freed the entire large page at a time 5297 * with all constituent pages locked EXCL for the file system pages we 5298 * have to be able to demote a large page (i.e. decrease all constituent pages 5299 * p_szc) with only just an EXCL lock on one of constituent pages. 
The reason 5300 * we can easily deal with anonymous page demotion an entire large page at a 5301 * time is that those operations originate at the address space level and concern 5302 * the entire large page region, with actual demotion done only when the pages are 5303 * not shared with any other process (therefore we can always get an EXCL lock 5304 * on all anonymous constituent pages after clearing the segment page 5305 * cache). However, file system pages can be truncated or invalidated at 5306 * PAGESIZE granularity from the file system side and end up in page_free() or 5307 * page_destroy() (we also allow only part of a large page to be SOFTLOCKed, 5308 * and therefore pageout must be able to demote a large page by EXCL locking 5309 * any constituent page that is not under SOFTLOCK). In those cases we cannot 5310 * rely on being able to lock all constituent pages EXCL. 5311 * 5312 * To prevent szc changes on file system pages one has to lock all constituent 5313 * pages at least SHARED (or call page_szc_lock()). The only subsystem that 5314 * doesn't rely on locking all constituent pages (or on page_szc_lock()) to 5315 * prevent szc changes is the hat layer, which uses its own page level mlist 5316 * locks. The hat assumes that szc doesn't change after the mlist lock for a page is 5317 * taken. Therefore we need to change szc under hat level locks if we only 5318 * have an EXCL lock on a single constituent page and the hat still references any 5319 * of the constituent pages. (Note we can't "ignore" the hat layer by simply 5320 * hat_pageunload()ing all constituent pages without holding EXCL locks on all of 5321 * the constituent pages.) We use the hat_page_demote() call to safely demote the szc of 5322 * all constituent pages under hat locks when we only have an EXCL lock on one 5323 * of the constituent pages. 5324 * 5325 * This routine calls page_szc_lock() before calling hat_page_demote() to 5326 * allow segvn, in one special case, not to lock all constituent pages SHARED 5327 * before calling hat_memload_array(), which relies on p_szc not changing even 5328 * before the hat level mlist lock is taken. In that case segvn uses 5329 * page_szc_lock() to prevent hat_page_demote() from changing p_szc values. 5330 * 5331 * Anonymous or kernel page demotion still has to lock all pages exclusively 5332 * and hat_pageunload() all constituent pages before demoting the page; 5333 * therefore there's no need for anonymous or kernel page demotion to use 5334 * the hat_page_demote() mechanism. 5335 * 5336 * hat_page_demote() removes all large mappings that map pp and then decreases 5337 * p_szc starting from the last constituent page of the large page. Working 5338 * from the tail of a large page in decreasing pfn order allows one looking at 5339 * the root page to know that hat_page_demote() is done for the root's szc area; 5340 * e.g. if a root page has szc 1, one knows one only has to lock all constituent 5341 * pages within the szc 1 area to prevent szc changes, because any hat_page_demote() 5342 * that started on this page when it had szc > 1 is already done for this szc 1 area. 5343 * 5344 * We are guaranteed that all constituent pages of pp's large page belong to 5345 * the same vnode with consecutive offsets increasing in the direction of 5346 * the pfn, i.e. the identity of the constituent pages can't change until their 5347 * p_szc is decreased. Therefore it's safe for hat_page_demote() to remove 5348 * large mappings to pp even though we don't lock any constituent page except 5349 * pp (i.e. we won't unload e.g. a kernel locked page).
5350 */ 5351 static void 5352 page_demote_vp_pages(page_t *pp) 5353 { 5354 kmutex_t *mtx; 5355 5356 ASSERT(PAGE_EXCL(pp)); 5357 ASSERT(!PP_ISFREE(pp)); 5358 ASSERT(pp->p_vnode != NULL); 5359 ASSERT(!IS_SWAPFSVP(pp->p_vnode)); 5360 ASSERT(!PP_ISKAS(pp)); 5361 5362 VM_STAT_ADD(pagecnt.pc_demote_pages[0]); 5363 5364 mtx = page_szc_lock(pp); 5365 if (mtx != NULL) { 5366 hat_page_demote(pp); 5367 mutex_exit(mtx); 5368 } 5369 ASSERT(pp->p_szc == 0); 5370 } 5371 5372 /* 5373 * Mark any existing pages for migration in the given range 5374 */ 5375 void 5376 page_mark_migrate(struct seg *seg, caddr_t addr, size_t len, 5377 struct anon_map *amp, ulong_t anon_index, vnode_t *vp, 5378 u_offset_t vnoff, int rflag) 5379 { 5380 struct anon *ap; 5381 vnode_t *curvp; 5382 lgrp_t *from; 5383 pgcnt_t nlocked; 5384 u_offset_t off; 5385 pfn_t pfn; 5386 size_t pgsz; 5387 size_t segpgsz; 5388 pgcnt_t pages; 5389 uint_t pszc; 5390 page_t *pp0, *pp; 5391 caddr_t va; 5392 ulong_t an_idx; 5393 anon_sync_obj_t cookie; 5394 5395 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); 5396 5397 /* 5398 * Don't do anything if don't need to do lgroup optimizations 5399 * on this system 5400 */ 5401 if (!lgrp_optimizations()) 5402 return; 5403 5404 /* 5405 * Align address and length to (potentially large) page boundary 5406 */ 5407 segpgsz = page_get_pagesize(seg->s_szc); 5408 addr = (caddr_t)P2ALIGN((uintptr_t)addr, segpgsz); 5409 if (rflag) 5410 len = P2ROUNDUP(len, segpgsz); 5411 5412 /* 5413 * Do one (large) page at a time 5414 */ 5415 va = addr; 5416 while (va < addr + len) { 5417 /* 5418 * Lookup (root) page for vnode and offset corresponding to 5419 * this virtual address 5420 * Try anonmap first since there may be copy-on-write 5421 * pages, but initialize vnode pointer and offset using 5422 * vnode arguments just in case there isn't an amp. 
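 * For an anonymous (or copy-on-write) page, swap_xlate() below replaces
 * curvp/off with the anon backing store's vnode and offset, so the
 * page_lookup() that follows finds the page actually resident at this
 * address.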
5423 */ 5424 curvp = vp; 5425 off = vnoff + va - seg->s_base; 5426 if (amp) { 5427 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 5428 an_idx = anon_index + seg_page(seg, va); 5429 anon_array_enter(amp, an_idx, &cookie); 5430 ap = anon_get_ptr(amp->ahp, an_idx); 5431 if (ap) 5432 swap_xlate(ap, &curvp, &off); 5433 anon_array_exit(&cookie); 5434 ANON_LOCK_EXIT(&->a_rwlock); 5435 } 5436 5437 pp = NULL; 5438 if (curvp) 5439 pp = page_lookup(curvp, off, SE_SHARED); 5440 5441 /* 5442 * If there isn't a page at this virtual address, 5443 * skip to next page 5444 */ 5445 if (pp == NULL) { 5446 va += PAGESIZE; 5447 continue; 5448 } 5449 5450 /* 5451 * Figure out which lgroup this page is in for kstats 5452 */ 5453 pfn = page_pptonum(pp); 5454 from = lgrp_pfn_to_lgrp(pfn); 5455 5456 /* 5457 * Get page size, and round up and skip to next page boundary 5458 * if unaligned address 5459 */ 5460 pszc = pp->p_szc; 5461 pgsz = page_get_pagesize(pszc); 5462 pages = btop(pgsz); 5463 if (!IS_P2ALIGNED(va, pgsz) || 5464 !IS_P2ALIGNED(pfn, pages) || 5465 pgsz > segpgsz) { 5466 pgsz = MIN(pgsz, segpgsz); 5467 page_unlock(pp); 5468 pages = btop(P2END((uintptr_t)va, pgsz) - 5469 (uintptr_t)va); 5470 va = (caddr_t)P2END((uintptr_t)va, pgsz); 5471 lgrp_stat_add(from->lgrp_id, LGRP_PMM_FAIL_PGS, pages); 5472 continue; 5473 } 5474 5475 /* 5476 * Upgrade to exclusive lock on page 5477 */ 5478 if (!page_tryupgrade(pp)) { 5479 page_unlock(pp); 5480 va += pgsz; 5481 lgrp_stat_add(from->lgrp_id, LGRP_PMM_FAIL_PGS, 5482 btop(pgsz)); 5483 continue; 5484 } 5485 5486 pp0 = pp++; 5487 nlocked = 1; 5488 5489 /* 5490 * Lock constituent pages if this is large page 5491 */ 5492 if (pages > 1) { 5493 /* 5494 * Lock all constituents except root page, since it 5495 * should be locked already. 5496 */ 5497 for (; nlocked < pages; nlocked++) { 5498 if (!page_trylock(pp, SE_EXCL)) { 5499 break; 5500 } 5501 if (PP_ISFREE(pp) || 5502 pp->p_szc != pszc) { 5503 /* 5504 * hat_page_demote() raced in with us. 5505 */ 5506 ASSERT(!IS_SWAPFSVP(curvp)); 5507 page_unlock(pp); 5508 break; 5509 } 5510 pp++; 5511 } 5512 } 5513 5514 /* 5515 * If all constituent pages couldn't be locked, 5516 * unlock pages locked so far and skip to next page. 5517 */ 5518 if (nlocked < pages) { 5519 while (pp0 < pp) { 5520 page_unlock(pp0++); 5521 } 5522 va += pgsz; 5523 lgrp_stat_add(from->lgrp_id, LGRP_PMM_FAIL_PGS, 5524 btop(pgsz)); 5525 continue; 5526 } 5527 5528 /* 5529 * hat_page_demote() can no longer happen 5530 * since last cons page had the right p_szc after 5531 * all cons pages were locked. all cons pages 5532 * should now have the same p_szc. 
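 * (hat_page_demote() lowers p_szc starting from the last constituent
 * page, so finding the expected p_szc on the later constituents we just
 * locked means any demotion racing with us has already completed; see
 * the discussion above page_demote_vp_pages().)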
5533 */ 5534 5535 /* 5536 * All constituent pages locked successfully, so mark 5537 * large page for migration and unload the mappings of 5538 * constituent pages, so a fault will occur on any part of the 5539 * large page 5540 */ 5541 PP_SETMIGRATE(pp0); 5542 while (pp0 < pp) { 5543 (void) hat_pageunload(pp0, HAT_FORCE_PGUNLOAD); 5544 ASSERT(hat_page_getshare(pp0) == 0); 5545 page_unlock(pp0++); 5546 } 5547 lgrp_stat_add(from->lgrp_id, LGRP_PMM_PGS, nlocked); 5548 5549 va += pgsz; 5550 } 5551 } 5552 5553 /* 5554 * Migrate any pages that have been marked for migration in the given range 5555 */ 5556 void 5557 page_migrate( 5558 struct seg *seg, 5559 caddr_t addr, 5560 page_t **ppa, 5561 pgcnt_t npages) 5562 { 5563 lgrp_t *from; 5564 lgrp_t *to; 5565 page_t *newpp; 5566 page_t *pp; 5567 pfn_t pfn; 5568 size_t pgsz; 5569 spgcnt_t page_cnt; 5570 spgcnt_t i; 5571 uint_t pszc; 5572 5573 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as)); 5574 5575 while (npages > 0) { 5576 pp = *ppa; 5577 pszc = pp->p_szc; 5578 pgsz = page_get_pagesize(pszc); 5579 page_cnt = btop(pgsz); 5580 5581 /* 5582 * Check to see whether this page is marked for migration 5583 * 5584 * Assume that root page of large page is marked for 5585 * migration and none of the other constituent pages 5586 * are marked. This really simplifies clearing the 5587 * migrate bit by not having to clear it from each 5588 * constituent page. 5589 * 5590 * note we don't want to relocate an entire large page if 5591 * someone is only using one subpage. 5592 */ 5593 if (npages < page_cnt) 5594 break; 5595 5596 /* 5597 * Is it marked for migration? 5598 */ 5599 if (!PP_ISMIGRATE(pp)) 5600 goto next; 5601 5602 /* 5603 * Determine lgroups that page is being migrated between 5604 */ 5605 pfn = page_pptonum(pp); 5606 if (!IS_P2ALIGNED(pfn, page_cnt)) { 5607 break; 5608 } 5609 from = lgrp_pfn_to_lgrp(pfn); 5610 to = lgrp_mem_choose(seg, addr, pgsz); 5611 5612 /* 5613 * Need to get exclusive lock's to migrate 5614 */ 5615 for (i = 0; i < page_cnt; i++) { 5616 ASSERT(PAGE_LOCKED(ppa[i])); 5617 if (page_pptonum(ppa[i]) != pfn + i || 5618 ppa[i]->p_szc != pszc) { 5619 break; 5620 } 5621 if (!page_tryupgrade(ppa[i])) { 5622 lgrp_stat_add(from->lgrp_id, 5623 LGRP_PM_FAIL_LOCK_PGS, 5624 page_cnt); 5625 break; 5626 } 5627 5628 /* 5629 * Check to see whether we are trying to migrate 5630 * page to lgroup where it is allocated already. 5631 * If so, clear the migrate bit and skip to next 5632 * page. 5633 */ 5634 if (i == 0 && to == from) { 5635 PP_CLRMIGRATE(ppa[0]); 5636 page_downgrade(ppa[0]); 5637 goto next; 5638 } 5639 } 5640 5641 /* 5642 * If all constituent pages couldn't be locked, 5643 * unlock pages locked so far and skip to next page. 
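 * ("Unlock" here means page_downgrade(): the pages in ppa[] were handed
 * to us already locked, so we only give back the exclusive lock we
 * upgraded to, leaving them shared-locked for the caller.)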
5644 */ 5645 if (i != page_cnt) { 5646 while (--i != -1) { 5647 page_downgrade(ppa[i]); 5648 } 5649 goto next; 5650 } 5651 5652 (void) page_create_wait(page_cnt, PG_WAIT); 5653 newpp = page_get_replacement_page(pp, to, PGR_SAMESZC); 5654 if (newpp == NULL) { 5655 page_create_putback(page_cnt); 5656 for (i = 0; i < page_cnt; i++) { 5657 page_downgrade(ppa[i]); 5658 } 5659 lgrp_stat_add(to->lgrp_id, LGRP_PM_FAIL_ALLOC_PGS, 5660 page_cnt); 5661 goto next; 5662 } 5663 ASSERT(newpp->p_szc == pszc); 5664 /* 5665 * Clear migrate bit and relocate page 5666 */ 5667 PP_CLRMIGRATE(pp); 5668 if (page_relocate(&pp, &newpp, 0, 1, &page_cnt, to)) { 5669 panic("page_migrate: page_relocate failed"); 5670 } 5671 ASSERT(page_cnt * PAGESIZE == pgsz); 5672 5673 /* 5674 * Keep stats for number of pages migrated from and to 5675 * each lgroup 5676 */ 5677 lgrp_stat_add(from->lgrp_id, LGRP_PM_SRC_PGS, page_cnt); 5678 lgrp_stat_add(to->lgrp_id, LGRP_PM_DEST_PGS, page_cnt); 5679 /* 5680 * update the page_t array we were passed in and 5681 * unlink constituent pages of a large page. 5682 */ 5683 for (i = 0; i < page_cnt; ++i, ++pp) { 5684 ASSERT(PAGE_EXCL(newpp)); 5685 ASSERT(newpp->p_szc == pszc); 5686 ppa[i] = newpp; 5687 pp = newpp; 5688 page_sub(&newpp, pp); 5689 page_downgrade(pp); 5690 } 5691 ASSERT(newpp == NULL); 5692 next: 5693 addr += pgsz; 5694 ppa += page_cnt; 5695 npages -= page_cnt; 5696 } 5697 } 5698 5699 #define MAX_CNT 60 /* max num of iterations */ 5700 /* 5701 * Reclaim/reserve availrmem for npages. 5702 * If there is not enough memory start reaping seg, kmem caches. 5703 * Start pageout scanner (via page_needfree()). 5704 * Exit after ~ MAX_CNT s regardless of how much memory has been released. 5705 * Note: There is no guarantee that any availrmem will be freed as 5706 * this memory typically is locked (kernel heap) or reserved for swap. 5707 * Also due to memory fragmentation kmem allocator may not be able 5708 * to free any memory (single user allocated buffer will prevent 5709 * freeing slab or a page). 5710 */ 5711 int 5712 page_reclaim_mem(pgcnt_t npages, pgcnt_t epages, int adjust) 5713 { 5714 int i = 0; 5715 int ret = 0; 5716 pgcnt_t deficit; 5717 pgcnt_t old_availrmem; 5718 5719 mutex_enter(&freemem_lock); 5720 old_availrmem = availrmem - 1; 5721 while ((availrmem < tune.t_minarmem + npages + epages) && 5722 (old_availrmem < availrmem) && (i++ < MAX_CNT)) { 5723 old_availrmem = availrmem; 5724 deficit = tune.t_minarmem + npages + epages - availrmem; 5725 mutex_exit(&freemem_lock); 5726 page_needfree(deficit); 5727 kmem_reap(); 5728 delay(hz); 5729 page_needfree(-(spgcnt_t)deficit); 5730 mutex_enter(&freemem_lock); 5731 } 5732 5733 if (adjust && (availrmem >= tune.t_minarmem + npages + epages)) { 5734 availrmem -= npages; 5735 ret = 1; 5736 } 5737 5738 mutex_exit(&freemem_lock); 5739 5740 return (ret); 5741 } 5742 5743 /* 5744 * Search the memory segments to locate the desired page. Within a 5745 * segment, pages increase linearly with one page structure per 5746 * physical page frame (size PAGESIZE). The search begins 5747 * with the segment that was accessed last, to take advantage of locality. 5748 * If the hint misses, we start from the beginning of the sorted memseg list 5749 */ 5750 5751 5752 /* 5753 * Some data structures for pfn to pp lookup. 
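 * memseg_hash[] is a pfn-indexed hint array: each slot covers
 * mhash_per_slot pfns and points at the lowest memseg overlapping that
 * pfn range (see build_pfn_hash() below), which lets page_numtopp_nolock()
 * usually avoid walking the entire memsegs list.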
5754 */ 5755 ulong_t mhash_per_slot; 5756 struct memseg *memseg_hash[N_MEM_SLOTS]; 5757 5758 page_t * 5759 page_numtopp_nolock(pfn_t pfnum) 5760 { 5761 struct memseg *seg; 5762 page_t *pp; 5763 vm_cpu_data_t *vc; 5764 5765 /* 5766 * We need to disable kernel preemption while referencing the 5767 * cpu_vm_data field in order to prevent us from being switched to 5768 * another cpu and trying to reference it after it has been freed. 5769 * This will keep us on cpu and prevent it from being removed while 5770 * we are still on it. 5771 * 5772 * We may be caching a memseg in vc_pnum_memseg/vc_pnext_memseg 5773 * which is being resued by DR who will flush those references 5774 * before modifying the reused memseg. See memseg_cpu_vm_flush(). 5775 */ 5776 kpreempt_disable(); 5777 vc = CPU->cpu_vm_data; 5778 ASSERT(vc != NULL); 5779 5780 MEMSEG_STAT_INCR(nsearch); 5781 5782 /* Try last winner first */ 5783 if (((seg = vc->vc_pnum_memseg) != NULL) && 5784 (pfnum >= seg->pages_base) && (pfnum < seg->pages_end)) { 5785 MEMSEG_STAT_INCR(nlastwon); 5786 pp = seg->pages + (pfnum - seg->pages_base); 5787 if (pp->p_pagenum == pfnum) { 5788 kpreempt_enable(); 5789 return ((page_t *)pp); 5790 } 5791 } 5792 5793 /* Else Try hash */ 5794 if (((seg = memseg_hash[MEMSEG_PFN_HASH(pfnum)]) != NULL) && 5795 (pfnum >= seg->pages_base) && (pfnum < seg->pages_end)) { 5796 MEMSEG_STAT_INCR(nhashwon); 5797 vc->vc_pnum_memseg = seg; 5798 pp = seg->pages + (pfnum - seg->pages_base); 5799 if (pp->p_pagenum == pfnum) { 5800 kpreempt_enable(); 5801 return ((page_t *)pp); 5802 } 5803 } 5804 5805 /* Else Brute force */ 5806 for (seg = memsegs; seg != NULL; seg = seg->next) { 5807 if (pfnum >= seg->pages_base && pfnum < seg->pages_end) { 5808 vc->vc_pnum_memseg = seg; 5809 pp = seg->pages + (pfnum - seg->pages_base); 5810 if (pp->p_pagenum == pfnum) { 5811 kpreempt_enable(); 5812 return ((page_t *)pp); 5813 } 5814 } 5815 } 5816 vc->vc_pnum_memseg = NULL; 5817 kpreempt_enable(); 5818 MEMSEG_STAT_INCR(nnotfound); 5819 return ((page_t *)NULL); 5820 5821 } 5822 5823 struct memseg * 5824 page_numtomemseg_nolock(pfn_t pfnum) 5825 { 5826 struct memseg *seg; 5827 page_t *pp; 5828 5829 /* 5830 * We may be caching a memseg in vc_pnum_memseg/vc_pnext_memseg 5831 * which is being resued by DR who will flush those references 5832 * before modifying the reused memseg. See memseg_cpu_vm_flush(). 5833 */ 5834 kpreempt_disable(); 5835 /* Try hash */ 5836 if (((seg = memseg_hash[MEMSEG_PFN_HASH(pfnum)]) != NULL) && 5837 (pfnum >= seg->pages_base) && (pfnum < seg->pages_end)) { 5838 pp = seg->pages + (pfnum - seg->pages_base); 5839 if (pp->p_pagenum == pfnum) { 5840 kpreempt_enable(); 5841 return (seg); 5842 } 5843 } 5844 5845 /* Else Brute force */ 5846 for (seg = memsegs; seg != NULL; seg = seg->next) { 5847 if (pfnum >= seg->pages_base && pfnum < seg->pages_end) { 5848 pp = seg->pages + (pfnum - seg->pages_base); 5849 if (pp->p_pagenum == pfnum) { 5850 kpreempt_enable(); 5851 return (seg); 5852 } 5853 } 5854 } 5855 kpreempt_enable(); 5856 return ((struct memseg *)NULL); 5857 } 5858 5859 /* 5860 * Given a page and a count return the page struct that is 5861 * n structs away from the current one in the global page 5862 * list. 5863 * 5864 * This function wraps to the first page upon 5865 * reaching the end of the memseg list. 
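 * Callers such as page_busy() and page_invalidate_pages() rely on this
 * wrapping behaviour: they iterate with a do/while loop that stops once
 * page_next() brings them back to the page they started from.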
5866 */ 5867 page_t * 5868 page_nextn(page_t *pp, ulong_t n) 5869 { 5870 struct memseg *seg; 5871 page_t *ppn; 5872 vm_cpu_data_t *vc; 5873 5874 /* 5875 * We need to disable kernel preemption while referencing the 5876 * cpu_vm_data field in order to prevent us from being switched to 5877 * another cpu and trying to reference it after it has been freed. 5878 * This will keep us on cpu and prevent it from being removed while 5879 * we are still on it. 5880 * 5881 * We may be caching a memseg in vc_pnum_memseg/vc_pnext_memseg 5882 * which is being resued by DR who will flush those references 5883 * before modifying the reused memseg. See memseg_cpu_vm_flush(). 5884 */ 5885 kpreempt_disable(); 5886 vc = (vm_cpu_data_t *)CPU->cpu_vm_data; 5887 5888 ASSERT(vc != NULL); 5889 5890 if (((seg = vc->vc_pnext_memseg) == NULL) || 5891 (seg->pages_base == seg->pages_end) || 5892 !(pp >= seg->pages && pp < seg->epages)) { 5893 5894 for (seg = memsegs; seg; seg = seg->next) { 5895 if (pp >= seg->pages && pp < seg->epages) 5896 break; 5897 } 5898 5899 if (seg == NULL) { 5900 /* Memory delete got in, return something valid. */ 5901 /* TODO: fix me. */ 5902 seg = memsegs; 5903 pp = seg->pages; 5904 } 5905 } 5906 5907 /* check for wraparound - possible if n is large */ 5908 while ((ppn = (pp + n)) >= seg->epages || ppn < pp) { 5909 n -= seg->epages - pp; 5910 seg = seg->next; 5911 if (seg == NULL) 5912 seg = memsegs; 5913 pp = seg->pages; 5914 } 5915 vc->vc_pnext_memseg = seg; 5916 kpreempt_enable(); 5917 return (ppn); 5918 } 5919 5920 /* 5921 * Initialize for a loop using page_next_scan_large(). 5922 */ 5923 page_t * 5924 page_next_scan_init(void **cookie) 5925 { 5926 ASSERT(cookie != NULL); 5927 *cookie = (void *)memsegs; 5928 return ((page_t *)memsegs->pages); 5929 } 5930 5931 /* 5932 * Return the next page in a scan of page_t's, assuming we want 5933 * to skip over sub-pages within larger page sizes. 5934 * 5935 * The cookie is used to keep track of the current memseg. 5936 */ 5937 page_t * 5938 page_next_scan_large( 5939 page_t *pp, 5940 ulong_t *n, 5941 void **cookie) 5942 { 5943 struct memseg *seg = (struct memseg *)*cookie; 5944 page_t *new_pp; 5945 ulong_t cnt; 5946 pfn_t pfn; 5947 5948 5949 /* 5950 * get the count of page_t's to skip based on the page size 5951 */ 5952 ASSERT(pp != NULL); 5953 if (pp->p_szc == 0) { 5954 cnt = 1; 5955 } else { 5956 pfn = page_pptonum(pp); 5957 cnt = page_get_pagecnt(pp->p_szc); 5958 cnt -= pfn & (cnt - 1); 5959 } 5960 *n += cnt; 5961 new_pp = pp + cnt; 5962 5963 /* 5964 * Catch if we went past the end of the current memory segment. If so, 5965 * just move to the next segment with pages. 5966 */ 5967 if (new_pp >= seg->epages || seg->pages_base == seg->pages_end) { 5968 do { 5969 seg = seg->next; 5970 if (seg == NULL) 5971 seg = memsegs; 5972 } while (seg->pages_base == seg->pages_end); 5973 new_pp = seg->pages; 5974 *cookie = (void *)seg; 5975 } 5976 5977 return (new_pp); 5978 } 5979 5980 5981 /* 5982 * Returns next page in list. Note: this function wraps 5983 * to the first page in the list upon reaching the end 5984 * of the list. Callers should be aware of this fact. 5985 */ 5986 5987 /* We should change this be a #define */ 5988 5989 page_t * 5990 page_next(page_t *pp) 5991 { 5992 return (page_nextn(pp, 1)); 5993 } 5994 5995 page_t * 5996 page_first() 5997 { 5998 return ((page_t *)memsegs->pages); 5999 } 6000 6001 6002 /* 6003 * This routine is called at boot with the initial memory configuration 6004 * and when memory is added or removed. 
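 * The loop below drops each memseg into every hash slot its pfn range
 * touches, stepping the pfn cursor by mhash_per_slot per slot; when two
 * memsegs land in the same slot, the one with the lower pages_base is
 * kept as the hint.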
6005 */ 6006 void 6007 build_pfn_hash() 6008 { 6009 pfn_t cur; 6010 pgcnt_t index; 6011 struct memseg *pseg; 6012 int i; 6013 6014 /* 6015 * Clear memseg_hash array. 6016 * Since memory add/delete is designed to operate concurrently 6017 * with normal operation, the hash rebuild must be able to run 6018 * concurrently with page_numtopp_nolock(). To support this 6019 * functionality, assignments to memseg_hash array members must 6020 * be done atomically. 6021 * 6022 * NOTE: bzero() does not currently guarantee this for kernel 6023 * threads, and cannot be used here. 6024 */ 6025 for (i = 0; i < N_MEM_SLOTS; i++) 6026 memseg_hash[i] = NULL; 6027 6028 hat_kpm_mseghash_clear(N_MEM_SLOTS); 6029 6030 /* 6031 * Physmax is the last valid pfn. 6032 */ 6033 mhash_per_slot = (physmax + 1) >> MEM_HASH_SHIFT; 6034 for (pseg = memsegs; pseg != NULL; pseg = pseg->next) { 6035 index = MEMSEG_PFN_HASH(pseg->pages_base); 6036 cur = pseg->pages_base; 6037 do { 6038 if (index >= N_MEM_SLOTS) 6039 index = MEMSEG_PFN_HASH(cur); 6040 6041 if (memseg_hash[index] == NULL || 6042 memseg_hash[index]->pages_base > pseg->pages_base) { 6043 memseg_hash[index] = pseg; 6044 hat_kpm_mseghash_update(index, pseg); 6045 } 6046 cur += mhash_per_slot; 6047 index++; 6048 } while (cur < pseg->pages_end); 6049 } 6050 } 6051 6052 /* 6053 * Return the pagenum for the pp 6054 */ 6055 pfn_t 6056 page_pptonum(page_t *pp) 6057 { 6058 return (pp->p_pagenum); 6059 } 6060 6061 /* 6062 * interface to the referenced and modified etc bits 6063 * in the PSM part of the page struct 6064 * when no locking is desired. 6065 */ 6066 void 6067 page_set_props(page_t *pp, uint_t flags) 6068 { 6069 ASSERT((flags & ~(P_MOD | P_REF | P_RO)) == 0); 6070 pp->p_nrm |= (uchar_t)flags; 6071 } 6072 6073 void 6074 page_clr_all_props(page_t *pp) 6075 { 6076 pp->p_nrm = 0; 6077 } 6078 6079 /* 6080 * Clear p_lckcnt and p_cowcnt, adjusting freemem if required. 6081 */ 6082 int 6083 page_clear_lck_cow(page_t *pp, int adjust) 6084 { 6085 int f_amount; 6086 6087 ASSERT(PAGE_EXCL(pp)); 6088 6089 /* 6090 * The page_struct_lock need not be acquired here since 6091 * we require the caller hold the page exclusively locked. 6092 */ 6093 f_amount = 0; 6094 if (pp->p_lckcnt) { 6095 f_amount = 1; 6096 pp->p_lckcnt = 0; 6097 } 6098 if (pp->p_cowcnt) { 6099 f_amount += pp->p_cowcnt; 6100 pp->p_cowcnt = 0; 6101 } 6102 6103 if (adjust && f_amount) { 6104 mutex_enter(&freemem_lock); 6105 availrmem += f_amount; 6106 mutex_exit(&freemem_lock); 6107 } 6108 6109 return (f_amount); 6110 } 6111 6112 /* 6113 * The following functions is called from free_vp_pages() 6114 * for an inexact estimate of a newly free'd page... 6115 */ 6116 ulong_t 6117 page_share_cnt(page_t *pp) 6118 { 6119 return (hat_page_getshare(pp)); 6120 } 6121 6122 int 6123 page_isshared(page_t *pp) 6124 { 6125 return (hat_page_checkshare(pp, 1)); 6126 } 6127 6128 int 6129 page_isfree(page_t *pp) 6130 { 6131 return (PP_ISFREE(pp)); 6132 } 6133 6134 int 6135 page_isref(page_t *pp) 6136 { 6137 return (hat_page_getattr(pp, P_REF)); 6138 } 6139 6140 int 6141 page_ismod(page_t *pp) 6142 { 6143 return (hat_page_getattr(pp, P_MOD)); 6144 } 6145 6146 /* 6147 * The following code all currently relates to the page capture logic: 6148 * 6149 * This logic is used for cases where there is a desire to claim a certain 6150 * physical page in the system for the caller. 
As it may not be possible 6151 * to capture the page immediately, the p_toxic bits are used in the page 6152 * structure to indicate that someone wants to capture this page. When the 6153 * page gets unlocked, the toxic flag will be noted and an attempt to capture 6154 * the page will be made. If it is successful, the original caller's callback 6155 * will be called with the page to do with it as it pleases. 6156 * 6157 * There is also an async thread which wakes up occasionally to attempt to 6158 * capture pages which have the capture bit set. All of the pages which 6159 * need to be captured asynchronously have been inserted into the 6160 * page_capture_hash and thus this thread walks that hash list. Items in the 6161 * hash have an expiration time, so this thread handles that as well by removing 6162 * the item from the hash if it has expired. 6163 * 6164 * Some important things to note are: 6165 * - if the PR_CAPTURE bit is set on a page, then the page is in the 6166 * page_capture_hash. The page_capture_hash_head.pchh_mutex is needed 6167 * to set and clear this bit, and an entry can only be added to or removed 6168 * from the hash while that lock is held. 6169 * - the PR_CAPTURE bit can only be set and cleared while holding the 6170 * page_capture_hash_head.pchh_mutex 6171 * - the t_flag field of the thread struct is used with the T_CAPTURING 6172 * flag to prevent recursion while dealing with large pages. 6173 * - pages which need to be retired never expire on the page_capture_hash. 6174 */ 6175 6176 static void page_capture_thread(void); 6177 static kthread_t *pc_thread_id; 6178 kcondvar_t pc_cv; 6179 static kmutex_t pc_thread_mutex; 6180 static clock_t pc_thread_shortwait; 6181 static clock_t pc_thread_longwait; 6182 static int pc_thread_retry; 6183 6184 struct page_capture_callback pc_cb[PC_NUM_CALLBACKS]; 6185 6186 /* Note that this is a circular linked list */ 6187 typedef struct page_capture_hash_bucket { 6188 page_t *pp; 6189 uchar_t szc; 6190 uchar_t pri; 6191 uint_t flags; 6192 clock_t expires; /* lbolt at which this request expires. */ 6193 void *datap; /* Cached data passed in for callback */ 6194 struct page_capture_hash_bucket *next; 6195 struct page_capture_hash_bucket *prev; 6196 } page_capture_hash_bucket_t; 6197 6198 #define PC_PRI_HI 0 /* capture now */ 6199 #define PC_PRI_LO 1 /* capture later */ 6200 #define PC_NUM_PRI 2 6201 6202 #define PAGE_CAPTURE_PRIO(pp) (PP_ISRAF(pp) ? PC_PRI_LO : PC_PRI_HI) 6203 6204 6205 /* 6206 * Each hash bucket has its own mutex and two lists: 6207 * active (0): represents requests which have not been processed by 6208 * the page_capture async thread yet. 6209 * walked (1): represents requests which have been processed by the 6210 * page_capture async thread within its current walk of this bucket. 6211 * 6212 * These are all needed so that we can synchronize all async page_capture 6213 * events. When the async thread moves to a new bucket, it will append the 6214 * walked list to the active list and walk each item one at a time, moving it 6215 * from the active list to the walked list. Thus if there is an async request 6216 * outstanding for a given page, it will always be in one of the two lists. 6217 * New requests are always added to the active list. 6218 * If we are not able to capture a page before the request expires, we free 6219 * up the request structure, which indicates to page_capture that there is 6220 * no longer a need for the given page, and clear the PR_CAPTURE flag if 6221 * possible.
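 * lists[0] (active) and lists[1] (walked) in the bucket below are
 * sentinel heads of circular doubly linked lists, so insertion and
 * removal never have to special-case an empty list.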
6222 */ 6223 typedef struct page_capture_hash_head { 6224 kmutex_t pchh_mutex; 6225 uint_t num_pages[PC_NUM_PRI]; 6226 page_capture_hash_bucket_t lists[2]; /* sentinel nodes */ 6227 } page_capture_hash_head_t; 6228 6229 #ifdef DEBUG 6230 #define NUM_PAGE_CAPTURE_BUCKETS 4 6231 #else 6232 #define NUM_PAGE_CAPTURE_BUCKETS 64 6233 #endif 6234 6235 page_capture_hash_head_t page_capture_hash[NUM_PAGE_CAPTURE_BUCKETS]; 6236 6237 /* for now use a very simple hash based upon the size of a page struct */ 6238 #define PAGE_CAPTURE_HASH(pp) \ 6239 ((int)(((uintptr_t)pp >> 7) & (NUM_PAGE_CAPTURE_BUCKETS - 1))) 6240 6241 extern pgcnt_t swapfs_minfree; 6242 6243 int page_trycapture(page_t *pp, uint_t szc, uint_t flags, void *datap); 6244 6245 /* 6246 * a callback function is required for page capture requests. 6247 */ 6248 void 6249 page_capture_register_callback(uint_t index, clock_t duration, 6250 int (*cb_func)(page_t *, void *, uint_t)) 6251 { 6252 ASSERT(pc_cb[index].cb_active == 0); 6253 ASSERT(cb_func != NULL); 6254 rw_enter(&pc_cb[index].cb_rwlock, RW_WRITER); 6255 pc_cb[index].duration = duration; 6256 pc_cb[index].cb_func = cb_func; 6257 pc_cb[index].cb_active = 1; 6258 rw_exit(&pc_cb[index].cb_rwlock); 6259 } 6260 6261 void 6262 page_capture_unregister_callback(uint_t index) 6263 { 6264 int i, j; 6265 struct page_capture_hash_bucket *bp1; 6266 struct page_capture_hash_bucket *bp2; 6267 struct page_capture_hash_bucket *head = NULL; 6268 uint_t flags = (1 << index); 6269 6270 rw_enter(&pc_cb[index].cb_rwlock, RW_WRITER); 6271 ASSERT(pc_cb[index].cb_active == 1); 6272 pc_cb[index].duration = 0; /* Paranoia */ 6273 pc_cb[index].cb_func = NULL; /* Paranoia */ 6274 pc_cb[index].cb_active = 0; 6275 rw_exit(&pc_cb[index].cb_rwlock); 6276 6277 /* 6278 * Just move all the entries to a private list which we can walk 6279 * through without the need to hold any locks. 6280 * No more requests can get added to the hash lists for this consumer 6281 * as the cb_active field for the callback has been cleared. 6282 */ 6283 for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) { 6284 mutex_enter(&page_capture_hash[i].pchh_mutex); 6285 for (j = 0; j < 2; j++) { 6286 bp1 = page_capture_hash[i].lists[j].next; 6287 /* walk through all but first (sentinel) element */ 6288 while (bp1 != &page_capture_hash[i].lists[j]) { 6289 bp2 = bp1; 6290 if (bp2->flags & flags) { 6291 bp1 = bp2->next; 6292 bp1->prev = bp2->prev; 6293 bp2->prev->next = bp1; 6294 bp2->next = head; 6295 head = bp2; 6296 /* 6297 * Clear the PR_CAPTURE bit as we 6298 * hold appropriate locks here. 6299 */ 6300 page_clrtoxic(head->pp, PR_CAPTURE); 6301 page_capture_hash[i]. 6302 num_pages[bp2->pri]--; 6303 continue; 6304 } 6305 bp1 = bp1->next; 6306 } 6307 } 6308 mutex_exit(&page_capture_hash[i].pchh_mutex); 6309 } 6310 6311 while (head != NULL) { 6312 bp1 = head; 6313 head = head->next; 6314 kmem_free(bp1, sizeof (*bp1)); 6315 } 6316 } 6317 6318 6319 /* 6320 * Find pp in the active list and move it to the walked list if it 6321 * exists. 6322 * Note that most often pp should be at the front of the active list 6323 * as it is currently used and thus there is no other sort of optimization 6324 * being done here as this is a linked list data structure. 6325 * Returns 1 on successful move or 0 if page could not be found. 
6326 */ 6327 static int 6328 page_capture_move_to_walked(page_t *pp) 6329 { 6330 page_capture_hash_bucket_t *bp; 6331 int index; 6332 6333 index = PAGE_CAPTURE_HASH(pp); 6334 6335 mutex_enter(&page_capture_hash[index].pchh_mutex); 6336 bp = page_capture_hash[index].lists[0].next; 6337 while (bp != &page_capture_hash[index].lists[0]) { 6338 if (bp->pp == pp) { 6339 /* Remove from old list */ 6340 bp->next->prev = bp->prev; 6341 bp->prev->next = bp->next; 6342 6343 /* Add to new list */ 6344 bp->next = page_capture_hash[index].lists[1].next; 6345 bp->prev = &page_capture_hash[index].lists[1]; 6346 page_capture_hash[index].lists[1].next = bp; 6347 bp->next->prev = bp; 6348 6349 /* 6350 * There is a small probability of page on a free 6351 * list being retired while being allocated 6352 * and before P_RAF is set on it. The page may 6353 * end up marked as high priority request instead 6354 * of low priority request. 6355 * If P_RAF page is not marked as low priority request 6356 * change it to low priority request. 6357 */ 6358 page_capture_hash[index].num_pages[bp->pri]--; 6359 bp->pri = PAGE_CAPTURE_PRIO(pp); 6360 page_capture_hash[index].num_pages[bp->pri]++; 6361 mutex_exit(&page_capture_hash[index].pchh_mutex); 6362 return (1); 6363 } 6364 bp = bp->next; 6365 } 6366 mutex_exit(&page_capture_hash[index].pchh_mutex); 6367 return (0); 6368 } 6369 6370 /* 6371 * Add a new entry to the page capture hash. The only case where a new 6372 * entry is not added is when the page capture consumer is no longer registered. 6373 * In this case, we'll silently not add the page to the hash. We know that 6374 * page retire will always be registered for the case where we are currently 6375 * unretiring a page and thus there are no conflicts. 6376 */ 6377 static void 6378 page_capture_add_hash(page_t *pp, uint_t szc, uint_t flags, void *datap) 6379 { 6380 page_capture_hash_bucket_t *bp1; 6381 page_capture_hash_bucket_t *bp2; 6382 int index; 6383 int cb_index; 6384 int i; 6385 uchar_t pri; 6386 #ifdef DEBUG 6387 page_capture_hash_bucket_t *tp1; 6388 int l; 6389 #endif 6390 6391 ASSERT(!(flags & CAPTURE_ASYNC)); 6392 6393 bp1 = kmem_alloc(sizeof (struct page_capture_hash_bucket), KM_SLEEP); 6394 6395 bp1->pp = pp; 6396 bp1->szc = szc; 6397 bp1->flags = flags; 6398 bp1->datap = datap; 6399 6400 for (cb_index = 0; cb_index < PC_NUM_CALLBACKS; cb_index++) { 6401 if ((flags >> cb_index) & 1) { 6402 break; 6403 } 6404 } 6405 6406 ASSERT(cb_index != PC_NUM_CALLBACKS); 6407 6408 rw_enter(&pc_cb[cb_index].cb_rwlock, RW_READER); 6409 if (pc_cb[cb_index].cb_active) { 6410 if (pc_cb[cb_index].duration == -1) { 6411 bp1->expires = (clock_t)-1; 6412 } else { 6413 bp1->expires = ddi_get_lbolt() + 6414 pc_cb[cb_index].duration; 6415 } 6416 } else { 6417 /* There's no callback registered so don't add to the hash */ 6418 rw_exit(&pc_cb[cb_index].cb_rwlock); 6419 kmem_free(bp1, sizeof (*bp1)); 6420 return; 6421 } 6422 6423 index = PAGE_CAPTURE_HASH(pp); 6424 6425 /* 6426 * Only allow capture flag to be modified under this mutex. 6427 * Prevents multiple entries for same page getting added. 
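 * Holding pchh_mutex across both the p_toxic PR_CAPTURE check and the
 * hash insertion is what keeps the bit and the hash entry in sync; the
 * panic at the end of this function fires if that invariant is ever
 * broken.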
6428 */ 6429 mutex_enter(&page_capture_hash[index].pchh_mutex); 6430 6431 /* 6432 * if not already on the hash, set capture bit and add to the hash 6433 */ 6434 if (!(pp->p_toxic & PR_CAPTURE)) { 6435 #ifdef DEBUG 6436 /* Check for duplicate entries */ 6437 for (l = 0; l < 2; l++) { 6438 tp1 = page_capture_hash[index].lists[l].next; 6439 while (tp1 != &page_capture_hash[index].lists[l]) { 6440 if (tp1->pp == pp) { 6441 panic("page pp 0x%p already on hash " 6442 "at 0x%p\n", 6443 (void *)pp, (void *)tp1); 6444 } 6445 tp1 = tp1->next; 6446 } 6447 } 6448 6449 #endif 6450 page_settoxic(pp, PR_CAPTURE); 6451 pri = PAGE_CAPTURE_PRIO(pp); 6452 bp1->pri = pri; 6453 bp1->next = page_capture_hash[index].lists[0].next; 6454 bp1->prev = &page_capture_hash[index].lists[0]; 6455 bp1->next->prev = bp1; 6456 page_capture_hash[index].lists[0].next = bp1; 6457 page_capture_hash[index].num_pages[pri]++; 6458 if (flags & CAPTURE_RETIRE) { 6459 page_retire_incr_pend_count(datap); 6460 } 6461 mutex_exit(&page_capture_hash[index].pchh_mutex); 6462 rw_exit(&pc_cb[cb_index].cb_rwlock); 6463 cv_signal(&pc_cv); 6464 return; 6465 } 6466 6467 /* 6468 * A page retire request will replace any other request. 6469 * A second physmem request which is for a different process than 6470 * the currently registered one will be dropped as there is 6471 * no way to hold the private data for both calls. 6472 * In the future, once there are more callers, this will have to 6473 * be worked out better as there needs to be private storage for 6474 * at least each type of caller (maybe have datap be an array of 6475 * *void's so that we can index based upon callers index). 6476 */ 6477 6478 /* walk hash list to update expire time */ 6479 for (i = 0; i < 2; i++) { 6480 bp2 = page_capture_hash[index].lists[i].next; 6481 while (bp2 != &page_capture_hash[index].lists[i]) { 6482 if (bp2->pp == pp) { 6483 if (flags & CAPTURE_RETIRE) { 6484 if (!(bp2->flags & CAPTURE_RETIRE)) { 6485 page_retire_incr_pend_count( 6486 datap); 6487 bp2->flags = flags; 6488 bp2->expires = bp1->expires; 6489 bp2->datap = datap; 6490 } 6491 } else { 6492 ASSERT(flags & CAPTURE_PHYSMEM); 6493 if (!(bp2->flags & CAPTURE_RETIRE) && 6494 (datap == bp2->datap)) { 6495 bp2->expires = bp1->expires; 6496 } 6497 } 6498 mutex_exit(&page_capture_hash[index]. 6499 pchh_mutex); 6500 rw_exit(&pc_cb[cb_index].cb_rwlock); 6501 kmem_free(bp1, sizeof (*bp1)); 6502 return; 6503 } 6504 bp2 = bp2->next; 6505 } 6506 } 6507 6508 /* 6509 * the PR_CAPTURE flag is protected by the page_capture_hash mutexes 6510 * and thus it either has to be set or not set and can't change 6511 * while holding the mutex above. 6512 */ 6513 panic("page_capture_add_hash, PR_CAPTURE flag set on pp %p\n", 6514 (void *)pp); 6515 } 6516 6517 /* 6518 * We have a page in our hands, lets try and make it ours by turning 6519 * it into a clean page like it had just come off the freelists. 6520 * 6521 * Returns 0 on success, with the page still EXCL locked. 
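 * A successfully captured page ends up szc 0, unmapped, hashed out of
 * its vnode and off the freelists, so the caller owns it outright.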
6522 * On failure, the page will be unlocked, and returns EAGAIN 6523 */ 6524 static int 6525 page_capture_clean_page(page_t *pp) 6526 { 6527 page_t *newpp; 6528 int skip_unlock = 0; 6529 spgcnt_t count; 6530 page_t *tpp; 6531 int ret = 0; 6532 int extra; 6533 6534 ASSERT(PAGE_EXCL(pp)); 6535 ASSERT(!PP_RETIRED(pp)); 6536 ASSERT(curthread->t_flag & T_CAPTURING); 6537 6538 if (PP_ISFREE(pp)) { 6539 if (!page_reclaim(pp, NULL)) { 6540 skip_unlock = 1; 6541 ret = EAGAIN; 6542 goto cleanup; 6543 } 6544 ASSERT(pp->p_szc == 0); 6545 if (pp->p_vnode != NULL) { 6546 /* 6547 * Since this page came from the 6548 * cachelist, we must destroy the 6549 * old vnode association. 6550 */ 6551 page_hashout(pp, NULL); 6552 } 6553 goto cleanup; 6554 } 6555 6556 /* 6557 * If we know page_relocate will fail, skip it 6558 * It could still fail due to a UE on another page but we 6559 * can't do anything about that. 6560 */ 6561 if (pp->p_toxic & PR_UE) { 6562 goto skip_relocate; 6563 } 6564 6565 /* 6566 * It's possible that pages can not have a vnode as fsflush comes 6567 * through and cleans up these pages. It's ugly but that's how it is. 6568 */ 6569 if (pp->p_vnode == NULL) { 6570 goto skip_relocate; 6571 } 6572 6573 /* 6574 * Page was not free, so lets try to relocate it. 6575 * page_relocate only works with root pages, so if this is not a root 6576 * page, we need to demote it to try and relocate it. 6577 * Unfortunately this is the best we can do right now. 6578 */ 6579 newpp = NULL; 6580 if ((pp->p_szc > 0) && (pp != PP_PAGEROOT(pp))) { 6581 if (page_try_demote_pages(pp) == 0) { 6582 ret = EAGAIN; 6583 goto cleanup; 6584 } 6585 } 6586 ret = page_relocate(&pp, &newpp, 1, 0, &count, NULL); 6587 if (ret == 0) { 6588 page_t *npp; 6589 /* unlock the new page(s) */ 6590 while (count-- > 0) { 6591 ASSERT(newpp != NULL); 6592 npp = newpp; 6593 page_sub(&newpp, npp); 6594 page_unlock(npp); 6595 } 6596 ASSERT(newpp == NULL); 6597 /* 6598 * Check to see if the page we have is too large. 6599 * If so, demote it freeing up the extra pages. 6600 */ 6601 if (pp->p_szc > 0) { 6602 /* For now demote extra pages to szc == 0 */ 6603 extra = page_get_pagecnt(pp->p_szc) - 1; 6604 while (extra > 0) { 6605 tpp = pp->p_next; 6606 page_sub(&pp, tpp); 6607 tpp->p_szc = 0; 6608 page_free(tpp, 1); 6609 extra--; 6610 } 6611 /* Make sure to set our page to szc 0 as well */ 6612 ASSERT(pp->p_next == pp && pp->p_prev == pp); 6613 pp->p_szc = 0; 6614 } 6615 goto cleanup; 6616 } else if (ret == EIO) { 6617 ret = EAGAIN; 6618 goto cleanup; 6619 } else { 6620 /* 6621 * Need to reset return type as we failed to relocate the page 6622 * but that does not mean that some of the next steps will not 6623 * work. 6624 */ 6625 ret = 0; 6626 } 6627 6628 skip_relocate: 6629 6630 if (pp->p_szc > 0) { 6631 if (page_try_demote_pages(pp) == 0) { 6632 ret = EAGAIN; 6633 goto cleanup; 6634 } 6635 } 6636 6637 ASSERT(pp->p_szc == 0); 6638 6639 if (hat_ismod(pp)) { 6640 ret = EAGAIN; 6641 goto cleanup; 6642 } 6643 if (PP_ISKAS(pp)) { 6644 ret = EAGAIN; 6645 goto cleanup; 6646 } 6647 if (pp->p_lckcnt || pp->p_cowcnt) { 6648 ret = EAGAIN; 6649 goto cleanup; 6650 } 6651 6652 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 6653 ASSERT(!hat_page_is_mapped(pp)); 6654 6655 if (hat_ismod(pp)) { 6656 /* 6657 * This is a semi-odd case as the page is now modified but not 6658 * mapped as we just unloaded the mappings above. 
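		 * Presumably the unload itself propagated a dirty bit
		 * from the old mappings into the page, so treat this as
		 * a transient failure and let the caller retry.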
6659 */ 6660 ret = EAGAIN; 6661 goto cleanup; 6662 } 6663 if (pp->p_vnode != NULL) { 6664 page_hashout(pp, NULL); 6665 } 6666 6667 /* 6668 * At this point, the page should be in a clean state and 6669 * we can do whatever we want with it. 6670 */ 6671 6672 cleanup: 6673 if (ret != 0) { 6674 if (!skip_unlock) { 6675 page_unlock(pp); 6676 } 6677 } else { 6678 ASSERT(pp->p_szc == 0); 6679 ASSERT(PAGE_EXCL(pp)); 6680 6681 pp->p_next = pp; 6682 pp->p_prev = pp; 6683 } 6684 return (ret); 6685 } 6686 6687 /* 6688 * Various callers of page_trycapture() can have different restrictions upon 6689 * what memory they have access to. 6690 * Returns 0 on success, with the following error codes on failure: 6691 * EPERM - The requested page is long term locked, and thus repeated 6692 * requests to capture this page will likely fail. 6693 * ENOMEM - There was not enough free memory in the system to safely 6694 * map the requested page. 6695 * ENOENT - The requested page was inside the kernel cage, and the 6696 * PHYSMEM_CAGE flag was not set. 6697 */ 6698 int 6699 page_capture_pre_checks(page_t *pp, uint_t flags) 6700 { 6701 ASSERT(pp != NULL); 6702 6703 #if defined(__sparc) 6704 if (pp->p_vnode == &promvp) { 6705 return (EPERM); 6706 } 6707 6708 if (PP_ISNORELOC(pp) && !(flags & CAPTURE_GET_CAGE) && 6709 (flags & CAPTURE_PHYSMEM)) { 6710 return (ENOENT); 6711 } 6712 6713 if (PP_ISNORELOCKERNEL(pp)) { 6714 return (EPERM); 6715 } 6716 #else 6717 if (PP_ISKAS(pp)) { 6718 return (EPERM); 6719 } 6720 #endif /* __sparc */ 6721 6722 /* only physmem currently has the restrictions checked below */ 6723 if (!(flags & CAPTURE_PHYSMEM)) { 6724 return (0); 6725 } 6726 6727 if (availrmem < swapfs_minfree) { 6728 /* 6729 * We won't try to capture this page as we are 6730 * running low on memory. 6731 */ 6732 return (ENOMEM); 6733 } 6734 return (0); 6735 } 6736 6737 /* 6738 * Once we have a page in our mits, go ahead and complete the capture 6739 * operation. 6740 * Returns 1 on failure where page is no longer needed 6741 * Returns 0 on success 6742 * Returns -1 if there was a transient failure. 6743 * Failure cases must release the SE_EXCL lock on pp (usually via page_free). 6744 */ 6745 int 6746 page_capture_take_action(page_t *pp, uint_t flags, void *datap) 6747 { 6748 int cb_index; 6749 int ret = 0; 6750 page_capture_hash_bucket_t *bp1; 6751 page_capture_hash_bucket_t *bp2; 6752 int index; 6753 int found = 0; 6754 int i; 6755 6756 ASSERT(PAGE_EXCL(pp)); 6757 ASSERT(curthread->t_flag & T_CAPTURING); 6758 6759 for (cb_index = 0; cb_index < PC_NUM_CALLBACKS; cb_index++) { 6760 if ((flags >> cb_index) & 1) { 6761 break; 6762 } 6763 } 6764 ASSERT(cb_index < PC_NUM_CALLBACKS); 6765 6766 /* 6767 * Remove the entry from the page_capture hash, but don't free it yet 6768 * as we may need to put it back. 6769 * Since we own the page at this point in time, we should find it 6770 * in the hash if this is an ASYNC call. If we don't it's likely 6771 * that the page_capture_async() thread decided that this request 6772 * had expired, in which case we just continue on. 6773 */ 6774 if (flags & CAPTURE_ASYNC) { 6775 6776 index = PAGE_CAPTURE_HASH(pp); 6777 6778 mutex_enter(&page_capture_hash[index].pchh_mutex); 6779 for (i = 0; i < 2 && !found; i++) { 6780 bp1 = page_capture_hash[index].lists[i].next; 6781 while (bp1 != &page_capture_hash[index].lists[i]) { 6782 if (bp1->pp == pp) { 6783 bp1->next->prev = bp1->prev; 6784 bp1->prev->next = bp1->next; 6785 page_capture_hash[index]. 
6786 num_pages[bp1->pri]--; 6787 page_clrtoxic(pp, PR_CAPTURE); 6788 found = 1; 6789 break; 6790 } 6791 bp1 = bp1->next; 6792 } 6793 } 6794 mutex_exit(&page_capture_hash[index].pchh_mutex); 6795 } 6796 6797 /* Synchronize with the unregister func. */ 6798 rw_enter(&pc_cb[cb_index].cb_rwlock, RW_READER); 6799 if (!pc_cb[cb_index].cb_active) { 6800 page_free(pp, 1); 6801 rw_exit(&pc_cb[cb_index].cb_rwlock); 6802 if (found) { 6803 kmem_free(bp1, sizeof (*bp1)); 6804 } 6805 return (1); 6806 } 6807 6808 /* 6809 * We need to remove the entry from the page capture hash and turn off 6810 * the PR_CAPTURE bit before calling the callback. We'll need to cache 6811 * the entry here, and then based upon the return value, cleanup 6812 * appropriately or re-add it to the hash, making sure that someone else 6813 * hasn't already done so. 6814 * It should be rare for the callback to fail and thus it's ok for 6815 * the failure path to be a bit complicated as the success path is 6816 * cleaner and the locking rules are easier to follow. 6817 */ 6818 6819 ret = pc_cb[cb_index].cb_func(pp, datap, flags); 6820 6821 rw_exit(&pc_cb[cb_index].cb_rwlock); 6822 6823 /* 6824 * If this was an ASYNC request, we need to cleanup the hash if the 6825 * callback was successful or if the request was no longer valid. 6826 * For non-ASYNC requests, we return failure to map and the caller 6827 * will take care of adding the request to the hash. 6828 * Note also that the callback itself is responsible for the page 6829 * at this point in time in terms of locking ... The most common 6830 * case for the failure path should just be a page_free. 6831 */ 6832 if (ret >= 0) { 6833 if (found) { 6834 if (bp1->flags & CAPTURE_RETIRE) { 6835 page_retire_decr_pend_count(datap); 6836 } 6837 kmem_free(bp1, sizeof (*bp1)); 6838 } 6839 return (ret); 6840 } 6841 if (!found) { 6842 return (ret); 6843 } 6844 6845 ASSERT(flags & CAPTURE_ASYNC); 6846 6847 /* 6848 * Check for expiration time first as we can just free it up if it's 6849 * expired. 6850 */ 6851 if (ddi_get_lbolt() > bp1->expires && bp1->expires != -1) { 6852 kmem_free(bp1, sizeof (*bp1)); 6853 return (ret); 6854 } 6855 6856 /* 6857 * The callback failed and there used to be an entry in the hash for 6858 * this page, so we need to add it back to the hash. 6859 */ 6860 mutex_enter(&page_capture_hash[index].pchh_mutex); 6861 if (!(pp->p_toxic & PR_CAPTURE)) { 6862 /* just add bp1 back to head of walked list */ 6863 page_settoxic(pp, PR_CAPTURE); 6864 bp1->next = page_capture_hash[index].lists[1].next; 6865 bp1->prev = &page_capture_hash[index].lists[1]; 6866 bp1->next->prev = bp1; 6867 bp1->pri = PAGE_CAPTURE_PRIO(pp); 6868 page_capture_hash[index].lists[1].next = bp1; 6869 page_capture_hash[index].num_pages[bp1->pri]++; 6870 mutex_exit(&page_capture_hash[index].pchh_mutex); 6871 return (ret); 6872 } 6873 6874 /* 6875 * Otherwise there was a new capture request added to list 6876 * Need to make sure that our original data is represented if 6877 * appropriate. 
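	 * As in page_capture_add_hash(), a retire request takes precedence,
	 * so the existing entry is only overwritten when it is not already
	 * a retire request.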
6878 */ 6879 for (i = 0; i < 2; i++) { 6880 bp2 = page_capture_hash[index].lists[i].next; 6881 while (bp2 != &page_capture_hash[index].lists[i]) { 6882 if (bp2->pp == pp) { 6883 if (bp1->flags & CAPTURE_RETIRE) { 6884 if (!(bp2->flags & CAPTURE_RETIRE)) { 6885 bp2->szc = bp1->szc; 6886 bp2->flags = bp1->flags; 6887 bp2->expires = bp1->expires; 6888 bp2->datap = bp1->datap; 6889 } 6890 } else { 6891 ASSERT(bp1->flags & CAPTURE_PHYSMEM); 6892 if (!(bp2->flags & CAPTURE_RETIRE)) { 6893 bp2->szc = bp1->szc; 6894 bp2->flags = bp1->flags; 6895 bp2->expires = bp1->expires; 6896 bp2->datap = bp1->datap; 6897 } 6898 } 6899 page_capture_hash[index].num_pages[bp2->pri]--; 6900 bp2->pri = PAGE_CAPTURE_PRIO(pp); 6901 page_capture_hash[index].num_pages[bp2->pri]++; 6902 mutex_exit(&page_capture_hash[index]. 6903 pchh_mutex); 6904 kmem_free(bp1, sizeof (*bp1)); 6905 return (ret); 6906 } 6907 bp2 = bp2->next; 6908 } 6909 } 6910 panic("PR_CAPTURE set but not on hash for pp 0x%p\n", (void *)pp); 6911 /*NOTREACHED*/ 6912 } 6913 6914 /* 6915 * Try to capture the given page for the caller specified in the flags 6916 * parameter. The page will either be captured and handed over to the 6917 * appropriate callback, or will be queued up in the page capture hash 6918 * to be captured asynchronously. 6919 * If the current request is due to an async capture, the page must be 6920 * exclusively locked before calling this function. 6921 * Currently szc must be 0 but in the future this should be expandable to 6922 * other page sizes. 6923 * Returns 0 on success, with the following error codes on failure: 6924 * EPERM - The requested page is long term locked, and thus repeated 6925 * requests to capture this page will likely fail. 6926 * ENOMEM - There was not enough free memory in the system to safely 6927 * map the requested page. 6928 * ENOENT - The requested page was inside the kernel cage, and the 6929 * CAPTURE_GET_CAGE flag was not set. 6930 * EAGAIN - The requested page could not be capturead at this point in 6931 * time but future requests will likely work. 6932 * EBUSY - The requested page is retired and the CAPTURE_GET_RETIRED flag 6933 * was not set. 6934 */ 6935 int 6936 page_itrycapture(page_t *pp, uint_t szc, uint_t flags, void *datap) 6937 { 6938 int ret; 6939 int cb_index; 6940 6941 if (flags & CAPTURE_ASYNC) { 6942 ASSERT(PAGE_EXCL(pp)); 6943 goto async; 6944 } 6945 6946 /* Make sure there's enough availrmem ... */ 6947 ret = page_capture_pre_checks(pp, flags); 6948 if (ret != 0) { 6949 return (ret); 6950 } 6951 6952 if (!page_trylock(pp, SE_EXCL)) { 6953 for (cb_index = 0; cb_index < PC_NUM_CALLBACKS; cb_index++) { 6954 if ((flags >> cb_index) & 1) { 6955 break; 6956 } 6957 } 6958 ASSERT(cb_index < PC_NUM_CALLBACKS); 6959 ret = EAGAIN; 6960 /* Special case for retired pages */ 6961 if (PP_RETIRED(pp)) { 6962 if (flags & CAPTURE_GET_RETIRED) { 6963 if (!page_unretire_pp(pp, PR_UNR_TEMP)) { 6964 /* 6965 * Need to set capture bit and add to 6966 * hash so that the page will be 6967 * retired when freed. 
6968 */ 6969 page_capture_add_hash(pp, szc, 6970 CAPTURE_RETIRE, NULL); 6971 ret = 0; 6972 goto own_page; 6973 } 6974 } else { 6975 return (EBUSY); 6976 } 6977 } 6978 page_capture_add_hash(pp, szc, flags, datap); 6979 return (ret); 6980 } 6981 6982 async: 6983 ASSERT(PAGE_EXCL(pp)); 6984 6985 /* Need to check for physmem async requests that availrmem is sane */ 6986 if ((flags & (CAPTURE_ASYNC | CAPTURE_PHYSMEM)) == 6987 (CAPTURE_ASYNC | CAPTURE_PHYSMEM) && 6988 (availrmem < swapfs_minfree)) { 6989 page_unlock(pp); 6990 return (ENOMEM); 6991 } 6992 6993 ret = page_capture_clean_page(pp); 6994 6995 if (ret != 0) { 6996 /* We failed to get the page, so lets add it to the hash */ 6997 if (!(flags & CAPTURE_ASYNC)) { 6998 page_capture_add_hash(pp, szc, flags, datap); 6999 } 7000 return (ret); 7001 } 7002 7003 own_page: 7004 ASSERT(PAGE_EXCL(pp)); 7005 ASSERT(pp->p_szc == 0); 7006 7007 /* Call the callback */ 7008 ret = page_capture_take_action(pp, flags, datap); 7009 7010 if (ret == 0) { 7011 return (0); 7012 } 7013 7014 /* 7015 * Note that in the failure cases from page_capture_take_action, the 7016 * EXCL lock will have already been dropped. 7017 */ 7018 if ((ret == -1) && (!(flags & CAPTURE_ASYNC))) { 7019 page_capture_add_hash(pp, szc, flags, datap); 7020 } 7021 return (EAGAIN); 7022 } 7023 7024 int 7025 page_trycapture(page_t *pp, uint_t szc, uint_t flags, void *datap) 7026 { 7027 int ret; 7028 7029 curthread->t_flag |= T_CAPTURING; 7030 ret = page_itrycapture(pp, szc, flags, datap); 7031 curthread->t_flag &= ~T_CAPTURING; /* xor works as we know its set */ 7032 return (ret); 7033 } 7034 7035 /* 7036 * When unlocking a page which has the PR_CAPTURE bit set, this routine 7037 * gets called to try and capture the page. 7038 */ 7039 void 7040 page_unlock_capture(page_t *pp) 7041 { 7042 page_capture_hash_bucket_t *bp; 7043 int index; 7044 int i; 7045 uint_t szc; 7046 uint_t flags = 0; 7047 void *datap; 7048 kmutex_t *mp; 7049 extern vnode_t retired_pages; 7050 7051 /* 7052 * We need to protect against a possible deadlock here where we own 7053 * the vnode page hash mutex and want to acquire it again as there 7054 * are locations in the code, where we unlock a page while holding 7055 * the mutex which can lead to the page being captured and eventually 7056 * end up here. As we may be hashing out the old page and hashing into 7057 * the retire vnode, we need to make sure we don't own them. 7058 * Other callbacks who do hash operations also need to make sure that 7059 * before they hashin to a vnode that they do not currently own the 7060 * vphm mutex otherwise there will be a panic. 7061 */ 7062 if (mutex_owned(page_vnode_mutex(&retired_pages))) { 7063 page_unlock_nocapture(pp); 7064 return; 7065 } 7066 if (pp->p_vnode != NULL && mutex_owned(page_vnode_mutex(pp->p_vnode))) { 7067 page_unlock_nocapture(pp); 7068 return; 7069 } 7070 7071 index = PAGE_CAPTURE_HASH(pp); 7072 7073 mp = &page_capture_hash[index].pchh_mutex; 7074 mutex_enter(mp); 7075 for (i = 0; i < 2; i++) { 7076 bp = page_capture_hash[index].lists[i].next; 7077 while (bp != &page_capture_hash[index].lists[i]) { 7078 if (bp->pp == pp) { 7079 szc = bp->szc; 7080 flags = bp->flags | CAPTURE_ASYNC; 7081 datap = bp->datap; 7082 mutex_exit(mp); 7083 (void) page_trycapture(pp, szc, flags, datap); 7084 return; 7085 } 7086 bp = bp->next; 7087 } 7088 } 7089 7090 /* Failed to find page in hash so clear flags and unlock it. 
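	 * This can happen if the request was reaped (e.g. it expired) after
	 * we saw PR_CAPTURE set but before we took the bucket mutex.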
*/ 7091 page_clrtoxic(pp, PR_CAPTURE); 7092 page_unlock(pp); 7093 7094 mutex_exit(mp); 7095 } 7096 7097 void 7098 page_capture_init() 7099 { 7100 int i; 7101 for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) { 7102 page_capture_hash[i].lists[0].next = 7103 &page_capture_hash[i].lists[0]; 7104 page_capture_hash[i].lists[0].prev = 7105 &page_capture_hash[i].lists[0]; 7106 page_capture_hash[i].lists[1].next = 7107 &page_capture_hash[i].lists[1]; 7108 page_capture_hash[i].lists[1].prev = 7109 &page_capture_hash[i].lists[1]; 7110 } 7111 7112 pc_thread_shortwait = 23 * hz; 7113 pc_thread_longwait = 1201 * hz; 7114 pc_thread_retry = 3; 7115 mutex_init(&pc_thread_mutex, NULL, MUTEX_DEFAULT, NULL); 7116 cv_init(&pc_cv, NULL, CV_DEFAULT, NULL); 7117 pc_thread_id = thread_create(NULL, 0, page_capture_thread, NULL, 0, &p0, 7118 TS_RUN, minclsyspri); 7119 } 7120 7121 /* 7122 * It is necessary to scrub any failing pages prior to reboot in order to 7123 * prevent a latent error trap from occurring on the next boot. 7124 */ 7125 void 7126 page_retire_mdboot() 7127 { 7128 page_t *pp; 7129 int i, j; 7130 page_capture_hash_bucket_t *bp; 7131 uchar_t pri; 7132 7133 /* walk lists looking for pages to scrub */ 7134 for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) { 7135 for (pri = 0; pri < PC_NUM_PRI; pri++) { 7136 if (page_capture_hash[i].num_pages[pri] != 0) { 7137 break; 7138 } 7139 } 7140 if (pri == PC_NUM_PRI) 7141 continue; 7142 7143 mutex_enter(&page_capture_hash[i].pchh_mutex); 7144 7145 for (j = 0; j < 2; j++) { 7146 bp = page_capture_hash[i].lists[j].next; 7147 while (bp != &page_capture_hash[i].lists[j]) { 7148 pp = bp->pp; 7149 if (PP_TOXIC(pp)) { 7150 if (page_trylock(pp, SE_EXCL)) { 7151 PP_CLRFREE(pp); 7152 pagescrub(pp, 0, PAGESIZE); 7153 page_unlock(pp); 7154 } 7155 } 7156 bp = bp->next; 7157 } 7158 } 7159 mutex_exit(&page_capture_hash[i].pchh_mutex); 7160 } 7161 } 7162 7163 /* 7164 * Walk the page_capture_hash trying to capture pages and also cleanup old 7165 * entries which have expired. 
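 *
 * For each bucket, the walked list (lists[1]) is first spliced back onto
 * the active list (lists[0]) so that previously walked entries get
 * retried; each active entry is then freed if it has expired or its page
 * was deleted, captured if its page can be locked, or moved back to the
 * walked list otherwise.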
7166 */ 7167 void 7168 page_capture_async() 7169 { 7170 page_t *pp; 7171 int i; 7172 int ret; 7173 page_capture_hash_bucket_t *bp1, *bp2; 7174 uint_t szc; 7175 uint_t flags; 7176 void *datap; 7177 uchar_t pri; 7178 7179 /* If there are outstanding pages to be captured, get to work */ 7180 for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) { 7181 for (pri = 0; pri < PC_NUM_PRI; pri++) { 7182 if (page_capture_hash[i].num_pages[pri] != 0) 7183 break; 7184 } 7185 if (pri == PC_NUM_PRI) 7186 continue; 7187 7188 /* Append list 1 to list 0 and then walk through list 0 */ 7189 mutex_enter(&page_capture_hash[i].pchh_mutex); 7190 bp1 = &page_capture_hash[i].lists[1]; 7191 bp2 = bp1->next; 7192 if (bp1 != bp2) { 7193 bp1->prev->next = page_capture_hash[i].lists[0].next; 7194 bp2->prev = &page_capture_hash[i].lists[0]; 7195 page_capture_hash[i].lists[0].next->prev = bp1->prev; 7196 page_capture_hash[i].lists[0].next = bp2; 7197 bp1->next = bp1; 7198 bp1->prev = bp1; 7199 } 7200 7201 /* list[1] will be empty now */ 7202 7203 bp1 = page_capture_hash[i].lists[0].next; 7204 while (bp1 != &page_capture_hash[i].lists[0]) { 7205 /* Check expiration time */ 7206 if ((ddi_get_lbolt() > bp1->expires && 7207 bp1->expires != -1) || 7208 page_deleted(bp1->pp)) { 7209 page_capture_hash[i].lists[0].next = bp1->next; 7210 bp1->next->prev = 7211 &page_capture_hash[i].lists[0]; 7212 page_capture_hash[i].num_pages[bp1->pri]--; 7213 7214 /* 7215 * We can safely remove the PR_CAPTURE bit 7216 * without holding the EXCL lock on the page 7217 * as the PR_CAPTURE bit requres that the 7218 * page_capture_hash[].pchh_mutex be held 7219 * to modify it. 7220 */ 7221 page_clrtoxic(bp1->pp, PR_CAPTURE); 7222 mutex_exit(&page_capture_hash[i].pchh_mutex); 7223 kmem_free(bp1, sizeof (*bp1)); 7224 mutex_enter(&page_capture_hash[i].pchh_mutex); 7225 bp1 = page_capture_hash[i].lists[0].next; 7226 continue; 7227 } 7228 pp = bp1->pp; 7229 szc = bp1->szc; 7230 flags = bp1->flags; 7231 datap = bp1->datap; 7232 mutex_exit(&page_capture_hash[i].pchh_mutex); 7233 if (page_trylock(pp, SE_EXCL)) { 7234 ret = page_trycapture(pp, szc, 7235 flags | CAPTURE_ASYNC, datap); 7236 } else { 7237 ret = 1; /* move to walked hash */ 7238 } 7239 7240 if (ret != 0) { 7241 /* Move to walked hash */ 7242 (void) page_capture_move_to_walked(pp); 7243 } 7244 mutex_enter(&page_capture_hash[i].pchh_mutex); 7245 bp1 = page_capture_hash[i].lists[0].next; 7246 } 7247 7248 mutex_exit(&page_capture_hash[i].pchh_mutex); 7249 } 7250 } 7251 7252 /* 7253 * This function is called by the page_capture_thread, and is needed in 7254 * in order to initiate aio cleanup, so that pages used in aio 7255 * will be unlocked and subsequently retired by page_capture_thread. 7256 */ 7257 static int 7258 do_aio_cleanup(void) 7259 { 7260 proc_t *procp; 7261 int (*aio_cleanup_dr_delete_memory)(proc_t *); 7262 int cleaned = 0; 7263 7264 if (modload("sys", "kaio") == -1) { 7265 cmn_err(CE_WARN, "do_aio_cleanup: cannot load kaio"); 7266 return (0); 7267 } 7268 /* 7269 * We use the aio_cleanup_dr_delete_memory function to 7270 * initiate the actual clean up; this function will wake 7271 * up the per-process aio_cleanup_thread. 
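	 * The symbol is resolved at run time with modgetsymvalue() so that
	 * this code has no hard dependency on the kaio module being present.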
7272 */ 7273 aio_cleanup_dr_delete_memory = (int (*)(proc_t *)) 7274 modgetsymvalue("aio_cleanup_dr_delete_memory", 0); 7275 if (aio_cleanup_dr_delete_memory == NULL) { 7276 cmn_err(CE_WARN, 7277 "aio_cleanup_dr_delete_memory not found in kaio"); 7278 return (0); 7279 } 7280 mutex_enter(&pidlock); 7281 for (procp = practive; (procp != NULL); procp = procp->p_next) { 7282 mutex_enter(&procp->p_lock); 7283 if (procp->p_aio != NULL) { 7284 /* cleanup proc's outstanding kaio */ 7285 cleaned += (*aio_cleanup_dr_delete_memory)(procp); 7286 } 7287 mutex_exit(&procp->p_lock); 7288 } 7289 mutex_exit(&pidlock); 7290 return (cleaned); 7291 } 7292 7293 /* 7294 * helper function for page_capture_thread 7295 */ 7296 static void 7297 page_capture_handle_outstanding(void) 7298 { 7299 int ntry; 7300 7301 /* Reap pages before attempting capture pages */ 7302 kmem_reap(); 7303 7304 if ((page_retire_pend_count() > page_retire_pend_kas_count()) && 7305 hat_supported(HAT_DYNAMIC_ISM_UNMAP, (void *)0)) { 7306 /* 7307 * Note: Purging only for platforms that support 7308 * ISM hat_pageunload() - mainly SPARC. On x86/x64 7309 * platforms ISM pages SE_SHARED locked until destroyed. 7310 */ 7311 7312 /* disable and purge seg_pcache */ 7313 (void) seg_p_disable(); 7314 for (ntry = 0; ntry < pc_thread_retry; ntry++) { 7315 if (!page_retire_pend_count()) 7316 break; 7317 if (do_aio_cleanup()) { 7318 /* 7319 * allow the apps cleanup threads 7320 * to run 7321 */ 7322 delay(pc_thread_shortwait); 7323 } 7324 page_capture_async(); 7325 } 7326 /* reenable seg_pcache */ 7327 seg_p_enable(); 7328 7329 /* completed what can be done. break out */ 7330 return; 7331 } 7332 7333 /* 7334 * For kernel pages and/or unsupported HAT_DYNAMIC_ISM_UNMAP, reap 7335 * and then attempt to capture. 7336 */ 7337 seg_preap(); 7338 page_capture_async(); 7339 } 7340 7341 /* 7342 * The page_capture_thread loops forever, looking to see if there are 7343 * pages still waiting to be captured. 7344 */ 7345 static void 7346 page_capture_thread(void) 7347 { 7348 callb_cpr_t c; 7349 int i; 7350 int high_pri_pages; 7351 int low_pri_pages; 7352 clock_t timeout; 7353 7354 CALLB_CPR_INIT(&c, &pc_thread_mutex, callb_generic_cpr, "page_capture"); 7355 7356 mutex_enter(&pc_thread_mutex); 7357 for (;;) { 7358 high_pri_pages = 0; 7359 low_pri_pages = 0; 7360 for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) { 7361 high_pri_pages += 7362 page_capture_hash[i].num_pages[PC_PRI_HI]; 7363 low_pri_pages += 7364 page_capture_hash[i].num_pages[PC_PRI_LO]; 7365 } 7366 7367 timeout = pc_thread_longwait; 7368 if (high_pri_pages != 0) { 7369 timeout = pc_thread_shortwait; 7370 page_capture_handle_outstanding(); 7371 } else if (low_pri_pages != 0) { 7372 page_capture_async(); 7373 } 7374 CALLB_CPR_SAFE_BEGIN(&c); 7375 (void) cv_reltimedwait(&pc_cv, &pc_thread_mutex, 7376 timeout, TR_CLOCK_TICK); 7377 CALLB_CPR_SAFE_END(&c, &pc_thread_mutex); 7378 } 7379 /*NOTREACHED*/ 7380 } 7381 /* 7382 * Attempt to locate a bucket that has enough pages to satisfy the request. 7383 * The initial check is done without the lock to avoid unneeded contention. 7384 * The function returns 1 if enough pages were found, else 0 if it could not 7385 * find enough pages in a bucket. 7386 */ 7387 static int 7388 pcf_decrement_bucket(pgcnt_t npages) 7389 { 7390 struct pcf *p; 7391 struct pcf *q; 7392 int i; 7393 7394 p = &pcf[PCF_INDEX()]; 7395 q = &pcf[pcf_fanout]; 7396 for (i = 0; i < pcf_fanout; i++) { 7397 if (p->pcf_count > npages) { 7398 /* 7399 * a good one to try. 
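			 * The count is rechecked under pcf_lock below,
			 * since the check above was made without the lock
			 * and may have raced with another consumer.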
7400 */ 7401 mutex_enter(&p->pcf_lock); 7402 if (p->pcf_count > npages) { 7403 p->pcf_count -= (uint_t)npages; 7404 /* 7405 * freemem is not protected by any lock. 7406 * Thus, we cannot have any assertion 7407 * containing freemem here. 7408 */ 7409 freemem -= npages; 7410 mutex_exit(&p->pcf_lock); 7411 return (1); 7412 } 7413 mutex_exit(&p->pcf_lock); 7414 } 7415 p++; 7416 if (p >= q) { 7417 p = pcf; 7418 } 7419 } 7420 return (0); 7421 } 7422 7423 /* 7424 * Arguments: 7425 * pcftotal_ret: If the value is not NULL and we have walked all the 7426 * buckets but did not find enough pages then it will 7427 * be set to the total number of pages in all the pcf 7428 * buckets. 7429 * npages: Is the number of pages we have been requested to 7430 * find. 7431 * unlock: If set to 0 we will leave the buckets locked if the 7432 * requested number of pages are not found. 7433 * 7434 * Go and try to satisfy the page request from any number of buckets. 7435 * This can be a very expensive operation as we have to lock the buckets 7436 * we are checking (and keep them locked), starting at bucket 0. 7437 * 7438 * The function returns 1 if enough pages were found, else 0 if it could not 7439 * find enough pages in the buckets. 7440 * 7441 */ 7442 static int 7443 pcf_decrement_multiple(pgcnt_t *pcftotal_ret, pgcnt_t npages, int unlock) 7444 { 7445 struct pcf *p; 7446 pgcnt_t pcftotal; 7447 int i; 7448 7449 p = pcf; 7450 /* try to collect pages from several pcf bins */ 7451 for (pcftotal = 0, i = 0; i < pcf_fanout; i++) { 7452 mutex_enter(&p->pcf_lock); 7453 pcftotal += p->pcf_count; 7454 if (pcftotal >= npages) { 7455 /* 7456 * Wow! There are enough pages laying around 7457 * to satisfy the request. Do the accounting, 7458 * drop the locks we acquired, and go back. 7459 * 7460 * freemem is not protected by any lock. So, 7461 * we cannot have any assertion containing 7462 * freemem. 7463 */ 7464 freemem -= npages; 7465 while (p >= pcf) { 7466 if (p->pcf_count <= npages) { 7467 npages -= p->pcf_count; 7468 p->pcf_count = 0; 7469 } else { 7470 p->pcf_count -= (uint_t)npages; 7471 npages = 0; 7472 } 7473 mutex_exit(&p->pcf_lock); 7474 p--; 7475 } 7476 ASSERT(npages == 0); 7477 return (1); 7478 } 7479 p++; 7480 } 7481 if (unlock) { 7482 /* failed to collect pages - release the locks */ 7483 while (--p >= pcf) { 7484 mutex_exit(&p->pcf_lock); 7485 } 7486 } 7487 if (pcftotal_ret != NULL) 7488 *pcftotal_ret = pcftotal; 7489 return (0); 7490 } 7491
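/*
 * To illustrate the fanout idea used by pcf_decrement_bucket() and
 * pcf_decrement_multiple() above, the disabled block below sketches a
 * minimal user-level analogue built on pthread mutexes: a fast path that
 * tries one hashed bucket at a time while holding a single lock, and a
 * slow path that locks buckets in order from index 0 and accumulates
 * pages so that concurrent slow-path callers cannot deadlock.  Every
 * toy_* name is invented for the sketch; nothing in it is kernel code.
 */
#if 0	/* illustrative sketch only -- never compiled */
#include <pthread.h>
#include <stdio.h>

#define	TOY_FANOUT	8

struct toy_pcf {
	pthread_mutex_t	lock;
	unsigned int	count;
};

static struct toy_pcf toy_pcf[TOY_FANOUT];

/*
 * Fast path: try one bucket at a time, starting at a caller supplied
 * index (the kernel hashes the cpu id), holding only one lock at once.
 */
static int
toy_decrement_bucket(unsigned int start, unsigned int npages)
{
	struct toy_pcf *p;
	int i;

	for (i = 0; i < TOY_FANOUT; i++) {
		p = &toy_pcf[(start + i) % TOY_FANOUT];
		pthread_mutex_lock(&p->lock);
		if (p->count > npages) {
			p->count -= npages;
			pthread_mutex_unlock(&p->lock);
			return (1);
		}
		pthread_mutex_unlock(&p->lock);
	}
	return (0);
}

/*
 * Slow path: lock buckets in order from 0, accumulating pages from
 * several buckets, and only drop the locks once the request has been
 * satisfied or every bucket has been checked.
 */
static int
toy_decrement_multiple(unsigned int npages)
{
	unsigned int total = 0;
	int i, j;

	for (i = 0; i < TOY_FANOUT; i++) {
		pthread_mutex_lock(&toy_pcf[i].lock);
		total += toy_pcf[i].count;
		if (total >= npages) {
			/* enough pages found: take them, drop the locks */
			for (j = i; j >= 0; j--) {
				if (toy_pcf[j].count <= npages) {
					npages -= toy_pcf[j].count;
					toy_pcf[j].count = 0;
				} else {
					toy_pcf[j].count -= npages;
					npages = 0;
				}
				pthread_mutex_unlock(&toy_pcf[j].lock);
			}
			return (1);
		}
	}
	/* not enough pages in all buckets combined: release every lock */
	for (j = TOY_FANOUT - 1; j >= 0; j--)
		pthread_mutex_unlock(&toy_pcf[j].lock);
	return (0);
}

int
main(void)
{
	int i;

	for (i = 0; i < TOY_FANOUT; i++) {
		pthread_mutex_init(&toy_pcf[i].lock, NULL);
		toy_pcf[i].count = 10;
	}
	printf("fast path, 5 pages:  %d\n", toy_decrement_bucket(3, 5));
	printf("slow path, 60 pages: %d\n", toy_decrement_multiple(60));
	return (0);
}
#endif	/* illustrative sketch only */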