1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ 27 /* All Rights Reserved */ 28 29 /* 30 * University Copyright- Copyright (c) 1982, 1986, 1988 31 * The Regents of the University of California 32 * All Rights Reserved 33 * 34 * University Acknowledgment- Portions of this document are derived from 35 * software developed by the University of California, Berkeley, and its 36 * contributors. 37 */ 38 39 #pragma ident "%Z%%M% %I% %E% SMI" 40 41 /* 42 * VM - physical page management. 
43 */ 44 45 #include <sys/types.h> 46 #include <sys/t_lock.h> 47 #include <sys/param.h> 48 #include <sys/systm.h> 49 #include <sys/errno.h> 50 #include <sys/time.h> 51 #include <sys/vnode.h> 52 #include <sys/vm.h> 53 #include <sys/vtrace.h> 54 #include <sys/swap.h> 55 #include <sys/cmn_err.h> 56 #include <sys/tuneable.h> 57 #include <sys/sysmacros.h> 58 #include <sys/cpuvar.h> 59 #include <sys/callb.h> 60 #include <sys/debug.h> 61 #include <sys/tnf_probe.h> 62 #include <sys/condvar_impl.h> 63 #include <sys/mem_config.h> 64 #include <sys/mem_cage.h> 65 #include <sys/kmem.h> 66 #include <sys/atomic.h> 67 #include <sys/strlog.h> 68 #include <sys/mman.h> 69 #include <sys/ontrap.h> 70 #include <sys/lgrp.h> 71 #include <sys/vfs.h> 72 73 #include <vm/hat.h> 74 #include <vm/anon.h> 75 #include <vm/page.h> 76 #include <vm/seg.h> 77 #include <vm/pvn.h> 78 #include <vm/seg_kmem.h> 79 #include <vm/vm_dep.h> 80 #include <sys/vm_usage.h> 81 #include <fs/fs_subr.h> 82 83 static int nopageage = 0; 84 85 static pgcnt_t max_page_get; /* max page_get request size in pages */ 86 pgcnt_t total_pages = 0; /* total number of pages (used by /proc) */ 87 88 /* 89 * freemem_lock protects all freemem variables: 90 * availrmem. Also this lock protects the globals which track the 91 * availrmem changes for accurate kernel footprint calculation. 92 * See below for an explanation of these 93 * globals. 94 */ 95 kmutex_t freemem_lock; 96 pgcnt_t availrmem; 97 pgcnt_t availrmem_initial; 98 99 /* 100 * These globals track availrmem changes to get a more accurate 101 * estimate of tke kernel size. Historically pp_kernel is used for 102 * kernel size and is based on availrmem. But availrmem is adjusted for 103 * locked pages in the system not just for kernel locked pages. 104 * These new counters will track the pages locked through segvn and 105 * by explicit user locking. 106 * 107 * segvn_pages_locked : This keeps track on a global basis how many pages 108 * are currently locked because of I/O. 
109 * 110 * pages_locked : How many pages are locked becuase of user specified 111 * locking through mlock or plock. 112 * 113 * pages_useclaim,pages_claimed : These two variables track the 114 * cliam adjustments because of the protection changes on a segvn segment. 115 * 116 * All these globals are protected by the same lock which protects availrmem. 117 */ 118 pgcnt_t segvn_pages_locked; 119 pgcnt_t pages_locked; 120 pgcnt_t pages_useclaim; 121 pgcnt_t pages_claimed; 122 123 124 /* 125 * new_freemem_lock protects freemem, freemem_wait & freemem_cv. 126 */ 127 static kmutex_t new_freemem_lock; 128 static uint_t freemem_wait; /* someone waiting for freemem */ 129 static kcondvar_t freemem_cv; 130 131 /* 132 * The logical page free list is maintained as two lists, the 'free' 133 * and the 'cache' lists. 134 * The free list contains those pages that should be reused first. 135 * 136 * The implementation of the lists is machine dependent. 137 * page_get_freelist(), page_get_cachelist(), 138 * page_list_sub(), and page_list_add() 139 * form the interface to the machine dependent implementation. 140 * 141 * Pages with p_free set are on the cache list. 142 * Pages with p_free and p_age set are on the free list, 143 * 144 * A page may be locked while on either list. 145 */ 146 147 /* 148 * free list accounting stuff. 149 * 150 * 151 * Spread out the value for the number of pages on the 152 * page free and page cache lists. If there is just one 153 * value, then it must be under just one lock. 154 * The lock contention and cache traffic are a real bother. 155 * 156 * When we acquire and then drop a single pcf lock 157 * we can start in the middle of the array of pcf structures. 158 * If we acquire more than one pcf lock at a time, we need to 159 * start at the front to avoid deadlocking. 160 * 161 * pcf_count holds the number of pages in each pool. 
162 * 163 * pcf_block is set when page_create_get_something() has asked the 164 * PSM page freelist and page cachelist routines without specifying 165 * a color and nothing came back. This is used to block anything 166 * else from moving pages from one list to the other while the 167 * lists are searched again. If a page is freeed while pcf_block is 168 * set, then pcf_reserve is incremented. pcgs_unblock() takes care 169 * of clearning pcf_block, doing the wakeups, etc. 170 */ 171 172 #if NCPU <= 4 173 #define PAD 2 174 #define PCF_FANOUT 4 175 static uint_t pcf_mask = PCF_FANOUT - 1; 176 #else 177 #define PAD 10 178 #ifdef sun4v 179 #define PCF_FANOUT 32 180 #else 181 #define PCF_FANOUT 128 182 #endif 183 static uint_t pcf_mask = PCF_FANOUT - 1; 184 #endif 185 186 struct pcf { 187 kmutex_t pcf_lock; /* protects the structure */ 188 uint_t pcf_count; /* page count */ 189 uint_t pcf_wait; /* number of waiters */ 190 uint_t pcf_block; /* pcgs flag to page_free() */ 191 uint_t pcf_reserve; /* pages freed after pcf_block set */ 192 uint_t pcf_fill[PAD]; /* to line up on the caches */ 193 }; 194 195 static struct pcf pcf[PCF_FANOUT]; 196 #define PCF_INDEX() ((CPU->cpu_id) & (pcf_mask)) 197 198 kmutex_t pcgs_lock; /* serializes page_create_get_ */ 199 kmutex_t pcgs_cagelock; /* serializes NOSLEEP cage allocs */ 200 kmutex_t pcgs_wait_lock; /* used for delay in pcgs */ 201 static kcondvar_t pcgs_cv; /* cv for delay in pcgs */ 202 203 #ifdef VM_STATS 204 205 /* 206 * No locks, but so what, they are only statistics. 
207 */ 208 209 static struct page_tcnt { 210 int pc_free_cache; /* free's into cache list */ 211 int pc_free_dontneed; /* free's with dontneed */ 212 int pc_free_pageout; /* free's from pageout */ 213 int pc_free_free; /* free's into free list */ 214 int pc_free_pages; /* free's into large page free list */ 215 int pc_destroy_pages; /* large page destroy's */ 216 int pc_get_cache; /* get's from cache list */ 217 int pc_get_free; /* get's from free list */ 218 int pc_reclaim; /* reclaim's */ 219 int pc_abortfree; /* abort's of free pages */ 220 int pc_find_hit; /* find's that find page */ 221 int pc_find_miss; /* find's that don't find page */ 222 int pc_destroy_free; /* # of free pages destroyed */ 223 #define PC_HASH_CNT (4*PAGE_HASHAVELEN) 224 int pc_find_hashlen[PC_HASH_CNT+1]; 225 int pc_addclaim_pages; 226 int pc_subclaim_pages; 227 int pc_free_replacement_page[2]; 228 int pc_try_demote_pages[6]; 229 int pc_demote_pages[2]; 230 } pagecnt; 231 232 uint_t hashin_count; 233 uint_t hashin_not_held; 234 uint_t hashin_already; 235 236 uint_t hashout_count; 237 uint_t hashout_not_held; 238 239 uint_t page_create_count; 240 uint_t page_create_not_enough; 241 uint_t page_create_not_enough_again; 242 uint_t page_create_zero; 243 uint_t page_create_hashout; 244 uint_t page_create_page_lock_failed; 245 uint_t page_create_trylock_failed; 246 uint_t page_create_found_one; 247 uint_t page_create_hashin_failed; 248 uint_t page_create_dropped_phm; 249 250 uint_t page_create_new; 251 uint_t page_create_exists; 252 uint_t page_create_putbacks; 253 uint_t page_create_overshoot; 254 255 uint_t page_reclaim_zero; 256 uint_t page_reclaim_zero_locked; 257 258 uint_t page_rename_exists; 259 uint_t page_rename_count; 260 261 uint_t page_lookup_cnt[20]; 262 uint_t page_lookup_nowait_cnt[10]; 263 uint_t page_find_cnt; 264 uint_t page_exists_cnt; 265 uint_t page_exists_forreal_cnt; 266 uint_t page_lookup_dev_cnt; 267 uint_t get_cachelist_cnt; 268 uint_t page_create_cnt[10]; 269 uint_t 
alloc_pages[8]; 270 uint_t page_exphcontg[19]; 271 uint_t page_create_large_cnt[10]; 272 273 /* 274 * Collects statistics. 275 */ 276 #define PAGE_HASH_SEARCH(index, pp, vp, off) { \ 277 uint_t mylen = 0; \ 278 \ 279 for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash, mylen++) { \ 280 if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \ 281 break; \ 282 } \ 283 if ((pp) != NULL) \ 284 pagecnt.pc_find_hit++; \ 285 else \ 286 pagecnt.pc_find_miss++; \ 287 if (mylen > PC_HASH_CNT) \ 288 mylen = PC_HASH_CNT; \ 289 pagecnt.pc_find_hashlen[mylen]++; \ 290 } 291 292 #else /* VM_STATS */ 293 294 /* 295 * Don't collect statistics 296 */ 297 #define PAGE_HASH_SEARCH(index, pp, vp, off) { \ 298 for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash) { \ 299 if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \ 300 break; \ 301 } \ 302 } 303 304 #endif /* VM_STATS */ 305 306 307 308 #ifdef DEBUG 309 #define MEMSEG_SEARCH_STATS 310 #endif 311 312 #ifdef MEMSEG_SEARCH_STATS 313 struct memseg_stats { 314 uint_t nsearch; 315 uint_t nlastwon; 316 uint_t nhashwon; 317 uint_t nnotfound; 318 } memseg_stats; 319 320 #define MEMSEG_STAT_INCR(v) \ 321 atomic_add_32(&memseg_stats.v, 1) 322 #else 323 #define MEMSEG_STAT_INCR(x) 324 #endif 325 326 struct memseg *memsegs; /* list of memory segments */ 327 328 329 static void page_init_mem_config(void); 330 static int page_do_hashin(page_t *, vnode_t *, u_offset_t); 331 static void page_do_hashout(page_t *); 332 333 static void page_demote_vp_pages(page_t *); 334 335 /* 336 * vm subsystem related initialization 337 */ 338 void 339 vm_init(void) 340 { 341 boolean_t callb_vm_cpr(void *, int); 342 343 (void) callb_add(callb_vm_cpr, 0, CB_CL_CPR_VM, "vm"); 344 page_init_mem_config(); 345 page_retire_init(); 346 vm_usage_init(); 347 } 348 349 /* 350 * This function is called at startup and when memory is added or deleted. 
351 */ 352 void 353 init_pages_pp_maximum() 354 { 355 static pgcnt_t p_min; 356 static pgcnt_t pages_pp_maximum_startup; 357 static pgcnt_t avrmem_delta; 358 static int init_done; 359 static int user_set; /* true if set in /etc/system */ 360 361 if (init_done == 0) { 362 363 /* If the user specified a value, save it */ 364 if (pages_pp_maximum != 0) { 365 user_set = 1; 366 pages_pp_maximum_startup = pages_pp_maximum; 367 } 368 369 /* 370 * Setting of pages_pp_maximum is based first time 371 * on the value of availrmem just after the start-up 372 * allocations. To preserve this relationship at run 373 * time, use a delta from availrmem_initial. 374 */ 375 ASSERT(availrmem_initial >= availrmem); 376 avrmem_delta = availrmem_initial - availrmem; 377 378 /* The allowable floor of pages_pp_maximum */ 379 p_min = tune.t_minarmem + 100; 380 381 /* Make sure we don't come through here again. */ 382 init_done = 1; 383 } 384 /* 385 * Determine pages_pp_maximum, the number of currently available 386 * pages (availrmem) that can't be `locked'. If not set by 387 * the user, we set it to 4% of the currently available memory 388 * plus 4MB. 389 * But we also insist that it be greater than tune.t_minarmem; 390 * otherwise a process could lock down a lot of memory, get swapped 391 * out, and never have enough to get swapped back in. 
392 */ 393 if (user_set) 394 pages_pp_maximum = pages_pp_maximum_startup; 395 else 396 pages_pp_maximum = ((availrmem_initial - avrmem_delta) / 25) 397 + btop(4 * 1024 * 1024); 398 399 if (pages_pp_maximum <= p_min) { 400 pages_pp_maximum = p_min; 401 } 402 } 403 404 void 405 set_max_page_get(pgcnt_t target_total_pages) 406 { 407 max_page_get = target_total_pages / 2; 408 } 409 410 static pgcnt_t pending_delete; 411 412 /*ARGSUSED*/ 413 static void 414 page_mem_config_post_add( 415 void *arg, 416 pgcnt_t delta_pages) 417 { 418 set_max_page_get(total_pages - pending_delete); 419 init_pages_pp_maximum(); 420 } 421 422 /*ARGSUSED*/ 423 static int 424 page_mem_config_pre_del( 425 void *arg, 426 pgcnt_t delta_pages) 427 { 428 pgcnt_t nv; 429 430 nv = atomic_add_long_nv(&pending_delete, (spgcnt_t)delta_pages); 431 set_max_page_get(total_pages - nv); 432 return (0); 433 } 434 435 /*ARGSUSED*/ 436 static void 437 page_mem_config_post_del( 438 void *arg, 439 pgcnt_t delta_pages, 440 int cancelled) 441 { 442 pgcnt_t nv; 443 444 nv = atomic_add_long_nv(&pending_delete, -(spgcnt_t)delta_pages); 445 set_max_page_get(total_pages - nv); 446 if (!cancelled) 447 init_pages_pp_maximum(); 448 } 449 450 static kphysm_setup_vector_t page_mem_config_vec = { 451 KPHYSM_SETUP_VECTOR_VERSION, 452 page_mem_config_post_add, 453 page_mem_config_pre_del, 454 page_mem_config_post_del, 455 }; 456 457 static void 458 page_init_mem_config(void) 459 { 460 int ret; 461 462 ret = kphysm_setup_func_register(&page_mem_config_vec, (void *)NULL); 463 ASSERT(ret == 0); 464 } 465 466 /* 467 * Evenly spread out the PCF counters for large free pages 468 */ 469 static void 470 page_free_large_ctr(pgcnt_t npages) 471 { 472 static struct pcf *p = pcf; 473 pgcnt_t lump; 474 475 freemem += npages; 476 477 lump = roundup(npages, PCF_FANOUT) / PCF_FANOUT; 478 479 while (npages > 0) { 480 481 ASSERT(!p->pcf_block); 482 483 if (lump < npages) { 484 p->pcf_count += (uint_t)lump; 485 npages -= lump; 486 } else { 487 
p->pcf_count += (uint_t)npages; 488 npages = 0; 489 } 490 491 ASSERT(!p->pcf_wait); 492 493 if (++p > &pcf[PCF_FANOUT - 1]) 494 p = pcf; 495 } 496 497 ASSERT(npages == 0); 498 } 499 500 /* 501 * Add a physical chunk of memory to the system freee lists during startup. 502 * Platform specific startup() allocates the memory for the page structs. 503 * 504 * num - number of page structures 505 * base - page number (pfn) to be associated with the first page. 506 * 507 * Since we are doing this during startup (ie. single threaded), we will 508 * use shortcut routines to avoid any locking overhead while putting all 509 * these pages on the freelists. 510 * 511 * NOTE: Any changes performed to page_free(), must also be performed to 512 * add_physmem() since this is how we initialize all page_t's at 513 * boot time. 514 */ 515 void 516 add_physmem( 517 page_t *pp, 518 pgcnt_t num, 519 pfn_t pnum) 520 { 521 page_t *root = NULL; 522 uint_t szc = page_num_pagesizes() - 1; 523 pgcnt_t large = page_get_pagecnt(szc); 524 pgcnt_t cnt = 0; 525 526 TRACE_2(TR_FAC_VM, TR_PAGE_INIT, 527 "add_physmem:pp %p num %lu", pp, num); 528 529 /* 530 * Arbitrarily limit the max page_get request 531 * to 1/2 of the page structs we have. 532 */ 533 total_pages += num; 534 set_max_page_get(total_pages); 535 536 PLCNT_MODIFY_MAX(pnum, (long)num); 537 538 /* 539 * The physical space for the pages array 540 * representing ram pages has already been 541 * allocated. Here we initialize each lock 542 * in the page structure, and put each on 543 * the free list 544 */ 545 for (; num; pp++, pnum++, num--) { 546 547 /* 548 * this needs to fill in the page number 549 * and do any other arch specific initialization 550 */ 551 add_physmem_cb(pp, pnum); 552 553 pp->p_lckcnt = 0; 554 pp->p_cowcnt = 0; 555 pp->p_slckcnt = 0; 556 557 /* 558 * Initialize the page lock as unlocked, since nobody 559 * can see or access this page yet. 
560 */ 561 pp->p_selock = 0; 562 563 /* 564 * Initialize IO lock 565 */ 566 page_iolock_init(pp); 567 568 /* 569 * initialize other fields in the page_t 570 */ 571 PP_SETFREE(pp); 572 page_clr_all_props(pp); 573 PP_SETAGED(pp); 574 pp->p_offset = (u_offset_t)-1; 575 pp->p_next = pp; 576 pp->p_prev = pp; 577 578 /* 579 * Simple case: System doesn't support large pages. 580 */ 581 if (szc == 0) { 582 pp->p_szc = 0; 583 page_free_at_startup(pp); 584 continue; 585 } 586 587 /* 588 * Handle unaligned pages, we collect them up onto 589 * the root page until we have a full large page. 590 */ 591 if (!IS_P2ALIGNED(pnum, large)) { 592 593 /* 594 * If not in a large page, 595 * just free as small page. 596 */ 597 if (root == NULL) { 598 pp->p_szc = 0; 599 page_free_at_startup(pp); 600 continue; 601 } 602 603 /* 604 * Link a constituent page into the large page. 605 */ 606 pp->p_szc = szc; 607 page_list_concat(&root, &pp); 608 609 /* 610 * When large page is fully formed, free it. 611 */ 612 if (++cnt == large) { 613 page_free_large_ctr(cnt); 614 page_list_add_pages(root, PG_LIST_ISINIT); 615 root = NULL; 616 cnt = 0; 617 } 618 continue; 619 } 620 621 /* 622 * At this point we have a page number which 623 * is aligned. We assert that we aren't already 624 * in a different large page. 625 */ 626 ASSERT(IS_P2ALIGNED(pnum, large)); 627 ASSERT(root == NULL && cnt == 0); 628 629 /* 630 * If insufficient number of pages left to form 631 * a large page, just free the small page. 632 */ 633 if (num < large) { 634 pp->p_szc = 0; 635 page_free_at_startup(pp); 636 continue; 637 } 638 639 /* 640 * Otherwise start a new large page. 641 */ 642 pp->p_szc = szc; 643 cnt++; 644 root = pp; 645 } 646 ASSERT(root == NULL && cnt == 0); 647 } 648 649 /* 650 * Find a page representing the specified [vp, offset]. 651 * If we find the page but it is intransit coming in, 652 * it will have an "exclusive" lock and we wait for 653 * the i/o to complete. 
A page found on the free list 654 * is always reclaimed and then locked. On success, the page 655 * is locked, its data is valid and it isn't on the free 656 * list, while a NULL is returned if the page doesn't exist. 657 */ 658 page_t * 659 page_lookup(vnode_t *vp, u_offset_t off, se_t se) 660 { 661 return (page_lookup_create(vp, off, se, NULL, NULL, 0)); 662 } 663 664 /* 665 * Find a page representing the specified [vp, offset]. 666 * We either return the one we found or, if passed in, 667 * create one with identity of [vp, offset] of the 668 * pre-allocated page. If we find exsisting page but it is 669 * intransit coming in, it will have an "exclusive" lock 670 * and we wait for the i/o to complete. A page found on 671 * the free list is always reclaimed and then locked. 672 * On success, the page is locked, its data is valid and 673 * it isn't on the free list, while a NULL is returned 674 * if the page doesn't exist and newpp is NULL; 675 */ 676 page_t * 677 page_lookup_create( 678 vnode_t *vp, 679 u_offset_t off, 680 se_t se, 681 page_t *newpp, 682 spgcnt_t *nrelocp, 683 int flags) 684 { 685 page_t *pp; 686 kmutex_t *phm; 687 ulong_t index; 688 uint_t hash_locked; 689 uint_t es; 690 691 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); 692 VM_STAT_ADD(page_lookup_cnt[0]); 693 ASSERT(newpp ? PAGE_EXCL(newpp) : 1); 694 695 /* 696 * Acquire the appropriate page hash lock since 697 * we have to search the hash list. Pages that 698 * hash to this list can't change identity while 699 * this lock is held. 700 */ 701 hash_locked = 0; 702 index = PAGE_HASH_FUNC(vp, off); 703 phm = NULL; 704 top: 705 PAGE_HASH_SEARCH(index, pp, vp, off); 706 if (pp != NULL) { 707 VM_STAT_ADD(page_lookup_cnt[1]); 708 es = (newpp != NULL) ? 1 : 0; 709 es |= flags; 710 if (!hash_locked) { 711 VM_STAT_ADD(page_lookup_cnt[2]); 712 if (!page_try_reclaim_lock(pp, se, es)) { 713 /* 714 * On a miss, acquire the phm. 
Then 715 * next time, page_lock() will be called, 716 * causing a wait if the page is busy. 717 * just looping with page_trylock() would 718 * get pretty boring. 719 */ 720 VM_STAT_ADD(page_lookup_cnt[3]); 721 phm = PAGE_HASH_MUTEX(index); 722 mutex_enter(phm); 723 hash_locked = 1; 724 goto top; 725 } 726 } else { 727 VM_STAT_ADD(page_lookup_cnt[4]); 728 if (!page_lock_es(pp, se, phm, P_RECLAIM, es)) { 729 VM_STAT_ADD(page_lookup_cnt[5]); 730 goto top; 731 } 732 } 733 734 /* 735 * Since `pp' is locked it can not change identity now. 736 * Reconfirm we locked the correct page. 737 * 738 * Both the p_vnode and p_offset *must* be cast volatile 739 * to force a reload of their values: The PAGE_HASH_SEARCH 740 * macro will have stuffed p_vnode and p_offset into 741 * registers before calling page_trylock(); another thread, 742 * actually holding the hash lock, could have changed the 743 * page's identity in memory, but our registers would not 744 * be changed, fooling the reconfirmation. If the hash 745 * lock was held during the search, the casting would 746 * not be needed. 747 */ 748 VM_STAT_ADD(page_lookup_cnt[6]); 749 if (((volatile struct vnode *)(pp->p_vnode) != vp) || 750 ((volatile u_offset_t)(pp->p_offset) != off)) { 751 VM_STAT_ADD(page_lookup_cnt[7]); 752 if (hash_locked) { 753 panic("page_lookup_create: lost page %p", 754 (void *)pp); 755 /*NOTREACHED*/ 756 } 757 page_unlock(pp); 758 phm = PAGE_HASH_MUTEX(index); 759 mutex_enter(phm); 760 hash_locked = 1; 761 goto top; 762 } 763 764 /* 765 * If page_trylock() was called, then pp may still be on 766 * the cachelist (can't be on the free list, it would not 767 * have been found in the search). If it is on the 768 * cachelist it must be pulled now. To pull the page from 769 * the cachelist, it must be exclusively locked. 
770 * 771 * The other big difference between page_trylock() and 772 * page_lock(), is that page_lock() will pull the 773 * page from whatever free list (the cache list in this 774 * case) the page is on. If page_trylock() was used 775 * above, then we have to do the reclaim ourselves. 776 */ 777 if ((!hash_locked) && (PP_ISFREE(pp))) { 778 ASSERT(PP_ISAGED(pp) == 0); 779 VM_STAT_ADD(page_lookup_cnt[8]); 780 781 /* 782 * page_relcaim will insure that we 783 * have this page exclusively 784 */ 785 786 if (!page_reclaim(pp, NULL)) { 787 /* 788 * Page_reclaim dropped whatever lock 789 * we held. 790 */ 791 VM_STAT_ADD(page_lookup_cnt[9]); 792 phm = PAGE_HASH_MUTEX(index); 793 mutex_enter(phm); 794 hash_locked = 1; 795 goto top; 796 } else if (se == SE_SHARED && newpp == NULL) { 797 VM_STAT_ADD(page_lookup_cnt[10]); 798 page_downgrade(pp); 799 } 800 } 801 802 if (hash_locked) { 803 mutex_exit(phm); 804 } 805 806 if (newpp != NULL && pp->p_szc < newpp->p_szc && 807 PAGE_EXCL(pp) && nrelocp != NULL) { 808 ASSERT(nrelocp != NULL); 809 (void) page_relocate(&pp, &newpp, 1, 1, nrelocp, 810 NULL); 811 if (*nrelocp > 0) { 812 VM_STAT_COND_ADD(*nrelocp == 1, 813 page_lookup_cnt[11]); 814 VM_STAT_COND_ADD(*nrelocp > 1, 815 page_lookup_cnt[12]); 816 pp = newpp; 817 se = SE_EXCL; 818 } else { 819 if (se == SE_SHARED) { 820 page_downgrade(pp); 821 } 822 VM_STAT_ADD(page_lookup_cnt[13]); 823 } 824 } else if (newpp != NULL && nrelocp != NULL) { 825 if (PAGE_EXCL(pp) && se == SE_SHARED) { 826 page_downgrade(pp); 827 } 828 VM_STAT_COND_ADD(pp->p_szc < newpp->p_szc, 829 page_lookup_cnt[14]); 830 VM_STAT_COND_ADD(pp->p_szc == newpp->p_szc, 831 page_lookup_cnt[15]); 832 VM_STAT_COND_ADD(pp->p_szc > newpp->p_szc, 833 page_lookup_cnt[16]); 834 } else if (newpp != NULL && PAGE_EXCL(pp)) { 835 se = SE_EXCL; 836 } 837 } else if (!hash_locked) { 838 VM_STAT_ADD(page_lookup_cnt[17]); 839 phm = PAGE_HASH_MUTEX(index); 840 mutex_enter(phm); 841 hash_locked = 1; 842 goto top; 843 } else if (newpp != 
NULL) { 844 /* 845 * If we have a preallocated page then 846 * insert it now and basically behave like 847 * page_create. 848 */ 849 VM_STAT_ADD(page_lookup_cnt[18]); 850 /* 851 * Since we hold the page hash mutex and 852 * just searched for this page, page_hashin 853 * had better not fail. If it does, that 854 * means some thread did not follow the 855 * page hash mutex rules. Panic now and 856 * get it over with. As usual, go down 857 * holding all the locks. 858 */ 859 ASSERT(MUTEX_HELD(phm)); 860 if (!page_hashin(newpp, vp, off, phm)) { 861 ASSERT(MUTEX_HELD(phm)); 862 panic("page_lookup_create: hashin failed %p %p %llx %p", 863 (void *)newpp, (void *)vp, off, (void *)phm); 864 /*NOTREACHED*/ 865 } 866 ASSERT(MUTEX_HELD(phm)); 867 mutex_exit(phm); 868 phm = NULL; 869 page_set_props(newpp, P_REF); 870 page_io_lock(newpp); 871 pp = newpp; 872 se = SE_EXCL; 873 } else { 874 VM_STAT_ADD(page_lookup_cnt[19]); 875 mutex_exit(phm); 876 } 877 878 ASSERT(pp ? PAGE_LOCKED_SE(pp, se) : 1); 879 880 ASSERT(pp ? ((PP_ISFREE(pp) == 0) && (PP_ISAGED(pp) == 0)) : 1); 881 882 return (pp); 883 } 884 885 /* 886 * Search the hash list for the page representing the 887 * specified [vp, offset] and return it locked. Skip 888 * free pages and pages that cannot be locked as requested. 889 * Used while attempting to kluster pages. 
890 */ 891 page_t * 892 page_lookup_nowait(vnode_t *vp, u_offset_t off, se_t se) 893 { 894 page_t *pp; 895 kmutex_t *phm; 896 ulong_t index; 897 uint_t locked; 898 899 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); 900 VM_STAT_ADD(page_lookup_nowait_cnt[0]); 901 902 index = PAGE_HASH_FUNC(vp, off); 903 PAGE_HASH_SEARCH(index, pp, vp, off); 904 locked = 0; 905 if (pp == NULL) { 906 top: 907 VM_STAT_ADD(page_lookup_nowait_cnt[1]); 908 locked = 1; 909 phm = PAGE_HASH_MUTEX(index); 910 mutex_enter(phm); 911 PAGE_HASH_SEARCH(index, pp, vp, off); 912 } 913 914 if (pp == NULL || PP_ISFREE(pp)) { 915 VM_STAT_ADD(page_lookup_nowait_cnt[2]); 916 pp = NULL; 917 } else { 918 if (!page_trylock(pp, se)) { 919 VM_STAT_ADD(page_lookup_nowait_cnt[3]); 920 pp = NULL; 921 } else { 922 VM_STAT_ADD(page_lookup_nowait_cnt[4]); 923 /* 924 * See the comment in page_lookup() 925 */ 926 if (((volatile struct vnode *)(pp->p_vnode) != vp) || 927 ((u_offset_t)(pp->p_offset) != off)) { 928 VM_STAT_ADD(page_lookup_nowait_cnt[5]); 929 if (locked) { 930 panic("page_lookup_nowait %p", 931 (void *)pp); 932 /*NOTREACHED*/ 933 } 934 page_unlock(pp); 935 goto top; 936 } 937 if (PP_ISFREE(pp)) { 938 VM_STAT_ADD(page_lookup_nowait_cnt[6]); 939 page_unlock(pp); 940 pp = NULL; 941 } 942 } 943 } 944 if (locked) { 945 VM_STAT_ADD(page_lookup_nowait_cnt[7]); 946 mutex_exit(phm); 947 } 948 949 ASSERT(pp ? PAGE_LOCKED_SE(pp, se) : 1); 950 951 return (pp); 952 } 953 954 /* 955 * Search the hash list for a page with the specified [vp, off] 956 * that is known to exist and is already locked. This routine 957 * is typically used by segment SOFTUNLOCK routines. 
958 */ 959 page_t * 960 page_find(vnode_t *vp, u_offset_t off) 961 { 962 page_t *pp; 963 kmutex_t *phm; 964 ulong_t index; 965 966 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); 967 VM_STAT_ADD(page_find_cnt); 968 969 index = PAGE_HASH_FUNC(vp, off); 970 phm = PAGE_HASH_MUTEX(index); 971 972 mutex_enter(phm); 973 PAGE_HASH_SEARCH(index, pp, vp, off); 974 mutex_exit(phm); 975 976 ASSERT(pp == NULL || PAGE_LOCKED(pp) || panicstr); 977 return (pp); 978 } 979 980 /* 981 * Determine whether a page with the specified [vp, off] 982 * currently exists in the system. Obviously this should 983 * only be considered as a hint since nothing prevents the 984 * page from disappearing or appearing immediately after 985 * the return from this routine. Subsequently, we don't 986 * even bother to lock the list. 987 */ 988 page_t * 989 page_exists(vnode_t *vp, u_offset_t off) 990 { 991 page_t *pp; 992 ulong_t index; 993 994 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); 995 VM_STAT_ADD(page_exists_cnt); 996 997 index = PAGE_HASH_FUNC(vp, off); 998 PAGE_HASH_SEARCH(index, pp, vp, off); 999 1000 return (pp); 1001 } 1002 1003 /* 1004 * Determine if physically contiguous pages exist for [vp, off] - [vp, off + 1005 * page_size(szc)) range. if they exist and ppa is not NULL fill ppa array 1006 * with these pages locked SHARED. If necessary reclaim pages from 1007 * freelist. Return 1 if contiguous pages exist and 0 otherwise. 1008 * 1009 * If we fail to lock pages still return 1 if pages exist and contiguous. 1010 * But in this case return value is just a hint. ppa array won't be filled. 1011 * Caller should initialize ppa[0] as NULL to distinguish return value. 1012 * 1013 * Returns 0 if pages don't exist or not physically contiguous. 1014 * 1015 * This routine doesn't work for anonymous(swapfs) pages. 
1016 */ 1017 int 1018 page_exists_physcontig(vnode_t *vp, u_offset_t off, uint_t szc, page_t *ppa[]) 1019 { 1020 pgcnt_t pages; 1021 pfn_t pfn; 1022 page_t *rootpp; 1023 pgcnt_t i; 1024 pgcnt_t j; 1025 u_offset_t save_off = off; 1026 ulong_t index; 1027 kmutex_t *phm; 1028 page_t *pp; 1029 uint_t pszc; 1030 int loopcnt = 0; 1031 1032 ASSERT(szc != 0); 1033 ASSERT(vp != NULL); 1034 ASSERT(!IS_SWAPFSVP(vp)); 1035 ASSERT(vp != &kvp); 1036 1037 again: 1038 if (++loopcnt > 3) { 1039 VM_STAT_ADD(page_exphcontg[0]); 1040 return (0); 1041 } 1042 1043 index = PAGE_HASH_FUNC(vp, off); 1044 phm = PAGE_HASH_MUTEX(index); 1045 1046 mutex_enter(phm); 1047 PAGE_HASH_SEARCH(index, pp, vp, off); 1048 mutex_exit(phm); 1049 1050 VM_STAT_ADD(page_exphcontg[1]); 1051 1052 if (pp == NULL) { 1053 VM_STAT_ADD(page_exphcontg[2]); 1054 return (0); 1055 } 1056 1057 pages = page_get_pagecnt(szc); 1058 rootpp = pp; 1059 pfn = rootpp->p_pagenum; 1060 1061 if ((pszc = pp->p_szc) >= szc && ppa != NULL) { 1062 VM_STAT_ADD(page_exphcontg[3]); 1063 if (!page_trylock(pp, SE_SHARED)) { 1064 VM_STAT_ADD(page_exphcontg[4]); 1065 return (1); 1066 } 1067 if (pp->p_szc != pszc || pp->p_vnode != vp || 1068 pp->p_offset != off) { 1069 VM_STAT_ADD(page_exphcontg[5]); 1070 page_unlock(pp); 1071 off = save_off; 1072 goto again; 1073 } 1074 /* 1075 * szc was non zero and vnode and offset matched after we 1076 * locked the page it means it can't become free on us. 
1077 */ 1078 ASSERT(!PP_ISFREE(pp)); 1079 if (!IS_P2ALIGNED(pfn, pages)) { 1080 page_unlock(pp); 1081 return (0); 1082 } 1083 ppa[0] = pp; 1084 pp++; 1085 off += PAGESIZE; 1086 pfn++; 1087 for (i = 1; i < pages; i++, pp++, off += PAGESIZE, pfn++) { 1088 if (!page_trylock(pp, SE_SHARED)) { 1089 VM_STAT_ADD(page_exphcontg[6]); 1090 pp--; 1091 while (i-- > 0) { 1092 page_unlock(pp); 1093 pp--; 1094 } 1095 ppa[0] = NULL; 1096 return (1); 1097 } 1098 if (pp->p_szc != pszc) { 1099 VM_STAT_ADD(page_exphcontg[7]); 1100 page_unlock(pp); 1101 pp--; 1102 while (i-- > 0) { 1103 page_unlock(pp); 1104 pp--; 1105 } 1106 ppa[0] = NULL; 1107 off = save_off; 1108 goto again; 1109 } 1110 /* 1111 * szc the same as for previous already locked pages 1112 * with right identity. Since this page had correct 1113 * szc after we locked it can't get freed or destroyed 1114 * and therefore must have the expected identity. 1115 */ 1116 ASSERT(!PP_ISFREE(pp)); 1117 if (pp->p_vnode != vp || 1118 pp->p_offset != off) { 1119 panic("page_exists_physcontig: " 1120 "large page identity doesn't match"); 1121 } 1122 ppa[i] = pp; 1123 ASSERT(pp->p_pagenum == pfn); 1124 } 1125 VM_STAT_ADD(page_exphcontg[8]); 1126 ppa[pages] = NULL; 1127 return (1); 1128 } else if (pszc >= szc) { 1129 VM_STAT_ADD(page_exphcontg[9]); 1130 if (!IS_P2ALIGNED(pfn, pages)) { 1131 return (0); 1132 } 1133 return (1); 1134 } 1135 1136 if (!IS_P2ALIGNED(pfn, pages)) { 1137 VM_STAT_ADD(page_exphcontg[10]); 1138 return (0); 1139 } 1140 1141 if (page_numtomemseg_nolock(pfn) != 1142 page_numtomemseg_nolock(pfn + pages - 1)) { 1143 VM_STAT_ADD(page_exphcontg[11]); 1144 return (0); 1145 } 1146 1147 /* 1148 * We loop up 4 times across pages to promote page size. 1149 * We're extra cautious to promote page size atomically with respect 1150 * to everybody else. But we can probably optimize into 1 loop if 1151 * this becomes an issue. 
1152 */ 1153 1154 for (i = 0; i < pages; i++, pp++, off += PAGESIZE, pfn++) { 1155 ASSERT(pp->p_pagenum == pfn); 1156 if (!page_trylock(pp, SE_EXCL)) { 1157 VM_STAT_ADD(page_exphcontg[12]); 1158 break; 1159 } 1160 if (pp->p_vnode != vp || 1161 pp->p_offset != off) { 1162 VM_STAT_ADD(page_exphcontg[13]); 1163 page_unlock(pp); 1164 break; 1165 } 1166 if (pp->p_szc >= szc) { 1167 ASSERT(i == 0); 1168 page_unlock(pp); 1169 off = save_off; 1170 goto again; 1171 } 1172 } 1173 1174 if (i != pages) { 1175 VM_STAT_ADD(page_exphcontg[14]); 1176 --pp; 1177 while (i-- > 0) { 1178 page_unlock(pp); 1179 --pp; 1180 } 1181 return (0); 1182 } 1183 1184 pp = rootpp; 1185 for (i = 0; i < pages; i++, pp++) { 1186 if (PP_ISFREE(pp)) { 1187 VM_STAT_ADD(page_exphcontg[15]); 1188 ASSERT(!PP_ISAGED(pp)); 1189 ASSERT(pp->p_szc == 0); 1190 if (!page_reclaim(pp, NULL)) { 1191 break; 1192 } 1193 } else { 1194 ASSERT(pp->p_szc < szc); 1195 VM_STAT_ADD(page_exphcontg[16]); 1196 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 1197 } 1198 } 1199 if (i < pages) { 1200 VM_STAT_ADD(page_exphcontg[17]); 1201 /* 1202 * page_reclaim failed because we were out of memory. 1203 * drop the rest of the locks and return because this page 1204 * must be already reallocated anyway. 
1205 */ 1206 pp = rootpp; 1207 for (j = 0; j < pages; j++, pp++) { 1208 if (j != i) { 1209 page_unlock(pp); 1210 } 1211 } 1212 return (0); 1213 } 1214 1215 off = save_off; 1216 pp = rootpp; 1217 for (i = 0; i < pages; i++, pp++, off += PAGESIZE) { 1218 ASSERT(PAGE_EXCL(pp)); 1219 ASSERT(!PP_ISFREE(pp)); 1220 ASSERT(!hat_page_is_mapped(pp)); 1221 ASSERT(pp->p_vnode == vp); 1222 ASSERT(pp->p_offset == off); 1223 pp->p_szc = szc; 1224 } 1225 pp = rootpp; 1226 for (i = 0; i < pages; i++, pp++) { 1227 if (ppa == NULL) { 1228 page_unlock(pp); 1229 } else { 1230 ppa[i] = pp; 1231 page_downgrade(ppa[i]); 1232 } 1233 } 1234 if (ppa != NULL) { 1235 ppa[pages] = NULL; 1236 } 1237 VM_STAT_ADD(page_exphcontg[18]); 1238 ASSERT(vp->v_pages != NULL); 1239 return (1); 1240 } 1241 1242 /* 1243 * Determine whether a page with the specified [vp, off] 1244 * currently exists in the system and if so return its 1245 * size code. Obviously this should only be considered as 1246 * a hint since nothing prevents the page from disappearing 1247 * or appearing immediately after the return from this routine. 
1248 */ 1249 int 1250 page_exists_forreal(vnode_t *vp, u_offset_t off, uint_t *szc) 1251 { 1252 page_t *pp; 1253 kmutex_t *phm; 1254 ulong_t index; 1255 int rc = 0; 1256 1257 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); 1258 ASSERT(szc != NULL); 1259 VM_STAT_ADD(page_exists_forreal_cnt); 1260 1261 index = PAGE_HASH_FUNC(vp, off); 1262 phm = PAGE_HASH_MUTEX(index); 1263 1264 mutex_enter(phm); 1265 PAGE_HASH_SEARCH(index, pp, vp, off); 1266 if (pp != NULL) { 1267 *szc = pp->p_szc; 1268 rc = 1; 1269 } 1270 mutex_exit(phm); 1271 return (rc); 1272 } 1273 1274 /* wakeup threads waiting for pages in page_create_get_something() */ 1275 void 1276 wakeup_pcgs(void) 1277 { 1278 if (!CV_HAS_WAITERS(&pcgs_cv)) 1279 return; 1280 cv_broadcast(&pcgs_cv); 1281 } 1282 1283 /* 1284 * 'freemem' is used all over the kernel as an indication of how many 1285 * pages are free (either on the cache list or on the free page list) 1286 * in the system. In very few places is a really accurate 'freemem' 1287 * needed. To avoid contention of the lock protecting a the 1288 * single freemem, it was spread out into NCPU buckets. Set_freemem 1289 * sets freemem to the total of all NCPU buckets. It is called from 1290 * clock() on each TICK. 1291 */ 1292 void 1293 set_freemem() 1294 { 1295 struct pcf *p; 1296 ulong_t t; 1297 uint_t i; 1298 1299 t = 0; 1300 p = pcf; 1301 for (i = 0; i < PCF_FANOUT; i++) { 1302 t += p->pcf_count; 1303 p++; 1304 } 1305 freemem = t; 1306 1307 /* 1308 * Don't worry about grabbing mutex. It's not that 1309 * critical if we miss a tick or two. This is 1310 * where we wakeup possible delayers in 1311 * page_create_get_something(). 1312 */ 1313 wakeup_pcgs(); 1314 } 1315 1316 ulong_t 1317 get_freemem() 1318 { 1319 struct pcf *p; 1320 ulong_t t; 1321 uint_t i; 1322 1323 t = 0; 1324 p = pcf; 1325 for (i = 0; i < PCF_FANOUT; i++) { 1326 t += p->pcf_count; 1327 p++; 1328 } 1329 /* 1330 * We just calculated it, might as well set it. 
1331 */ 1332 freemem = t; 1333 return (t); 1334 } 1335 1336 /* 1337 * Acquire all of the page cache & free (pcf) locks. 1338 */ 1339 void 1340 pcf_acquire_all() 1341 { 1342 struct pcf *p; 1343 uint_t i; 1344 1345 p = pcf; 1346 for (i = 0; i < PCF_FANOUT; i++) { 1347 mutex_enter(&p->pcf_lock); 1348 p++; 1349 } 1350 } 1351 1352 /* 1353 * Release all the pcf_locks. 1354 */ 1355 void 1356 pcf_release_all() 1357 { 1358 struct pcf *p; 1359 uint_t i; 1360 1361 p = pcf; 1362 for (i = 0; i < PCF_FANOUT; i++) { 1363 mutex_exit(&p->pcf_lock); 1364 p++; 1365 } 1366 } 1367 1368 /* 1369 * Inform the VM system that we need some pages freed up. 1370 * Calls must be symmetric, e.g.: 1371 * 1372 * page_needfree(100); 1373 * wait a bit; 1374 * page_needfree(-100); 1375 */ 1376 void 1377 page_needfree(spgcnt_t npages) 1378 { 1379 mutex_enter(&new_freemem_lock); 1380 needfree += npages; 1381 mutex_exit(&new_freemem_lock); 1382 } 1383 1384 /* 1385 * Throttle for page_create(): try to prevent freemem from dropping 1386 * below throttlefree. We can't provide a 100% guarantee because 1387 * KM_NOSLEEP allocations, page_reclaim(), and various other things 1388 * nibble away at the freelist. However, we can block all PG_WAIT 1389 * allocations until memory becomes available. The motivation is 1390 * that several things can fall apart when there's no free memory: 1391 * 1392 * (1) If pageout() needs memory to push a page, the system deadlocks. 1393 * 1394 * (2) By (broken) specification, timeout(9F) can neither fail nor 1395 * block, so it has no choice but to panic the system if it 1396 * cannot allocate a callout structure. 1397 * 1398 * (3) Like timeout(), ddi_set_callback() cannot fail and cannot block; 1399 * it panics if it cannot allocate a callback structure. 1400 * 1401 * (4) Untold numbers of third-party drivers have not yet been hardened 1402 * against KM_NOSLEEP and/or allocb() failures; they simply assume 1403 * success and panic the system with a data fault on failure. 
1404 * (The long-term solution to this particular problem is to ship 1405 * hostile fault-injecting DEBUG kernels with the DDK.) 1406 * 1407 * It is theoretically impossible to guarantee success of non-blocking 1408 * allocations, but in practice, this throttle is very hard to break. 1409 */ 1410 static int 1411 page_create_throttle(pgcnt_t npages, int flags) 1412 { 1413 ulong_t fm; 1414 uint_t i; 1415 pgcnt_t tf; /* effective value of throttlefree */ 1416 1417 /* 1418 * Never deny pages when: 1419 * - it's a thread that cannot block [NOMEMWAIT()] 1420 * - the allocation cannot block and must not fail 1421 * - the allocation cannot block and is pageout dispensated 1422 */ 1423 if (NOMEMWAIT() || 1424 ((flags & (PG_WAIT | PG_PANIC)) == PG_PANIC) || 1425 ((flags & (PG_WAIT | PG_PUSHPAGE)) == PG_PUSHPAGE)) 1426 return (1); 1427 1428 /* 1429 * If the allocation can't block, we look favorably upon it 1430 * unless we're below pageout_reserve. In that case we fail 1431 * the allocation because we want to make sure there are a few 1432 * pages available for pageout. 1433 */ 1434 if ((flags & PG_WAIT) == 0) 1435 return (freemem >= npages + pageout_reserve); 1436 1437 /* Calculate the effective throttlefree value */ 1438 tf = throttlefree - 1439 ((flags & PG_PUSHPAGE) ? 
pageout_reserve : 0); 1440 1441 cv_signal(&proc_pageout->p_cv); 1442 1443 while (freemem < npages + tf) { 1444 pcf_acquire_all(); 1445 mutex_enter(&new_freemem_lock); 1446 fm = 0; 1447 for (i = 0; i < PCF_FANOUT; i++) { 1448 fm += pcf[i].pcf_count; 1449 pcf[i].pcf_wait++; 1450 mutex_exit(&pcf[i].pcf_lock); 1451 } 1452 freemem = fm; 1453 needfree += npages; 1454 freemem_wait++; 1455 cv_wait(&freemem_cv, &new_freemem_lock); 1456 freemem_wait--; 1457 needfree -= npages; 1458 mutex_exit(&new_freemem_lock); 1459 } 1460 return (1); 1461 } 1462 1463 /* 1464 * page_create_wait() is called to either coalecse pages from the 1465 * different pcf buckets or to wait because there simply are not 1466 * enough pages to satisfy the caller's request. 1467 * 1468 * Sadly, this is called from platform/vm/vm_machdep.c 1469 */ 1470 int 1471 page_create_wait(size_t npages, uint_t flags) 1472 { 1473 pgcnt_t total; 1474 uint_t i; 1475 struct pcf *p; 1476 1477 /* 1478 * Wait until there are enough free pages to satisfy our 1479 * entire request. 1480 * We set needfree += npages before prodding pageout, to make sure 1481 * it does real work when npages > lotsfree > freemem. 1482 */ 1483 VM_STAT_ADD(page_create_not_enough); 1484 1485 ASSERT(!kcage_on ? !(flags & PG_NORELOC) : 1); 1486 checkagain: 1487 if ((flags & PG_NORELOC) && 1488 kcage_freemem < kcage_throttlefree + npages) 1489 (void) kcage_create_throttle(npages, flags); 1490 1491 if (freemem < npages + throttlefree) 1492 if (!page_create_throttle(npages, flags)) 1493 return (0); 1494 1495 /* 1496 * Since page_create_va() looked at every 1497 * bucket, assume we are going to have to wait. 1498 * Get all of the pcf locks. 1499 */ 1500 total = 0; 1501 p = pcf; 1502 for (i = 0; i < PCF_FANOUT; i++) { 1503 mutex_enter(&p->pcf_lock); 1504 total += p->pcf_count; 1505 if (total >= npages) { 1506 /* 1507 * Wow! There are enough pages laying around 1508 * to satisfy the request. Do the accounting, 1509 * drop the locks we acquired, and go back. 
1510 * 1511 * freemem is not protected by any lock. So, 1512 * we cannot have any assertion containing 1513 * freemem. 1514 */ 1515 freemem -= npages; 1516 1517 while (p >= pcf) { 1518 if (p->pcf_count <= npages) { 1519 npages -= p->pcf_count; 1520 p->pcf_count = 0; 1521 } else { 1522 p->pcf_count -= (uint_t)npages; 1523 npages = 0; 1524 } 1525 mutex_exit(&p->pcf_lock); 1526 p--; 1527 } 1528 ASSERT(npages == 0); 1529 return (1); 1530 } 1531 p++; 1532 } 1533 1534 /* 1535 * All of the pcf locks are held, there are not enough pages 1536 * to satisfy the request (npages < total). 1537 * Be sure to acquire the new_freemem_lock before dropping 1538 * the pcf locks. This prevents dropping wakeups in page_free(). 1539 * The order is always pcf_lock then new_freemem_lock. 1540 * 1541 * Since we hold all the pcf locks, it is a good time to set freemem. 1542 * 1543 * If the caller does not want to wait, return now. 1544 * Else turn the pageout daemon loose to find something 1545 * and wait till it does. 1546 * 1547 */ 1548 freemem = total; 1549 1550 if ((flags & PG_WAIT) == 0) { 1551 pcf_release_all(); 1552 1553 TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_NOMEM, 1554 "page_create_nomem:npages %ld freemem %ld", npages, freemem); 1555 return (0); 1556 } 1557 1558 ASSERT(proc_pageout != NULL); 1559 cv_signal(&proc_pageout->p_cv); 1560 1561 TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SLEEP_START, 1562 "page_create_sleep_start: freemem %ld needfree %ld", 1563 freemem, needfree); 1564 1565 /* 1566 * We are going to wait. 1567 * We currently hold all of the pcf_locks, 1568 * get the new_freemem_lock (it protects freemem_wait), 1569 * before dropping the pcf_locks. 
1570 */ 1571 mutex_enter(&new_freemem_lock); 1572 1573 p = pcf; 1574 for (i = 0; i < PCF_FANOUT; i++) { 1575 p->pcf_wait++; 1576 mutex_exit(&p->pcf_lock); 1577 p++; 1578 } 1579 1580 needfree += npages; 1581 freemem_wait++; 1582 1583 cv_wait(&freemem_cv, &new_freemem_lock); 1584 1585 freemem_wait--; 1586 needfree -= npages; 1587 1588 mutex_exit(&new_freemem_lock); 1589 1590 TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SLEEP_END, 1591 "page_create_sleep_end: freemem %ld needfree %ld", 1592 freemem, needfree); 1593 1594 VM_STAT_ADD(page_create_not_enough_again); 1595 goto checkagain; 1596 } 1597 1598 /* 1599 * A routine to do the opposite of page_create_wait(). 1600 */ 1601 void 1602 page_create_putback(spgcnt_t npages) 1603 { 1604 struct pcf *p; 1605 pgcnt_t lump; 1606 uint_t *which; 1607 1608 /* 1609 * When a contiguous lump is broken up, we have to 1610 * deal with lots of pages (min 64) so lets spread 1611 * the wealth around. 1612 */ 1613 lump = roundup(npages, PCF_FANOUT) / PCF_FANOUT; 1614 freemem += npages; 1615 1616 for (p = pcf; (npages > 0) && (p < &pcf[PCF_FANOUT]); p++) { 1617 which = &p->pcf_count; 1618 1619 mutex_enter(&p->pcf_lock); 1620 1621 if (p->pcf_block) { 1622 which = &p->pcf_reserve; 1623 } 1624 1625 if (lump < npages) { 1626 *which += (uint_t)lump; 1627 npages -= lump; 1628 } else { 1629 *which += (uint_t)npages; 1630 npages = 0; 1631 } 1632 1633 if (p->pcf_wait) { 1634 mutex_enter(&new_freemem_lock); 1635 /* 1636 * Check to see if some other thread 1637 * is actually waiting. Another bucket 1638 * may have woken it up by now. If there 1639 * are no waiters, then set our pcf_wait 1640 * count to zero to avoid coming in here 1641 * next time. 
1642 */ 1643 if (freemem_wait) { 1644 if (npages > 1) { 1645 cv_broadcast(&freemem_cv); 1646 } else { 1647 cv_signal(&freemem_cv); 1648 } 1649 p->pcf_wait--; 1650 } else { 1651 p->pcf_wait = 0; 1652 } 1653 mutex_exit(&new_freemem_lock); 1654 } 1655 mutex_exit(&p->pcf_lock); 1656 } 1657 ASSERT(npages == 0); 1658 } 1659 1660 /* 1661 * A helper routine for page_create_get_something. 1662 * The indenting got to deep down there. 1663 * Unblock the pcf counters. Any pages freed after 1664 * pcf_block got set are moved to pcf_count and 1665 * wakeups (cv_broadcast() or cv_signal()) are done as needed. 1666 */ 1667 static void 1668 pcgs_unblock(void) 1669 { 1670 int i; 1671 struct pcf *p; 1672 1673 /* Update freemem while we're here. */ 1674 freemem = 0; 1675 p = pcf; 1676 for (i = 0; i < PCF_FANOUT; i++) { 1677 mutex_enter(&p->pcf_lock); 1678 ASSERT(p->pcf_count == 0); 1679 p->pcf_count = p->pcf_reserve; 1680 p->pcf_block = 0; 1681 freemem += p->pcf_count; 1682 if (p->pcf_wait) { 1683 mutex_enter(&new_freemem_lock); 1684 if (freemem_wait) { 1685 if (p->pcf_reserve > 1) { 1686 cv_broadcast(&freemem_cv); 1687 p->pcf_wait = 0; 1688 } else { 1689 cv_signal(&freemem_cv); 1690 p->pcf_wait--; 1691 } 1692 } else { 1693 p->pcf_wait = 0; 1694 } 1695 mutex_exit(&new_freemem_lock); 1696 } 1697 p->pcf_reserve = 0; 1698 mutex_exit(&p->pcf_lock); 1699 p++; 1700 } 1701 } 1702 1703 /* 1704 * Called from page_create_va() when both the cache and free lists 1705 * have been checked once. 1706 * 1707 * Either returns a page or panics since the accounting was done 1708 * way before we got here. 1709 * 1710 * We don't come here often, so leave the accounting on permanently. 
1711 */ 1712 1713 #define MAX_PCGS 100 1714 1715 #ifdef DEBUG 1716 #define PCGS_TRIES 100 1717 #else /* DEBUG */ 1718 #define PCGS_TRIES 10 1719 #endif /* DEBUG */ 1720 1721 #ifdef VM_STATS 1722 uint_t pcgs_counts[PCGS_TRIES]; 1723 uint_t pcgs_too_many; 1724 uint_t pcgs_entered; 1725 uint_t pcgs_entered_noreloc; 1726 uint_t pcgs_locked; 1727 uint_t pcgs_cagelocked; 1728 #endif /* VM_STATS */ 1729 1730 static page_t * 1731 page_create_get_something(vnode_t *vp, u_offset_t off, struct seg *seg, 1732 caddr_t vaddr, uint_t flags) 1733 { 1734 uint_t count; 1735 page_t *pp; 1736 uint_t locked, i; 1737 struct pcf *p; 1738 lgrp_t *lgrp; 1739 int cagelocked = 0; 1740 1741 VM_STAT_ADD(pcgs_entered); 1742 1743 /* 1744 * Tap any reserve freelists: if we fail now, we'll die 1745 * since the page(s) we're looking for have already been 1746 * accounted for. 1747 */ 1748 flags |= PG_PANIC; 1749 1750 if ((flags & PG_NORELOC) != 0) { 1751 VM_STAT_ADD(pcgs_entered_noreloc); 1752 /* 1753 * Requests for free pages from critical threads 1754 * such as pageout still won't throttle here, but 1755 * we must try again, to give the cageout thread 1756 * another chance to catch up. Since we already 1757 * accounted for the pages, we had better get them 1758 * this time. 1759 * 1760 * N.B. All non-critical threads acquire the pcgs_cagelock 1761 * to serialize access to the freelists. This implements a 1762 * turnstile-type synchornization to avoid starvation of 1763 * critical requests for PG_NORELOC memory by non-critical 1764 * threads: all non-critical threads must acquire a 'ticket' 1765 * before passing through, which entails making sure 1766 * kcage_freemem won't fall below minfree prior to grabbing 1767 * pages from the freelists. 1768 */ 1769 if (kcage_create_throttle(1, flags) == KCT_NONCRIT) { 1770 mutex_enter(&pcgs_cagelock); 1771 cagelocked = 1; 1772 VM_STAT_ADD(pcgs_cagelocked); 1773 } 1774 } 1775 1776 /* 1777 * Time to get serious. 
1778 * We failed to get a `correctly colored' page from both the 1779 * free and cache lists. 1780 * We escalate in stage. 1781 * 1782 * First try both lists without worring about color. 1783 * 1784 * Then, grab all page accounting locks (ie. pcf[]) and 1785 * steal any pages that they have and set the pcf_block flag to 1786 * stop deletions from the lists. This will help because 1787 * a page can get added to the free list while we are looking 1788 * at the cache list, then another page could be added to the cache 1789 * list allowing the page on the free list to be removed as we 1790 * move from looking at the cache list to the free list. This 1791 * could happen over and over. We would never find the page 1792 * we have accounted for. 1793 * 1794 * Noreloc pages are a subset of the global (relocatable) page pool. 1795 * They are not tracked separately in the pcf bins, so it is 1796 * impossible to know when doing pcf accounting if the available 1797 * page(s) are noreloc pages or not. When looking for a noreloc page 1798 * it is quite easy to end up here even if the global (relocatable) 1799 * page pool has plenty of free pages but the noreloc pool is empty. 1800 * 1801 * When the noreloc pool is empty (or low), additional noreloc pages 1802 * are created by converting pages from the global page pool. This 1803 * process will stall during pcf accounting if the pcf bins are 1804 * already locked. Such is the case when a noreloc allocation is 1805 * looping here in page_create_get_something waiting for more noreloc 1806 * pages to appear. 1807 * 1808 * Short of adding a new field to the pcf bins to accurately track 1809 * the number of free noreloc pages, we instead do not grab the 1810 * pcgs_lock, do not set the pcf blocks and do not timeout when 1811 * allocating a noreloc page. This allows noreloc allocations to 1812 * loop without blocking global page pool allocations. 
1813 * 1814 * NOTE: the behaviour of page_create_get_something has not changed 1815 * for the case of global page pool allocations. 1816 */ 1817 1818 flags &= ~PG_MATCH_COLOR; 1819 locked = 0; 1820 #if defined(__i386) || defined(__amd64) 1821 /* 1822 * page_create_get_something may be called because 4g memory may be 1823 * depleted. Set flags to allow for relocation of base page below 1824 * 4g if necessary. 1825 */ 1826 if (physmax4g) 1827 flags |= (PGI_PGCPSZC0 | PGI_PGCPHIPRI); 1828 #endif 1829 1830 lgrp = lgrp_mem_choose(seg, vaddr, PAGESIZE); 1831 1832 for (count = 0; kcage_on || count < MAX_PCGS; count++) { 1833 pp = page_get_freelist(vp, off, seg, vaddr, PAGESIZE, 1834 flags, lgrp); 1835 if (pp == NULL) { 1836 pp = page_get_cachelist(vp, off, seg, vaddr, 1837 flags, lgrp); 1838 } 1839 if (pp == NULL) { 1840 /* 1841 * Serialize. Don't fight with other pcgs(). 1842 */ 1843 if (!locked && (!kcage_on || !(flags & PG_NORELOC))) { 1844 mutex_enter(&pcgs_lock); 1845 VM_STAT_ADD(pcgs_locked); 1846 locked = 1; 1847 p = pcf; 1848 for (i = 0; i < PCF_FANOUT; i++) { 1849 mutex_enter(&p->pcf_lock); 1850 ASSERT(p->pcf_block == 0); 1851 p->pcf_block = 1; 1852 p->pcf_reserve = p->pcf_count; 1853 p->pcf_count = 0; 1854 mutex_exit(&p->pcf_lock); 1855 p++; 1856 } 1857 freemem = 0; 1858 } 1859 1860 if (count) { 1861 /* 1862 * Since page_free() puts pages on 1863 * a list then accounts for it, we 1864 * just have to wait for page_free() 1865 * to unlock any page it was working 1866 * with. The page_lock()-page_reclaim() 1867 * path falls in the same boat. 1868 * 1869 * We don't need to check on the 1870 * PG_WAIT flag, we have already 1871 * accounted for the page we are 1872 * looking for in page_create_va(). 1873 * 1874 * We just wait a moment to let any 1875 * locked pages on the lists free up, 1876 * then continue around and try again. 1877 * 1878 * Will be awakened by set_freemem(). 
1879 */ 1880 mutex_enter(&pcgs_wait_lock); 1881 cv_wait(&pcgs_cv, &pcgs_wait_lock); 1882 mutex_exit(&pcgs_wait_lock); 1883 } 1884 } else { 1885 #ifdef VM_STATS 1886 if (count >= PCGS_TRIES) { 1887 VM_STAT_ADD(pcgs_too_many); 1888 } else { 1889 VM_STAT_ADD(pcgs_counts[count]); 1890 } 1891 #endif 1892 if (locked) { 1893 pcgs_unblock(); 1894 mutex_exit(&pcgs_lock); 1895 } 1896 if (cagelocked) 1897 mutex_exit(&pcgs_cagelock); 1898 return (pp); 1899 } 1900 } 1901 /* 1902 * we go down holding the pcf locks. 1903 */ 1904 panic("no %spage found %d", 1905 ((flags & PG_NORELOC) ? "non-reloc " : ""), count); 1906 /*NOTREACHED*/ 1907 } 1908 1909 /* 1910 * Create enough pages for "bytes" worth of data starting at 1911 * "off" in "vp". 1912 * 1913 * Where flag must be one of: 1914 * 1915 * PG_EXCL: Exclusive create (fail if any page already 1916 * exists in the page cache) which does not 1917 * wait for memory to become available. 1918 * 1919 * PG_WAIT: Non-exclusive create which can wait for 1920 * memory to become available. 1921 * 1922 * PG_PHYSCONTIG: Allocate physically contiguous pages. 1923 * (Not Supported) 1924 * 1925 * A doubly linked list of pages is returned to the caller. Each page 1926 * on the list has the "exclusive" (p_selock) lock and "iolock" (p_iolock) 1927 * lock. 1928 * 1929 * Unable to change the parameters to page_create() in a minor release, 1930 * we renamed page_create() to page_create_va(), changed all known calls 1931 * from page_create() to page_create_va(), and created this wrapper. 1932 * 1933 * Upon a major release, we should break compatibility by deleting this 1934 * wrapper, and replacing all the strings "page_create_va", with "page_create". 1935 * 1936 * NOTE: There is a copy of this interface as page_create_io() in 1937 * i86/vm/vm_machdep.c. Any bugs fixed here should be applied 1938 * there. 
1939 */ 1940 page_t * 1941 page_create(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags) 1942 { 1943 caddr_t random_vaddr; 1944 struct seg kseg; 1945 1946 #ifdef DEBUG 1947 cmn_err(CE_WARN, "Using deprecated interface page_create: caller %p", 1948 (void *)caller()); 1949 #endif 1950 1951 random_vaddr = (caddr_t)(((uintptr_t)vp >> 7) ^ 1952 (uintptr_t)(off >> PAGESHIFT)); 1953 kseg.s_as = &kas; 1954 1955 return (page_create_va(vp, off, bytes, flags, &kseg, random_vaddr)); 1956 } 1957 1958 #ifdef DEBUG 1959 uint32_t pg_alloc_pgs_mtbf = 0; 1960 #endif 1961 1962 /* 1963 * Used for large page support. It will attempt to allocate 1964 * a large page(s) off the freelist. 1965 * 1966 * Returns non zero on failure. 1967 */ 1968 int 1969 page_alloc_pages(struct vnode *vp, struct seg *seg, caddr_t addr, 1970 page_t **basepp, page_t *ppa[], uint_t szc, int anypgsz) 1971 { 1972 pgcnt_t npgs, curnpgs, totpgs; 1973 size_t pgsz; 1974 page_t *pplist = NULL, *pp; 1975 int err = 0; 1976 lgrp_t *lgrp; 1977 1978 ASSERT(szc != 0 && szc <= (page_num_pagesizes() - 1)); 1979 1980 VM_STAT_ADD(alloc_pages[0]); 1981 1982 #ifdef DEBUG 1983 if (pg_alloc_pgs_mtbf && !(gethrtime() % pg_alloc_pgs_mtbf)) { 1984 return (ENOMEM); 1985 } 1986 #endif 1987 1988 pgsz = page_get_pagesize(szc); 1989 totpgs = curnpgs = npgs = pgsz >> PAGESHIFT; 1990 1991 ASSERT(((uintptr_t)addr & (pgsz - 1)) == 0); 1992 /* 1993 * One must be NULL but not both. 1994 * And one must be non NULL but not both. 
1995 */ 1996 ASSERT(basepp != NULL || ppa != NULL); 1997 ASSERT(basepp == NULL || ppa == NULL); 1998 1999 (void) page_create_wait(npgs, PG_WAIT); 2000 2001 while (npgs && szc) { 2002 lgrp = lgrp_mem_choose(seg, addr, pgsz); 2003 pp = page_get_freelist(vp, 0, seg, addr, pgsz, 0, lgrp); 2004 if (pp != NULL) { 2005 VM_STAT_ADD(alloc_pages[1]); 2006 page_list_concat(&pplist, &pp); 2007 ASSERT(npgs >= curnpgs); 2008 npgs -= curnpgs; 2009 } else if (anypgsz) { 2010 VM_STAT_ADD(alloc_pages[2]); 2011 szc--; 2012 pgsz = page_get_pagesize(szc); 2013 curnpgs = pgsz >> PAGESHIFT; 2014 } else { 2015 VM_STAT_ADD(alloc_pages[3]); 2016 ASSERT(npgs == totpgs); 2017 page_create_putback(npgs); 2018 return (ENOMEM); 2019 } 2020 } 2021 if (szc == 0) { 2022 VM_STAT_ADD(alloc_pages[4]); 2023 ASSERT(npgs != 0); 2024 page_create_putback(npgs); 2025 err = ENOMEM; 2026 } else if (basepp != NULL) { 2027 ASSERT(npgs == 0); 2028 ASSERT(ppa == NULL); 2029 *basepp = pplist; 2030 } 2031 2032 npgs = totpgs - npgs; 2033 pp = pplist; 2034 2035 /* 2036 * Clear the free and age bits. Also if we were passed in a ppa then 2037 * fill it in with all the constituent pages from the large page. But 2038 * if we failed to allocate all the pages just free what we got. 
2039 */ 2040 while (npgs != 0) { 2041 ASSERT(PP_ISFREE(pp)); 2042 ASSERT(PP_ISAGED(pp)); 2043 if (ppa != NULL || err != 0) { 2044 if (err == 0) { 2045 VM_STAT_ADD(alloc_pages[5]); 2046 PP_CLRFREE(pp); 2047 PP_CLRAGED(pp); 2048 page_sub(&pplist, pp); 2049 *ppa++ = pp; 2050 npgs--; 2051 } else { 2052 VM_STAT_ADD(alloc_pages[6]); 2053 ASSERT(pp->p_szc != 0); 2054 curnpgs = page_get_pagecnt(pp->p_szc); 2055 page_list_break(&pp, &pplist, curnpgs); 2056 page_list_add_pages(pp, 0); 2057 page_create_putback(curnpgs); 2058 ASSERT(npgs >= curnpgs); 2059 npgs -= curnpgs; 2060 } 2061 pp = pplist; 2062 } else { 2063 VM_STAT_ADD(alloc_pages[7]); 2064 PP_CLRFREE(pp); 2065 PP_CLRAGED(pp); 2066 pp = pp->p_next; 2067 npgs--; 2068 } 2069 } 2070 return (err); 2071 } 2072 2073 /* 2074 * Get a single large page off of the freelists, and set it up for use. 2075 * Number of bytes requested must be a supported page size. 2076 * 2077 * Note that this call may fail even if there is sufficient 2078 * memory available or PG_WAIT is set, so the caller must 2079 * be willing to fallback on page_create_va(), block and retry, 2080 * or fail the requester. 2081 */ 2082 page_t * 2083 page_create_va_large(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags, 2084 struct seg *seg, caddr_t vaddr, void *arg) 2085 { 2086 pgcnt_t npages, pcftotal; 2087 page_t *pp; 2088 page_t *rootpp; 2089 lgrp_t *lgrp; 2090 uint_t enough; 2091 uint_t pcf_index; 2092 uint_t i; 2093 struct pcf *p; 2094 struct pcf *q; 2095 lgrp_id_t *lgrpid = (lgrp_id_t *)arg; 2096 2097 ASSERT(vp != NULL); 2098 2099 ASSERT((flags & ~(PG_EXCL | PG_WAIT | 2100 PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == 0); 2101 /* but no others */ 2102 2103 ASSERT((flags & PG_EXCL) == PG_EXCL); 2104 2105 npages = btop(bytes); 2106 2107 if (!kcage_on || panicstr) { 2108 /* 2109 * Cage is OFF, or we are single threaded in 2110 * panic, so make everything a RELOC request. 
2111 */ 2112 flags &= ~PG_NORELOC; 2113 } 2114 2115 /* 2116 * Make sure there's adequate physical memory available. 2117 * Note: PG_WAIT is ignored here. 2118 */ 2119 if (freemem <= throttlefree + npages) { 2120 VM_STAT_ADD(page_create_large_cnt[1]); 2121 return (NULL); 2122 } 2123 2124 /* 2125 * If cage is on, dampen draw from cage when available 2126 * cage space is low. 2127 */ 2128 if ((flags & (PG_NORELOC | PG_WAIT)) == (PG_NORELOC | PG_WAIT) && 2129 kcage_freemem < kcage_throttlefree + npages) { 2130 2131 /* 2132 * The cage is on, the caller wants PG_NORELOC 2133 * pages and available cage memory is very low. 2134 * Call kcage_create_throttle() to attempt to 2135 * control demand on the cage. 2136 */ 2137 if (kcage_create_throttle(npages, flags) == KCT_FAILURE) { 2138 VM_STAT_ADD(page_create_large_cnt[2]); 2139 return (NULL); 2140 } 2141 } 2142 2143 enough = 0; 2144 pcf_index = PCF_INDEX(); 2145 p = &pcf[pcf_index]; 2146 q = &pcf[PCF_FANOUT]; 2147 for (pcftotal = 0, i = 0; i < PCF_FANOUT; i++) { 2148 if (p->pcf_count > npages) { 2149 /* 2150 * a good one to try. 2151 */ 2152 mutex_enter(&p->pcf_lock); 2153 if (p->pcf_count > npages) { 2154 p->pcf_count -= (uint_t)npages; 2155 /* 2156 * freemem is not protected by any lock. 2157 * Thus, we cannot have any assertion 2158 * containing freemem here. 2159 */ 2160 freemem -= npages; 2161 enough = 1; 2162 mutex_exit(&p->pcf_lock); 2163 break; 2164 } 2165 mutex_exit(&p->pcf_lock); 2166 } 2167 pcftotal += p->pcf_count; 2168 p++; 2169 if (p >= q) { 2170 p = pcf; 2171 } 2172 } 2173 2174 if (!enough) { 2175 /* If there isn't enough memory available, give up. */ 2176 if (pcftotal < npages) { 2177 VM_STAT_ADD(page_create_large_cnt[3]); 2178 return (NULL); 2179 } 2180 2181 /* try to collect pages from several pcf bins */ 2182 for (p = pcf, pcftotal = 0, i = 0; i < PCF_FANOUT; i++) { 2183 mutex_enter(&p->pcf_lock); 2184 pcftotal += p->pcf_count; 2185 if (pcftotal >= npages) { 2186 /* 2187 * Wow! 
There are enough pages laying around 2188 * to satisfy the request. Do the accounting, 2189 * drop the locks we acquired, and go back. 2190 * 2191 * freemem is not protected by any lock. So, 2192 * we cannot have any assertion containing 2193 * freemem. 2194 */ 2195 pgcnt_t tpages = npages; 2196 freemem -= npages; 2197 while (p >= pcf) { 2198 if (p->pcf_count <= tpages) { 2199 tpages -= p->pcf_count; 2200 p->pcf_count = 0; 2201 } else { 2202 p->pcf_count -= (uint_t)tpages; 2203 tpages = 0; 2204 } 2205 mutex_exit(&p->pcf_lock); 2206 p--; 2207 } 2208 ASSERT(tpages == 0); 2209 break; 2210 } 2211 p++; 2212 } 2213 if (i == PCF_FANOUT) { 2214 /* failed to collect pages - release the locks */ 2215 while (--p >= pcf) { 2216 mutex_exit(&p->pcf_lock); 2217 } 2218 VM_STAT_ADD(page_create_large_cnt[4]); 2219 return (NULL); 2220 } 2221 } 2222 2223 /* 2224 * This is where this function behaves fundamentally differently 2225 * than page_create_va(); since we're intending to map the page 2226 * with a single TTE, we have to get it as a physically contiguous 2227 * hardware pagesize chunk. If we can't, we fail. 2228 */ 2229 if (lgrpid != NULL && *lgrpid >= 0 && *lgrpid <= lgrp_alloc_max && 2230 LGRP_EXISTS(lgrp_table[*lgrpid])) 2231 lgrp = lgrp_table[*lgrpid]; 2232 else 2233 lgrp = lgrp_mem_choose(seg, vaddr, bytes); 2234 2235 if ((rootpp = page_get_freelist(&kvp, off, seg, vaddr, 2236 bytes, flags & ~PG_MATCH_COLOR, lgrp)) == NULL) { 2237 page_create_putback(npages); 2238 VM_STAT_ADD(page_create_large_cnt[5]); 2239 return (NULL); 2240 } 2241 2242 /* 2243 * if we got the page with the wrong mtype give it back this is a 2244 * workaround for CR 6249718. 
When CR 6249718 is fixed we never get
	 * inside "if" and the workaround becomes just a nop
	 */
	if (kcage_on && (flags & PG_NORELOC) && !PP_ISNORELOC(rootpp)) {
		page_list_add_pages(rootpp, 0);
		page_create_putback(npages);
		VM_STAT_ADD(page_create_large_cnt[6]);
		return (NULL);
	}

	/*
	 * If satisfying this request has left us with too little
	 * memory, start the wheels turning to get some back. The
	 * first clause of the test prevents waking up the pageout
	 * daemon in situations where it would decide that there's
	 * nothing to do.
	 */
	if (nscan < desscan && freemem < minfree) {
		TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
		    "pageout_cv_signal:freemem %ld", freemem);
		cv_signal(&proc_pageout->p_cv);
	}

	pp = rootpp;
	while (npages--) {
		ASSERT(PAGE_EXCL(pp));
		ASSERT(pp->p_vnode == NULL);
		ASSERT(!hat_page_is_mapped(pp));
		PP_CLRFREE(pp);
		PP_CLRAGED(pp);
		if (!page_hashin(pp, vp, off, NULL))
			panic("page_create_large: hashin failed: page %p",
			    (void *)pp);
		page_io_lock(pp);
		off += PAGESIZE;
		pp = pp->p_next;
	}

	VM_STAT_ADD(page_create_large_cnt[0]);
	return (rootpp);
}

/*
 * Collect btopr(bytes) pages named [vp, off], [vp, off + PAGESIZE], ...
 * and return them as a list linked through p_next/p_prev.
 *
 * vp, off	identity of the first page; `off' and `vaddr' advance by
 *		PAGESIZE for each page collected.
 * bytes	size of the request; must be non-zero.
 * flags	must contain PG_EXCL and/or PG_WAIT (anything else panics);
 *		PG_NORELOC, PG_PANIC and PG_PUSHPAGE are the only other
 *		bits accepted.  PG_NORELOC is dropped when the cage is
 *		off or we are panicking.
 * seg, vaddr	handed to lgrp_mem_choose() and the page list routines
 *		to pick where each page comes from.
 *
 * With PG_EXCL, finding an existing page with one of the requested
 * names undoes all work done so far and NULL is returned.  Without
 * PG_EXCL (PG_WAIT must then be set) an existing page is locked SE_EXCL
 * and returned as part of the list.
 *
 * Returns NULL when the request is at least max_page_get pages and
 * PG_WAIT is not set, when either throttle routine refuses the request,
 * when page_create_wait() fails, or on the PG_EXCL collision described
 * above.  A request of max_page_get pages or more with PG_WAIT set never
 * returns.  Every page on the returned list has had page_io_lock()
 * applied.
 */
page_t *
page_create_va(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags,
    struct seg *seg, caddr_t vaddr)
{
	page_t		*plist = NULL;
	pgcnt_t		npages;
	pgcnt_t		found_on_free = 0;
	pgcnt_t		pages_req;
	page_t		*npp = NULL;
	uint_t		enough;
	uint_t		i;
	uint_t		pcf_index;
	struct pcf	*p;
	struct pcf	*q;
	lgrp_t		*lgrp;

	TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_START,
	    "page_create_start:vp %p off %llx bytes %lu flags %x",
	    vp, off, bytes, flags);

	ASSERT(bytes != 0 && vp != NULL);

	if ((flags & PG_EXCL) == 0 && (flags & PG_WAIT) == 0) {
		panic("page_create: invalid flags");
		/*NOTREACHED*/
	}
	ASSERT((flags & ~(PG_EXCL | PG_WAIT |
	    PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == 0);
	    /* but no others */

	pages_req = npages = btopr(bytes);
	/*
	 * Try to see whether request is too large to *ever* be
	 * satisfied, in order to prevent deadlock. We arbitrarily
	 * decide to limit maximum size requests to max_page_get.
	 */
	if (npages >= max_page_get) {
		if ((flags & PG_WAIT) == 0) {
			TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_TOOBIG,
			    "page_create_toobig:vp %p off %llx npages "
			    "%lu max_page_get %lu",
			    vp, off, npages, max_page_get);
			return (NULL);
		} else {
			cmn_err(CE_WARN,
			    "Request for too much kernel memory "
			    "(%lu bytes), will hang forever", bytes);
			for (;;)
				delay(1000000000);
		}
	}

	if (!kcage_on || panicstr) {
		/*
		 * Cage is OFF, or we are single threaded in
		 * panic, so make everything a RELOC request.
		 */
		flags &= ~PG_NORELOC;
	}

	if (freemem <= throttlefree + npages)
		if (!page_create_throttle(npages, flags))
			return (NULL);

	/*
	 * If cage is on, dampen draw from cage when available
	 * cage space is low.
	 */
	if ((flags & PG_NORELOC) &&
	    kcage_freemem < kcage_throttlefree + npages) {

		/*
		 * The cage is on, the caller wants PG_NORELOC
		 * pages and available cage memory is very low.
		 * Call kcage_create_throttle() to attempt to
		 * control demand on the cage.
		 */
		if (kcage_create_throttle(npages, flags) == KCT_FAILURE)
			return (NULL);
	}

	VM_STAT_ADD(page_create_cnt[0]);

	/*
	 * Fast path: starting at this thread's pcf bucket, look for a
	 * single bucket holding more than npages and debit the whole
	 * request from it.  The count is tested once without the lock
	 * as a cheap filter and again with the lock held.
	 */
	enough = 0;
	pcf_index = PCF_INDEX();

	p = &pcf[pcf_index];
	q = &pcf[PCF_FANOUT];
	for (i = 0; i < PCF_FANOUT; i++) {
		if (p->pcf_count > npages) {
			/*
			 * a good one to try.
			 */
			mutex_enter(&p->pcf_lock);
			if (p->pcf_count > npages) {
				p->pcf_count -= (uint_t)npages;
				/*
				 * freemem is not protected by any lock.
				 * Thus, we cannot have any assertion
				 * containing freemem here.
				 */
				freemem -= npages;
				enough = 1;
				mutex_exit(&p->pcf_lock);
				break;
			}
			mutex_exit(&p->pcf_lock);
		}
		p++;
		if (p >= q) {
			/* wrap around to the first bucket */
			p = pcf;
		}
	}

	if (!enough) {
		/*
		 * Have to look harder. If npages is greater than
		 * one, then we might have to coalesce the counters.
		 *
		 * Go wait. We come back having accounted
		 * for the memory.
		 */
		VM_STAT_ADD(page_create_cnt[1]);
		if (!page_create_wait(npages, flags)) {
			VM_STAT_ADD(page_create_cnt[2]);
			return (NULL);
		}
	}

	TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SUCCESS,
	    "page_create_success:vp %p off %llx", vp, off);

	/*
	 * If satisfying this request has left us with too little
	 * memory, start the wheels turning to get some back. The
	 * first clause of the test prevents waking up the pageout
	 * daemon in situations where it would decide that there's
	 * nothing to do.
	 */
	if (nscan < desscan && freemem < minfree) {
		TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
		    "pageout_cv_signal:freemem %ld", freemem);
		cv_signal(&proc_pageout->p_cv);
	}

	/*
	 * Loop around collecting the requested number of pages.
	 * Most of the time, we have to `create' a new page. With
	 * this in mind, pull the page off the free list before
	 * getting the hash lock. This will minimize the hash
	 * lock hold time, nesting, and the like. If it turns
	 * out we don't need the page, we put it back at the end.
	 */
	while (npages--) {
		page_t		*pp;
		kmutex_t	*phm = NULL;
		ulong_t		index;

		index = PAGE_HASH_FUNC(vp, off);
top:
		ASSERT(phm == NULL);
		ASSERT(index == PAGE_HASH_FUNC(vp, off));
		ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));

		/*
		 * npp survives a `goto top' and a loop iteration that
		 * found an existing page, so only fetch a fresh page
		 * when the previous one has been consumed.
		 */
		if (npp == NULL) {
			/*
			 * Try to get a page from the freelist (ie,
			 * a page with no [vp, off] tag). If that
			 * fails, use the cachelist.
			 *
			 * During the first attempt at both the free
			 * and cache lists we try for the correct color.
			 */
			/*
			 * XXXX-how do we deal with virtual indexed
			 * caches and colors?
			 */
			VM_STAT_ADD(page_create_cnt[4]);
			/*
			 * Get lgroup to allocate next page of shared memory
			 * from and use it to specify where to allocate
			 * the physical memory
			 */
			lgrp = lgrp_mem_choose(seg, vaddr, PAGESIZE);
			npp = page_get_freelist(vp, off, seg, vaddr, PAGESIZE,
			    flags | PG_MATCH_COLOR, lgrp);
			if (npp == NULL) {
				npp = page_get_cachelist(vp, off, seg,
				    vaddr, flags | PG_MATCH_COLOR, lgrp);
				if (npp == NULL) {
					npp = page_create_get_something(vp,
					    off, seg, vaddr,
					    flags & ~PG_MATCH_COLOR);
				}

				if (PP_ISAGED(npp) == 0) {
					/*
					 * Since this page came from the
					 * cachelist, we must destroy the
					 * old vnode association.
					 */
					page_hashout(npp, NULL);
				}
			}
		}

		/*
		 * We own this page!
		 */
		ASSERT(PAGE_EXCL(npp));
		ASSERT(npp->p_vnode == NULL);
		ASSERT(!hat_page_is_mapped(npp));
		PP_CLRFREE(npp);
		PP_CLRAGED(npp);

		/*
		 * Here we have a page in our hot little mitts and are
		 * just waiting to stuff it on the appropriate lists.
		 * Get the mutex and check to see if it really does
		 * not exist.
		 */
		phm = PAGE_HASH_MUTEX(index);
		mutex_enter(phm);
		PAGE_HASH_SEARCH(index, pp, vp, off);
		if (pp == NULL) {
			VM_STAT_ADD(page_create_new);
			pp = npp;
			npp = NULL;
			if (!page_hashin(pp, vp, off, phm)) {
				/*
				 * Since we hold the page hash mutex and
				 * just searched for this page, page_hashin
				 * had better not fail. If it does, that
				 * means some thread did not follow the
				 * page hash mutex rules. Panic now and
				 * get it over with. As usual, go down
				 * holding all the locks.
				 */
				ASSERT(MUTEX_HELD(phm));
				panic("page_create: "
				    "hashin failed %p %p %llx %p",
				    (void *)pp, (void *)vp, off, (void *)phm);
				/*NOTREACHED*/
			}
			ASSERT(MUTEX_HELD(phm));
			mutex_exit(phm);
			phm = NULL;

			/*
			 * Hat layer locking need not be done to set
			 * the following bits since the page is not hashed
			 * and was on the free list (i.e., had no mappings).
			 *
			 * Set the reference bit to protect
			 * against immediate pageout
			 *
			 * XXXmh modify freelist code to set reference
			 * bit so we don't have to do it here.
			 */
			page_set_props(pp, P_REF);
			found_on_free++;
		} else {
			VM_STAT_ADD(page_create_exists);
			if (flags & PG_EXCL) {
				/*
				 * Found an existing page, and the caller
				 * wanted all new pages. Undo all of the work
				 * we have done.
				 *
				 * NOTE(review): found_on_free is not
				 * decremented for the pages disposed of
				 * here; presumably VN_DISPOSE() credits
				 * them back to the free counts itself, so
				 * only the never-consumed remainder is
				 * returned after `fail' -- confirm.
				 */
				mutex_exit(phm);
				phm = NULL;
				while (plist != NULL) {
					pp = plist;
					page_sub(&plist, pp);
					page_io_unlock(pp);
					/* large pages should not end up here */
					ASSERT(pp->p_szc == 0);
					/*LINTED: constant in conditional ctx*/
					VN_DISPOSE(pp, B_INVAL, 0, kcred);
				}
				VM_STAT_ADD(page_create_found_one);
				goto fail;
			}
			ASSERT(flags & PG_WAIT);
			if (!page_lock(pp, SE_EXCL, phm, P_NO_RECLAIM)) {
				/*
				 * Start all over again if we blocked trying
				 * to lock the page.
				 */
				mutex_exit(phm);
				VM_STAT_ADD(page_create_page_lock_failed);
				phm = NULL;
				goto top;
			}
			mutex_exit(phm);
			phm = NULL;

			/*
			 * The existing page may itself be sitting on the
			 * cachelist; if so take it off and count it as
			 * one of the pages we accounted for up front.
			 */
			if (PP_ISFREE(pp)) {
				ASSERT(PP_ISAGED(pp) == 0);
				VM_STAT_ADD(pagecnt.pc_get_cache);
				page_list_sub(pp, PG_CACHE_LIST);
				PP_CLRFREE(pp);
				found_on_free++;
			}
		}

		/*
		 * Got a page! It is locked. Acquire the i/o
		 * lock since we are going to use the p_next and
		 * p_prev fields to link the requested pages together.
		 */
		page_io_lock(pp);
		page_add(&plist, pp);
		plist = plist->p_next;
		off += PAGESIZE;
		vaddr += PAGESIZE;
	}

	ASSERT((flags & PG_EXCL) ? (found_on_free == pages_req) : 1);
fail:
	if (npp != NULL) {
		/*
		 * Did not need this page after all.
		 * Put it back on the free list.
		 */
		VM_STAT_ADD(page_create_putbacks);
		PP_SETFREE(npp);
		PP_SETAGED(npp);
		npp->p_offset = (u_offset_t)-1;
		page_list_add(npp, PG_FREE_LIST | PG_LIST_TAIL);
		page_unlock(npp);

	}

	ASSERT(pages_req >= found_on_free);

	/*
	 * The full request was debited from the free counts up front.
	 * Give back whatever was not actually taken from the free or
	 * cache lists, waking one waiter if this bucket has any.
	 */
	{
		uint_t overshoot = (uint_t)(pages_req - found_on_free);

		if (overshoot) {
			VM_STAT_ADD(page_create_overshoot);
			p = &pcf[pcf_index];
			mutex_enter(&p->pcf_lock);
			if (p->pcf_block) {
				p->pcf_reserve += overshoot;
			} else {
				p->pcf_count += overshoot;
				if (p->pcf_wait) {
					mutex_enter(&new_freemem_lock);
					if (freemem_wait) {
						cv_signal(&freemem_cv);
						p->pcf_wait--;
					} else {
						p->pcf_wait = 0;
					}
					mutex_exit(&new_freemem_lock);
				}
			}
			mutex_exit(&p->pcf_lock);
			/* freemem is approximate, so this test OK */
			if (!p->pcf_block)
				freemem += overshoot;
		}
	}

	return (plist);
}

/*
 * One or more constituent pages of this large page has been marked
 * toxic.
Simply demote the large page to PAGESIZE pages and let 2657 * page_free() handle it. This routine should only be called by 2658 * large page free routines (page_free_pages() and page_destroy_pages(). 2659 * All pages are locked SE_EXCL and have already been marked free. 2660 */ 2661 static void 2662 page_free_toxic_pages(page_t *rootpp) 2663 { 2664 page_t *tpp; 2665 pgcnt_t i, pgcnt = page_get_pagecnt(rootpp->p_szc); 2666 uint_t szc = rootpp->p_szc; 2667 2668 for (i = 0, tpp = rootpp; i < pgcnt; i++, tpp = tpp->p_next) { 2669 ASSERT(tpp->p_szc == szc); 2670 ASSERT((PAGE_EXCL(tpp) && 2671 !page_iolock_assert(tpp)) || panicstr); 2672 tpp->p_szc = 0; 2673 } 2674 2675 while (rootpp != NULL) { 2676 tpp = rootpp; 2677 page_sub(&rootpp, tpp); 2678 ASSERT(PP_ISFREE(tpp)); 2679 PP_CLRFREE(tpp); 2680 page_free(tpp, 1); 2681 } 2682 } 2683 2684 /* 2685 * Put page on the "free" list. 2686 * The free list is really two lists maintained by 2687 * the PSM of whatever machine we happen to be on. 2688 */ 2689 void 2690 page_free(page_t *pp, int dontneed) 2691 { 2692 struct pcf *p; 2693 uint_t pcf_index; 2694 2695 ASSERT((PAGE_EXCL(pp) && 2696 !page_iolock_assert(pp)) || panicstr); 2697 2698 if (PP_ISFREE(pp)) { 2699 panic("page_free: page %p is free", (void *)pp); 2700 } 2701 2702 if (pp->p_szc != 0) { 2703 if (pp->p_vnode == NULL || IS_SWAPFSVP(pp->p_vnode) || 2704 pp->p_vnode == &kvp) { 2705 panic("page_free: anon or kernel " 2706 "or no vnode large page %p", (void *)pp); 2707 } 2708 page_demote_vp_pages(pp); 2709 ASSERT(pp->p_szc == 0); 2710 } 2711 2712 /* 2713 * The page_struct_lock need not be acquired to examine these 2714 * fields since the page has an "exclusive" lock. 
2715 */ 2716 if (hat_page_is_mapped(pp) || pp->p_lckcnt != 0 || pp->p_cowcnt != 0 || 2717 pp->p_slckcnt != 0) { 2718 panic("page_free pp=%p, pfn=%lx, lckcnt=%d, cowcnt=%d " 2719 "slckcnt = %d", pp, page_pptonum(pp), pp->p_lckcnt, 2720 pp->p_cowcnt, pp->p_slckcnt); 2721 /*NOTREACHED*/ 2722 } 2723 2724 ASSERT(!hat_page_getshare(pp)); 2725 2726 PP_SETFREE(pp); 2727 ASSERT(pp->p_vnode == NULL || !IS_VMODSORT(pp->p_vnode) || 2728 !hat_ismod(pp)); 2729 page_clr_all_props(pp); 2730 ASSERT(!hat_page_getshare(pp)); 2731 2732 /* 2733 * Now we add the page to the head of the free list. 2734 * But if this page is associated with a paged vnode 2735 * then we adjust the head forward so that the page is 2736 * effectively at the end of the list. 2737 */ 2738 if (pp->p_vnode == NULL) { 2739 /* 2740 * Page has no identity, put it on the free list. 2741 */ 2742 PP_SETAGED(pp); 2743 pp->p_offset = (u_offset_t)-1; 2744 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL); 2745 VM_STAT_ADD(pagecnt.pc_free_free); 2746 TRACE_1(TR_FAC_VM, TR_PAGE_FREE_FREE, 2747 "page_free_free:pp %p", pp); 2748 } else { 2749 PP_CLRAGED(pp); 2750 2751 if (!dontneed || nopageage) { 2752 /* move it to the tail of the list */ 2753 page_list_add(pp, PG_CACHE_LIST | PG_LIST_TAIL); 2754 2755 VM_STAT_ADD(pagecnt.pc_free_cache); 2756 TRACE_1(TR_FAC_VM, TR_PAGE_FREE_CACHE_TAIL, 2757 "page_free_cache_tail:pp %p", pp); 2758 } else { 2759 page_list_add(pp, PG_CACHE_LIST | PG_LIST_HEAD); 2760 2761 VM_STAT_ADD(pagecnt.pc_free_dontneed); 2762 TRACE_1(TR_FAC_VM, TR_PAGE_FREE_CACHE_HEAD, 2763 "page_free_cache_head:pp %p", pp); 2764 } 2765 } 2766 page_unlock(pp); 2767 2768 /* 2769 * Now do the `freemem' accounting. 
2770 */ 2771 pcf_index = PCF_INDEX(); 2772 p = &pcf[pcf_index]; 2773 2774 mutex_enter(&p->pcf_lock); 2775 if (p->pcf_block) { 2776 p->pcf_reserve += 1; 2777 } else { 2778 p->pcf_count += 1; 2779 if (p->pcf_wait) { 2780 mutex_enter(&new_freemem_lock); 2781 /* 2782 * Check to see if some other thread 2783 * is actually waiting. Another bucket 2784 * may have woken it up by now. If there 2785 * are no waiters, then set our pcf_wait 2786 * count to zero to avoid coming in here 2787 * next time. Also, since only one page 2788 * was put on the free list, just wake 2789 * up one waiter. 2790 */ 2791 if (freemem_wait) { 2792 cv_signal(&freemem_cv); 2793 p->pcf_wait--; 2794 } else { 2795 p->pcf_wait = 0; 2796 } 2797 mutex_exit(&new_freemem_lock); 2798 } 2799 } 2800 mutex_exit(&p->pcf_lock); 2801 2802 /* freemem is approximate, so this test OK */ 2803 if (!p->pcf_block) 2804 freemem += 1; 2805 } 2806 2807 /* 2808 * Put page on the "free" list during intial startup. 2809 * This happens during initial single threaded execution. 2810 */ 2811 void 2812 page_free_at_startup(page_t *pp) 2813 { 2814 struct pcf *p; 2815 uint_t pcf_index; 2816 2817 page_list_add(pp, PG_FREE_LIST | PG_LIST_HEAD | PG_LIST_ISINIT); 2818 VM_STAT_ADD(pagecnt.pc_free_free); 2819 2820 /* 2821 * Now do the `freemem' accounting. 
2822 */ 2823 pcf_index = PCF_INDEX(); 2824 p = &pcf[pcf_index]; 2825 2826 ASSERT(p->pcf_block == 0); 2827 ASSERT(p->pcf_wait == 0); 2828 p->pcf_count += 1; 2829 2830 /* freemem is approximate, so this is OK */ 2831 freemem += 1; 2832 } 2833 2834 void 2835 page_free_pages(page_t *pp) 2836 { 2837 page_t *tpp, *rootpp = NULL; 2838 pgcnt_t pgcnt = page_get_pagecnt(pp->p_szc); 2839 pgcnt_t i; 2840 uint_t szc = pp->p_szc; 2841 2842 VM_STAT_ADD(pagecnt.pc_free_pages); 2843 TRACE_1(TR_FAC_VM, TR_PAGE_FREE_FREE, 2844 "page_free_free:pp %p", pp); 2845 2846 ASSERT(pp->p_szc != 0 && pp->p_szc < page_num_pagesizes()); 2847 if ((page_pptonum(pp) & (pgcnt - 1)) != 0) { 2848 panic("page_free_pages: not root page %p", (void *)pp); 2849 /*NOTREACHED*/ 2850 } 2851 2852 for (i = 0, tpp = pp; i < pgcnt; i++, tpp++) { 2853 ASSERT((PAGE_EXCL(tpp) && 2854 !page_iolock_assert(tpp)) || panicstr); 2855 if (PP_ISFREE(tpp)) { 2856 panic("page_free_pages: page %p is free", (void *)tpp); 2857 /*NOTREACHED*/ 2858 } 2859 if (hat_page_is_mapped(tpp) || tpp->p_lckcnt != 0 || 2860 tpp->p_cowcnt != 0 || tpp->p_slckcnt != 0) { 2861 panic("page_free_pages %p", (void *)tpp); 2862 /*NOTREACHED*/ 2863 } 2864 2865 ASSERT(!hat_page_getshare(tpp)); 2866 ASSERT(tpp->p_vnode == NULL); 2867 ASSERT(tpp->p_szc == szc); 2868 2869 PP_SETFREE(tpp); 2870 page_clr_all_props(tpp); 2871 PP_SETAGED(tpp); 2872 tpp->p_offset = (u_offset_t)-1; 2873 ASSERT(tpp->p_next == tpp); 2874 ASSERT(tpp->p_prev == tpp); 2875 page_list_concat(&rootpp, &tpp); 2876 } 2877 ASSERT(rootpp == pp); 2878 2879 page_list_add_pages(rootpp, 0); 2880 page_create_putback(pgcnt); 2881 } 2882 2883 int free_pages = 1; 2884 2885 /* 2886 * This routine attempts to return pages to the cachelist via page_release(). 2887 * It does not *have* to be successful in all cases, since the pageout scanner 2888 * will catch any pages it misses. It does need to be fast and not introduce 2889 * too much overhead. 
2890 * 2891 * If a page isn't found on the unlocked sweep of the page_hash bucket, we 2892 * don't lock and retry. This is ok, since the page scanner will eventually 2893 * find any page we miss in free_vp_pages(). 2894 */ 2895 void 2896 free_vp_pages(vnode_t *vp, u_offset_t off, size_t len) 2897 { 2898 page_t *pp; 2899 u_offset_t eoff; 2900 extern int swap_in_range(vnode_t *, u_offset_t, size_t); 2901 2902 eoff = off + len; 2903 2904 if (free_pages == 0) 2905 return; 2906 if (swap_in_range(vp, off, len)) 2907 return; 2908 2909 for (; off < eoff; off += PAGESIZE) { 2910 2911 /* 2912 * find the page using a fast, but inexact search. It'll be OK 2913 * if a few pages slip through the cracks here. 2914 */ 2915 pp = page_exists(vp, off); 2916 2917 /* 2918 * If we didn't find the page (it may not exist), the page 2919 * is free, looks still in use (shared), or we can't lock it, 2920 * just give up. 2921 */ 2922 if (pp == NULL || 2923 PP_ISFREE(pp) || 2924 page_share_cnt(pp) > 0 || 2925 !page_trylock(pp, SE_EXCL)) 2926 continue; 2927 2928 /* 2929 * Once we have locked pp, verify that it's still the 2930 * correct page and not already free 2931 */ 2932 ASSERT(PAGE_LOCKED_SE(pp, SE_EXCL)); 2933 if (pp->p_vnode != vp || pp->p_offset != off || PP_ISFREE(pp)) { 2934 page_unlock(pp); 2935 continue; 2936 } 2937 2938 /* 2939 * try to release the page... 2940 */ 2941 (void) page_release(pp, 1); 2942 } 2943 } 2944 2945 /* 2946 * Reclaim the given page from the free list. 2947 * Returns 1 on success or 0 on failure. 2948 * 2949 * The page is unlocked if it can't be reclaimed (when freemem == 0). 2950 * If `lock' is non-null, it will be dropped and re-acquired if 2951 * the routine must wait while freemem is 0. 2952 * 2953 * As it turns out, boot_getpages() does this. It picks a page, 2954 * based on where OBP mapped in some address, gets its pfn, searches 2955 * the memsegs, locks the page, then pulls it off the free list! 
2956 */ 2957 int 2958 page_reclaim(page_t *pp, kmutex_t *lock) 2959 { 2960 struct pcf *p; 2961 uint_t pcf_index; 2962 struct cpu *cpup; 2963 uint_t i; 2964 pgcnt_t npgs, need; 2965 pgcnt_t collected = 0; 2966 2967 ASSERT(lock != NULL ? MUTEX_HELD(lock) : 1); 2968 ASSERT(PAGE_EXCL(pp) && PP_ISFREE(pp)); 2969 2970 npgs = page_get_pagecnt(pp->p_szc); 2971 2972 /* 2973 * If `freemem' is 0, we cannot reclaim this page from the 2974 * freelist, so release every lock we might hold: the page, 2975 * and the `lock' before blocking. 2976 * 2977 * The only way `freemem' can become 0 while there are pages 2978 * marked free (have their p->p_free bit set) is when the 2979 * system is low on memory and doing a page_create(). In 2980 * order to guarantee that once page_create() starts acquiring 2981 * pages it will be able to get all that it needs since `freemem' 2982 * was decreased by the requested amount. So, we need to release 2983 * this page, and let page_create() have it. 2984 * 2985 * Since `freemem' being zero is not supposed to happen, just 2986 * use the usual hash stuff as a starting point. If that bucket 2987 * is empty, then assume the worst, and start at the beginning 2988 * of the pcf array. If we always start at the beginning 2989 * when acquiring more than one pcf lock, there won't be any 2990 * deadlock problems. 2991 */ 2992 2993 /* TODO: Do we need to test kcage_freemem if PG_NORELOC(pp)? */ 2994 2995 if (freemem <= throttlefree && !page_create_throttle(npgs, 0)) { 2996 pcf_acquire_all(); 2997 goto page_reclaim_nomem; 2998 } 2999 3000 pcf_index = PCF_INDEX(); 3001 p = &pcf[pcf_index]; 3002 mutex_enter(&p->pcf_lock); 3003 if (p->pcf_count >= npgs) { 3004 collected = npgs; 3005 p->pcf_count -= npgs; 3006 } 3007 mutex_exit(&p->pcf_lock); 3008 need = npgs - collected; 3009 3010 if (need > 0) { 3011 VM_STAT_ADD(page_reclaim_zero); 3012 /* 3013 * Check again. 
Its possible that some other thread 3014 * could have been right behind us, and added one 3015 * to a list somewhere. Acquire each of the pcf locks 3016 * until we find a page. 3017 */ 3018 p = pcf; 3019 for (i = 0; i < PCF_FANOUT; i++) { 3020 mutex_enter(&p->pcf_lock); 3021 if (p->pcf_count) { 3022 if (p->pcf_count >= need) { 3023 p->pcf_count -= need; 3024 collected += need; 3025 need = 0; 3026 break; 3027 } else if (p->pcf_count) { 3028 collected += p->pcf_count; 3029 need -= p->pcf_count; 3030 p->pcf_count = 0; 3031 } 3032 } 3033 p++; 3034 } 3035 3036 if (need > 0) { 3037 page_reclaim_nomem: 3038 /* 3039 * We really can't have page `pp'. 3040 * Time for the no-memory dance with 3041 * page_free(). This is just like 3042 * page_create_wait(). Plus the added 3043 * attraction of releasing whatever mutex 3044 * we held when we were called with in `lock'. 3045 * Page_unlock() will wakeup any thread 3046 * waiting around for this page. 3047 */ 3048 if (lock) { 3049 VM_STAT_ADD(page_reclaim_zero_locked); 3050 mutex_exit(lock); 3051 } 3052 page_unlock(pp); 3053 3054 /* 3055 * get this before we drop all the pcf locks. 3056 */ 3057 mutex_enter(&new_freemem_lock); 3058 3059 p = pcf; 3060 p->pcf_count += collected; 3061 for (i = 0; i < PCF_FANOUT; i++) { 3062 p->pcf_wait++; 3063 mutex_exit(&p->pcf_lock); 3064 p++; 3065 } 3066 3067 freemem_wait++; 3068 cv_wait(&freemem_cv, &new_freemem_lock); 3069 freemem_wait--; 3070 3071 mutex_exit(&new_freemem_lock); 3072 3073 if (lock) { 3074 mutex_enter(lock); 3075 } 3076 return (0); 3077 } 3078 3079 /* 3080 * We beat the PCF bins over the head until 3081 * we got the memory that we wanted. 3082 * The pcf accounting has been done, 3083 * though none of the pcf_wait flags have been set, 3084 * drop the locks and continue on. 3085 */ 3086 ASSERT(collected == npgs); 3087 while (p >= pcf) { 3088 mutex_exit(&p->pcf_lock); 3089 p--; 3090 } 3091 } 3092 3093 /* 3094 * freemem is not protected by any lock. 
Thus, we cannot 3095 * have any assertion containing freemem here. 3096 */ 3097 freemem -= npgs; 3098 3099 VM_STAT_ADD(pagecnt.pc_reclaim); 3100 if (PP_ISAGED(pp)) { 3101 if (npgs > 1) { 3102 page_list_sub_pages(pp, pp->p_szc); 3103 } else { 3104 page_list_sub(pp, PG_FREE_LIST); 3105 } 3106 TRACE_1(TR_FAC_VM, TR_PAGE_UNFREE_FREE, 3107 "page_reclaim_free:pp %p", pp); 3108 } else { 3109 ASSERT(npgs == 1); 3110 page_list_sub(pp, PG_CACHE_LIST); 3111 TRACE_1(TR_FAC_VM, TR_PAGE_UNFREE_CACHE, 3112 "page_reclaim_cache:pp %p", pp); 3113 } 3114 3115 /* 3116 * clear the p_free & p_age bits since this page is no longer 3117 * on the free list. Notice that there was a brief time where 3118 * a page is marked as free, but is not on the list. 3119 * 3120 * Set the reference bit to protect against immediate pageout. 3121 */ 3122 for (i = 0; i < npgs; i++, pp++) { 3123 PP_CLRFREE(pp); 3124 PP_CLRAGED(pp); 3125 page_set_props(pp, P_REF); 3126 } 3127 3128 CPU_STATS_ENTER_K(); 3129 cpup = CPU; /* get cpup now that CPU cannot change */ 3130 CPU_STATS_ADDQ(cpup, vm, pgrec, 1); 3131 CPU_STATS_ADDQ(cpup, vm, pgfrec, 1); 3132 CPU_STATS_EXIT_K(); 3133 3134 return (1); 3135 } 3136 3137 3138 3139 /* 3140 * Destroy identity of the page and put it back on 3141 * the page free list. Assumes that the caller has 3142 * acquired the "exclusive" lock on the page. 3143 */ 3144 void 3145 page_destroy(page_t *pp, int dontfree) 3146 { 3147 ASSERT((PAGE_EXCL(pp) && 3148 !page_iolock_assert(pp)) || panicstr); 3149 ASSERT(pp->p_slckcnt == 0 || panicstr); 3150 3151 if (pp->p_szc != 0) { 3152 if (pp->p_vnode == NULL || IS_SWAPFSVP(pp->p_vnode) || 3153 pp->p_vnode == &kvp) { 3154 panic("page_destroy: anon or kernel or no vnode " 3155 "large page %p", (void *)pp); 3156 } 3157 page_demote_vp_pages(pp); 3158 ASSERT(pp->p_szc == 0); 3159 } 3160 3161 TRACE_1(TR_FAC_VM, TR_PAGE_DESTROY, "page_destroy:pp %p", pp); 3162 3163 /* 3164 * Unload translations, if any, then hash out the 3165 * page to erase its identity. 
3166 */ 3167 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 3168 page_hashout(pp, NULL); 3169 3170 if (!dontfree) { 3171 /* 3172 * Acquire the "freemem_lock" for availrmem. 3173 * The page_struct_lock need not be acquired for lckcnt 3174 * and cowcnt since the page has an "exclusive" lock. 3175 */ 3176 if ((pp->p_lckcnt != 0) || (pp->p_cowcnt != 0)) { 3177 mutex_enter(&freemem_lock); 3178 if (pp->p_lckcnt != 0) { 3179 availrmem++; 3180 pp->p_lckcnt = 0; 3181 } 3182 if (pp->p_cowcnt != 0) { 3183 availrmem += pp->p_cowcnt; 3184 pp->p_cowcnt = 0; 3185 } 3186 mutex_exit(&freemem_lock); 3187 } 3188 /* 3189 * Put the page on the "free" list. 3190 */ 3191 page_free(pp, 0); 3192 } 3193 } 3194 3195 void 3196 page_destroy_pages(page_t *pp) 3197 { 3198 3199 page_t *tpp, *rootpp = NULL; 3200 pgcnt_t pgcnt = page_get_pagecnt(pp->p_szc); 3201 pgcnt_t i, pglcks = 0; 3202 uint_t szc = pp->p_szc; 3203 3204 ASSERT(pp->p_szc != 0 && pp->p_szc < page_num_pagesizes()); 3205 3206 VM_STAT_ADD(pagecnt.pc_destroy_pages); 3207 3208 TRACE_1(TR_FAC_VM, TR_PAGE_DESTROY, "page_destroy_pages:pp %p", pp); 3209 3210 if ((page_pptonum(pp) & (pgcnt - 1)) != 0) { 3211 panic("page_destroy_pages: not root page %p", (void *)pp); 3212 /*NOTREACHED*/ 3213 } 3214 3215 for (i = 0, tpp = pp; i < pgcnt; i++, tpp++) { 3216 ASSERT((PAGE_EXCL(tpp) && 3217 !page_iolock_assert(tpp)) || panicstr); 3218 ASSERT(tpp->p_slckcnt == 0 || panicstr); 3219 (void) hat_pageunload(tpp, HAT_FORCE_PGUNLOAD); 3220 page_hashout(tpp, NULL); 3221 ASSERT(tpp->p_offset == (u_offset_t)-1); 3222 if (tpp->p_lckcnt != 0) { 3223 pglcks++; 3224 tpp->p_lckcnt = 0; 3225 } else if (tpp->p_cowcnt != 0) { 3226 pglcks += tpp->p_cowcnt; 3227 tpp->p_cowcnt = 0; 3228 } 3229 ASSERT(!hat_page_getshare(tpp)); 3230 ASSERT(tpp->p_vnode == NULL); 3231 ASSERT(tpp->p_szc == szc); 3232 3233 PP_SETFREE(tpp); 3234 page_clr_all_props(tpp); 3235 PP_SETAGED(tpp); 3236 ASSERT(tpp->p_next == tpp); 3237 ASSERT(tpp->p_prev == tpp); 3238 page_list_concat(&rootpp, 
&tpp); 3239 } 3240 3241 ASSERT(rootpp == pp); 3242 if (pglcks != 0) { 3243 mutex_enter(&freemem_lock); 3244 availrmem += pglcks; 3245 mutex_exit(&freemem_lock); 3246 } 3247 3248 page_list_add_pages(rootpp, 0); 3249 page_create_putback(pgcnt); 3250 } 3251 3252 /* 3253 * Similar to page_destroy(), but destroys pages which are 3254 * locked and known to be on the page free list. Since 3255 * the page is known to be free and locked, no one can access 3256 * it. 3257 * 3258 * Also, the number of free pages does not change. 3259 */ 3260 void 3261 page_destroy_free(page_t *pp) 3262 { 3263 ASSERT(PAGE_EXCL(pp)); 3264 ASSERT(PP_ISFREE(pp)); 3265 ASSERT(pp->p_vnode); 3266 ASSERT(hat_page_getattr(pp, P_MOD | P_REF | P_RO) == 0); 3267 ASSERT(!hat_page_is_mapped(pp)); 3268 ASSERT(PP_ISAGED(pp) == 0); 3269 ASSERT(pp->p_szc == 0); 3270 3271 VM_STAT_ADD(pagecnt.pc_destroy_free); 3272 page_list_sub(pp, PG_CACHE_LIST); 3273 3274 page_hashout(pp, NULL); 3275 ASSERT(pp->p_vnode == NULL); 3276 ASSERT(pp->p_offset == (u_offset_t)-1); 3277 ASSERT(pp->p_hash == NULL); 3278 3279 PP_SETAGED(pp); 3280 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL); 3281 page_unlock(pp); 3282 3283 mutex_enter(&new_freemem_lock); 3284 if (freemem_wait) { 3285 cv_signal(&freemem_cv); 3286 } 3287 mutex_exit(&new_freemem_lock); 3288 } 3289 3290 /* 3291 * Rename the page "opp" to have an identity specified 3292 * by [vp, off]. If a page already exists with this name 3293 * it is locked and destroyed. Note that the page's 3294 * translations are not unloaded during the rename. 3295 * 3296 * This routine is used by the anon layer to "steal" the 3297 * original page and is not unlike destroying a page and 3298 * creating a new page using the same page frame. 3299 * 3300 * XXX -- Could deadlock if caller 1 tries to rename A to B while 3301 * caller 2 tries to rename B to A. 
3302 */ 3303 void 3304 page_rename(page_t *opp, vnode_t *vp, u_offset_t off) 3305 { 3306 page_t *pp; 3307 int olckcnt = 0; 3308 int ocowcnt = 0; 3309 kmutex_t *phm; 3310 ulong_t index; 3311 3312 ASSERT(PAGE_EXCL(opp) && !page_iolock_assert(opp)); 3313 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); 3314 ASSERT(PP_ISFREE(opp) == 0); 3315 3316 VM_STAT_ADD(page_rename_count); 3317 3318 TRACE_3(TR_FAC_VM, TR_PAGE_RENAME, 3319 "page rename:pp %p vp %p off %llx", opp, vp, off); 3320 3321 /* 3322 * CacheFS may call page_rename for a large NFS page 3323 * when both CacheFS and NFS mount points are used 3324 * by applications. Demote this large page before 3325 * renaming it, to ensure that there are no "partial" 3326 * large pages left lying around. 3327 */ 3328 if (opp->p_szc != 0) { 3329 vnode_t *ovp = opp->p_vnode; 3330 ASSERT(ovp != NULL); 3331 ASSERT(!IS_SWAPFSVP(ovp)); 3332 ASSERT(ovp != &kvp); 3333 page_demote_vp_pages(opp); 3334 ASSERT(opp->p_szc == 0); 3335 } 3336 3337 page_hashout(opp, NULL); 3338 PP_CLRAGED(opp); 3339 3340 /* 3341 * Acquire the appropriate page hash lock, since 3342 * we're going to rename the page. 3343 */ 3344 index = PAGE_HASH_FUNC(vp, off); 3345 phm = PAGE_HASH_MUTEX(index); 3346 mutex_enter(phm); 3347 top: 3348 /* 3349 * Look for an existing page with this name and destroy it if found. 3350 * By holding the page hash lock all the way to the page_hashin() 3351 * call, we are assured that no page can be created with this 3352 * identity. In the case when the phm lock is dropped to undo any 3353 * hat layer mappings, the existing page is held with an "exclusive" 3354 * lock, again preventing another page from being created with 3355 * this identity. 3356 */ 3357 PAGE_HASH_SEARCH(index, pp, vp, off); 3358 if (pp != NULL) { 3359 VM_STAT_ADD(page_rename_exists); 3360 3361 /* 3362 * As it turns out, this is one of only two places where 3363 * page_lock() needs to hold the passed in lock in the 3364 * successful case. 
In all of the others, the lock could 3365 * be dropped as soon as the attempt is made to lock 3366 * the page. It is tempting to add yet another arguement, 3367 * PL_KEEP or PL_DROP, to let page_lock know what to do. 3368 */ 3369 if (!page_lock(pp, SE_EXCL, phm, P_RECLAIM)) { 3370 /* 3371 * Went to sleep because the page could not 3372 * be locked. We were woken up when the page 3373 * was unlocked, or when the page was destroyed. 3374 * In either case, `phm' was dropped while we 3375 * slept. Hence we should not just roar through 3376 * this loop. 3377 */ 3378 goto top; 3379 } 3380 3381 /* 3382 * If an existing page is a large page, then demote 3383 * it to ensure that no "partial" large pages are 3384 * "created" after page_rename. An existing page 3385 * can be a CacheFS page, and can't belong to swapfs. 3386 */ 3387 if (hat_page_is_mapped(pp)) { 3388 /* 3389 * Unload translations. Since we hold the 3390 * exclusive lock on this page, the page 3391 * can not be changed while we drop phm. 3392 * This is also not a lock protocol violation, 3393 * but rather the proper way to do things. 3394 */ 3395 mutex_exit(phm); 3396 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 3397 if (pp->p_szc != 0) { 3398 ASSERT(!IS_SWAPFSVP(vp)); 3399 ASSERT(vp != &kvp); 3400 page_demote_vp_pages(pp); 3401 ASSERT(pp->p_szc == 0); 3402 } 3403 mutex_enter(phm); 3404 } else if (pp->p_szc != 0) { 3405 ASSERT(!IS_SWAPFSVP(vp)); 3406 ASSERT(vp != &kvp); 3407 mutex_exit(phm); 3408 page_demote_vp_pages(pp); 3409 ASSERT(pp->p_szc == 0); 3410 mutex_enter(phm); 3411 } 3412 page_hashout(pp, phm); 3413 } 3414 /* 3415 * Hash in the page with the new identity. 3416 */ 3417 if (!page_hashin(opp, vp, off, phm)) { 3418 /* 3419 * We were holding phm while we searched for [vp, off] 3420 * and only dropped phm if we found and locked a page. 3421 * If we can't create this page now, then some thing 3422 * is really broken. 
3423 */ 3424 panic("page_rename: Can't hash in page: %p", (void *)pp); 3425 /*NOTREACHED*/ 3426 } 3427 3428 ASSERT(MUTEX_HELD(phm)); 3429 mutex_exit(phm); 3430 3431 /* 3432 * Now that we have dropped phm, lets get around to finishing up 3433 * with pp. 3434 */ 3435 if (pp != NULL) { 3436 ASSERT(!hat_page_is_mapped(pp)); 3437 /* for now large pages should not end up here */ 3438 ASSERT(pp->p_szc == 0); 3439 /* 3440 * Save the locks for transfer to the new page and then 3441 * clear them so page_free doesn't think they're important. 3442 * The page_struct_lock need not be acquired for lckcnt and 3443 * cowcnt since the page has an "exclusive" lock. 3444 */ 3445 olckcnt = pp->p_lckcnt; 3446 ocowcnt = pp->p_cowcnt; 3447 pp->p_lckcnt = pp->p_cowcnt = 0; 3448 3449 /* 3450 * Put the page on the "free" list after we drop 3451 * the lock. The less work under the lock the better. 3452 */ 3453 /*LINTED: constant in conditional context*/ 3454 VN_DISPOSE(pp, B_FREE, 0, kcred); 3455 } 3456 3457 /* 3458 * Transfer the lock count from the old page (if any). 3459 * The page_struct_lock need not be acquired for lckcnt and 3460 * cowcnt since the page has an "exclusive" lock. 3461 */ 3462 opp->p_lckcnt += olckcnt; 3463 opp->p_cowcnt += ocowcnt; 3464 } 3465 3466 /* 3467 * low level routine to add page `pp' to the hash and vp chains for [vp, offset] 3468 * 3469 * Pages are normally inserted at the start of a vnode's v_pages list. 3470 * If the vnode is VMODSORT and the page is modified, it goes at the end. 3471 * This can happen when a modified page is relocated for DR. 3472 * 3473 * Returns 1 on success and 0 on failure. 3474 */ 3475 static int 3476 page_do_hashin(page_t *pp, vnode_t *vp, u_offset_t offset) 3477 { 3478 page_t **listp; 3479 page_t *tp; 3480 ulong_t index; 3481 3482 ASSERT(PAGE_EXCL(pp)); 3483 ASSERT(vp != NULL); 3484 ASSERT(MUTEX_HELD(page_vnode_mutex(vp))); 3485 3486 /* 3487 * Be sure to set these up before the page is inserted on the hash 3488 * list. 
As soon as the page is placed on the list some other 3489 * thread might get confused and wonder how this page could 3490 * possibly hash to this list. 3491 */ 3492 pp->p_vnode = vp; 3493 pp->p_offset = offset; 3494 3495 /* 3496 * record if this page is on a swap vnode 3497 */ 3498 if ((vp->v_flag & VISSWAP) != 0) 3499 PP_SETSWAP(pp); 3500 3501 index = PAGE_HASH_FUNC(vp, offset); 3502 ASSERT(MUTEX_HELD(PAGE_HASH_MUTEX(index))); 3503 listp = &page_hash[index]; 3504 3505 /* 3506 * If this page is already hashed in, fail this attempt to add it. 3507 */ 3508 for (tp = *listp; tp != NULL; tp = tp->p_hash) { 3509 if (tp->p_vnode == vp && tp->p_offset == offset) { 3510 pp->p_vnode = NULL; 3511 pp->p_offset = (u_offset_t)(-1); 3512 return (0); 3513 } 3514 } 3515 pp->p_hash = *listp; 3516 *listp = pp; 3517 3518 /* 3519 * Add the page to the vnode's list of pages 3520 */ 3521 if (vp->v_pages != NULL && IS_VMODSORT(vp) && hat_ismod(pp)) 3522 listp = &vp->v_pages->p_vpprev->p_vpnext; 3523 else 3524 listp = &vp->v_pages; 3525 3526 page_vpadd(listp, pp); 3527 3528 return (1); 3529 } 3530 3531 /* 3532 * Add page `pp' to both the hash and vp chains for [vp, offset]. 3533 * 3534 * Returns 1 on success and 0 on failure. 3535 * If hold is passed in, it is not dropped. 
3536 */ 3537 int 3538 page_hashin(page_t *pp, vnode_t *vp, u_offset_t offset, kmutex_t *hold) 3539 { 3540 kmutex_t *phm = NULL; 3541 kmutex_t *vphm; 3542 int rc; 3543 3544 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp))); 3545 3546 TRACE_3(TR_FAC_VM, TR_PAGE_HASHIN, 3547 "page_hashin:pp %p vp %p offset %llx", 3548 pp, vp, offset); 3549 3550 VM_STAT_ADD(hashin_count); 3551 3552 if (hold != NULL) 3553 phm = hold; 3554 else { 3555 VM_STAT_ADD(hashin_not_held); 3556 phm = PAGE_HASH_MUTEX(PAGE_HASH_FUNC(vp, offset)); 3557 mutex_enter(phm); 3558 } 3559 3560 vphm = page_vnode_mutex(vp); 3561 mutex_enter(vphm); 3562 rc = page_do_hashin(pp, vp, offset); 3563 mutex_exit(vphm); 3564 if (hold == NULL) 3565 mutex_exit(phm); 3566 if (rc == 0) 3567 VM_STAT_ADD(hashin_already); 3568 return (rc); 3569 } 3570 3571 /* 3572 * Remove page ``pp'' from the hash and vp chains and remove vp association. 3573 * All mutexes must be held 3574 */ 3575 static void 3576 page_do_hashout(page_t *pp) 3577 { 3578 page_t **hpp; 3579 page_t *hp; 3580 vnode_t *vp = pp->p_vnode; 3581 3582 ASSERT(vp != NULL); 3583 ASSERT(MUTEX_HELD(page_vnode_mutex(vp))); 3584 3585 /* 3586 * First, take pp off of its hash chain. 3587 */ 3588 hpp = &page_hash[PAGE_HASH_FUNC(vp, pp->p_offset)]; 3589 3590 for (;;) { 3591 hp = *hpp; 3592 if (hp == pp) 3593 break; 3594 if (hp == NULL) { 3595 panic("page_do_hashout"); 3596 /*NOTREACHED*/ 3597 } 3598 hpp = &hp->p_hash; 3599 } 3600 *hpp = pp->p_hash; 3601 3602 /* 3603 * Now remove it from its associated vnode. 3604 */ 3605 if (vp->v_pages) 3606 page_vpsub(&vp->v_pages, pp); 3607 3608 pp->p_hash = NULL; 3609 page_clr_all_props(pp); 3610 PP_CLRSWAP(pp); 3611 pp->p_vnode = NULL; 3612 pp->p_offset = (u_offset_t)-1; 3613 } 3614 3615 /* 3616 * Remove page ``pp'' from the hash and vp chains and remove vp association. 3617 * 3618 * When `phm' is non-NULL it contains the address of the mutex protecting the 3619 * hash list pp is on. It is not dropped. 
3620 */ 3621 void 3622 page_hashout(page_t *pp, kmutex_t *phm) 3623 { 3624 vnode_t *vp; 3625 ulong_t index; 3626 kmutex_t *nphm; 3627 kmutex_t *vphm; 3628 kmutex_t *sep; 3629 3630 ASSERT(phm != NULL ? MUTEX_HELD(phm) : 1); 3631 ASSERT(pp->p_vnode != NULL); 3632 ASSERT((PAGE_EXCL(pp) && !page_iolock_assert(pp)) || panicstr); 3633 ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(pp->p_vnode))); 3634 3635 vp = pp->p_vnode; 3636 3637 TRACE_2(TR_FAC_VM, TR_PAGE_HASHOUT, 3638 "page_hashout:pp %p vp %p", pp, vp); 3639 3640 /* Kernel probe */ 3641 TNF_PROBE_2(page_unmap, "vm pagefault", /* CSTYLED */, 3642 tnf_opaque, vnode, vp, 3643 tnf_offset, offset, pp->p_offset); 3644 3645 /* 3646 * 3647 */ 3648 VM_STAT_ADD(hashout_count); 3649 index = PAGE_HASH_FUNC(vp, pp->p_offset); 3650 if (phm == NULL) { 3651 VM_STAT_ADD(hashout_not_held); 3652 nphm = PAGE_HASH_MUTEX(index); 3653 mutex_enter(nphm); 3654 } 3655 ASSERT(phm ? phm == PAGE_HASH_MUTEX(index) : 1); 3656 3657 3658 /* 3659 * grab page vnode mutex and remove it... 3660 */ 3661 vphm = page_vnode_mutex(vp); 3662 mutex_enter(vphm); 3663 3664 page_do_hashout(pp); 3665 3666 mutex_exit(vphm); 3667 if (phm == NULL) 3668 mutex_exit(nphm); 3669 3670 /* 3671 * Wake up processes waiting for this page. The page's 3672 * identity has been changed, and is probably not the 3673 * desired page any longer. 3674 */ 3675 sep = page_se_mutex(pp); 3676 mutex_enter(sep); 3677 pp->p_selock &= ~SE_EWANTED; 3678 if (CV_HAS_WAITERS(&pp->p_cv)) 3679 cv_broadcast(&pp->p_cv); 3680 mutex_exit(sep); 3681 } 3682 3683 /* 3684 * Add the page to the front of a linked list of pages 3685 * using the p_next & p_prev pointers for the list. 3686 * The caller is responsible for protecting the list pointers. 
3687 */ 3688 void 3689 page_add(page_t **ppp, page_t *pp) 3690 { 3691 ASSERT(PAGE_EXCL(pp) || (PAGE_SHARED(pp) && page_iolock_assert(pp))); 3692 3693 page_add_common(ppp, pp); 3694 } 3695 3696 3697 3698 /* 3699 * Common code for page_add() and mach_page_add() 3700 */ 3701 void 3702 page_add_common(page_t **ppp, page_t *pp) 3703 { 3704 if (*ppp == NULL) { 3705 pp->p_next = pp->p_prev = pp; 3706 } else { 3707 pp->p_next = *ppp; 3708 pp->p_prev = (*ppp)->p_prev; 3709 (*ppp)->p_prev = pp; 3710 pp->p_prev->p_next = pp; 3711 } 3712 *ppp = pp; 3713 } 3714 3715 3716 /* 3717 * Remove this page from a linked list of pages 3718 * using the p_next & p_prev pointers for the list. 3719 * 3720 * The caller is responsible for protecting the list pointers. 3721 */ 3722 void 3723 page_sub(page_t **ppp, page_t *pp) 3724 { 3725 ASSERT((PP_ISFREE(pp)) ? 1 : 3726 (PAGE_EXCL(pp)) || (PAGE_SHARED(pp) && page_iolock_assert(pp))); 3727 3728 if (*ppp == NULL || pp == NULL) { 3729 panic("page_sub: bad arg(s): pp %p, *ppp %p", 3730 (void *)pp, (void *)(*ppp)); 3731 /*NOTREACHED*/ 3732 } 3733 3734 page_sub_common(ppp, pp); 3735 } 3736 3737 3738 /* 3739 * Common code for page_sub() and mach_page_sub() 3740 */ 3741 void 3742 page_sub_common(page_t **ppp, page_t *pp) 3743 { 3744 if (*ppp == pp) 3745 *ppp = pp->p_next; /* go to next page */ 3746 3747 if (*ppp == pp) 3748 *ppp = NULL; /* page list is gone */ 3749 else { 3750 pp->p_prev->p_next = pp->p_next; 3751 pp->p_next->p_prev = pp->p_prev; 3752 } 3753 pp->p_prev = pp->p_next = pp; /* make pp a list of one */ 3754 } 3755 3756 3757 /* 3758 * Break page list cppp into two lists with npages in the first list. 3759 * The tail is returned in nppp. 
3760 */ 3761 void 3762 page_list_break(page_t **oppp, page_t **nppp, pgcnt_t npages) 3763 { 3764 page_t *s1pp = *oppp; 3765 page_t *s2pp; 3766 page_t *e1pp, *e2pp; 3767 long n = 0; 3768 3769 if (s1pp == NULL) { 3770 *nppp = NULL; 3771 return; 3772 } 3773 if (npages == 0) { 3774 *nppp = s1pp; 3775 *oppp = NULL; 3776 return; 3777 } 3778 for (n = 0, s2pp = *oppp; n < npages; n++) { 3779 s2pp = s2pp->p_next; 3780 } 3781 /* Fix head and tail of new lists */ 3782 e1pp = s2pp->p_prev; 3783 e2pp = s1pp->p_prev; 3784 s1pp->p_prev = e1pp; 3785 e1pp->p_next = s1pp; 3786 s2pp->p_prev = e2pp; 3787 e2pp->p_next = s2pp; 3788 3789 /* second list empty */ 3790 if (s2pp == s1pp) { 3791 *oppp = s1pp; 3792 *nppp = NULL; 3793 } else { 3794 *oppp = s1pp; 3795 *nppp = s2pp; 3796 } 3797 } 3798 3799 /* 3800 * Concatenate page list nppp onto the end of list ppp. 3801 */ 3802 void 3803 page_list_concat(page_t **ppp, page_t **nppp) 3804 { 3805 page_t *s1pp, *s2pp, *e1pp, *e2pp; 3806 3807 if (*nppp == NULL) { 3808 return; 3809 } 3810 if (*ppp == NULL) { 3811 *ppp = *nppp; 3812 return; 3813 } 3814 s1pp = *ppp; 3815 e1pp = s1pp->p_prev; 3816 s2pp = *nppp; 3817 e2pp = s2pp->p_prev; 3818 s1pp->p_prev = e2pp; 3819 e2pp->p_next = s1pp; 3820 e1pp->p_next = s2pp; 3821 s2pp->p_prev = e1pp; 3822 } 3823 3824 /* 3825 * return the next page in the page list 3826 */ 3827 page_t * 3828 page_list_next(page_t *pp) 3829 { 3830 return (pp->p_next); 3831 } 3832 3833 3834 /* 3835 * Add the page to the front of the linked list of pages 3836 * using p_vpnext/p_vpprev pointers for the list. 3837 * 3838 * The caller is responsible for protecting the lists. 
3839 */ 3840 void 3841 page_vpadd(page_t **ppp, page_t *pp) 3842 { 3843 if (*ppp == NULL) { 3844 pp->p_vpnext = pp->p_vpprev = pp; 3845 } else { 3846 pp->p_vpnext = *ppp; 3847 pp->p_vpprev = (*ppp)->p_vpprev; 3848 (*ppp)->p_vpprev = pp; 3849 pp->p_vpprev->p_vpnext = pp; 3850 } 3851 *ppp = pp; 3852 } 3853 3854 /* 3855 * Remove this page from the linked list of pages 3856 * using p_vpnext/p_vpprev pointers for the list. 3857 * 3858 * The caller is responsible for protecting the lists. 3859 */ 3860 void 3861 page_vpsub(page_t **ppp, page_t *pp) 3862 { 3863 if (*ppp == NULL || pp == NULL) { 3864 panic("page_vpsub: bad arg(s): pp %p, *ppp %p", 3865 (void *)pp, (void *)(*ppp)); 3866 /*NOTREACHED*/ 3867 } 3868 3869 if (*ppp == pp) 3870 *ppp = pp->p_vpnext; /* go to next page */ 3871 3872 if (*ppp == pp) 3873 *ppp = NULL; /* page list is gone */ 3874 else { 3875 pp->p_vpprev->p_vpnext = pp->p_vpnext; 3876 pp->p_vpnext->p_vpprev = pp->p_vpprev; 3877 } 3878 pp->p_vpprev = pp->p_vpnext = pp; /* make pp a list of one */ 3879 } 3880 3881 /* 3882 * Lock a physical page into memory "long term". Used to support "lock 3883 * in memory" functions. Accepts the page to be locked, and a cow variable 3884 * to indicate whether a the lock will travel to the new page during 3885 * a potential copy-on-write. 3886 */ 3887 int 3888 page_pp_lock( 3889 page_t *pp, /* page to be locked */ 3890 int cow, /* cow lock */ 3891 int kernel) /* must succeed -- ignore checking */ 3892 { 3893 int r = 0; /* result -- assume failure */ 3894 3895 ASSERT(PAGE_LOCKED(pp)); 3896 3897 page_struct_lock(pp); 3898 /* 3899 * Acquire the "freemem_lock" for availrmem. 
3900 */ 3901 if (cow) { 3902 mutex_enter(&freemem_lock); 3903 if ((availrmem > pages_pp_maximum) && 3904 (pp->p_cowcnt < (ushort_t)PAGE_LOCK_MAXIMUM)) { 3905 availrmem--; 3906 pages_locked++; 3907 mutex_exit(&freemem_lock); 3908 r = 1; 3909 if (++pp->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) { 3910 cmn_err(CE_WARN, 3911 "COW lock limit reached on pfn 0x%lx", 3912 page_pptonum(pp)); 3913 } 3914 } else 3915 mutex_exit(&freemem_lock); 3916 } else { 3917 if (pp->p_lckcnt) { 3918 if (pp->p_lckcnt < (ushort_t)PAGE_LOCK_MAXIMUM) { 3919 r = 1; 3920 if (++pp->p_lckcnt == 3921 (ushort_t)PAGE_LOCK_MAXIMUM) { 3922 cmn_err(CE_WARN, "Page lock limit " 3923 "reached on pfn 0x%lx", 3924 page_pptonum(pp)); 3925 } 3926 } 3927 } else { 3928 if (kernel) { 3929 /* availrmem accounting done by caller */ 3930 ++pp->p_lckcnt; 3931 r = 1; 3932 } else { 3933 mutex_enter(&freemem_lock); 3934 if (availrmem > pages_pp_maximum) { 3935 availrmem--; 3936 pages_locked++; 3937 ++pp->p_lckcnt; 3938 r = 1; 3939 } 3940 mutex_exit(&freemem_lock); 3941 } 3942 } 3943 } 3944 page_struct_unlock(pp); 3945 return (r); 3946 } 3947 3948 /* 3949 * Decommit a lock on a physical page frame. Account for cow locks if 3950 * appropriate. 3951 */ 3952 void 3953 page_pp_unlock( 3954 page_t *pp, /* page to be unlocked */ 3955 int cow, /* expect cow lock */ 3956 int kernel) /* this was a kernel lock */ 3957 { 3958 ASSERT(PAGE_LOCKED(pp)); 3959 3960 page_struct_lock(pp); 3961 /* 3962 * Acquire the "freemem_lock" for availrmem. 3963 * If cowcnt or lcknt is already 0 do nothing; i.e., we 3964 * could be called to unlock even if nothing is locked. This could 3965 * happen if locked file pages were truncated (removing the lock) 3966 * and the file was grown again and new pages faulted in; the new 3967 * pages are unlocked but the segment still thinks they're locked. 
3968 */ 3969 if (cow) { 3970 if (pp->p_cowcnt) { 3971 mutex_enter(&freemem_lock); 3972 pp->p_cowcnt--; 3973 availrmem++; 3974 pages_locked--; 3975 mutex_exit(&freemem_lock); 3976 } 3977 } else { 3978 if (pp->p_lckcnt && --pp->p_lckcnt == 0) { 3979 if (!kernel) { 3980 mutex_enter(&freemem_lock); 3981 availrmem++; 3982 pages_locked--; 3983 mutex_exit(&freemem_lock); 3984 } 3985 } 3986 } 3987 page_struct_unlock(pp); 3988 } 3989 3990 /* 3991 * This routine reserves availrmem for npages; 3992 * flags: KM_NOSLEEP or KM_SLEEP 3993 * returns 1 on success or 0 on failure 3994 */ 3995 int 3996 page_resv(pgcnt_t npages, uint_t flags) 3997 { 3998 mutex_enter(&freemem_lock); 3999 while (availrmem < tune.t_minarmem + npages) { 4000 if (flags & KM_NOSLEEP) { 4001 mutex_exit(&freemem_lock); 4002 return (0); 4003 } 4004 mutex_exit(&freemem_lock); 4005 page_needfree(npages); 4006 kmem_reap(); 4007 delay(hz >> 2); 4008 page_needfree(-(spgcnt_t)npages); 4009 mutex_enter(&freemem_lock); 4010 } 4011 availrmem -= npages; 4012 mutex_exit(&freemem_lock); 4013 return (1); 4014 } 4015 4016 /* 4017 * This routine unreserves availrmem for npages; 4018 */ 4019 void 4020 page_unresv(pgcnt_t npages) 4021 { 4022 mutex_enter(&freemem_lock); 4023 availrmem += npages; 4024 mutex_exit(&freemem_lock); 4025 } 4026 4027 /* 4028 * See Statement at the beginning of segvn_lockop() regarding 4029 * the way we handle cowcnts and lckcnts. 4030 * 4031 * Transfer cowcnt on 'opp' to cowcnt on 'npp' if the vpage 4032 * that breaks COW has PROT_WRITE. 4033 * 4034 * Note that, we may also break COW in case we are softlocking 4035 * on read access during physio; 4036 * in this softlock case, the vpage may not have PROT_WRITE. 4037 * So, we need to transfer lckcnt on 'opp' to lckcnt on 'npp' 4038 * if the vpage doesn't have PROT_WRITE. 4039 * 4040 * This routine is never called if we are stealing a page 4041 * in anon_private. 4042 * 4043 * The caller subtracted from availrmem for read only mapping. 
4044 * if lckcnt is 1 increment availrmem. 4045 */ 4046 void 4047 page_pp_useclaim( 4048 page_t *opp, /* original page frame losing lock */ 4049 page_t *npp, /* new page frame gaining lock */ 4050 uint_t write_perm) /* set if vpage has PROT_WRITE */ 4051 { 4052 int payback = 0; 4053 4054 ASSERT(PAGE_LOCKED(opp)); 4055 ASSERT(PAGE_LOCKED(npp)); 4056 4057 page_struct_lock(opp); 4058 4059 ASSERT(npp->p_cowcnt == 0); 4060 ASSERT(npp->p_lckcnt == 0); 4061 4062 /* Don't use claim if nothing is locked (see page_pp_unlock above) */ 4063 if ((write_perm && opp->p_cowcnt != 0) || 4064 (!write_perm && opp->p_lckcnt != 0)) { 4065 4066 if (write_perm) { 4067 npp->p_cowcnt++; 4068 ASSERT(opp->p_cowcnt != 0); 4069 opp->p_cowcnt--; 4070 } else { 4071 4072 ASSERT(opp->p_lckcnt != 0); 4073 4074 /* 4075 * We didn't need availrmem decremented if p_lckcnt on 4076 * original page is 1. Here, we are unlocking 4077 * read-only copy belonging to original page and 4078 * are locking a copy belonging to new page. 4079 */ 4080 if (opp->p_lckcnt == 1) 4081 payback = 1; 4082 4083 npp->p_lckcnt++; 4084 opp->p_lckcnt--; 4085 } 4086 } 4087 if (payback) { 4088 mutex_enter(&freemem_lock); 4089 availrmem++; 4090 pages_useclaim--; 4091 mutex_exit(&freemem_lock); 4092 } 4093 page_struct_unlock(opp); 4094 } 4095 4096 /* 4097 * Simple claim adjust functions -- used to support changes in 4098 * claims due to changes in access permissions. Used by segvn_setprot(). 
4099 */ 4100 int 4101 page_addclaim(page_t *pp) 4102 { 4103 int r = 0; /* result */ 4104 4105 ASSERT(PAGE_LOCKED(pp)); 4106 4107 page_struct_lock(pp); 4108 ASSERT(pp->p_lckcnt != 0); 4109 4110 if (pp->p_lckcnt == 1) { 4111 if (pp->p_cowcnt < (ushort_t)PAGE_LOCK_MAXIMUM) { 4112 --pp->p_lckcnt; 4113 r = 1; 4114 if (++pp->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) { 4115 cmn_err(CE_WARN, 4116 "COW lock limit reached on pfn 0x%lx", 4117 page_pptonum(pp)); 4118 } 4119 } 4120 } else { 4121 mutex_enter(&freemem_lock); 4122 if ((availrmem > pages_pp_maximum) && 4123 (pp->p_cowcnt < (ushort_t)PAGE_LOCK_MAXIMUM)) { 4124 --availrmem; 4125 ++pages_claimed; 4126 mutex_exit(&freemem_lock); 4127 --pp->p_lckcnt; 4128 r = 1; 4129 if (++pp->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) { 4130 cmn_err(CE_WARN, 4131 "COW lock limit reached on pfn 0x%lx", 4132 page_pptonum(pp)); 4133 } 4134 } else 4135 mutex_exit(&freemem_lock); 4136 } 4137 page_struct_unlock(pp); 4138 return (r); 4139 } 4140 4141 int 4142 page_subclaim(page_t *pp) 4143 { 4144 int r = 0; 4145 4146 ASSERT(PAGE_LOCKED(pp)); 4147 4148 page_struct_lock(pp); 4149 ASSERT(pp->p_cowcnt != 0); 4150 4151 if (pp->p_lckcnt) { 4152 if (pp->p_lckcnt < (ushort_t)PAGE_LOCK_MAXIMUM) { 4153 r = 1; 4154 /* 4155 * for availrmem 4156 */ 4157 mutex_enter(&freemem_lock); 4158 availrmem++; 4159 pages_claimed--; 4160 mutex_exit(&freemem_lock); 4161 4162 pp->p_cowcnt--; 4163 4164 if (++pp->p_lckcnt == (ushort_t)PAGE_LOCK_MAXIMUM) { 4165 cmn_err(CE_WARN, 4166 "Page lock limit reached on pfn 0x%lx", 4167 page_pptonum(pp)); 4168 } 4169 } 4170 } else { 4171 r = 1; 4172 pp->p_cowcnt--; 4173 pp->p_lckcnt++; 4174 } 4175 page_struct_unlock(pp); 4176 return (r); 4177 } 4178 4179 int 4180 page_addclaim_pages(page_t **ppa) 4181 { 4182 4183 pgcnt_t lckpgs = 0, pg_idx; 4184 4185 VM_STAT_ADD(pagecnt.pc_addclaim_pages); 4186 4187 mutex_enter(&page_llock); 4188 for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) { 4189 4190 ASSERT(PAGE_LOCKED(ppa[pg_idx])); 4191 
ASSERT(ppa[pg_idx]->p_lckcnt != 0); 4192 if (ppa[pg_idx]->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) { 4193 mutex_exit(&page_llock); 4194 return (0); 4195 } 4196 if (ppa[pg_idx]->p_lckcnt > 1) 4197 lckpgs++; 4198 } 4199 4200 if (lckpgs != 0) { 4201 mutex_enter(&freemem_lock); 4202 if (availrmem >= pages_pp_maximum + lckpgs) { 4203 availrmem -= lckpgs; 4204 pages_claimed += lckpgs; 4205 } else { 4206 mutex_exit(&freemem_lock); 4207 mutex_exit(&page_llock); 4208 return (0); 4209 } 4210 mutex_exit(&freemem_lock); 4211 } 4212 4213 for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) { 4214 ppa[pg_idx]->p_lckcnt--; 4215 ppa[pg_idx]->p_cowcnt++; 4216 } 4217 mutex_exit(&page_llock); 4218 return (1); 4219 } 4220 4221 int 4222 page_subclaim_pages(page_t **ppa) 4223 { 4224 pgcnt_t ulckpgs = 0, pg_idx; 4225 4226 VM_STAT_ADD(pagecnt.pc_subclaim_pages); 4227 4228 mutex_enter(&page_llock); 4229 for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) { 4230 4231 ASSERT(PAGE_LOCKED(ppa[pg_idx])); 4232 ASSERT(ppa[pg_idx]->p_cowcnt != 0); 4233 if (ppa[pg_idx]->p_lckcnt == (ushort_t)PAGE_LOCK_MAXIMUM) { 4234 mutex_exit(&page_llock); 4235 return (0); 4236 } 4237 if (ppa[pg_idx]->p_lckcnt != 0) 4238 ulckpgs++; 4239 } 4240 4241 if (ulckpgs != 0) { 4242 mutex_enter(&freemem_lock); 4243 availrmem += ulckpgs; 4244 pages_claimed -= ulckpgs; 4245 mutex_exit(&freemem_lock); 4246 } 4247 4248 for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) { 4249 ppa[pg_idx]->p_cowcnt--; 4250 ppa[pg_idx]->p_lckcnt++; 4251 4252 } 4253 mutex_exit(&page_llock); 4254 return (1); 4255 } 4256 4257 page_t * 4258 page_numtopp(pfn_t pfnum, se_t se) 4259 { 4260 page_t *pp; 4261 4262 retry: 4263 pp = page_numtopp_nolock(pfnum); 4264 if (pp == NULL) { 4265 return ((page_t *)NULL); 4266 } 4267 4268 /* 4269 * Acquire the appropriate lock on the page. 
4270 */ 4271 while (!page_lock(pp, se, (kmutex_t *)NULL, P_RECLAIM)) { 4272 if (page_pptonum(pp) != pfnum) 4273 goto retry; 4274 continue; 4275 } 4276 4277 if (page_pptonum(pp) != pfnum) { 4278 page_unlock(pp); 4279 goto retry; 4280 } 4281 4282 return (pp); 4283 } 4284 4285 page_t * 4286 page_numtopp_noreclaim(pfn_t pfnum, se_t se) 4287 { 4288 page_t *pp; 4289 4290 retry: 4291 pp = page_numtopp_nolock(pfnum); 4292 if (pp == NULL) { 4293 return ((page_t *)NULL); 4294 } 4295 4296 /* 4297 * Acquire the appropriate lock on the page. 4298 */ 4299 while (!page_lock(pp, se, (kmutex_t *)NULL, P_NO_RECLAIM)) { 4300 if (page_pptonum(pp) != pfnum) 4301 goto retry; 4302 continue; 4303 } 4304 4305 if (page_pptonum(pp) != pfnum) { 4306 page_unlock(pp); 4307 goto retry; 4308 } 4309 4310 return (pp); 4311 } 4312 4313 /* 4314 * This routine is like page_numtopp, but will only return page structs 4315 * for pages which are ok for loading into hardware using the page struct. 4316 */ 4317 page_t * 4318 page_numtopp_nowait(pfn_t pfnum, se_t se) 4319 { 4320 page_t *pp; 4321 4322 retry: 4323 pp = page_numtopp_nolock(pfnum); 4324 if (pp == NULL) { 4325 return ((page_t *)NULL); 4326 } 4327 4328 /* 4329 * Try to acquire the appropriate lock on the page. 4330 */ 4331 if (PP_ISFREE(pp)) 4332 pp = NULL; 4333 else { 4334 if (!page_trylock(pp, se)) 4335 pp = NULL; 4336 else { 4337 if (page_pptonum(pp) != pfnum) { 4338 page_unlock(pp); 4339 goto retry; 4340 } 4341 if (PP_ISFREE(pp)) { 4342 page_unlock(pp); 4343 pp = NULL; 4344 } 4345 } 4346 } 4347 return (pp); 4348 } 4349 4350 /* 4351 * Returns a count of dirty pages that are in the process 4352 * of being written out. If 'cleanit' is set, try to push the page. 
4353 */ 4354 pgcnt_t 4355 page_busy(int cleanit) 4356 { 4357 page_t *page0 = page_first(); 4358 page_t *pp = page0; 4359 pgcnt_t nppbusy = 0; 4360 u_offset_t off; 4361 4362 do { 4363 vnode_t *vp = pp->p_vnode; 4364 4365 /* 4366 * A page is a candidate for syncing if it is: 4367 * 4368 * (a) On neither the freelist nor the cachelist 4369 * (b) Hashed onto a vnode 4370 * (c) Not a kernel page 4371 * (d) Dirty 4372 * (e) Not part of a swapfile 4373 * (f) a page which belongs to a real vnode; eg has a non-null 4374 * v_vfsp pointer. 4375 * (g) Backed by a filesystem which doesn't have a 4376 * stubbed-out sync operation 4377 */ 4378 if (!PP_ISFREE(pp) && vp != NULL && vp != &kvp && 4379 hat_ismod(pp) && !IS_SWAPVP(vp) && vp->v_vfsp != NULL && 4380 vfs_can_sync(vp->v_vfsp)) { 4381 nppbusy++; 4382 vfs_syncprogress(); 4383 4384 if (!cleanit) 4385 continue; 4386 if (!page_trylock(pp, SE_EXCL)) 4387 continue; 4388 4389 if (PP_ISFREE(pp) || vp == NULL || IS_SWAPVP(vp) || 4390 pp->p_lckcnt != 0 || pp->p_cowcnt != 0 || 4391 !(hat_pagesync(pp, 4392 HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD) & P_MOD)) { 4393 page_unlock(pp); 4394 continue; 4395 } 4396 off = pp->p_offset; 4397 VN_HOLD(vp); 4398 page_unlock(pp); 4399 (void) VOP_PUTPAGE(vp, off, PAGESIZE, 4400 B_ASYNC | B_FREE, kcred); 4401 VN_RELE(vp); 4402 } 4403 } while ((pp = page_next(pp)) != page0); 4404 4405 return (nppbusy); 4406 } 4407 4408 void page_invalidate_pages(void); 4409 4410 /* 4411 * callback handler to vm sub-system 4412 * 4413 * callers make sure no recursive entries to this func. 4414 */ 4415 /*ARGSUSED*/ 4416 boolean_t 4417 callb_vm_cpr(void *arg, int code) 4418 { 4419 if (code == CB_CODE_CPR_CHKPT) 4420 page_invalidate_pages(); 4421 return (B_TRUE); 4422 } 4423 4424 /* 4425 * Invalidate all pages of the system. 4426 * It shouldn't be called until all user page activities are all stopped. 
4427 */ 4428 void 4429 page_invalidate_pages() 4430 { 4431 page_t *pp; 4432 page_t *page0; 4433 pgcnt_t nbusypages; 4434 int retry = 0; 4435 const int MAXRETRIES = 4; 4436 #if defined(__sparc) 4437 extern struct vnode prom_ppages; 4438 #endif /* __sparc */ 4439 4440 top: 4441 /* 4442 * Flush dirty pages and destory the clean ones. 4443 */ 4444 nbusypages = 0; 4445 4446 pp = page0 = page_first(); 4447 do { 4448 struct vnode *vp; 4449 u_offset_t offset; 4450 int mod; 4451 4452 /* 4453 * skip the page if it has no vnode or the page associated 4454 * with the kernel vnode or prom allocated kernel mem. 4455 */ 4456 #if defined(__sparc) 4457 if ((vp = pp->p_vnode) == NULL || vp == &kvp || 4458 vp == &prom_ppages) 4459 #else /* x86 doesn't have prom or prom_ppage */ 4460 if ((vp = pp->p_vnode) == NULL || vp == &kvp) 4461 #endif /* __sparc */ 4462 continue; 4463 4464 /* 4465 * skip the page which is already free invalidated. 4466 */ 4467 if (PP_ISFREE(pp) && PP_ISAGED(pp)) 4468 continue; 4469 4470 /* 4471 * skip pages that are already locked or can't be "exclusively" 4472 * locked or are already free. After we lock the page, check 4473 * the free and age bits again to be sure it's not destroied 4474 * yet. 4475 * To achieve max. parallelization, we use page_trylock instead 4476 * of page_lock so that we don't get block on individual pages 4477 * while we have thousands of other pages to process. 4478 */ 4479 if (!page_trylock(pp, SE_EXCL)) { 4480 nbusypages++; 4481 continue; 4482 } else if (PP_ISFREE(pp)) { 4483 if (!PP_ISAGED(pp)) { 4484 page_destroy_free(pp); 4485 } else { 4486 page_unlock(pp); 4487 } 4488 continue; 4489 } 4490 /* 4491 * Is this page involved in some I/O? shared? 4492 * 4493 * The page_struct_lock need not be acquired to 4494 * examine these fields since the page has an 4495 * "exclusive" lock. 
4496 */ 4497 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { 4498 page_unlock(pp); 4499 continue; 4500 } 4501 4502 if (vp->v_type == VCHR) { 4503 panic("vp->v_type == VCHR"); 4504 /*NOTREACHED*/ 4505 } 4506 4507 if (!page_try_demote_pages(pp)) { 4508 page_unlock(pp); 4509 continue; 4510 } 4511 4512 /* 4513 * Check the modified bit. Leave the bits alone in hardware 4514 * (they will be modified if we do the putpage). 4515 */ 4516 mod = (hat_pagesync(pp, HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD) 4517 & P_MOD); 4518 if (mod) { 4519 offset = pp->p_offset; 4520 /* 4521 * Hold the vnode before releasing the page lock 4522 * to prevent it from being freed and re-used by 4523 * some other thread. 4524 */ 4525 VN_HOLD(vp); 4526 page_unlock(pp); 4527 /* 4528 * No error return is checked here. Callers such as 4529 * cpr deals with the dirty pages at the dump time 4530 * if this putpage fails. 4531 */ 4532 (void) VOP_PUTPAGE(vp, offset, PAGESIZE, B_INVAL, 4533 kcred); 4534 VN_RELE(vp); 4535 } else { 4536 page_destroy(pp, 0); 4537 } 4538 } while ((pp = page_next(pp)) != page0); 4539 if (nbusypages && retry++ < MAXRETRIES) { 4540 delay(1); 4541 goto top; 4542 } 4543 } 4544 4545 /* 4546 * Replace the page "old" with the page "new" on the page hash and vnode lists 4547 * 4548 * the replacemnt must be done in place, ie the equivalent sequence: 4549 * 4550 * vp = old->p_vnode; 4551 * off = old->p_offset; 4552 * page_do_hashout(old) 4553 * page_do_hashin(new, vp, off) 4554 * 4555 * doesn't work, since 4556 * 1) if old is the only page on the vnode, the v_pages list has a window 4557 * where it looks empty. This will break file system assumptions. 4558 * and 4559 * 2) pvn_vplist_dirty() can't deal with pages moving on the v_pages list. 
4560 */ 4561 static void 4562 page_do_relocate_hash(page_t *new, page_t *old) 4563 { 4564 page_t **hash_list; 4565 vnode_t *vp = old->p_vnode; 4566 kmutex_t *sep; 4567 4568 ASSERT(PAGE_EXCL(old)); 4569 ASSERT(PAGE_EXCL(new)); 4570 ASSERT(vp != NULL); 4571 ASSERT(MUTEX_HELD(page_vnode_mutex(vp))); 4572 ASSERT(MUTEX_HELD(PAGE_HASH_MUTEX(PAGE_HASH_FUNC(vp, old->p_offset)))); 4573 4574 /* 4575 * First find old page on the page hash list 4576 */ 4577 hash_list = &page_hash[PAGE_HASH_FUNC(vp, old->p_offset)]; 4578 4579 for (;;) { 4580 if (*hash_list == old) 4581 break; 4582 if (*hash_list == NULL) { 4583 panic("page_do_hashout"); 4584 /*NOTREACHED*/ 4585 } 4586 hash_list = &(*hash_list)->p_hash; 4587 } 4588 4589 /* 4590 * update new and replace old with new on the page hash list 4591 */ 4592 new->p_vnode = old->p_vnode; 4593 new->p_offset = old->p_offset; 4594 new->p_hash = old->p_hash; 4595 *hash_list = new; 4596 4597 if ((new->p_vnode->v_flag & VISSWAP) != 0) 4598 PP_SETSWAP(new); 4599 4600 /* 4601 * replace old with new on the vnode's page list 4602 */ 4603 if (old->p_vpnext == old) { 4604 new->p_vpnext = new; 4605 new->p_vpprev = new; 4606 } else { 4607 new->p_vpnext = old->p_vpnext; 4608 new->p_vpprev = old->p_vpprev; 4609 new->p_vpnext->p_vpprev = new; 4610 new->p_vpprev->p_vpnext = new; 4611 } 4612 if (vp->v_pages == old) 4613 vp->v_pages = new; 4614 4615 /* 4616 * clear out the old page 4617 */ 4618 old->p_hash = NULL; 4619 old->p_vpnext = NULL; 4620 old->p_vpprev = NULL; 4621 old->p_vnode = NULL; 4622 PP_CLRSWAP(old); 4623 old->p_offset = (u_offset_t)-1; 4624 page_clr_all_props(old); 4625 4626 /* 4627 * Wake up processes waiting for this page. The page's 4628 * identity has been changed, and is probably not the 4629 * desired page any longer. 
4630 */ 4631 sep = page_se_mutex(old); 4632 mutex_enter(sep); 4633 old->p_selock &= ~SE_EWANTED; 4634 if (CV_HAS_WAITERS(&old->p_cv)) 4635 cv_broadcast(&old->p_cv); 4636 mutex_exit(sep); 4637 } 4638 4639 /* 4640 * This function moves the identity of page "pp_old" to page "pp_new". 4641 * Both pages must be locked on entry. "pp_new" is free, has no identity, 4642 * and need not be hashed out from anywhere. 4643 */ 4644 void 4645 page_relocate_hash(page_t *pp_new, page_t *pp_old) 4646 { 4647 vnode_t *vp = pp_old->p_vnode; 4648 u_offset_t off = pp_old->p_offset; 4649 kmutex_t *phm, *vphm; 4650 4651 /* 4652 * Rehash two pages 4653 */ 4654 ASSERT(PAGE_EXCL(pp_old)); 4655 ASSERT(PAGE_EXCL(pp_new)); 4656 ASSERT(vp != NULL); 4657 ASSERT(pp_new->p_vnode == NULL); 4658 4659 /* 4660 * hashout then hashin while holding the mutexes 4661 */ 4662 phm = PAGE_HASH_MUTEX(PAGE_HASH_FUNC(vp, off)); 4663 mutex_enter(phm); 4664 vphm = page_vnode_mutex(vp); 4665 mutex_enter(vphm); 4666 4667 page_do_relocate_hash(pp_new, pp_old); 4668 4669 mutex_exit(vphm); 4670 mutex_exit(phm); 4671 4672 /* 4673 * The page_struct_lock need not be acquired for lckcnt and 4674 * cowcnt since the page has an "exclusive" lock. 4675 */ 4676 ASSERT(pp_new->p_lckcnt == 0); 4677 ASSERT(pp_new->p_cowcnt == 0); 4678 pp_new->p_lckcnt = pp_old->p_lckcnt; 4679 pp_new->p_cowcnt = pp_old->p_cowcnt; 4680 pp_old->p_lckcnt = pp_old->p_cowcnt = 0; 4681 4682 /* The following comment preserved from page_flip(). */ 4683 /* XXX - Do we need to protect fsdata? */ 4684 pp_new->p_fsdata = pp_old->p_fsdata; 4685 } 4686 4687 /* 4688 * Helper routine used to lock all remaining members of a 4689 * large page. The caller is responsible for passing in a locked 4690 * pp. If pp is a large page, then it succeeds in locking all the 4691 * remaining constituent pages or it returns with only the 4692 * original page locked. 4693 * 4694 * Returns 1 on success, 0 on failure. 
 *
 * If success is returned this routine guarantees p_szc for all constituent
 * pages of a large page pp belongs to can't change. To achieve this we
 * recheck szc of pp after locking all constituent pages and retry if szc
 * changed (it could only decrease). Since hat_page_demote() needs an EXCL
 * lock on one of constituent pages it can't be running after all constituent
 * pages are locked.  hat_page_demote() with a lock on a constituent page
 * outside of this large page (i.e. pp belonged to a larger large page) is
 * already done with all constituent pages of pp since the root's p_szc is
 * changed last. Therefore no need to synchronize with hat_page_demote() that
 * locked a constituent page outside of pp's current large page.
 */
#ifdef DEBUG
/*
 * DEBUG-only fault injection knob: when nonzero, group_page_trylock() fails
 * whenever gethrtime() is an exact multiple of this value.
 */
uint32_t gpg_trylock_mtbf = 0;
#endif

int
group_page_trylock(page_t *pp, se_t se)
{
	page_t *tpp;
	pgcnt_t npgs, i, j;
	uint_t pszc = pp->p_szc;

#ifdef DEBUG
	/* Simulated failure, see gpg_trylock_mtbf above. */
	if (gpg_trylock_mtbf && !(gethrtime() % gpg_trylock_mtbf)) {
		return (0);
	}
#endif

	/* Only the group leader for pszc may be used to lock the group. */
	if (pp != PP_GROUPLEADER(pp, pszc)) {
		return (0);
	}

retry:
	/* The caller already holds pp itself in mode `se'. */
	ASSERT(PAGE_LOCKED_SE(pp, se));
	ASSERT(!PP_ISFREE(pp));
	if (pszc == 0) {
		/* A PAGESIZE page has no other constituents to lock. */
		return (1);
	}
	npgs = page_get_pagecnt(pszc);
	tpp = pp + 1;
	for (i = 1; i < npgs; i++, tpp++) {
		if (!page_trylock(tpp, se)) {
			/*
			 * Back out: drop the locks taken on constituents
			 * 1 .. i-1.  pp itself stays locked by the caller.
			 */
			tpp = pp + 1;
			for (j = 1; j < i; j++, tpp++) {
				page_unlock(tpp);
			}
			return (0);
		}
	}
	if (pp->p_szc != pszc) {
		/*
		 * p_szc changed while we were collecting the locks.
		 * Drop every constituent lock we took and start over
		 * with the new (smaller) size code.
		 */
		ASSERT(pp->p_szc < pszc);
		ASSERT(pp->p_vnode != NULL && pp->p_vnode != &kvp &&
		    !IS_SWAPFSVP(pp->p_vnode));
		tpp = pp + 1;
		for (i = 1; i < npgs; i++, tpp++) {
			page_unlock(tpp);
		}
		pszc = pp->p_szc;
		goto retry;
	}
	return (1);
}

/*
 * Drop the locks on every constituent page of the large page rooted at pp
 * except pp itself (the loop starts at the second constituent); pp must be
 * the locked, non-free root page and is left locked.
 */
void
group_page_unlock(page_t *pp)
{
	page_t *tpp;
	pgcnt_t npgs, i;

	ASSERT(PAGE_LOCKED(pp));
	ASSERT(!PP_ISFREE(pp));
	ASSERT(pp == PP_PAGEROOT(pp));
	npgs = page_get_pagecnt(pp->p_szc);
	for (i = 1, tpp = pp + 1; i < npgs; i++, tpp++) {
		page_unlock(tpp);
	}
}

/*
 * Relocate the (possibly large) page *target onto replacement page(s).
 *
 * target	in/out: root page to relocate, held SE_EXCL by the caller;
 *		on success *target is the list of relocated-from pages
 * replacement	in/out: if *replacement is NULL a replacement is obtained
 *		here and stored back; otherwise the caller's (locked)
 *		page(s) are used
 * grouplock	nonzero: lock all constituents of the target here via
 *		group_page_trylock()
 * nrelocp	out: zeroed on entry, set to the number of pages on success
 * lgrp		passed through to page_get_replacement_page()
 *
 * returns
 * 0		: on success and *nrelocp is number of relocated PAGESIZE pages
 * ERANGE	: this is not a base page
 * EBUSY	: failure to get locks on the page/pages
 * ENOMEM	: failure to obtain replacement pages
 * EAGAIN	: OBP has not yet completed its boot-time handoff to the kernel
 *
 * Return with all constituent members of target and replacement
 * SE_EXCL locked. It is the callers responsibility to drop the
 * locks.
 */
int
do_page_relocate(
	page_t **target,
	page_t **replacement,
	int grouplock,
	spgcnt_t *nrelocp,
	lgrp_t *lgrp)
{
#ifdef DEBUG
	page_t *first_repl;
#endif /* DEBUG */
	page_t *repl;
	page_t *targ;
	page_t *pl = NULL;
	uint_t ppattr;
	pfn_t pfn, repl_pfn;
	uint_t szc;
	spgcnt_t npgs, i;
	int repl_contig = 0;
	uint_t flags = 0;
	spgcnt_t dofree = 0;

	*nrelocp = 0;

#if defined(__sparc)
	/*
	 * We need to wait till OBP has completed
	 * its boot-time handoff of its resources to the kernel
	 * before we allow page relocation
	 */
	if (page_relocate_ready == 0) {
		return (EAGAIN);
	}
#endif

	/*
	 * If this is not a base page,
	 * just return with 0x0 pages relocated.
	 */
	targ = *target;
	ASSERT(PAGE_EXCL(targ));
	ASSERT(!PP_ISFREE(targ));
	szc = targ->p_szc;
	ASSERT(szc < mmu_page_sizes);
	VM_STAT_ADD(vmm_vmstats.ppr_reloc[szc]);
	pfn = targ->p_pagenum;
	if (pfn != PFN_BASE(pfn, szc)) {
		VM_STAT_ADD(vmm_vmstats.ppr_relocnoroot[szc]);
		return (ERANGE);
	}

	/*
	 * A caller-supplied replacement whose size code is at least the
	 * target's is treated as physically contiguous (walked with repl++
	 * below); it must then start on a base pfn for szc as well.
	 */
	if ((repl = *replacement) != NULL && repl->p_szc >= szc) {
		repl_pfn = repl->p_pagenum;
		if (repl_pfn != PFN_BASE(repl_pfn, szc)) {
			VM_STAT_ADD(vmm_vmstats.ppr_reloc_replnoroot[szc]);
			return (ERANGE);
		}
		repl_contig = 1;
	}

	/*
	 * We must lock all members of this large page or we cannot
	 * relocate any part of it.
	 */
	if (grouplock != 0 && !group_page_trylock(targ, SE_EXCL)) {
		VM_STAT_ADD(vmm_vmstats.ppr_relocnolock[targ->p_szc]);
		return (EBUSY);
	}

	/*
	 * reread szc it could have been decreased before
	 * group_page_trylock() was done.
	 */
	szc = targ->p_szc;
	ASSERT(szc < mmu_page_sizes);
	VM_STAT_ADD(vmm_vmstats.ppr_reloc[szc]);
	ASSERT(pfn == PFN_BASE(pfn, szc));

	npgs = page_get_pagecnt(targ->p_szc);

	if (repl == NULL) {
		/*
		 * No replacement supplied: reserve npgs pages and get one.
		 * dofree remembers the reservation so that the failure
		 * paths can give it back with page_create_putback().
		 */
		dofree = npgs;		/* Size of target page in MMU pages */
		if (!page_create_wait(dofree, 0)) {
			if (grouplock != 0) {
				group_page_unlock(targ);
			}
			VM_STAT_ADD(vmm_vmstats.ppr_relocnomem[szc]);
			return (ENOMEM);
		}

		/*
		 * seg kmem pages require that the target and replacement
		 * page be the same pagesize.
		 */
		flags = (targ->p_vnode == &kvp) ? PGR_SAMESZC : 0;
		repl = page_get_replacement_page(targ, lgrp, flags);
		if (repl == NULL) {
			if (grouplock != 0) {
				group_page_unlock(targ);
			}
			page_create_putback(dofree);
			VM_STAT_ADD(vmm_vmstats.ppr_relocnomem[szc]);
			return (ENOMEM);
		}
	}
#ifdef DEBUG
	else {
		ASSERT(PAGE_LOCKED(repl));
	}
#endif /* DEBUG */

#if defined(__sparc)
	/*
	 * Let hat_page_relocate() complete the relocation if it's kernel page
	 */
	if (targ->p_vnode == &kvp) {
		*replacement = repl;
		if (hat_page_relocate(target, replacement, nrelocp) != 0) {
			if (grouplock != 0) {
				group_page_unlock(targ);
			}
			/*
			 * Only free the replacement if it was allocated
			 * here; a caller-supplied one is left alone.
			 */
			if (dofree) {
				*replacement = NULL;
				page_free_replacement_page(repl);
				page_create_putback(dofree);
			}
			VM_STAT_ADD(vmm_vmstats.ppr_krelocfail[szc]);
			return (EAGAIN);
		}
		VM_STAT_ADD(vmm_vmstats.ppr_relocok[szc]);
		return (0);
	}
#else
#if defined(lint)
	dofree = dofree;
#endif
#endif

#ifdef DEBUG
	first_repl = repl;
#endif /* DEBUG */

	/*
	 * Move the constituent pages one PAGESIZE page at a time, building
	 * the list of relocated-from pages in pl.
	 */
	for (i = 0; i < npgs; i++) {
		ASSERT(PAGE_EXCL(targ));
		ASSERT(targ->p_slckcnt == 0);
		ASSERT(repl->p_slckcnt == 0);

		(void) hat_pageunload(targ, HAT_FORCE_PGUNLOAD);

		ASSERT(hat_page_getshare(targ) == 0);
		ASSERT(!PP_ISFREE(targ));
		ASSERT(targ->p_pagenum == (pfn + i));
		ASSERT(repl_contig == 0 ||
		    repl->p_pagenum == (repl_pfn + i));

		/*
		 * Copy the page contents and attributes then
		 * relocate the page in the page hash.
		 */
		ppcopy(targ, repl);
		ppattr = hat_page_getattr(targ, (P_MOD | P_REF | P_RO));
		page_clr_all_props(repl);
		page_set_props(repl, ppattr);
		page_relocate_hash(repl, targ);

		ASSERT(hat_page_getshare(targ) == 0);
		ASSERT(hat_page_getshare(repl) == 0);
		/*
		 * Now clear the props on targ, after the
		 * page_relocate_hash(), they no longer
		 * have any meaning.
		 */
		page_clr_all_props(targ);
		ASSERT(targ->p_next == targ);
		ASSERT(targ->p_prev == targ);
		page_list_concat(&pl, &targ);

		/*
		 * Targets are adjacent page_t's; replacements are either
		 * adjacent (contiguous case) or chained through p_next.
		 */
		targ++;
		if (repl_contig != 0) {
			repl++;
		} else {
			repl = repl->p_next;
		}
	}
	/* assert that we have come full circle with repl */
	ASSERT(repl_contig == 1 || first_repl == repl);

	/* Hand the relocated-from list (and a new replacement) back. */
	*target = pl;
	if (*replacement == NULL) {
		ASSERT(first_repl == repl);
		*replacement = repl;
	}
	VM_STAT_ADD(vmm_vmstats.ppr_relocok[szc]);
	*nrelocp = npgs;
	return (0);
}
/*
 * On success returns 0 and *nrelocp the number of PAGESIZE pages relocated.
4984 */ 4985 int 4986 page_relocate( 4987 page_t **target, 4988 page_t **replacement, 4989 int grouplock, 4990 int freetarget, 4991 spgcnt_t *nrelocp, 4992 lgrp_t *lgrp) 4993 { 4994 spgcnt_t ret; 4995 4996 /* do_page_relocate returns 0 on success or errno value */ 4997 ret = do_page_relocate(target, replacement, grouplock, nrelocp, lgrp); 4998 4999 if (ret != 0 || freetarget == 0) { 5000 return (ret); 5001 } 5002 if (*nrelocp == 1) { 5003 ASSERT(*target != NULL); 5004 page_free(*target, 1); 5005 } else { 5006 page_t *tpp = *target; 5007 uint_t szc = tpp->p_szc; 5008 pgcnt_t npgs = page_get_pagecnt(szc); 5009 ASSERT(npgs > 1); 5010 ASSERT(szc != 0); 5011 do { 5012 ASSERT(PAGE_EXCL(tpp)); 5013 ASSERT(!hat_page_is_mapped(tpp)); 5014 ASSERT(tpp->p_szc == szc); 5015 PP_SETFREE(tpp); 5016 PP_SETAGED(tpp); 5017 npgs--; 5018 } while ((tpp = tpp->p_next) != *target); 5019 ASSERT(npgs == 0); 5020 page_list_add_pages(*target, 0); 5021 npgs = page_get_pagecnt(szc); 5022 page_create_putback(npgs); 5023 } 5024 return (ret); 5025 } 5026 5027 /* 5028 * it is up to the caller to deal with pcf accounting. 5029 */ 5030 void 5031 page_free_replacement_page(page_t *pplist) 5032 { 5033 page_t *pp; 5034 5035 while (pplist != NULL) { 5036 /* 5037 * pp_targ is a linked list. 
5038 */ 5039 pp = pplist; 5040 if (pp->p_szc == 0) { 5041 page_sub(&pplist, pp); 5042 page_clr_all_props(pp); 5043 PP_SETFREE(pp); 5044 PP_SETAGED(pp); 5045 page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL); 5046 page_unlock(pp); 5047 VM_STAT_ADD(pagecnt.pc_free_replacement_page[0]); 5048 } else { 5049 spgcnt_t curnpgs = page_get_pagecnt(pp->p_szc); 5050 page_t *tpp; 5051 page_list_break(&pp, &pplist, curnpgs); 5052 tpp = pp; 5053 do { 5054 ASSERT(PAGE_EXCL(tpp)); 5055 ASSERT(!hat_page_is_mapped(tpp)); 5056 page_clr_all_props(pp); 5057 PP_SETFREE(tpp); 5058 PP_SETAGED(tpp); 5059 } while ((tpp = tpp->p_next) != pp); 5060 page_list_add_pages(pp, 0); 5061 VM_STAT_ADD(pagecnt.pc_free_replacement_page[1]); 5062 } 5063 } 5064 } 5065 5066 /* 5067 * Relocate target to non-relocatable replacement page. 5068 */ 5069 int 5070 page_relocate_cage(page_t **target, page_t **replacement) 5071 { 5072 page_t *tpp, *rpp; 5073 spgcnt_t pgcnt, npgs; 5074 int result; 5075 5076 tpp = *target; 5077 5078 ASSERT(PAGE_EXCL(tpp)); 5079 ASSERT(tpp->p_szc == 0); 5080 5081 pgcnt = btop(page_get_pagesize(tpp->p_szc)); 5082 5083 do { 5084 (void) page_create_wait(pgcnt, PG_WAIT | PG_NORELOC); 5085 rpp = page_get_replacement_page(tpp, NULL, PGR_NORELOC); 5086 if (rpp == NULL) { 5087 page_create_putback(pgcnt); 5088 kcage_cageout_wakeup(); 5089 } 5090 } while (rpp == NULL); 5091 5092 ASSERT(PP_ISNORELOC(rpp)); 5093 5094 result = page_relocate(&tpp, &rpp, 0, 1, &npgs, NULL); 5095 5096 if (result == 0) { 5097 *replacement = rpp; 5098 if (pgcnt != npgs) 5099 panic("page_relocate_cage: partial relocation"); 5100 } 5101 5102 return (result); 5103 } 5104 5105 /* 5106 * Release the page lock on a page, place on cachelist 5107 * tail if no longer mapped. Caller can let us know if 5108 * the page is known to be clean. 
 */
/*
 * pp		locked, non-free page that belongs to a vnode
 * checkmod	nonzero: a modified page is only unlocked, not disposed of
 *
 * Returns PGREL_CLEAN if the page was handed to VN_DISPOSE(B_FREE),
 * PGREL_MOD if it qualified but was found modified (checkmod set), and
 * PGREL_NOTREL if it did not qualify.  In the latter two cases the page
 * is simply unlocked.
 */
int
page_release(page_t *pp, int checkmod)
{
	int status;

	ASSERT(PAGE_LOCKED(pp) && !PP_ISFREE(pp) &&
	    (pp->p_vnode != NULL));

	/*
	 * The order of the tests matters: page_tryupgrade() is only
	 * attempted for an unmapped, non-swap page that is held SHARED,
	 * and the mapping check is repeated once the lock is exclusive.
	 */
	if (!hat_page_is_mapped(pp) && !IS_SWAPVP(pp->p_vnode) &&
	    ((PAGE_SHARED(pp) && page_tryupgrade(pp)) || PAGE_EXCL(pp)) &&
	    pp->p_lckcnt == 0 && pp->p_cowcnt == 0 &&
	    !hat_page_is_mapped(pp)) {

		/*
		 * If page is modified, unlock it
		 *
		 * (p_nrm & P_MOD) bit has the latest stuff because:
		 * (1) We found that this page doesn't have any mappings
		 *	_after_ holding SE_EXCL and
		 * (2) We didn't drop SE_EXCL lock after the check in (1)
		 */
		if (checkmod && hat_ismod(pp)) {
			page_unlock(pp);
			status = PGREL_MOD;
		} else {
			/*LINTED: constant in conditional context*/
			VN_DISPOSE(pp, B_FREE, 0, kcred);
			status = PGREL_CLEAN;
		}
	} else {
		page_unlock(pp);
		status = PGREL_NOTREL;
	}
	return (status);
}

/*
 * Given a constituent page, try to demote the large page on the freelist.
 *
 * Returns nonzero if the page could be demoted successfully. Returns with
 * the constituent page still locked.
 */
int
page_try_demote_free_pages(page_t *pp)
{
	page_t *rootpp = pp;
	pfn_t	pfn = page_pptonum(pp);
	spgcnt_t npgs;
	uint_t	szc = pp->p_szc;

	ASSERT(PP_ISFREE(pp));
	ASSERT(PAGE_EXCL(pp));

	/*
	 * Adjust rootpp and lock it, if `pp' is not the base
	 * constituent page.
	 */
	npgs = page_get_pagecnt(pp->p_szc);
	if (npgs == 1) {
		/* Already a PAGESIZE page: reported as "not demoted". */
		return (0);
	}

	if (!IS_P2ALIGNED(pfn, npgs)) {
		pfn = P2ALIGN(pfn, npgs);
		rootpp = page_numtopp_nolock(pfn);
	}

	if (pp != rootpp && !page_trylock(rootpp, SE_EXCL)) {
		return (0);
	}

	/* The size code changed underneath us; give up. */
	if (rootpp->p_szc != szc) {
		if (pp != rootpp)
			page_unlock(rootpp);
		return (0);
	}

	page_demote_free_pages(rootpp);

	/* Only the root lock taken here is dropped; pp stays locked. */
	if (pp != rootpp)
		page_unlock(rootpp);

	ASSERT(PP_ISFREE(pp));
	ASSERT(PAGE_EXCL(pp));
	return (1);
}

/*
 * Given a constituent page, try to demote the large page.
 *
 * Returns nonzero if the page could be demoted successfully. Returns with
 * the constituent page still locked.
 */
int
page_try_demote_pages(page_t *pp)
{
	page_t *tpp, *rootpp = pp;
	pfn_t	pfn = page_pptonum(pp);
	spgcnt_t i, npgs;
	uint_t	szc = pp->p_szc;
	vnode_t *vp = pp->p_vnode;

	ASSERT(PAGE_EXCL(pp));

	VM_STAT_ADD(pagecnt.pc_try_demote_pages[0]);

	/* Nothing to demote. */
	if (pp->p_szc == 0) {
		VM_STAT_ADD(pagecnt.pc_try_demote_pages[1]);
		return (1);
	}

	/*
	 * Pages of an ordinary vnode (not swapfs, not kvp) are demoted by
	 * page_demote_vp_pages(), which needs only the lock on pp.
	 */
	if (vp != NULL && !IS_SWAPFSVP(vp) && vp != &kvp) {
		VM_STAT_ADD(pagecnt.pc_try_demote_pages[2]);
		page_demote_vp_pages(pp);
		ASSERT(pp->p_szc == 0);
		return (1);
	}

	/*
	 * Adjust rootpp if passed in is not the base
	 * constituent page.
	 */
	npgs = page_get_pagecnt(pp->p_szc);
	ASSERT(npgs > 1);
	if (!IS_P2ALIGNED(pfn, npgs)) {
		pfn = P2ALIGN(pfn, npgs);
		rootpp = page_numtopp_nolock(pfn);
		VM_STAT_ADD(pagecnt.pc_try_demote_pages[3]);
		ASSERT(rootpp->p_vnode != NULL);
		ASSERT(rootpp->p_szc == szc);
	}

	/*
	 * We can't demote kernel pages since we can't hat_unload()
	 * the mappings.
	 */
	if (rootpp->p_vnode == &kvp)
		return (0);

	/*
	 * Attempt to lock all constituent pages except the page passed
	 * in since it's already locked.
	 */
	for (tpp = rootpp, i = 0; i < npgs; i++, tpp++) {
		ASSERT(!PP_ISFREE(tpp));
		ASSERT(tpp->p_vnode != NULL);

		if (tpp != pp && !page_trylock(tpp, SE_EXCL))
			break;
		ASSERT(tpp->p_szc == rootpp->p_szc);
		ASSERT(page_pptonum(tpp) == page_pptonum(rootpp) + i);
	}

	/*
	 * If we failed to lock them all then unlock what we have
	 * locked so far and bail.
	 */
	if (i < npgs) {
		/* i is the index of the page whose trylock failed. */
		tpp = rootpp;
		while (i-- > 0) {
			if (tpp != pp)
				page_unlock(tpp);
			tpp++;
		}
		VM_STAT_ADD(pagecnt.pc_try_demote_pages[4]);
		return (0);
	}

	/* All constituents are held EXCL: unload them and clear p_szc. */
	for (tpp = rootpp, i = 0; i < npgs; i++, tpp++) {
		ASSERT(PAGE_EXCL(tpp));
		ASSERT(tpp->p_slckcnt == 0);
		(void) hat_pageunload(tpp, HAT_FORCE_PGUNLOAD);
		tpp->p_szc = 0;
	}

	/*
	 * Unlock all pages except the page passed in.
	 */
	for (tpp = rootpp, i = 0; i < npgs; i++, tpp++) {
		ASSERT(!hat_page_is_mapped(tpp));
		if (tpp != pp)
			page_unlock(tpp);
	}

	VM_STAT_ADD(pagecnt.pc_try_demote_pages[5]);
	return (1);
}

/*
 * Called by page_free() and page_destroy() to demote the page size code
 * (p_szc) to 0 (since we can't just put a single PAGESIZE page with non zero
 * p_szc on free list, neither can we just clear p_szc of a single page_t
 * within a large page since it will break other code that relies on p_szc
 * being the same for all page_t's of a large page). Anonymous pages should
 * never end up here because anon_map_getpages() cannot deal with p_szc
 * changes after a single constituent page is locked.
 * While anonymous or
 * kernel large pages are demoted or freed the entire large page at a time
 * with all constituent pages locked EXCL, for the file system pages we
 * have to be able to demote a large page (i.e. decrease all constituent pages
 * p_szc) with only just an EXCL lock on one of constituent pages. The reason
 * we can easily deal with anonymous page demotion the entire large page at a
 * time is that those operations originate at address space level and concern
 * the entire large page region with actual demotion only done when pages are
 * not shared with any other processes (therefore we can always get EXCL lock
 * on all anonymous constituent pages after clearing segment page
 * cache). However file system pages can be truncated or invalidated at a
 * PAGESIZE level from the file system side and end up in page_free() or
 * page_destroy() (we also allow only part of the large page to be SOFTLOCKed
 * and therefore pageout should be able to demote a large page by EXCL locking
 * any constituent page that is not under SOFTLOCK). In those cases we cannot
 * rely on being able to lock EXCL all constituent pages.
 *
 * To prevent szc changes on file system pages one has to lock all constituent
 * pages at least SHARED (or call page_szc_lock()). The only subsystem that
 * doesn't rely on locking all constituent pages (or using page_szc_lock()) to
 * prevent szc changes is hat layer that uses its own page level mlist
 * locks. hat assumes that szc doesn't change after mlist lock for a page is
 * taken. Therefore we need to change szc under hat level locks if we only
 * have an EXCL lock on a single constituent page and hat still references any
 * of constituent pages.  (Note we can't "ignore" hat layer by simply
 * hat_pageunload() all constituent pages without having EXCL locks on all of
 * constituent pages). We use hat_page_demote() call to safely demote szc of
 * all constituent pages under hat locks when we only have an EXCL lock on one
 * of constituent pages.
 *
 * This routine calls page_szc_lock() before calling hat_page_demote() to
 * allow segvn in one special case not to lock all constituent pages SHARED
 * before calling hat_memload_array() that relies on p_szc not changing even
 * before hat level mlist lock is taken.  In that case segvn uses
 * page_szc_lock() to prevent hat_page_demote() changing p_szc values.
 *
 * Anonymous or kernel page demotion still has to lock all pages exclusively
 * and do hat_pageunload() on all constituent pages before demoting the page
 * therefore there's no need for anonymous or kernel page demotion to use
 * hat_page_demote() mechanism.
 *
 * hat_page_demote() removes all large mappings that map pp and then decreases
 * p_szc starting from the last constituent page of the large page. By working
 * from the tail of a large page in pfn decreasing order allows one looking at
 * the root page to know that hat_page_demote() is done for root's szc area.
 * e.g. if a root page has szc 1 one knows it only has to lock all constituent
 * pages within szc 1 area to prevent szc changes because hat_page_demote()
 * that started on this page when it had szc > 1 is done for this szc 1 area.
 *
 * We are guaranteed that all constituent pages of pp's large page belong to
 * the same vnode with the consecutive offsets increasing in the direction of
 * the pfn i.e. the identity of constituent pages can't change until their
 * p_szc is decreased. Therefore it's safe for hat_page_demote() to remove
 * large mappings to pp even though we don't lock any constituent page except
 * pp (i.e. we won't unload e.g. kernel locked page).
5360 */ 5361 static void 5362 page_demote_vp_pages(page_t *pp) 5363 { 5364 kmutex_t *mtx; 5365 5366 ASSERT(PAGE_EXCL(pp)); 5367 ASSERT(!PP_ISFREE(pp)); 5368 ASSERT(pp->p_vnode != NULL); 5369 ASSERT(!IS_SWAPFSVP(pp->p_vnode)); 5370 ASSERT(pp->p_vnode != &kvp); 5371 5372 VM_STAT_ADD(pagecnt.pc_demote_pages[0]); 5373 5374 mtx = page_szc_lock(pp); 5375 if (mtx != NULL) { 5376 hat_page_demote(pp); 5377 mutex_exit(mtx); 5378 } 5379 ASSERT(pp->p_szc == 0); 5380 } 5381 5382 /* 5383 * Mark any existing pages for migration in the given range 5384 */ 5385 void 5386 page_mark_migrate(struct seg *seg, caddr_t addr, size_t len, 5387 struct anon_map *amp, ulong_t anon_index, vnode_t *vp, 5388 u_offset_t vnoff, int rflag) 5389 { 5390 struct anon *ap; 5391 vnode_t *curvp; 5392 lgrp_t *from; 5393 pgcnt_t i; 5394 pgcnt_t nlocked; 5395 u_offset_t off; 5396 pfn_t pfn; 5397 size_t pgsz; 5398 size_t segpgsz; 5399 pgcnt_t pages; 5400 uint_t pszc; 5401 page_t **ppa; 5402 pgcnt_t ppa_nentries; 5403 page_t *pp; 5404 caddr_t va; 5405 ulong_t an_idx; 5406 anon_sync_obj_t cookie; 5407 5408 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 5409 5410 /* 5411 * Don't do anything if don't need to do lgroup optimizations 5412 * on this system 5413 */ 5414 if (!lgrp_optimizations()) 5415 return; 5416 5417 /* 5418 * Align address and length to (potentially large) page boundary 5419 */ 5420 segpgsz = page_get_pagesize(seg->s_szc); 5421 addr = (caddr_t)P2ALIGN((uintptr_t)addr, segpgsz); 5422 if (rflag) 5423 len = P2ROUNDUP(len, segpgsz); 5424 5425 /* 5426 * Allocate page array to accomodate largest page size 5427 */ 5428 pgsz = page_get_pagesize(page_num_pagesizes() - 1); 5429 ppa_nentries = btop(pgsz); 5430 ppa = kmem_zalloc(ppa_nentries * sizeof (page_t *), KM_SLEEP); 5431 5432 /* 5433 * Do one (large) page at a time 5434 */ 5435 va = addr; 5436 while (va < addr + len) { 5437 /* 5438 * Lookup (root) page for vnode and offset corresponding to 5439 * this virtual address 5440 * Try 
anonmap first since there may be copy-on-write 5441 * pages, but initialize vnode pointer and offset using 5442 * vnode arguments just in case there isn't an amp. 5443 */ 5444 curvp = vp; 5445 off = vnoff + va - seg->s_base; 5446 if (amp) { 5447 ANON_LOCK_ENTER(&->a_rwlock, RW_READER); 5448 an_idx = anon_index + seg_page(seg, va); 5449 anon_array_enter(amp, an_idx, &cookie); 5450 ap = anon_get_ptr(amp->ahp, an_idx); 5451 if (ap) 5452 swap_xlate(ap, &curvp, &off); 5453 anon_array_exit(&cookie); 5454 ANON_LOCK_EXIT(&->a_rwlock); 5455 } 5456 5457 pp = NULL; 5458 if (curvp) 5459 pp = page_lookup(curvp, off, SE_SHARED); 5460 5461 /* 5462 * If there isn't a page at this virtual address, 5463 * skip to next page 5464 */ 5465 if (pp == NULL) { 5466 va += PAGESIZE; 5467 continue; 5468 } 5469 5470 /* 5471 * Figure out which lgroup this page is in for kstats 5472 */ 5473 pfn = page_pptonum(pp); 5474 from = lgrp_pfn_to_lgrp(pfn); 5475 5476 /* 5477 * Get page size, and round up and skip to next page boundary 5478 * if unaligned address 5479 */ 5480 pszc = pp->p_szc; 5481 pgsz = page_get_pagesize(pszc); 5482 pages = btop(pgsz); 5483 if (!IS_P2ALIGNED(va, pgsz) || 5484 !IS_P2ALIGNED(pfn, pages) || 5485 pgsz > segpgsz) { 5486 pgsz = MIN(pgsz, segpgsz); 5487 page_unlock(pp); 5488 i = btop(P2END((uintptr_t)va, pgsz) - 5489 (uintptr_t)va); 5490 va = (caddr_t)P2END((uintptr_t)va, pgsz); 5491 lgrp_stat_add(from->lgrp_id, LGRP_PMM_FAIL_PGS, i); 5492 continue; 5493 } 5494 5495 /* 5496 * Upgrade to exclusive lock on page 5497 */ 5498 if (!page_tryupgrade(pp)) { 5499 page_unlock(pp); 5500 va += pgsz; 5501 lgrp_stat_add(from->lgrp_id, LGRP_PMM_FAIL_PGS, 5502 btop(pgsz)); 5503 continue; 5504 } 5505 5506 /* 5507 * Remember pages locked exclusively and how many 5508 */ 5509 ppa[0] = pp; 5510 nlocked = 1; 5511 5512 /* 5513 * Lock constituent pages if this is large page 5514 */ 5515 if (pages > 1) { 5516 /* 5517 * Lock all constituents except root page, since it 5518 * should be locked already. 
5519 */ 5520 for (i = 1; i < pages; i++) { 5521 pp++; 5522 if (!page_trylock(pp, SE_EXCL)) { 5523 break; 5524 } 5525 if (PP_ISFREE(pp) || 5526 pp->p_szc != pszc) { 5527 /* 5528 * hat_page_demote() raced in with us. 5529 */ 5530 ASSERT(!IS_SWAPFSVP(curvp)); 5531 page_unlock(pp); 5532 break; 5533 } 5534 ppa[nlocked] = pp; 5535 nlocked++; 5536 } 5537 } 5538 5539 /* 5540 * If all constituent pages couldn't be locked, 5541 * unlock pages locked so far and skip to next page. 5542 */ 5543 if (nlocked != pages) { 5544 for (i = 0; i < nlocked; i++) 5545 page_unlock(ppa[i]); 5546 va += pgsz; 5547 lgrp_stat_add(from->lgrp_id, LGRP_PMM_FAIL_PGS, 5548 btop(pgsz)); 5549 continue; 5550 } 5551 5552 /* 5553 * hat_page_demote() can no longer happen 5554 * since last cons page had the right p_szc after 5555 * all cons pages were locked. all cons pages 5556 * should now have the same p_szc. 5557 */ 5558 5559 /* 5560 * All constituent pages locked successfully, so mark 5561 * large page for migration and unload the mappings of 5562 * constituent pages, so a fault will occur on any part of the 5563 * large page 5564 */ 5565 PP_SETMIGRATE(ppa[0]); 5566 for (i = 0; i < nlocked; i++) { 5567 pp = ppa[i]; 5568 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 5569 ASSERT(hat_page_getshare(pp) == 0); 5570 page_unlock(pp); 5571 } 5572 lgrp_stat_add(from->lgrp_id, LGRP_PMM_PGS, nlocked); 5573 5574 va += pgsz; 5575 } 5576 kmem_free(ppa, ppa_nentries * sizeof (page_t *)); 5577 } 5578 5579 /* 5580 * Migrate any pages that have been marked for migration in the given range 5581 */ 5582 void 5583 page_migrate( 5584 struct seg *seg, 5585 caddr_t addr, 5586 page_t **ppa, 5587 pgcnt_t npages) 5588 { 5589 lgrp_t *from; 5590 lgrp_t *to; 5591 page_t *newpp; 5592 page_t *pp; 5593 pfn_t pfn; 5594 size_t pgsz; 5595 spgcnt_t page_cnt; 5596 spgcnt_t i; 5597 uint_t pszc; 5598 5599 ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); 5600 5601 while (npages > 0) { 5602 pp = *ppa; 5603 pszc = pp->p_szc; 
5604 pgsz = page_get_pagesize(pszc); 5605 page_cnt = btop(pgsz); 5606 5607 /* 5608 * Check to see whether this page is marked for migration 5609 * 5610 * Assume that root page of large page is marked for 5611 * migration and none of the other constituent pages 5612 * are marked. This really simplifies clearing the 5613 * migrate bit by not having to clear it from each 5614 * constituent page. 5615 * 5616 * note we don't want to relocate an entire large page if 5617 * someone is only using one subpage. 5618 */ 5619 if (npages < page_cnt) 5620 break; 5621 5622 /* 5623 * Is it marked for migration? 5624 */ 5625 if (!PP_ISMIGRATE(pp)) 5626 goto next; 5627 5628 /* 5629 * Determine lgroups that page is being migrated between 5630 */ 5631 pfn = page_pptonum(pp); 5632 if (!IS_P2ALIGNED(pfn, page_cnt)) { 5633 break; 5634 } 5635 from = lgrp_pfn_to_lgrp(pfn); 5636 to = lgrp_mem_choose(seg, addr, pgsz); 5637 5638 /* 5639 * Check to see whether we are trying to migrate page to lgroup 5640 * where it is allocated already 5641 */ 5642 if (to == from) { 5643 PP_CLRMIGRATE(pp); 5644 goto next; 5645 } 5646 5647 /* 5648 * Need to get exclusive lock's to migrate 5649 */ 5650 for (i = 0; i < page_cnt; i++) { 5651 ASSERT(PAGE_LOCKED(ppa[i])); 5652 if (page_pptonum(ppa[i]) != pfn + i || 5653 ppa[i]->p_szc != pszc) { 5654 break; 5655 } 5656 if (!page_tryupgrade(ppa[i])) { 5657 lgrp_stat_add(from->lgrp_id, 5658 LGRP_PM_FAIL_LOCK_PGS, 5659 page_cnt); 5660 break; 5661 } 5662 } 5663 if (i != page_cnt) { 5664 while (--i != -1) { 5665 page_downgrade(ppa[i]); 5666 } 5667 goto next; 5668 } 5669 5670 (void) page_create_wait(page_cnt, PG_WAIT); 5671 newpp = page_get_replacement_page(pp, to, PGR_SAMESZC); 5672 if (newpp == NULL) { 5673 page_create_putback(page_cnt); 5674 for (i = 0; i < page_cnt; i++) { 5675 page_downgrade(ppa[i]); 5676 } 5677 lgrp_stat_add(to->lgrp_id, LGRP_PM_FAIL_ALLOC_PGS, 5678 page_cnt); 5679 goto next; 5680 } 5681 ASSERT(newpp->p_szc == pszc); 5682 /* 5683 * Clear migrate bit 
and relocate page 5684 */ 5685 PP_CLRMIGRATE(pp); 5686 if (page_relocate(&pp, &newpp, 0, 1, &page_cnt, to)) { 5687 panic("page_migrate: page_relocate failed"); 5688 } 5689 ASSERT(page_cnt * PAGESIZE == pgsz); 5690 5691 /* 5692 * Keep stats for number of pages migrated from and to 5693 * each lgroup 5694 */ 5695 lgrp_stat_add(from->lgrp_id, LGRP_PM_SRC_PGS, page_cnt); 5696 lgrp_stat_add(to->lgrp_id, LGRP_PM_DEST_PGS, page_cnt); 5697 /* 5698 * update the page_t array we were passed in and 5699 * unlink constituent pages of a large page. 5700 */ 5701 for (i = 0; i < page_cnt; ++i, ++pp) { 5702 ASSERT(PAGE_EXCL(newpp)); 5703 ASSERT(newpp->p_szc == pszc); 5704 ppa[i] = newpp; 5705 pp = newpp; 5706 page_sub(&newpp, pp); 5707 page_downgrade(pp); 5708 } 5709 ASSERT(newpp == NULL); 5710 next: 5711 addr += pgsz; 5712 ppa += page_cnt; 5713 npages -= page_cnt; 5714 } 5715 } 5716 5717 ulong_t mem_waiters = 0; 5718 ulong_t max_count = 20; 5719 #define MAX_DELAY 0x1ff 5720 5721 /* 5722 * Check if enough memory is available to proceed. 5723 * Depending on system configuration and how much memory is 5724 * reserved for swap we need to check against two variables. 5725 * e.g. on systems with little physical swap availrmem can be 5726 * more reliable indicator of how much memory is available. 5727 * On systems with large phys swap freemem can be better indicator. 5728 * If freemem drops below threshold level don't return an error 5729 * immediately but wake up pageout to free memory and block. 5730 * This is done number of times. If pageout is not able to free 5731 * memory within certain time return an error. 5732 * The same applies for availrmem but kmem_reap is used to 5733 * free memory. 
5734 */ 5735 int 5736 page_mem_avail(pgcnt_t npages) 5737 { 5738 ulong_t count; 5739 5740 #if defined(__i386) 5741 if (freemem > desfree + npages && 5742 availrmem > swapfs_reserve + npages && 5743 btop(vmem_size(heap_arena, VMEM_FREE)) > tune.t_minarmem + 5744 npages) 5745 return (1); 5746 #else 5747 if (freemem > desfree + npages && 5748 availrmem > swapfs_reserve + npages) 5749 return (1); 5750 #endif 5751 5752 count = max_count; 5753 atomic_add_long(&mem_waiters, 1); 5754 5755 while (freemem < desfree + npages && --count) { 5756 cv_signal(&proc_pageout->p_cv); 5757 if (delay_sig(hz + (mem_waiters & MAX_DELAY))) { 5758 atomic_add_long(&mem_waiters, -1); 5759 return (0); 5760 } 5761 } 5762 if (count == 0) { 5763 atomic_add_long(&mem_waiters, -1); 5764 return (0); 5765 } 5766 5767 count = max_count; 5768 while (availrmem < swapfs_reserve + npages && --count) { 5769 kmem_reap(); 5770 if (delay_sig(hz + (mem_waiters & MAX_DELAY))) { 5771 atomic_add_long(&mem_waiters, -1); 5772 return (0); 5773 } 5774 } 5775 atomic_add_long(&mem_waiters, -1); 5776 if (count == 0) 5777 return (0); 5778 5779 #if defined(__i386) 5780 if (btop(vmem_size(heap_arena, VMEM_FREE)) < 5781 tune.t_minarmem + npages) 5782 return (0); 5783 #endif 5784 return (1); 5785 } 5786 5787 #define MAX_CNT 60 /* max num of iterations */ 5788 /* 5789 * Reclaim/reserve availrmem for npages. 5790 * If there is not enough memory start reaping seg, kmem caches. 5791 * Start pageout scanner (via page_needfree()). 5792 * Exit after ~ MAX_CNT s regardless of how much memory has been released. 5793 * Note: There is no guarantee that any availrmem will be freed as 5794 * this memory typically is locked (kernel heap) or reserved for swap. 5795 * Also due to memory fragmentation kmem allocator may not be able 5796 * to free any memory (single user allocated buffer will prevent 5797 * freeing slab or a page). 
5798 */ 5799 int 5800 page_reclaim_mem(pgcnt_t npages, pgcnt_t epages, int adjust) 5801 { 5802 int i = 0; 5803 int ret = 0; 5804 pgcnt_t deficit; 5805 pgcnt_t old_availrmem; 5806 5807 mutex_enter(&freemem_lock); 5808 old_availrmem = availrmem - 1; 5809 while ((availrmem < tune.t_minarmem + npages + epages) && 5810 (old_availrmem < availrmem) && (i++ < MAX_CNT)) { 5811 old_availrmem = availrmem; 5812 deficit = tune.t_minarmem + npages + epages - availrmem; 5813 mutex_exit(&freemem_lock); 5814 page_needfree(deficit); 5815 seg_preap(); 5816 kmem_reap(); 5817 delay(hz); 5818 page_needfree(-(spgcnt_t)deficit); 5819 mutex_enter(&freemem_lock); 5820 } 5821 5822 if (adjust && (availrmem >= tune.t_minarmem + npages + epages)) { 5823 availrmem -= npages; 5824 ret = 1; 5825 } 5826 5827 mutex_exit(&freemem_lock); 5828 5829 return (ret); 5830 } 5831 5832 /* 5833 * Search the memory segments to locate the desired page. Within a 5834 * segment, pages increase linearly with one page structure per 5835 * physical page frame (size PAGESIZE). The search begins 5836 * with the segment that was accessed last, to take advantage of locality. 5837 * If the hint misses, we start from the beginning of the sorted memseg list 5838 */ 5839 5840 5841 /* 5842 * Some data structures for pfn to pp lookup. 
5843 */ 5844 ulong_t mhash_per_slot; 5845 struct memseg *memseg_hash[N_MEM_SLOTS]; 5846 5847 page_t * 5848 page_numtopp_nolock(pfn_t pfnum) 5849 { 5850 struct memseg *seg; 5851 page_t *pp; 5852 vm_cpu_data_t *vc = CPU->cpu_vm_data; 5853 5854 ASSERT(vc != NULL); 5855 5856 MEMSEG_STAT_INCR(nsearch); 5857 5858 /* Try last winner first */ 5859 if (((seg = vc->vc_pnum_memseg) != NULL) && 5860 (pfnum >= seg->pages_base) && (pfnum < seg->pages_end)) { 5861 MEMSEG_STAT_INCR(nlastwon); 5862 pp = seg->pages + (pfnum - seg->pages_base); 5863 if (pp->p_pagenum == pfnum) 5864 return ((page_t *)pp); 5865 } 5866 5867 /* Else Try hash */ 5868 if (((seg = memseg_hash[MEMSEG_PFN_HASH(pfnum)]) != NULL) && 5869 (pfnum >= seg->pages_base) && (pfnum < seg->pages_end)) { 5870 MEMSEG_STAT_INCR(nhashwon); 5871 vc->vc_pnum_memseg = seg; 5872 pp = seg->pages + (pfnum - seg->pages_base); 5873 if (pp->p_pagenum == pfnum) 5874 return ((page_t *)pp); 5875 } 5876 5877 /* Else Brute force */ 5878 for (seg = memsegs; seg != NULL; seg = seg->next) { 5879 if (pfnum >= seg->pages_base && pfnum < seg->pages_end) { 5880 vc->vc_pnum_memseg = seg; 5881 pp = seg->pages + (pfnum - seg->pages_base); 5882 return ((page_t *)pp); 5883 } 5884 } 5885 vc->vc_pnum_memseg = NULL; 5886 MEMSEG_STAT_INCR(nnotfound); 5887 return ((page_t *)NULL); 5888 5889 } 5890 5891 struct memseg * 5892 page_numtomemseg_nolock(pfn_t pfnum) 5893 { 5894 struct memseg *seg; 5895 page_t *pp; 5896 5897 /* Try hash */ 5898 if (((seg = memseg_hash[MEMSEG_PFN_HASH(pfnum)]) != NULL) && 5899 (pfnum >= seg->pages_base) && (pfnum < seg->pages_end)) { 5900 pp = seg->pages + (pfnum - seg->pages_base); 5901 if (pp->p_pagenum == pfnum) 5902 return (seg); 5903 } 5904 5905 /* Else Brute force */ 5906 for (seg = memsegs; seg != NULL; seg = seg->next) { 5907 if (pfnum >= seg->pages_base && pfnum < seg->pages_end) { 5908 return (seg); 5909 } 5910 } 5911 return ((struct memseg *)NULL); 5912 } 5913 5914 /* 5915 * Given a page and a count return the page 
struct that is 5916 * n structs away from the current one in the global page 5917 * list. 5918 * 5919 * This function wraps to the first page upon 5920 * reaching the end of the memseg list. 5921 */ 5922 page_t * 5923 page_nextn(page_t *pp, ulong_t n) 5924 { 5925 struct memseg *seg; 5926 page_t *ppn; 5927 vm_cpu_data_t *vc = (vm_cpu_data_t *)CPU->cpu_vm_data; 5928 5929 ASSERT(vc != NULL); 5930 5931 if (((seg = vc->vc_pnext_memseg) == NULL) || 5932 (seg->pages_base == seg->pages_end) || 5933 !(pp >= seg->pages && pp < seg->epages)) { 5934 5935 for (seg = memsegs; seg; seg = seg->next) { 5936 if (pp >= seg->pages && pp < seg->epages) 5937 break; 5938 } 5939 5940 if (seg == NULL) { 5941 /* Memory delete got in, return something valid. */ 5942 /* TODO: fix me. */ 5943 seg = memsegs; 5944 pp = seg->pages; 5945 } 5946 } 5947 5948 /* check for wraparound - possible if n is large */ 5949 while ((ppn = (pp + n)) >= seg->epages || ppn < pp) { 5950 n -= seg->epages - pp; 5951 seg = seg->next; 5952 if (seg == NULL) 5953 seg = memsegs; 5954 pp = seg->pages; 5955 } 5956 vc->vc_pnext_memseg = seg; 5957 return (ppn); 5958 } 5959 5960 /* 5961 * Initialize for a loop using page_next_scan_large(). 5962 */ 5963 page_t * 5964 page_next_scan_init(void **cookie) 5965 { 5966 ASSERT(cookie != NULL); 5967 *cookie = (void *)memsegs; 5968 return ((page_t *)memsegs->pages); 5969 } 5970 5971 /* 5972 * Return the next page in a scan of page_t's, assuming we want 5973 * to skip over sub-pages within larger page sizes. 5974 * 5975 * The cookie is used to keep track of the current memseg. 
5976 */ 5977 page_t * 5978 page_next_scan_large( 5979 page_t *pp, 5980 ulong_t *n, 5981 void **cookie) 5982 { 5983 struct memseg *seg = (struct memseg *)*cookie; 5984 page_t *new_pp; 5985 ulong_t cnt; 5986 pfn_t pfn; 5987 5988 5989 /* 5990 * get the count of page_t's to skip based on the page size 5991 */ 5992 ASSERT(pp != NULL); 5993 if (pp->p_szc == 0) { 5994 cnt = 1; 5995 } else { 5996 pfn = page_pptonum(pp); 5997 cnt = page_get_pagecnt(pp->p_szc); 5998 cnt -= pfn & (cnt - 1); 5999 } 6000 *n += cnt; 6001 new_pp = pp + cnt; 6002 6003 /* 6004 * Catch if we went past the end of the current memory segment. If so, 6005 * just move to the next segment with pages. 6006 */ 6007 if (new_pp >= seg->epages) { 6008 do { 6009 seg = seg->next; 6010 if (seg == NULL) 6011 seg = memsegs; 6012 } while (seg->pages == seg->epages); 6013 new_pp = seg->pages; 6014 *cookie = (void *)seg; 6015 } 6016 6017 return (new_pp); 6018 } 6019 6020 6021 /* 6022 * Returns next page in list. Note: this function wraps 6023 * to the first page in the list upon reaching the end 6024 * of the list. Callers should be aware of this fact. 6025 */ 6026 6027 /* We should change this be a #define */ 6028 6029 page_t * 6030 page_next(page_t *pp) 6031 { 6032 return (page_nextn(pp, 1)); 6033 } 6034 6035 page_t * 6036 page_first() 6037 { 6038 return ((page_t *)memsegs->pages); 6039 } 6040 6041 6042 /* 6043 * This routine is called at boot with the initial memory configuration 6044 * and when memory is added or removed. 6045 */ 6046 void 6047 build_pfn_hash() 6048 { 6049 pfn_t cur; 6050 pgcnt_t index; 6051 struct memseg *pseg; 6052 int i; 6053 6054 /* 6055 * Clear memseg_hash array. 6056 * Since memory add/delete is designed to operate concurrently 6057 * with normal operation, the hash rebuild must be able to run 6058 * concurrently with page_numtopp_nolock(). To support this 6059 * functionality, assignments to memseg_hash array members must 6060 * be done atomically. 
6061 * 6062 * NOTE: bzero() does not currently guarantee this for kernel 6063 * threads, and cannot be used here. 6064 */ 6065 for (i = 0; i < N_MEM_SLOTS; i++) 6066 memseg_hash[i] = NULL; 6067 6068 hat_kpm_mseghash_clear(N_MEM_SLOTS); 6069 6070 /* 6071 * Physmax is the last valid pfn. 6072 */ 6073 mhash_per_slot = (physmax + 1) >> MEM_HASH_SHIFT; 6074 for (pseg = memsegs; pseg != NULL; pseg = pseg->next) { 6075 index = MEMSEG_PFN_HASH(pseg->pages_base); 6076 cur = pseg->pages_base; 6077 do { 6078 if (index >= N_MEM_SLOTS) 6079 index = MEMSEG_PFN_HASH(cur); 6080 6081 if (memseg_hash[index] == NULL || 6082 memseg_hash[index]->pages_base > pseg->pages_base) { 6083 memseg_hash[index] = pseg; 6084 hat_kpm_mseghash_update(index, pseg); 6085 } 6086 cur += mhash_per_slot; 6087 index++; 6088 } while (cur < pseg->pages_end); 6089 } 6090 } 6091 6092 /* 6093 * Return the pagenum for the pp 6094 */ 6095 pfn_t 6096 page_pptonum(page_t *pp) 6097 { 6098 return (pp->p_pagenum); 6099 } 6100 6101 /* 6102 * interface to the referenced and modified etc bits 6103 * in the PSM part of the page struct 6104 * when no locking is desired. 6105 */ 6106 void 6107 page_set_props(page_t *pp, uint_t flags) 6108 { 6109 ASSERT((flags & ~(P_MOD | P_REF | P_RO)) == 0); 6110 pp->p_nrm |= (uchar_t)flags; 6111 } 6112 6113 void 6114 page_clr_all_props(page_t *pp) 6115 { 6116 pp->p_nrm = 0; 6117 } 6118 6119 /* 6120 * Clear p_lckcnt and p_cowcnt, adjusting freemem if required. 6121 */ 6122 int 6123 page_clear_lck_cow(page_t *pp, int adjust) 6124 { 6125 int f_amount; 6126 6127 ASSERT(PAGE_EXCL(pp)); 6128 6129 /* 6130 * The page_struct_lock need not be acquired here since 6131 * we require the caller hold the page exclusively locked. 
6132 */ 6133 f_amount = 0; 6134 if (pp->p_lckcnt) { 6135 f_amount = 1; 6136 pp->p_lckcnt = 0; 6137 } 6138 if (pp->p_cowcnt) { 6139 f_amount += pp->p_cowcnt; 6140 pp->p_cowcnt = 0; 6141 } 6142 6143 if (adjust && f_amount) { 6144 mutex_enter(&freemem_lock); 6145 availrmem += f_amount; 6146 mutex_exit(&freemem_lock); 6147 } 6148 6149 return (f_amount); 6150 } 6151 6152 /* 6153 * The following functions is called from free_vp_pages() 6154 * for an inexact estimate of a newly free'd page... 6155 */ 6156 ulong_t 6157 page_share_cnt(page_t *pp) 6158 { 6159 return (hat_page_getshare(pp)); 6160 } 6161 6162 int 6163 page_isshared(page_t *pp) 6164 { 6165 return (hat_page_getshare(pp) > 1); 6166 } 6167 6168 int 6169 page_isfree(page_t *pp) 6170 { 6171 return (PP_ISFREE(pp)); 6172 } 6173 6174 int 6175 page_isref(page_t *pp) 6176 { 6177 return (hat_page_getattr(pp, P_REF)); 6178 } 6179 6180 int 6181 page_ismod(page_t *pp) 6182 { 6183 return (hat_page_getattr(pp, P_MOD)); 6184 } 6185