/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/param.h>
#include <sys/thread.h>
#include <sys/proc.h>
#include <sys/callb.h>
#include <sys/vnode.h>
#include <sys/debug.h>
#include <sys/systm.h>		/* for bzero */
#include <sys/memlist.h>
#include <sys/cmn_err.h>
#include <sys/sysmacros.h>
#include <sys/vmsystm.h>	/* for NOMEMWAIT() */
#include <sys/atomic.h>		/* used to update kcage_freemem */
#include <sys/kmem.h>		/* for kmem_reap */
#include <sys/errno.h>
#include <sys/mem_cage.h>
#include <vm/seg_kmem.h>
#include <vm/page.h>
#include <vm/hat.h>
#include <vm/vm_dep.h>
#include <sys/mem_config.h>
#include <sys/lgrp.h>
#include <sys/rwlock.h>

extern pri_t maxclsyspri;

#ifdef DEBUG
#define	KCAGE_STATS
#endif

#ifdef KCAGE_STATS

#define	KCAGE_STATS_VERSION	9	/* can help report generators */
#define	KCAGE_STATS_NSCANS	256	/* depth of scan statistics buffer */

struct kcage_stats_scan {
	/* managed by KCAGE_STAT_* macros */
	clock_t	scan_lbolt;
	uint_t	scan_id;

	/* set in kcage_cageout() */
	uint_t	kt_passes;
	clock_t	kt_ticks;
	pgcnt_t	kt_kcage_freemem_start;
	pgcnt_t	kt_kcage_freemem_end;
	pgcnt_t	kt_freemem_start;
	pgcnt_t	kt_freemem_end;
	uint_t	kt_examined;
	uint_t	kt_cantlock;
	uint_t	kt_gotone;
	uint_t	kt_gotonefree;
	uint_t	kt_skiplevel;
	uint_t	kt_skipshared;
	uint_t	kt_skiprefd;
	uint_t	kt_destroy;

	/* set in kcage_invalidate_page() */
	uint_t	kip_reloclocked;
	uint_t	kip_relocmod;
	uint_t	kip_destroy;
	uint_t	kip_nomem;
	uint_t	kip_demotefailed;

	/* set in kcage_expand() */
	uint_t	ke_wanted;
	uint_t	ke_examined;
	uint_t	ke_lefthole;
	uint_t	ke_gotone;
	uint_t	ke_gotonefree;
};

struct kcage_stats {
	/* managed by KCAGE_STAT_* macros */
	uint_t	version;
	uint_t	size;

	/* set in kcage_cageout */
	uint_t	kt_wakeups;
	uint_t	kt_scans;
	uint_t	kt_cageout_break;

	/* set in kcage_expand */
	uint_t	ke_calls;
	uint_t	ke_nopfn;
	uint_t	ke_nopaget;
	uint_t	ke_isnoreloc;
	uint_t	ke_deleting;
	uint_t	ke_lowfreemem;
	uint_t	ke_terminate;

	/* set in kcage_freemem_add() */
	uint_t	kfa_trottlewake;

	/* set in kcage_freemem_sub() */
	uint_t	kfs_cagewake;

	/* set in kcage_create_throttle */
	uint_t	kct_calls;
	uint_t	kct_cageout;
	uint_t	kct_critical;
	uint_t	kct_exempt;
	uint_t	kct_cagewake;
	uint_t	kct_wait;
	uint_t	kct_progress;
	uint_t	kct_noprogress;
	uint_t	kct_timeout;

	/* set in kcage_cageout_wakeup */
	uint_t	kcw_expandearly;

	/* managed by KCAGE_STAT_* macros */
	uint_t	scan_array_size;
	uint_t	scan_index;
	struct kcage_stats_scan scans[KCAGE_STATS_NSCANS];
};

static struct kcage_stats kcage_stats;
static struct kcage_stats_scan kcage_stats_scan_zero;

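/*
 * Note (added commentary): when KCAGE_STATS is defined (DEBUG kernels),
 * the counters above can be examined from a debugger, for example with
 * mdb's ::print dcmd:
 *
 *	> kcage_stats::print struct kcage_stats
 *
 * The version and size fields let external report generators sanity-check
 * the layout they were built against.
 */
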
/*
 * No real need for atomics here. For the most part the incs and sets are
 * done by the kernel cage thread. There are a few that are done by any
 * number of other threads. Those cases are noted by comments.
 */
#define	KCAGE_STAT_INCR(m)	kcage_stats.m++

#define	KCAGE_STAT_NINCR(m, v)	kcage_stats.m += (v)

#define	KCAGE_STAT_INCR_SCAN(m)	\
	KCAGE_STAT_INCR(scans[kcage_stats.scan_index].m)

#define	KCAGE_STAT_NINCR_SCAN(m, v) \
	KCAGE_STAT_NINCR(scans[kcage_stats.scan_index].m, v)

#define	KCAGE_STAT_SET(m, v)	kcage_stats.m = (v)

#define	KCAGE_STAT_SETZ(m, v)	\
	if (kcage_stats.m == 0) kcage_stats.m = (v)

#define	KCAGE_STAT_SET_SCAN(m, v)	\
	KCAGE_STAT_SET(scans[kcage_stats.scan_index].m, v)

#define	KCAGE_STAT_SETZ_SCAN(m, v) \
	KCAGE_STAT_SETZ(scans[kcage_stats.scan_index].m, v)

#define	KCAGE_STAT_INC_SCAN_INDEX \
	KCAGE_STAT_SET_SCAN(scan_lbolt, lbolt); \
	KCAGE_STAT_SET_SCAN(scan_id, kcage_stats.scan_index); \
	kcage_stats.scan_index = \
	(kcage_stats.scan_index + 1) % KCAGE_STATS_NSCANS; \
	kcage_stats.scans[kcage_stats.scan_index] = kcage_stats_scan_zero

#define	KCAGE_STAT_INIT_SCAN_INDEX \
	kcage_stats.version = KCAGE_STATS_VERSION; \
	kcage_stats.size = sizeof (kcage_stats); \
	kcage_stats.scan_array_size = KCAGE_STATS_NSCANS; \
	kcage_stats.scan_index = 0

#else /* KCAGE_STATS */

#define	KCAGE_STAT_INCR(v)
#define	KCAGE_STAT_NINCR(m, v)
#define	KCAGE_STAT_INCR_SCAN(v)
#define	KCAGE_STAT_NINCR_SCAN(m, v)
#define	KCAGE_STAT_SET(m, v)
#define	KCAGE_STAT_SETZ(m, v)
#define	KCAGE_STAT_SET_SCAN(m, v)
#define	KCAGE_STAT_SETZ_SCAN(m, v)
#define	KCAGE_STAT_INC_SCAN_INDEX
#define	KCAGE_STAT_INIT_SCAN_INDEX

#endif /* KCAGE_STATS */

static kmutex_t kcage_throttle_mutex;	/* protects kcage_throttle_cv */
static kcondvar_t kcage_throttle_cv;

static kmutex_t kcage_cageout_mutex;	/* protects cv and ready flag */
static kcondvar_t kcage_cageout_cv;	/* cageout thread naps here */
static int kcage_cageout_ready;		/* nonzero when cageout thread ready */
kthread_id_t kcage_cageout_thread;	/* to aid debugging */

static krwlock_t kcage_range_rwlock;	/* protects kcage_glist elements */

/*
 * Cage expansion happens within a range.
 */
struct kcage_glist {
	struct kcage_glist	*next;
	pfn_t			base;
	pfn_t			lim;
	pfn_t			curr;
	int			decr;
};

static struct kcage_glist *kcage_glist;
static struct kcage_glist *kcage_current_glist;

/*
 * The firstfree element is provided so that kmem_alloc can be avoided
 * until the cage has somewhere to go. This is not currently a problem
 * as early kmem_allocs use BOP_ALLOC instead of page_create_va.
 */
static struct kcage_glist kcage_glist_firstfree;
static struct kcage_glist *kcage_glist_freelist = &kcage_glist_firstfree;

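/*
 * Illustrative sketch (added commentary): each kcage_glist element
 * describes one physical range and how much of it the cage has consumed.
 * With decr == 0 the cage grows upward from base; with decr != 0 it
 * grows downward from lim:
 *
 *	decr == 0:	base [ caged ) curr [ not yet caged ) lim
 *	decr != 0:	base [ not yet caged ) curr [ caged ) lim
 *
 * kcage_get_pfn() advances curr, and kcage_current_glist points at the
 * element currently being consumed.
 */
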
/*
 * Miscellaneous forward references
 */
static struct kcage_glist *kcage_glist_alloc(void);
static int kcage_glist_delete(pfn_t, pfn_t, struct kcage_glist **);
static void kcage_cageout(void);
static int kcage_invalidate_page(page_t *, pgcnt_t *);
static int kcage_setnoreloc_pages(page_t *, se_t);

/*
 * Kernel Memory Cage counters and thresholds.
 */
int kcage_on = 0;
pgcnt_t kcage_freemem;
pgcnt_t kcage_needfree;
pgcnt_t kcage_lotsfree;
pgcnt_t kcage_desfree;
pgcnt_t kcage_minfree;
pgcnt_t kcage_throttlefree;
pgcnt_t kcage_reserve;
int kcage_maxwait = 10;	/* in seconds */

/* when we use lp for kmem we start the cage at a higher initial value */
pgcnt_t kcage_kmemlp_mincage;

#ifdef DEBUG
pgcnt_t	kcage_pagets;
#define	KCAGEPAGETS_INC()	kcage_pagets++
#else
#define	KCAGEPAGETS_INC()
#endif

/*
 * Startup and Dynamic Reconfiguration interfaces.
 * kcage_range_lock()
 * kcage_range_unlock()
 * kcage_range_islocked()
 * kcage_range_add()
 * kcage_range_del()
 * kcage_init()
 * kcage_set_thresholds()
 */

/*
 * Called outside of this file to add/remove from the list,
 * therefore, it takes a writer lock
 */
void
kcage_range_lock(void)
{
	rw_enter(&kcage_range_rwlock, RW_WRITER);
}

void
kcage_range_unlock(void)
{
	rw_exit(&kcage_range_rwlock);
}

int
kcage_range_islocked(void)
{
	return (rw_lock_held(&kcage_range_rwlock));
}

/*
 * Called from page_get_contig_pages to get the approximate kcage pfn range
 * for exclusion from search for contiguous pages. This routine is called
 * without kcage_range lock (kcage routines can call page_get_contig_pages
 * through page_relocate) and with the assumption, based on kcage_range_add,
 * that kcage_current_glist always contains a valid pointer.
 */

int
kcage_current_pfn(pfn_t *pfncur)
{
	struct kcage_glist *lp = kcage_current_glist;

	ASSERT(kcage_on);

	ASSERT(lp != NULL);

	*pfncur = lp->curr;

	return (lp->decr);
}

/*
 * Called from vm_pagelist.c during coalesce to find kernel cage regions
 * within an mnode. Looks for the lowest range between lo and hi.
 *
 * Kernel cage memory is defined between kcage_glist and kcage_current_glist.
 * Non-cage memory is defined between kcage_current_glist and list end.
 *
 * If incage is set, returns the lowest kcage range. Otherwise returns lowest
 * non-cage range.
 *
 * Returns zero on success and nlo, nhi:
 *	lo <= nlo < nhi <= hi
 * Returns non-zero if no overlapping range is found.
 */
int
kcage_next_range(int incage, pfn_t lo, pfn_t hi,
    pfn_t *nlo, pfn_t *nhi)
{
	struct kcage_glist *lp;
	pfn_t tlo = hi;
	pfn_t thi = hi;

	ASSERT(lo <= hi);

	/*
	 * Reader lock protects the list, but kcage_get_pfn
	 * running concurrently may advance kcage_current_glist
	 * and also update kcage_current_glist->curr. Page
	 * coalesce can handle this race condition.
	 */
	rw_enter(&kcage_range_rwlock, RW_READER);

	for (lp = incage ? kcage_glist : kcage_current_glist;
	    lp != NULL; lp = lp->next) {

		pfn_t klo, khi;

		/* find the range limits in this element */
		if ((incage && lp->decr) || (!incage && !lp->decr)) {
			klo = lp->curr;
			khi = lp->lim;
		} else {
			klo = lp->base;
			khi = lp->curr;
		}

		/* handle overlap */
		if (klo < tlo && klo < khi && lo < khi && klo < hi) {
			tlo = MAX(lo, klo);
			thi = MIN(hi, khi);
			if (tlo == lo)
				break;
		}

		/* check end of kcage */
		if (incage && lp == kcage_current_glist) {
			break;
		}
	}

	rw_exit(&kcage_range_rwlock);

	/* return non-zero if no overlapping range found */
	if (tlo == thi)
		return (1);

	ASSERT(lo <= tlo && tlo < thi && thi <= hi);

	/* return overlapping range */
	*nlo = tlo;
	*nhi = thi;
	return (0);
}

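/*
 * Illustrative example (added commentary; mnode_lo/mnode_hi are
 * hypothetical bounds supplied by the caller): page coalescing code can
 * walk every non-cage span below an upper bound with a loop along
 * these lines:
 *
 *	pfn_t lo = mnode_lo, nlo, nhi;
 *	while (lo < mnode_hi &&
 *	    kcage_next_range(0, lo, mnode_hi, &nlo, &nhi) == 0) {
 *		(process span [nlo, nhi))
 *		lo = nhi;
 *	}
 */
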
int
kcage_range_init(struct memlist *ml, int decr)
{
	int ret = 0;

	ASSERT(kcage_range_islocked());

	if (decr) {
		while (ml->next != NULL)
			ml = ml->next;
	}

	while (ml != NULL) {
		ret = kcage_range_add(btop(ml->address), btop(ml->size), decr);
		if (ret)
			break;

		ml = (decr ? ml->prev : ml->next);
	}

	return (ret);
}

/*
 * Third arg controls direction of growth: 0: increasing pfns,
 * 1: decreasing.
 * Calls to add and delete must be protected by calls to
 * kcage_range_lock() and kcage_range_unlock().
 */
int
kcage_range_add(pfn_t base, pgcnt_t npgs, int decr)
{
	struct kcage_glist *new, **lpp;
	pfn_t lim;

	ASSERT(kcage_range_islocked());

	ASSERT(npgs != 0);
	if (npgs == 0)
		return (EINVAL);

	lim = base + npgs;

	ASSERT(lim > base);
	if (lim <= base)
		return (EINVAL);

	new = kcage_glist_alloc();
	if (new == NULL) {
		return (ENOMEM);
	}

	new->base = base;
	new->lim = lim;
	new->decr = decr;
	if (new->decr != 0)
		new->curr = new->lim;
	else
		new->curr = new->base;
	/*
	 * Any overlapping existing ranges are removed by deleting
	 * from the new list as we search for the tail.
	 */
	lpp = &kcage_glist;
	while (*lpp != NULL) {
		int ret;
		ret = kcage_glist_delete((*lpp)->base, (*lpp)->lim, &new);
		if (ret != 0)
			return (ret);
		lpp = &(*lpp)->next;
	}

	*lpp = new;

	if (kcage_current_glist == NULL) {
		kcage_current_glist = kcage_glist;
	}

	return (0);
}

/*
 * Calls to add and delete must be protected by calls to
 * kcage_range_lock() and kcage_range_unlock().
 */
int
kcage_range_delete(pfn_t base, pgcnt_t npgs)
{
	struct kcage_glist *lp;
	pfn_t lim;

	ASSERT(kcage_range_islocked());

	ASSERT(npgs != 0);
	if (npgs == 0)
		return (EINVAL);

	lim = base + npgs;

	ASSERT(lim > base);
	if (lim <= base)
		return (EINVAL);

	/*
	 * Check if the delete is OK first as a number of elements
	 * might be involved and it will be difficult to go
	 * back and undo (can't just add the range back in).
	 */
	for (lp = kcage_glist; lp != NULL; lp = lp->next) {
		/*
		 * If there have been no pages allocated from this
		 * element, we don't need to check it.
		 */
		if ((lp->decr == 0 && lp->curr == lp->base) ||
		    (lp->decr != 0 && lp->curr == lp->lim))
			continue;
		/*
		 * If the element does not overlap, it's OK.
		 */
		if (base >= lp->lim || lim <= lp->base)
			continue;
		/*
		 * Overlapping element: Does the range to be deleted
		 * overlap the area already used? If so fail.
		 */
		if (lp->decr == 0 && base < lp->curr && lim >= lp->base) {
			return (EBUSY);
		}
		if (lp->decr != 0 && base < lp->lim && lim >= lp->curr) {
			return (EBUSY);
		}
	}
	return (kcage_glist_delete(base, lim, &kcage_glist));
}

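/*
 * Illustrative usage note (added commentary): callers such as the DR
 * framework are expected to bracket add/delete calls with the range
 * lock, for example
 *
 *	kcage_range_lock();
 *	ret = kcage_range_delete(base, npgs);
 *	kcage_range_unlock();
 *
 * so that the growth list cannot change underneath kcage_get_pfn() or
 * a concurrent add/delete.
 */
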
/*
 * Calls to add and delete must be protected by calls to
 * kcage_range_lock() and kcage_range_unlock().
 * This routine gets called after successful Solaris memory
 * delete operation from DR post memory delete routines.
 */
int
kcage_range_delete_post_mem_del(pfn_t base, pgcnt_t npgs)
{
	pfn_t lim;

	ASSERT(kcage_range_islocked());

	ASSERT(npgs != 0);
	if (npgs == 0)
		return (EINVAL);

	lim = base + npgs;

	ASSERT(lim > base);
	if (lim <= base)
		return (EINVAL);

	return (kcage_glist_delete(base, lim, &kcage_glist));
}

/*
 * No locking is required here as the whole operation is covered
 * by the kcage_range_lock().
 */
static struct kcage_glist *
kcage_glist_alloc(void)
{
	struct kcage_glist *new;

	if ((new = kcage_glist_freelist) != NULL) {
		kcage_glist_freelist = new->next;
		bzero(new, sizeof (*new));
	} else {
		new = kmem_zalloc(sizeof (struct kcage_glist), KM_NOSLEEP);
	}
	return (new);
}

static void
kcage_glist_free(struct kcage_glist *lp)
{
	lp->next = kcage_glist_freelist;
	kcage_glist_freelist = lp;
}

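/*
 * Worked example (added commentary): deleting [base, lim) from the middle
 * of an element splits it in two. For an upward-growing element covering
 * [100, 200) with curr == 120, kcage_glist_delete(140, 160, ...) leaves
 * the original element as [100, 140) with curr still 120 and allocates a
 * new element covering [160, 200) with curr == 160. The original element
 * is edited in place so that a kcage_current_glist reference to it stays
 * valid.
 */
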
static int
kcage_glist_delete(pfn_t base, pfn_t lim, struct kcage_glist **lpp)
{
	struct kcage_glist *lp, *prev = *lpp;

	while ((lp = *lpp) != NULL) {
		if (lim > lp->base && base < lp->lim) {
			/* The delete range overlaps this element. */
			if (base <= lp->base && lim >= lp->lim) {
				/* Delete whole element. */
				*lpp = lp->next;
				if (lp == kcage_current_glist) {
					/* This can never happen. */
					ASSERT(kcage_current_glist != prev);
					kcage_current_glist = prev;
				}
				kcage_glist_free(lp);
				continue;
			}

			/* Partial delete. */
			if (base > lp->base && lim < lp->lim) {
				struct kcage_glist *new;

				/*
				 * Remove a section from the middle,
				 * need to allocate a new element.
				 */
				new = kcage_glist_alloc();
				if (new == NULL) {
					return (ENOMEM);
				}

				/*
				 * Transfer unused range to new.
				 * Edit lp in place to preserve
				 * kcage_current_glist.
				 */
				new->decr = lp->decr;
				if (new->decr != 0) {
					new->base = lp->base;
					new->lim = base;
					new->curr = base;

					lp->base = lim;
				} else {
					new->base = lim;
					new->lim = lp->lim;
					new->curr = new->base;

					lp->lim = base;
				}

				/* Insert new. */
				new->next = lp->next;
				lp->next = new;
				lpp = &lp->next;
			} else {
				/* Delete part of current block. */
				if (base > lp->base) {
					ASSERT(lim >= lp->lim);
					ASSERT(base < lp->lim);
					if (lp->decr != 0 &&
					    lp->curr == lp->lim)
						lp->curr = base;
					lp->lim = base;
				} else {
					ASSERT(base <= lp->base);
					ASSERT(lim > lp->base);
					if (lp->decr == 0 &&
					    lp->curr == lp->base)
						lp->curr = lim;
					lp->base = lim;
				}
			}
		}
		prev = *lpp;
		lpp = &(*lpp)->next;
	}

	return (0);
}

/*
 * The caller of kcage_get_pfn must hold the kcage_range_lock to make
 * sure that there are no concurrent calls. The same lock
 * must be obtained for range add and delete by calling
 * kcage_range_lock() and kcage_range_unlock().
 */
static pfn_t
kcage_get_pfn(void)
{
	struct kcage_glist *lp;
	pfn_t pfn;

	ASSERT(kcage_range_islocked());

	lp = kcage_current_glist;
	while (lp != NULL) {
		if (lp->decr != 0) {
			if (lp->curr != lp->base) {
				pfn = --lp->curr;
				return (pfn);
			}
		} else {
			if (lp->curr != lp->lim) {
				pfn = lp->curr++;
				return (pfn);
			}
		}

		lp = lp->next;
		if (lp)
			kcage_current_glist = lp;
	}

	return (PFN_INVALID);
}

/*
 * Walk the physical address space of the cage.
 * This routine does not guarantee to return PFNs in the order
 * in which they were allocated to the cage. Instead, it walks
 * each range as they appear on the growth list, returning the PFNs
 * of each range in ascending order.
 *
 * To begin scanning at lower edge of cage, reset should be nonzero.
 * To step through cage, reset should be zero.
 *
 * PFN_INVALID will be returned when the upper end of the cage is
 * reached -- indicating a full scan of the cage has been completed since
 * previous reset. PFN_INVALID will continue to be returned until
 * kcage_walk_cage is reset.
 *
 * It is possible to receive a PFN_INVALID result on reset if a growth
 * list is not installed or if none of the PFNs in the installed list have
 * been allocated to the cage. In other words, there is no cage.
 *
 * Caller need not hold kcage_range_lock while calling this function
 * as the front part of the list is static - pages never come out of
 * the cage.
 *
 * The caller is expected to only be kcage_cageout().
 */
static pfn_t
kcage_walk_cage(int reset)
{
	static struct kcage_glist *lp = NULL;
	static pfn_t pfn;

	if (reset)
		lp = NULL;
	if (lp == NULL) {
		lp = kcage_glist;
		pfn = PFN_INVALID;
	}
again:
	if (pfn == PFN_INVALID) {
		if (lp == NULL)
			return (PFN_INVALID);

		if (lp->decr != 0) {
			/*
			 * In this range the cage grows from the highest
			 * address towards the lowest.
			 * Arrange to return pfns from curr to lim-1,
			 * inclusive, in ascending order.
			 */

			pfn = lp->curr;
		} else {
			/*
			 * In this range the cage grows from the lowest
			 * address towards the highest.
			 * Arrange to return pfns from base to curr,
			 * inclusive, in ascending order.
			 */

			pfn = lp->base;
		}
	}

	if (lp->decr != 0) {		/* decrementing pfn */
		if (pfn == lp->lim) {
			/* Don't go beyond the static part of the glist. */
			if (lp == kcage_current_glist)
				lp = NULL;
			else
				lp = lp->next;
			pfn = PFN_INVALID;
			goto again;
		}

		ASSERT(pfn >= lp->curr && pfn < lp->lim);
	} else {			/* incrementing pfn */
		if (pfn == lp->curr) {
			/* Don't go beyond the static part of the glist. */
			if (lp == kcage_current_glist)
				lp = NULL;
			else
				lp = lp->next;
			pfn = PFN_INVALID;
			goto again;
		}

		ASSERT(pfn >= lp->base && pfn < lp->curr);
	}

	return (pfn++);
}

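/*
 * Illustrative usage (added commentary): the cageout thread below drives
 * the walk by passing a reset flag on the first call and again whenever
 * the previous call returned PFN_INVALID:
 *
 *	pfn = PFN_INVALID;
 *	while ((pfn = kcage_walk_cage(pfn == PFN_INVALID)) != PFN_INVALID) {
 *		(examine page_numtopp_nolock(pfn))
 *	}
 */
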
/*
 * Callback functions to recalculate cage thresholds after
 * Kphysm memory add/delete operations.
 */
/*ARGSUSED*/
static void
kcage_kphysm_postadd_cb(void *arg, pgcnt_t delta_pages)
{
	kcage_recalc_thresholds();
}

/*ARGSUSED*/
static int
kcage_kphysm_predel_cb(void *arg, pgcnt_t delta_pages)
{
	/* TODO: when should cage refuse memory delete requests? */
	return (0);
}

/*ARGSUSED*/
static void
kcage_kphysm_postdel_cb(void *arg, pgcnt_t delta_pages, int cancelled)
{
	kcage_recalc_thresholds();
}

static kphysm_setup_vector_t kcage_kphysm_vectors = {
	KPHYSM_SETUP_VECTOR_VERSION,
	kcage_kphysm_postadd_cb,
	kcage_kphysm_predel_cb,
	kcage_kphysm_postdel_cb
};

/*
 * This is called before a CPR suspend and after a CPR resume. We have to
 * turn off kcage_cageout_ready before a suspend, and turn it back on after a
 * restart.
 */
/*ARGSUSED*/
static boolean_t
kcage_cageout_cpr(void *arg, int code)
{
	if (code == CB_CODE_CPR_CHKPT) {
		ASSERT(kcage_cageout_ready);
		kcage_cageout_ready = 0;
		return (B_TRUE);
	} else if (code == CB_CODE_CPR_RESUME) {
		ASSERT(kcage_cageout_ready == 0);
		kcage_cageout_ready = 1;
		return (B_TRUE);
	}
	return (B_FALSE);
}

/*
 * kcage_recalc_preferred_size() increases initial cage size to improve large
 * page availability when lp for kmem is enabled and kpr is disabled
 */
static pgcnt_t
kcage_recalc_preferred_size(pgcnt_t preferred_size)
{
	if (SEGKMEM_USE_LARGEPAGES && segkmem_reloc == 0) {
		pgcnt_t lpmincage = kcage_kmemlp_mincage;
		if (lpmincage == 0) {
			lpmincage = MIN(P2ROUNDUP(((physmem * PAGESIZE) / 8),
			    segkmem_heaplp_quantum), 0x40000000UL) / PAGESIZE;
		}
		kcage_kmemlp_mincage = MIN(lpmincage,
		    (segkmem_kmemlp_max / PAGESIZE));
		preferred_size = MAX(kcage_kmemlp_mincage, preferred_size);
	}
	return (preferred_size);
}

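/*
 * Worked example (added commentary, assuming 8K pages): on a 32GB machine
 * with large kernel pages enabled and kernel page relocation disabled,
 * physmem * PAGESIZE / 8 is 4GB, which the MIN() above caps at
 * 0x40000000 (1GB), i.e. roughly 128K pages, further limited by
 * segkmem_kmemlp_max. The preferred cage size is then raised to at least
 * kcage_kmemlp_mincage.
 */
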
/*
 * Kcage_init() builds the cage and initializes the cage thresholds.
 * The size of the cage is determined by the argument preferred_size,
 * or the actual amount of memory, whichever is smaller.
 */
void
kcage_init(pgcnt_t preferred_size)
{
	pgcnt_t wanted;
	pfn_t pfn;
	page_t *pp;
	extern struct vnode kvp;
	extern void page_list_noreloc_startup(page_t *);

	ASSERT(!kcage_on);
	ASSERT(kcage_range_islocked());

	/* increase preferred cage size for lp for kmem */
	preferred_size = kcage_recalc_preferred_size(preferred_size);

	/* Debug note: initialize this now so early expansions can stat */
	KCAGE_STAT_INIT_SCAN_INDEX;

	/*
	 * Initialize cage thresholds and install kphysm callback.
	 * If we can't arrange to have the thresholds track with
	 * available physical memory, then the cage thresholds may
	 * end up over time at levels that adversely affect system
	 * performance; so, bail out.
	 */
	kcage_recalc_thresholds();
	if (kphysm_setup_func_register(&kcage_kphysm_vectors, NULL)) {
		ASSERT(0);		/* Catch this in DEBUG kernels. */
		return;
	}

	/*
	 * Limit startup cage size within the range of kcage_minfree
	 * and availrmem, inclusively.
	 */
	wanted = MIN(MAX(preferred_size, kcage_minfree), availrmem);

	/*
	 * Construct the cage. PFNs are allocated from the glist. It
	 * is assumed that the list has been properly ordered for the
	 * platform by the platform code. Typically, this is as simple
	 * as calling kcage_range_init(phys_avail, decr), where decr is
	 * 1 if the kernel has been loaded into upper end of physical
	 * memory, or 0 if the kernel has been loaded at the low end.
	 *
	 * Note: it is assumed that we are in the startup flow, so there
	 * is no reason to grab the page lock.
	 */
	kcage_freemem = 0;
	pfn = PFN_INVALID;		/* prime for alignment test */
	while (wanted != 0) {
		if ((pfn = kcage_get_pfn()) == PFN_INVALID)
			break;

		if ((pp = page_numtopp_nolock(pfn)) != NULL) {
			KCAGEPAGETS_INC();
			/*
			 * Set the noreloc state on the page.
			 * If the page is free and not already
			 * on the noreloc list then move it.
			 */
			if (PP_ISFREE(pp)) {
				if (PP_ISNORELOC(pp) == 0)
					page_list_noreloc_startup(pp);
			} else {
				ASSERT(pp->p_szc == 0);
				PP_SETNORELOC(pp);
			}
		}
		PLCNT_XFER_NORELOC(pp);
		wanted -= 1;
	}

	/*
	 * Need to go through and find kernel allocated pages
	 * and capture them into the Cage. These will primarily
	 * be pages gotten through boot_alloc().
	 */
	if (kvp.v_pages) {

		pp = kvp.v_pages;
		do {
			ASSERT(!PP_ISFREE(pp));
			ASSERT(pp->p_szc == 0);
			PP_SETNORELOC(pp);
		} while ((pp = pp->p_vpnext) != kvp.v_pages);

	}

	kcage_on = 1;

	/*
	 * CB_CL_CPR_POST_KERNEL is the class that executes from cpr_suspend()
	 * after the cageout thread is blocked, and executes from cpr_resume()
	 * before the cageout thread is restarted. By executing in this class,
	 * we are assured that the kernel cage thread won't miss wakeup calls
	 * and also CPR's larger kmem_alloc requests will not fail after
	 * CPR shuts down the cageout kernel thread.
	 */
	(void) callb_add(kcage_cageout_cpr, NULL, CB_CL_CPR_POST_KERNEL,
	    "cageout");

	/*
	 * Coalesce pages to improve large page availability. A better fix
	 * would be to coalesce pages as they are included in the cage.
	 */
	if (SEGKMEM_USE_LARGEPAGES) {
		extern void page_freelist_coalesce_all(int mnode);
		extern int max_mem_nodes;
		int mnode, max_mnodes = max_mem_nodes;
		for (mnode = 0; mnode < max_mnodes; mnode++) {
			page_freelist_coalesce_all(mnode);
		}
	}
}

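/*
 * Worked example (added commentary): with the defaults computed below, a
 * system with total_pages of 512K pages (4GB of 8K pages) gets
 * kcage_lotsfree = 2048 pages (16MB), kcage_minfree = 1024,
 * kcage_desfree = 1024, kcage_throttlefree = 512 and kcage_reserve = 32.
 * Values preset by the administrator (e.g. via /etc/system) are preserved
 * by the first/init_* logic below.
 */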
void
kcage_recalc_thresholds()
{
	static int first = 1;
	static pgcnt_t init_lotsfree;
	static pgcnt_t init_desfree;
	static pgcnt_t init_minfree;
	static pgcnt_t init_throttlefree;
	static pgcnt_t init_reserve;

	/* TODO: any reason to take more care than this with live editing? */
	mutex_enter(&kcage_cageout_mutex);
	mutex_enter(&freemem_lock);

	if (first) {
		first = 0;
		init_lotsfree = kcage_lotsfree;
		init_desfree = kcage_desfree;
		init_minfree = kcage_minfree;
		init_throttlefree = kcage_throttlefree;
		init_reserve = kcage_reserve;
	} else {
		kcage_lotsfree = init_lotsfree;
		kcage_desfree = init_desfree;
		kcage_minfree = init_minfree;
		kcage_throttlefree = init_throttlefree;
		kcage_reserve = init_reserve;
	}

	if (kcage_lotsfree == 0)
		kcage_lotsfree = MAX(32, total_pages / 256);

	if (kcage_minfree == 0)
		kcage_minfree = MAX(32, kcage_lotsfree / 2);

	if (kcage_desfree == 0)
		kcage_desfree = MAX(32, kcage_minfree);

	if (kcage_throttlefree == 0)
		kcage_throttlefree = MAX(32, kcage_minfree / 2);

	if (kcage_reserve == 0)
		kcage_reserve = MIN(32, kcage_throttlefree / 2);

	mutex_exit(&freemem_lock);
	mutex_exit(&kcage_cageout_mutex);

	if (kcage_cageout_ready) {
		if (kcage_freemem < kcage_desfree)
			kcage_cageout_wakeup();

		if (kcage_needfree) {
			mutex_enter(&kcage_throttle_mutex);
			cv_broadcast(&kcage_throttle_cv);
			mutex_exit(&kcage_throttle_mutex);
		}
	}
}

/*
 * Pageout interface:
 * kcage_cageout_init()
 */
void
kcage_cageout_init()
{
	if (kcage_on) {

		(void) thread_create(NULL, 0, kcage_cageout,
		    NULL, 0, proc_pageout, TS_RUN, maxclsyspri - 1);
	}
}


/*
 * VM Interfaces:
 * kcage_create_throttle()
 * kcage_freemem_add()
 * kcage_freemem_sub()
 */

/*
 * Wakeup cageout thread and throttle waiting for the number of pages
 * requested to become available.  For non-critical requests, a
 * timeout is added, since freemem accounting is separate from cage
 * freemem accounting: it's possible for us to get stuck and not make
 * forward progress even though there was sufficient freemem before
 * arriving here.
 */
int
kcage_create_throttle(pgcnt_t npages, int flags)
{
	int niter = 0;
	pgcnt_t lastfree;
	int enough = kcage_freemem > kcage_throttlefree + npages;

	KCAGE_STAT_INCR(kct_calls);		/* unprotected incr. */

	kcage_cageout_wakeup();			/* just to be sure */
	KCAGE_STAT_INCR(kct_cagewake);		/* unprotected incr. */

	/*
	 * Obviously, we can't throttle the cageout thread since
	 * we depend on it.  We also can't throttle the panic thread.
	 */
	if (curthread == kcage_cageout_thread || panicstr) {
		KCAGE_STAT_INCR(kct_cageout);	/* unprotected incr. */
		return (KCT_CRIT);
	}

	/*
	 * Don't throttle threads which are critical for proper
	 * vm management if we're above kcage_throttlefree or
	 * if freemem is very low.
	 */
	if (NOMEMWAIT()) {
		if (enough) {
			KCAGE_STAT_INCR(kct_exempt);	/* unprotected incr. */
			return (KCT_CRIT);
		} else if (freemem < minfree) {
			KCAGE_STAT_INCR(kct_critical);	/* unprotected incr. */
			return (KCT_CRIT);
		}
	}

	/*
	 * Don't throttle real-time threads if kcage_freemem > kcage_reserve.
	 */
	if (DISP_PRIO(curthread) > maxclsyspri &&
	    kcage_freemem > kcage_reserve) {
		KCAGE_STAT_INCR(kct_exempt);	/* unprotected incr. */
		return (KCT_CRIT);
	}

	/*
	 * Cause all other threads (which are assumed to not be
	 * critical to cageout) to wait here until their request
	 * can be satisfied. Be a little paranoid and wake the
	 * kernel cage on each loop through this logic.
	 */
	while (kcage_freemem < kcage_throttlefree + npages) {
		ASSERT(kcage_on);

		lastfree = kcage_freemem;

		if (kcage_cageout_ready) {
			mutex_enter(&kcage_throttle_mutex);

			kcage_needfree += npages;
			KCAGE_STAT_INCR(kct_wait);

			kcage_cageout_wakeup();
			KCAGE_STAT_INCR(kct_cagewake);

			cv_wait(&kcage_throttle_cv, &kcage_throttle_mutex);

			kcage_needfree -= npages;

			mutex_exit(&kcage_throttle_mutex);
		} else {
			/*
			 * NOTE: atomics are used just in case we enter
			 * mp operation before the cageout thread is ready.
			 */
			atomic_add_long(&kcage_needfree, npages);

			kcage_cageout_wakeup();
			KCAGE_STAT_INCR(kct_cagewake);	/* unprotected incr. */

			atomic_add_long(&kcage_needfree, -npages);
		}

		if ((flags & PG_WAIT) == 0) {
			if (kcage_freemem > lastfree) {
				KCAGE_STAT_INCR(kct_progress);
				niter = 0;
			} else {
				KCAGE_STAT_INCR(kct_noprogress);
				if (++niter >= kcage_maxwait) {
					KCAGE_STAT_INCR(kct_timeout);
					return (KCT_FAILURE);
				}
			}
		}
	}
	return (KCT_NONCRIT);
}

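/*
 * Illustrative caller pattern (added commentary; the caller shown is
 * hypothetical): an allocator requesting non-relocatable pages would
 * typically consult the cage watermark first and throttle only when it
 * is low, along the lines of
 *
 *	if (kcage_on && (flags & PG_NORELOC) &&
 *	    kcage_freemem < kcage_throttlefree + npages &&
 *	    kcage_create_throttle(npages, flags) == KCT_FAILURE)
 *		return (NULL);
 *
 * A KCT_CRIT or KCT_NONCRIT return means the caller may proceed with
 * the allocation.
 */
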
void
kcage_freemem_add(pgcnt_t npages)
{
	extern void wakeup_pcgs(void);

	atomic_add_long(&kcage_freemem, npages);

	wakeup_pcgs();	/* wakeup threads in pcgs() */

	if (kcage_needfree != 0 &&
	    kcage_freemem >= (kcage_throttlefree + kcage_needfree)) {

		mutex_enter(&kcage_throttle_mutex);
		cv_broadcast(&kcage_throttle_cv);
		KCAGE_STAT_INCR(kfa_trottlewake);
		mutex_exit(&kcage_throttle_mutex);
	}
}

void
kcage_freemem_sub(pgcnt_t npages)
{
	atomic_add_long(&kcage_freemem, -npages);

	if (kcage_freemem < kcage_desfree) {
		kcage_cageout_wakeup();
		KCAGE_STAT_INCR(kfs_cagewake);	/* unprotected incr. */
	}
}

/*
 * return 0 on failure and 1 on success.
 */
static int
kcage_setnoreloc_pages(page_t *rootpp, se_t se)
{
	pgcnt_t npgs, i;
	page_t *pp;
	pfn_t rootpfn = page_pptonum(rootpp);
	uint_t szc;

	ASSERT(!PP_ISFREE(rootpp));
	ASSERT(PAGE_LOCKED_SE(rootpp, se));
	if (!group_page_trylock(rootpp, se)) {
		return (0);
	}
	szc = rootpp->p_szc;
	if (szc == 0) {
		/*
		 * The szc of a locked page can only change for pages that are
		 * non-swapfs (i.e. anonymous memory) file system pages.
		 */
		ASSERT(rootpp->p_vnode != NULL &&
		    rootpp->p_vnode != &kvp &&
		    !IS_SWAPFSVP(rootpp->p_vnode));
		PP_SETNORELOC(rootpp);
		return (1);
	}
	npgs = page_get_pagecnt(szc);
	ASSERT(IS_P2ALIGNED(rootpfn, npgs));
	pp = rootpp;
	for (i = 0; i < npgs; i++, pp++) {
		ASSERT(PAGE_LOCKED_SE(pp, se));
		ASSERT(!PP_ISFREE(pp));
		ASSERT(pp->p_szc == szc);
		PP_SETNORELOC(pp);
	}
	group_page_unlock(rootpp);
	return (1);
}

/*
 * Attempt to convert page to a caged page (set the P_NORELOC flag).
 * If successful and page is free, move page to the tail of whichever
 * list it is on.
 * Returns:
 *   EBUSY  page already locked, assimilated but not free.
 *   ENOMEM page assimilated, but memory too low to relocate. Page not free.
 *   EAGAIN page not assimilated. Page not free.
 *   ERANGE page assimilated. Page not root.
 *   0      page assimilated. Page free.
 *   *nfreedp number of pages freed.
 * NOTE: With error codes ENOMEM, EBUSY, and 0 (zero), there is no way
 * to distinguish between a page that was already a NORELOC page from
 * those newly converted to NORELOC pages by this invocation of
 * kcage_assimilate_page.
 */
static int
kcage_assimilate_page(page_t *pp, pgcnt_t *nfreedp)
{
	if (page_trylock(pp, SE_EXCL)) {
		if (PP_ISNORELOC(pp)) {
check_free_and_return:
			if (PP_ISFREE(pp)) {
				page_unlock(pp);
				*nfreedp = 0;
				return (0);
			} else {
				page_unlock(pp);
				return (EBUSY);
			}
			/*NOTREACHED*/
		}
	} else {
		if (page_trylock(pp, SE_SHARED)) {
			if (PP_ISNORELOC(pp))
				goto check_free_and_return;
		} else
			return (EAGAIN);

		if (!PP_ISFREE(pp)) {
			page_unlock(pp);
			return (EAGAIN);
		}

		/*
		 * Need to upgrade the lock on it and set the NORELOC
		 * bit. If it is free then remove it from the free
		 * list so that the platform free list code can keep
		 * NORELOC pages where they should be.
		 */
		/*
		 * Before doing anything, get the exclusive lock.
		 * This may fail (eg ISM pages are left shared locked).
		 * If the page is free this will leave a hole in the
		 * cage. There is no solution yet to this.
		 */
		if (!page_tryupgrade(pp)) {
			page_unlock(pp);
			return (EAGAIN);
		}
	}

	ASSERT(PAGE_EXCL(pp));

	if (PP_ISFREE(pp)) {
		int which = PP_ISAGED(pp) ? PG_FREE_LIST : PG_CACHE_LIST;

		page_list_sub(pp, which);
		ASSERT(pp->p_szc == 0);
		PP_SETNORELOC(pp);
		PLCNT_XFER_NORELOC(pp);
		page_list_add(pp, which | PG_LIST_TAIL);

		page_unlock(pp);
		*nfreedp = 1;
		return (0);
	} else {
		if (pp->p_szc != 0) {
			if (!kcage_setnoreloc_pages(pp, SE_EXCL)) {
				page_unlock(pp);
				return (EAGAIN);
			}
			ASSERT(PP_ISNORELOC(pp));
		} else {
			PP_SETNORELOC(pp);
		}
		PLCNT_XFER_NORELOC(pp);
		return (kcage_invalidate_page(pp, nfreedp));
	}
	/*NOTREACHED*/
}

static int
kcage_expand()
{
	int did_something = 0;

	spgcnt_t wanted;
	pfn_t pfn;
	page_t *pp;
	/* TODO: we don't really need n any more? */
	pgcnt_t n;
	pgcnt_t nf, nfreed;

	/*
	 * Expand the cage if available cage memory is really low. Calculate
	 * the amount required to return kcage_freemem to the level of
	 * kcage_lotsfree, or to satisfy throttled requests, whichever is
	 * more.  It is rare for their sum to create an artificial threshold
	 * above kcage_lotsfree, but it is possible.
	 *
	 * Exit early if expansion amount is equal to or less than zero.
	 * (<0 is possible if kcage_freemem rises suddenly.)
	 *
	 * Exit early when the global page pool (apparently) does not
	 * have enough free pages to page_relocate() even a single page.
	 */
	wanted = MAX(kcage_lotsfree, kcage_throttlefree + kcage_needfree)
	    - kcage_freemem;
	if (wanted <= 0)
		return (0);
	else if (freemem < pageout_reserve + 1) {
		KCAGE_STAT_INCR(ke_lowfreemem);
		return (0);
	}

	/*
	 * Try to get the range list reader lock. If the lock is already
	 * held, then don't get stuck here waiting for it.
	 */
	if (!rw_tryenter(&kcage_range_rwlock, RW_READER))
		return (0);

	KCAGE_STAT_INCR(ke_calls);
	KCAGE_STAT_SET_SCAN(ke_wanted, (uint_t)wanted);

	/*
	 * Assimilate more pages from the global page pool into the cage.
	 */
	n = 0;				/* number of pages PP_SETNORELOC'd */
	nf = 0;				/* number of those actually free */
	while (kcage_on && nf < wanted) {
		pfn = kcage_get_pfn();
		if (pfn == PFN_INVALID) {	/* eek! nowhere to grow */
			KCAGE_STAT_INCR(ke_nopfn);
			goto terminate;
		}

		KCAGE_STAT_INCR_SCAN(ke_examined);

		if ((pp = page_numtopp_nolock(pfn)) == NULL) {
			KCAGE_STAT_INCR(ke_nopaget);
			continue;
		}
		KCAGEPAGETS_INC();
		/*
		 * Sanity check. Skip this pfn if it is
		 * being deleted.
		 */
		if (pfn_is_being_deleted(pfn)) {
			KCAGE_STAT_INCR(ke_deleting);
			continue;
		}

		/*
		 * NORELOC is only set at boot-time or by this routine
		 * under the kcage_range_rwlock lock which is currently
		 * held. This means we can do a fast check here before
		 * locking the page in kcage_assimilate_page.
		 */
		if (PP_ISNORELOC(pp)) {
			KCAGE_STAT_INCR(ke_isnoreloc);
			continue;
		}

		switch (kcage_assimilate_page(pp, &nfreed)) {
			case 0:		/* assimilated, page is free */
				KCAGE_STAT_NINCR_SCAN(ke_gotonefree, nfreed);
				did_something = 1;
				nf += nfreed;
				n++;
				break;

			case EBUSY:	/* assimilated, page not free */
			case ERANGE:	/* assimilated, page not root */
				KCAGE_STAT_INCR_SCAN(ke_gotone);
				did_something = 1;
				n++;
				break;

			case ENOMEM:	/* assimilated, but no mem */
				KCAGE_STAT_INCR(ke_terminate);
				did_something = 1;
				n++;
				goto terminate;

			case EAGAIN:	/* can't assimilate */
				KCAGE_STAT_INCR_SCAN(ke_lefthole);
				break;

			default:	/* catch this with debug kernels */
				ASSERT(0);
				break;
		}
	}

	/*
	 * Realign cage edge with the nearest physical address
	 * boundary for big pages. This is done to give us a
	 * better chance of actually getting usable big pages
	 * in the cage.
	 */

terminate:
	kcage_range_unlock();

	return (did_something);
}

/*
 * Relocate page opp (Original Page Pointer) from cage pool to page rpp
 * (Replacement Page Pointer) in the global pool. Page opp will be freed
 * if relocation is successful, otherwise it is only unlocked.
 * On entry, page opp must be exclusively locked and not free.
 * *nfreedp: number of pages freed.
 */
static int
kcage_relocate_page(page_t *pp, pgcnt_t *nfreedp)
{
	page_t *opp = pp;
	page_t *rpp = NULL;
	spgcnt_t npgs;
	int result;

	ASSERT(!PP_ISFREE(opp));
	ASSERT(PAGE_EXCL(opp));

	result = page_relocate(&opp, &rpp, 1, 1, &npgs, NULL);
	*nfreedp = npgs;
	if (result == 0) {
		while (npgs-- > 0) {
			page_t *tpp;

			ASSERT(rpp != NULL);
			tpp = rpp;
			page_sub(&rpp, tpp);
			page_unlock(tpp);
		}

		ASSERT(rpp == NULL);

		return (0);		/* success */
	}

	page_unlock(opp);
	return (result);
}

/*
 * Based on page_invalidate_pages()
 *
 * Kcage_invalidate_page() uses page_relocate() twice. Both instances
 * of use must be updated to match the new page_relocate() when it
 * becomes available.
 *
 * Return result of kcage_relocate_page or zero if page was directly freed.
 * *nfreedp: number of pages freed.
 */
static int
kcage_invalidate_page(page_t *pp, pgcnt_t *nfreedp)
{
	int result;

#if defined(__sparc)
	extern struct vnode prom_ppages;
	ASSERT(pp->p_vnode != &prom_ppages);
#endif /* __sparc */

	ASSERT(!PP_ISFREE(pp));
	ASSERT(PAGE_EXCL(pp));

	/*
	 * Is this page involved in some I/O? shared?
	 * The page_struct_lock need not be acquired to
	 * examine these fields since the page has an
	 * "exclusive" lock.
	 */
	if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
		result = kcage_relocate_page(pp, nfreedp);
#ifdef KCAGE_STATS
		if (result == 0)
			KCAGE_STAT_INCR_SCAN(kip_reloclocked);
		else if (result == ENOMEM)
			KCAGE_STAT_INCR_SCAN(kip_nomem);
#endif
		return (result);
	}

	ASSERT(pp->p_vnode->v_type != VCHR);

	/*
	 * Unload the mappings and check if mod bit is set.
	 */
	(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);

	if (hat_ismod(pp)) {
		result = kcage_relocate_page(pp, nfreedp);
#ifdef KCAGE_STATS
		if (result == 0)
			KCAGE_STAT_INCR_SCAN(kip_relocmod);
		else if (result == ENOMEM)
			KCAGE_STAT_INCR_SCAN(kip_nomem);
#endif
		return (result);
	}

	if (!page_try_demote_pages(pp)) {
		KCAGE_STAT_INCR_SCAN(kip_demotefailed);
		page_unlock(pp);
		return (EAGAIN);
	}

	page_destroy(pp, 0);
	KCAGE_STAT_INCR_SCAN(kip_destroy);
	*nfreedp = 1;
	return (0);
}

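/*
 * Overview of the scan below (added commentary): kcage_cageout() makes
 * repeated passes over the cage. Passes 0 and 1 skip recently referenced
 * pages; passes 0 through 2 only page_destroy() small, unmodified,
 * unlocked pages; later passes fall through to kcage_invalidate_page(),
 * which relocates or destroys whatever it can. Pages mapped more than
 * shared_level times are skipped, and shared_level is doubled between
 * scans (up to 8<<24) when sharing appears to be the only reason the
 * scan made no progress.
 */
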
static void
kcage_cageout()
{
	pfn_t pfn;
	page_t *pp;
	callb_cpr_t cprinfo;
	int did_something;
	int scan_again;
	pfn_t start_pfn;
	int pass;
	int last_pass;
	int pages_skipped;
	int shared_skipped;
	uint_t shared_level = 8;
	pgcnt_t nfreed;
#ifdef KCAGE_STATS
	clock_t scan_start;
#endif

	CALLB_CPR_INIT(&cprinfo, &kcage_cageout_mutex,
	    callb_generic_cpr, "cageout");

	mutex_enter(&kcage_cageout_mutex);
	kcage_cageout_thread = curthread;

	pfn = PFN_INVALID;		/* force scan reset */
	start_pfn = PFN_INVALID;	/* force init with 1st cage pfn */
	kcage_cageout_ready = 1;	/* switch kcage_cageout_wakeup mode */

loop:
	/*
	 * Wait here. Sooner or later, kcage_freemem_sub() will notice
	 * that kcage_freemem is less than kcage_desfree. When it does
	 * notice, kcage_freemem_sub() will wake us up via call to
	 * kcage_cageout_wakeup().
	 */
	CALLB_CPR_SAFE_BEGIN(&cprinfo);
	cv_wait(&kcage_cageout_cv, &kcage_cageout_mutex);
	CALLB_CPR_SAFE_END(&cprinfo, &kcage_cageout_mutex);

	KCAGE_STAT_INCR(kt_wakeups);
	KCAGE_STAT_SET_SCAN(kt_freemem_start, freemem);
	KCAGE_STAT_SET_SCAN(kt_kcage_freemem_start, kcage_freemem);
	pass = 0;
	last_pass = 0;

#ifdef KCAGE_STATS
	scan_start = lbolt;
#endif

again:
	if (!kcage_on)
		goto loop;

	KCAGE_STAT_INCR(kt_scans);
	KCAGE_STAT_INCR_SCAN(kt_passes);

	did_something = 0;
	pages_skipped = 0;
	shared_skipped = 0;
	while ((kcage_freemem < kcage_lotsfree || kcage_needfree) &&
	    (pfn = kcage_walk_cage(pfn == PFN_INVALID)) != PFN_INVALID) {

		if (start_pfn == PFN_INVALID)
			start_pfn = pfn;
		else if (start_pfn == pfn) {
			last_pass = pass;
			pass += 1;
			/*
			 * Did a complete walk of kernel cage, but didn't free
			 * any pages.  If only one cpu is online then
			 * stop kernel cage walk and try expanding.
			 */
			if (ncpus_online == 1 && did_something == 0) {
				KCAGE_STAT_INCR(kt_cageout_break);
				break;
			}
		}

		pp = page_numtopp_nolock(pfn);
		if (pp == NULL) {
			continue;
		}

		KCAGE_STAT_INCR_SCAN(kt_examined);

		/*
		 * Do a quick PP_ISNORELOC() and PP_ISFREE test outside
		 * of the lock. If one is missed it will be seen next
		 * time through.
		 *
		 * Skip non-caged-pages. These pages can exist in the cage
		 * because, if a long-term locked page is encountered
		 * during cage expansion, the lock prevents the expansion
		 * logic from setting the P_NORELOC flag. Hence,
		 * non-caged-pages can be surrounded by caged-pages.
		 */
		if (!PP_ISNORELOC(pp)) {
			switch (kcage_assimilate_page(pp, &nfreed)) {
				case 0:
					did_something = 1;
					KCAGE_STAT_NINCR_SCAN(kt_gotonefree,
					    nfreed);
					break;

				case EBUSY:
				case ERANGE:
					did_something = 1;
					KCAGE_STAT_INCR_SCAN(kt_gotone);
					break;

				case EAGAIN:
				case ENOMEM:
					break;

				default:
					/* catch this with debug kernels */
					ASSERT(0);
					break;
			}

			continue;
		} else {
			int prm;

			if (PP_ISFREE(pp)) {
				continue;
			}

			if ((pp->p_vnode == &kvp && pp->p_lckcnt > 0) ||
			    !page_trylock(pp, SE_EXCL)) {
				KCAGE_STAT_INCR_SCAN(kt_cantlock);
				continue;
			}

			/* P_NORELOC bit should not have gone away. */
			ASSERT(PP_ISNORELOC(pp));
			if (PP_ISFREE(pp) || (pp->p_vnode == &kvp &&
			    pp->p_lckcnt > 0)) {
				page_unlock(pp);
				continue;
			}

			KCAGE_STAT_SET_SCAN(kt_skiplevel, shared_level);
			if (hat_page_getshare(pp) > shared_level) {
				page_unlock(pp);
				pages_skipped = 1;
				shared_skipped = 1;
				KCAGE_STAT_INCR_SCAN(kt_skipshared);
				continue;
			}

			/*
			 * In pass {0, 1}, skip page if ref bit is set.
			 * In pass {0, 1, 2}, skip page if mod bit is set.
			 */
			prm = hat_pagesync(pp,
			    HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD);

			/* On first pass ignore ref'd pages */
			if (pass <= 1 && (prm & P_REF)) {
				KCAGE_STAT_INCR_SCAN(kt_skiprefd);
				pages_skipped = 1;
				page_unlock(pp);
				continue;
			}

			/* On pass 2, page_destroy if mod bit is not set */
			if (pass <= 2) {
				if (pp->p_szc != 0 || (prm & P_MOD) ||
				    pp->p_lckcnt || pp->p_cowcnt) {
					pages_skipped = 1;
					page_unlock(pp);
				} else {

					/*
					 * unload the mappings before
					 * checking if mod bit is set
					 */
					(void) hat_pageunload(pp,
					    HAT_FORCE_PGUNLOAD);

					/*
					 * skip this page if modified
					 */
					if (hat_ismod(pp)) {
						pages_skipped = 1;
						page_unlock(pp);
						continue;
					}

					KCAGE_STAT_INCR_SCAN(kt_destroy);
					page_destroy(pp, 0);
					did_something = 1;
				}
				continue;
			}

			if (kcage_invalidate_page(pp, &nfreed) == 0) {
				did_something = 1;
				KCAGE_STAT_NINCR_SCAN(kt_gotonefree, nfreed);
			}

			/*
			 * No need to drop the page lock here.
			 * Kcage_invalidate_page has done that for us
			 * either explicitly or through a page_free.
			 */
		}
	}

	/*
	 * Expand the cage only if available cage memory is really low.
	 * This test is done only after a complete scan of the cage.
	 * The reason for not checking and expanding more often is to
	 * avoid rapid expansion of the cage. Naturally, scanning the
	 * cage takes time. So by scanning first, we use that work as a
	 * delay loop in between expand decisions.
	 */

	scan_again = 0;
	if (kcage_freemem < kcage_minfree || kcage_needfree) {
		/*
		 * Kcage_expand() will return a non-zero value if it was
		 * able to expand the cage -- whether or not the new
		 * pages are free and immediately usable. If non-zero,
		 * we do another scan of the cage. The pages might be
		 * freed during that scan or by time we get back here.
		 * If not, we will attempt another expansion.
		 * However, if kcage_expand() returns zero, then it was
		 * unable to expand the cage. This is the case when the
		 * growth list is exhausted, therefore no work was done
		 * and there is no reason to scan the cage again.
		 * Note: Kernel cage scan is not repeated on single-cpu
		 * system to avoid kernel cage thread hogging cpu.
		 */
		if (pass <= 3 && pages_skipped && ncpus_online > 1)
			scan_again = 1;
		else
			(void) kcage_expand();		/* don't scan again */
	} else if (kcage_freemem < kcage_lotsfree) {
		/*
		 * If available cage memory is less than abundant
		 * and a full scan of the cage has not yet been completed,
		 * or a scan has completed and some work was performed,
		 * or pages were skipped because of sharing,
		 * or we simply have not yet completed two passes,
		 * then do another scan.
		 */
		if (pass <= 2 && pages_skipped)
			scan_again = 1;
		if (pass == last_pass || did_something)
			scan_again = 1;
		else if (shared_skipped && shared_level < (8<<24)) {
			shared_level <<= 1;
			scan_again = 1;
		}
	}

	if (scan_again && ncpus_online > 1)
		goto again;
	else {
		if (shared_level > 8)
			shared_level >>= 1;

		KCAGE_STAT_SET_SCAN(kt_freemem_end, freemem);
		KCAGE_STAT_SET_SCAN(kt_kcage_freemem_end, kcage_freemem);
		KCAGE_STAT_SET_SCAN(kt_ticks, lbolt - scan_start);
		KCAGE_STAT_INC_SCAN_INDEX;
		goto loop;
	}

	/*NOTREACHED*/
}

void
kcage_cageout_wakeup()
{
	if (mutex_tryenter(&kcage_cageout_mutex)) {
		if (kcage_cageout_ready) {
			cv_signal(&kcage_cageout_cv);
		} else if (kcage_freemem < kcage_minfree || kcage_needfree) {
			/*
			 * Available cage memory is really low. Time to
			 * start expanding the cage. However, the
			 * kernel cage thread is not yet ready to
			 * do the work. Use *this* thread, which is
			 * most likely to be t0, to do the work.
			 */
			KCAGE_STAT_INCR(kcw_expandearly);
			(void) kcage_expand();
			KCAGE_STAT_INC_SCAN_INDEX;
		}

		mutex_exit(&kcage_cageout_mutex);
	}
	/* else, kernel cage thread is already running */
}

void
kcage_tick()
{
	/*
	 * Once per second we wake up all the threads throttled
	 * waiting for cage memory, in case we've become stuck
	 * and haven't made forward progress expanding the cage.
	 */
	if (kcage_on && kcage_cageout_ready)
		cv_broadcast(&kcage_throttle_cv);
}