1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 #include <sys/types.h> 29 #include <sys/param.h> 30 #include <sys/thread.h> 31 #include <sys/proc.h> 32 #include <sys/callb.h> 33 #include <sys/vnode.h> 34 #include <sys/debug.h> 35 #include <sys/systm.h> /* for bzero */ 36 #include <sys/memlist.h> 37 #include <sys/cmn_err.h> 38 #include <sys/sysmacros.h> 39 #include <sys/vmsystm.h> /* for NOMEMWAIT() */ 40 #include <sys/atomic.h> /* used to update kcage_freemem */ 41 #include <sys/kmem.h> /* for kmem_reap */ 42 #include <sys/errno.h> 43 #include <sys/mem_cage.h> 44 #include <vm/seg_kmem.h> 45 #include <vm/page.h> 46 #include <vm/hat.h> 47 #include <vm/vm_dep.h> 48 #include <sys/mem_config.h> 49 #include <sys/lgrp.h> 50 #include <sys/rwlock.h> 51 52 extern pri_t maxclsyspri; 53 54 #ifdef DEBUG 55 #define KCAGE_STATS 56 #endif 57 58 #ifdef KCAGE_STATS 59 60 #define KCAGE_STATS_VERSION 9 /* can help report generators */ 61 #define KCAGE_STATS_NSCANS 256 /* depth of scan statistics buffer */ 62 63 struct kcage_stats_scan { 64 /* managed by KCAGE_STAT_* macros */ 65 clock_t scan_lbolt; 66 uint_t scan_id; 67 68 /* set in kcage_cageout() */ 69 uint_t kt_passes; 70 clock_t kt_ticks; 71 pgcnt_t kt_kcage_freemem_start; 72 pgcnt_t kt_kcage_freemem_end; 73 pgcnt_t kt_freemem_start; 74 pgcnt_t kt_freemem_end; 75 uint_t kt_examined; 76 uint_t kt_cantlock; 77 uint_t kt_gotone; 78 uint_t kt_gotonefree; 79 uint_t kt_skiplevel; 80 uint_t kt_skipshared; 81 uint_t kt_skiprefd; 82 uint_t kt_destroy; 83 84 /* set in kcage_invalidate_page() */ 85 uint_t kip_reloclocked; 86 uint_t kip_relocmod; 87 uint_t kip_destroy; 88 uint_t kip_nomem; 89 uint_t kip_demotefailed; 90 91 /* set in kcage_expand() */ 92 uint_t ke_wanted; 93 uint_t ke_examined; 94 uint_t ke_lefthole; 95 uint_t ke_gotone; 96 uint_t ke_gotonefree; 97 }; 98 99 struct kcage_stats { 100 /* managed by KCAGE_STAT_* macros */ 101 uint_t version; 102 uint_t size; 103 104 /* set in kcage_cageout */ 105 uint_t kt_wakeups; 106 uint_t kt_scans; 107 uint_t kt_cageout_break; 108 109 /* set in kcage_expand */ 110 uint_t ke_calls; 111 uint_t ke_nopfn; 112 uint_t ke_nopaget; 113 uint_t ke_isnoreloc; 114 uint_t ke_deleting; 115 uint_t ke_lowfreemem; 116 uint_t ke_terminate; 117 118 /* set in kcage_freemem_add() */ 119 uint_t kfa_trottlewake; 120 121 /* set in kcage_freemem_sub() */ 122 uint_t kfs_cagewake; 123 124 /* set in kcage_create_throttle */ 125 uint_t kct_calls; 126 uint_t kct_cageout; 127 uint_t kct_critical; 128 uint_t kct_exempt; 129 uint_t kct_cagewake; 130 uint_t kct_wait; 131 uint_t kct_progress; 132 uint_t kct_noprogress; 133 uint_t kct_timeout; 134 135 /* set in kcage_cageout_wakeup */ 136 uint_t kcw_expandearly; 137 138 /* managed by KCAGE_STAT_* macros */ 139 uint_t scan_array_size; 140 uint_t scan_index; 141 struct kcage_stats_scan scans[KCAGE_STATS_NSCANS]; 142 }; 143 144 static struct kcage_stats kcage_stats; 145 static struct kcage_stats_scan kcage_stats_scan_zero; 146 147 /* 148 * No real need for atomics here. For the most part the incs and sets are 149 * done by the kernel cage thread. There are a few that are done by any 150 * number of other threads. Those cases are noted by comments. 151 */ 152 #define KCAGE_STAT_INCR(m) kcage_stats.m++ 153 154 #define KCAGE_STAT_NINCR(m, v) kcage_stats.m += (v) 155 156 #define KCAGE_STAT_INCR_SCAN(m) \ 157 KCAGE_STAT_INCR(scans[kcage_stats.scan_index].m) 158 159 #define KCAGE_STAT_NINCR_SCAN(m, v) \ 160 KCAGE_STAT_NINCR(scans[kcage_stats.scan_index].m, v) 161 162 #define KCAGE_STAT_SET(m, v) kcage_stats.m = (v) 163 164 #define KCAGE_STAT_SETZ(m, v) \ 165 if (kcage_stats.m == 0) kcage_stats.m = (v) 166 167 #define KCAGE_STAT_SET_SCAN(m, v) \ 168 KCAGE_STAT_SET(scans[kcage_stats.scan_index].m, v) 169 170 #define KCAGE_STAT_SETZ_SCAN(m, v) \ 171 KCAGE_STAT_SETZ(scans[kcage_stats.scan_index].m, v) 172 173 #define KCAGE_STAT_INC_SCAN_INDEX \ 174 KCAGE_STAT_SET_SCAN(scan_lbolt, lbolt); \ 175 KCAGE_STAT_SET_SCAN(scan_id, kcage_stats.scan_index); \ 176 kcage_stats.scan_index = \ 177 (kcage_stats.scan_index + 1) % KCAGE_STATS_NSCANS; \ 178 kcage_stats.scans[kcage_stats.scan_index] = kcage_stats_scan_zero 179 180 #define KCAGE_STAT_INIT_SCAN_INDEX \ 181 kcage_stats.version = KCAGE_STATS_VERSION; \ 182 kcage_stats.size = sizeof (kcage_stats); \ 183 kcage_stats.scan_array_size = KCAGE_STATS_NSCANS; \ 184 kcage_stats.scan_index = 0 185 186 #else /* KCAGE_STATS */ 187 188 #define KCAGE_STAT_INCR(v) 189 #define KCAGE_STAT_NINCR(m, v) 190 #define KCAGE_STAT_INCR_SCAN(v) 191 #define KCAGE_STAT_NINCR_SCAN(m, v) 192 #define KCAGE_STAT_SET(m, v) 193 #define KCAGE_STAT_SETZ(m, v) 194 #define KCAGE_STAT_SET_SCAN(m, v) 195 #define KCAGE_STAT_SETZ_SCAN(m, v) 196 #define KCAGE_STAT_INC_SCAN_INDEX 197 #define KCAGE_STAT_INIT_SCAN_INDEX 198 199 #endif /* KCAGE_STATS */ 200 201 static kmutex_t kcage_throttle_mutex; /* protects kcage_throttle_cv */ 202 static kcondvar_t kcage_throttle_cv; 203 204 static kmutex_t kcage_cageout_mutex; /* protects cv and ready flag */ 205 static kcondvar_t kcage_cageout_cv; /* cageout thread naps here */ 206 static int kcage_cageout_ready; /* nonzero when cageout thread ready */ 207 kthread_id_t kcage_cageout_thread; /* to aid debugging */ 208 209 static krwlock_t kcage_range_rwlock; /* protects kcage_glist elements */ 210 211 /* 212 * Cage expansion happens within a range. 213 */ 214 struct kcage_glist { 215 struct kcage_glist *next; 216 pfn_t base; 217 pfn_t lim; 218 pfn_t curr; 219 int decr; 220 }; 221 222 static struct kcage_glist *kcage_glist; 223 static struct kcage_glist *kcage_current_glist; 224 225 /* 226 * The firstfree element is provided so that kmem_alloc can be avoided 227 * until that cage has somewhere to go. This is not currently a problem 228 * as early kmem_alloc's use BOP_ALLOC instead of page_create_va. 229 */ 230 static vmem_t *kcage_arena; 231 static struct kcage_glist kcage_glist_firstfree; 232 static struct kcage_glist *kcage_glist_freelist = &kcage_glist_firstfree; 233 234 /* 235 * Miscellaneous forward references 236 */ 237 static struct kcage_glist *kcage_glist_alloc(void); 238 static int kcage_glist_delete(pfn_t, pfn_t, struct kcage_glist **); 239 static void kcage_cageout(void); 240 static int kcage_invalidate_page(page_t *, pgcnt_t *); 241 static int kcage_setnoreloc_pages(page_t *, se_t); 242 static int kcage_range_add_internal(pfn_t base, pgcnt_t npgs, kcage_dir_t); 243 static void kcage_init(pgcnt_t preferred_size); 244 static int kcage_range_delete_internal(pfn_t base, pgcnt_t npgs); 245 246 /* 247 * Kernel Memory Cage counters and thresholds. 248 */ 249 int kcage_on = 0; 250 pgcnt_t kcage_freemem; 251 pgcnt_t kcage_needfree; 252 pgcnt_t kcage_lotsfree; 253 pgcnt_t kcage_desfree; 254 pgcnt_t kcage_minfree; 255 pgcnt_t kcage_throttlefree; 256 pgcnt_t kcage_reserve; 257 int kcage_maxwait = 10; /* in seconds */ 258 259 /* when we use lp for kmem we start the cage at a higher initial value */ 260 pgcnt_t kcage_kmemlp_mincage; 261 262 #ifdef DEBUG 263 pgcnt_t kcage_pagets; 264 #define KCAGEPAGETS_INC() kcage_pagets++ 265 #else 266 #define KCAGEPAGETS_INC() 267 #endif 268 269 /* kstats to export what pages are currently caged */ 270 kmutex_t kcage_kstat_lock; 271 static int kcage_kstat_update(kstat_t *ksp, int rw); 272 static int kcage_kstat_snapshot(kstat_t *ksp, void *buf, int rw); 273 274 /* 275 * Startup and Dynamic Reconfiguration interfaces. 276 * kcage_range_add() 277 * kcage_range_del() 278 * kcage_range_delete_post_mem_del() 279 * kcage_range_init() 280 * kcage_set_thresholds() 281 */ 282 283 /* 284 * Called from page_get_contig_pages to get the approximate kcage pfn range 285 * for exclusion from search for contiguous pages. This routine is called 286 * without kcage_range lock (kcage routines can call page_get_contig_pages 287 * through page_relocate) and with the assumption, based on kcage_range_add, 288 * that kcage_current_glist always contain a valid pointer. 289 */ 290 291 int 292 kcage_current_pfn(pfn_t *pfncur) 293 { 294 struct kcage_glist *lp = kcage_current_glist; 295 296 ASSERT(kcage_on); 297 298 ASSERT(lp != NULL); 299 300 *pfncur = lp->curr; 301 302 return (lp->decr); 303 } 304 305 /* 306 * Called from vm_pagelist.c during coalesce to find kernel cage regions 307 * within an mnode. Looks for the lowest range between lo and hi. 308 * 309 * Kernel cage memory is defined between kcage_glist and kcage_current_glist. 310 * Non-cage memory is defined between kcage_current_glist and list end. 311 * 312 * If incage is set, returns the lowest kcage range. Otherwise returns lowest 313 * non-cage range. 314 * 315 * Returns zero on success and nlo, nhi: 316 * lo <= nlo < nhi <= hi 317 * Returns non-zero if no overlapping range is found. 318 */ 319 int 320 kcage_next_range(int incage, pfn_t lo, pfn_t hi, 321 pfn_t *nlo, pfn_t *nhi) 322 { 323 struct kcage_glist *lp; 324 pfn_t tlo = hi; 325 pfn_t thi = hi; 326 327 ASSERT(lo <= hi); 328 329 /* 330 * Reader lock protects the list, but kcage_get_pfn 331 * running concurrently may advance kcage_current_glist 332 * and also update kcage_current_glist->curr. Page 333 * coalesce can handle this race condition. 334 */ 335 rw_enter(&kcage_range_rwlock, RW_READER); 336 337 for (lp = incage ? kcage_glist : kcage_current_glist; 338 lp != NULL; lp = lp->next) { 339 340 pfn_t klo, khi; 341 342 /* find the range limits in this element */ 343 if ((incage && lp->decr) || (!incage && !lp->decr)) { 344 klo = lp->curr; 345 khi = lp->lim; 346 } else { 347 klo = lp->base; 348 khi = lp->curr; 349 } 350 351 /* handle overlap */ 352 if (klo < tlo && klo < khi && lo < khi && klo < hi) { 353 tlo = MAX(lo, klo); 354 thi = MIN(hi, khi); 355 if (tlo == lo) 356 break; 357 } 358 359 /* check end of kcage */ 360 if (incage && lp == kcage_current_glist) { 361 break; 362 } 363 } 364 365 rw_exit(&kcage_range_rwlock); 366 367 /* return non-zero if no overlapping range found */ 368 if (tlo == thi) 369 return (1); 370 371 ASSERT(lo <= tlo && tlo < thi && thi <= hi); 372 373 /* return overlapping range */ 374 *nlo = tlo; 375 *nhi = thi; 376 return (0); 377 } 378 379 void 380 kcage_range_init(struct memlist *ml, kcage_dir_t d, pgcnt_t preferred_size) 381 { 382 int ret = 0; 383 384 ASSERT(kcage_arena == NULL); 385 kcage_arena = vmem_create("kcage_arena", NULL, 0, sizeof (uint64_t), 386 segkmem_alloc, segkmem_free, heap_arena, 0, VM_SLEEP); 387 ASSERT(kcage_arena != NULL); 388 389 if (d == KCAGE_DOWN) { 390 while (ml->next != NULL) 391 ml = ml->next; 392 } 393 394 rw_enter(&kcage_range_rwlock, RW_WRITER); 395 396 while (ml != NULL) { 397 ret = kcage_range_add_internal(btop(ml->address), 398 btop(ml->size), d); 399 if (ret) 400 panic("kcage_range_add_internal failed: " 401 "ml=%p, ret=0x%x\n", ml, ret); 402 403 ml = (d == KCAGE_DOWN ? ml->prev : ml->next); 404 } 405 406 rw_exit(&kcage_range_rwlock); 407 408 if (ret == 0) 409 kcage_init(preferred_size); 410 } 411 412 /* 413 * Third arg controls direction of growth: 0: increasing pfns, 414 * 1: decreasing. 415 */ 416 static int 417 kcage_range_add_internal(pfn_t base, pgcnt_t npgs, kcage_dir_t d) 418 { 419 struct kcage_glist *new, **lpp; 420 pfn_t lim; 421 422 ASSERT(rw_write_held(&kcage_range_rwlock)); 423 424 ASSERT(npgs != 0); 425 if (npgs == 0) 426 return (EINVAL); 427 428 lim = base + npgs; 429 430 ASSERT(lim > base); 431 if (lim <= base) 432 return (EINVAL); 433 434 new = kcage_glist_alloc(); 435 if (new == NULL) { 436 return (ENOMEM); 437 } 438 439 new->base = base; 440 new->lim = lim; 441 new->decr = (d == KCAGE_DOWN); 442 if (new->decr != 0) 443 new->curr = new->lim; 444 else 445 new->curr = new->base; 446 /* 447 * Any overlapping existing ranges are removed by deleting 448 * from the new list as we search for the tail. 449 */ 450 lpp = &kcage_glist; 451 while (*lpp != NULL) { 452 int ret; 453 ret = kcage_glist_delete((*lpp)->base, (*lpp)->lim, &new); 454 if (ret != 0) 455 return (ret); 456 lpp = &(*lpp)->next; 457 } 458 459 *lpp = new; 460 461 if (kcage_current_glist == NULL) { 462 kcage_current_glist = kcage_glist; 463 } 464 465 return (0); 466 } 467 468 int 469 kcage_range_add(pfn_t base, pgcnt_t npgs, kcage_dir_t d) 470 { 471 int ret; 472 473 rw_enter(&kcage_range_rwlock, RW_WRITER); 474 ret = kcage_range_add_internal(base, npgs, d); 475 rw_exit(&kcage_range_rwlock); 476 return (ret); 477 } 478 479 /* 480 * Calls to add and delete must be protected by kcage_range_rwlock 481 */ 482 static int 483 kcage_range_delete_internal(pfn_t base, pgcnt_t npgs) 484 { 485 struct kcage_glist *lp; 486 pfn_t lim; 487 488 ASSERT(rw_write_held(&kcage_range_rwlock)); 489 490 ASSERT(npgs != 0); 491 if (npgs == 0) 492 return (EINVAL); 493 494 lim = base + npgs; 495 496 ASSERT(lim > base); 497 if (lim <= base) 498 return (EINVAL); 499 500 /* 501 * Check if the delete is OK first as a number of elements 502 * might be involved and it will be difficult to go 503 * back and undo (can't just add the range back in). 504 */ 505 for (lp = kcage_glist; lp != NULL; lp = lp->next) { 506 /* 507 * If there have been no pages allocated from this 508 * element, we don't need to check it. 509 */ 510 if ((lp->decr == 0 && lp->curr == lp->base) || 511 (lp->decr != 0 && lp->curr == lp->lim)) 512 continue; 513 /* 514 * If the element does not overlap, its OK. 515 */ 516 if (base >= lp->lim || lim <= lp->base) 517 continue; 518 /* 519 * Overlapping element: Does the range to be deleted 520 * overlap the area already used? If so fail. 521 */ 522 if (lp->decr == 0 && base < lp->curr && lim >= lp->base) { 523 return (EBUSY); 524 } 525 if (lp->decr != 0 && base < lp->lim && lim >= lp->curr) { 526 return (EBUSY); 527 } 528 } 529 return (kcage_glist_delete(base, lim, &kcage_glist)); 530 } 531 532 int 533 kcage_range_delete(pfn_t base, pgcnt_t npgs) 534 { 535 int ret; 536 537 rw_enter(&kcage_range_rwlock, RW_WRITER); 538 ret = kcage_range_delete_internal(base, npgs); 539 rw_exit(&kcage_range_rwlock); 540 return (ret); 541 } 542 543 /* 544 * Calls to add and delete must be protected by kcage_range_rwlock. 545 * This routine gets called after successful Solaris memory 546 * delete operation from DR post memory delete routines. 547 */ 548 static int 549 kcage_range_delete_post_mem_del_internal(pfn_t base, pgcnt_t npgs) 550 { 551 pfn_t lim; 552 553 ASSERT(rw_write_held(&kcage_range_rwlock)); 554 555 ASSERT(npgs != 0); 556 if (npgs == 0) 557 return (EINVAL); 558 559 lim = base + npgs; 560 561 ASSERT(lim > base); 562 if (lim <= base) 563 return (EINVAL); 564 565 return (kcage_glist_delete(base, lim, &kcage_glist)); 566 } 567 568 int 569 kcage_range_delete_post_mem_del(pfn_t base, pgcnt_t npgs) 570 { 571 int ret; 572 573 rw_enter(&kcage_range_rwlock, RW_WRITER); 574 ret = kcage_range_delete_post_mem_del_internal(base, npgs); 575 rw_exit(&kcage_range_rwlock); 576 return (ret); 577 } 578 579 /* 580 * No locking is required here as the whole operation is covered 581 * by kcage_range_rwlock writer lock. 582 */ 583 static struct kcage_glist * 584 kcage_glist_alloc(void) 585 { 586 struct kcage_glist *new; 587 588 if ((new = kcage_glist_freelist) != NULL) { 589 kcage_glist_freelist = new->next; 590 } else { 591 new = vmem_alloc(kcage_arena, sizeof (*new), VM_NOSLEEP); 592 } 593 594 if (new != NULL) 595 bzero(new, sizeof (*new)); 596 597 return (new); 598 } 599 600 static void 601 kcage_glist_free(struct kcage_glist *lp) 602 { 603 lp->next = kcage_glist_freelist; 604 kcage_glist_freelist = lp; 605 } 606 607 static int 608 kcage_glist_delete(pfn_t base, pfn_t lim, struct kcage_glist **lpp) 609 { 610 struct kcage_glist *lp, *prev = *lpp; 611 612 while ((lp = *lpp) != NULL) { 613 if (lim > lp->base && base < lp->lim) { 614 /* The delete range overlaps this element. */ 615 if (base <= lp->base && lim >= lp->lim) { 616 /* Delete whole element. */ 617 *lpp = lp->next; 618 if (lp == kcage_current_glist) { 619 /* This can never happen. */ 620 ASSERT(kcage_current_glist != prev); 621 kcage_current_glist = prev; 622 } 623 kcage_glist_free(lp); 624 continue; 625 } 626 627 /* Partial delete. */ 628 if (base > lp->base && lim < lp->lim) { 629 struct kcage_glist *new; 630 631 /* 632 * Remove a section from the middle, 633 * need to allocate a new element. 634 */ 635 new = kcage_glist_alloc(); 636 if (new == NULL) { 637 return (ENOMEM); 638 } 639 640 /* 641 * Tranfser unused range to new. 642 * Edit lp in place to preserve 643 * kcage_current_glist. 644 */ 645 new->decr = lp->decr; 646 if (new->decr != 0) { 647 new->base = lp->base; 648 new->lim = base; 649 new->curr = base; 650 651 lp->base = lim; 652 } else { 653 new->base = lim; 654 new->lim = lp->lim; 655 new->curr = new->base; 656 657 lp->lim = base; 658 } 659 660 /* Insert new. */ 661 new->next = lp->next; 662 lp->next = new; 663 lpp = &lp->next; 664 } else { 665 /* Delete part of current block. */ 666 if (base > lp->base) { 667 ASSERT(lim >= lp->lim); 668 ASSERT(base < lp->lim); 669 if (lp->decr != 0 && 670 lp->curr == lp->lim) 671 lp->curr = base; 672 lp->lim = base; 673 } else { 674 ASSERT(base <= lp->base); 675 ASSERT(lim > lp->base); 676 if (lp->decr == 0 && 677 lp->curr == lp->base) 678 lp->curr = lim; 679 lp->base = lim; 680 } 681 } 682 } 683 prev = *lpp; 684 lpp = &(*lpp)->next; 685 } 686 687 return (0); 688 } 689 690 /* 691 * If lockit is 1, kcage_get_pfn holds the 692 * reader lock for kcage_range_rwlock. 693 * Changes to lp->curr can cause race conditions, but 694 * they are handled by higher level code (see kcage_next_range.) 695 */ 696 static pfn_t 697 kcage_get_pfn(int lockit) 698 { 699 struct kcage_glist *lp; 700 pfn_t pfn = PFN_INVALID; 701 702 if (lockit && !rw_tryenter(&kcage_range_rwlock, RW_READER)) 703 return (pfn); 704 705 lp = kcage_current_glist; 706 while (lp != NULL) { 707 if (lp->decr != 0) { 708 if (lp->curr != lp->base) { 709 pfn = --lp->curr; 710 break; 711 } 712 } else { 713 if (lp->curr != lp->lim) { 714 pfn = lp->curr++; 715 break; 716 } 717 } 718 719 lp = lp->next; 720 if (lp) 721 kcage_current_glist = lp; 722 } 723 724 if (lockit) 725 rw_exit(&kcage_range_rwlock); 726 return (pfn); 727 } 728 729 /* 730 * Walk the physical address space of the cage. 731 * This routine does not guarantee to return PFNs in the order 732 * in which they were allocated to the cage. Instead, it walks 733 * each range as they appear on the growth list returning the PFNs 734 * range in ascending order. 735 * 736 * To begin scanning at lower edge of cage, reset should be nonzero. 737 * To step through cage, reset should be zero. 738 * 739 * PFN_INVALID will be returned when the upper end of the cage is 740 * reached -- indicating a full scan of the cage has been completed since 741 * previous reset. PFN_INVALID will continue to be returned until 742 * kcage_walk_cage is reset. 743 * 744 * It is possible to receive a PFN_INVALID result on reset if a growth 745 * list is not installed or if none of the PFNs in the installed list have 746 * been allocated to the cage. In otherwords, there is no cage. 747 * 748 * Caller need not hold kcage_range_rwlock while calling this function 749 * as the front part of the list is static - pages never come out of 750 * the cage. 751 * 752 * The caller is expected to only be kcage_cageout(). 753 */ 754 static pfn_t 755 kcage_walk_cage(int reset) 756 { 757 static struct kcage_glist *lp = NULL; 758 static pfn_t pfn; 759 760 if (reset) 761 lp = NULL; 762 if (lp == NULL) { 763 lp = kcage_glist; 764 pfn = PFN_INVALID; 765 } 766 again: 767 if (pfn == PFN_INVALID) { 768 if (lp == NULL) 769 return (PFN_INVALID); 770 771 if (lp->decr != 0) { 772 /* 773 * In this range the cage grows from the highest 774 * address towards the lowest. 775 * Arrange to return pfns from curr to lim-1, 776 * inclusive, in ascending order. 777 */ 778 779 pfn = lp->curr; 780 } else { 781 /* 782 * In this range the cage grows from the lowest 783 * address towards the highest. 784 * Arrange to return pfns from base to curr, 785 * inclusive, in ascending order. 786 */ 787 788 pfn = lp->base; 789 } 790 } 791 792 if (lp->decr != 0) { /* decrementing pfn */ 793 if (pfn == lp->lim) { 794 /* Don't go beyond the static part of the glist. */ 795 if (lp == kcage_current_glist) 796 lp = NULL; 797 else 798 lp = lp->next; 799 pfn = PFN_INVALID; 800 goto again; 801 } 802 803 ASSERT(pfn >= lp->curr && pfn < lp->lim); 804 } else { /* incrementing pfn */ 805 if (pfn == lp->curr) { 806 /* Don't go beyond the static part of the glist. */ 807 if (lp == kcage_current_glist) 808 lp = NULL; 809 else 810 lp = lp->next; 811 pfn = PFN_INVALID; 812 goto again; 813 } 814 815 ASSERT(pfn >= lp->base && pfn < lp->curr); 816 } 817 818 return (pfn++); 819 } 820 821 /* 822 * Callback functions for to recalc cage thresholds after 823 * Kphysm memory add/delete operations. 824 */ 825 /*ARGSUSED*/ 826 static void 827 kcage_kphysm_postadd_cb(void *arg, pgcnt_t delta_pages) 828 { 829 kcage_recalc_thresholds(); 830 } 831 832 /*ARGSUSED*/ 833 static int 834 kcage_kphysm_predel_cb(void *arg, pgcnt_t delta_pages) 835 { 836 /* TODO: when should cage refuse memory delete requests? */ 837 return (0); 838 } 839 840 /*ARGSUSED*/ 841 static void 842 kcage_kphysm_postdel_cb(void *arg, pgcnt_t delta_pages, int cancelled) 843 { 844 kcage_recalc_thresholds(); 845 } 846 847 static kphysm_setup_vector_t kcage_kphysm_vectors = { 848 KPHYSM_SETUP_VECTOR_VERSION, 849 kcage_kphysm_postadd_cb, 850 kcage_kphysm_predel_cb, 851 kcage_kphysm_postdel_cb 852 }; 853 854 /* 855 * This is called before a CPR suspend and after a CPR resume. We have to 856 * turn off kcage_cageout_ready before a suspend, and turn it back on after a 857 * restart. 858 */ 859 /*ARGSUSED*/ 860 static boolean_t 861 kcage_cageout_cpr(void *arg, int code) 862 { 863 if (code == CB_CODE_CPR_CHKPT) { 864 ASSERT(kcage_cageout_ready); 865 kcage_cageout_ready = 0; 866 return (B_TRUE); 867 } else if (code == CB_CODE_CPR_RESUME) { 868 ASSERT(kcage_cageout_ready == 0); 869 kcage_cageout_ready = 1; 870 return (B_TRUE); 871 } 872 return (B_FALSE); 873 } 874 875 /* 876 * kcage_recalc_preferred_size() increases initial cage size to improve large 877 * page availability when lp for kmem is enabled and kpr is disabled 878 */ 879 static pgcnt_t 880 kcage_recalc_preferred_size(pgcnt_t preferred_size) 881 { 882 if (SEGKMEM_USE_LARGEPAGES && segkmem_reloc == 0) { 883 pgcnt_t lpmincage = kcage_kmemlp_mincage; 884 if (lpmincage == 0) { 885 lpmincage = MIN(P2ROUNDUP(((physmem * PAGESIZE) / 8), 886 segkmem_heaplp_quantum), 0x40000000UL) / PAGESIZE; 887 } 888 kcage_kmemlp_mincage = MIN(lpmincage, 889 (segkmem_kmemlp_max / PAGESIZE)); 890 preferred_size = MAX(kcage_kmemlp_mincage, preferred_size); 891 } 892 return (preferred_size); 893 } 894 895 /* 896 * Kcage_init() builds the cage and initializes the cage thresholds. 897 * The size of the cage is determined by the argument preferred_size. 898 * or the actual amount of memory, whichever is smaller. 899 */ 900 static void 901 kcage_init(pgcnt_t preferred_size) 902 { 903 pgcnt_t wanted; 904 pfn_t pfn; 905 page_t *pp; 906 kstat_t *ksp; 907 908 extern struct vnode kvp; 909 extern void page_list_noreloc_startup(page_t *); 910 911 ASSERT(!kcage_on); 912 913 /* increase preferred cage size for lp for kmem */ 914 preferred_size = kcage_recalc_preferred_size(preferred_size); 915 916 /* Debug note: initialize this now so early expansions can stat */ 917 KCAGE_STAT_INIT_SCAN_INDEX; 918 919 /* 920 * Initialize cage thresholds and install kphysm callback. 921 * If we can't arrange to have the thresholds track with 922 * available physical memory, then the cage thresholds may 923 * end up over time at levels that adversly effect system 924 * performance; so, bail out. 925 */ 926 kcage_recalc_thresholds(); 927 if (kphysm_setup_func_register(&kcage_kphysm_vectors, NULL)) { 928 ASSERT(0); /* Catch this in DEBUG kernels. */ 929 return; 930 } 931 932 /* 933 * Limit startup cage size within the range of kcage_minfree 934 * and availrmem, inclusively. 935 */ 936 wanted = MIN(MAX(preferred_size, kcage_minfree), availrmem); 937 938 /* 939 * Construct the cage. PFNs are allocated from the glist. It 940 * is assumed that the list has been properly ordered for the 941 * platform by the platform code. Typically, this is as simple 942 * as calling kcage_range_init(phys_avail, decr), where decr is 943 * 1 if the kernel has been loaded into upper end of physical 944 * memory, or 0 if the kernel has been loaded at the low end. 945 * 946 * Note: it is assumed that we are in the startup flow, so there 947 * is no reason to grab the page lock. 948 */ 949 kcage_freemem = 0; 950 pfn = PFN_INVALID; /* prime for alignment test */ 951 while (wanted != 0) { 952 if ((pfn = kcage_get_pfn(0)) == PFN_INVALID) 953 break; 954 955 if ((pp = page_numtopp_nolock(pfn)) != NULL) { 956 KCAGEPAGETS_INC(); 957 /* 958 * Set the noreloc state on the page. 959 * If the page is free and not already 960 * on the noreloc list then move it. 961 */ 962 if (PP_ISFREE(pp)) { 963 if (PP_ISNORELOC(pp) == 0) 964 page_list_noreloc_startup(pp); 965 } else { 966 ASSERT(pp->p_szc == 0); 967 PP_SETNORELOC(pp); 968 } 969 } 970 PLCNT_XFER_NORELOC(pp); 971 wanted -= 1; 972 } 973 974 /* 975 * Need to go through and find kernel allocated pages 976 * and capture them into the Cage. These will primarily 977 * be pages gotten through boot_alloc(). 978 */ 979 if (kvp.v_pages) { 980 981 pp = kvp.v_pages; 982 do { 983 ASSERT(!PP_ISFREE(pp)); 984 ASSERT(pp->p_szc == 0); 985 PP_SETNORELOC(pp); 986 } while ((pp = pp->p_vpnext) != kvp.v_pages); 987 988 } 989 990 kcage_on = 1; 991 992 /* 993 * CB_CL_CPR_POST_KERNEL is the class that executes from cpr_suspend() 994 * after the cageout thread is blocked, and executes from cpr_resume() 995 * before the cageout thread is restarted. By executing in this class, 996 * we are assured that the kernel cage thread won't miss wakeup calls 997 * and also CPR's larger kmem_alloc requests will not fail after 998 * CPR shuts down the cageout kernel thread. 999 */ 1000 (void) callb_add(kcage_cageout_cpr, NULL, CB_CL_CPR_POST_KERNEL, 1001 "cageout"); 1002 1003 /* 1004 * Coalesce pages to improve large page availability. A better fix 1005 * would to coalesce pages as they are included in the cage 1006 */ 1007 if (SEGKMEM_USE_LARGEPAGES) { 1008 extern void page_freelist_coalesce_all(int mnode); 1009 extern int max_mem_nodes; 1010 int mnode, max_mnodes = max_mem_nodes; 1011 for (mnode = 0; mnode < max_mnodes; mnode++) { 1012 page_freelist_coalesce_all(mnode); 1013 } 1014 } 1015 1016 ksp = kstat_create("kcage", 0, "kcage_page_list", "misc", 1017 KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VAR_SIZE | KSTAT_FLAG_VIRTUAL); 1018 if (ksp != NULL) { 1019 ksp->ks_update = kcage_kstat_update; 1020 ksp->ks_snapshot = kcage_kstat_snapshot; 1021 ksp->ks_lock = &kcage_kstat_lock; /* XXX - not really needed */ 1022 kstat_install(ksp); 1023 } 1024 } 1025 1026 static int 1027 kcage_kstat_update(kstat_t *ksp, int rw) 1028 { 1029 struct kcage_glist *lp; 1030 uint_t count; 1031 1032 if (rw == KSTAT_WRITE) 1033 return (EACCES); 1034 1035 count = 0; 1036 rw_enter(&kcage_range_rwlock, RW_WRITER); 1037 for (lp = kcage_glist; lp != NULL; lp = lp->next) { 1038 if (lp->decr) { 1039 if (lp->curr != lp->lim) { 1040 count++; 1041 } 1042 } else { 1043 if (lp->curr != lp->base) { 1044 count++; 1045 } 1046 } 1047 } 1048 rw_exit(&kcage_range_rwlock); 1049 1050 ksp->ks_ndata = count; 1051 ksp->ks_data_size = count * 2 * sizeof (uint64_t); 1052 1053 return (0); 1054 } 1055 1056 static int 1057 kcage_kstat_snapshot(kstat_t *ksp, void *buf, int rw) 1058 { 1059 struct kcage_glist *lp; 1060 struct memunit { 1061 uint64_t address; 1062 uint64_t size; 1063 } *kspmem; 1064 1065 if (rw == KSTAT_WRITE) 1066 return (EACCES); 1067 1068 ksp->ks_snaptime = gethrtime(); 1069 1070 kspmem = (struct memunit *)buf; 1071 rw_enter(&kcage_range_rwlock, RW_WRITER); 1072 for (lp = kcage_glist; lp != NULL; lp = lp->next, kspmem++) { 1073 if ((caddr_t)kspmem >= (caddr_t)buf + ksp->ks_data_size) 1074 break; 1075 1076 if (lp->decr) { 1077 if (lp->curr != lp->lim) { 1078 kspmem->address = ptob(lp->curr); 1079 kspmem->size = ptob(lp->lim - lp->curr); 1080 } 1081 } else { 1082 if (lp->curr != lp->base) { 1083 kspmem->address = ptob(lp->base); 1084 kspmem->size = ptob(lp->curr - lp->base); 1085 } 1086 } 1087 } 1088 rw_exit(&kcage_range_rwlock); 1089 1090 return (0); 1091 } 1092 1093 void 1094 kcage_recalc_thresholds() 1095 { 1096 static int first = 1; 1097 static pgcnt_t init_lotsfree; 1098 static pgcnt_t init_desfree; 1099 static pgcnt_t init_minfree; 1100 static pgcnt_t init_throttlefree; 1101 static pgcnt_t init_reserve; 1102 1103 /* TODO: any reason to take more care than this with live editing? */ 1104 mutex_enter(&kcage_cageout_mutex); 1105 mutex_enter(&freemem_lock); 1106 1107 if (first) { 1108 first = 0; 1109 init_lotsfree = kcage_lotsfree; 1110 init_desfree = kcage_desfree; 1111 init_minfree = kcage_minfree; 1112 init_throttlefree = kcage_throttlefree; 1113 init_reserve = kcage_reserve; 1114 } else { 1115 kcage_lotsfree = init_lotsfree; 1116 kcage_desfree = init_desfree; 1117 kcage_minfree = init_minfree; 1118 kcage_throttlefree = init_throttlefree; 1119 kcage_reserve = init_reserve; 1120 } 1121 1122 if (kcage_lotsfree == 0) 1123 kcage_lotsfree = MAX(32, total_pages / 256); 1124 1125 if (kcage_minfree == 0) 1126 kcage_minfree = MAX(32, kcage_lotsfree / 2); 1127 1128 if (kcage_desfree == 0) 1129 kcage_desfree = MAX(32, kcage_minfree); 1130 1131 if (kcage_throttlefree == 0) 1132 kcage_throttlefree = MAX(32, kcage_minfree / 2); 1133 1134 if (kcage_reserve == 0) 1135 kcage_reserve = MIN(32, kcage_throttlefree / 2); 1136 1137 mutex_exit(&freemem_lock); 1138 mutex_exit(&kcage_cageout_mutex); 1139 1140 if (kcage_cageout_ready) { 1141 if (kcage_freemem < kcage_desfree) 1142 kcage_cageout_wakeup(); 1143 1144 if (kcage_needfree) { 1145 mutex_enter(&kcage_throttle_mutex); 1146 cv_broadcast(&kcage_throttle_cv); 1147 mutex_exit(&kcage_throttle_mutex); 1148 } 1149 } 1150 } 1151 1152 /* 1153 * Pageout interface: 1154 * kcage_cageout_init() 1155 */ 1156 void 1157 kcage_cageout_init() 1158 { 1159 if (kcage_on) { 1160 1161 (void) thread_create(NULL, 0, kcage_cageout, 1162 NULL, 0, proc_pageout, TS_RUN, maxclsyspri - 1); 1163 } 1164 } 1165 1166 1167 /* 1168 * VM Interfaces: 1169 * kcage_create_throttle() 1170 * kcage_freemem_add() 1171 * kcage_freemem_sub() 1172 */ 1173 1174 /* 1175 * Wakeup cageout thread and throttle waiting for the number of pages 1176 * requested to become available. For non-critical requests, a 1177 * timeout is added, since freemem accounting is separate from cage 1178 * freemem accounting: it's possible for us to get stuck and not make 1179 * forward progress even though there was sufficient freemem before 1180 * arriving here. 1181 */ 1182 int 1183 kcage_create_throttle(pgcnt_t npages, int flags) 1184 { 1185 int niter = 0; 1186 pgcnt_t lastfree; 1187 int enough = kcage_freemem > kcage_throttlefree + npages; 1188 1189 KCAGE_STAT_INCR(kct_calls); /* unprotected incr. */ 1190 1191 kcage_cageout_wakeup(); /* just to be sure */ 1192 KCAGE_STAT_INCR(kct_cagewake); /* unprotected incr. */ 1193 1194 /* 1195 * Obviously, we can't throttle the cageout thread since 1196 * we depend on it. We also can't throttle the panic thread. 1197 */ 1198 if (curthread == kcage_cageout_thread || panicstr) { 1199 KCAGE_STAT_INCR(kct_cageout); /* unprotected incr. */ 1200 return (KCT_CRIT); 1201 } 1202 1203 /* 1204 * Don't throttle threads which are critical for proper 1205 * vm management if we're above kcage_throttlefree or 1206 * if freemem is very low. 1207 */ 1208 if (NOMEMWAIT()) { 1209 if (enough) { 1210 KCAGE_STAT_INCR(kct_exempt); /* unprotected incr. */ 1211 return (KCT_CRIT); 1212 } else if (freemem < minfree) { 1213 KCAGE_STAT_INCR(kct_critical); /* unprotected incr. */ 1214 return (KCT_CRIT); 1215 } 1216 } 1217 1218 /* 1219 * Don't throttle real-time threads if kcage_freemem > kcage_reserve. 1220 */ 1221 if (DISP_PRIO(curthread) > maxclsyspri && 1222 kcage_freemem > kcage_reserve) { 1223 KCAGE_STAT_INCR(kct_exempt); /* unprotected incr. */ 1224 return (KCT_CRIT); 1225 } 1226 1227 /* 1228 * Cause all other threads (which are assumed to not be 1229 * critical to cageout) to wait here until their request 1230 * can be satisfied. Be a little paranoid and wake the 1231 * kernel cage on each loop through this logic. 1232 */ 1233 while (kcage_freemem < kcage_throttlefree + npages) { 1234 ASSERT(kcage_on); 1235 1236 lastfree = kcage_freemem; 1237 1238 if (kcage_cageout_ready) { 1239 mutex_enter(&kcage_throttle_mutex); 1240 1241 kcage_needfree += npages; 1242 KCAGE_STAT_INCR(kct_wait); 1243 1244 kcage_cageout_wakeup(); 1245 KCAGE_STAT_INCR(kct_cagewake); 1246 1247 cv_wait(&kcage_throttle_cv, &kcage_throttle_mutex); 1248 1249 kcage_needfree -= npages; 1250 1251 mutex_exit(&kcage_throttle_mutex); 1252 } else { 1253 /* 1254 * NOTE: atomics are used just in case we enter 1255 * mp operation before the cageout thread is ready. 1256 */ 1257 atomic_add_long(&kcage_needfree, npages); 1258 1259 kcage_cageout_wakeup(); 1260 KCAGE_STAT_INCR(kct_cagewake); /* unprotected incr. */ 1261 1262 atomic_add_long(&kcage_needfree, -npages); 1263 } 1264 1265 if ((flags & PG_WAIT) == 0) { 1266 if (kcage_freemem > lastfree) { 1267 KCAGE_STAT_INCR(kct_progress); 1268 niter = 0; 1269 } else { 1270 KCAGE_STAT_INCR(kct_noprogress); 1271 if (++niter >= kcage_maxwait) { 1272 KCAGE_STAT_INCR(kct_timeout); 1273 return (KCT_FAILURE); 1274 } 1275 } 1276 } 1277 } 1278 return (KCT_NONCRIT); 1279 } 1280 1281 void 1282 kcage_freemem_add(pgcnt_t npages) 1283 { 1284 extern void wakeup_pcgs(void); 1285 1286 atomic_add_long(&kcage_freemem, npages); 1287 1288 wakeup_pcgs(); /* wakeup threads in pcgs() */ 1289 1290 if (kcage_needfree != 0 && 1291 kcage_freemem >= (kcage_throttlefree + kcage_needfree)) { 1292 1293 mutex_enter(&kcage_throttle_mutex); 1294 cv_broadcast(&kcage_throttle_cv); 1295 KCAGE_STAT_INCR(kfa_trottlewake); 1296 mutex_exit(&kcage_throttle_mutex); 1297 } 1298 } 1299 1300 void 1301 kcage_freemem_sub(pgcnt_t npages) 1302 { 1303 atomic_add_long(&kcage_freemem, -npages); 1304 1305 if (kcage_freemem < kcage_desfree) { 1306 kcage_cageout_wakeup(); 1307 KCAGE_STAT_INCR(kfs_cagewake); /* unprotected incr. */ 1308 } 1309 } 1310 1311 /* 1312 * return 0 on failure and 1 on success. 1313 */ 1314 static int 1315 kcage_setnoreloc_pages(page_t *rootpp, se_t se) 1316 { 1317 pgcnt_t npgs, i; 1318 page_t *pp; 1319 pfn_t rootpfn = page_pptonum(rootpp); 1320 uint_t szc; 1321 1322 ASSERT(!PP_ISFREE(rootpp)); 1323 ASSERT(PAGE_LOCKED_SE(rootpp, se)); 1324 if (!group_page_trylock(rootpp, se)) { 1325 return (0); 1326 } 1327 szc = rootpp->p_szc; 1328 if (szc == 0) { 1329 /* 1330 * The szc of a locked page can only change for pages that are 1331 * non-swapfs (i.e. anonymous memory) file system pages. 1332 */ 1333 ASSERT(rootpp->p_vnode != NULL && 1334 !PP_ISKAS(rootpp) && 1335 !IS_SWAPFSVP(rootpp->p_vnode)); 1336 PP_SETNORELOC(rootpp); 1337 return (1); 1338 } 1339 npgs = page_get_pagecnt(szc); 1340 ASSERT(IS_P2ALIGNED(rootpfn, npgs)); 1341 pp = rootpp; 1342 for (i = 0; i < npgs; i++, pp++) { 1343 ASSERT(PAGE_LOCKED_SE(pp, se)); 1344 ASSERT(!PP_ISFREE(pp)); 1345 ASSERT(pp->p_szc == szc); 1346 PP_SETNORELOC(pp); 1347 } 1348 group_page_unlock(rootpp); 1349 return (1); 1350 } 1351 1352 /* 1353 * Attempt to convert page to a caged page (set the P_NORELOC flag). 1354 * If successful and pages is free, move page to the tail of whichever 1355 * list it is on. 1356 * Returns: 1357 * EBUSY page already locked, assimilated but not free. 1358 * ENOMEM page assimilated, but memory too low to relocate. Page not free. 1359 * EAGAIN page not assimilated. Page not free. 1360 * ERANGE page assimilated. Page not root. 1361 * 0 page assimilated. Page free. 1362 * *nfreedp number of pages freed. 1363 * NOTE: With error codes ENOMEM, EBUSY, and 0 (zero), there is no way 1364 * to distinguish between a page that was already a NORELOC page from 1365 * those newly converted to NORELOC pages by this invocation of 1366 * kcage_assimilate_page. 1367 */ 1368 static int 1369 kcage_assimilate_page(page_t *pp, pgcnt_t *nfreedp) 1370 { 1371 if (page_trylock(pp, SE_EXCL)) { 1372 if (PP_ISNORELOC(pp)) { 1373 check_free_and_return: 1374 if (PP_ISFREE(pp)) { 1375 page_unlock(pp); 1376 *nfreedp = 0; 1377 return (0); 1378 } else { 1379 page_unlock(pp); 1380 return (EBUSY); 1381 } 1382 /*NOTREACHED*/ 1383 } 1384 } else { 1385 if (page_trylock(pp, SE_SHARED)) { 1386 if (PP_ISNORELOC(pp)) 1387 goto check_free_and_return; 1388 } else 1389 return (EAGAIN); 1390 1391 if (!PP_ISFREE(pp)) { 1392 page_unlock(pp); 1393 return (EAGAIN); 1394 } 1395 1396 /* 1397 * Need to upgrade the lock on it and set the NORELOC 1398 * bit. If it is free then remove it from the free 1399 * list so that the platform free list code can keep 1400 * NORELOC pages where they should be. 1401 */ 1402 /* 1403 * Before doing anything, get the exclusive lock. 1404 * This may fail (eg ISM pages are left shared locked). 1405 * If the page is free this will leave a hole in the 1406 * cage. There is no solution yet to this. 1407 */ 1408 if (!page_tryupgrade(pp)) { 1409 page_unlock(pp); 1410 return (EAGAIN); 1411 } 1412 } 1413 1414 ASSERT(PAGE_EXCL(pp)); 1415 1416 if (PP_ISFREE(pp)) { 1417 int which = PP_ISAGED(pp) ? PG_FREE_LIST : PG_CACHE_LIST; 1418 1419 page_list_sub(pp, which); 1420 ASSERT(pp->p_szc == 0); 1421 PP_SETNORELOC(pp); 1422 PLCNT_XFER_NORELOC(pp); 1423 page_list_add(pp, which | PG_LIST_TAIL); 1424 1425 page_unlock(pp); 1426 *nfreedp = 1; 1427 return (0); 1428 } else { 1429 if (pp->p_szc != 0) { 1430 if (!kcage_setnoreloc_pages(pp, SE_EXCL)) { 1431 page_unlock(pp); 1432 return (EAGAIN); 1433 } 1434 ASSERT(PP_ISNORELOC(pp)); 1435 } else { 1436 PP_SETNORELOC(pp); 1437 } 1438 PLCNT_XFER_NORELOC(pp); 1439 return (kcage_invalidate_page(pp, nfreedp)); 1440 } 1441 /*NOTREACHED*/ 1442 } 1443 1444 static int 1445 kcage_expand() 1446 { 1447 int did_something = 0; 1448 1449 spgcnt_t wanted; 1450 pfn_t pfn; 1451 page_t *pp; 1452 /* TODO: we don't really need n any more? */ 1453 pgcnt_t n; 1454 pgcnt_t nf, nfreed; 1455 1456 /* 1457 * Expand the cage if available cage memory is really low. Calculate 1458 * the amount required to return kcage_freemem to the level of 1459 * kcage_lotsfree, or to satisfy throttled requests, whichever is 1460 * more. It is rare for their sum to create an artificial threshold 1461 * above kcage_lotsfree, but it is possible. 1462 * 1463 * Exit early if expansion amount is equal to or less than zero. 1464 * (<0 is possible if kcage_freemem rises suddenly.) 1465 * 1466 * Exit early when the global page pool (apparently) does not 1467 * have enough free pages to page_relocate() even a single page. 1468 */ 1469 wanted = MAX(kcage_lotsfree, kcage_throttlefree + kcage_needfree) 1470 - kcage_freemem; 1471 if (wanted <= 0) 1472 return (0); 1473 else if (freemem < pageout_reserve + 1) { 1474 KCAGE_STAT_INCR(ke_lowfreemem); 1475 return (0); 1476 } 1477 1478 KCAGE_STAT_INCR(ke_calls); 1479 KCAGE_STAT_SET_SCAN(ke_wanted, (uint_t)wanted); 1480 1481 /* 1482 * Assimilate more pages from the global page pool into the cage. 1483 */ 1484 n = 0; /* number of pages PP_SETNORELOC'd */ 1485 nf = 0; /* number of those actually free */ 1486 while (kcage_on && nf < wanted) { 1487 pfn = kcage_get_pfn(1); 1488 if (pfn == PFN_INVALID) { /* eek! no where to grow */ 1489 KCAGE_STAT_INCR(ke_nopfn); 1490 goto terminate; 1491 } 1492 1493 KCAGE_STAT_INCR_SCAN(ke_examined); 1494 1495 if ((pp = page_numtopp_nolock(pfn)) == NULL) { 1496 KCAGE_STAT_INCR(ke_nopaget); 1497 continue; 1498 } 1499 KCAGEPAGETS_INC(); 1500 /* 1501 * Sanity check. Skip this pfn if it is 1502 * being deleted. 1503 */ 1504 if (pfn_is_being_deleted(pfn)) { 1505 KCAGE_STAT_INCR(ke_deleting); 1506 continue; 1507 } 1508 1509 if (PP_ISNORELOC(pp)) { 1510 KCAGE_STAT_INCR(ke_isnoreloc); 1511 continue; 1512 } 1513 1514 switch (kcage_assimilate_page(pp, &nfreed)) { 1515 case 0: /* assimilated, page is free */ 1516 KCAGE_STAT_NINCR_SCAN(ke_gotonefree, nfreed); 1517 did_something = 1; 1518 nf += nfreed; 1519 n++; 1520 break; 1521 1522 case EBUSY: /* assimilated, page not free */ 1523 case ERANGE: /* assimilated, page not root */ 1524 KCAGE_STAT_INCR_SCAN(ke_gotone); 1525 did_something = 1; 1526 n++; 1527 break; 1528 1529 case ENOMEM: /* assimilated, but no mem */ 1530 KCAGE_STAT_INCR(ke_terminate); 1531 did_something = 1; 1532 n++; 1533 goto terminate; 1534 1535 case EAGAIN: /* can't assimilate */ 1536 KCAGE_STAT_INCR_SCAN(ke_lefthole); 1537 break; 1538 1539 default: /* catch this with debug kernels */ 1540 ASSERT(0); 1541 break; 1542 } 1543 } 1544 1545 /* 1546 * Realign cage edge with the nearest physical address 1547 * boundry for big pages. This is done to give us a 1548 * better chance of actually getting usable big pages 1549 * in the cage. 1550 */ 1551 1552 terminate: 1553 1554 return (did_something); 1555 } 1556 1557 /* 1558 * Relocate page opp (Original Page Pointer) from cage pool to page rpp 1559 * (Replacement Page Pointer) in the global pool. Page opp will be freed 1560 * if relocation is successful, otherwise it is only unlocked. 1561 * On entry, page opp must be exclusively locked and not free. 1562 * *nfreedp: number of pages freed. 1563 */ 1564 static int 1565 kcage_relocate_page(page_t *pp, pgcnt_t *nfreedp) 1566 { 1567 page_t *opp = pp; 1568 page_t *rpp = NULL; 1569 spgcnt_t npgs; 1570 int result; 1571 1572 ASSERT(!PP_ISFREE(opp)); 1573 ASSERT(PAGE_EXCL(opp)); 1574 1575 result = page_relocate(&opp, &rpp, 1, 1, &npgs, NULL); 1576 *nfreedp = npgs; 1577 if (result == 0) { 1578 while (npgs-- > 0) { 1579 page_t *tpp; 1580 1581 ASSERT(rpp != NULL); 1582 tpp = rpp; 1583 page_sub(&rpp, tpp); 1584 page_unlock(tpp); 1585 } 1586 1587 ASSERT(rpp == NULL); 1588 1589 return (0); /* success */ 1590 } 1591 1592 page_unlock(opp); 1593 return (result); 1594 } 1595 1596 /* 1597 * Based on page_invalidate_pages() 1598 * 1599 * Kcage_invalidate_page() uses page_relocate() twice. Both instances 1600 * of use must be updated to match the new page_relocate() when it 1601 * becomes available. 1602 * 1603 * Return result of kcage_relocate_page or zero if page was directly freed. 1604 * *nfreedp: number of pages freed. 1605 */ 1606 static int 1607 kcage_invalidate_page(page_t *pp, pgcnt_t *nfreedp) 1608 { 1609 int result; 1610 1611 #if defined(__sparc) 1612 extern struct vnode prom_ppages; 1613 ASSERT(pp->p_vnode != &prom_ppages); 1614 #endif /* __sparc */ 1615 1616 ASSERT(!PP_ISFREE(pp)); 1617 ASSERT(PAGE_EXCL(pp)); 1618 1619 /* 1620 * Is this page involved in some I/O? shared? 1621 * The page_struct_lock need not be acquired to 1622 * examine these fields since the page has an 1623 * "exclusive" lock. 1624 */ 1625 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { 1626 result = kcage_relocate_page(pp, nfreedp); 1627 #ifdef KCAGE_STATS 1628 if (result == 0) 1629 KCAGE_STAT_INCR_SCAN(kip_reloclocked); 1630 else if (result == ENOMEM) 1631 KCAGE_STAT_INCR_SCAN(kip_nomem); 1632 #endif 1633 return (result); 1634 } 1635 1636 ASSERT(pp->p_vnode->v_type != VCHR); 1637 1638 /* 1639 * Unload the mappings and check if mod bit is set. 1640 */ 1641 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 1642 1643 if (hat_ismod(pp)) { 1644 result = kcage_relocate_page(pp, nfreedp); 1645 #ifdef KCAGE_STATS 1646 if (result == 0) 1647 KCAGE_STAT_INCR_SCAN(kip_relocmod); 1648 else if (result == ENOMEM) 1649 KCAGE_STAT_INCR_SCAN(kip_nomem); 1650 #endif 1651 return (result); 1652 } 1653 1654 if (!page_try_demote_pages(pp)) { 1655 KCAGE_STAT_INCR_SCAN(kip_demotefailed); 1656 page_unlock(pp); 1657 return (EAGAIN); 1658 } 1659 1660 page_destroy(pp, 0); 1661 KCAGE_STAT_INCR_SCAN(kip_destroy); 1662 *nfreedp = 1; 1663 return (0); 1664 } 1665 1666 static void 1667 kcage_cageout() 1668 { 1669 pfn_t pfn; 1670 page_t *pp; 1671 callb_cpr_t cprinfo; 1672 int did_something; 1673 int scan_again; 1674 pfn_t start_pfn; 1675 int pass; 1676 int last_pass; 1677 int pages_skipped; 1678 int shared_skipped; 1679 uint_t shared_level = 8; 1680 pgcnt_t nfreed; 1681 #ifdef KCAGE_STATS 1682 clock_t scan_start; 1683 #endif 1684 1685 CALLB_CPR_INIT(&cprinfo, &kcage_cageout_mutex, 1686 callb_generic_cpr, "cageout"); 1687 1688 mutex_enter(&kcage_cageout_mutex); 1689 kcage_cageout_thread = curthread; 1690 1691 pfn = PFN_INVALID; /* force scan reset */ 1692 start_pfn = PFN_INVALID; /* force init with 1st cage pfn */ 1693 kcage_cageout_ready = 1; /* switch kcage_cageout_wakeup mode */ 1694 1695 loop: 1696 /* 1697 * Wait here. Sooner or later, kcage_freemem_sub() will notice 1698 * that kcage_freemem is less than kcage_desfree. When it does 1699 * notice, kcage_freemem_sub() will wake us up via call to 1700 * kcage_cageout_wakeup(). 1701 */ 1702 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1703 cv_wait(&kcage_cageout_cv, &kcage_cageout_mutex); 1704 CALLB_CPR_SAFE_END(&cprinfo, &kcage_cageout_mutex); 1705 1706 KCAGE_STAT_INCR(kt_wakeups); 1707 KCAGE_STAT_SET_SCAN(kt_freemem_start, freemem); 1708 KCAGE_STAT_SET_SCAN(kt_kcage_freemem_start, kcage_freemem); 1709 pass = 0; 1710 last_pass = 0; 1711 1712 #ifdef KCAGE_STATS 1713 scan_start = lbolt; 1714 #endif 1715 1716 again: 1717 if (!kcage_on) 1718 goto loop; 1719 1720 KCAGE_STAT_INCR(kt_scans); 1721 KCAGE_STAT_INCR_SCAN(kt_passes); 1722 1723 did_something = 0; 1724 pages_skipped = 0; 1725 shared_skipped = 0; 1726 while ((kcage_freemem < kcage_lotsfree || kcage_needfree) && 1727 (pfn = kcage_walk_cage(pfn == PFN_INVALID)) != PFN_INVALID) { 1728 1729 if (start_pfn == PFN_INVALID) 1730 start_pfn = pfn; 1731 else if (start_pfn == pfn) { 1732 last_pass = pass; 1733 pass += 1; 1734 /* 1735 * Did a complete walk of kernel cage, but didn't free 1736 * any pages. If only one cpu is online then 1737 * stop kernel cage walk and try expanding. 1738 */ 1739 if (ncpus_online == 1 && did_something == 0) { 1740 KCAGE_STAT_INCR(kt_cageout_break); 1741 break; 1742 } 1743 } 1744 1745 pp = page_numtopp_nolock(pfn); 1746 if (pp == NULL) { 1747 continue; 1748 } 1749 1750 KCAGE_STAT_INCR_SCAN(kt_examined); 1751 1752 /* 1753 * Do a quick PP_ISNORELOC() and PP_ISFREE test outside 1754 * of the lock. If one is missed it will be seen next 1755 * time through. 1756 * 1757 * Skip non-caged-pages. These pages can exist in the cage 1758 * because, if during cage expansion, a page is 1759 * encountered that is long-term locked the lock prevents the 1760 * expansion logic from setting the P_NORELOC flag. Hence, 1761 * non-caged-pages surrounded by caged-pages. 1762 */ 1763 if (!PP_ISNORELOC(pp)) { 1764 switch (kcage_assimilate_page(pp, &nfreed)) { 1765 case 0: 1766 did_something = 1; 1767 KCAGE_STAT_NINCR_SCAN(kt_gotonefree, 1768 nfreed); 1769 break; 1770 1771 case EBUSY: 1772 case ERANGE: 1773 did_something = 1; 1774 KCAGE_STAT_INCR_SCAN(kt_gotone); 1775 break; 1776 1777 case EAGAIN: 1778 case ENOMEM: 1779 break; 1780 1781 default: 1782 /* catch this with debug kernels */ 1783 ASSERT(0); 1784 break; 1785 } 1786 1787 continue; 1788 } else { 1789 int prm; 1790 1791 if (PP_ISFREE(pp)) { 1792 continue; 1793 } 1794 1795 if ((PP_ISKAS(pp) && pp->p_lckcnt > 0) || 1796 !page_trylock(pp, SE_EXCL)) { 1797 KCAGE_STAT_INCR_SCAN(kt_cantlock); 1798 continue; 1799 } 1800 1801 /* P_NORELOC bit should not have gone away. */ 1802 ASSERT(PP_ISNORELOC(pp)); 1803 if (PP_ISFREE(pp) || (PP_ISKAS(pp) && 1804 pp->p_lckcnt > 0)) { 1805 page_unlock(pp); 1806 continue; 1807 } 1808 1809 KCAGE_STAT_SET_SCAN(kt_skiplevel, shared_level); 1810 if (hat_page_getshare(pp) > shared_level) { 1811 page_unlock(pp); 1812 pages_skipped = 1; 1813 shared_skipped = 1; 1814 KCAGE_STAT_INCR_SCAN(kt_skipshared); 1815 continue; 1816 } 1817 1818 /* 1819 * In pass {0, 1}, skip page if ref bit is set. 1820 * In pass {0, 1, 2}, skip page if mod bit is set. 1821 */ 1822 prm = hat_pagesync(pp, 1823 HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD); 1824 1825 /* On first pass ignore ref'd pages */ 1826 if (pass <= 1 && (prm & P_REF)) { 1827 KCAGE_STAT_INCR_SCAN(kt_skiprefd); 1828 pages_skipped = 1; 1829 page_unlock(pp); 1830 continue; 1831 } 1832 1833 /* On pass 2, page_destroy if mod bit is not set */ 1834 if (pass <= 2) { 1835 if (pp->p_szc != 0 || (prm & P_MOD) || 1836 pp->p_lckcnt || pp->p_cowcnt) { 1837 pages_skipped = 1; 1838 page_unlock(pp); 1839 } else { 1840 1841 /* 1842 * unload the mappings before 1843 * checking if mod bit is set 1844 */ 1845 (void) hat_pageunload(pp, 1846 HAT_FORCE_PGUNLOAD); 1847 1848 /* 1849 * skip this page if modified 1850 */ 1851 if (hat_ismod(pp)) { 1852 pages_skipped = 1; 1853 page_unlock(pp); 1854 continue; 1855 } 1856 1857 KCAGE_STAT_INCR_SCAN(kt_destroy); 1858 page_destroy(pp, 0); 1859 did_something = 1; 1860 } 1861 continue; 1862 } 1863 1864 if (kcage_invalidate_page(pp, &nfreed) == 0) { 1865 did_something = 1; 1866 KCAGE_STAT_NINCR_SCAN(kt_gotonefree, nfreed); 1867 } 1868 1869 /* 1870 * No need to drop the page lock here. 1871 * Kcage_invalidate_page has done that for us 1872 * either explicitly or through a page_free. 1873 */ 1874 } 1875 } 1876 1877 /* 1878 * Expand the cage only if available cage memory is really low. 1879 * This test is done only after a complete scan of the cage. 1880 * The reason for not checking and expanding more often is to 1881 * avoid rapid expansion of the cage. Naturally, scanning the 1882 * cage takes time. So by scanning first, we use that work as a 1883 * delay loop in between expand decisions. 1884 */ 1885 1886 scan_again = 0; 1887 if (kcage_freemem < kcage_minfree || kcage_needfree) { 1888 /* 1889 * Kcage_expand() will return a non-zero value if it was 1890 * able to expand the cage -- whether or not the new 1891 * pages are free and immediately usable. If non-zero, 1892 * we do another scan of the cage. The pages might be 1893 * freed during that scan or by time we get back here. 1894 * If not, we will attempt another expansion. 1895 * However, if kcage_expand() returns zero, then it was 1896 * unable to expand the cage. This is the case when the 1897 * the growth list is exausted, therefore no work was done 1898 * and there is no reason to scan the cage again. 1899 * Note: Kernel cage scan is not repeated on single-cpu 1900 * system to avoid kernel cage thread hogging cpu. 1901 */ 1902 if (pass <= 3 && pages_skipped && ncpus_online > 1) 1903 scan_again = 1; 1904 else 1905 (void) kcage_expand(); /* don't scan again */ 1906 } else if (kcage_freemem < kcage_lotsfree) { 1907 /* 1908 * If available cage memory is less than abundant 1909 * and a full scan of the cage has not yet been completed, 1910 * or a scan has completed and some work was performed, 1911 * or pages were skipped because of sharing, 1912 * or we simply have not yet completed two passes, 1913 * then do another scan. 1914 */ 1915 if (pass <= 2 && pages_skipped) 1916 scan_again = 1; 1917 if (pass == last_pass || did_something) 1918 scan_again = 1; 1919 else if (shared_skipped && shared_level < (8<<24)) { 1920 shared_level <<= 1; 1921 scan_again = 1; 1922 } 1923 } 1924 1925 if (scan_again && ncpus_online > 1) 1926 goto again; 1927 else { 1928 if (shared_level > 8) 1929 shared_level >>= 1; 1930 1931 KCAGE_STAT_SET_SCAN(kt_freemem_end, freemem); 1932 KCAGE_STAT_SET_SCAN(kt_kcage_freemem_end, kcage_freemem); 1933 KCAGE_STAT_SET_SCAN(kt_ticks, lbolt - scan_start); 1934 KCAGE_STAT_INC_SCAN_INDEX; 1935 goto loop; 1936 } 1937 1938 /*NOTREACHED*/ 1939 } 1940 1941 void 1942 kcage_cageout_wakeup() 1943 { 1944 if (mutex_tryenter(&kcage_cageout_mutex)) { 1945 if (kcage_cageout_ready) { 1946 cv_signal(&kcage_cageout_cv); 1947 } else if (kcage_freemem < kcage_minfree || kcage_needfree) { 1948 /* 1949 * Available cage memory is really low. Time to 1950 * start expanding the cage. However, the 1951 * kernel cage thread is not yet ready to 1952 * do the work. Use *this* thread, which is 1953 * most likely to be t0, to do the work. 1954 */ 1955 KCAGE_STAT_INCR(kcw_expandearly); 1956 (void) kcage_expand(); 1957 KCAGE_STAT_INC_SCAN_INDEX; 1958 } 1959 1960 mutex_exit(&kcage_cageout_mutex); 1961 } 1962 /* else, kernel cage thread is already running */ 1963 } 1964 1965 void 1966 kcage_tick() 1967 { 1968 /* 1969 * Once per second we wake up all the threads throttled 1970 * waiting for cage memory, in case we've become stuck 1971 * and haven't made forward progress expanding the cage. 1972 */ 1973 if (kcage_on && kcage_cageout_ready) 1974 cv_broadcast(&kcage_throttle_cv); 1975 } 1976