1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 #include <sys/types.h> 29 #include <sys/sysmacros.h> 30 #include <sys/kmem.h> 31 #include <sys/atomic.h> 32 #include <sys/bitmap.h> 33 #include <sys/systm.h> 34 #include <vm/seg_kmem.h> 35 #include <vm/hat.h> 36 #include <vm/vm_dep.h> 37 #include <vm/hat_i86.h> 38 #include <sys/cmn_err.h> 39 40 41 /* 42 * When pages are shared by more than one mapping, a list of these 43 * structs hangs off of the page_t connected by the hm_next and hm_prev 44 * fields. Every hment is also indexed by a system-wide hash table, using 45 * hm_hashnext to connect it to the chain of hments in a single hash 46 * bucket. 47 */ 48 struct hment { 49 struct hment *hm_hashnext; /* next mapping on hash chain */ 50 struct hment *hm_next; /* next mapping of same page */ 51 struct hment *hm_prev; /* previous mapping of same page */ 52 htable_t *hm_htable; /* corresponding htable_t */ 53 pfn_t hm_pfn; /* mapping page frame number */ 54 uint16_t hm_entry; /* index of pte in htable */ 55 uint16_t hm_pad; /* explicitly expose compiler padding */ 56 #ifdef __amd64 57 uint32_t hm_pad2; /* explicitly expose compiler padding */ 58 #endif 59 }; 60 61 /* 62 * Value returned by hment_walk() when dealing with a single mapping 63 * embedded in the page_t. 64 */ 65 #define HMENT_EMBEDDED ((hment_t *)(uintptr_t)1) 66 67 kmem_cache_t *hment_cache; 68 69 /* 70 * The hment reserve is similar to the htable reserve, with the following 71 * exception. Hment's are never needed for HAT kmem allocs. 72 * 73 * The hment_reserve_amount variable is used, so that you can change it's 74 * value to zero via a kernel debugger to force stealing to get tested. 75 */ 76 #define HMENT_RESERVE_AMOUNT (200) /* currently a guess at right value. */ 77 uint_t hment_reserve_amount = HMENT_RESERVE_AMOUNT; 78 kmutex_t hment_reserve_mutex; 79 uint_t hment_reserve_count; 80 hment_t *hment_reserve_pool; 81 extern kthread_t *hat_reserves_thread; 82 83 /* 84 * Possible performance RFE: we might need to make this dynamic, perhaps 85 * based on the number of pages in the system. 86 */ 87 #define HMENT_HASH_SIZE (64 * 1024) 88 static uint_t hment_hash_entries = HMENT_HASH_SIZE; 89 static hment_t **hment_hash; 90 91 /* 92 * Lots of highly shared pages will have the same value for "entry" (consider 93 * the starting address of "xterm" or "sh"). So we'll distinguish them by 94 * adding the pfn of the page table into both the high bits. 95 * The shift by 9 corresponds to the range of values for entry (0..511). 96 */ 97 #define HMENT_HASH(pfn, entry) (uint32_t) \ 98 ((((pfn) << 9) + entry + pfn) & (hment_hash_entries - 1)) 99 100 /* 101 * "mlist_lock" is a hashed mutex lock for protecting per-page mapping 102 * lists and "hash_lock" is a similar lock protecting the hment hash 103 * table. The hashed approach is taken to avoid the spatial overhead of 104 * maintaining a separate lock for each page, while still achieving better 105 * scalability than a single lock would allow. 106 */ 107 #define MLIST_NUM_LOCK 256 /* must be power of two */ 108 static kmutex_t mlist_lock[MLIST_NUM_LOCK]; 109 110 /* 111 * the shift by 9 is so that all large pages don't use the same hash bucket 112 */ 113 #define MLIST_MUTEX(pp) \ 114 &mlist_lock[((pp)->p_pagenum + ((pp)->p_pagenum >> 9)) & \ 115 (MLIST_NUM_LOCK - 1)] 116 117 #define HASH_NUM_LOCK 256 /* must be power of two */ 118 static kmutex_t hash_lock[HASH_NUM_LOCK]; 119 120 #define HASH_MUTEX(idx) &hash_lock[(idx) & (HASH_NUM_LOCK-1)] 121 122 static hment_t *hment_steal(void); 123 124 /* 125 * put one hment onto the reserves list 126 */ 127 static void 128 hment_put_reserve(hment_t *hm) 129 { 130 HATSTAT_INC(hs_hm_put_reserve); 131 mutex_enter(&hment_reserve_mutex); 132 hm->hm_next = hment_reserve_pool; 133 hment_reserve_pool = hm; 134 ++hment_reserve_count; 135 mutex_exit(&hment_reserve_mutex); 136 } 137 138 /* 139 * Take one hment from the reserve. 140 */ 141 static hment_t * 142 hment_get_reserve(void) 143 { 144 hment_t *hm = NULL; 145 146 /* 147 * We rely on a "donation system" to refill the hment reserve 148 * list, which only takes place when we are allocating hments for 149 * user mappings. It is theoretically possible that an incredibly 150 * long string of kernel hment_alloc()s with no intervening user 151 * hment_alloc()s could exhaust that pool. 152 */ 153 HATSTAT_INC(hs_hm_get_reserve); 154 mutex_enter(&hment_reserve_mutex); 155 if (hment_reserve_count != 0) { 156 hm = hment_reserve_pool; 157 hment_reserve_pool = hm->hm_next; 158 --hment_reserve_count; 159 } 160 mutex_exit(&hment_reserve_mutex); 161 return (hm); 162 } 163 164 /* 165 * Allocate an hment 166 */ 167 static hment_t * 168 hment_alloc() 169 { 170 int km_flag = can_steal_post_boot ? KM_NOSLEEP : KM_SLEEP; 171 hment_t *hm = NULL; 172 int use_reserves = (use_boot_reserve || 173 curthread == hat_reserves_thread || panicstr != NULL); 174 175 /* 176 * If we aren't using the reserves, try using kmem to get an hment. 177 * Donate any successful allocations to reserves if low. 178 * 179 * If we're in panic, resort to using the reserves. 180 */ 181 HATSTAT_INC(hs_hm_alloc); 182 if (!use_reserves) { 183 for (;;) { 184 hm = kmem_cache_alloc(hment_cache, km_flag); 185 if (hment_reserve_count >= hment_reserve_amount || 186 hm == NULL || panicstr != NULL || 187 curthread == hat_reserves_thread) 188 break; 189 hment_put_reserve(hm); 190 } 191 } 192 193 /* 194 * If allocation failed, we need to tap the reserves or steal 195 */ 196 if (hm == NULL) { 197 if (use_reserves) 198 hm = hment_get_reserve(); 199 200 /* 201 * If we still haven't gotten an hment, attempt to steal one by 202 * victimizing a mapping in a user htable. 203 */ 204 if (hm == NULL && can_steal_post_boot) 205 hm = hment_steal(); 206 207 /* 208 * we're in dire straights, try the reserve 209 */ 210 if (hm == NULL) 211 hm = hment_get_reserve(); 212 213 /* 214 * still no hment is a serious problem. 215 */ 216 if (hm == NULL) 217 panic("hment_alloc(): no reserve, couldn't steal"); 218 } 219 220 221 hm->hm_entry = 0; 222 hm->hm_htable = NULL; 223 hm->hm_hashnext = NULL; 224 hm->hm_next = NULL; 225 hm->hm_prev = NULL; 226 hm->hm_pfn = PFN_INVALID; 227 return (hm); 228 } 229 230 /* 231 * Free an hment, possibly to the reserves list when called from the 232 * thread using the reserves. For example, when freeing an hment during an 233 * htable_steal(), we can't recurse into the kmem allocator, so we just 234 * push the hment onto the reserve list. 235 */ 236 void 237 hment_free(hment_t *hm) 238 { 239 #ifdef DEBUG 240 /* 241 * zero out all fields to try and force any race conditions to segfault 242 */ 243 bzero(hm, sizeof (*hm)); 244 #endif 245 HATSTAT_INC(hs_hm_free); 246 if (curthread == hat_reserves_thread || 247 hment_reserve_count < hment_reserve_amount) 248 hment_put_reserve(hm); 249 else 250 kmem_cache_free(hment_cache, hm); 251 } 252 253 int 254 x86_hm_held(page_t *pp) 255 { 256 ASSERT(pp != NULL); 257 return (MUTEX_HELD(MLIST_MUTEX(pp))); 258 } 259 260 void 261 x86_hm_enter(page_t *pp) 262 { 263 ASSERT(pp != NULL); 264 mutex_enter(MLIST_MUTEX(pp)); 265 } 266 267 void 268 x86_hm_exit(page_t *pp) 269 { 270 ASSERT(pp != NULL); 271 mutex_exit(MLIST_MUTEX(pp)); 272 } 273 274 /* 275 * Internal routine to add a full hment to a page_t mapping list 276 */ 277 static void 278 hment_insert(hment_t *hm, page_t *pp) 279 { 280 uint_t idx; 281 282 ASSERT(x86_hm_held(pp)); 283 ASSERT(!pp->p_embed); 284 285 /* 286 * Add the hment to the page's mapping list. 287 */ 288 ++pp->p_share; 289 hm->hm_next = pp->p_mapping; 290 if (pp->p_mapping != NULL) 291 ((hment_t *)pp->p_mapping)->hm_prev = hm; 292 pp->p_mapping = hm; 293 294 /* 295 * Add the hment to the system-wide hash table. 296 */ 297 idx = HMENT_HASH(hm->hm_htable->ht_pfn, hm->hm_entry); 298 299 mutex_enter(HASH_MUTEX(idx)); 300 hm->hm_hashnext = hment_hash[idx]; 301 hment_hash[idx] = hm; 302 mutex_exit(HASH_MUTEX(idx)); 303 } 304 305 /* 306 * Prepare a mapping list entry to the given page. 307 * 308 * There are 4 different situations to deal with: 309 * 310 * - Adding the first mapping to a page_t as an embedded hment 311 * - Refaulting on an existing embedded mapping 312 * - Upgrading an embedded mapping when adding a 2nd mapping 313 * - Adding another mapping to a page_t that already has multiple mappings 314 * note we don't optimized for the refaulting case here. 315 * 316 * Due to competition with other threads that may be mapping/unmapping the 317 * same page and the need to drop all locks while allocating hments, any or 318 * all of the 3 situations can occur (and in almost any order) in any given 319 * call. Isn't this fun! 320 */ 321 hment_t * 322 hment_prepare(htable_t *htable, uint_t entry, page_t *pp) 323 { 324 hment_t *hm = NULL; 325 326 ASSERT(x86_hm_held(pp)); 327 328 for (;;) { 329 330 /* 331 * The most common case is establishing the first mapping to a 332 * page, so check that first. This doesn't need any allocated 333 * hment. 334 */ 335 if (pp->p_mapping == NULL) { 336 ASSERT(!pp->p_embed); 337 ASSERT(pp->p_share == 0); 338 if (hm == NULL) 339 break; 340 341 /* 342 * we had an hment already, so free it and retry 343 */ 344 goto free_and_continue; 345 } 346 347 /* 348 * If there is an embedded mapping, we may need to 349 * convert it to an hment. 350 */ 351 if (pp->p_embed) { 352 353 /* should point to htable */ 354 ASSERT(pp->p_mapping != NULL); 355 356 /* 357 * If we are faulting on a pre-existing mapping 358 * there is no need to promote/allocate a new hment. 359 * This happens a lot due to segmap. 360 */ 361 if (pp->p_mapping == htable && pp->p_mlentry == entry) { 362 if (hm == NULL) 363 break; 364 goto free_and_continue; 365 } 366 367 /* 368 * If we have an hment allocated, use it to promote the 369 * existing embedded mapping. 370 */ 371 if (hm != NULL) { 372 hm->hm_htable = pp->p_mapping; 373 hm->hm_entry = pp->p_mlentry; 374 hm->hm_pfn = pp->p_pagenum; 375 pp->p_mapping = NULL; 376 pp->p_share = 0; 377 pp->p_embed = 0; 378 hment_insert(hm, pp); 379 } 380 381 /* 382 * We either didn't have an hment allocated or we just 383 * used it for the embedded mapping. In either case, 384 * allocate another hment and restart. 385 */ 386 goto allocate_and_continue; 387 } 388 389 /* 390 * Last possibility is that we're adding an hment to a list 391 * of hments. 392 */ 393 if (hm != NULL) 394 break; 395 allocate_and_continue: 396 x86_hm_exit(pp); 397 hm = hment_alloc(); 398 x86_hm_enter(pp); 399 continue; 400 401 free_and_continue: 402 /* 403 * we allocated an hment already, free it and retry 404 */ 405 x86_hm_exit(pp); 406 hment_free(hm); 407 hm = NULL; 408 x86_hm_enter(pp); 409 } 410 ASSERT(x86_hm_held(pp)); 411 return (hm); 412 } 413 414 /* 415 * Record a mapping list entry for the htable/entry to the given page. 416 * 417 * hment_prepare() should have properly set up the situation. 418 */ 419 void 420 hment_assign(htable_t *htable, uint_t entry, page_t *pp, hment_t *hm) 421 { 422 ASSERT(x86_hm_held(pp)); 423 424 /* 425 * The most common case is establishing the first mapping to a 426 * page, so check that first. This doesn't need any allocated 427 * hment. 428 */ 429 if (pp->p_mapping == NULL) { 430 ASSERT(hm == NULL); 431 ASSERT(!pp->p_embed); 432 ASSERT(pp->p_share == 0); 433 pp->p_embed = 1; 434 pp->p_mapping = htable; 435 pp->p_mlentry = entry; 436 return; 437 } 438 439 /* 440 * We should never get here with a pre-existing embedded maping 441 */ 442 ASSERT(!pp->p_embed); 443 444 /* 445 * add the new hment to the mapping list 446 */ 447 ASSERT(hm != NULL); 448 hm->hm_htable = htable; 449 hm->hm_entry = entry; 450 hm->hm_pfn = pp->p_pagenum; 451 hment_insert(hm, pp); 452 } 453 454 /* 455 * Walk through the mappings for a page. 456 * 457 * must already have done an x86_hm_enter() 458 */ 459 hment_t * 460 hment_walk(page_t *pp, htable_t **ht, uint_t *entry, hment_t *prev) 461 { 462 hment_t *hm; 463 464 ASSERT(x86_hm_held(pp)); 465 466 if (pp->p_embed) { 467 if (prev == NULL) { 468 *ht = (htable_t *)pp->p_mapping; 469 *entry = pp->p_mlentry; 470 hm = HMENT_EMBEDDED; 471 } else { 472 ASSERT(prev == HMENT_EMBEDDED); 473 hm = NULL; 474 } 475 } else { 476 if (prev == NULL) { 477 ASSERT(prev != HMENT_EMBEDDED); 478 hm = (hment_t *)pp->p_mapping; 479 } else { 480 hm = prev->hm_next; 481 } 482 483 if (hm != NULL) { 484 *ht = hm->hm_htable; 485 *entry = hm->hm_entry; 486 } 487 } 488 return (hm); 489 } 490 491 /* 492 * Remove a mapping to a page from its mapping list. Must have 493 * the corresponding mapping list locked. 494 * Finds the mapping list entry with the given pte_t and 495 * unlinks it from the mapping list. 496 */ 497 hment_t * 498 hment_remove(page_t *pp, htable_t *ht, uint_t entry) 499 { 500 hment_t *prev = NULL; 501 hment_t *hm; 502 uint_t idx; 503 pfn_t pfn; 504 505 ASSERT(x86_hm_held(pp)); 506 507 /* 508 * Check if we have only one mapping embedded in the page_t. 509 */ 510 if (pp->p_embed) { 511 ASSERT(ht == (htable_t *)pp->p_mapping); 512 ASSERT(entry == pp->p_mlentry); 513 ASSERT(pp->p_share == 0); 514 pp->p_mapping = NULL; 515 pp->p_mlentry = 0; 516 pp->p_embed = 0; 517 return (NULL); 518 } 519 520 /* 521 * Otherwise it must be in the list of hments. 522 * Find the hment in the system-wide hash table and remove it. 523 */ 524 ASSERT(pp->p_share != 0); 525 pfn = pp->p_pagenum; 526 idx = HMENT_HASH(ht->ht_pfn, entry); 527 mutex_enter(HASH_MUTEX(idx)); 528 hm = hment_hash[idx]; 529 while (hm && (hm->hm_htable != ht || hm->hm_entry != entry || 530 hm->hm_pfn != pfn)) { 531 prev = hm; 532 hm = hm->hm_hashnext; 533 } 534 if (hm == NULL) { 535 panic("hment_remove() missing in hash table pp=%lx, ht=%lx," 536 "entry=0x%x hash index=0x%x", (uintptr_t)pp, (uintptr_t)ht, 537 entry, idx); 538 } 539 540 if (prev) 541 prev->hm_hashnext = hm->hm_hashnext; 542 else 543 hment_hash[idx] = hm->hm_hashnext; 544 mutex_exit(HASH_MUTEX(idx)); 545 546 /* 547 * Remove the hment from the page's mapping list 548 */ 549 if (hm->hm_next) 550 hm->hm_next->hm_prev = hm->hm_prev; 551 if (hm->hm_prev) 552 hm->hm_prev->hm_next = hm->hm_next; 553 else 554 pp->p_mapping = hm->hm_next; 555 556 --pp->p_share; 557 hm->hm_hashnext = NULL; 558 hm->hm_next = NULL; 559 hm->hm_prev = NULL; 560 561 return (hm); 562 } 563 564 /* 565 * Put initial hment's in the reserve pool. 566 */ 567 void 568 hment_reserve(uint_t count) 569 { 570 hment_t *hm; 571 572 count += hment_reserve_amount; 573 574 while (hment_reserve_count < count) { 575 hm = kmem_cache_alloc(hment_cache, KM_NOSLEEP); 576 if (hm == NULL) 577 return; 578 hment_put_reserve(hm); 579 } 580 } 581 582 /* 583 * Readjust the hment reserves after they may have been used. 584 */ 585 void 586 hment_adjust_reserve() 587 { 588 hment_t *hm; 589 590 /* 591 * Free up any excess reserves 592 */ 593 while (hment_reserve_count > hment_reserve_amount) { 594 ASSERT(curthread != hat_reserves_thread); 595 hm = hment_get_reserve(); 596 if (hm == NULL) 597 return; 598 hment_free(hm); 599 } 600 } 601 602 /* 603 * initialize hment data structures 604 */ 605 void 606 hment_init(void) 607 { 608 int i; 609 int flags = KMC_NOHASH | KMC_NODEBUG; 610 611 /* 612 * Initialize kmem caches. On 32 bit kernel's we shut off 613 * debug information to save on precious kernel VA usage. 614 */ 615 hment_cache = kmem_cache_create("hment_t", 616 sizeof (hment_t), 0, NULL, NULL, NULL, 617 NULL, hat_memload_arena, flags); 618 619 hment_hash = kmem_zalloc(hment_hash_entries * sizeof (hment_t *), 620 KM_SLEEP); 621 622 for (i = 0; i < MLIST_NUM_LOCK; i++) 623 mutex_init(&mlist_lock[i], NULL, MUTEX_DEFAULT, NULL); 624 625 for (i = 0; i < HASH_NUM_LOCK; i++) 626 mutex_init(&hash_lock[i], NULL, MUTEX_DEFAULT, NULL); 627 628 629 } 630 631 /* 632 * return the number of mappings to a page 633 * 634 * Note there is no ASSERT() that the MUTEX is held for this. 635 * Hence the return value might be inaccurate if this is called without 636 * doing an x86_hm_enter(). 637 */ 638 uint_t 639 hment_mapcnt(page_t *pp) 640 { 641 uint_t cnt; 642 uint_t szc; 643 page_t *larger; 644 hment_t *hm; 645 646 x86_hm_enter(pp); 647 if (pp->p_mapping == NULL) 648 cnt = 0; 649 else if (pp->p_embed) 650 cnt = 1; 651 else 652 cnt = pp->p_share; 653 x86_hm_exit(pp); 654 655 /* 656 * walk through all larger mapping sizes counting mappings 657 */ 658 for (szc = 1; szc <= pp->p_szc; ++szc) { 659 larger = PP_GROUPLEADER(pp, szc); 660 if (larger == pp) /* don't double count large mappings */ 661 continue; 662 663 x86_hm_enter(larger); 664 if (larger->p_mapping != NULL) { 665 if (larger->p_embed && 666 ((htable_t *)larger->p_mapping)->ht_level == szc) { 667 ++cnt; 668 } else if (!larger->p_embed) { 669 for (hm = larger->p_mapping; hm; 670 hm = hm->hm_next) { 671 if (hm->hm_htable->ht_level == szc) 672 ++cnt; 673 } 674 } 675 } 676 x86_hm_exit(larger); 677 } 678 return (cnt); 679 } 680 681 /* 682 * We need to steal an hment. Walk through all the page_t's until we 683 * find one that has multiple mappings. Unload one of the mappings 684 * and reclaim that hment. Note that we'll save/restart the starting 685 * page to try and spread the pain. 686 */ 687 static page_t *last_page = NULL; 688 689 static hment_t * 690 hment_steal(void) 691 { 692 page_t *last = last_page; 693 page_t *pp = last; 694 hment_t *hm = NULL; 695 hment_t *hm2; 696 htable_t *ht; 697 uint_t found_one = 0; 698 699 HATSTAT_INC(hs_hm_steals); 700 if (pp == NULL) 701 last = pp = page_first(); 702 703 while (!found_one) { 704 HATSTAT_INC(hs_hm_steal_exam); 705 pp = page_next(pp); 706 if (pp == NULL) 707 pp = page_first(); 708 709 /* 710 * The loop and function exit here if nothing found to steal. 711 */ 712 if (pp == last) 713 return (NULL); 714 715 /* 716 * Only lock the page_t if it has hments. 717 */ 718 if (pp->p_mapping == NULL || pp->p_embed) 719 continue; 720 721 /* 722 * Search the mapping list for a usable mapping. 723 */ 724 x86_hm_enter(pp); 725 if (!pp->p_embed) { 726 for (hm = pp->p_mapping; hm; hm = hm->hm_next) { 727 ht = hm->hm_htable; 728 if (ht->ht_hat != kas.a_hat && 729 ht->ht_busy == 0 && 730 ht->ht_lock_cnt == 0) { 731 found_one = 1; 732 break; 733 } 734 } 735 } 736 if (!found_one) 737 x86_hm_exit(pp); 738 } 739 740 /* 741 * Steal the mapping we found. Note that hati_page_unmap() will 742 * do the x86_hm_exit(). 743 */ 744 hm2 = hati_page_unmap(pp, ht, hm->hm_entry); 745 ASSERT(hm2 == hm); 746 last_page = pp; 747 return (hm); 748 } 749