1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. 24 */ 25 26 #include <sys/types.h> 27 #include <sys/sysmacros.h> 28 #include <sys/kmem.h> 29 #include <sys/atomic.h> 30 #include <sys/bitmap.h> 31 #include <sys/machparam.h> 32 #include <sys/machsystm.h> 33 #include <sys/mman.h> 34 #include <sys/systm.h> 35 #include <sys/cpuvar.h> 36 #include <sys/thread.h> 37 #include <sys/proc.h> 38 #include <sys/cpu.h> 39 #include <sys/kmem.h> 40 #include <sys/disp.h> 41 #include <sys/vmem.h> 42 #include <sys/vmsystm.h> 43 #include <sys/promif.h> 44 #include <sys/var.h> 45 #include <sys/x86_archext.h> 46 #include <sys/archsystm.h> 47 #include <sys/bootconf.h> 48 #include <sys/dumphdr.h> 49 #include <vm/seg_kmem.h> 50 #include <vm/seg_kpm.h> 51 #include <vm/hat.h> 52 #include <vm/hat_i86.h> 53 #include <sys/cmn_err.h> 54 #include <sys/panic.h> 55 56 #ifdef __xpv 57 #include <sys/hypervisor.h> 58 #include <sys/xpv_panic.h> 59 #endif 60 61 #include <sys/bootinfo.h> 62 #include <vm/kboot_mmu.h> 63 64 static void x86pte_zero(htable_t *dest, uint_t entry, uint_t count); 65 66 kmem_cache_t *htable_cache; 67 68 /* 69 * The variable htable_reserve_amount, rather than HTABLE_RESERVE_AMOUNT, 70 * is used in order to facilitate testing of the htable_steal() code. 71 * By resetting htable_reserve_amount to a lower value, we can force 72 * stealing to occur. The reserve amount is a guess to get us through boot. 73 */ 74 #define HTABLE_RESERVE_AMOUNT (200) 75 uint_t htable_reserve_amount = HTABLE_RESERVE_AMOUNT; 76 kmutex_t htable_reserve_mutex; 77 uint_t htable_reserve_cnt; 78 htable_t *htable_reserve_pool; 79 80 /* 81 * Used to hand test htable_steal(). 82 */ 83 #ifdef DEBUG 84 ulong_t force_steal = 0; 85 ulong_t ptable_cnt = 0; 86 #endif 87 88 /* 89 * This variable is so that we can tune this via /etc/system 90 * Any value works, but a power of two <= mmu.ptes_per_table is best. 91 */ 92 uint_t htable_steal_passes = 8; 93 94 /* 95 * mutex stuff for access to htable hash 96 */ 97 #define NUM_HTABLE_MUTEX 128 98 kmutex_t htable_mutex[NUM_HTABLE_MUTEX]; 99 #define HTABLE_MUTEX_HASH(h) ((h) & (NUM_HTABLE_MUTEX - 1)) 100 101 #define HTABLE_ENTER(h) mutex_enter(&htable_mutex[HTABLE_MUTEX_HASH(h)]); 102 #define HTABLE_EXIT(h) mutex_exit(&htable_mutex[HTABLE_MUTEX_HASH(h)]); 103 104 /* 105 * forward declarations 106 */ 107 static void link_ptp(htable_t *higher, htable_t *new, uintptr_t vaddr); 108 static void unlink_ptp(htable_t *higher, htable_t *old, uintptr_t vaddr); 109 static void htable_free(htable_t *ht); 110 static x86pte_t *x86pte_access_pagetable(htable_t *ht, uint_t index); 111 static void x86pte_release_pagetable(htable_t *ht); 112 static x86pte_t x86pte_cas(htable_t *ht, uint_t entry, x86pte_t old, 113 x86pte_t new); 114 115 /* 116 * A counter to track if we are stealing or reaping htables. When non-zero 117 * htable_free() will directly free htables (either to the reserve or kmem) 118 * instead of putting them in a hat's htable cache. 119 */ 120 uint32_t htable_dont_cache = 0; 121 122 /* 123 * Track the number of active pagetables, so we can know how many to reap 124 */ 125 static uint32_t active_ptables = 0; 126 127 #ifdef __xpv 128 /* 129 * Deal with hypervisor complications. 130 */ 131 void 132 xen_flush_va(caddr_t va) 133 { 134 struct mmuext_op t; 135 uint_t count; 136 137 if (IN_XPV_PANIC()) { 138 mmu_tlbflush_entry((caddr_t)va); 139 } else { 140 t.cmd = MMUEXT_INVLPG_LOCAL; 141 t.arg1.linear_addr = (uintptr_t)va; 142 if (HYPERVISOR_mmuext_op(&t, 1, &count, DOMID_SELF) < 0) 143 panic("HYPERVISOR_mmuext_op() failed"); 144 ASSERT(count == 1); 145 } 146 } 147 148 void 149 xen_gflush_va(caddr_t va, cpuset_t cpus) 150 { 151 struct mmuext_op t; 152 uint_t count; 153 154 if (IN_XPV_PANIC()) { 155 mmu_tlbflush_entry((caddr_t)va); 156 return; 157 } 158 159 t.cmd = MMUEXT_INVLPG_MULTI; 160 t.arg1.linear_addr = (uintptr_t)va; 161 /*LINTED: constant in conditional context*/ 162 set_xen_guest_handle(t.arg2.vcpumask, &cpus); 163 if (HYPERVISOR_mmuext_op(&t, 1, &count, DOMID_SELF) < 0) 164 panic("HYPERVISOR_mmuext_op() failed"); 165 ASSERT(count == 1); 166 } 167 168 void 169 xen_flush_tlb() 170 { 171 struct mmuext_op t; 172 uint_t count; 173 174 if (IN_XPV_PANIC()) { 175 xpv_panic_reload_cr3(); 176 } else { 177 t.cmd = MMUEXT_TLB_FLUSH_LOCAL; 178 if (HYPERVISOR_mmuext_op(&t, 1, &count, DOMID_SELF) < 0) 179 panic("HYPERVISOR_mmuext_op() failed"); 180 ASSERT(count == 1); 181 } 182 } 183 184 void 185 xen_gflush_tlb(cpuset_t cpus) 186 { 187 struct mmuext_op t; 188 uint_t count; 189 190 ASSERT(!IN_XPV_PANIC()); 191 t.cmd = MMUEXT_TLB_FLUSH_MULTI; 192 /*LINTED: constant in conditional context*/ 193 set_xen_guest_handle(t.arg2.vcpumask, &cpus); 194 if (HYPERVISOR_mmuext_op(&t, 1, &count, DOMID_SELF) < 0) 195 panic("HYPERVISOR_mmuext_op() failed"); 196 ASSERT(count == 1); 197 } 198 199 /* 200 * Install/Adjust a kpm mapping under the hypervisor. 201 * Value of "how" should be: 202 * PT_WRITABLE | PT_VALID - regular kpm mapping 203 * PT_VALID - make mapping read-only 204 * 0 - remove mapping 205 * 206 * returns 0 on success. non-zero for failure. 207 */ 208 int 209 xen_kpm_page(pfn_t pfn, uint_t how) 210 { 211 paddr_t pa = mmu_ptob((paddr_t)pfn); 212 x86pte_t pte = PT_NOCONSIST | PT_REF | PT_MOD; 213 214 if (kpm_vbase == NULL) 215 return (0); 216 217 if (how) 218 pte |= pa_to_ma(pa) | how; 219 else 220 pte = 0; 221 return (HYPERVISOR_update_va_mapping((uintptr_t)kpm_vbase + pa, 222 pte, UVMF_INVLPG | UVMF_ALL)); 223 } 224 225 void 226 xen_pin(pfn_t pfn, level_t lvl) 227 { 228 struct mmuext_op t; 229 uint_t count; 230 231 t.cmd = MMUEXT_PIN_L1_TABLE + lvl; 232 t.arg1.mfn = pfn_to_mfn(pfn); 233 if (HYPERVISOR_mmuext_op(&t, 1, &count, DOMID_SELF) < 0) 234 panic("HYPERVISOR_mmuext_op() failed"); 235 ASSERT(count == 1); 236 } 237 238 void 239 xen_unpin(pfn_t pfn) 240 { 241 struct mmuext_op t; 242 uint_t count; 243 244 t.cmd = MMUEXT_UNPIN_TABLE; 245 t.arg1.mfn = pfn_to_mfn(pfn); 246 if (HYPERVISOR_mmuext_op(&t, 1, &count, DOMID_SELF) < 0) 247 panic("HYPERVISOR_mmuext_op() failed"); 248 ASSERT(count == 1); 249 } 250 251 static void 252 xen_map(uint64_t pte, caddr_t va) 253 { 254 if (HYPERVISOR_update_va_mapping((uintptr_t)va, pte, 255 UVMF_INVLPG | UVMF_LOCAL)) 256 panic("HYPERVISOR_update_va_mapping() failed"); 257 } 258 #endif /* __xpv */ 259 260 /* 261 * Allocate a memory page for a hardware page table. 262 * 263 * A wrapper around page_get_physical(), with some extra checks. 264 */ 265 static pfn_t 266 ptable_alloc(uintptr_t seed) 267 { 268 pfn_t pfn; 269 page_t *pp; 270 271 pfn = PFN_INVALID; 272 273 /* 274 * The first check is to see if there is memory in the system. If we 275 * drop to throttlefree, then fail the ptable_alloc() and let the 276 * stealing code kick in. Note that we have to do this test here, 277 * since the test in page_create_throttle() would let the NOSLEEP 278 * allocation go through and deplete the page reserves. 279 * 280 * The !NOMEMWAIT() lets pageout, fsflush, etc. skip this check. 281 */ 282 if (!NOMEMWAIT() && freemem <= throttlefree + 1) 283 return (PFN_INVALID); 284 285 #ifdef DEBUG 286 /* 287 * This code makes htable_steal() easier to test. By setting 288 * force_steal we force pagetable allocations to fall 289 * into the stealing code. Roughly 1 in ever "force_steal" 290 * page table allocations will fail. 291 */ 292 if (proc_pageout != NULL && force_steal > 1 && 293 ++ptable_cnt > force_steal) { 294 ptable_cnt = 0; 295 return (PFN_INVALID); 296 } 297 #endif /* DEBUG */ 298 299 pp = page_get_physical(seed); 300 if (pp == NULL) 301 return (PFN_INVALID); 302 ASSERT(PAGE_SHARED(pp)); 303 pfn = pp->p_pagenum; 304 if (pfn == PFN_INVALID) 305 panic("ptable_alloc(): Invalid PFN!!"); 306 atomic_inc_32(&active_ptables); 307 HATSTAT_INC(hs_ptable_allocs); 308 return (pfn); 309 } 310 311 /* 312 * Free an htable's associated page table page. See the comments 313 * for ptable_alloc(). 314 */ 315 static void 316 ptable_free(pfn_t pfn) 317 { 318 page_t *pp = page_numtopp_nolock(pfn); 319 320 /* 321 * need to destroy the page used for the pagetable 322 */ 323 ASSERT(pfn != PFN_INVALID); 324 HATSTAT_INC(hs_ptable_frees); 325 atomic_dec_32(&active_ptables); 326 if (pp == NULL) 327 panic("ptable_free(): no page for pfn!"); 328 ASSERT(PAGE_SHARED(pp)); 329 ASSERT(pfn == pp->p_pagenum); 330 ASSERT(!IN_XPV_PANIC()); 331 332 /* 333 * Get an exclusive lock, might have to wait for a kmem reader. 334 */ 335 if (!page_tryupgrade(pp)) { 336 u_offset_t off = pp->p_offset; 337 page_unlock(pp); 338 pp = page_lookup(&kvp, off, SE_EXCL); 339 if (pp == NULL) 340 panic("page not found"); 341 } 342 #ifdef __xpv 343 if (kpm_vbase && xen_kpm_page(pfn, PT_VALID | PT_WRITABLE) < 0) 344 panic("failure making kpm r/w pfn=0x%lx", pfn); 345 #endif 346 page_hashout(pp, NULL); 347 page_free(pp, 1); 348 page_unresv(1); 349 } 350 351 /* 352 * Put one htable on the reserve list. 353 */ 354 static void 355 htable_put_reserve(htable_t *ht) 356 { 357 ht->ht_hat = NULL; /* no longer tied to a hat */ 358 ASSERT(ht->ht_pfn == PFN_INVALID); 359 HATSTAT_INC(hs_htable_rputs); 360 mutex_enter(&htable_reserve_mutex); 361 ht->ht_next = htable_reserve_pool; 362 htable_reserve_pool = ht; 363 ++htable_reserve_cnt; 364 mutex_exit(&htable_reserve_mutex); 365 } 366 367 /* 368 * Take one htable from the reserve. 369 */ 370 static htable_t * 371 htable_get_reserve(void) 372 { 373 htable_t *ht = NULL; 374 375 mutex_enter(&htable_reserve_mutex); 376 if (htable_reserve_cnt != 0) { 377 ht = htable_reserve_pool; 378 ASSERT(ht != NULL); 379 ASSERT(ht->ht_pfn == PFN_INVALID); 380 htable_reserve_pool = ht->ht_next; 381 --htable_reserve_cnt; 382 HATSTAT_INC(hs_htable_rgets); 383 } 384 mutex_exit(&htable_reserve_mutex); 385 return (ht); 386 } 387 388 /* 389 * Allocate initial htables and put them on the reserve list 390 */ 391 void 392 htable_initial_reserve(uint_t count) 393 { 394 htable_t *ht; 395 396 count += HTABLE_RESERVE_AMOUNT; 397 while (count > 0) { 398 ht = kmem_cache_alloc(htable_cache, KM_NOSLEEP); 399 ASSERT(ht != NULL); 400 401 ASSERT(use_boot_reserve); 402 ht->ht_pfn = PFN_INVALID; 403 htable_put_reserve(ht); 404 --count; 405 } 406 } 407 408 /* 409 * Readjust the reserves after a thread finishes using them. 410 */ 411 void 412 htable_adjust_reserve() 413 { 414 htable_t *ht; 415 416 /* 417 * Free any excess htables in the reserve list 418 */ 419 while (htable_reserve_cnt > htable_reserve_amount && 420 !USE_HAT_RESERVES()) { 421 ht = htable_get_reserve(); 422 if (ht == NULL) 423 return; 424 ASSERT(ht->ht_pfn == PFN_INVALID); 425 kmem_cache_free(htable_cache, ht); 426 } 427 } 428 429 /* 430 * Search the active htables for one to steal. Start at a different hash 431 * bucket every time to help spread the pain of stealing 432 */ 433 static void 434 htable_steal_active(hat_t *hat, uint_t cnt, uint_t threshold, 435 uint_t *stolen, htable_t **list) 436 { 437 static uint_t h_seed = 0; 438 htable_t *higher, *ht; 439 uint_t h, e, h_start; 440 uintptr_t va; 441 x86pte_t pte; 442 443 h = h_start = h_seed++ % hat->hat_num_hash; 444 do { 445 higher = NULL; 446 HTABLE_ENTER(h); 447 for (ht = hat->hat_ht_hash[h]; ht; ht = ht->ht_next) { 448 449 /* 450 * Can we rule out reaping? 451 */ 452 if (ht->ht_busy != 0 || 453 (ht->ht_flags & HTABLE_SHARED_PFN) || 454 ht->ht_level > 0 || ht->ht_valid_cnt > threshold || 455 ht->ht_lock_cnt != 0) 456 continue; 457 458 /* 459 * Increment busy so the htable can't disappear. We 460 * drop the htable mutex to avoid deadlocks with 461 * hat_pageunload() and the hment mutex while we 462 * call hat_pte_unmap() 463 */ 464 ++ht->ht_busy; 465 HTABLE_EXIT(h); 466 467 /* 468 * Try stealing. 469 * - unload and invalidate all PTEs 470 */ 471 for (e = 0, va = ht->ht_vaddr; 472 e < HTABLE_NUM_PTES(ht) && ht->ht_valid_cnt > 0 && 473 ht->ht_busy == 1 && ht->ht_lock_cnt == 0; 474 ++e, va += MMU_PAGESIZE) { 475 pte = x86pte_get(ht, e); 476 if (!PTE_ISVALID(pte)) 477 continue; 478 hat_pte_unmap(ht, e, HAT_UNLOAD, pte, NULL); 479 } 480 481 /* 482 * Reacquire htable lock. If we didn't remove all 483 * mappings in the table, or another thread added a new 484 * mapping behind us, give up on this table. 485 */ 486 HTABLE_ENTER(h); 487 if (ht->ht_busy != 1 || ht->ht_valid_cnt != 0 || 488 ht->ht_lock_cnt != 0) { 489 --ht->ht_busy; 490 continue; 491 } 492 493 /* 494 * Steal it and unlink the page table. 495 */ 496 higher = ht->ht_parent; 497 unlink_ptp(higher, ht, ht->ht_vaddr); 498 499 /* 500 * remove from the hash list 501 */ 502 if (ht->ht_next) 503 ht->ht_next->ht_prev = ht->ht_prev; 504 505 if (ht->ht_prev) { 506 ht->ht_prev->ht_next = ht->ht_next; 507 } else { 508 ASSERT(hat->hat_ht_hash[h] == ht); 509 hat->hat_ht_hash[h] = ht->ht_next; 510 } 511 512 /* 513 * Break to outer loop to release the 514 * higher (ht_parent) pagetable. This 515 * spreads out the pain caused by 516 * pagefaults. 517 */ 518 ht->ht_next = *list; 519 *list = ht; 520 ++*stolen; 521 break; 522 } 523 HTABLE_EXIT(h); 524 if (higher != NULL) 525 htable_release(higher); 526 if (++h == hat->hat_num_hash) 527 h = 0; 528 } while (*stolen < cnt && h != h_start); 529 } 530 531 /* 532 * Move hat to the end of the kas list 533 */ 534 static void 535 move_victim(hat_t *hat) 536 { 537 ASSERT(MUTEX_HELD(&hat_list_lock)); 538 539 /* unlink victim hat */ 540 if (hat->hat_prev) 541 hat->hat_prev->hat_next = hat->hat_next; 542 else 543 kas.a_hat->hat_next = hat->hat_next; 544 545 if (hat->hat_next) 546 hat->hat_next->hat_prev = hat->hat_prev; 547 else 548 kas.a_hat->hat_prev = hat->hat_prev; 549 /* relink at end of hat list */ 550 hat->hat_next = NULL; 551 hat->hat_prev = kas.a_hat->hat_prev; 552 if (hat->hat_prev) 553 hat->hat_prev->hat_next = hat; 554 else 555 kas.a_hat->hat_next = hat; 556 557 kas.a_hat->hat_prev = hat; 558 } 559 560 /* 561 * This routine steals htables from user processes. Called by htable_reap 562 * (reap=TRUE) or htable_alloc (reap=FALSE). 563 */ 564 static htable_t * 565 htable_steal(uint_t cnt, boolean_t reap) 566 { 567 hat_t *hat = kas.a_hat; /* list starts with khat */ 568 htable_t *list = NULL; 569 htable_t *ht; 570 uint_t stolen = 0; 571 uint_t pass; 572 uint_t threshold; 573 574 /* 575 * Limit htable_steal_passes to something reasonable 576 */ 577 if (htable_steal_passes == 0) 578 htable_steal_passes = 1; 579 if (htable_steal_passes > mmu.ptes_per_table) 580 htable_steal_passes = mmu.ptes_per_table; 581 582 /* 583 * Loop through all user hats. The 1st pass takes cached htables that 584 * aren't in use. The later passes steal by removing mappings, too. 585 */ 586 atomic_inc_32(&htable_dont_cache); 587 for (pass = 0; pass <= htable_steal_passes && stolen < cnt; ++pass) { 588 threshold = pass * mmu.ptes_per_table / htable_steal_passes; 589 590 mutex_enter(&hat_list_lock); 591 592 /* skip the first hat (kernel) */ 593 hat = kas.a_hat->hat_next; 594 for (;;) { 595 /* 596 * Skip any hat that is already being stolen from. 597 * 598 * We skip SHARED hats, as these are dummy 599 * hats that host ISM shared page tables. 600 * 601 * We also skip if HAT_FREEING because hat_pte_unmap() 602 * won't zero out the PTE's. That would lead to hitting 603 * stale PTEs either here or under hat_unload() when we 604 * steal and unload the same page table in competing 605 * threads. 606 */ 607 while (hat != NULL && 608 (hat->hat_flags & 609 (HAT_VICTIM | HAT_SHARED | HAT_FREEING)) != 0) 610 hat = hat->hat_next; 611 612 if (hat == NULL) 613 break; 614 615 /* 616 * Mark the HAT as a stealing victim so that it is 617 * not freed from under us, e.g. in as_free() 618 */ 619 hat->hat_flags |= HAT_VICTIM; 620 mutex_exit(&hat_list_lock); 621 622 /* 623 * Take any htables from the hat's cached "free" list. 624 */ 625 hat_enter(hat); 626 while ((ht = hat->hat_ht_cached) != NULL && 627 stolen < cnt) { 628 hat->hat_ht_cached = ht->ht_next; 629 ht->ht_next = list; 630 list = ht; 631 ++stolen; 632 } 633 hat_exit(hat); 634 635 /* 636 * Don't steal active htables on first pass. 637 */ 638 if (pass != 0 && (stolen < cnt)) 639 htable_steal_active(hat, cnt, threshold, 640 &stolen, &list); 641 642 /* 643 * do synchronous teardown for the reap case so that 644 * we can forget hat; at this time, hat is 645 * guaranteed to be around because HAT_VICTIM is set 646 * (see htable_free() for similar code) 647 */ 648 for (ht = list; (ht) && (reap); ht = ht->ht_next) { 649 if (ht->ht_hat == NULL) 650 continue; 651 ASSERT(ht->ht_hat == hat); 652 #if defined(__xpv) && defined(__amd64) 653 if (!(ht->ht_flags & HTABLE_VLP) && 654 ht->ht_level == mmu.max_level) { 655 ptable_free(hat->hat_user_ptable); 656 hat->hat_user_ptable = PFN_INVALID; 657 } 658 #endif 659 /* 660 * forget the hat 661 */ 662 ht->ht_hat = NULL; 663 } 664 665 mutex_enter(&hat_list_lock); 666 667 /* 668 * Are we finished? 669 */ 670 if (stolen == cnt) { 671 /* 672 * Try to spread the pain of stealing, 673 * move victim HAT to the end of the HAT list. 674 */ 675 if (pass >= 1 && cnt == 1 && 676 kas.a_hat->hat_prev != hat) 677 move_victim(hat); 678 /* 679 * We are finished 680 */ 681 } 682 683 /* 684 * Clear the victim flag, hat can go away now (once 685 * the lock is dropped) 686 */ 687 if (hat->hat_flags & HAT_VICTIM) { 688 ASSERT(hat != kas.a_hat); 689 hat->hat_flags &= ~HAT_VICTIM; 690 cv_broadcast(&hat_list_cv); 691 } 692 693 /* move on to the next hat */ 694 hat = hat->hat_next; 695 } 696 697 mutex_exit(&hat_list_lock); 698 699 } 700 ASSERT(!MUTEX_HELD(&hat_list_lock)); 701 702 atomic_dec_32(&htable_dont_cache); 703 return (list); 704 } 705 706 /* 707 * This is invoked from kmem when the system is low on memory. We try 708 * to free hments, htables, and ptables to improve the memory situation. 709 */ 710 /*ARGSUSED*/ 711 static void 712 htable_reap(void *handle) 713 { 714 uint_t reap_cnt; 715 htable_t *list; 716 htable_t *ht; 717 718 HATSTAT_INC(hs_reap_attempts); 719 if (!can_steal_post_boot) 720 return; 721 722 /* 723 * Try to reap 5% of the page tables bounded by a maximum of 724 * 5% of physmem and a minimum of 10. 725 */ 726 reap_cnt = MAX(MIN(physmem / 20, active_ptables / 20), 10); 727 728 /* 729 * Note: htable_dont_cache should be set at the time of 730 * invoking htable_free() 731 */ 732 atomic_inc_32(&htable_dont_cache); 733 /* 734 * Let htable_steal() do the work, we just call htable_free() 735 */ 736 XPV_DISALLOW_MIGRATE(); 737 list = htable_steal(reap_cnt, B_TRUE); 738 XPV_ALLOW_MIGRATE(); 739 while ((ht = list) != NULL) { 740 list = ht->ht_next; 741 HATSTAT_INC(hs_reaped); 742 htable_free(ht); 743 } 744 atomic_dec_32(&htable_dont_cache); 745 746 /* 747 * Free up excess reserves 748 */ 749 htable_adjust_reserve(); 750 hment_adjust_reserve(); 751 } 752 753 /* 754 * Allocate an htable, stealing one or using the reserve if necessary 755 */ 756 static htable_t * 757 htable_alloc( 758 hat_t *hat, 759 uintptr_t vaddr, 760 level_t level, 761 htable_t *shared) 762 { 763 htable_t *ht = NULL; 764 uint_t is_vlp; 765 uint_t is_bare = 0; 766 uint_t need_to_zero = 1; 767 int kmflags = (can_steal_post_boot ? KM_NOSLEEP : KM_SLEEP); 768 769 if (level < 0 || level > TOP_LEVEL(hat)) 770 panic("htable_alloc(): level %d out of range\n", level); 771 772 is_vlp = (hat->hat_flags & HAT_VLP) && level == VLP_LEVEL; 773 if (is_vlp || shared != NULL) 774 is_bare = 1; 775 776 /* 777 * First reuse a cached htable from the hat_ht_cached field, this 778 * avoids unnecessary trips through kmem/page allocators. 779 */ 780 if (hat->hat_ht_cached != NULL && !is_bare) { 781 hat_enter(hat); 782 ht = hat->hat_ht_cached; 783 if (ht != NULL) { 784 hat->hat_ht_cached = ht->ht_next; 785 need_to_zero = 0; 786 /* XX64 ASSERT() they're all zero somehow */ 787 ASSERT(ht->ht_pfn != PFN_INVALID); 788 } 789 hat_exit(hat); 790 } 791 792 if (ht == NULL) { 793 /* 794 * Allocate an htable, possibly refilling the reserves. 795 */ 796 if (USE_HAT_RESERVES()) { 797 ht = htable_get_reserve(); 798 } else { 799 /* 800 * Donate successful htable allocations to the reserve. 801 */ 802 for (;;) { 803 ht = kmem_cache_alloc(htable_cache, kmflags); 804 if (ht == NULL) 805 break; 806 ht->ht_pfn = PFN_INVALID; 807 if (USE_HAT_RESERVES() || 808 htable_reserve_cnt >= htable_reserve_amount) 809 break; 810 htable_put_reserve(ht); 811 } 812 } 813 814 /* 815 * allocate a page for the hardware page table if needed 816 */ 817 if (ht != NULL && !is_bare) { 818 ht->ht_hat = hat; 819 ht->ht_pfn = ptable_alloc((uintptr_t)ht); 820 if (ht->ht_pfn == PFN_INVALID) { 821 if (USE_HAT_RESERVES()) 822 htable_put_reserve(ht); 823 else 824 kmem_cache_free(htable_cache, ht); 825 ht = NULL; 826 } 827 } 828 } 829 830 /* 831 * If allocations failed, kick off a kmem_reap() and resort to 832 * htable steal(). We may spin here if the system is very low on 833 * memory. If the kernel itself has consumed all memory and kmem_reap() 834 * can't free up anything, then we'll really get stuck here. 835 * That should only happen in a system where the administrator has 836 * misconfigured VM parameters via /etc/system. 837 */ 838 while (ht == NULL && can_steal_post_boot) { 839 kmem_reap(); 840 ht = htable_steal(1, B_FALSE); 841 HATSTAT_INC(hs_steals); 842 843 /* 844 * If we stole for a bare htable, release the pagetable page. 845 */ 846 if (ht != NULL) { 847 if (is_bare) { 848 ptable_free(ht->ht_pfn); 849 ht->ht_pfn = PFN_INVALID; 850 #if defined(__xpv) && defined(__amd64) 851 /* 852 * make stolen page table writable again in kpm 853 */ 854 } else if (kpm_vbase && xen_kpm_page(ht->ht_pfn, 855 PT_VALID | PT_WRITABLE) < 0) { 856 panic("failure making kpm r/w pfn=0x%lx", 857 ht->ht_pfn); 858 #endif 859 } 860 } 861 } 862 863 /* 864 * All attempts to allocate or steal failed. This should only happen 865 * if we run out of memory during boot, due perhaps to a huge 866 * boot_archive. At this point there's no way to continue. 867 */ 868 if (ht == NULL) 869 panic("htable_alloc(): couldn't steal\n"); 870 871 #if defined(__amd64) && defined(__xpv) 872 /* 873 * Under the 64-bit hypervisor, we have 2 top level page tables. 874 * If this allocation fails, we'll resort to stealing. 875 * We use the stolen page indirectly, by freeing the 876 * stolen htable first. 877 */ 878 if (level == mmu.max_level) { 879 for (;;) { 880 htable_t *stolen; 881 882 hat->hat_user_ptable = ptable_alloc((uintptr_t)ht + 1); 883 if (hat->hat_user_ptable != PFN_INVALID) 884 break; 885 stolen = htable_steal(1, B_FALSE); 886 if (stolen == NULL) 887 panic("2nd steal ptable failed\n"); 888 htable_free(stolen); 889 } 890 block_zero_no_xmm(kpm_vbase + pfn_to_pa(hat->hat_user_ptable), 891 MMU_PAGESIZE); 892 } 893 #endif 894 895 /* 896 * Shared page tables have all entries locked and entries may not 897 * be added or deleted. 898 */ 899 ht->ht_flags = 0; 900 if (shared != NULL) { 901 ASSERT(shared->ht_valid_cnt > 0); 902 ht->ht_flags |= HTABLE_SHARED_PFN; 903 ht->ht_pfn = shared->ht_pfn; 904 ht->ht_lock_cnt = 0; 905 ht->ht_valid_cnt = 0; /* updated in hat_share() */ 906 ht->ht_shares = shared; 907 need_to_zero = 0; 908 } else { 909 ht->ht_shares = NULL; 910 ht->ht_lock_cnt = 0; 911 ht->ht_valid_cnt = 0; 912 } 913 914 /* 915 * setup flags, etc. for VLP htables 916 */ 917 if (is_vlp) { 918 ht->ht_flags |= HTABLE_VLP; 919 ASSERT(ht->ht_pfn == PFN_INVALID); 920 need_to_zero = 0; 921 } 922 923 /* 924 * fill in the htable 925 */ 926 ht->ht_hat = hat; 927 ht->ht_parent = NULL; 928 ht->ht_vaddr = vaddr; 929 ht->ht_level = level; 930 ht->ht_busy = 1; 931 ht->ht_next = NULL; 932 ht->ht_prev = NULL; 933 934 /* 935 * Zero out any freshly allocated page table 936 */ 937 if (need_to_zero) 938 x86pte_zero(ht, 0, mmu.ptes_per_table); 939 940 #if defined(__amd64) && defined(__xpv) 941 if (!is_bare && kpm_vbase) { 942 (void) xen_kpm_page(ht->ht_pfn, PT_VALID); 943 if (level == mmu.max_level) 944 (void) xen_kpm_page(hat->hat_user_ptable, PT_VALID); 945 } 946 #endif 947 948 return (ht); 949 } 950 951 /* 952 * Free up an htable, either to a hat's cached list, the reserves or 953 * back to kmem. 954 */ 955 static void 956 htable_free(htable_t *ht) 957 { 958 hat_t *hat = ht->ht_hat; 959 960 /* 961 * If the process isn't exiting, cache the free htable in the hat 962 * structure. We always do this for the boot time reserve. We don't 963 * do this if the hat is exiting or we are stealing/reaping htables. 964 */ 965 if (hat != NULL && 966 !(ht->ht_flags & HTABLE_SHARED_PFN) && 967 (use_boot_reserve || 968 (!(hat->hat_flags & HAT_FREEING) && !htable_dont_cache))) { 969 ASSERT((ht->ht_flags & HTABLE_VLP) == 0); 970 ASSERT(ht->ht_pfn != PFN_INVALID); 971 hat_enter(hat); 972 ht->ht_next = hat->hat_ht_cached; 973 hat->hat_ht_cached = ht; 974 hat_exit(hat); 975 return; 976 } 977 978 /* 979 * If we have a hardware page table, free it. 980 * We don't free page tables that are accessed by sharing. 981 */ 982 if (ht->ht_flags & HTABLE_SHARED_PFN) { 983 ASSERT(ht->ht_pfn != PFN_INVALID); 984 } else if (!(ht->ht_flags & HTABLE_VLP)) { 985 ptable_free(ht->ht_pfn); 986 #if defined(__amd64) && defined(__xpv) 987 if (ht->ht_level == mmu.max_level && hat != NULL) { 988 ptable_free(hat->hat_user_ptable); 989 hat->hat_user_ptable = PFN_INVALID; 990 } 991 #endif 992 } 993 ht->ht_pfn = PFN_INVALID; 994 995 /* 996 * Free it or put into reserves. 997 */ 998 if (USE_HAT_RESERVES() || htable_reserve_cnt < htable_reserve_amount) { 999 htable_put_reserve(ht); 1000 } else { 1001 kmem_cache_free(htable_cache, ht); 1002 htable_adjust_reserve(); 1003 } 1004 } 1005 1006 1007 /* 1008 * This is called when a hat is being destroyed or swapped out. We reap all 1009 * the remaining htables in the hat cache. If destroying all left over 1010 * htables are also destroyed. 1011 * 1012 * We also don't need to invalidate any of the PTPs nor do any demapping. 1013 */ 1014 void 1015 htable_purge_hat(hat_t *hat) 1016 { 1017 htable_t *ht; 1018 int h; 1019 1020 /* 1021 * Purge the htable cache if just reaping. 1022 */ 1023 if (!(hat->hat_flags & HAT_FREEING)) { 1024 atomic_inc_32(&htable_dont_cache); 1025 for (;;) { 1026 hat_enter(hat); 1027 ht = hat->hat_ht_cached; 1028 if (ht == NULL) { 1029 hat_exit(hat); 1030 break; 1031 } 1032 hat->hat_ht_cached = ht->ht_next; 1033 hat_exit(hat); 1034 htable_free(ht); 1035 } 1036 atomic_dec_32(&htable_dont_cache); 1037 return; 1038 } 1039 1040 /* 1041 * if freeing, no locking is needed 1042 */ 1043 while ((ht = hat->hat_ht_cached) != NULL) { 1044 hat->hat_ht_cached = ht->ht_next; 1045 htable_free(ht); 1046 } 1047 1048 /* 1049 * walk thru the htable hash table and free all the htables in it. 1050 */ 1051 for (h = 0; h < hat->hat_num_hash; ++h) { 1052 while ((ht = hat->hat_ht_hash[h]) != NULL) { 1053 if (ht->ht_next) 1054 ht->ht_next->ht_prev = ht->ht_prev; 1055 1056 if (ht->ht_prev) { 1057 ht->ht_prev->ht_next = ht->ht_next; 1058 } else { 1059 ASSERT(hat->hat_ht_hash[h] == ht); 1060 hat->hat_ht_hash[h] = ht->ht_next; 1061 } 1062 htable_free(ht); 1063 } 1064 } 1065 } 1066 1067 /* 1068 * Unlink an entry for a table at vaddr and level out of the existing table 1069 * one level higher. We are always holding the HASH_ENTER() when doing this. 1070 */ 1071 static void 1072 unlink_ptp(htable_t *higher, htable_t *old, uintptr_t vaddr) 1073 { 1074 uint_t entry = htable_va2entry(vaddr, higher); 1075 x86pte_t expect = MAKEPTP(old->ht_pfn, old->ht_level); 1076 x86pte_t found; 1077 hat_t *hat = old->ht_hat; 1078 1079 ASSERT(higher->ht_busy > 0); 1080 ASSERT(higher->ht_valid_cnt > 0); 1081 ASSERT(old->ht_valid_cnt == 0); 1082 found = x86pte_cas(higher, entry, expect, 0); 1083 #ifdef __xpv 1084 /* 1085 * This is weird, but Xen apparently automatically unlinks empty 1086 * pagetables from the upper page table. So allow PTP to be 0 already. 1087 */ 1088 if (found != expect && found != 0) 1089 #else 1090 if (found != expect) 1091 #endif 1092 panic("Bad PTP found=" FMT_PTE ", expected=" FMT_PTE, 1093 found, expect); 1094 1095 /* 1096 * When a top level VLP page table entry changes, we must issue 1097 * a reload of cr3 on all processors. 1098 * 1099 * If we don't need do do that, then we still have to INVLPG against 1100 * an address covered by the inner page table, as the latest processors 1101 * have TLB-like caches for non-leaf page table entries. 1102 */ 1103 if (!(hat->hat_flags & HAT_FREEING)) { 1104 hat_tlb_inval(hat, (higher->ht_flags & HTABLE_VLP) ? 1105 DEMAP_ALL_ADDR : old->ht_vaddr); 1106 } 1107 1108 HTABLE_DEC(higher->ht_valid_cnt); 1109 } 1110 1111 /* 1112 * Link an entry for a new table at vaddr and level into the existing table 1113 * one level higher. We are always holding the HASH_ENTER() when doing this. 1114 */ 1115 static void 1116 link_ptp(htable_t *higher, htable_t *new, uintptr_t vaddr) 1117 { 1118 uint_t entry = htable_va2entry(vaddr, higher); 1119 x86pte_t newptp = MAKEPTP(new->ht_pfn, new->ht_level); 1120 x86pte_t found; 1121 1122 ASSERT(higher->ht_busy > 0); 1123 1124 ASSERT(new->ht_level != mmu.max_level); 1125 1126 HTABLE_INC(higher->ht_valid_cnt); 1127 1128 found = x86pte_cas(higher, entry, 0, newptp); 1129 if ((found & ~PT_REF) != 0) 1130 panic("HAT: ptp not 0, found=" FMT_PTE, found); 1131 1132 /* 1133 * When any top level VLP page table entry changes, we must issue 1134 * a reload of cr3 on all processors using it. 1135 * We also need to do this for the kernel hat on PAE 32 bit kernel. 1136 */ 1137 if ( 1138 #ifdef __i386 1139 (higher->ht_hat == kas.a_hat && higher->ht_level == VLP_LEVEL) || 1140 #endif 1141 (higher->ht_flags & HTABLE_VLP)) 1142 hat_tlb_inval(higher->ht_hat, DEMAP_ALL_ADDR); 1143 } 1144 1145 /* 1146 * Release of hold on an htable. If this is the last use and the pagetable 1147 * is empty we may want to free it, then recursively look at the pagetable 1148 * above it. The recursion is handled by the outer while() loop. 1149 * 1150 * On the metal, during process exit, we don't bother unlinking the tables from 1151 * upper level pagetables. They are instead handled in bulk by hat_free_end(). 1152 * We can't do this on the hypervisor as we need the page table to be 1153 * implicitly unpinnned before it goes to the free page lists. This can't 1154 * happen unless we fully unlink it from the page table hierarchy. 1155 */ 1156 void 1157 htable_release(htable_t *ht) 1158 { 1159 uint_t hashval; 1160 htable_t *shared; 1161 htable_t *higher; 1162 hat_t *hat; 1163 uintptr_t va; 1164 level_t level; 1165 1166 while (ht != NULL) { 1167 shared = NULL; 1168 for (;;) { 1169 hat = ht->ht_hat; 1170 va = ht->ht_vaddr; 1171 level = ht->ht_level; 1172 hashval = HTABLE_HASH(hat, va, level); 1173 1174 /* 1175 * The common case is that this isn't the last use of 1176 * an htable so we don't want to free the htable. 1177 */ 1178 HTABLE_ENTER(hashval); 1179 ASSERT(ht->ht_valid_cnt >= 0); 1180 ASSERT(ht->ht_busy > 0); 1181 if (ht->ht_valid_cnt > 0) 1182 break; 1183 if (ht->ht_busy > 1) 1184 break; 1185 ASSERT(ht->ht_lock_cnt == 0); 1186 1187 #if !defined(__xpv) 1188 /* 1189 * we always release empty shared htables 1190 */ 1191 if (!(ht->ht_flags & HTABLE_SHARED_PFN)) { 1192 1193 /* 1194 * don't release if in address space tear down 1195 */ 1196 if (hat->hat_flags & HAT_FREEING) 1197 break; 1198 1199 /* 1200 * At and above max_page_level, free if it's for 1201 * a boot-time kernel mapping below kernelbase. 1202 */ 1203 if (level >= mmu.max_page_level && 1204 (hat != kas.a_hat || va >= kernelbase)) 1205 break; 1206 } 1207 #endif /* __xpv */ 1208 1209 /* 1210 * Remember if we destroy an htable that shares its PFN 1211 * from elsewhere. 1212 */ 1213 if (ht->ht_flags & HTABLE_SHARED_PFN) { 1214 ASSERT(shared == NULL); 1215 shared = ht->ht_shares; 1216 HATSTAT_INC(hs_htable_unshared); 1217 } 1218 1219 /* 1220 * Handle release of a table and freeing the htable_t. 1221 * Unlink it from the table higher (ie. ht_parent). 1222 */ 1223 higher = ht->ht_parent; 1224 ASSERT(higher != NULL); 1225 1226 /* 1227 * Unlink the pagetable. 1228 */ 1229 unlink_ptp(higher, ht, va); 1230 1231 /* 1232 * remove this htable from its hash list 1233 */ 1234 if (ht->ht_next) 1235 ht->ht_next->ht_prev = ht->ht_prev; 1236 1237 if (ht->ht_prev) { 1238 ht->ht_prev->ht_next = ht->ht_next; 1239 } else { 1240 ASSERT(hat->hat_ht_hash[hashval] == ht); 1241 hat->hat_ht_hash[hashval] = ht->ht_next; 1242 } 1243 HTABLE_EXIT(hashval); 1244 htable_free(ht); 1245 ht = higher; 1246 } 1247 1248 ASSERT(ht->ht_busy >= 1); 1249 --ht->ht_busy; 1250 HTABLE_EXIT(hashval); 1251 1252 /* 1253 * If we released a shared htable, do a release on the htable 1254 * from which it shared 1255 */ 1256 ht = shared; 1257 } 1258 } 1259 1260 /* 1261 * Find the htable for the pagetable at the given level for the given address. 1262 * If found acquires a hold that eventually needs to be htable_release()d 1263 */ 1264 htable_t * 1265 htable_lookup(hat_t *hat, uintptr_t vaddr, level_t level) 1266 { 1267 uintptr_t base; 1268 uint_t hashval; 1269 htable_t *ht = NULL; 1270 1271 ASSERT(level >= 0); 1272 ASSERT(level <= TOP_LEVEL(hat)); 1273 1274 if (level == TOP_LEVEL(hat)) { 1275 #if defined(__amd64) 1276 /* 1277 * 32 bit address spaces on 64 bit kernels need to check 1278 * for overflow of the 32 bit address space 1279 */ 1280 if ((hat->hat_flags & HAT_VLP) && vaddr >= ((uint64_t)1 << 32)) 1281 return (NULL); 1282 #endif 1283 base = 0; 1284 } else { 1285 base = vaddr & LEVEL_MASK(level + 1); 1286 } 1287 1288 hashval = HTABLE_HASH(hat, base, level); 1289 HTABLE_ENTER(hashval); 1290 for (ht = hat->hat_ht_hash[hashval]; ht; ht = ht->ht_next) { 1291 if (ht->ht_hat == hat && 1292 ht->ht_vaddr == base && 1293 ht->ht_level == level) 1294 break; 1295 } 1296 if (ht) 1297 ++ht->ht_busy; 1298 1299 HTABLE_EXIT(hashval); 1300 return (ht); 1301 } 1302 1303 /* 1304 * Acquires a hold on a known htable (from a locked hment entry). 1305 */ 1306 void 1307 htable_acquire(htable_t *ht) 1308 { 1309 hat_t *hat = ht->ht_hat; 1310 level_t level = ht->ht_level; 1311 uintptr_t base = ht->ht_vaddr; 1312 uint_t hashval = HTABLE_HASH(hat, base, level); 1313 1314 HTABLE_ENTER(hashval); 1315 #ifdef DEBUG 1316 /* 1317 * make sure the htable is there 1318 */ 1319 { 1320 htable_t *h; 1321 1322 for (h = hat->hat_ht_hash[hashval]; 1323 h && h != ht; 1324 h = h->ht_next) 1325 ; 1326 ASSERT(h == ht); 1327 } 1328 #endif /* DEBUG */ 1329 ++ht->ht_busy; 1330 HTABLE_EXIT(hashval); 1331 } 1332 1333 /* 1334 * Find the htable for the pagetable at the given level for the given address. 1335 * If found acquires a hold that eventually needs to be htable_release()d 1336 * If not found the table is created. 1337 * 1338 * Since we can't hold a hash table mutex during allocation, we have to 1339 * drop it and redo the search on a create. Then we may have to free the newly 1340 * allocated htable if another thread raced in and created it ahead of us. 1341 */ 1342 htable_t * 1343 htable_create( 1344 hat_t *hat, 1345 uintptr_t vaddr, 1346 level_t level, 1347 htable_t *shared) 1348 { 1349 uint_t h; 1350 level_t l; 1351 uintptr_t base; 1352 htable_t *ht; 1353 htable_t *higher = NULL; 1354 htable_t *new = NULL; 1355 1356 if (level < 0 || level > TOP_LEVEL(hat)) 1357 panic("htable_create(): level %d out of range\n", level); 1358 1359 /* 1360 * Create the page tables in top down order. 1361 */ 1362 for (l = TOP_LEVEL(hat); l >= level; --l) { 1363 new = NULL; 1364 if (l == TOP_LEVEL(hat)) 1365 base = 0; 1366 else 1367 base = vaddr & LEVEL_MASK(l + 1); 1368 1369 h = HTABLE_HASH(hat, base, l); 1370 try_again: 1371 /* 1372 * look up the htable at this level 1373 */ 1374 HTABLE_ENTER(h); 1375 if (l == TOP_LEVEL(hat)) { 1376 ht = hat->hat_htable; 1377 } else { 1378 for (ht = hat->hat_ht_hash[h]; ht; ht = ht->ht_next) { 1379 ASSERT(ht->ht_hat == hat); 1380 if (ht->ht_vaddr == base && 1381 ht->ht_level == l) 1382 break; 1383 } 1384 } 1385 1386 /* 1387 * if we found the htable, increment its busy cnt 1388 * and if we had allocated a new htable, free it. 1389 */ 1390 if (ht != NULL) { 1391 /* 1392 * If we find a pre-existing shared table, it must 1393 * share from the same place. 1394 */ 1395 if (l == level && shared && ht->ht_shares && 1396 ht->ht_shares != shared) { 1397 panic("htable shared from wrong place " 1398 "found htable=%p shared=%p", 1399 (void *)ht, (void *)shared); 1400 } 1401 ++ht->ht_busy; 1402 HTABLE_EXIT(h); 1403 if (new) 1404 htable_free(new); 1405 if (higher != NULL) 1406 htable_release(higher); 1407 higher = ht; 1408 1409 /* 1410 * if we didn't find it on the first search 1411 * allocate a new one and search again 1412 */ 1413 } else if (new == NULL) { 1414 HTABLE_EXIT(h); 1415 new = htable_alloc(hat, base, l, 1416 l == level ? shared : NULL); 1417 goto try_again; 1418 1419 /* 1420 * 2nd search and still not there, use "new" table 1421 * Link new table into higher, when not at top level. 1422 */ 1423 } else { 1424 ht = new; 1425 if (higher != NULL) { 1426 link_ptp(higher, ht, base); 1427 ht->ht_parent = higher; 1428 } 1429 ht->ht_next = hat->hat_ht_hash[h]; 1430 ASSERT(ht->ht_prev == NULL); 1431 if (hat->hat_ht_hash[h]) 1432 hat->hat_ht_hash[h]->ht_prev = ht; 1433 hat->hat_ht_hash[h] = ht; 1434 HTABLE_EXIT(h); 1435 1436 /* 1437 * Note we don't do htable_release(higher). 1438 * That happens recursively when "new" is removed by 1439 * htable_release() or htable_steal(). 1440 */ 1441 higher = ht; 1442 1443 /* 1444 * If we just created a new shared page table we 1445 * increment the shared htable's busy count, so that 1446 * it can't be the victim of a steal even if it's empty. 1447 */ 1448 if (l == level && shared) { 1449 (void) htable_lookup(shared->ht_hat, 1450 shared->ht_vaddr, shared->ht_level); 1451 HATSTAT_INC(hs_htable_shared); 1452 } 1453 } 1454 } 1455 1456 return (ht); 1457 } 1458 1459 /* 1460 * Inherit initial pagetables from the boot program. On the 64-bit 1461 * hypervisor we also temporarily mark the p_index field of page table 1462 * pages, so we know not to try making them writable in seg_kpm. 1463 */ 1464 void 1465 htable_attach( 1466 hat_t *hat, 1467 uintptr_t base, 1468 level_t level, 1469 htable_t *parent, 1470 pfn_t pfn) 1471 { 1472 htable_t *ht; 1473 uint_t h; 1474 uint_t i; 1475 x86pte_t pte; 1476 x86pte_t *ptep; 1477 page_t *pp; 1478 extern page_t *boot_claim_page(pfn_t); 1479 1480 ht = htable_get_reserve(); 1481 if (level == mmu.max_level) 1482 kas.a_hat->hat_htable = ht; 1483 ht->ht_hat = hat; 1484 ht->ht_parent = parent; 1485 ht->ht_vaddr = base; 1486 ht->ht_level = level; 1487 ht->ht_busy = 1; 1488 ht->ht_next = NULL; 1489 ht->ht_prev = NULL; 1490 ht->ht_flags = 0; 1491 ht->ht_pfn = pfn; 1492 ht->ht_lock_cnt = 0; 1493 ht->ht_valid_cnt = 0; 1494 if (parent != NULL) 1495 ++parent->ht_busy; 1496 1497 h = HTABLE_HASH(hat, base, level); 1498 HTABLE_ENTER(h); 1499 ht->ht_next = hat->hat_ht_hash[h]; 1500 ASSERT(ht->ht_prev == NULL); 1501 if (hat->hat_ht_hash[h]) 1502 hat->hat_ht_hash[h]->ht_prev = ht; 1503 hat->hat_ht_hash[h] = ht; 1504 HTABLE_EXIT(h); 1505 1506 /* 1507 * make sure the page table physical page is not FREE 1508 */ 1509 if (page_resv(1, KM_NOSLEEP) == 0) 1510 panic("page_resv() failed in ptable alloc"); 1511 1512 pp = boot_claim_page(pfn); 1513 ASSERT(pp != NULL); 1514 1515 /* 1516 * Page table pages that were allocated by dboot or 1517 * in very early startup didn't go through boot_mapin() 1518 * and so won't have vnode/offsets. Fix that here. 1519 */ 1520 if (pp->p_vnode == NULL) { 1521 /* match offset calculation in page_get_physical() */ 1522 u_offset_t offset = (uintptr_t)ht; 1523 if (offset > kernelbase) 1524 offset -= kernelbase; 1525 offset <<= MMU_PAGESHIFT; 1526 #if defined(__amd64) 1527 offset += mmu.hole_start; /* something in VA hole */ 1528 #else 1529 offset += 1ULL << 40; /* something > 4 Gig */ 1530 #endif 1531 ASSERT(page_exists(&kvp, offset) == NULL); 1532 (void) page_hashin(pp, &kvp, offset, NULL); 1533 } 1534 page_downgrade(pp); 1535 #if defined(__xpv) && defined(__amd64) 1536 /* 1537 * Record in the page_t that is a pagetable for segkpm setup. 1538 */ 1539 if (kpm_vbase) 1540 pp->p_index = 1; 1541 #endif 1542 1543 /* 1544 * Count valid mappings and recursively attach lower level pagetables. 1545 */ 1546 ptep = kbm_remap_window(pfn_to_pa(pfn), 0); 1547 for (i = 0; i < HTABLE_NUM_PTES(ht); ++i) { 1548 if (mmu.pae_hat) 1549 pte = ptep[i]; 1550 else 1551 pte = ((x86pte32_t *)ptep)[i]; 1552 if (!IN_HYPERVISOR_VA(base) && PTE_ISVALID(pte)) { 1553 ++ht->ht_valid_cnt; 1554 if (!PTE_ISPAGE(pte, level)) { 1555 htable_attach(hat, base, level - 1, 1556 ht, PTE2PFN(pte, level)); 1557 ptep = kbm_remap_window(pfn_to_pa(pfn), 0); 1558 } 1559 } 1560 base += LEVEL_SIZE(level); 1561 if (base == mmu.hole_start) 1562 base = (mmu.hole_end + MMU_PAGEOFFSET) & MMU_PAGEMASK; 1563 } 1564 1565 /* 1566 * As long as all the mappings we had were below kernel base 1567 * we can release the htable. 1568 */ 1569 if (base < kernelbase) 1570 htable_release(ht); 1571 } 1572 1573 /* 1574 * Walk through a given htable looking for the first valid entry. This 1575 * routine takes both a starting and ending address. The starting address 1576 * is required to be within the htable provided by the caller, but there is 1577 * no such restriction on the ending address. 1578 * 1579 * If the routine finds a valid entry in the htable (at or beyond the 1580 * starting address), the PTE (and its address) will be returned. 1581 * This PTE may correspond to either a page or a pagetable - it is the 1582 * caller's responsibility to determine which. If no valid entry is 1583 * found, 0 (and invalid PTE) and the next unexamined address will be 1584 * returned. 1585 * 1586 * The loop has been carefully coded for optimization. 1587 */ 1588 static x86pte_t 1589 htable_scan(htable_t *ht, uintptr_t *vap, uintptr_t eaddr) 1590 { 1591 uint_t e; 1592 x86pte_t found_pte = (x86pte_t)0; 1593 caddr_t pte_ptr; 1594 caddr_t end_pte_ptr; 1595 int l = ht->ht_level; 1596 uintptr_t va = *vap & LEVEL_MASK(l); 1597 size_t pgsize = LEVEL_SIZE(l); 1598 1599 ASSERT(va >= ht->ht_vaddr); 1600 ASSERT(va <= HTABLE_LAST_PAGE(ht)); 1601 1602 /* 1603 * Compute the starting index and ending virtual address 1604 */ 1605 e = htable_va2entry(va, ht); 1606 1607 /* 1608 * The following page table scan code knows that the valid 1609 * bit of a PTE is in the lowest byte AND that x86 is little endian!! 1610 */ 1611 pte_ptr = (caddr_t)x86pte_access_pagetable(ht, 0); 1612 end_pte_ptr = (caddr_t)PT_INDEX_PTR(pte_ptr, HTABLE_NUM_PTES(ht)); 1613 pte_ptr = (caddr_t)PT_INDEX_PTR((x86pte_t *)pte_ptr, e); 1614 while (!PTE_ISVALID(*pte_ptr)) { 1615 va += pgsize; 1616 if (va >= eaddr) 1617 break; 1618 pte_ptr += mmu.pte_size; 1619 ASSERT(pte_ptr <= end_pte_ptr); 1620 if (pte_ptr == end_pte_ptr) 1621 break; 1622 } 1623 1624 /* 1625 * if we found a valid PTE, load the entire PTE 1626 */ 1627 if (va < eaddr && pte_ptr != end_pte_ptr) 1628 found_pte = GET_PTE((x86pte_t *)pte_ptr); 1629 x86pte_release_pagetable(ht); 1630 1631 #if defined(__amd64) 1632 /* 1633 * deal with VA hole on amd64 1634 */ 1635 if (l == mmu.max_level && va >= mmu.hole_start && va <= mmu.hole_end) 1636 va = mmu.hole_end + va - mmu.hole_start; 1637 #endif /* __amd64 */ 1638 1639 *vap = va; 1640 return (found_pte); 1641 } 1642 1643 /* 1644 * Find the address and htable for the first populated translation at or 1645 * above the given virtual address. The caller may also specify an upper 1646 * limit to the address range to search. Uses level information to quickly 1647 * skip unpopulated sections of virtual address spaces. 1648 * 1649 * If not found returns NULL. When found, returns the htable and virt addr 1650 * and has a hold on the htable. 1651 */ 1652 x86pte_t 1653 htable_walk( 1654 struct hat *hat, 1655 htable_t **htp, 1656 uintptr_t *vaddr, 1657 uintptr_t eaddr) 1658 { 1659 uintptr_t va = *vaddr; 1660 htable_t *ht; 1661 htable_t *prev = *htp; 1662 level_t l; 1663 level_t max_mapped_level; 1664 x86pte_t pte; 1665 1666 ASSERT(eaddr > va); 1667 1668 /* 1669 * If this is a user address, then we know we need not look beyond 1670 * kernelbase. 1671 */ 1672 ASSERT(hat == kas.a_hat || eaddr <= kernelbase || 1673 eaddr == HTABLE_WALK_TO_END); 1674 if (hat != kas.a_hat && eaddr == HTABLE_WALK_TO_END) 1675 eaddr = kernelbase; 1676 1677 /* 1678 * If we're coming in with a previous page table, search it first 1679 * without doing an htable_lookup(), this should be frequent. 1680 */ 1681 if (prev) { 1682 ASSERT(prev->ht_busy > 0); 1683 ASSERT(prev->ht_vaddr <= va); 1684 l = prev->ht_level; 1685 if (va <= HTABLE_LAST_PAGE(prev)) { 1686 pte = htable_scan(prev, &va, eaddr); 1687 1688 if (PTE_ISPAGE(pte, l)) { 1689 *vaddr = va; 1690 *htp = prev; 1691 return (pte); 1692 } 1693 } 1694 1695 /* 1696 * We found nothing in the htable provided by the caller, 1697 * so fall through and do the full search 1698 */ 1699 htable_release(prev); 1700 } 1701 1702 /* 1703 * Find the level of the largest pagesize used by this HAT. 1704 */ 1705 if (hat->hat_ism_pgcnt > 0) { 1706 max_mapped_level = mmu.umax_page_level; 1707 } else { 1708 max_mapped_level = 0; 1709 for (l = 1; l <= mmu.max_page_level; ++l) 1710 if (hat->hat_pages_mapped[l] != 0) 1711 max_mapped_level = l; 1712 } 1713 1714 while (va < eaddr && va >= *vaddr) { 1715 ASSERT(!IN_VA_HOLE(va)); 1716 1717 /* 1718 * Find lowest table with any entry for given address. 1719 */ 1720 for (l = 0; l <= TOP_LEVEL(hat); ++l) { 1721 ht = htable_lookup(hat, va, l); 1722 if (ht != NULL) { 1723 pte = htable_scan(ht, &va, eaddr); 1724 if (PTE_ISPAGE(pte, l)) { 1725 *vaddr = va; 1726 *htp = ht; 1727 return (pte); 1728 } 1729 htable_release(ht); 1730 break; 1731 } 1732 1733 /* 1734 * No htable at this level for the address. If there 1735 * is no larger page size that could cover it, we can 1736 * skip right to the start of the next page table. 1737 */ 1738 ASSERT(l < TOP_LEVEL(hat)); 1739 if (l >= max_mapped_level) { 1740 va = NEXT_ENTRY_VA(va, l + 1); 1741 if (va >= eaddr) 1742 break; 1743 } 1744 } 1745 } 1746 1747 *vaddr = 0; 1748 *htp = NULL; 1749 return (0); 1750 } 1751 1752 /* 1753 * Find the htable and page table entry index of the given virtual address 1754 * with pagesize at or below given level. 1755 * If not found returns NULL. When found, returns the htable, sets 1756 * entry, and has a hold on the htable. 1757 */ 1758 htable_t * 1759 htable_getpte( 1760 struct hat *hat, 1761 uintptr_t vaddr, 1762 uint_t *entry, 1763 x86pte_t *pte, 1764 level_t level) 1765 { 1766 htable_t *ht; 1767 level_t l; 1768 uint_t e; 1769 1770 ASSERT(level <= mmu.max_page_level); 1771 1772 for (l = 0; l <= level; ++l) { 1773 ht = htable_lookup(hat, vaddr, l); 1774 if (ht == NULL) 1775 continue; 1776 e = htable_va2entry(vaddr, ht); 1777 if (entry != NULL) 1778 *entry = e; 1779 if (pte != NULL) 1780 *pte = x86pte_get(ht, e); 1781 return (ht); 1782 } 1783 return (NULL); 1784 } 1785 1786 /* 1787 * Find the htable and page table entry index of the given virtual address. 1788 * There must be a valid page mapped at the given address. 1789 * If not found returns NULL. When found, returns the htable, sets 1790 * entry, and has a hold on the htable. 1791 */ 1792 htable_t * 1793 htable_getpage(struct hat *hat, uintptr_t vaddr, uint_t *entry) 1794 { 1795 htable_t *ht; 1796 uint_t e; 1797 x86pte_t pte; 1798 1799 ht = htable_getpte(hat, vaddr, &e, &pte, mmu.max_page_level); 1800 if (ht == NULL) 1801 return (NULL); 1802 1803 if (entry) 1804 *entry = e; 1805 1806 if (PTE_ISPAGE(pte, ht->ht_level)) 1807 return (ht); 1808 htable_release(ht); 1809 return (NULL); 1810 } 1811 1812 1813 void 1814 htable_init() 1815 { 1816 /* 1817 * To save on kernel VA usage, we avoid debug information in 32 bit 1818 * kernels. 1819 */ 1820 #if defined(__amd64) 1821 int kmem_flags = KMC_NOHASH; 1822 #elif defined(__i386) 1823 int kmem_flags = KMC_NOHASH | KMC_NODEBUG; 1824 #endif 1825 1826 /* 1827 * initialize kmem caches 1828 */ 1829 htable_cache = kmem_cache_create("htable_t", 1830 sizeof (htable_t), 0, NULL, NULL, 1831 htable_reap, NULL, hat_memload_arena, kmem_flags); 1832 } 1833 1834 /* 1835 * get the pte index for the virtual address in the given htable's pagetable 1836 */ 1837 uint_t 1838 htable_va2entry(uintptr_t va, htable_t *ht) 1839 { 1840 level_t l = ht->ht_level; 1841 1842 ASSERT(va >= ht->ht_vaddr); 1843 ASSERT(va <= HTABLE_LAST_PAGE(ht)); 1844 return ((va >> LEVEL_SHIFT(l)) & (HTABLE_NUM_PTES(ht) - 1)); 1845 } 1846 1847 /* 1848 * Given an htable and the index of a pte in it, return the virtual address 1849 * of the page. 1850 */ 1851 uintptr_t 1852 htable_e2va(htable_t *ht, uint_t entry) 1853 { 1854 level_t l = ht->ht_level; 1855 uintptr_t va; 1856 1857 ASSERT(entry < HTABLE_NUM_PTES(ht)); 1858 va = ht->ht_vaddr + ((uintptr_t)entry << LEVEL_SHIFT(l)); 1859 1860 /* 1861 * Need to skip over any VA hole in top level table 1862 */ 1863 #if defined(__amd64) 1864 if (ht->ht_level == mmu.max_level && va >= mmu.hole_start) 1865 va += ((mmu.hole_end - mmu.hole_start) + 1); 1866 #endif 1867 1868 return (va); 1869 } 1870 1871 /* 1872 * The code uses compare and swap instructions to read/write PTE's to 1873 * avoid atomicity problems, since PTEs can be 8 bytes on 32 bit systems. 1874 * will naturally be atomic. 1875 * 1876 * The combination of using kpreempt_disable()/_enable() and the hci_mutex 1877 * are used to ensure that an interrupt won't overwrite a temporary mapping 1878 * while it's in use. If an interrupt thread tries to access a PTE, it will 1879 * yield briefly back to the pinned thread which holds the cpu's hci_mutex. 1880 */ 1881 void 1882 x86pte_cpu_init(cpu_t *cpu) 1883 { 1884 struct hat_cpu_info *hci; 1885 1886 hci = kmem_zalloc(sizeof (*hci), KM_SLEEP); 1887 mutex_init(&hci->hci_mutex, NULL, MUTEX_DEFAULT, NULL); 1888 cpu->cpu_hat_info = hci; 1889 } 1890 1891 void 1892 x86pte_cpu_fini(cpu_t *cpu) 1893 { 1894 struct hat_cpu_info *hci = cpu->cpu_hat_info; 1895 1896 kmem_free(hci, sizeof (*hci)); 1897 cpu->cpu_hat_info = NULL; 1898 } 1899 1900 #ifdef __i386 1901 /* 1902 * On 32 bit kernels, loading a 64 bit PTE is a little tricky 1903 */ 1904 x86pte_t 1905 get_pte64(x86pte_t *ptr) 1906 { 1907 volatile uint32_t *p = (uint32_t *)ptr; 1908 x86pte_t t; 1909 1910 ASSERT(mmu.pae_hat != 0); 1911 for (;;) { 1912 t = p[0]; 1913 t |= (uint64_t)p[1] << 32; 1914 if ((t & 0xffffffff) == p[0]) 1915 return (t); 1916 } 1917 } 1918 #endif /* __i386 */ 1919 1920 /* 1921 * Disable preemption and establish a mapping to the pagetable with the 1922 * given pfn. This is optimized for there case where it's the same 1923 * pfn as we last used referenced from this CPU. 1924 */ 1925 static x86pte_t * 1926 x86pte_access_pagetable(htable_t *ht, uint_t index) 1927 { 1928 /* 1929 * VLP pagetables are contained in the hat_t 1930 */ 1931 if (ht->ht_flags & HTABLE_VLP) 1932 return (PT_INDEX_PTR(ht->ht_hat->hat_vlp_ptes, index)); 1933 return (x86pte_mapin(ht->ht_pfn, index, ht)); 1934 } 1935 1936 /* 1937 * map the given pfn into the page table window. 1938 */ 1939 /*ARGSUSED*/ 1940 x86pte_t * 1941 x86pte_mapin(pfn_t pfn, uint_t index, htable_t *ht) 1942 { 1943 x86pte_t *pteptr; 1944 x86pte_t pte = 0; 1945 x86pte_t newpte; 1946 int x; 1947 1948 ASSERT(pfn != PFN_INVALID); 1949 1950 if (!khat_running) { 1951 caddr_t va = kbm_remap_window(pfn_to_pa(pfn), 1); 1952 return (PT_INDEX_PTR(va, index)); 1953 } 1954 1955 /* 1956 * If kpm is available, use it. 1957 */ 1958 if (kpm_vbase) 1959 return (PT_INDEX_PTR(hat_kpm_pfn2va(pfn), index)); 1960 1961 /* 1962 * Disable preemption and grab the CPU's hci_mutex 1963 */ 1964 kpreempt_disable(); 1965 ASSERT(CPU->cpu_hat_info != NULL); 1966 mutex_enter(&CPU->cpu_hat_info->hci_mutex); 1967 x = PWIN_TABLE(CPU->cpu_id); 1968 pteptr = (x86pte_t *)PWIN_PTE_VA(x); 1969 #ifndef __xpv 1970 if (mmu.pae_hat) 1971 pte = *pteptr; 1972 else 1973 pte = *(x86pte32_t *)pteptr; 1974 #endif 1975 1976 newpte = MAKEPTE(pfn, 0) | mmu.pt_global | mmu.pt_nx; 1977 1978 /* 1979 * For hardware we can use a writable mapping. 1980 */ 1981 #ifdef __xpv 1982 if (IN_XPV_PANIC()) 1983 #endif 1984 newpte |= PT_WRITABLE; 1985 1986 if (!PTE_EQUIV(newpte, pte)) { 1987 1988 #ifdef __xpv 1989 if (!IN_XPV_PANIC()) { 1990 xen_map(newpte, PWIN_VA(x)); 1991 } else 1992 #endif 1993 { 1994 XPV_ALLOW_PAGETABLE_UPDATES(); 1995 if (mmu.pae_hat) 1996 *pteptr = newpte; 1997 else 1998 *(x86pte32_t *)pteptr = newpte; 1999 XPV_DISALLOW_PAGETABLE_UPDATES(); 2000 mmu_tlbflush_entry((caddr_t)(PWIN_VA(x))); 2001 } 2002 } 2003 return (PT_INDEX_PTR(PWIN_VA(x), index)); 2004 } 2005 2006 /* 2007 * Release access to a page table. 2008 */ 2009 static void 2010 x86pte_release_pagetable(htable_t *ht) 2011 { 2012 /* 2013 * nothing to do for VLP htables 2014 */ 2015 if (ht->ht_flags & HTABLE_VLP) 2016 return; 2017 2018 x86pte_mapout(); 2019 } 2020 2021 void 2022 x86pte_mapout(void) 2023 { 2024 if (kpm_vbase != NULL || !khat_running) 2025 return; 2026 2027 /* 2028 * Drop the CPU's hci_mutex and restore preemption. 2029 */ 2030 #ifdef __xpv 2031 if (!IN_XPV_PANIC()) { 2032 uintptr_t va; 2033 2034 /* 2035 * We need to always clear the mapping in case a page 2036 * that was once a page table page is ballooned out. 2037 */ 2038 va = (uintptr_t)PWIN_VA(PWIN_TABLE(CPU->cpu_id)); 2039 (void) HYPERVISOR_update_va_mapping(va, 0, 2040 UVMF_INVLPG | UVMF_LOCAL); 2041 } 2042 #endif 2043 mutex_exit(&CPU->cpu_hat_info->hci_mutex); 2044 kpreempt_enable(); 2045 } 2046 2047 /* 2048 * Atomic retrieval of a pagetable entry 2049 */ 2050 x86pte_t 2051 x86pte_get(htable_t *ht, uint_t entry) 2052 { 2053 x86pte_t pte; 2054 x86pte_t *ptep; 2055 2056 /* 2057 * Be careful that loading PAE entries in 32 bit kernel is atomic. 2058 */ 2059 ASSERT(entry < mmu.ptes_per_table); 2060 ptep = x86pte_access_pagetable(ht, entry); 2061 pte = GET_PTE(ptep); 2062 x86pte_release_pagetable(ht); 2063 return (pte); 2064 } 2065 2066 /* 2067 * Atomic unconditional set of a page table entry, it returns the previous 2068 * value. For pre-existing mappings if the PFN changes, then we don't care 2069 * about the old pte's REF / MOD bits. If the PFN remains the same, we leave 2070 * the MOD/REF bits unchanged. 2071 * 2072 * If asked to overwrite a link to a lower page table with a large page 2073 * mapping, this routine returns the special value of LPAGE_ERROR. This 2074 * allows the upper HAT layers to retry with a smaller mapping size. 2075 */ 2076 x86pte_t 2077 x86pte_set(htable_t *ht, uint_t entry, x86pte_t new, void *ptr) 2078 { 2079 x86pte_t old; 2080 x86pte_t prev; 2081 x86pte_t *ptep; 2082 level_t l = ht->ht_level; 2083 x86pte_t pfn_mask = (l != 0) ? PT_PADDR_LGPG : PT_PADDR; 2084 x86pte_t n; 2085 uintptr_t addr = htable_e2va(ht, entry); 2086 hat_t *hat = ht->ht_hat; 2087 2088 ASSERT(new != 0); /* don't use to invalidate a PTE, see x86pte_update */ 2089 ASSERT(!(ht->ht_flags & HTABLE_SHARED_PFN)); 2090 if (ptr == NULL) 2091 ptep = x86pte_access_pagetable(ht, entry); 2092 else 2093 ptep = ptr; 2094 2095 /* 2096 * Install the new PTE. If remapping the same PFN, then 2097 * copy existing REF/MOD bits to new mapping. 2098 */ 2099 do { 2100 prev = GET_PTE(ptep); 2101 n = new; 2102 if (PTE_ISVALID(n) && (prev & pfn_mask) == (new & pfn_mask)) 2103 n |= prev & (PT_REF | PT_MOD); 2104 2105 /* 2106 * Another thread may have installed this mapping already, 2107 * flush the local TLB and be done. 2108 */ 2109 if (prev == n) { 2110 old = new; 2111 #ifdef __xpv 2112 if (!IN_XPV_PANIC()) 2113 xen_flush_va((caddr_t)addr); 2114 else 2115 #endif 2116 mmu_tlbflush_entry((caddr_t)addr); 2117 goto done; 2118 } 2119 2120 /* 2121 * Detect if we have a collision of installing a large 2122 * page mapping where there already is a lower page table. 2123 */ 2124 if (l > 0 && (prev & PT_VALID) && !(prev & PT_PAGESIZE)) { 2125 old = LPAGE_ERROR; 2126 goto done; 2127 } 2128 2129 XPV_ALLOW_PAGETABLE_UPDATES(); 2130 old = CAS_PTE(ptep, prev, n); 2131 XPV_DISALLOW_PAGETABLE_UPDATES(); 2132 } while (old != prev); 2133 2134 /* 2135 * Do a TLB demap if needed, ie. the old pte was valid. 2136 * 2137 * Note that a stale TLB writeback to the PTE here either can't happen 2138 * or doesn't matter. The PFN can only change for NOSYNC|NOCONSIST 2139 * mappings, but they were created with REF and MOD already set, so 2140 * no stale writeback will happen. 2141 * 2142 * Segmap is the only place where remaps happen on the same pfn and for 2143 * that we want to preserve the stale REF/MOD bits. 2144 */ 2145 if (old & PT_REF) 2146 hat_tlb_inval(hat, addr); 2147 2148 done: 2149 if (ptr == NULL) 2150 x86pte_release_pagetable(ht); 2151 return (old); 2152 } 2153 2154 /* 2155 * Atomic compare and swap of a page table entry. No TLB invalidates are done. 2156 * This is used for links between pagetables of different levels. 2157 * Note we always create these links with dirty/access set, so they should 2158 * never change. 2159 */ 2160 x86pte_t 2161 x86pte_cas(htable_t *ht, uint_t entry, x86pte_t old, x86pte_t new) 2162 { 2163 x86pte_t pte; 2164 x86pte_t *ptep; 2165 #ifdef __xpv 2166 /* 2167 * We can't use writable pagetables for upper level tables, so fake it. 2168 */ 2169 mmu_update_t t[2]; 2170 int cnt = 1; 2171 int count; 2172 maddr_t ma; 2173 2174 if (!IN_XPV_PANIC()) { 2175 ASSERT(!(ht->ht_flags & HTABLE_VLP)); /* no VLP yet */ 2176 ma = pa_to_ma(PT_INDEX_PHYSADDR(pfn_to_pa(ht->ht_pfn), entry)); 2177 t[0].ptr = ma | MMU_NORMAL_PT_UPDATE; 2178 t[0].val = new; 2179 2180 #if defined(__amd64) 2181 /* 2182 * On the 64-bit hypervisor we need to maintain the user mode 2183 * top page table too. 2184 */ 2185 if (ht->ht_level == mmu.max_level && ht->ht_hat != kas.a_hat) { 2186 ma = pa_to_ma(PT_INDEX_PHYSADDR(pfn_to_pa( 2187 ht->ht_hat->hat_user_ptable), entry)); 2188 t[1].ptr = ma | MMU_NORMAL_PT_UPDATE; 2189 t[1].val = new; 2190 ++cnt; 2191 } 2192 #endif /* __amd64 */ 2193 2194 if (HYPERVISOR_mmu_update(t, cnt, &count, DOMID_SELF)) 2195 panic("HYPERVISOR_mmu_update() failed"); 2196 ASSERT(count == cnt); 2197 return (old); 2198 } 2199 #endif 2200 ptep = x86pte_access_pagetable(ht, entry); 2201 XPV_ALLOW_PAGETABLE_UPDATES(); 2202 pte = CAS_PTE(ptep, old, new); 2203 XPV_DISALLOW_PAGETABLE_UPDATES(); 2204 x86pte_release_pagetable(ht); 2205 return (pte); 2206 } 2207 2208 /* 2209 * Invalidate a page table entry as long as it currently maps something that 2210 * matches the value determined by expect. 2211 * 2212 * Also invalidates any TLB entries and returns the previous value of the PTE. 2213 */ 2214 x86pte_t 2215 x86pte_inval( 2216 htable_t *ht, 2217 uint_t entry, 2218 x86pte_t expect, 2219 x86pte_t *pte_ptr) 2220 { 2221 x86pte_t *ptep; 2222 x86pte_t oldpte; 2223 x86pte_t found; 2224 2225 ASSERT(!(ht->ht_flags & HTABLE_SHARED_PFN)); 2226 ASSERT(ht->ht_level <= mmu.max_page_level); 2227 2228 if (pte_ptr != NULL) 2229 ptep = pte_ptr; 2230 else 2231 ptep = x86pte_access_pagetable(ht, entry); 2232 2233 #if defined(__xpv) 2234 /* 2235 * If exit()ing just use HYPERVISOR_mmu_update(), as we can't be racing 2236 * with anything else. 2237 */ 2238 if ((ht->ht_hat->hat_flags & HAT_FREEING) && !IN_XPV_PANIC()) { 2239 int count; 2240 mmu_update_t t[1]; 2241 maddr_t ma; 2242 2243 oldpte = GET_PTE(ptep); 2244 if (expect != 0 && (oldpte & PT_PADDR) != (expect & PT_PADDR)) 2245 goto done; 2246 ma = pa_to_ma(PT_INDEX_PHYSADDR(pfn_to_pa(ht->ht_pfn), entry)); 2247 t[0].ptr = ma | MMU_NORMAL_PT_UPDATE; 2248 t[0].val = 0; 2249 if (HYPERVISOR_mmu_update(t, 1, &count, DOMID_SELF)) 2250 panic("HYPERVISOR_mmu_update() failed"); 2251 ASSERT(count == 1); 2252 goto done; 2253 } 2254 #endif /* __xpv */ 2255 2256 /* 2257 * Note that the loop is needed to handle changes due to h/w updating 2258 * of PT_MOD/PT_REF. 2259 */ 2260 do { 2261 oldpte = GET_PTE(ptep); 2262 if (expect != 0 && (oldpte & PT_PADDR) != (expect & PT_PADDR)) 2263 goto done; 2264 XPV_ALLOW_PAGETABLE_UPDATES(); 2265 found = CAS_PTE(ptep, oldpte, 0); 2266 XPV_DISALLOW_PAGETABLE_UPDATES(); 2267 } while (found != oldpte); 2268 if (oldpte & (PT_REF | PT_MOD)) 2269 hat_tlb_inval(ht->ht_hat, htable_e2va(ht, entry)); 2270 2271 done: 2272 if (pte_ptr == NULL) 2273 x86pte_release_pagetable(ht); 2274 return (oldpte); 2275 } 2276 2277 /* 2278 * Change a page table entry af it currently matches the value in expect. 2279 */ 2280 x86pte_t 2281 x86pte_update( 2282 htable_t *ht, 2283 uint_t entry, 2284 x86pte_t expect, 2285 x86pte_t new) 2286 { 2287 x86pte_t *ptep; 2288 x86pte_t found; 2289 2290 ASSERT(new != 0); 2291 ASSERT(!(ht->ht_flags & HTABLE_SHARED_PFN)); 2292 ASSERT(ht->ht_level <= mmu.max_page_level); 2293 2294 ptep = x86pte_access_pagetable(ht, entry); 2295 XPV_ALLOW_PAGETABLE_UPDATES(); 2296 found = CAS_PTE(ptep, expect, new); 2297 XPV_DISALLOW_PAGETABLE_UPDATES(); 2298 if (found == expect) { 2299 hat_tlb_inval(ht->ht_hat, htable_e2va(ht, entry)); 2300 2301 /* 2302 * When removing write permission *and* clearing the 2303 * MOD bit, check if a write happened via a stale 2304 * TLB entry before the TLB shootdown finished. 2305 * 2306 * If it did happen, simply re-enable write permission and 2307 * act like the original CAS failed. 2308 */ 2309 if ((expect & (PT_WRITABLE | PT_MOD)) == PT_WRITABLE && 2310 (new & (PT_WRITABLE | PT_MOD)) == 0 && 2311 (GET_PTE(ptep) & PT_MOD) != 0) { 2312 do { 2313 found = GET_PTE(ptep); 2314 XPV_ALLOW_PAGETABLE_UPDATES(); 2315 found = 2316 CAS_PTE(ptep, found, found | PT_WRITABLE); 2317 XPV_DISALLOW_PAGETABLE_UPDATES(); 2318 } while ((found & PT_WRITABLE) == 0); 2319 } 2320 } 2321 x86pte_release_pagetable(ht); 2322 return (found); 2323 } 2324 2325 #ifndef __xpv 2326 /* 2327 * Copy page tables - this is just a little more complicated than the 2328 * previous routines. Note that it's also not atomic! It also is never 2329 * used for VLP pagetables. 2330 */ 2331 void 2332 x86pte_copy(htable_t *src, htable_t *dest, uint_t entry, uint_t count) 2333 { 2334 caddr_t src_va; 2335 caddr_t dst_va; 2336 size_t size; 2337 x86pte_t *pteptr; 2338 x86pte_t pte; 2339 2340 ASSERT(khat_running); 2341 ASSERT(!(dest->ht_flags & HTABLE_VLP)); 2342 ASSERT(!(src->ht_flags & HTABLE_VLP)); 2343 ASSERT(!(src->ht_flags & HTABLE_SHARED_PFN)); 2344 ASSERT(!(dest->ht_flags & HTABLE_SHARED_PFN)); 2345 2346 /* 2347 * Acquire access to the CPU pagetable windows for the dest and source. 2348 */ 2349 dst_va = (caddr_t)x86pte_access_pagetable(dest, entry); 2350 if (kpm_vbase) { 2351 src_va = (caddr_t) 2352 PT_INDEX_PTR(hat_kpm_pfn2va(src->ht_pfn), entry); 2353 } else { 2354 uint_t x = PWIN_SRC(CPU->cpu_id); 2355 2356 /* 2357 * Finish defining the src pagetable mapping 2358 */ 2359 src_va = (caddr_t)PT_INDEX_PTR(PWIN_VA(x), entry); 2360 pte = MAKEPTE(src->ht_pfn, 0) | mmu.pt_global | mmu.pt_nx; 2361 pteptr = (x86pte_t *)PWIN_PTE_VA(x); 2362 if (mmu.pae_hat) 2363 *pteptr = pte; 2364 else 2365 *(x86pte32_t *)pteptr = pte; 2366 mmu_tlbflush_entry((caddr_t)(PWIN_VA(x))); 2367 } 2368 2369 /* 2370 * now do the copy 2371 */ 2372 size = count << mmu.pte_size_shift; 2373 bcopy(src_va, dst_va, size); 2374 2375 x86pte_release_pagetable(dest); 2376 } 2377 2378 #else /* __xpv */ 2379 2380 /* 2381 * The hypervisor only supports writable pagetables at level 0, so we have 2382 * to install these 1 by 1 the slow way. 2383 */ 2384 void 2385 x86pte_copy(htable_t *src, htable_t *dest, uint_t entry, uint_t count) 2386 { 2387 caddr_t src_va; 2388 x86pte_t pte; 2389 2390 ASSERT(!IN_XPV_PANIC()); 2391 src_va = (caddr_t)x86pte_access_pagetable(src, entry); 2392 while (count) { 2393 if (mmu.pae_hat) 2394 pte = *(x86pte_t *)src_va; 2395 else 2396 pte = *(x86pte32_t *)src_va; 2397 if (pte != 0) { 2398 set_pteval(pfn_to_pa(dest->ht_pfn), entry, 2399 dest->ht_level, pte); 2400 #ifdef __amd64 2401 if (dest->ht_level == mmu.max_level && 2402 htable_e2va(dest, entry) < HYPERVISOR_VIRT_END) 2403 set_pteval( 2404 pfn_to_pa(dest->ht_hat->hat_user_ptable), 2405 entry, dest->ht_level, pte); 2406 #endif 2407 } 2408 --count; 2409 ++entry; 2410 src_va += mmu.pte_size; 2411 } 2412 x86pte_release_pagetable(src); 2413 } 2414 #endif /* __xpv */ 2415 2416 /* 2417 * Zero page table entries - Note this doesn't use atomic stores! 2418 */ 2419 static void 2420 x86pte_zero(htable_t *dest, uint_t entry, uint_t count) 2421 { 2422 caddr_t dst_va; 2423 size_t size; 2424 #ifdef __xpv 2425 int x; 2426 x86pte_t newpte; 2427 #endif 2428 2429 /* 2430 * Map in the page table to be zeroed. 2431 */ 2432 ASSERT(!(dest->ht_flags & HTABLE_SHARED_PFN)); 2433 ASSERT(!(dest->ht_flags & HTABLE_VLP)); 2434 2435 /* 2436 * On the hypervisor we don't use x86pte_access_pagetable() since 2437 * in this case the page is not pinned yet. 2438 */ 2439 #ifdef __xpv 2440 if (kpm_vbase == NULL) { 2441 kpreempt_disable(); 2442 ASSERT(CPU->cpu_hat_info != NULL); 2443 mutex_enter(&CPU->cpu_hat_info->hci_mutex); 2444 x = PWIN_TABLE(CPU->cpu_id); 2445 newpte = MAKEPTE(dest->ht_pfn, 0) | PT_WRITABLE; 2446 xen_map(newpte, PWIN_VA(x)); 2447 dst_va = (caddr_t)PT_INDEX_PTR(PWIN_VA(x), entry); 2448 } else 2449 #endif 2450 dst_va = (caddr_t)x86pte_access_pagetable(dest, entry); 2451 2452 size = count << mmu.pte_size_shift; 2453 ASSERT(size > BLOCKZEROALIGN); 2454 #ifdef __i386 2455 if (!is_x86_feature(x86_featureset, X86FSET_SSE2)) 2456 bzero(dst_va, size); 2457 else 2458 #endif 2459 block_zero_no_xmm(dst_va, size); 2460 2461 #ifdef __xpv 2462 if (kpm_vbase == NULL) { 2463 xen_map(0, PWIN_VA(x)); 2464 mutex_exit(&CPU->cpu_hat_info->hci_mutex); 2465 kpreempt_enable(); 2466 } else 2467 #endif 2468 x86pte_release_pagetable(dest); 2469 } 2470 2471 /* 2472 * Called to ensure that all pagetables are in the system dump 2473 */ 2474 void 2475 hat_dump(void) 2476 { 2477 hat_t *hat; 2478 uint_t h; 2479 htable_t *ht; 2480 2481 /* 2482 * Dump all page tables 2483 */ 2484 for (hat = kas.a_hat; hat != NULL; hat = hat->hat_next) { 2485 for (h = 0; h < hat->hat_num_hash; ++h) { 2486 for (ht = hat->hat_ht_hash[h]; ht; ht = ht->ht_next) { 2487 if ((ht->ht_flags & HTABLE_VLP) == 0) 2488 dump_page(ht->ht_pfn); 2489 } 2490 } 2491 } 2492 } 2493