/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014 by Delphix. All rights reserved.
 * Copyright 2015 Joyent, Inc.
 */

#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/kmem.h>
#include <sys/atomic.h>
#include <sys/bitmap.h>
#include <sys/machparam.h>
#include <sys/machsystm.h>
#include <sys/mman.h>
#include <sys/systm.h>
#include <sys/cpuvar.h>
#include <sys/thread.h>
#include <sys/proc.h>
#include <sys/cpu.h>
#include <sys/kmem.h>
#include <sys/disp.h>
#include <sys/vmem.h>
#include <sys/vmsystm.h>
#include <sys/promif.h>
#include <sys/var.h>
#include <sys/x86_archext.h>
#include <sys/archsystm.h>
#include <sys/bootconf.h>
#include <sys/dumphdr.h>
#include <vm/seg_kmem.h>
#include <vm/seg_kpm.h>
#include <vm/hat.h>
#include <vm/hat_i86.h>
#include <sys/cmn_err.h>
#include <sys/panic.h>

#ifdef __xpv
#include <sys/hypervisor.h>
#include <sys/xpv_panic.h>
#endif

#include <sys/bootinfo.h>
#include <vm/kboot_mmu.h>

static void x86pte_zero(htable_t *dest, uint_t entry, uint_t count);

kmem_cache_t *htable_cache;

/*
 * The variable htable_reserve_amount, rather than HTABLE_RESERVE_AMOUNT,
 * is used in order to facilitate testing of the htable_steal() code.
 * By resetting htable_reserve_amount to a lower value, we can force
 * stealing to occur. The reserve amount is a guess to get us through boot.
 */
#define	HTABLE_RESERVE_AMOUNT	(200)
uint_t htable_reserve_amount = HTABLE_RESERVE_AMOUNT;
kmutex_t htable_reserve_mutex;
uint_t htable_reserve_cnt;
htable_t *htable_reserve_pool;

/*
 * Used to hand test htable_steal().
 */
#ifdef DEBUG
ulong_t force_steal = 0;
ulong_t ptable_cnt = 0;
#endif

/*
 * This variable is so that we can tune this via /etc/system
 * Any value works, but a power of two <= mmu.ptes_per_table is best.
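 * (Added note: htable_steal() computes its per-pass threshold as
 * pass * mmu.ptes_per_table / htable_steal_passes, so a power of two
 * divisor no larger than mmu.ptes_per_table gives evenly spaced integer
 * thresholds; e.g. with 512 PTEs per table and 8 passes, pass n considers
 * tables with at most n * 64 valid entries.)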
93 */ 94 uint_t htable_steal_passes = 8; 95 96 /* 97 * mutex stuff for access to htable hash 98 */ 99 #define NUM_HTABLE_MUTEX 128 100 kmutex_t htable_mutex[NUM_HTABLE_MUTEX]; 101 #define HTABLE_MUTEX_HASH(h) ((h) & (NUM_HTABLE_MUTEX - 1)) 102 103 #define HTABLE_ENTER(h) mutex_enter(&htable_mutex[HTABLE_MUTEX_HASH(h)]); 104 #define HTABLE_EXIT(h) mutex_exit(&htable_mutex[HTABLE_MUTEX_HASH(h)]); 105 106 /* 107 * forward declarations 108 */ 109 static void link_ptp(htable_t *higher, htable_t *new, uintptr_t vaddr); 110 static void unlink_ptp(htable_t *higher, htable_t *old, uintptr_t vaddr); 111 static void htable_free(htable_t *ht); 112 static x86pte_t *x86pte_access_pagetable(htable_t *ht, uint_t index); 113 static void x86pte_release_pagetable(htable_t *ht); 114 static x86pte_t x86pte_cas(htable_t *ht, uint_t entry, x86pte_t old, 115 x86pte_t new); 116 117 /* 118 * A counter to track if we are stealing or reaping htables. When non-zero 119 * htable_free() will directly free htables (either to the reserve or kmem) 120 * instead of putting them in a hat's htable cache. 121 */ 122 uint32_t htable_dont_cache = 0; 123 124 /* 125 * Track the number of active pagetables, so we can know how many to reap 126 */ 127 static uint32_t active_ptables = 0; 128 129 #ifdef __xpv 130 /* 131 * Deal with hypervisor complications. 132 */ 133 void 134 xen_flush_va(caddr_t va) 135 { 136 struct mmuext_op t; 137 uint_t count; 138 139 if (IN_XPV_PANIC()) { 140 mmu_tlbflush_entry((caddr_t)va); 141 } else { 142 t.cmd = MMUEXT_INVLPG_LOCAL; 143 t.arg1.linear_addr = (uintptr_t)va; 144 if (HYPERVISOR_mmuext_op(&t, 1, &count, DOMID_SELF) < 0) 145 panic("HYPERVISOR_mmuext_op() failed"); 146 ASSERT(count == 1); 147 } 148 } 149 150 void 151 xen_gflush_va(caddr_t va, cpuset_t cpus) 152 { 153 struct mmuext_op t; 154 uint_t count; 155 156 if (IN_XPV_PANIC()) { 157 mmu_tlbflush_entry((caddr_t)va); 158 return; 159 } 160 161 t.cmd = MMUEXT_INVLPG_MULTI; 162 t.arg1.linear_addr = (uintptr_t)va; 163 /*LINTED: constant in conditional context*/ 164 set_xen_guest_handle(t.arg2.vcpumask, &cpus); 165 if (HYPERVISOR_mmuext_op(&t, 1, &count, DOMID_SELF) < 0) 166 panic("HYPERVISOR_mmuext_op() failed"); 167 ASSERT(count == 1); 168 } 169 170 void 171 xen_flush_tlb() 172 { 173 struct mmuext_op t; 174 uint_t count; 175 176 if (IN_XPV_PANIC()) { 177 xpv_panic_reload_cr3(); 178 } else { 179 t.cmd = MMUEXT_TLB_FLUSH_LOCAL; 180 if (HYPERVISOR_mmuext_op(&t, 1, &count, DOMID_SELF) < 0) 181 panic("HYPERVISOR_mmuext_op() failed"); 182 ASSERT(count == 1); 183 } 184 } 185 186 void 187 xen_gflush_tlb(cpuset_t cpus) 188 { 189 struct mmuext_op t; 190 uint_t count; 191 192 ASSERT(!IN_XPV_PANIC()); 193 t.cmd = MMUEXT_TLB_FLUSH_MULTI; 194 /*LINTED: constant in conditional context*/ 195 set_xen_guest_handle(t.arg2.vcpumask, &cpus); 196 if (HYPERVISOR_mmuext_op(&t, 1, &count, DOMID_SELF) < 0) 197 panic("HYPERVISOR_mmuext_op() failed"); 198 ASSERT(count == 1); 199 } 200 201 /* 202 * Install/Adjust a kpm mapping under the hypervisor. 203 * Value of "how" should be: 204 * PT_WRITABLE | PT_VALID - regular kpm mapping 205 * PT_VALID - make mapping read-only 206 * 0 - remove mapping 207 * 208 * returns 0 on success. non-zero for failure. 
 */
int
xen_kpm_page(pfn_t pfn, uint_t how)
{
	paddr_t pa = mmu_ptob((paddr_t)pfn);
	x86pte_t pte = PT_NOCONSIST | PT_REF | PT_MOD;

	if (kpm_vbase == NULL)
		return (0);

	if (how)
		pte |= pa_to_ma(pa) | how;
	else
		pte = 0;
	return (HYPERVISOR_update_va_mapping((uintptr_t)kpm_vbase + pa,
	    pte, UVMF_INVLPG | UVMF_ALL));
}

void
xen_pin(pfn_t pfn, level_t lvl)
{
	struct mmuext_op t;
	uint_t count;

	t.cmd = MMUEXT_PIN_L1_TABLE + lvl;
	t.arg1.mfn = pfn_to_mfn(pfn);
	if (HYPERVISOR_mmuext_op(&t, 1, &count, DOMID_SELF) < 0)
		panic("HYPERVISOR_mmuext_op() failed");
	ASSERT(count == 1);
}

void
xen_unpin(pfn_t pfn)
{
	struct mmuext_op t;
	uint_t count;

	t.cmd = MMUEXT_UNPIN_TABLE;
	t.arg1.mfn = pfn_to_mfn(pfn);
	if (HYPERVISOR_mmuext_op(&t, 1, &count, DOMID_SELF) < 0)
		panic("HYPERVISOR_mmuext_op() failed");
	ASSERT(count == 1);
}

static void
xen_map(uint64_t pte, caddr_t va)
{
	if (HYPERVISOR_update_va_mapping((uintptr_t)va, pte,
	    UVMF_INVLPG | UVMF_LOCAL))
		panic("HYPERVISOR_update_va_mapping() failed");
}
#endif /* __xpv */

/*
 * Allocate a memory page for a hardware page table.
 *
 * A wrapper around page_get_physical(), with some extra checks.
 */
static pfn_t
ptable_alloc(uintptr_t seed)
{
	pfn_t pfn;
	page_t *pp;

	pfn = PFN_INVALID;

	/*
	 * The first check is to see if there is memory in the system. If we
	 * drop to throttlefree, then fail the ptable_alloc() and let the
	 * stealing code kick in. Note that we have to do this test here,
	 * since the test in page_create_throttle() would let the NOSLEEP
	 * allocation go through and deplete the page reserves.
	 *
	 * The !NOMEMWAIT() lets pageout, fsflush, etc. skip this check.
	 */
	if (!NOMEMWAIT() && freemem <= throttlefree + 1)
		return (PFN_INVALID);

#ifdef DEBUG
	/*
	 * This code makes htable_steal() easier to test. By setting
	 * force_steal we force pagetable allocations to fall
	 * into the stealing code. Roughly 1 in every "force_steal"
	 * page table allocations will fail.
	 */
	if (proc_pageout != NULL && force_steal > 1 &&
	    ++ptable_cnt > force_steal) {
		ptable_cnt = 0;
		return (PFN_INVALID);
	}
#endif /* DEBUG */

	pp = page_get_physical(seed);
	if (pp == NULL)
		return (PFN_INVALID);
	ASSERT(PAGE_SHARED(pp));
	pfn = pp->p_pagenum;
	if (pfn == PFN_INVALID)
		panic("ptable_alloc(): Invalid PFN!!");
	atomic_inc_32(&active_ptables);
	HATSTAT_INC(hs_ptable_allocs);
	return (pfn);
}

/*
 * Free an htable's associated page table page. See the comments
 * for ptable_alloc().
 */
static void
ptable_free(pfn_t pfn)
{
	page_t *pp = page_numtopp_nolock(pfn);

	/*
	 * need to destroy the page used for the pagetable
	 */
	ASSERT(pfn != PFN_INVALID);
	HATSTAT_INC(hs_ptable_frees);
	atomic_dec_32(&active_ptables);
	if (pp == NULL)
		panic("ptable_free(): no page for pfn!");
	ASSERT(PAGE_SHARED(pp));
	ASSERT(pfn == pp->p_pagenum);
	ASSERT(!IN_XPV_PANIC());

	/*
	 * Get an exclusive lock, might have to wait for a kmem reader.
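	 * The pagetable page is normally held SE_SHARED (note the
	 * PAGE_SHARED assertion above), so the upgrade can fail if another
	 * thread also holds a shared lock; in that case drop our hold and
	 * look the page up again in kvp with an exclusive lock, which may
	 * block until the other holder drops it.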
336 */ 337 if (!page_tryupgrade(pp)) { 338 u_offset_t off = pp->p_offset; 339 page_unlock(pp); 340 pp = page_lookup(&kvp, off, SE_EXCL); 341 if (pp == NULL) 342 panic("page not found"); 343 } 344 #ifdef __xpv 345 if (kpm_vbase && xen_kpm_page(pfn, PT_VALID | PT_WRITABLE) < 0) 346 panic("failure making kpm r/w pfn=0x%lx", pfn); 347 #endif 348 page_hashout(pp, NULL); 349 page_free(pp, 1); 350 page_unresv(1); 351 } 352 353 /* 354 * Put one htable on the reserve list. 355 */ 356 static void 357 htable_put_reserve(htable_t *ht) 358 { 359 ht->ht_hat = NULL; /* no longer tied to a hat */ 360 ASSERT(ht->ht_pfn == PFN_INVALID); 361 HATSTAT_INC(hs_htable_rputs); 362 mutex_enter(&htable_reserve_mutex); 363 ht->ht_next = htable_reserve_pool; 364 htable_reserve_pool = ht; 365 ++htable_reserve_cnt; 366 mutex_exit(&htable_reserve_mutex); 367 } 368 369 /* 370 * Take one htable from the reserve. 371 */ 372 static htable_t * 373 htable_get_reserve(void) 374 { 375 htable_t *ht = NULL; 376 377 mutex_enter(&htable_reserve_mutex); 378 if (htable_reserve_cnt != 0) { 379 ht = htable_reserve_pool; 380 ASSERT(ht != NULL); 381 ASSERT(ht->ht_pfn == PFN_INVALID); 382 htable_reserve_pool = ht->ht_next; 383 --htable_reserve_cnt; 384 HATSTAT_INC(hs_htable_rgets); 385 } 386 mutex_exit(&htable_reserve_mutex); 387 return (ht); 388 } 389 390 /* 391 * Allocate initial htables and put them on the reserve list 392 */ 393 void 394 htable_initial_reserve(uint_t count) 395 { 396 htable_t *ht; 397 398 count += HTABLE_RESERVE_AMOUNT; 399 while (count > 0) { 400 ht = kmem_cache_alloc(htable_cache, KM_NOSLEEP); 401 ASSERT(ht != NULL); 402 403 ASSERT(use_boot_reserve); 404 ht->ht_pfn = PFN_INVALID; 405 htable_put_reserve(ht); 406 --count; 407 } 408 } 409 410 /* 411 * Readjust the reserves after a thread finishes using them. 412 */ 413 void 414 htable_adjust_reserve() 415 { 416 htable_t *ht; 417 418 /* 419 * Free any excess htables in the reserve list 420 */ 421 while (htable_reserve_cnt > htable_reserve_amount && 422 !USE_HAT_RESERVES()) { 423 ht = htable_get_reserve(); 424 if (ht == NULL) 425 return; 426 ASSERT(ht->ht_pfn == PFN_INVALID); 427 kmem_cache_free(htable_cache, ht); 428 } 429 } 430 431 /* 432 * Search the active htables for one to steal. Start at a different hash 433 * bucket every time to help spread the pain of stealing 434 */ 435 static void 436 htable_steal_active(hat_t *hat, uint_t cnt, uint_t threshold, 437 uint_t *stolen, htable_t **list) 438 { 439 static uint_t h_seed = 0; 440 htable_t *higher, *ht; 441 uint_t h, e, h_start; 442 uintptr_t va; 443 x86pte_t pte; 444 445 h = h_start = h_seed++ % hat->hat_num_hash; 446 do { 447 higher = NULL; 448 HTABLE_ENTER(h); 449 for (ht = hat->hat_ht_hash[h]; ht; ht = ht->ht_next) { 450 451 /* 452 * Can we rule out reaping? 453 */ 454 if (ht->ht_busy != 0 || 455 (ht->ht_flags & HTABLE_SHARED_PFN) || 456 ht->ht_level > 0 || ht->ht_valid_cnt > threshold || 457 ht->ht_lock_cnt != 0) 458 continue; 459 460 /* 461 * Increment busy so the htable can't disappear. We 462 * drop the htable mutex to avoid deadlocks with 463 * hat_pageunload() and the hment mutex while we 464 * call hat_pte_unmap() 465 */ 466 ++ht->ht_busy; 467 HTABLE_EXIT(h); 468 469 /* 470 * Try stealing. 
471 * - unload and invalidate all PTEs 472 */ 473 for (e = 0, va = ht->ht_vaddr; 474 e < HTABLE_NUM_PTES(ht) && ht->ht_valid_cnt > 0 && 475 ht->ht_busy == 1 && ht->ht_lock_cnt == 0; 476 ++e, va += MMU_PAGESIZE) { 477 pte = x86pte_get(ht, e); 478 if (!PTE_ISVALID(pte)) 479 continue; 480 hat_pte_unmap(ht, e, HAT_UNLOAD, pte, NULL, 481 B_TRUE); 482 } 483 484 /* 485 * Reacquire htable lock. If we didn't remove all 486 * mappings in the table, or another thread added a new 487 * mapping behind us, give up on this table. 488 */ 489 HTABLE_ENTER(h); 490 if (ht->ht_busy != 1 || ht->ht_valid_cnt != 0 || 491 ht->ht_lock_cnt != 0) { 492 --ht->ht_busy; 493 continue; 494 } 495 496 /* 497 * Steal it and unlink the page table. 498 */ 499 higher = ht->ht_parent; 500 unlink_ptp(higher, ht, ht->ht_vaddr); 501 502 /* 503 * remove from the hash list 504 */ 505 if (ht->ht_next) 506 ht->ht_next->ht_prev = ht->ht_prev; 507 508 if (ht->ht_prev) { 509 ht->ht_prev->ht_next = ht->ht_next; 510 } else { 511 ASSERT(hat->hat_ht_hash[h] == ht); 512 hat->hat_ht_hash[h] = ht->ht_next; 513 } 514 515 /* 516 * Break to outer loop to release the 517 * higher (ht_parent) pagetable. This 518 * spreads out the pain caused by 519 * pagefaults. 520 */ 521 ht->ht_next = *list; 522 *list = ht; 523 ++*stolen; 524 break; 525 } 526 HTABLE_EXIT(h); 527 if (higher != NULL) 528 htable_release(higher); 529 if (++h == hat->hat_num_hash) 530 h = 0; 531 } while (*stolen < cnt && h != h_start); 532 } 533 534 /* 535 * Move hat to the end of the kas list 536 */ 537 static void 538 move_victim(hat_t *hat) 539 { 540 ASSERT(MUTEX_HELD(&hat_list_lock)); 541 542 /* unlink victim hat */ 543 if (hat->hat_prev) 544 hat->hat_prev->hat_next = hat->hat_next; 545 else 546 kas.a_hat->hat_next = hat->hat_next; 547 548 if (hat->hat_next) 549 hat->hat_next->hat_prev = hat->hat_prev; 550 else 551 kas.a_hat->hat_prev = hat->hat_prev; 552 /* relink at end of hat list */ 553 hat->hat_next = NULL; 554 hat->hat_prev = kas.a_hat->hat_prev; 555 if (hat->hat_prev) 556 hat->hat_prev->hat_next = hat; 557 else 558 kas.a_hat->hat_next = hat; 559 560 kas.a_hat->hat_prev = hat; 561 } 562 563 /* 564 * This routine steals htables from user processes. Called by htable_reap 565 * (reap=TRUE) or htable_alloc (reap=FALSE). 566 */ 567 static htable_t * 568 htable_steal(uint_t cnt, boolean_t reap) 569 { 570 hat_t *hat = kas.a_hat; /* list starts with khat */ 571 htable_t *list = NULL; 572 htable_t *ht; 573 uint_t stolen = 0; 574 uint_t pass, passes; 575 uint_t threshold; 576 577 /* 578 * Limit htable_steal_passes to something reasonable 579 */ 580 if (htable_steal_passes == 0) 581 htable_steal_passes = 1; 582 if (htable_steal_passes > mmu.ptes_per_table) 583 htable_steal_passes = mmu.ptes_per_table; 584 585 /* 586 * If we're stealing merely as part of kmem reaping (versus stealing 587 * to assure forward progress), we don't want to actually steal any 588 * active htables. (Stealing active htables merely to give memory 589 * back to the system can inadvertently kick off an htable crime wave 590 * as active processes repeatedly steal htables from one another, 591 * plummeting the system into a kind of HAT lawlessness that can 592 * become so violent as to impede the one thing that can end it: the 593 * freeing of memory via ARC reclaim and other means.) So if we're 594 * reaping, we limit ourselves to the first pass that steals cached 595 * htables that aren't in use -- which gives memory back, but averts 596 * the entire breakdown of social order. 597 */ 598 passes = reap ? 
0 : htable_steal_passes; 599 600 /* 601 * Loop through all user hats. The 1st pass takes cached htables that 602 * aren't in use. The later passes steal by removing mappings, too. 603 */ 604 atomic_inc_32(&htable_dont_cache); 605 for (pass = 0; pass <= passes && stolen < cnt; ++pass) { 606 threshold = pass * mmu.ptes_per_table / htable_steal_passes; 607 608 mutex_enter(&hat_list_lock); 609 610 /* skip the first hat (kernel) */ 611 hat = kas.a_hat->hat_next; 612 for (;;) { 613 /* 614 * Skip any hat that is already being stolen from. 615 * 616 * We skip SHARED hats, as these are dummy 617 * hats that host ISM shared page tables. 618 * 619 * We also skip if HAT_FREEING because hat_pte_unmap() 620 * won't zero out the PTE's. That would lead to hitting 621 * stale PTEs either here or under hat_unload() when we 622 * steal and unload the same page table in competing 623 * threads. 624 */ 625 while (hat != NULL && 626 (hat->hat_flags & 627 (HAT_VICTIM | HAT_SHARED | HAT_FREEING)) != 0) 628 hat = hat->hat_next; 629 630 if (hat == NULL) 631 break; 632 633 /* 634 * Mark the HAT as a stealing victim so that it is 635 * not freed from under us, e.g. in as_free() 636 */ 637 hat->hat_flags |= HAT_VICTIM; 638 mutex_exit(&hat_list_lock); 639 640 /* 641 * Take any htables from the hat's cached "free" list. 642 */ 643 hat_enter(hat); 644 while ((ht = hat->hat_ht_cached) != NULL && 645 stolen < cnt) { 646 hat->hat_ht_cached = ht->ht_next; 647 ht->ht_next = list; 648 list = ht; 649 ++stolen; 650 } 651 hat_exit(hat); 652 653 /* 654 * Don't steal active htables on first pass. 655 */ 656 if (pass != 0 && (stolen < cnt)) 657 htable_steal_active(hat, cnt, threshold, 658 &stolen, &list); 659 660 /* 661 * do synchronous teardown for the reap case so that 662 * we can forget hat; at this time, hat is 663 * guaranteed to be around because HAT_VICTIM is set 664 * (see htable_free() for similar code) 665 */ 666 for (ht = list; (ht) && (reap); ht = ht->ht_next) { 667 if (ht->ht_hat == NULL) 668 continue; 669 ASSERT(ht->ht_hat == hat); 670 #if defined(__xpv) && defined(__amd64) 671 if (!(ht->ht_flags & HTABLE_VLP) && 672 ht->ht_level == mmu.max_level) { 673 ptable_free(hat->hat_user_ptable); 674 hat->hat_user_ptable = PFN_INVALID; 675 } 676 #endif 677 /* 678 * forget the hat 679 */ 680 ht->ht_hat = NULL; 681 } 682 683 mutex_enter(&hat_list_lock); 684 685 /* 686 * Are we finished? 687 */ 688 if (stolen == cnt) { 689 /* 690 * Try to spread the pain of stealing, 691 * move victim HAT to the end of the HAT list. 692 */ 693 if (pass >= 1 && cnt == 1 && 694 kas.a_hat->hat_prev != hat) 695 move_victim(hat); 696 /* 697 * We are finished 698 */ 699 } 700 701 /* 702 * Clear the victim flag, hat can go away now (once 703 * the lock is dropped) 704 */ 705 if (hat->hat_flags & HAT_VICTIM) { 706 ASSERT(hat != kas.a_hat); 707 hat->hat_flags &= ~HAT_VICTIM; 708 cv_broadcast(&hat_list_cv); 709 } 710 711 /* move on to the next hat */ 712 hat = hat->hat_next; 713 } 714 715 mutex_exit(&hat_list_lock); 716 717 } 718 ASSERT(!MUTEX_HELD(&hat_list_lock)); 719 720 atomic_dec_32(&htable_dont_cache); 721 return (list); 722 } 723 724 /* 725 * This is invoked from kmem when the system is low on memory. We try 726 * to free hments, htables, and ptables to improve the memory situation. 
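 * htable_reap() is registered below in htable_init() as the reclaim
 * callback of htable_cache; it limits itself to roughly 5% of the active
 * pagetables (capped at 5% of physmem, with a floor of 10).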
727 */ 728 /*ARGSUSED*/ 729 static void 730 htable_reap(void *handle) 731 { 732 uint_t reap_cnt; 733 htable_t *list; 734 htable_t *ht; 735 736 HATSTAT_INC(hs_reap_attempts); 737 if (!can_steal_post_boot) 738 return; 739 740 /* 741 * Try to reap 5% of the page tables bounded by a maximum of 742 * 5% of physmem and a minimum of 10. 743 */ 744 reap_cnt = MAX(MIN(physmem / 20, active_ptables / 20), 10); 745 746 /* 747 * Note: htable_dont_cache should be set at the time of 748 * invoking htable_free() 749 */ 750 atomic_inc_32(&htable_dont_cache); 751 /* 752 * Let htable_steal() do the work, we just call htable_free() 753 */ 754 XPV_DISALLOW_MIGRATE(); 755 list = htable_steal(reap_cnt, B_TRUE); 756 XPV_ALLOW_MIGRATE(); 757 while ((ht = list) != NULL) { 758 list = ht->ht_next; 759 HATSTAT_INC(hs_reaped); 760 htable_free(ht); 761 } 762 atomic_dec_32(&htable_dont_cache); 763 764 /* 765 * Free up excess reserves 766 */ 767 htable_adjust_reserve(); 768 hment_adjust_reserve(); 769 } 770 771 /* 772 * Allocate an htable, stealing one or using the reserve if necessary 773 */ 774 static htable_t * 775 htable_alloc( 776 hat_t *hat, 777 uintptr_t vaddr, 778 level_t level, 779 htable_t *shared) 780 { 781 htable_t *ht = NULL; 782 uint_t is_vlp; 783 uint_t is_bare = 0; 784 uint_t need_to_zero = 1; 785 int kmflags = (can_steal_post_boot ? KM_NOSLEEP : KM_SLEEP); 786 787 if (level < 0 || level > TOP_LEVEL(hat)) 788 panic("htable_alloc(): level %d out of range\n", level); 789 790 is_vlp = (hat->hat_flags & HAT_VLP) && level == VLP_LEVEL; 791 if (is_vlp || shared != NULL) 792 is_bare = 1; 793 794 /* 795 * First reuse a cached htable from the hat_ht_cached field, this 796 * avoids unnecessary trips through kmem/page allocators. 797 */ 798 if (hat->hat_ht_cached != NULL && !is_bare) { 799 hat_enter(hat); 800 ht = hat->hat_ht_cached; 801 if (ht != NULL) { 802 hat->hat_ht_cached = ht->ht_next; 803 need_to_zero = 0; 804 /* XX64 ASSERT() they're all zero somehow */ 805 ASSERT(ht->ht_pfn != PFN_INVALID); 806 } 807 hat_exit(hat); 808 } 809 810 if (ht == NULL) { 811 /* 812 * Allocate an htable, possibly refilling the reserves. 813 */ 814 if (USE_HAT_RESERVES()) { 815 ht = htable_get_reserve(); 816 } else { 817 /* 818 * Donate successful htable allocations to the reserve. 819 */ 820 for (;;) { 821 ht = kmem_cache_alloc(htable_cache, kmflags); 822 if (ht == NULL) 823 break; 824 ht->ht_pfn = PFN_INVALID; 825 if (USE_HAT_RESERVES() || 826 htable_reserve_cnt >= htable_reserve_amount) 827 break; 828 htable_put_reserve(ht); 829 } 830 } 831 832 /* 833 * allocate a page for the hardware page table if needed 834 */ 835 if (ht != NULL && !is_bare) { 836 ht->ht_hat = hat; 837 ht->ht_pfn = ptable_alloc((uintptr_t)ht); 838 if (ht->ht_pfn == PFN_INVALID) { 839 if (USE_HAT_RESERVES()) 840 htable_put_reserve(ht); 841 else 842 kmem_cache_free(htable_cache, ht); 843 ht = NULL; 844 } 845 } 846 } 847 848 /* 849 * If allocations failed, kick off a kmem_reap() and resort to 850 * htable steal(). We may spin here if the system is very low on 851 * memory. If the kernel itself has consumed all memory and kmem_reap() 852 * can't free up anything, then we'll really get stuck here. 853 * That should only happen in a system where the administrator has 854 * misconfigured VM parameters via /etc/system. 855 */ 856 while (ht == NULL && can_steal_post_boot) { 857 kmem_reap(); 858 ht = htable_steal(1, B_FALSE); 859 HATSTAT_INC(hs_steals); 860 861 /* 862 * If we stole for a bare htable, release the pagetable page. 
863 */ 864 if (ht != NULL) { 865 if (is_bare) { 866 ptable_free(ht->ht_pfn); 867 ht->ht_pfn = PFN_INVALID; 868 #if defined(__xpv) && defined(__amd64) 869 /* 870 * make stolen page table writable again in kpm 871 */ 872 } else if (kpm_vbase && xen_kpm_page(ht->ht_pfn, 873 PT_VALID | PT_WRITABLE) < 0) { 874 panic("failure making kpm r/w pfn=0x%lx", 875 ht->ht_pfn); 876 #endif 877 } 878 } 879 } 880 881 /* 882 * All attempts to allocate or steal failed. This should only happen 883 * if we run out of memory during boot, due perhaps to a huge 884 * boot_archive. At this point there's no way to continue. 885 */ 886 if (ht == NULL) 887 panic("htable_alloc(): couldn't steal\n"); 888 889 #if defined(__amd64) && defined(__xpv) 890 /* 891 * Under the 64-bit hypervisor, we have 2 top level page tables. 892 * If this allocation fails, we'll resort to stealing. 893 * We use the stolen page indirectly, by freeing the 894 * stolen htable first. 895 */ 896 if (level == mmu.max_level) { 897 for (;;) { 898 htable_t *stolen; 899 900 hat->hat_user_ptable = ptable_alloc((uintptr_t)ht + 1); 901 if (hat->hat_user_ptable != PFN_INVALID) 902 break; 903 stolen = htable_steal(1, B_FALSE); 904 if (stolen == NULL) 905 panic("2nd steal ptable failed\n"); 906 htable_free(stolen); 907 } 908 block_zero_no_xmm(kpm_vbase + pfn_to_pa(hat->hat_user_ptable), 909 MMU_PAGESIZE); 910 } 911 #endif 912 913 /* 914 * Shared page tables have all entries locked and entries may not 915 * be added or deleted. 916 */ 917 ht->ht_flags = 0; 918 if (shared != NULL) { 919 ASSERT(shared->ht_valid_cnt > 0); 920 ht->ht_flags |= HTABLE_SHARED_PFN; 921 ht->ht_pfn = shared->ht_pfn; 922 ht->ht_lock_cnt = 0; 923 ht->ht_valid_cnt = 0; /* updated in hat_share() */ 924 ht->ht_shares = shared; 925 need_to_zero = 0; 926 } else { 927 ht->ht_shares = NULL; 928 ht->ht_lock_cnt = 0; 929 ht->ht_valid_cnt = 0; 930 } 931 932 /* 933 * setup flags, etc. for VLP htables 934 */ 935 if (is_vlp) { 936 ht->ht_flags |= HTABLE_VLP; 937 ASSERT(ht->ht_pfn == PFN_INVALID); 938 need_to_zero = 0; 939 } 940 941 /* 942 * fill in the htable 943 */ 944 ht->ht_hat = hat; 945 ht->ht_parent = NULL; 946 ht->ht_vaddr = vaddr; 947 ht->ht_level = level; 948 ht->ht_busy = 1; 949 ht->ht_next = NULL; 950 ht->ht_prev = NULL; 951 952 /* 953 * Zero out any freshly allocated page table 954 */ 955 if (need_to_zero) 956 x86pte_zero(ht, 0, mmu.ptes_per_table); 957 958 #if defined(__amd64) && defined(__xpv) 959 if (!is_bare && kpm_vbase) { 960 (void) xen_kpm_page(ht->ht_pfn, PT_VALID); 961 if (level == mmu.max_level) 962 (void) xen_kpm_page(hat->hat_user_ptable, PT_VALID); 963 } 964 #endif 965 966 return (ht); 967 } 968 969 /* 970 * Free up an htable, either to a hat's cached list, the reserves or 971 * back to kmem. 972 */ 973 static void 974 htable_free(htable_t *ht) 975 { 976 hat_t *hat = ht->ht_hat; 977 978 /* 979 * If the process isn't exiting, cache the free htable in the hat 980 * structure. We always do this for the boot time reserve. We don't 981 * do this if the hat is exiting or we are stealing/reaping htables. 982 */ 983 if (hat != NULL && 984 !(ht->ht_flags & HTABLE_SHARED_PFN) && 985 (use_boot_reserve || 986 (!(hat->hat_flags & HAT_FREEING) && !htable_dont_cache))) { 987 ASSERT((ht->ht_flags & HTABLE_VLP) == 0); 988 ASSERT(ht->ht_pfn != PFN_INVALID); 989 hat_enter(hat); 990 ht->ht_next = hat->hat_ht_cached; 991 hat->hat_ht_cached = ht; 992 hat_exit(hat); 993 return; 994 } 995 996 /* 997 * If we have a hardware page table, free it. 
	 * We don't free page tables that are accessed by sharing.
	 */
	if (ht->ht_flags & HTABLE_SHARED_PFN) {
		ASSERT(ht->ht_pfn != PFN_INVALID);
	} else if (!(ht->ht_flags & HTABLE_VLP)) {
		ptable_free(ht->ht_pfn);
#if defined(__amd64) && defined(__xpv)
		if (ht->ht_level == mmu.max_level && hat != NULL) {
			ptable_free(hat->hat_user_ptable);
			hat->hat_user_ptable = PFN_INVALID;
		}
#endif
	}
	ht->ht_pfn = PFN_INVALID;

	/*
	 * Free it or put into reserves.
	 */
	if (USE_HAT_RESERVES() || htable_reserve_cnt < htable_reserve_amount) {
		htable_put_reserve(ht);
	} else {
		kmem_cache_free(htable_cache, ht);
		htable_adjust_reserve();
	}
}


/*
 * This is called when a hat is being destroyed or swapped out. We reap all
 * the remaining htables in the hat cache. If destroying, all left over
 * htables are also destroyed.
 *
 * We also don't need to invalidate any of the PTPs nor do any demapping.
 */
void
htable_purge_hat(hat_t *hat)
{
	htable_t *ht;
	int h;

	/*
	 * Purge the htable cache if just reaping.
	 */
	if (!(hat->hat_flags & HAT_FREEING)) {
		atomic_inc_32(&htable_dont_cache);
		for (;;) {
			hat_enter(hat);
			ht = hat->hat_ht_cached;
			if (ht == NULL) {
				hat_exit(hat);
				break;
			}
			hat->hat_ht_cached = ht->ht_next;
			hat_exit(hat);
			htable_free(ht);
		}
		atomic_dec_32(&htable_dont_cache);
		return;
	}

	/*
	 * if freeing, no locking is needed
	 */
	while ((ht = hat->hat_ht_cached) != NULL) {
		hat->hat_ht_cached = ht->ht_next;
		htable_free(ht);
	}

	/*
	 * walk thru the htable hash table and free all the htables in it.
	 */
	for (h = 0; h < hat->hat_num_hash; ++h) {
		while ((ht = hat->hat_ht_hash[h]) != NULL) {
			if (ht->ht_next)
				ht->ht_next->ht_prev = ht->ht_prev;

			if (ht->ht_prev) {
				ht->ht_prev->ht_next = ht->ht_next;
			} else {
				ASSERT(hat->hat_ht_hash[h] == ht);
				hat->hat_ht_hash[h] = ht->ht_next;
			}
			htable_free(ht);
		}
	}
}

/*
 * Unlink an entry for a table at vaddr and level out of the existing table
 * one level higher. We are always holding the HASH_ENTER() when doing this.
 */
static void
unlink_ptp(htable_t *higher, htable_t *old, uintptr_t vaddr)
{
	uint_t entry = htable_va2entry(vaddr, higher);
	x86pte_t expect = MAKEPTP(old->ht_pfn, old->ht_level);
	x86pte_t found;
	hat_t *hat = old->ht_hat;

	ASSERT(higher->ht_busy > 0);
	ASSERT(higher->ht_valid_cnt > 0);
	ASSERT(old->ht_valid_cnt == 0);
	found = x86pte_cas(higher, entry, expect, 0);
#ifdef __xpv
	/*
	 * This is weird, but Xen apparently automatically unlinks empty
	 * pagetables from the upper page table. So allow PTP to be 0 already.
	 */
	if (found != expect && found != 0)
#else
	if (found != expect)
#endif
		panic("Bad PTP found=" FMT_PTE ", expected=" FMT_PTE,
		    found, expect);

	/*
	 * When a top level VLP page table entry changes, we must issue
	 * a reload of cr3 on all processors.
	 *
	 * If we don't need to do that, then we still have to INVLPG against
	 * an address covered by the inner page table, as the latest processors
	 * have TLB-like caches for non-leaf page table entries.
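	 * When the hat is being torn down (HAT_FREEING) the demap is
	 * skipped entirely, as the address space is going away and is no
	 * longer in use on any CPU.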
	 */
	if (!(hat->hat_flags & HAT_FREEING)) {
		hat_tlb_inval(hat, (higher->ht_flags & HTABLE_VLP) ?
		    DEMAP_ALL_ADDR : old->ht_vaddr);
	}

	HTABLE_DEC(higher->ht_valid_cnt);
}

/*
 * Link an entry for a new table at vaddr and level into the existing table
 * one level higher. We are always holding the HASH_ENTER() when doing this.
 */
static void
link_ptp(htable_t *higher, htable_t *new, uintptr_t vaddr)
{
	uint_t entry = htable_va2entry(vaddr, higher);
	x86pte_t newptp = MAKEPTP(new->ht_pfn, new->ht_level);
	x86pte_t found;

	ASSERT(higher->ht_busy > 0);

	ASSERT(new->ht_level != mmu.max_level);

	HTABLE_INC(higher->ht_valid_cnt);

	found = x86pte_cas(higher, entry, 0, newptp);
	if ((found & ~PT_REF) != 0)
		panic("HAT: ptp not 0, found=" FMT_PTE, found);

	/*
	 * When any top level VLP page table entry changes, we must issue
	 * a reload of cr3 on all processors using it.
	 * We also need to do this for the kernel hat on PAE 32 bit kernel.
	 */
	if (
#ifdef __i386
	    (higher->ht_hat == kas.a_hat && higher->ht_level == VLP_LEVEL) ||
#endif
	    (higher->ht_flags & HTABLE_VLP))
		hat_tlb_inval(higher->ht_hat, DEMAP_ALL_ADDR);
}

/*
 * Release of hold on an htable. If this is the last use and the pagetable
 * is empty we may want to free it, then recursively look at the pagetable
 * above it. The recursion is handled by the outer while() loop.
 *
 * On the metal, during process exit, we don't bother unlinking the tables from
 * upper level pagetables. They are instead handled in bulk by hat_free_end().
 * We can't do this on the hypervisor as we need the page table to be
 * implicitly unpinned before it goes to the free page lists. This can't
 * happen unless we fully unlink it from the page table hierarchy.
 */
void
htable_release(htable_t *ht)
{
	uint_t hashval;
	htable_t *shared;
	htable_t *higher;
	hat_t *hat;
	uintptr_t va;
	level_t level;

	while (ht != NULL) {
		shared = NULL;
		for (;;) {
			hat = ht->ht_hat;
			va = ht->ht_vaddr;
			level = ht->ht_level;
			hashval = HTABLE_HASH(hat, va, level);

			/*
			 * The common case is that this isn't the last use of
			 * an htable so we don't want to free the htable.
			 */
			HTABLE_ENTER(hashval);
			ASSERT(ht->ht_valid_cnt >= 0);
			ASSERT(ht->ht_busy > 0);
			if (ht->ht_valid_cnt > 0)
				break;
			if (ht->ht_busy > 1)
				break;
			ASSERT(ht->ht_lock_cnt == 0);

#if !defined(__xpv)
			/*
			 * we always release empty shared htables
			 */
			if (!(ht->ht_flags & HTABLE_SHARED_PFN)) {

				/*
				 * don't release if in address space tear down
				 */
				if (hat->hat_flags & HAT_FREEING)
					break;

				/*
				 * At and above max_page_level, free if it's for
				 * a boot-time kernel mapping below kernelbase.
				 */
				if (level >= mmu.max_page_level &&
				    (hat != kas.a_hat || va >= kernelbase))
					break;
			}
#endif /* __xpv */

			/*
			 * Remember if we destroy an htable that shares its PFN
			 * from elsewhere.
			 */
			if (ht->ht_flags & HTABLE_SHARED_PFN) {
				ASSERT(shared == NULL);
				shared = ht->ht_shares;
				HATSTAT_INC(hs_htable_unshared);
			}

			/*
			 * Handle release of a table and freeing the htable_t.
			 * Unlink it from the table higher (ie. ht_parent).
1240 */ 1241 higher = ht->ht_parent; 1242 ASSERT(higher != NULL); 1243 1244 /* 1245 * Unlink the pagetable. 1246 */ 1247 unlink_ptp(higher, ht, va); 1248 1249 /* 1250 * remove this htable from its hash list 1251 */ 1252 if (ht->ht_next) 1253 ht->ht_next->ht_prev = ht->ht_prev; 1254 1255 if (ht->ht_prev) { 1256 ht->ht_prev->ht_next = ht->ht_next; 1257 } else { 1258 ASSERT(hat->hat_ht_hash[hashval] == ht); 1259 hat->hat_ht_hash[hashval] = ht->ht_next; 1260 } 1261 HTABLE_EXIT(hashval); 1262 htable_free(ht); 1263 ht = higher; 1264 } 1265 1266 ASSERT(ht->ht_busy >= 1); 1267 --ht->ht_busy; 1268 HTABLE_EXIT(hashval); 1269 1270 /* 1271 * If we released a shared htable, do a release on the htable 1272 * from which it shared 1273 */ 1274 ht = shared; 1275 } 1276 } 1277 1278 /* 1279 * Find the htable for the pagetable at the given level for the given address. 1280 * If found acquires a hold that eventually needs to be htable_release()d 1281 */ 1282 htable_t * 1283 htable_lookup(hat_t *hat, uintptr_t vaddr, level_t level) 1284 { 1285 uintptr_t base; 1286 uint_t hashval; 1287 htable_t *ht = NULL; 1288 1289 ASSERT(level >= 0); 1290 ASSERT(level <= TOP_LEVEL(hat)); 1291 1292 if (level == TOP_LEVEL(hat)) { 1293 #if defined(__amd64) 1294 /* 1295 * 32 bit address spaces on 64 bit kernels need to check 1296 * for overflow of the 32 bit address space 1297 */ 1298 if ((hat->hat_flags & HAT_VLP) && vaddr >= ((uint64_t)1 << 32)) 1299 return (NULL); 1300 #endif 1301 base = 0; 1302 } else { 1303 base = vaddr & LEVEL_MASK(level + 1); 1304 } 1305 1306 hashval = HTABLE_HASH(hat, base, level); 1307 HTABLE_ENTER(hashval); 1308 for (ht = hat->hat_ht_hash[hashval]; ht; ht = ht->ht_next) { 1309 if (ht->ht_hat == hat && 1310 ht->ht_vaddr == base && 1311 ht->ht_level == level) 1312 break; 1313 } 1314 if (ht) 1315 ++ht->ht_busy; 1316 1317 HTABLE_EXIT(hashval); 1318 return (ht); 1319 } 1320 1321 /* 1322 * Acquires a hold on a known htable (from a locked hment entry). 1323 */ 1324 void 1325 htable_acquire(htable_t *ht) 1326 { 1327 hat_t *hat = ht->ht_hat; 1328 level_t level = ht->ht_level; 1329 uintptr_t base = ht->ht_vaddr; 1330 uint_t hashval = HTABLE_HASH(hat, base, level); 1331 1332 HTABLE_ENTER(hashval); 1333 #ifdef DEBUG 1334 /* 1335 * make sure the htable is there 1336 */ 1337 { 1338 htable_t *h; 1339 1340 for (h = hat->hat_ht_hash[hashval]; 1341 h && h != ht; 1342 h = h->ht_next) 1343 ; 1344 ASSERT(h == ht); 1345 } 1346 #endif /* DEBUG */ 1347 ++ht->ht_busy; 1348 HTABLE_EXIT(hashval); 1349 } 1350 1351 /* 1352 * Find the htable for the pagetable at the given level for the given address. 1353 * If found acquires a hold that eventually needs to be htable_release()d 1354 * If not found the table is created. 1355 * 1356 * Since we can't hold a hash table mutex during allocation, we have to 1357 * drop it and redo the search on a create. Then we may have to free the newly 1358 * allocated htable if another thread raced in and created it ahead of us. 1359 */ 1360 htable_t * 1361 htable_create( 1362 hat_t *hat, 1363 uintptr_t vaddr, 1364 level_t level, 1365 htable_t *shared) 1366 { 1367 uint_t h; 1368 level_t l; 1369 uintptr_t base; 1370 htable_t *ht; 1371 htable_t *higher = NULL; 1372 htable_t *new = NULL; 1373 1374 if (level < 0 || level > TOP_LEVEL(hat)) 1375 panic("htable_create(): level %d out of range\n", level); 1376 1377 /* 1378 * Create the page tables in top down order. 
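	 * At each level we search the hash under its mutex; if the htable
	 * is missing we drop the mutex, allocate a new one and retry the
	 * search. If the retry still finds nothing, the new table is
	 * linked into its parent and the hash chain; if another thread
	 * raced ahead of us, the freshly allocated htable is freed again.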
1379 */ 1380 for (l = TOP_LEVEL(hat); l >= level; --l) { 1381 new = NULL; 1382 if (l == TOP_LEVEL(hat)) 1383 base = 0; 1384 else 1385 base = vaddr & LEVEL_MASK(l + 1); 1386 1387 h = HTABLE_HASH(hat, base, l); 1388 try_again: 1389 /* 1390 * look up the htable at this level 1391 */ 1392 HTABLE_ENTER(h); 1393 if (l == TOP_LEVEL(hat)) { 1394 ht = hat->hat_htable; 1395 } else { 1396 for (ht = hat->hat_ht_hash[h]; ht; ht = ht->ht_next) { 1397 ASSERT(ht->ht_hat == hat); 1398 if (ht->ht_vaddr == base && 1399 ht->ht_level == l) 1400 break; 1401 } 1402 } 1403 1404 /* 1405 * if we found the htable, increment its busy cnt 1406 * and if we had allocated a new htable, free it. 1407 */ 1408 if (ht != NULL) { 1409 /* 1410 * If we find a pre-existing shared table, it must 1411 * share from the same place. 1412 */ 1413 if (l == level && shared && ht->ht_shares && 1414 ht->ht_shares != shared) { 1415 panic("htable shared from wrong place " 1416 "found htable=%p shared=%p", 1417 (void *)ht, (void *)shared); 1418 } 1419 ++ht->ht_busy; 1420 HTABLE_EXIT(h); 1421 if (new) 1422 htable_free(new); 1423 if (higher != NULL) 1424 htable_release(higher); 1425 higher = ht; 1426 1427 /* 1428 * if we didn't find it on the first search 1429 * allocate a new one and search again 1430 */ 1431 } else if (new == NULL) { 1432 HTABLE_EXIT(h); 1433 new = htable_alloc(hat, base, l, 1434 l == level ? shared : NULL); 1435 goto try_again; 1436 1437 /* 1438 * 2nd search and still not there, use "new" table 1439 * Link new table into higher, when not at top level. 1440 */ 1441 } else { 1442 ht = new; 1443 if (higher != NULL) { 1444 link_ptp(higher, ht, base); 1445 ht->ht_parent = higher; 1446 } 1447 ht->ht_next = hat->hat_ht_hash[h]; 1448 ASSERT(ht->ht_prev == NULL); 1449 if (hat->hat_ht_hash[h]) 1450 hat->hat_ht_hash[h]->ht_prev = ht; 1451 hat->hat_ht_hash[h] = ht; 1452 HTABLE_EXIT(h); 1453 1454 /* 1455 * Note we don't do htable_release(higher). 1456 * That happens recursively when "new" is removed by 1457 * htable_release() or htable_steal(). 1458 */ 1459 higher = ht; 1460 1461 /* 1462 * If we just created a new shared page table we 1463 * increment the shared htable's busy count, so that 1464 * it can't be the victim of a steal even if it's empty. 1465 */ 1466 if (l == level && shared) { 1467 (void) htable_lookup(shared->ht_hat, 1468 shared->ht_vaddr, shared->ht_level); 1469 HATSTAT_INC(hs_htable_shared); 1470 } 1471 } 1472 } 1473 1474 return (ht); 1475 } 1476 1477 /* 1478 * Inherit initial pagetables from the boot program. On the 64-bit 1479 * hypervisor we also temporarily mark the p_index field of page table 1480 * pages, so we know not to try making them writable in seg_kpm. 
1481 */ 1482 void 1483 htable_attach( 1484 hat_t *hat, 1485 uintptr_t base, 1486 level_t level, 1487 htable_t *parent, 1488 pfn_t pfn) 1489 { 1490 htable_t *ht; 1491 uint_t h; 1492 uint_t i; 1493 x86pte_t pte; 1494 x86pte_t *ptep; 1495 page_t *pp; 1496 extern page_t *boot_claim_page(pfn_t); 1497 1498 ht = htable_get_reserve(); 1499 if (level == mmu.max_level) 1500 kas.a_hat->hat_htable = ht; 1501 ht->ht_hat = hat; 1502 ht->ht_parent = parent; 1503 ht->ht_vaddr = base; 1504 ht->ht_level = level; 1505 ht->ht_busy = 1; 1506 ht->ht_next = NULL; 1507 ht->ht_prev = NULL; 1508 ht->ht_flags = 0; 1509 ht->ht_pfn = pfn; 1510 ht->ht_lock_cnt = 0; 1511 ht->ht_valid_cnt = 0; 1512 if (parent != NULL) 1513 ++parent->ht_busy; 1514 1515 h = HTABLE_HASH(hat, base, level); 1516 HTABLE_ENTER(h); 1517 ht->ht_next = hat->hat_ht_hash[h]; 1518 ASSERT(ht->ht_prev == NULL); 1519 if (hat->hat_ht_hash[h]) 1520 hat->hat_ht_hash[h]->ht_prev = ht; 1521 hat->hat_ht_hash[h] = ht; 1522 HTABLE_EXIT(h); 1523 1524 /* 1525 * make sure the page table physical page is not FREE 1526 */ 1527 if (page_resv(1, KM_NOSLEEP) == 0) 1528 panic("page_resv() failed in ptable alloc"); 1529 1530 pp = boot_claim_page(pfn); 1531 ASSERT(pp != NULL); 1532 1533 /* 1534 * Page table pages that were allocated by dboot or 1535 * in very early startup didn't go through boot_mapin() 1536 * and so won't have vnode/offsets. Fix that here. 1537 */ 1538 if (pp->p_vnode == NULL) { 1539 /* match offset calculation in page_get_physical() */ 1540 u_offset_t offset = (uintptr_t)ht; 1541 if (offset > kernelbase) 1542 offset -= kernelbase; 1543 offset <<= MMU_PAGESHIFT; 1544 #if defined(__amd64) 1545 offset += mmu.hole_start; /* something in VA hole */ 1546 #else 1547 offset += 1ULL << 40; /* something > 4 Gig */ 1548 #endif 1549 ASSERT(page_exists(&kvp, offset) == NULL); 1550 (void) page_hashin(pp, &kvp, offset, NULL); 1551 } 1552 page_downgrade(pp); 1553 #if defined(__xpv) && defined(__amd64) 1554 /* 1555 * Record in the page_t that is a pagetable for segkpm setup. 1556 */ 1557 if (kpm_vbase) 1558 pp->p_index = 1; 1559 #endif 1560 1561 /* 1562 * Count valid mappings and recursively attach lower level pagetables. 1563 */ 1564 ptep = kbm_remap_window(pfn_to_pa(pfn), 0); 1565 for (i = 0; i < HTABLE_NUM_PTES(ht); ++i) { 1566 if (mmu.pae_hat) 1567 pte = ptep[i]; 1568 else 1569 pte = ((x86pte32_t *)ptep)[i]; 1570 if (!IN_HYPERVISOR_VA(base) && PTE_ISVALID(pte)) { 1571 ++ht->ht_valid_cnt; 1572 if (!PTE_ISPAGE(pte, level)) { 1573 htable_attach(hat, base, level - 1, 1574 ht, PTE2PFN(pte, level)); 1575 ptep = kbm_remap_window(pfn_to_pa(pfn), 0); 1576 } 1577 } 1578 base += LEVEL_SIZE(level); 1579 if (base == mmu.hole_start) 1580 base = (mmu.hole_end + MMU_PAGEOFFSET) & MMU_PAGEMASK; 1581 } 1582 1583 /* 1584 * As long as all the mappings we had were below kernel base 1585 * we can release the htable. 1586 */ 1587 if (base < kernelbase) 1588 htable_release(ht); 1589 } 1590 1591 /* 1592 * Walk through a given htable looking for the first valid entry. This 1593 * routine takes both a starting and ending address. The starting address 1594 * is required to be within the htable provided by the caller, but there is 1595 * no such restriction on the ending address. 1596 * 1597 * If the routine finds a valid entry in the htable (at or beyond the 1598 * starting address), the PTE (and its address) will be returned. 1599 * This PTE may correspond to either a page or a pagetable - it is the 1600 * caller's responsibility to determine which. 
If no valid entry is 1601 * found, 0 (and invalid PTE) and the next unexamined address will be 1602 * returned. 1603 * 1604 * The loop has been carefully coded for optimization. 1605 */ 1606 static x86pte_t 1607 htable_scan(htable_t *ht, uintptr_t *vap, uintptr_t eaddr) 1608 { 1609 uint_t e; 1610 x86pte_t found_pte = (x86pte_t)0; 1611 caddr_t pte_ptr; 1612 caddr_t end_pte_ptr; 1613 int l = ht->ht_level; 1614 uintptr_t va = *vap & LEVEL_MASK(l); 1615 size_t pgsize = LEVEL_SIZE(l); 1616 1617 ASSERT(va >= ht->ht_vaddr); 1618 ASSERT(va <= HTABLE_LAST_PAGE(ht)); 1619 1620 /* 1621 * Compute the starting index and ending virtual address 1622 */ 1623 e = htable_va2entry(va, ht); 1624 1625 /* 1626 * The following page table scan code knows that the valid 1627 * bit of a PTE is in the lowest byte AND that x86 is little endian!! 1628 */ 1629 pte_ptr = (caddr_t)x86pte_access_pagetable(ht, 0); 1630 end_pte_ptr = (caddr_t)PT_INDEX_PTR(pte_ptr, HTABLE_NUM_PTES(ht)); 1631 pte_ptr = (caddr_t)PT_INDEX_PTR((x86pte_t *)pte_ptr, e); 1632 while (!PTE_ISVALID(*pte_ptr)) { 1633 va += pgsize; 1634 if (va >= eaddr) 1635 break; 1636 pte_ptr += mmu.pte_size; 1637 ASSERT(pte_ptr <= end_pte_ptr); 1638 if (pte_ptr == end_pte_ptr) 1639 break; 1640 } 1641 1642 /* 1643 * if we found a valid PTE, load the entire PTE 1644 */ 1645 if (va < eaddr && pte_ptr != end_pte_ptr) 1646 found_pte = GET_PTE((x86pte_t *)pte_ptr); 1647 x86pte_release_pagetable(ht); 1648 1649 #if defined(__amd64) 1650 /* 1651 * deal with VA hole on amd64 1652 */ 1653 if (l == mmu.max_level && va >= mmu.hole_start && va <= mmu.hole_end) 1654 va = mmu.hole_end + va - mmu.hole_start; 1655 #endif /* __amd64 */ 1656 1657 *vap = va; 1658 return (found_pte); 1659 } 1660 1661 /* 1662 * Find the address and htable for the first populated translation at or 1663 * above the given virtual address. The caller may also specify an upper 1664 * limit to the address range to search. Uses level information to quickly 1665 * skip unpopulated sections of virtual address spaces. 1666 * 1667 * If not found returns NULL. When found, returns the htable and virt addr 1668 * and has a hold on the htable. 1669 */ 1670 x86pte_t 1671 htable_walk( 1672 struct hat *hat, 1673 htable_t **htp, 1674 uintptr_t *vaddr, 1675 uintptr_t eaddr) 1676 { 1677 uintptr_t va = *vaddr; 1678 htable_t *ht; 1679 htable_t *prev = *htp; 1680 level_t l; 1681 level_t max_mapped_level; 1682 x86pte_t pte; 1683 1684 ASSERT(eaddr > va); 1685 1686 /* 1687 * If this is a user address, then we know we need not look beyond 1688 * kernelbase. 1689 */ 1690 ASSERT(hat == kas.a_hat || eaddr <= kernelbase || 1691 eaddr == HTABLE_WALK_TO_END); 1692 if (hat != kas.a_hat && eaddr == HTABLE_WALK_TO_END) 1693 eaddr = kernelbase; 1694 1695 /* 1696 * If we're coming in with a previous page table, search it first 1697 * without doing an htable_lookup(), this should be frequent. 1698 */ 1699 if (prev) { 1700 ASSERT(prev->ht_busy > 0); 1701 ASSERT(prev->ht_vaddr <= va); 1702 l = prev->ht_level; 1703 if (va <= HTABLE_LAST_PAGE(prev)) { 1704 pte = htable_scan(prev, &va, eaddr); 1705 1706 if (PTE_ISPAGE(pte, l)) { 1707 *vaddr = va; 1708 *htp = prev; 1709 return (pte); 1710 } 1711 } 1712 1713 /* 1714 * We found nothing in the htable provided by the caller, 1715 * so fall through and do the full search 1716 */ 1717 htable_release(prev); 1718 } 1719 1720 /* 1721 * Find the level of the largest pagesize used by this HAT. 
1722 */ 1723 if (hat->hat_ism_pgcnt > 0) { 1724 max_mapped_level = mmu.umax_page_level; 1725 } else { 1726 max_mapped_level = 0; 1727 for (l = 1; l <= mmu.max_page_level; ++l) 1728 if (hat->hat_pages_mapped[l] != 0) 1729 max_mapped_level = l; 1730 } 1731 1732 while (va < eaddr && va >= *vaddr) { 1733 /* 1734 * Find lowest table with any entry for given address. 1735 */ 1736 for (l = 0; l <= TOP_LEVEL(hat); ++l) { 1737 ht = htable_lookup(hat, va, l); 1738 if (ht != NULL) { 1739 pte = htable_scan(ht, &va, eaddr); 1740 if (PTE_ISPAGE(pte, l)) { 1741 VERIFY(!IN_VA_HOLE(va)); 1742 *vaddr = va; 1743 *htp = ht; 1744 return (pte); 1745 } 1746 htable_release(ht); 1747 break; 1748 } 1749 1750 /* 1751 * No htable at this level for the address. If there 1752 * is no larger page size that could cover it, we can 1753 * skip right to the start of the next page table. 1754 */ 1755 ASSERT(l < TOP_LEVEL(hat)); 1756 if (l >= max_mapped_level) { 1757 va = NEXT_ENTRY_VA(va, l + 1); 1758 if (va >= eaddr) 1759 break; 1760 } 1761 } 1762 } 1763 1764 *vaddr = 0; 1765 *htp = NULL; 1766 return (0); 1767 } 1768 1769 /* 1770 * Find the htable and page table entry index of the given virtual address 1771 * with pagesize at or below given level. 1772 * If not found returns NULL. When found, returns the htable, sets 1773 * entry, and has a hold on the htable. 1774 */ 1775 htable_t * 1776 htable_getpte( 1777 struct hat *hat, 1778 uintptr_t vaddr, 1779 uint_t *entry, 1780 x86pte_t *pte, 1781 level_t level) 1782 { 1783 htable_t *ht; 1784 level_t l; 1785 uint_t e; 1786 1787 ASSERT(level <= mmu.max_page_level); 1788 1789 for (l = 0; l <= level; ++l) { 1790 ht = htable_lookup(hat, vaddr, l); 1791 if (ht == NULL) 1792 continue; 1793 e = htable_va2entry(vaddr, ht); 1794 if (entry != NULL) 1795 *entry = e; 1796 if (pte != NULL) 1797 *pte = x86pte_get(ht, e); 1798 return (ht); 1799 } 1800 return (NULL); 1801 } 1802 1803 /* 1804 * Find the htable and page table entry index of the given virtual address. 1805 * There must be a valid page mapped at the given address. 1806 * If not found returns NULL. When found, returns the htable, sets 1807 * entry, and has a hold on the htable. 1808 */ 1809 htable_t * 1810 htable_getpage(struct hat *hat, uintptr_t vaddr, uint_t *entry) 1811 { 1812 htable_t *ht; 1813 uint_t e; 1814 x86pte_t pte; 1815 1816 ht = htable_getpte(hat, vaddr, &e, &pte, mmu.max_page_level); 1817 if (ht == NULL) 1818 return (NULL); 1819 1820 if (entry) 1821 *entry = e; 1822 1823 if (PTE_ISPAGE(pte, ht->ht_level)) 1824 return (ht); 1825 htable_release(ht); 1826 return (NULL); 1827 } 1828 1829 1830 void 1831 htable_init() 1832 { 1833 /* 1834 * To save on kernel VA usage, we avoid debug information in 32 bit 1835 * kernels. 
	 */
#if defined(__amd64)
	int kmem_flags = KMC_NOHASH;
#elif defined(__i386)
	int kmem_flags = KMC_NOHASH | KMC_NODEBUG;
#endif

	/*
	 * initialize kmem caches
	 */
	htable_cache = kmem_cache_create("htable_t",
	    sizeof (htable_t), 0, NULL, NULL,
	    htable_reap, NULL, hat_memload_arena, kmem_flags);
}

/*
 * get the pte index for the virtual address in the given htable's pagetable
 */
uint_t
htable_va2entry(uintptr_t va, htable_t *ht)
{
	level_t l = ht->ht_level;

	ASSERT(va >= ht->ht_vaddr);
	ASSERT(va <= HTABLE_LAST_PAGE(ht));
	return ((va >> LEVEL_SHIFT(l)) & (HTABLE_NUM_PTES(ht) - 1));
}

/*
 * Given an htable and the index of a pte in it, return the virtual address
 * of the page.
 */
uintptr_t
htable_e2va(htable_t *ht, uint_t entry)
{
	level_t l = ht->ht_level;
	uintptr_t va;

	ASSERT(entry < HTABLE_NUM_PTES(ht));
	va = ht->ht_vaddr + ((uintptr_t)entry << LEVEL_SHIFT(l));

	/*
	 * Need to skip over any VA hole in top level table
	 */
#if defined(__amd64)
	if (ht->ht_level == mmu.max_level && va >= mmu.hole_start)
		va += ((mmu.hole_end - mmu.hole_start) + 1);
#endif

	return (va);
}

/*
 * The code uses compare and swap instructions to read/write PTE's to
 * avoid atomicity problems, since PTEs can be 8 bytes on 32 bit systems.
 * On 64 bit kernels an aligned load or store of the 8 byte PTE
 * will naturally be atomic.
 *
 * The combination of using kpreempt_disable()/_enable() and the hci_mutex
 * is used to ensure that an interrupt won't overwrite a temporary mapping
 * while it's in use. If an interrupt thread tries to access a PTE, it will
 * yield briefly back to the pinned thread which holds the cpu's hci_mutex.
 */
void
x86pte_cpu_init(cpu_t *cpu)
{
	struct hat_cpu_info *hci;

	hci = kmem_zalloc(sizeof (*hci), KM_SLEEP);
	mutex_init(&hci->hci_mutex, NULL, MUTEX_DEFAULT, NULL);
	cpu->cpu_hat_info = hci;
}

void
x86pte_cpu_fini(cpu_t *cpu)
{
	struct hat_cpu_info *hci = cpu->cpu_hat_info;

	kmem_free(hci, sizeof (*hci));
	cpu->cpu_hat_info = NULL;
}

#ifdef __i386
/*
 * On 32 bit kernels, loading a 64 bit PTE is a little tricky
 */
x86pte_t
get_pte64(x86pte_t *ptr)
{
	volatile uint32_t *p = (uint32_t *)ptr;
	x86pte_t t;

	ASSERT(mmu.pae_hat != 0);
	for (;;) {
		t = p[0];
		t |= (uint64_t)p[1] << 32;
		if ((t & 0xffffffff) == p[0])
			return (t);
	}
}
#endif /* __i386 */

/*
 * Disable preemption and establish a mapping to the pagetable with the
 * given pfn. This is optimized for the case where it's the same
 * pfn we last referenced from this CPU.
 */
static x86pte_t *
x86pte_access_pagetable(htable_t *ht, uint_t index)
{
	/*
	 * VLP pagetables are contained in the hat_t
	 */
	if (ht->ht_flags & HTABLE_VLP)
		return (PT_INDEX_PTR(ht->ht_hat->hat_vlp_ptes, index));
	return (x86pte_mapin(ht->ht_pfn, index, ht));
}

/*
 * map the given pfn into the page table window.
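 * Three cases, handled in order below:
 *   - before khat_running, use the boot remap window from kboot_mmu
 *   - when segkpm is available (kpm_vbase != NULL), use the direct mapping
 *   - otherwise map the pagetable into this CPU's PWIN slot, with
 *     preemption disabled and the CPU's hci_mutex held until the
 *     matching x86pte_mapout()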
 */
/*ARGSUSED*/
x86pte_t *
x86pte_mapin(pfn_t pfn, uint_t index, htable_t *ht)
{
	x86pte_t *pteptr;
	x86pte_t pte = 0;
	x86pte_t newpte;
	int x;

	ASSERT(pfn != PFN_INVALID);

	if (!khat_running) {
		caddr_t va = kbm_remap_window(pfn_to_pa(pfn), 1);
		return (PT_INDEX_PTR(va, index));
	}

	/*
	 * If kpm is available, use it.
	 */
	if (kpm_vbase)
		return (PT_INDEX_PTR(hat_kpm_pfn2va(pfn), index));

	/*
	 * Disable preemption and grab the CPU's hci_mutex
	 */
	kpreempt_disable();
	ASSERT(CPU->cpu_hat_info != NULL);
	mutex_enter(&CPU->cpu_hat_info->hci_mutex);
	x = PWIN_TABLE(CPU->cpu_id);
	pteptr = (x86pte_t *)PWIN_PTE_VA(x);
#ifndef __xpv
	if (mmu.pae_hat)
		pte = *pteptr;
	else
		pte = *(x86pte32_t *)pteptr;
#endif

	newpte = MAKEPTE(pfn, 0) | mmu.pt_global | mmu.pt_nx;

	/*
	 * For hardware we can use a writable mapping.
	 */
#ifdef __xpv
	if (IN_XPV_PANIC())
#endif
		newpte |= PT_WRITABLE;

	if (!PTE_EQUIV(newpte, pte)) {

#ifdef __xpv
		if (!IN_XPV_PANIC()) {
			xen_map(newpte, PWIN_VA(x));
		} else
#endif
		{
			XPV_ALLOW_PAGETABLE_UPDATES();
			if (mmu.pae_hat)
				*pteptr = newpte;
			else
				*(x86pte32_t *)pteptr = newpte;
			XPV_DISALLOW_PAGETABLE_UPDATES();
			mmu_tlbflush_entry((caddr_t)(PWIN_VA(x)));
		}
	}
	return (PT_INDEX_PTR(PWIN_VA(x), index));
}

/*
 * Release access to a page table.
 */
static void
x86pte_release_pagetable(htable_t *ht)
{
	/*
	 * nothing to do for VLP htables
	 */
	if (ht->ht_flags & HTABLE_VLP)
		return;

	x86pte_mapout();
}

void
x86pte_mapout(void)
{
	if (kpm_vbase != NULL || !khat_running)
		return;

	/*
	 * Drop the CPU's hci_mutex and restore preemption.
	 */
#ifdef __xpv
	if (!IN_XPV_PANIC()) {
		uintptr_t va;

		/*
		 * We need to always clear the mapping in case a page
		 * that was once a page table page is ballooned out.
		 */
		va = (uintptr_t)PWIN_VA(PWIN_TABLE(CPU->cpu_id));
		(void) HYPERVISOR_update_va_mapping(va, 0,
		    UVMF_INVLPG | UVMF_LOCAL);
	}
#endif
	mutex_exit(&CPU->cpu_hat_info->hci_mutex);
	kpreempt_enable();
}

/*
 * Atomic retrieval of a pagetable entry
 */
x86pte_t
x86pte_get(htable_t *ht, uint_t entry)
{
	x86pte_t pte;
	x86pte_t *ptep;

	/*
	 * Be careful that loading PAE entries in 32 bit kernel is atomic.
	 */
	ASSERT(entry < mmu.ptes_per_table);
	ptep = x86pte_access_pagetable(ht, entry);
	pte = GET_PTE(ptep);
	x86pte_release_pagetable(ht);
	return (pte);
}

/*
 * Atomic unconditional set of a page table entry, it returns the previous
 * value. For pre-existing mappings if the PFN changes, then we don't care
 * about the old pte's REF / MOD bits. If the PFN remains the same, we leave
 * the MOD/REF bits unchanged.
 *
 * If asked to overwrite a link to a lower page table with a large page
 * mapping, this routine returns the special value of LPAGE_ERROR. This
 * allows the upper HAT layers to retry with a smaller mapping size.
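 * For example, when segmap rebinds a slot to the same pfn, the old
 * mapping's PT_REF/PT_MOD bits are ORed into the new PTE so that REF/MOD
 * updates made through the old mapping are not lost.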
2092 */ 2093 x86pte_t 2094 x86pte_set(htable_t *ht, uint_t entry, x86pte_t new, void *ptr) 2095 { 2096 x86pte_t old; 2097 x86pte_t prev; 2098 x86pte_t *ptep; 2099 level_t l = ht->ht_level; 2100 x86pte_t pfn_mask = (l != 0) ? PT_PADDR_LGPG : PT_PADDR; 2101 x86pte_t n; 2102 uintptr_t addr = htable_e2va(ht, entry); 2103 hat_t *hat = ht->ht_hat; 2104 2105 ASSERT(new != 0); /* don't use to invalidate a PTE, see x86pte_update */ 2106 ASSERT(!(ht->ht_flags & HTABLE_SHARED_PFN)); 2107 if (ptr == NULL) 2108 ptep = x86pte_access_pagetable(ht, entry); 2109 else 2110 ptep = ptr; 2111 2112 /* 2113 * Install the new PTE. If remapping the same PFN, then 2114 * copy existing REF/MOD bits to new mapping. 2115 */ 2116 do { 2117 prev = GET_PTE(ptep); 2118 n = new; 2119 if (PTE_ISVALID(n) && (prev & pfn_mask) == (new & pfn_mask)) 2120 n |= prev & (PT_REF | PT_MOD); 2121 2122 /* 2123 * Another thread may have installed this mapping already, 2124 * flush the local TLB and be done. 2125 */ 2126 if (prev == n) { 2127 old = new; 2128 #ifdef __xpv 2129 if (!IN_XPV_PANIC()) 2130 xen_flush_va((caddr_t)addr); 2131 else 2132 #endif 2133 mmu_tlbflush_entry((caddr_t)addr); 2134 goto done; 2135 } 2136 2137 /* 2138 * Detect if we have a collision of installing a large 2139 * page mapping where there already is a lower page table. 2140 */ 2141 if (l > 0 && (prev & PT_VALID) && !(prev & PT_PAGESIZE)) { 2142 old = LPAGE_ERROR; 2143 goto done; 2144 } 2145 2146 XPV_ALLOW_PAGETABLE_UPDATES(); 2147 old = CAS_PTE(ptep, prev, n); 2148 XPV_DISALLOW_PAGETABLE_UPDATES(); 2149 } while (old != prev); 2150 2151 /* 2152 * Do a TLB demap if needed, ie. the old pte was valid. 2153 * 2154 * Note that a stale TLB writeback to the PTE here either can't happen 2155 * or doesn't matter. The PFN can only change for NOSYNC|NOCONSIST 2156 * mappings, but they were created with REF and MOD already set, so 2157 * no stale writeback will happen. 2158 * 2159 * Segmap is the only place where remaps happen on the same pfn and for 2160 * that we want to preserve the stale REF/MOD bits. 2161 */ 2162 if (old & PT_REF) 2163 hat_tlb_inval(hat, addr); 2164 2165 done: 2166 if (ptr == NULL) 2167 x86pte_release_pagetable(ht); 2168 return (old); 2169 } 2170 2171 /* 2172 * Atomic compare and swap of a page table entry. No TLB invalidates are done. 2173 * This is used for links between pagetables of different levels. 2174 * Note we always create these links with dirty/access set, so they should 2175 * never change. 2176 */ 2177 x86pte_t 2178 x86pte_cas(htable_t *ht, uint_t entry, x86pte_t old, x86pte_t new) 2179 { 2180 x86pte_t pte; 2181 x86pte_t *ptep; 2182 #ifdef __xpv 2183 /* 2184 * We can't use writable pagetables for upper level tables, so fake it. 2185 */ 2186 mmu_update_t t[2]; 2187 int cnt = 1; 2188 int count; 2189 maddr_t ma; 2190 2191 if (!IN_XPV_PANIC()) { 2192 ASSERT(!(ht->ht_flags & HTABLE_VLP)); /* no VLP yet */ 2193 ma = pa_to_ma(PT_INDEX_PHYSADDR(pfn_to_pa(ht->ht_pfn), entry)); 2194 t[0].ptr = ma | MMU_NORMAL_PT_UPDATE; 2195 t[0].val = new; 2196 2197 #if defined(__amd64) 2198 /* 2199 * On the 64-bit hypervisor we need to maintain the user mode 2200 * top page table too. 

/*
 * Atomic compare and swap of a page table entry. No TLB invalidates are done.
 * This is used for links between pagetables of different levels.
 * Note we always create these links with dirty/access set, so they should
 * never change.
 */
x86pte_t
x86pte_cas(htable_t *ht, uint_t entry, x86pte_t old, x86pte_t new)
{
        x86pte_t pte;
        x86pte_t *ptep;
#ifdef __xpv
        /*
         * We can't use writable pagetables for upper level tables, so fake it.
         */
        mmu_update_t t[2];
        int cnt = 1;
        int count;
        maddr_t ma;

        if (!IN_XPV_PANIC()) {
                ASSERT(!(ht->ht_flags & HTABLE_VLP));   /* no VLP yet */
                ma = pa_to_ma(PT_INDEX_PHYSADDR(pfn_to_pa(ht->ht_pfn), entry));
                t[0].ptr = ma | MMU_NORMAL_PT_UPDATE;
                t[0].val = new;

#if defined(__amd64)
                /*
                 * On the 64-bit hypervisor we need to maintain the user mode
                 * top page table too.
                 */
                if (ht->ht_level == mmu.max_level && ht->ht_hat != kas.a_hat) {
                        ma = pa_to_ma(PT_INDEX_PHYSADDR(pfn_to_pa(
                            ht->ht_hat->hat_user_ptable), entry));
                        t[1].ptr = ma | MMU_NORMAL_PT_UPDATE;
                        t[1].val = new;
                        ++cnt;
                }
#endif  /* __amd64 */

                if (HYPERVISOR_mmu_update(t, cnt, &count, DOMID_SELF))
                        panic("HYPERVISOR_mmu_update() failed");
                ASSERT(count == cnt);
                return (old);
        }
#endif
        ptep = x86pte_access_pagetable(ht, entry);
        XPV_ALLOW_PAGETABLE_UPDATES();
        pte = CAS_PTE(ptep, old, new);
        XPV_DISALLOW_PAGETABLE_UPDATES();
        x86pte_release_pagetable(ht);
        return (pte);
}

/*
 * Invalidate a page table entry as long as it currently maps something that
 * matches the value determined by expect.
 *
 * If tlb is set, also invalidates any TLB entries.
 *
 * Returns the previous value of the PTE.
 */
x86pte_t
x86pte_inval(
        htable_t *ht,
        uint_t entry,
        x86pte_t expect,
        x86pte_t *pte_ptr,
        boolean_t tlb)
{
        x86pte_t *ptep;
        x86pte_t oldpte;
        x86pte_t found;

        ASSERT(!(ht->ht_flags & HTABLE_SHARED_PFN));
        ASSERT(ht->ht_level <= mmu.max_page_level);

        if (pte_ptr != NULL)
                ptep = pte_ptr;
        else
                ptep = x86pte_access_pagetable(ht, entry);

#if defined(__xpv)
        /*
         * If exit()ing just use HYPERVISOR_mmu_update(), as we can't be racing
         * with anything else.
         */
        if ((ht->ht_hat->hat_flags & HAT_FREEING) && !IN_XPV_PANIC()) {
                int count;
                mmu_update_t t[1];
                maddr_t ma;

                oldpte = GET_PTE(ptep);
                if (expect != 0 && (oldpte & PT_PADDR) != (expect & PT_PADDR))
                        goto done;
                ma = pa_to_ma(PT_INDEX_PHYSADDR(pfn_to_pa(ht->ht_pfn), entry));
                t[0].ptr = ma | MMU_NORMAL_PT_UPDATE;
                t[0].val = 0;
                if (HYPERVISOR_mmu_update(t, 1, &count, DOMID_SELF))
                        panic("HYPERVISOR_mmu_update() failed");
                ASSERT(count == 1);
                goto done;
        }
#endif /* __xpv */

        /*
         * Note that the loop is needed to handle changes due to h/w updating
         * of PT_MOD/PT_REF.
         */
        do {
                oldpte = GET_PTE(ptep);
                if (expect != 0 && (oldpte & PT_PADDR) != (expect & PT_PADDR))
                        goto done;
                XPV_ALLOW_PAGETABLE_UPDATES();
                found = CAS_PTE(ptep, oldpte, 0);
                XPV_DISALLOW_PAGETABLE_UPDATES();
        } while (found != oldpte);
        if (tlb && (oldpte & (PT_REF | PT_MOD)))
                hat_tlb_inval(ht->ht_hat, htable_e2va(ht, entry));

done:
        if (pte_ptr == NULL)
                x86pte_release_pagetable(ht);
        return (oldpte);
}
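
/*
 * Illustrative sketch of a hypothetical caller: tearing down a mapping only
 * if it still refers to the expected page and shooting down any stale TLB
 * entries in the same call. What the caller does with the returned REF/MOD
 * state is assumed, not shown.
 *
 *	oldpte = x86pte_inval(ht, entry, expect, NULL, B_TRUE);
 *	if (oldpte & PT_MOD)
 *		... record that the page was modified ...
 */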

/*
 * Change a page table entry if it currently matches the value in expect.
 */
x86pte_t
x86pte_update(
        htable_t *ht,
        uint_t entry,
        x86pte_t expect,
        x86pte_t new)
{
        x86pte_t *ptep;
        x86pte_t found;

        ASSERT(new != 0);
        ASSERT(!(ht->ht_flags & HTABLE_SHARED_PFN));
        ASSERT(ht->ht_level <= mmu.max_page_level);

        ptep = x86pte_access_pagetable(ht, entry);
        XPV_ALLOW_PAGETABLE_UPDATES();
        found = CAS_PTE(ptep, expect, new);
        XPV_DISALLOW_PAGETABLE_UPDATES();
        if (found == expect) {
                hat_tlb_inval(ht->ht_hat, htable_e2va(ht, entry));

                /*
                 * When removing write permission *and* clearing the
                 * MOD bit, check if a write happened via a stale
                 * TLB entry before the TLB shootdown finished.
                 *
                 * If it did happen, simply re-enable write permission and
                 * act like the original CAS failed.
                 */
                if ((expect & (PT_WRITABLE | PT_MOD)) == PT_WRITABLE &&
                    (new & (PT_WRITABLE | PT_MOD)) == 0 &&
                    (GET_PTE(ptep) & PT_MOD) != 0) {
                        do {
                                found = GET_PTE(ptep);
                                XPV_ALLOW_PAGETABLE_UPDATES();
                                found =
                                    CAS_PTE(ptep, found, found | PT_WRITABLE);
                                XPV_DISALLOW_PAGETABLE_UPDATES();
                        } while ((found & PT_WRITABLE) == 0);
                }
        }
        x86pte_release_pagetable(ht);
        return (found);
}

#ifndef __xpv
/*
 * Copy page tables - this is just a little more complicated than the
 * previous routines. Note that it's also not atomic! It is also never
 * used for VLP pagetables.
 */
void
x86pte_copy(htable_t *src, htable_t *dest, uint_t entry, uint_t count)
{
        caddr_t src_va;
        caddr_t dst_va;
        size_t size;
        x86pte_t *pteptr;
        x86pte_t pte;

        ASSERT(khat_running);
        ASSERT(!(dest->ht_flags & HTABLE_VLP));
        ASSERT(!(src->ht_flags & HTABLE_VLP));
        ASSERT(!(src->ht_flags & HTABLE_SHARED_PFN));
        ASSERT(!(dest->ht_flags & HTABLE_SHARED_PFN));

        /*
         * Acquire access to the CPU pagetable windows for the dest and source.
         */
        dst_va = (caddr_t)x86pte_access_pagetable(dest, entry);
        if (kpm_vbase) {
                src_va = (caddr_t)
                    PT_INDEX_PTR(hat_kpm_pfn2va(src->ht_pfn), entry);
        } else {
                uint_t x = PWIN_SRC(CPU->cpu_id);

                /*
                 * Finish defining the src pagetable mapping
                 */
                src_va = (caddr_t)PT_INDEX_PTR(PWIN_VA(x), entry);
                pte = MAKEPTE(src->ht_pfn, 0) | mmu.pt_global | mmu.pt_nx;
                pteptr = (x86pte_t *)PWIN_PTE_VA(x);
                if (mmu.pae_hat)
                        *pteptr = pte;
                else
                        *(x86pte32_t *)pteptr = pte;
                mmu_tlbflush_entry((caddr_t)(PWIN_VA(x)));
        }

        /*
         * now do the copy
         */
        size = count << mmu.pte_size_shift;
        bcopy(src_va, dst_va, size);

        x86pte_release_pagetable(dest);
}

#else /* __xpv */

/*
 * The hypervisor only supports writable pagetables at level 0, so we have
 * to install these one at a time the slow way.
 */
void
x86pte_copy(htable_t *src, htable_t *dest, uint_t entry, uint_t count)
{
        caddr_t src_va;
        x86pte_t pte;

        ASSERT(!IN_XPV_PANIC());
        src_va = (caddr_t)x86pte_access_pagetable(src, entry);
        while (count) {
                if (mmu.pae_hat)
                        pte = *(x86pte_t *)src_va;
                else
                        pte = *(x86pte32_t *)src_va;
                if (pte != 0) {
                        set_pteval(pfn_to_pa(dest->ht_pfn), entry,
                            dest->ht_level, pte);
#ifdef __amd64
                        if (dest->ht_level == mmu.max_level &&
                            htable_e2va(dest, entry) < HYPERVISOR_VIRT_END)
                                set_pteval(
                                    pfn_to_pa(dest->ht_hat->hat_user_ptable),
                                    entry, dest->ht_level, pte);
#endif
                }
                --count;
                ++entry;
                src_va += mmu.pte_size;
        }
        x86pte_release_pagetable(src);
}
#endif /* __xpv */
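
/*
 * Illustrative sketch of a hypothetical caller: duplicating a run of entries
 * from one pagetable into another of the same level, for example to seed a
 * newly created table. "src_ht" and "dst_ht" are assumed to be valid htables
 * for tables of equal level.
 *
 *	x86pte_copy(src_ht, dst_ht, start_entry, num_entries);
 */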

/*
 * Zero page table entries - Note this doesn't use atomic stores!
 */
static void
x86pte_zero(htable_t *dest, uint_t entry, uint_t count)
{
        caddr_t dst_va;
        size_t size;
#ifdef __xpv
        int x;
        x86pte_t newpte;
#endif

        /*
         * Map in the page table to be zeroed.
         */
        ASSERT(!(dest->ht_flags & HTABLE_SHARED_PFN));
        ASSERT(!(dest->ht_flags & HTABLE_VLP));

        /*
         * On the hypervisor we don't use x86pte_access_pagetable() since
         * in this case the page is not pinned yet.
         */
#ifdef __xpv
        if (kpm_vbase == NULL) {
                kpreempt_disable();
                ASSERT(CPU->cpu_hat_info != NULL);
                mutex_enter(&CPU->cpu_hat_info->hci_mutex);
                x = PWIN_TABLE(CPU->cpu_id);
                newpte = MAKEPTE(dest->ht_pfn, 0) | PT_WRITABLE;
                xen_map(newpte, PWIN_VA(x));
                dst_va = (caddr_t)PT_INDEX_PTR(PWIN_VA(x), entry);
        } else
#endif
                dst_va = (caddr_t)x86pte_access_pagetable(dest, entry);

        size = count << mmu.pte_size_shift;
        ASSERT(size > BLOCKZEROALIGN);
#ifdef __i386
        if (!is_x86_feature(x86_featureset, X86FSET_SSE2))
                bzero(dst_va, size);
        else
#endif
                block_zero_no_xmm(dst_va, size);

#ifdef __xpv
        if (kpm_vbase == NULL) {
                xen_map(0, PWIN_VA(x));
                mutex_exit(&CPU->cpu_hat_info->hci_mutex);
                kpreempt_enable();
        } else
#endif
                x86pte_release_pagetable(dest);
}

/*
 * Called to ensure that all pagetables are in the system dump
 */
void
hat_dump(void)
{
        hat_t *hat;
        uint_t h;
        htable_t *ht;

        /*
         * Dump all page tables
         */
        for (hat = kas.a_hat; hat != NULL; hat = hat->hat_next) {
                for (h = 0; h < hat->hat_num_hash; ++h) {
                        for (ht = hat->hat_ht_hash[h]; ht; ht = ht->ht_next) {
                                if ((ht->ht_flags & HTABLE_VLP) == 0)
                                        dump_page(ht->ht_pfn);
                        }
                }
        }
}
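
/*
 * Illustrative sketch of a hypothetical caller: clearing every entry of a
 * pagetable, for example before the table is linked into the htable tree,
 * so stale entries are never visible. "new_ht" is assumed to be a freshly
 * allocated htable.
 *
 *	x86pte_zero(new_ht, 0, mmu.ptes_per_table);
 */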