// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * MMU support
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *   Avi Kivity   <avi@qumranet.com>
 */

#include "irq.h"
#include "ioapic.h"
#include "mmu.h"
#include "mmu_internal.h"
#include "tdp_mmu.h"
#include "x86.h"
#include "kvm_cache_regs.h"
#include "kvm_emulate.h"
#include "cpuid.h"
#include "spte.h"

#include <linux/kvm_host.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/moduleparam.h>
#include <linux/export.h>
#include <linux/swap.h>
#include <linux/hugetlb.h>
#include <linux/compiler.h>
#include <linux/srcu.h>
#include <linux/slab.h>
#include <linux/sched/signal.h>
#include <linux/uaccess.h>
#include <linux/hash.h>
#include <linux/kern_levels.h>
#include <linux/kthread.h>

#include <asm/page.h>
#include <asm/memtype.h>
#include <asm/cmpxchg.h>
#include <asm/io.h>
#include <asm/set_memory.h>
#include <asm/vmx.h>
#include <asm/kvm_page_track.h>
#include "trace.h"

#include "paging.h"

extern bool itlb_multihit_kvm_mitigation;

int __read_mostly nx_huge_pages = -1;
static uint __read_mostly nx_huge_pages_recovery_period_ms;
#ifdef CONFIG_PREEMPT_RT
/* Recovery can cause latency spikes, disable it for PREEMPT_RT. */
static uint __read_mostly nx_huge_pages_recovery_ratio = 0;
#else
static uint __read_mostly nx_huge_pages_recovery_ratio = 60;
#endif

static int set_nx_huge_pages(const char *val, const struct kernel_param *kp);
static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp);

static const struct kernel_param_ops nx_huge_pages_ops = {
	.set = set_nx_huge_pages,
	.get = param_get_bool,
};

static const struct kernel_param_ops nx_huge_pages_recovery_param_ops = {
	.set = set_nx_huge_pages_recovery_param,
	.get = param_get_uint,
};

module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644);
__MODULE_PARM_TYPE(nx_huge_pages, "bool");
module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_param_ops,
		&nx_huge_pages_recovery_ratio, 0644);
__MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint");
module_param_cb(nx_huge_pages_recovery_period_ms, &nx_huge_pages_recovery_param_ops,
		&nx_huge_pages_recovery_period_ms, 0644);
__MODULE_PARM_TYPE(nx_huge_pages_recovery_period_ms, "uint");

static bool __read_mostly force_flush_and_sync_on_reuse;
module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644);

/*
 * When setting this variable to true it enables Two-Dimensional-Paging
 * where the hardware walks 2 page tables:
 * 1. the guest-virtual to guest-physical
 * 2. while doing 1. it walks guest-physical to host-physical
 * If the hardware supports that we don't need to do shadow paging.
 */
bool tdp_enabled = false;

static int max_huge_page_level __read_mostly;
static int tdp_root_level __read_mostly;
static int max_tdp_level __read_mostly;

enum {
	AUDIT_PRE_PAGE_FAULT,
	AUDIT_POST_PAGE_FAULT,
	AUDIT_PRE_PTE_WRITE,
	AUDIT_POST_PTE_WRITE,
	AUDIT_PRE_SYNC,
	AUDIT_POST_SYNC
};

#ifdef MMU_DEBUG
bool dbg = 0;
module_param(dbg, bool, 0644);
#endif

#define PTE_PREFETCH_NUM	8

#define PT32_LEVEL_BITS 10

#define PT32_LEVEL_SHIFT(level) \
		(PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)

#define PT32_LVL_OFFSET_MASK(level) \
	(PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
						* PT32_LEVEL_BITS))) - 1))

#define PT32_INDEX(address, level) \
	(((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))


#define PT32_BASE_ADDR_MASK PAGE_MASK
#define PT32_DIR_BASE_ADDR_MASK \
	(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
#define PT32_LVL_ADDR_MASK(level) \
	(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
					    * PT32_LEVEL_BITS))) - 1))

#include <trace/events/kvm.h>

/* make pte_list_desc fit well in cache lines */
#define PTE_LIST_EXT 14

/*
 * Slight optimization of cacheline layout, by putting `more' and `spte_count'
 * at the start; then accessing it will only use one single cacheline for
 * either the full (entries==PTE_LIST_EXT) case or entries<=6.
 */
struct pte_list_desc {
	struct pte_list_desc *more;
	/*
	 * Stores the number of entries stored in the pte_list_desc.  No need
	 * to be u64 but just for easier alignment.  When PTE_LIST_EXT, means
	 * full.
	 */
	u64 spte_count;
	u64 *sptes[PTE_LIST_EXT];
};

struct kvm_shadow_walk_iterator {
	u64 addr;
	hpa_t shadow_addr;
	u64 *sptep;
	int level;
	unsigned index;
};

#define for_each_shadow_entry_using_root(_vcpu, _root, _addr, _walker)	\
	for (shadow_walk_init_using_root(&(_walker), (_vcpu),		\
					 (_root), (_addr));		\
	     shadow_walk_okay(&(_walker));				\
	     shadow_walk_next(&(_walker)))

#define for_each_shadow_entry(_vcpu, _addr, _walker)		\
	for (shadow_walk_init(&(_walker), _vcpu, _addr);	\
	     shadow_walk_okay(&(_walker));			\
	     shadow_walk_next(&(_walker)))

#define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte)	\
	for (shadow_walk_init(&(_walker), _vcpu, _addr);		\
	     shadow_walk_okay(&(_walker)) &&				\
		({ spte = mmu_spte_get_lockless(_walker.sptep); 1; });	\
	     __shadow_walk_next(&(_walker), spte))

static struct kmem_cache *pte_list_desc_cache;
struct kmem_cache *mmu_page_header_cache;
static struct percpu_counter kvm_total_used_mmu_pages;

static void mmu_spte_set(u64 *sptep, u64 spte);
static union kvm_mmu_page_role
kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu);

struct kvm_mmu_role_regs {
	const unsigned long cr0;
	const unsigned long cr4;
	const u64 efer;
};

#define CREATE_TRACE_POINTS
#include "mmutrace.h"

/*
 * Yes, lots of underscores.  They're a hint that you probably shouldn't be
 * reading from the role_regs.  Once the mmu_role is constructed, it becomes
 * the single source of truth for the MMU's state.
 */
#define BUILD_MMU_ROLE_REGS_ACCESSOR(reg, name, flag)			\
static inline bool __maybe_unused ____is_##reg##_##name(struct kvm_mmu_role_regs *regs)\
{									\
	return !!(regs->reg & flag);					\
}
BUILD_MMU_ROLE_REGS_ACCESSOR(cr0, pg, X86_CR0_PG);
BUILD_MMU_ROLE_REGS_ACCESSOR(cr0, wp, X86_CR0_WP);
BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pse, X86_CR4_PSE);
BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pae, X86_CR4_PAE);
BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smep, X86_CR4_SMEP);
BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smap, X86_CR4_SMAP);
BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pke, X86_CR4_PKE);
BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, la57, X86_CR4_LA57);
BUILD_MMU_ROLE_REGS_ACCESSOR(efer, nx, EFER_NX);
BUILD_MMU_ROLE_REGS_ACCESSOR(efer, lma, EFER_LMA);

/*
 * The MMU itself (with a valid role) is the single source of truth for the
 * MMU.  Do not use the regs used to build the MMU/role, nor the vCPU.  The
 * regs don't account for dependencies, e.g. clearing CR4 bits if CR0.PG=1,
 * and the vCPU may be incorrect/irrelevant.
 */
#define BUILD_MMU_ROLE_ACCESSOR(base_or_ext, reg, name)			\
static inline bool __maybe_unused is_##reg##_##name(struct kvm_mmu *mmu)	\
{									\
	return !!(mmu->mmu_role. base_or_ext . reg##_##name);		\
}
BUILD_MMU_ROLE_ACCESSOR(ext,  cr0, pg);
BUILD_MMU_ROLE_ACCESSOR(base, cr0, wp);
BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, pse);
BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, pae);
BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, smep);
BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, smap);
BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, pke);
BUILD_MMU_ROLE_ACCESSOR(ext,  cr4, la57);
BUILD_MMU_ROLE_ACCESSOR(base, efer, nx);

static struct kvm_mmu_role_regs vcpu_to_role_regs(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu_role_regs regs = {
		.cr0 = kvm_read_cr0_bits(vcpu, KVM_MMU_CR0_ROLE_BITS),
		.cr4 = kvm_read_cr4_bits(vcpu, KVM_MMU_CR4_ROLE_BITS),
		.efer = vcpu->arch.efer,
	};

	return regs;
}

static int role_regs_to_root_level(struct kvm_mmu_role_regs *regs)
{
	if (!____is_cr0_pg(regs))
		return 0;
	else if (____is_efer_lma(regs))
		return ____is_cr4_la57(regs) ?
		       PT64_ROOT_5LEVEL :
		       PT64_ROOT_4LEVEL;
	else if (____is_cr4_pae(regs))
		return PT32E_ROOT_LEVEL;
	else
		return PT32_ROOT_LEVEL;
}

static inline bool kvm_available_flush_tlb_with_range(void)
{
	return kvm_x86_ops.tlb_remote_flush_with_range;
}

static void kvm_flush_remote_tlbs_with_range(struct kvm *kvm,
		struct kvm_tlb_range *range)
{
	int ret = -ENOTSUPP;

	if (range && kvm_x86_ops.tlb_remote_flush_with_range)
		ret = static_call(kvm_x86_tlb_remote_flush_with_range)(kvm, range);

	if (ret)
		kvm_flush_remote_tlbs(kvm);
}

void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
		u64 start_gfn, u64 pages)
{
	struct kvm_tlb_range range;

	range.start_gfn = start_gfn;
	range.pages = pages;

	kvm_flush_remote_tlbs_with_range(kvm, &range);
}

static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
			   unsigned int access)
{
	u64 spte = make_mmio_spte(vcpu, gfn, access);

	trace_mark_mmio_spte(sptep, gfn, spte);
	mmu_spte_set(sptep, spte);
}

static gfn_t get_mmio_spte_gfn(u64 spte)
{
	u64 gpa = spte & shadow_nonpresent_or_rsvd_lower_gfn_mask;

	gpa |= (spte >> SHADOW_NONPRESENT_OR_RSVD_MASK_LEN)
	       & shadow_nonpresent_or_rsvd_mask;

	return gpa >> PAGE_SHIFT;
}

static unsigned get_mmio_spte_access(u64 spte)
{
	return spte & shadow_mmio_access_mask;
}

static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
{
	u64 kvm_gen, spte_gen, gen;

	gen = kvm_vcpu_memslots(vcpu)->generation;
	if (unlikely(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS))
		return false;

	kvm_gen = gen & MMIO_SPTE_GEN_MASK;
	spte_gen = get_mmio_spte_generation(spte);

	trace_check_mmio_spte(spte, kvm_gen, spte_gen);
	return likely(kvm_gen == spte_gen);
}

static int is_cpuid_PSE36(void)
{
	return 1;
}

static gfn_t pse36_gfn_delta(u32 gpte)
{
	int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;

	return (gpte & PT32_DIR_PSE36_MASK) << shift;
}

#ifdef CONFIG_X86_64
static void __set_spte(u64 *sptep, u64 spte)
{
	WRITE_ONCE(*sptep, spte);
}

static void __update_clear_spte_fast(u64 *sptep, u64 spte)
{
	WRITE_ONCE(*sptep, spte);
}

static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
{
	return xchg(sptep, spte);
}

static u64 __get_spte_lockless(u64 *sptep)
{
	return READ_ONCE(*sptep);
}
#else
union split_spte {
	struct {
		u32 spte_low;
		u32 spte_high;
	};
	u64 spte;
};

static void count_spte_clear(u64 *sptep, u64 spte)
{
	struct kvm_mmu_page *sp = sptep_to_sp(sptep);

	if (is_shadow_present_pte(spte))
		return;

	/* Ensure the spte is completely set before we increase the count */
	smp_wmb();
	sp->clear_spte_count++;
}

static void __set_spte(u64 *sptep, u64 spte)
{
	union split_spte *ssptep, sspte;

	ssptep = (union split_spte *)sptep;
	sspte = (union split_spte)spte;

	ssptep->spte_high = sspte.spte_high;

	/*
	 * If we map the spte from nonpresent to present, we should store
	 * the high bits first and only then set the present bit, so the CPU
	 * cannot fetch this spte while we are setting it.
	 */
	smp_wmb();

	WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
}

static void __update_clear_spte_fast(u64 *sptep, u64 spte)
{
	union split_spte *ssptep, sspte;

	ssptep = (union split_spte *)sptep;
	sspte = (union split_spte)spte;

	WRITE_ONCE(ssptep->spte_low, sspte.spte_low);

	/*
	 * If we map the spte from present to nonpresent, we should clear
	 * the present bit first to avoid the vCPU fetching the old high bits.
	 */
	smp_wmb();

	ssptep->spte_high = sspte.spte_high;
	count_spte_clear(sptep, spte);
}

static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
{
	union split_spte *ssptep, sspte, orig;

	ssptep = (union split_spte *)sptep;
	sspte = (union split_spte)spte;

	/* xchg acts as a barrier before the setting of the high bits */
	orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low);
	orig.spte_high = ssptep->spte_high;
	ssptep->spte_high = sspte.spte_high;
	count_spte_clear(sptep, spte);

	return orig.spte;
}

/*
 * The idea of getting the spte in this lightweight way on x86_32 comes from
 * gup_get_pte (mm/gup.c).
 *
 * An spte tlb flush may be pending, because kvm_set_pte_rmapp
 * coalesces them and we are running outside of the MMU lock.  Therefore
 * we need to protect against in-progress updates of the spte.
 *
 * Reading the spte while an update is in progress may get the old value
 * for the high part of the spte.  The race is fine for a present->non-present
 * change (because the high part of the spte is ignored for non-present sptes),
 * but for a present->present change we must reread the spte.
 *
 * All such changes are done in two steps (present->non-present and
 * non-present->present), hence it is enough to count the number of
 * present->non-present updates: if it changed while reading the spte,
 * we might have hit the race.  This is done using clear_spte_count.
 */
static u64 __get_spte_lockless(u64 *sptep)
{
	struct kvm_mmu_page *sp = sptep_to_sp(sptep);
	union split_spte spte, *orig = (union split_spte *)sptep;
	int count;

retry:
	count = sp->clear_spte_count;
	smp_rmb();

	spte.spte_low = orig->spte_low;
	smp_rmb();

	spte.spte_high = orig->spte_high;
	smp_rmb();

	if (unlikely(spte.spte_low != orig->spte_low ||
		     count != sp->clear_spte_count))
		goto retry;

	return spte.spte;
}
#endif

static bool spte_has_volatile_bits(u64 spte)
{
	if (!is_shadow_present_pte(spte))
		return false;

	/*
	 * Always atomically update an spte that can be updated out of
	 * mmu-lock: this ensures the dirty bit is not lost, and also gives
	 * us a stable is_writable_pte() so that a needed TLB flush is not
	 * missed.
	 */
	if (spte_can_locklessly_be_made_writable(spte) ||
	    is_access_track_spte(spte))
		return true;

	if (spte_ad_enabled(spte)) {
		if ((spte & shadow_accessed_mask) == 0 ||
		    (is_writable_pte(spte) && (spte & shadow_dirty_mask) == 0))
			return true;
	}

	return false;
}

/* Rules for using mmu_spte_set:
 * Set the sptep from nonpresent to present.
 * Note: the sptep being assigned *must* be either not present
 * or in a state where the hardware will not attempt to update
 * the spte.
 */
static void mmu_spte_set(u64 *sptep, u64 new_spte)
{
	WARN_ON(is_shadow_present_pte(*sptep));
	__set_spte(sptep, new_spte);
}

/*
 * Update the SPTE (excluding the PFN), but do not track changes in its
 * accessed/dirty status.
 */
static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
{
	u64 old_spte = *sptep;

	WARN_ON(!is_shadow_present_pte(new_spte));

	if (!is_shadow_present_pte(old_spte)) {
		mmu_spte_set(sptep, new_spte);
		return old_spte;
	}

	if (!spte_has_volatile_bits(old_spte))
		__update_clear_spte_fast(sptep, new_spte);
	else
		old_spte = __update_clear_spte_slow(sptep, new_spte);

	WARN_ON(spte_to_pfn(old_spte) != spte_to_pfn(new_spte));

	return old_spte;
}

/* Rules for using mmu_spte_update:
 * Update the state bits; the mapped pfn is not changed.
 *
 * Whenever we overwrite a writable spte with a read-only one we
 * should flush remote TLBs.  Otherwise rmap_write_protect
 * will find a read-only spte, even though the writable spte
 * might be cached on a CPU's TLB; the return value indicates this
 * case.
 *
 * Returns true if the TLB needs to be flushed
 */
static bool mmu_spte_update(u64 *sptep, u64 new_spte)
{
	bool flush = false;
	u64 old_spte = mmu_spte_update_no_track(sptep, new_spte);

	if (!is_shadow_present_pte(old_spte))
		return false;

	/*
	 * An spte updated out of mmu-lock is safe, since we always
	 * atomically update it; see the comments in
	 * spte_has_volatile_bits().
	 */
	if (spte_can_locklessly_be_made_writable(old_spte) &&
	    !is_writable_pte(new_spte))
		flush = true;

	/*
	 * Flush TLB when accessed/dirty states are changed in the page tables,
	 * to guarantee consistency between TLB and page tables.
	 */

	if (is_accessed_spte(old_spte) && !is_accessed_spte(new_spte)) {
		flush = true;
		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
	}

	if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte)) {
		flush = true;
		kvm_set_pfn_dirty(spte_to_pfn(old_spte));
	}

	return flush;
}

/*
 * Rules for using mmu_spte_clear_track_bits:
 * It sets the sptep from present to nonpresent, and tracks the
 * state bits; it is used to clear the last level sptep.
 * Returns the old PTE.
 */
static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep)
{
	kvm_pfn_t pfn;
	u64 old_spte = *sptep;
	int level = sptep_to_sp(sptep)->role.level;

	if (!spte_has_volatile_bits(old_spte))
		__update_clear_spte_fast(sptep, 0ull);
	else
		old_spte = __update_clear_spte_slow(sptep, 0ull);

	if (!is_shadow_present_pte(old_spte))
		return old_spte;

	kvm_update_page_stats(kvm, level, -1);

	pfn = spte_to_pfn(old_spte);

	/*
	 * KVM does not hold a refcount on the page used by the KVM MMU;
	 * before the page is reclaimed, it must be unmapped from the MMU
	 * first.
	 */
	WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn)));

	if (is_accessed_spte(old_spte))
		kvm_set_pfn_accessed(pfn);

	if (is_dirty_spte(old_spte))
		kvm_set_pfn_dirty(pfn);

	return old_spte;
}

/*
 * Rules for using mmu_spte_clear_no_track:
 * Directly clear the spte without caring about the state bits of the sptep;
 * it is used to set the upper level spte.
 */
static void mmu_spte_clear_no_track(u64 *sptep)
{
	__update_clear_spte_fast(sptep, 0ull);
}

static u64 mmu_spte_get_lockless(u64 *sptep)
{
	return __get_spte_lockless(sptep);
}

/* Restore an acc-track PTE back to a regular PTE */
static u64 restore_acc_track_spte(u64 spte)
{
	u64 new_spte = spte;
	u64 saved_bits = (spte >> SHADOW_ACC_TRACK_SAVED_BITS_SHIFT)
			 & SHADOW_ACC_TRACK_SAVED_BITS_MASK;

	WARN_ON_ONCE(spte_ad_enabled(spte));
	WARN_ON_ONCE(!is_access_track_spte(spte));

	new_spte &= ~shadow_acc_track_mask;
	new_spte &= ~(SHADOW_ACC_TRACK_SAVED_BITS_MASK <<
		      SHADOW_ACC_TRACK_SAVED_BITS_SHIFT);
	new_spte |= saved_bits;

	return new_spte;
}

/* Returns the Accessed status of the PTE and resets it at the same time. */
static bool mmu_spte_age(u64 *sptep)
{
	u64 spte = mmu_spte_get_lockless(sptep);

	if (!is_accessed_spte(spte))
		return false;

	if (spte_ad_enabled(spte)) {
		clear_bit((ffs(shadow_accessed_mask) - 1),
			  (unsigned long *)sptep);
	} else {
		/*
		 * Capture the dirty status of the page, so that it doesn't get
		 * lost when the SPTE is marked for access tracking.
		 */
		if (is_writable_pte(spte))
			kvm_set_pfn_dirty(spte_to_pfn(spte));

		spte = mark_spte_for_access_track(spte);
		mmu_spte_update_no_track(sptep, spte);
	}

	return true;
}

static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
{
	if (is_tdp_mmu(vcpu->arch.mmu)) {
		kvm_tdp_mmu_walk_lockless_begin();
	} else {
		/*
		 * Prevent page table teardown by making any free-er wait during
		 * kvm_flush_remote_tlbs() IPI to all active vcpus.
		 */
		local_irq_disable();

		/*
		 * Make sure a following spte read is not reordered ahead of the write
		 * to vcpu->mode.
		 */
		smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES);
	}
}

static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
{
	if (is_tdp_mmu(vcpu->arch.mmu)) {
		kvm_tdp_mmu_walk_lockless_end();
	} else {
		/*
		 * Make sure the write to vcpu->mode is not reordered in front of
		 * reads to sptes.  If it does, kvm_mmu_commit_zap_page() can see us
		 * OUTSIDE_GUEST_MODE and proceed to free the shadow page table.
		 */
		smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE);
		local_irq_enable();
	}
}

static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
{
	int r;

	/* 1 rmap, 1 parent PTE per level, and the prefetched rmaps. */
	r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
				       1 + PT64_ROOT_MAX_LEVEL + PTE_PREFETCH_NUM);
	if (r)
		return r;
	r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
				       PT64_ROOT_MAX_LEVEL);
	if (r)
		return r;
	if (maybe_indirect) {
		r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_gfn_array_cache,
					       PT64_ROOT_MAX_LEVEL);
		if (r)
			return r;
	}
	return kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
					  PT64_ROOT_MAX_LEVEL);
}

static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
{
	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache);
	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache);
	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_gfn_array_cache);
	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
}

static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu)
{
	return kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache);
}

static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
{
	kmem_cache_free(pte_list_desc_cache, pte_list_desc);
}

static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
{
	if (!sp->role.direct)
		return sp->gfns[index];

	return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS));
}

static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
{
	if (!sp->role.direct) {
		sp->gfns[index] = gfn;
		return;
	}

	if (WARN_ON(gfn != kvm_mmu_page_get_gfn(sp, index)))
		pr_err_ratelimited("gfn mismatch under direct page %llx "
				   "(expected %llx, got %llx)\n",
				   sp->gfn,
				   kvm_mmu_page_get_gfn(sp, index), gfn);
}

/*
 * Return the pointer to the large page information for a given gfn,
 * handling slots that are not large page aligned.
 */
static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
		const struct kvm_memory_slot *slot, int level)
{
	unsigned long idx;

	idx = gfn_to_index(gfn, slot->base_gfn, level);
	return &slot->arch.lpage_info[level - 2][idx];
}

static void update_gfn_disallow_lpage_count(const struct kvm_memory_slot *slot,
					    gfn_t gfn, int count)
{
	struct kvm_lpage_info *linfo;
	int i;

	for (i = PG_LEVEL_2M; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
		linfo = lpage_info_slot(gfn, slot, i);
		linfo->disallow_lpage += count;
		WARN_ON(linfo->disallow_lpage < 0);
	}
}

void kvm_mmu_gfn_disallow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn)
{
	update_gfn_disallow_lpage_count(slot, gfn, 1);
}

void kvm_mmu_gfn_allow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn)
{
	update_gfn_disallow_lpage_count(slot, gfn, -1);
}

static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *slot;
	gfn_t gfn;

	kvm->arch.indirect_shadow_pages++;
	gfn = sp->gfn;
	slots = kvm_memslots_for_spte_role(kvm, sp->role);
	slot = __gfn_to_memslot(slots, gfn);

	/* the non-leaf shadow pages are kept read-only. */
	if (sp->role.level > PG_LEVEL_4K)
		return kvm_slot_page_track_add_page(kvm, slot, gfn,
						    KVM_PAGE_TRACK_WRITE);

	kvm_mmu_gfn_disallow_lpage(slot, gfn);
}

void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	if (sp->lpage_disallowed)
		return;

	++kvm->stat.nx_lpage_splits;
	list_add_tail(&sp->lpage_disallowed_link,
		      &kvm->arch.lpage_disallowed_mmu_pages);
	sp->lpage_disallowed = true;
}

static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *slot;
	gfn_t gfn;

	kvm->arch.indirect_shadow_pages--;
	gfn = sp->gfn;
	slots = kvm_memslots_for_spte_role(kvm, sp->role);
	slot = __gfn_to_memslot(slots, gfn);
	if (sp->role.level > PG_LEVEL_4K)
		return kvm_slot_page_track_remove_page(kvm, slot, gfn,
						       KVM_PAGE_TRACK_WRITE);

	kvm_mmu_gfn_allow_lpage(slot, gfn);
}

void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	--kvm->stat.nx_lpage_splits;
	sp->lpage_disallowed = false;
	list_del(&sp->lpage_disallowed_link);
}

static struct kvm_memory_slot *
gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
			    bool no_dirty_log)
{
	struct kvm_memory_slot *slot;

	slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
	if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
		return NULL;
	if (no_dirty_log && kvm_slot_dirty_track_enabled(slot))
		return NULL;

	return slot;
}

/*
 * About rmap_head encoding:
 *
 * If the bit zero of rmap_head->val is clear, then it points to the only spte
 * in this rmap chain.  Otherwise, (rmap_head->val & ~1) points to a struct
 * pte_list_desc containing more mappings.
 */

/*
 * Returns the number of pointers in the rmap chain, not counting the new one.
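 * (For example: adding an SPTE to an empty rmap_head returns 0, and adding a
 * second SPTE to that rmap_head returns 1.)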
 */
static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
			struct kvm_rmap_head *rmap_head)
{
	struct pte_list_desc *desc;
	int count = 0;

	if (!rmap_head->val) {
		rmap_printk("%p %llx 0->1\n", spte, *spte);
		rmap_head->val = (unsigned long)spte;
	} else if (!(rmap_head->val & 1)) {
		rmap_printk("%p %llx 1->many\n", spte, *spte);
		desc = mmu_alloc_pte_list_desc(vcpu);
		desc->sptes[0] = (u64 *)rmap_head->val;
		desc->sptes[1] = spte;
		desc->spte_count = 2;
		rmap_head->val = (unsigned long)desc | 1;
		++count;
	} else {
		rmap_printk("%p %llx many->many\n", spte, *spte);
		desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
		while (desc->spte_count == PTE_LIST_EXT) {
			count += PTE_LIST_EXT;
			if (!desc->more) {
				desc->more = mmu_alloc_pte_list_desc(vcpu);
				desc = desc->more;
				desc->spte_count = 0;
				break;
			}
			desc = desc->more;
		}
		count += desc->spte_count;
		desc->sptes[desc->spte_count++] = spte;
	}
	return count;
}

static void
pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
			   struct pte_list_desc *desc, int i,
			   struct pte_list_desc *prev_desc)
{
	int j = desc->spte_count - 1;

	desc->sptes[i] = desc->sptes[j];
	desc->sptes[j] = NULL;
	desc->spte_count--;
	if (desc->spte_count)
		return;
	if (!prev_desc && !desc->more)
		rmap_head->val = 0;
	else
		if (prev_desc)
			prev_desc->more = desc->more;
		else
			rmap_head->val = (unsigned long)desc->more | 1;
	mmu_free_pte_list_desc(desc);
}

static void __pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
{
	struct pte_list_desc *desc;
	struct pte_list_desc *prev_desc;
	int i;

	if (!rmap_head->val) {
		pr_err("%s: %p 0->BUG\n", __func__, spte);
		BUG();
	} else if (!(rmap_head->val & 1)) {
		rmap_printk("%p 1->0\n", spte);
		if ((u64 *)rmap_head->val != spte) {
			pr_err("%s: %p 1->BUG\n", __func__, spte);
			BUG();
		}
		rmap_head->val = 0;
	} else {
		rmap_printk("%p many->many\n", spte);
		desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
		prev_desc = NULL;
		while (desc) {
			for (i = 0; i < desc->spte_count; ++i) {
				if (desc->sptes[i] == spte) {
					pte_list_desc_remove_entry(rmap_head,
							desc, i, prev_desc);
					return;
				}
			}
			prev_desc = desc;
			desc = desc->more;
		}
		pr_err("%s: %p many->many\n", __func__, spte);
		BUG();
	}
}

static void pte_list_remove(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
			    u64 *sptep)
{
	mmu_spte_clear_track_bits(kvm, sptep);
	__pte_list_remove(sptep, rmap_head);
}

/* Return true if rmap existed, false otherwise */
static bool pte_list_destroy(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
{
	struct pte_list_desc *desc, *next;
	int i;

	if (!rmap_head->val)
		return false;

	if (!(rmap_head->val & 1)) {
		mmu_spte_clear_track_bits(kvm, (u64 *)rmap_head->val);
		goto out;
	}

	desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);

	for (; desc; desc = next) {
		for (i = 0; i < desc->spte_count; i++)
			mmu_spte_clear_track_bits(kvm, desc->sptes[i]);
		next = desc->more;
		mmu_free_pte_list_desc(desc);
	}
out:
	/* rmap_head is meaningless now, remember to reset it */
	rmap_head->val = 0;
	return true;
}

unsigned int pte_list_count(struct kvm_rmap_head *rmap_head)
{
	struct pte_list_desc *desc;
	unsigned int count = 0;

	if (!rmap_head->val)
		return 0;
	else if (!(rmap_head->val & 1))
		return 1;

	desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);

	while (desc) {
		count += desc->spte_count;
		desc = desc->more;
	}

	return count;
}

static struct kvm_rmap_head *gfn_to_rmap(gfn_t gfn, int level,
					 const struct kvm_memory_slot *slot)
{
	unsigned long idx;

	idx = gfn_to_index(gfn, slot->base_gfn, level);
	return &slot->arch.rmap[level - PG_LEVEL_4K][idx];
}

static bool rmap_can_add(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu_memory_cache *mc;

	mc = &vcpu->arch.mmu_pte_list_desc_cache;
	return kvm_mmu_memory_cache_nr_free_objects(mc);
}

static void rmap_remove(struct kvm *kvm, u64 *spte)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *slot;
	struct kvm_mmu_page *sp;
	gfn_t gfn;
	struct kvm_rmap_head *rmap_head;

	sp = sptep_to_sp(spte);
	gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);

	/*
	 * Unlike rmap_add, rmap_remove does not run in the context of a vCPU
	 * so we have to determine which memslots to use based on context
	 * information in sp->role.
	 */
	slots = kvm_memslots_for_spte_role(kvm, sp->role);

	slot = __gfn_to_memslot(slots, gfn);
	rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);

	__pte_list_remove(spte, rmap_head);
}

/*
 * Used by the following functions to iterate through the sptes linked by a
 * rmap.  All fields are private and not assumed to be used outside.
 */
struct rmap_iterator {
	/* private fields */
	struct pte_list_desc *desc;	/* holds the sptep if not NULL */
	int pos;			/* index of the sptep */
};

/*
 * Iteration must be started by this function.  This should also be used after
 * removing/dropping sptes from the rmap link because in such cases the
 * information in the iterator may not be valid.
 *
 * Returns sptep if found, NULL otherwise.
 */
static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head,
			   struct rmap_iterator *iter)
{
	u64 *sptep;

	if (!rmap_head->val)
		return NULL;

	if (!(rmap_head->val & 1)) {
		iter->desc = NULL;
		sptep = (u64 *)rmap_head->val;
		goto out;
	}

	iter->desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
	iter->pos = 0;
	sptep = iter->desc->sptes[iter->pos];
out:
	BUG_ON(!is_shadow_present_pte(*sptep));
	return sptep;
}

/*
 * Must be used with a valid iterator: e.g. after rmap_get_first().
 *
 * Returns sptep if found, NULL otherwise.
 */
static u64 *rmap_get_next(struct rmap_iterator *iter)
{
	u64 *sptep;

	if (iter->desc) {
		if (iter->pos < PTE_LIST_EXT - 1) {
			++iter->pos;
			sptep = iter->desc->sptes[iter->pos];
			if (sptep)
				goto out;
		}

		iter->desc = iter->desc->more;

		if (iter->desc) {
			iter->pos = 0;
			/* desc->sptes[0] cannot be NULL */
			sptep = iter->desc->sptes[iter->pos];
			goto out;
		}
	}

	return NULL;
out:
	BUG_ON(!is_shadow_present_pte(*sptep));
	return sptep;
}

#define for_each_rmap_spte(_rmap_head_, _iter_, _spte_)			\
	for (_spte_ = rmap_get_first(_rmap_head_, _iter_);		\
	     _spte_; _spte_ = rmap_get_next(_iter_))

static void drop_spte(struct kvm *kvm, u64 *sptep)
{
	u64 old_spte = mmu_spte_clear_track_bits(kvm, sptep);

	if (is_shadow_present_pte(old_spte))
		rmap_remove(kvm, sptep);
}


static bool __drop_large_spte(struct kvm *kvm, u64 *sptep)
{
	if (is_large_pte(*sptep)) {
		WARN_ON(sptep_to_sp(sptep)->role.level == PG_LEVEL_4K);
		drop_spte(kvm, sptep);
		return true;
	}

	return false;
}

static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
{
	if (__drop_large_spte(vcpu->kvm, sptep)) {
		struct kvm_mmu_page *sp = sptep_to_sp(sptep);

		kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
			KVM_PAGES_PER_HPAGE(sp->role.level));
	}
}

/*
 * Write-protect on the specified @sptep, @pt_protect indicates whether
 * spte write-protection is caused by protecting a shadow page table.
 *
 * Note: write protection is different between dirty logging and spte
 * protection:
 * - for dirty logging, the spte can be set to writable at anytime if
 *   its dirty bitmap is properly set.
 * - for spte protection, the spte can be writable only after unsync-ing
 *   the shadow page.
 *
 * Return true if the TLB needs to be flushed
 */
static bool spte_write_protect(u64 *sptep, bool pt_protect)
{
	u64 spte = *sptep;

	if (!is_writable_pte(spte) &&
	    !(pt_protect && spte_can_locklessly_be_made_writable(spte)))
		return false;

	rmap_printk("spte %p %llx\n", sptep, *sptep);

	if (pt_protect)
		spte &= ~shadow_mmu_writable_mask;
	spte = spte & ~PT_WRITABLE_MASK;

	return mmu_spte_update(sptep, spte);
}

static bool __rmap_write_protect(struct kvm *kvm,
				 struct kvm_rmap_head *rmap_head,
				 bool pt_protect)
{
	u64 *sptep;
	struct rmap_iterator iter;
	bool flush = false;

	for_each_rmap_spte(rmap_head, &iter, sptep)
		flush |= spte_write_protect(sptep, pt_protect);

	return flush;
}

static bool spte_clear_dirty(u64 *sptep)
{
	u64 spte = *sptep;

	rmap_printk("spte %p %llx\n", sptep, *sptep);

	MMU_WARN_ON(!spte_ad_enabled(spte));
	spte &= ~shadow_dirty_mask;
	return mmu_spte_update(sptep, spte);
}

static bool spte_wrprot_for_clear_dirty(u64 *sptep)
{
	bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT,
					       (unsigned long *)sptep);
	if (was_writable && !spte_ad_enabled(*sptep))
		kvm_set_pfn_dirty(spte_to_pfn(*sptep));

	return was_writable;
}

/*
 * Gets the GFN ready for another round of dirty logging by clearing the
 *	- D bit on ad-enabled SPTEs, and
 *	- W bit on ad-disabled SPTEs.
 * Returns true iff any D or W bits were cleared.
 */
static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
			       const struct kvm_memory_slot *slot)
{
	u64 *sptep;
	struct rmap_iterator iter;
	bool flush = false;

	for_each_rmap_spte(rmap_head, &iter, sptep)
		if (spte_ad_need_write_protect(*sptep))
			flush |= spte_wrprot_for_clear_dirty(sptep);
		else
			flush |= spte_clear_dirty(sptep);

	return flush;
}

/**
 * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages
 * @kvm: kvm instance
 * @slot: slot to protect
 * @gfn_offset: start of the BITS_PER_LONG pages we care about
 * @mask: indicates which pages we should protect
 *
 * Used when we do not need to care about huge page mappings.
 */
static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
				     struct kvm_memory_slot *slot,
				     gfn_t gfn_offset, unsigned long mask)
{
	struct kvm_rmap_head *rmap_head;

	if (is_tdp_mmu_enabled(kvm))
		kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
				slot->base_gfn + gfn_offset, mask, true);

	if (!kvm_memslots_have_rmaps(kvm))
		return;

	while (mask) {
		rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
					PG_LEVEL_4K, slot);
		__rmap_write_protect(kvm, rmap_head, false);

		/* clear the first set bit */
		mask &= mask - 1;
	}
}

/**
 * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages, or write
 * protect the page if the D-bit isn't supported.
 * @kvm: kvm instance
 * @slot: slot to clear D-bit
 * @gfn_offset: start of the BITS_PER_LONG pages we care about
 * @mask: indicates which pages we should clear D-bit
 *
 * Used for PML to re-log the dirty GPAs after userspace querying dirty_bitmap.
 */
static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
					  struct kvm_memory_slot *slot,
					  gfn_t gfn_offset, unsigned long mask)
{
	struct kvm_rmap_head *rmap_head;

	if (is_tdp_mmu_enabled(kvm))
		kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
				slot->base_gfn + gfn_offset, mask, false);

	if (!kvm_memslots_have_rmaps(kvm))
		return;

	while (mask) {
		rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
					PG_LEVEL_4K, slot);
		__rmap_clear_dirty(kvm, rmap_head, slot);

		/* clear the first set bit */
		mask &= mask - 1;
	}
}

/**
 * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
 * PT level pages.
 *
 * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
 * enable dirty logging for them.
 *
 * We need to care about huge page mappings: e.g. during dirty logging we may
 * have such mappings.
 */
void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
				struct kvm_memory_slot *slot,
				gfn_t gfn_offset, unsigned long mask)
{
	/*
	 * Huge pages are NOT write protected when we start dirty logging in
	 * initially-all-set mode; must write protect them here so that they
	 * are split to 4K on the first write.
	 *
	 * The gfn_offset is guaranteed to be aligned to 64, but the base_gfn
	 * of memslot has no such restriction, so the range can cross two large
	 * pages.
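	 *
	 * (Illustrative example, not from the original comment: with
	 * base_gfn = 0x1ff and gfn_offset = 0, a fully-set 64-bit mask covers
	 * gfns 0x1ff..0x23e, which straddles the 2M boundary at gfn 0x200 and
	 * therefore triggers the second write-protect call below.)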
	 */
	if (kvm_dirty_log_manual_protect_and_init_set(kvm)) {
		gfn_t start = slot->base_gfn + gfn_offset + __ffs(mask);
		gfn_t end = slot->base_gfn + gfn_offset + __fls(mask);

		kvm_mmu_slot_gfn_write_protect(kvm, slot, start, PG_LEVEL_2M);

		/* Cross two large pages? */
		if (ALIGN(start << PAGE_SHIFT, PMD_SIZE) !=
		    ALIGN(end << PAGE_SHIFT, PMD_SIZE))
			kvm_mmu_slot_gfn_write_protect(kvm, slot, end,
						       PG_LEVEL_2M);
	}

	/* Now handle 4K PTEs.  */
	if (kvm_x86_ops.cpu_dirty_log_size)
		kvm_mmu_clear_dirty_pt_masked(kvm, slot, gfn_offset, mask);
	else
		kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
}

int kvm_cpu_dirty_log_size(void)
{
	return kvm_x86_ops.cpu_dirty_log_size;
}

bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
				    struct kvm_memory_slot *slot, u64 gfn,
				    int min_level)
{
	struct kvm_rmap_head *rmap_head;
	int i;
	bool write_protected = false;

	if (kvm_memslots_have_rmaps(kvm)) {
		for (i = min_level; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
			rmap_head = gfn_to_rmap(gfn, i, slot);
			write_protected |= __rmap_write_protect(kvm, rmap_head, true);
		}
	}

	if (is_tdp_mmu_enabled(kvm))
		write_protected |=
			kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn, min_level);

	return write_protected;
}

static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
{
	struct kvm_memory_slot *slot;

	slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
	return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn, PG_LEVEL_4K);
}

static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
			  const struct kvm_memory_slot *slot)
{
	return pte_list_destroy(kvm, rmap_head);
}

static bool kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
			    struct kvm_memory_slot *slot, gfn_t gfn, int level,
			    pte_t unused)
{
	return kvm_zap_rmapp(kvm, rmap_head, slot);
}

static bool kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
			      struct kvm_memory_slot *slot, gfn_t gfn, int level,
			      pte_t pte)
{
	u64 *sptep;
	struct rmap_iterator iter;
	bool need_flush = false;
	u64 new_spte;
	kvm_pfn_t new_pfn;

	WARN_ON(pte_huge(pte));
	new_pfn = pte_pfn(pte);

restart:
	for_each_rmap_spte(rmap_head, &iter, sptep) {
		rmap_printk("spte %p %llx gfn %llx (%d)\n",
			    sptep, *sptep, gfn, level);

		need_flush = true;

		if (pte_write(pte)) {
			pte_list_remove(kvm, rmap_head, sptep);
			goto restart;
		} else {
			new_spte = kvm_mmu_changed_pte_notifier_make_spte(
					*sptep, new_pfn);

			mmu_spte_clear_track_bits(kvm, sptep);
			mmu_spte_set(sptep, new_spte);
		}
	}

	if (need_flush && kvm_available_flush_tlb_with_range()) {
		kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
		return false;
	}

	return need_flush;
}

struct slot_rmap_walk_iterator {
	/* input fields. */
	const struct kvm_memory_slot *slot;
	gfn_t start_gfn;
	gfn_t end_gfn;
	int start_level;
	int end_level;

	/* output fields. */
	gfn_t gfn;
	struct kvm_rmap_head *rmap;
	int level;

	/* private field. */
	struct kvm_rmap_head *end_rmap;
};

static void
rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level)
{
	iterator->level = level;
	iterator->gfn = iterator->start_gfn;
	iterator->rmap = gfn_to_rmap(iterator->gfn, level, iterator->slot);
	iterator->end_rmap = gfn_to_rmap(iterator->end_gfn, level, iterator->slot);
}

static void
slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator,
		    const struct kvm_memory_slot *slot, int start_level,
		    int end_level, gfn_t start_gfn, gfn_t end_gfn)
{
	iterator->slot = slot;
	iterator->start_level = start_level;
	iterator->end_level = end_level;
	iterator->start_gfn = start_gfn;
	iterator->end_gfn = end_gfn;

	rmap_walk_init_level(iterator, iterator->start_level);
}

static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator)
{
	return !!iterator->rmap;
}

static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
{
	if (++iterator->rmap <= iterator->end_rmap) {
		iterator->gfn += (1UL << KVM_HPAGE_GFN_SHIFT(iterator->level));
		return;
	}

	if (++iterator->level > iterator->end_level) {
		iterator->rmap = NULL;
		return;
	}

	rmap_walk_init_level(iterator, iterator->level);
}

#define for_each_slot_rmap_range(_slot_, _start_level_, _end_level_,	\
	   _start_gfn, _end_gfn, _iter_)				\
	for (slot_rmap_walk_init(_iter_, _slot_, _start_level_,		\
				 _end_level_, _start_gfn, _end_gfn);	\
	     slot_rmap_walk_okay(_iter_);				\
	     slot_rmap_walk_next(_iter_))

typedef bool (*rmap_handler_t)(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
			       struct kvm_memory_slot *slot, gfn_t gfn,
			       int level, pte_t pte);

static __always_inline bool kvm_handle_gfn_range(struct kvm *kvm,
						 struct kvm_gfn_range *range,
						 rmap_handler_t handler)
{
	struct slot_rmap_walk_iterator iterator;
	bool ret = false;

	for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
				 range->start, range->end - 1, &iterator)
		ret |= handler(kvm, iterator.rmap, range->slot, iterator.gfn,
			       iterator.level, range->pte);

	return ret;
}

bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
	bool flush = false;

	if (kvm_memslots_have_rmaps(kvm))
		flush = kvm_handle_gfn_range(kvm, range, kvm_unmap_rmapp);

	if (is_tdp_mmu_enabled(kvm))
		flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush);

	return flush;
}

bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	bool flush = false;

	if (kvm_memslots_have_rmaps(kvm))
		flush = kvm_handle_gfn_range(kvm, range, kvm_set_pte_rmapp);

	if (is_tdp_mmu_enabled(kvm))
		flush |= kvm_tdp_mmu_set_spte_gfn(kvm, range);

	return flush;
}

static bool kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
			  struct kvm_memory_slot *slot, gfn_t gfn, int level,
			  pte_t unused)
{
	u64 *sptep;
	struct rmap_iterator iter;
	int young = 0;

	for_each_rmap_spte(rmap_head, &iter, sptep)
		young |= mmu_spte_age(sptep);

	return young;
}

static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
			       struct kvm_memory_slot *slot, gfn_t gfn,
			       int level, pte_t unused)
{
	u64 *sptep;
	struct rmap_iterator iter;

	for_each_rmap_spte(rmap_head, &iter, sptep)
		if (is_accessed_spte(*sptep))
			return true;
	return false;
}

#define RMAP_RECYCLE_THRESHOLD 1000

static void rmap_add(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
		     u64 *spte, gfn_t gfn)
{
	struct kvm_mmu_page *sp;
	struct kvm_rmap_head *rmap_head;
	int rmap_count;

	sp = sptep_to_sp(spte);
	kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
	rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);
	rmap_count = pte_list_add(vcpu, spte, rmap_head);

	if (rmap_count > RMAP_RECYCLE_THRESHOLD) {
		kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, __pte(0));
		kvm_flush_remote_tlbs_with_address(
				vcpu->kvm, sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level));
	}
}

bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	bool young = false;

	if (kvm_memslots_have_rmaps(kvm))
		young = kvm_handle_gfn_range(kvm, range, kvm_age_rmapp);

	if (is_tdp_mmu_enabled(kvm))
		young |= kvm_tdp_mmu_age_gfn_range(kvm, range);

	return young;
}

bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	bool young = false;

	if (kvm_memslots_have_rmaps(kvm))
		young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmapp);

	if (is_tdp_mmu_enabled(kvm))
		young |= kvm_tdp_mmu_test_age_gfn(kvm, range);

	return young;
}

#ifdef MMU_DEBUG
static int is_empty_shadow_page(u64 *spt)
{
	u64 *pos;
	u64 *end;

	for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
		if (is_shadow_present_pte(*pos)) {
			printk(KERN_ERR "%s: %p %llx\n", __func__,
			       pos, *pos);
			return 0;
		}
	return 1;
}
#endif

/*
 * This value is the sum of all of the kvm instances'
 * kvm->arch.n_used_mmu_pages values.
 * We need a global, aggregate version in order to make the slab shrinker
 * faster.
 */
static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, long nr)
{
	kvm->arch.n_used_mmu_pages += nr;
	percpu_counter_add(&kvm_total_used_mmu_pages, nr);
}

static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
{
	MMU_WARN_ON(!is_empty_shadow_page(sp->spt));
	hlist_del(&sp->hash_link);
	list_del(&sp->link);
	free_page((unsigned long)sp->spt);
	if (!sp->role.direct)
		free_page((unsigned long)sp->gfns);
	kmem_cache_free(mmu_page_header_cache, sp);
}

static unsigned kvm_page_table_hashfn(gfn_t gfn)
{
	return hash_64(gfn, KVM_MMU_HASH_SHIFT);
}

static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
				    struct kvm_mmu_page *sp, u64 *parent_pte)
{
	if (!parent_pte)
		return;

	pte_list_add(vcpu, parent_pte, &sp->parent_ptes);
}

static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
				       u64 *parent_pte)
{
	__pte_list_remove(parent_pte, &sp->parent_ptes);
}

static void drop_parent_pte(struct kvm_mmu_page *sp,
			    u64 *parent_pte)
{
	mmu_page_remove_parent_pte(sp, parent_pte);
	mmu_spte_clear_no_track(parent_pte);
}

static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct)
{
	struct kvm_mmu_page *sp;

	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
	if (!direct)
		sp->gfns = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_gfn_array_cache);
	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);

	/*
	 * active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages()
	 * depends on valid pages being added to the head of the list.  See
	 * comments in kvm_zap_obsolete_pages().
	 */
	sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
	list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
	kvm_mod_used_mmu_pages(vcpu->kvm, +1);
	return sp;
}

static void mark_unsync(u64 *spte);
static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
{
	u64 *sptep;
	struct rmap_iterator iter;

	for_each_rmap_spte(&sp->parent_ptes, &iter, sptep) {
		mark_unsync(sptep);
	}
}

static void mark_unsync(u64 *spte)
{
	struct kvm_mmu_page *sp;
	unsigned int index;

	sp = sptep_to_sp(spte);
	index = spte - sp->spt;
	if (__test_and_set_bit(index, sp->unsync_child_bitmap))
		return;
	if (sp->unsync_children++)
		return;
	kvm_mmu_mark_parents_unsync(sp);
}

static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
			       struct kvm_mmu_page *sp)
{
	return -1;
}

#define KVM_PAGE_ARRAY_NR 16

struct kvm_mmu_pages {
	struct mmu_page_and_offset {
		struct kvm_mmu_page *sp;
		unsigned int idx;
	} page[KVM_PAGE_ARRAY_NR];
	unsigned int nr;
};

static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
			 int idx)
{
	int i;

	if (sp->unsync)
		for (i = 0; i < pvec->nr; i++)
			if (pvec->page[i].sp == sp)
				return 0;

	pvec->page[pvec->nr].sp = sp;
	pvec->page[pvec->nr].idx = idx;
	pvec->nr++;
	return (pvec->nr == KVM_PAGE_ARRAY_NR);
}

static inline void clear_unsync_child_bit(struct kvm_mmu_page *sp, int idx)
{
	--sp->unsync_children;
	WARN_ON((int)sp->unsync_children < 0);
	__clear_bit(idx, sp->unsync_child_bitmap);
}

static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
			     struct kvm_mmu_pages *pvec)
{
	int i, ret, nr_unsync_leaf = 0;

	for_each_set_bit(i, sp->unsync_child_bitmap, 512) {
		struct kvm_mmu_page *child;
		u64 ent = sp->spt[i];

		if (!is_shadow_present_pte(ent) || is_large_pte(ent)) {
			clear_unsync_child_bit(sp, i);
			continue;
		}

		child = to_shadow_page(ent & PT64_BASE_ADDR_MASK);

		if (child->unsync_children) {
			if (mmu_pages_add(pvec, child, i))
				return -ENOSPC;

			ret = __mmu_unsync_walk(child, pvec);
			if (!ret) {
				clear_unsync_child_bit(sp, i);
				continue;
			} else if (ret > 0) {
				nr_unsync_leaf += ret;
			} else
				return ret;
		} else if (child->unsync) {
			nr_unsync_leaf++;
			if (mmu_pages_add(pvec, child, i))
				return -ENOSPC;
		} else
			clear_unsync_child_bit(sp, i);
	}

	return nr_unsync_leaf;
}

#define INVALID_INDEX (-1)

static int mmu_unsync_walk(struct kvm_mmu_page *sp,
			   struct kvm_mmu_pages *pvec)
{
	pvec->nr = 0;
	if (!sp->unsync_children)
		return 0;

	mmu_pages_add(pvec, sp, INVALID_INDEX);
	return __mmu_unsync_walk(sp, pvec);
}

static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	WARN_ON(!sp->unsync);
	trace_kvm_mmu_sync_page(sp);
	sp->unsync = 0;
	--kvm->stat.mmu_unsync;
}

static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
				     struct list_head *invalid_list);
static void kvm_mmu_commit_zap_page(struct kvm *kvm,
				    struct list_head *invalid_list);

#define for_each_valid_sp(_kvm, _sp, _list)				\
	hlist_for_each_entry(_sp, _list, hash_link)			\
		if (is_obsolete_sp((_kvm), (_sp))) {			\
		} else

#define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn)			\
	for_each_valid_sp(_kvm, _sp,					\
	  &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)])	\
		if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else

static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
			  struct list_head *invalid_list)
{
	int ret = vcpu->arch.mmu->sync_page(vcpu, sp);

	if (ret < 0) {
		kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
		return false;
	}

	return !!ret;
}

static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm,
					struct list_head *invalid_list,
					bool remote_flush)
{
	if (!remote_flush && list_empty(invalid_list))
		return false;

	if (!list_empty(invalid_list))
		kvm_mmu_commit_zap_page(kvm, invalid_list);
	else
		kvm_flush_remote_tlbs(kvm);
	return true;
}

#ifdef CONFIG_KVM_MMU_AUDIT
#include "mmu_audit.c"
#else
static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { }
static void mmu_audit_disable(void) { }
#endif

static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	if (sp->role.invalid)
		return true;

	/* TDP MMU pages do not use the MMU generation. */
	return !sp->tdp_mmu_page &&
	       unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
}

struct mmu_page_path {
	struct kvm_mmu_page *parent[PT64_ROOT_MAX_LEVEL];
	unsigned int idx[PT64_ROOT_MAX_LEVEL];
};

#define for_each_sp(pvec, sp, parents, i)				\
		for (i = mmu_pages_first(&pvec, &parents);		\
			i < pvec.nr && ({ sp = pvec.page[i].sp; 1;});	\
			i = mmu_pages_next(&pvec, &parents, i))

static int mmu_pages_next(struct kvm_mmu_pages *pvec,
			  struct mmu_page_path *parents,
			  int i)
{
	int n;

	for (n = i+1; n < pvec->nr; n++) {
		struct kvm_mmu_page *sp = pvec->page[n].sp;
		unsigned idx = pvec->page[n].idx;
		int level = sp->role.level;

		parents->idx[level-1] = idx;
		if (level == PG_LEVEL_4K)
			break;

		parents->parent[level-2] = sp;
	}

	return n;
}

static int mmu_pages_first(struct kvm_mmu_pages *pvec,
			   struct mmu_page_path *parents)
{
	struct kvm_mmu_page *sp;
	int level;

	if (pvec->nr == 0)
		return 0;

	WARN_ON(pvec->page[0].idx != INVALID_INDEX);

	sp = pvec->page[0].sp;
	level = sp->role.level;
	WARN_ON(level == PG_LEVEL_4K);

	parents->parent[level-2] = sp;

	/* Also set up a sentinel.  Further entries in pvec are all
	 * children of sp, so this element is never overwritten.
1991 */ 1992 parents->parent[level-1] = NULL; 1993 return mmu_pages_next(pvec, parents, 0); 1994 } 1995 1996 static void mmu_pages_clear_parents(struct mmu_page_path *parents) 1997 { 1998 struct kvm_mmu_page *sp; 1999 unsigned int level = 0; 2000 2001 do { 2002 unsigned int idx = parents->idx[level]; 2003 sp = parents->parent[level]; 2004 if (!sp) 2005 return; 2006 2007 WARN_ON(idx == INVALID_INDEX); 2008 clear_unsync_child_bit(sp, idx); 2009 level++; 2010 } while (!sp->unsync_children); 2011 } 2012 2013 static int mmu_sync_children(struct kvm_vcpu *vcpu, 2014 struct kvm_mmu_page *parent, bool can_yield) 2015 { 2016 int i; 2017 struct kvm_mmu_page *sp; 2018 struct mmu_page_path parents; 2019 struct kvm_mmu_pages pages; 2020 LIST_HEAD(invalid_list); 2021 bool flush = false; 2022 2023 while (mmu_unsync_walk(parent, &pages)) { 2024 bool protected = false; 2025 2026 for_each_sp(pages, sp, parents, i) 2027 protected |= rmap_write_protect(vcpu, sp->gfn); 2028 2029 if (protected) { 2030 kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, true); 2031 flush = false; 2032 } 2033 2034 for_each_sp(pages, sp, parents, i) { 2035 kvm_unlink_unsync_page(vcpu->kvm, sp); 2036 flush |= kvm_sync_page(vcpu, sp, &invalid_list); 2037 mmu_pages_clear_parents(&parents); 2038 } 2039 if (need_resched() || rwlock_needbreak(&vcpu->kvm->mmu_lock)) { 2040 kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush); 2041 if (!can_yield) { 2042 kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); 2043 return -EINTR; 2044 } 2045 2046 cond_resched_rwlock_write(&vcpu->kvm->mmu_lock); 2047 flush = false; 2048 } 2049 } 2050 2051 kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush); 2052 return 0; 2053 } 2054 2055 static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp) 2056 { 2057 atomic_set(&sp->write_flooding_count, 0); 2058 } 2059 2060 static void clear_sp_write_flooding_count(u64 *spte) 2061 { 2062 __clear_sp_write_flooding_count(sptep_to_sp(spte)); 2063 } 2064 2065 static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, 2066 gfn_t gfn, 2067 gva_t gaddr, 2068 unsigned level, 2069 int direct, 2070 unsigned int access) 2071 { 2072 bool direct_mmu = vcpu->arch.mmu->direct_map; 2073 union kvm_mmu_page_role role; 2074 struct hlist_head *sp_list; 2075 unsigned quadrant; 2076 struct kvm_mmu_page *sp; 2077 int collisions = 0; 2078 LIST_HEAD(invalid_list); 2079 2080 role = vcpu->arch.mmu->mmu_role.base; 2081 role.level = level; 2082 role.direct = direct; 2083 role.access = access; 2084 if (role.has_4_byte_gpte) { 2085 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); 2086 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; 2087 role.quadrant = quadrant; 2088 } 2089 2090 sp_list = &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]; 2091 for_each_valid_sp(vcpu->kvm, sp, sp_list) { 2092 if (sp->gfn != gfn) { 2093 collisions++; 2094 continue; 2095 } 2096 2097 if (sp->role.word != role.word) { 2098 /* 2099 * If the guest is creating an upper-level page, zap 2100 * unsync pages for the same gfn. While it's possible 2101 * the guest is using recursive page tables, in all 2102 * likelihood the guest has stopped using the unsync 2103 * page and is installing a completely unrelated page. 2104 * Unsync pages must not be left as is, because the new 2105 * upper-level page will be write-protected. 
2106 */ 2107 if (level > PG_LEVEL_4K && sp->unsync) 2108 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, 2109 &invalid_list); 2110 continue; 2111 } 2112 2113 if (direct_mmu) 2114 goto trace_get_page; 2115 2116 if (sp->unsync) { 2117 /* 2118 * The page is good, but is stale. kvm_sync_page does 2119 * get the latest guest state, but (unlike mmu_unsync_children) 2120 * it doesn't write-protect the page or mark it synchronized! 2121 * This way the validity of the mapping is ensured, but the 2122 * overhead of write protection is not incurred until the 2123 * guest invalidates the TLB mapping. This allows multiple 2124 * SPs for a single gfn to be unsync. 2125 * 2126 * If the sync fails, the page is zapped. If so, break 2127 * in order to rebuild it. 2128 */ 2129 if (!kvm_sync_page(vcpu, sp, &invalid_list)) 2130 break; 2131 2132 WARN_ON(!list_empty(&invalid_list)); 2133 kvm_flush_remote_tlbs(vcpu->kvm); 2134 } 2135 2136 __clear_sp_write_flooding_count(sp); 2137 2138 trace_get_page: 2139 trace_kvm_mmu_get_page(sp, false); 2140 goto out; 2141 } 2142 2143 ++vcpu->kvm->stat.mmu_cache_miss; 2144 2145 sp = kvm_mmu_alloc_page(vcpu, direct); 2146 2147 sp->gfn = gfn; 2148 sp->role = role; 2149 hlist_add_head(&sp->hash_link, sp_list); 2150 if (!direct) { 2151 account_shadowed(vcpu->kvm, sp); 2152 if (level == PG_LEVEL_4K && rmap_write_protect(vcpu, gfn)) 2153 kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, 1); 2154 } 2155 trace_kvm_mmu_get_page(sp, true); 2156 out: 2157 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); 2158 2159 if (collisions > vcpu->kvm->stat.max_mmu_page_hash_collisions) 2160 vcpu->kvm->stat.max_mmu_page_hash_collisions = collisions; 2161 return sp; 2162 } 2163 2164 static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterator, 2165 struct kvm_vcpu *vcpu, hpa_t root, 2166 u64 addr) 2167 { 2168 iterator->addr = addr; 2169 iterator->shadow_addr = root; 2170 iterator->level = vcpu->arch.mmu->shadow_root_level; 2171 2172 if (iterator->level >= PT64_ROOT_4LEVEL && 2173 vcpu->arch.mmu->root_level < PT64_ROOT_4LEVEL && 2174 !vcpu->arch.mmu->direct_map) 2175 iterator->level = PT32E_ROOT_LEVEL; 2176 2177 if (iterator->level == PT32E_ROOT_LEVEL) { 2178 /* 2179 * prev_root is currently only used for 64-bit hosts. So only 2180 * the active root_hpa is valid here. 
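		 *
		 * The PAE root covers four 1 GiB quadrants, one per PDPTE,
		 * selected by bits 31:30 of the address.  For example, for
		 * addr == 0xc0001000, (addr >> 30) & 3 == 3, so the walk
		 * below starts from pae_root[3].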
		 */
		BUG_ON(root != vcpu->arch.mmu->root_hpa);

		iterator->shadow_addr
			= vcpu->arch.mmu->pae_root[(addr >> 30) & 3];
		iterator->shadow_addr &= PT64_BASE_ADDR_MASK;
		--iterator->level;
		if (!iterator->shadow_addr)
			iterator->level = 0;
	}
}

static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
			     struct kvm_vcpu *vcpu, u64 addr)
{
	shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu->root_hpa,
				    addr);
}

static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
{
	if (iterator->level < PG_LEVEL_4K)
		return false;

	iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
	iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
	return true;
}

static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator,
			       u64 spte)
{
	if (!is_shadow_present_pte(spte) || is_last_spte(spte, iterator->level)) {
		iterator->level = 0;
		return;
	}

	iterator->shadow_addr = spte & PT64_BASE_ADDR_MASK;
	--iterator->level;
}

static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
{
	__shadow_walk_next(iterator, *iterator->sptep);
}

static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
			     struct kvm_mmu_page *sp)
{
	u64 spte;

	BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);

	spte = make_nonleaf_spte(sp->spt, sp_ad_disabled(sp));

	mmu_spte_set(sptep, spte);

	mmu_page_add_parent_pte(vcpu, sp, sptep);

	if (sp->unsync_children || sp->unsync)
		mark_unsync(sptep);
}

static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
				 unsigned direct_access)
{
	if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) {
		struct kvm_mmu_page *child;

		/*
		 * For the direct sp, if the guest pte's dirty bit
		 * changed from clean to dirty, it will corrupt the
		 * sp's access: allow writable in the read-only sp,
		 * so we should update the spte at this point to get
		 * a new sp with the correct access.
		 */
		child = to_shadow_page(*sptep & PT64_BASE_ADDR_MASK);
		if (child->role.access == direct_access)
			return;

		drop_parent_pte(child, sptep);
		kvm_flush_remote_tlbs_with_address(vcpu->kvm, child->gfn, 1);
	}
}

/* Returns the number of zapped non-leaf child shadow pages. */
static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
			    u64 *spte, struct list_head *invalid_list)
{
	u64 pte;
	struct kvm_mmu_page *child;

	pte = *spte;
	if (is_shadow_present_pte(pte)) {
		if (is_last_spte(pte, sp->role.level)) {
			drop_spte(kvm, spte);
		} else {
			child = to_shadow_page(pte & PT64_BASE_ADDR_MASK);
			drop_parent_pte(child, spte);

			/*
			 * Recursively zap nested TDP SPs; parentless SPs are
			 * unlikely to be used again in the near future.  This
			 * avoids retaining a large number of stale nested SPs.
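			 *
			 * Roughly, the check below fires only when TDP is
			 * enabled (so the child shadows a nested guest's page
			 * tables), the caller supplied an invalid_list to
			 * batch the zap, the child is a guest_mode SP, and the
			 * SPTE just dropped was its last parent.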
2285 */ 2286 if (tdp_enabled && invalid_list && 2287 child->role.guest_mode && !child->parent_ptes.val) 2288 return kvm_mmu_prepare_zap_page(kvm, child, 2289 invalid_list); 2290 } 2291 } else if (is_mmio_spte(pte)) { 2292 mmu_spte_clear_no_track(spte); 2293 } 2294 return 0; 2295 } 2296 2297 static int kvm_mmu_page_unlink_children(struct kvm *kvm, 2298 struct kvm_mmu_page *sp, 2299 struct list_head *invalid_list) 2300 { 2301 int zapped = 0; 2302 unsigned i; 2303 2304 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) 2305 zapped += mmu_page_zap_pte(kvm, sp, sp->spt + i, invalid_list); 2306 2307 return zapped; 2308 } 2309 2310 static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) 2311 { 2312 u64 *sptep; 2313 struct rmap_iterator iter; 2314 2315 while ((sptep = rmap_get_first(&sp->parent_ptes, &iter))) 2316 drop_parent_pte(sp, sptep); 2317 } 2318 2319 static int mmu_zap_unsync_children(struct kvm *kvm, 2320 struct kvm_mmu_page *parent, 2321 struct list_head *invalid_list) 2322 { 2323 int i, zapped = 0; 2324 struct mmu_page_path parents; 2325 struct kvm_mmu_pages pages; 2326 2327 if (parent->role.level == PG_LEVEL_4K) 2328 return 0; 2329 2330 while (mmu_unsync_walk(parent, &pages)) { 2331 struct kvm_mmu_page *sp; 2332 2333 for_each_sp(pages, sp, parents, i) { 2334 kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); 2335 mmu_pages_clear_parents(&parents); 2336 zapped++; 2337 } 2338 } 2339 2340 return zapped; 2341 } 2342 2343 static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm, 2344 struct kvm_mmu_page *sp, 2345 struct list_head *invalid_list, 2346 int *nr_zapped) 2347 { 2348 bool list_unstable; 2349 2350 trace_kvm_mmu_prepare_zap_page(sp); 2351 ++kvm->stat.mmu_shadow_zapped; 2352 *nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list); 2353 *nr_zapped += kvm_mmu_page_unlink_children(kvm, sp, invalid_list); 2354 kvm_mmu_unlink_parents(kvm, sp); 2355 2356 /* Zapping children means active_mmu_pages has become unstable. */ 2357 list_unstable = *nr_zapped; 2358 2359 if (!sp->role.invalid && !sp->role.direct) 2360 unaccount_shadowed(kvm, sp); 2361 2362 if (sp->unsync) 2363 kvm_unlink_unsync_page(kvm, sp); 2364 if (!sp->root_count) { 2365 /* Count self */ 2366 (*nr_zapped)++; 2367 2368 /* 2369 * Already invalid pages (previously active roots) are not on 2370 * the active page list. See list_del() in the "else" case of 2371 * !sp->root_count. 2372 */ 2373 if (sp->role.invalid) 2374 list_add(&sp->link, invalid_list); 2375 else 2376 list_move(&sp->link, invalid_list); 2377 kvm_mod_used_mmu_pages(kvm, -1); 2378 } else { 2379 /* 2380 * Remove the active root from the active page list, the root 2381 * will be explicitly freed when the root_count hits zero. 2382 */ 2383 list_del(&sp->link); 2384 2385 /* 2386 * Obsolete pages cannot be used on any vCPUs, see the comment 2387 * in kvm_mmu_zap_all_fast(). Note, is_obsolete_sp() also 2388 * treats invalid shadow pages as being obsolete. 
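		 *
		 * Roughly, the root itself is freed later: the reload request
		 * makes every vCPU drop its reference, and once root_count
		 * hits zero the now-invalid page is zapped for real (see
		 * mmu_free_root_page()).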
2389 */ 2390 if (!is_obsolete_sp(kvm, sp)) 2391 kvm_reload_remote_mmus(kvm); 2392 } 2393 2394 if (sp->lpage_disallowed) 2395 unaccount_huge_nx_page(kvm, sp); 2396 2397 sp->role.invalid = 1; 2398 return list_unstable; 2399 } 2400 2401 static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, 2402 struct list_head *invalid_list) 2403 { 2404 int nr_zapped; 2405 2406 __kvm_mmu_prepare_zap_page(kvm, sp, invalid_list, &nr_zapped); 2407 return nr_zapped; 2408 } 2409 2410 static void kvm_mmu_commit_zap_page(struct kvm *kvm, 2411 struct list_head *invalid_list) 2412 { 2413 struct kvm_mmu_page *sp, *nsp; 2414 2415 if (list_empty(invalid_list)) 2416 return; 2417 2418 /* 2419 * We need to make sure everyone sees our modifications to 2420 * the page tables and see changes to vcpu->mode here. The barrier 2421 * in the kvm_flush_remote_tlbs() achieves this. This pairs 2422 * with vcpu_enter_guest and walk_shadow_page_lockless_begin/end. 2423 * 2424 * In addition, kvm_flush_remote_tlbs waits for all vcpus to exit 2425 * guest mode and/or lockless shadow page table walks. 2426 */ 2427 kvm_flush_remote_tlbs(kvm); 2428 2429 list_for_each_entry_safe(sp, nsp, invalid_list, link) { 2430 WARN_ON(!sp->role.invalid || sp->root_count); 2431 kvm_mmu_free_page(sp); 2432 } 2433 } 2434 2435 static unsigned long kvm_mmu_zap_oldest_mmu_pages(struct kvm *kvm, 2436 unsigned long nr_to_zap) 2437 { 2438 unsigned long total_zapped = 0; 2439 struct kvm_mmu_page *sp, *tmp; 2440 LIST_HEAD(invalid_list); 2441 bool unstable; 2442 int nr_zapped; 2443 2444 if (list_empty(&kvm->arch.active_mmu_pages)) 2445 return 0; 2446 2447 restart: 2448 list_for_each_entry_safe_reverse(sp, tmp, &kvm->arch.active_mmu_pages, link) { 2449 /* 2450 * Don't zap active root pages, the page itself can't be freed 2451 * and zapping it will just force vCPUs to realloc and reload. 2452 */ 2453 if (sp->root_count) 2454 continue; 2455 2456 unstable = __kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, 2457 &nr_zapped); 2458 total_zapped += nr_zapped; 2459 if (total_zapped >= nr_to_zap) 2460 break; 2461 2462 if (unstable) 2463 goto restart; 2464 } 2465 2466 kvm_mmu_commit_zap_page(kvm, &invalid_list); 2467 2468 kvm->stat.mmu_recycled += total_zapped; 2469 return total_zapped; 2470 } 2471 2472 static inline unsigned long kvm_mmu_available_pages(struct kvm *kvm) 2473 { 2474 if (kvm->arch.n_max_mmu_pages > kvm->arch.n_used_mmu_pages) 2475 return kvm->arch.n_max_mmu_pages - 2476 kvm->arch.n_used_mmu_pages; 2477 2478 return 0; 2479 } 2480 2481 static int make_mmu_pages_available(struct kvm_vcpu *vcpu) 2482 { 2483 unsigned long avail = kvm_mmu_available_pages(vcpu->kvm); 2484 2485 if (likely(avail >= KVM_MIN_FREE_MMU_PAGES)) 2486 return 0; 2487 2488 kvm_mmu_zap_oldest_mmu_pages(vcpu->kvm, KVM_REFILL_PAGES - avail); 2489 2490 /* 2491 * Note, this check is intentionally soft, it only guarantees that one 2492 * page is available, while the caller may end up allocating as many as 2493 * four pages, e.g. for PAE roots or for 5-level paging. Temporarily 2494 * exceeding the (arbitrary by default) limit will not harm the host, 2495 * being too aggressive may unnecessarily kill the guest, and getting an 2496 * exact count is far more trouble than it's worth, especially in the 2497 * page fault paths. 
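	 *
	 * As a rough worked example, assuming the usual definitions of
	 * KVM_MIN_FREE_MMU_PAGES (5) and KVM_REFILL_PAGES (25), which live
	 * elsewhere: with only 2 pages available, the call above tries to zap
	 * up to 23 of the oldest shadow pages, but the fault is failed with
	 * -ENOSPC below only if not even one page could be made available.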
	 */
	if (!kvm_mmu_available_pages(vcpu->kvm))
		return -ENOSPC;
	return 0;
}

/*
 * Change the number of MMU pages allocated to the VM.
 * Note: if goal_nr_mmu_pages is too small, you will get a deadlock.
 */
void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages)
{
	write_lock(&kvm->mmu_lock);

	if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
		kvm_mmu_zap_oldest_mmu_pages(kvm, kvm->arch.n_used_mmu_pages -
						  goal_nr_mmu_pages);

		goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
	}

	kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;

	write_unlock(&kvm->mmu_lock);
}

int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
{
	struct kvm_mmu_page *sp;
	LIST_HEAD(invalid_list);
	int r;

	pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
	r = 0;
	write_lock(&kvm->mmu_lock);
	for_each_gfn_indirect_valid_sp(kvm, sp, gfn) {
		pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
			 sp->role.word);
		r = 1;
		kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
	}
	kvm_mmu_commit_zap_page(kvm, &invalid_list);
	write_unlock(&kvm->mmu_lock);

	return r;
}

static int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
{
	gpa_t gpa;
	int r;

	if (vcpu->arch.mmu->direct_map)
		return 0;

	gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);

	r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);

	return r;
}

static void kvm_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	trace_kvm_mmu_unsync_page(sp);
	++kvm->stat.mmu_unsync;
	sp->unsync = 1;

	kvm_mmu_mark_parents_unsync(sp);
}

/*
 * Attempt to unsync any shadow pages that can be reached by the specified gfn;
 * KVM is creating a writable mapping for said gfn.  Returns 0 if all pages
 * were marked unsync (or if there is no shadow page), -EPERM if the SPTE must
 * be write-protected.
 */
int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot,
			    gfn_t gfn, bool can_unsync, bool prefetch)
{
	struct kvm_mmu_page *sp;
	bool locked = false;

	/*
	 * Force write-protection if the page is being tracked.  Note, the page
	 * track machinery is used to write-protect upper-level shadow pages,
	 * i.e. this guards the role.level == 4K assertion below!
	 */
	if (kvm_slot_page_track_is_active(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE))
		return -EPERM;

	/*
	 * The page is not write-tracked, mark existing shadow pages unsync
	 * unless KVM is synchronizing an unsync SP (can_unsync = false).  In
	 * that case, KVM must complete emulation of the guest TLB flush before
	 * allowing shadow pages to become unsync (writable by the guest).
	 */
	for_each_gfn_indirect_valid_sp(kvm, sp, gfn) {
		if (!can_unsync)
			return -EPERM;

		if (sp->unsync)
			continue;

		if (prefetch)
			return -EEXIST;

		/*
		 * TDP MMU page faults require an additional spinlock as they
		 * run with mmu_lock held for read, not write, and the unsync
		 * logic is not thread safe.  Take the spinlock regardless of
		 * the MMU type to avoid extra conditionals/parameters, there's
		 * no meaningful penalty if mmu_lock is held for write.
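		 *
		 * This is the usual check / lock / re-check pattern: the
		 * unlocked sp->unsync test above is only an optimization, the
		 * authoritative test is the READ_ONCE() performed after
		 * mmu_unsync_pages_lock is taken below.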
2611 */ 2612 if (!locked) { 2613 locked = true; 2614 spin_lock(&kvm->arch.mmu_unsync_pages_lock); 2615 2616 /* 2617 * Recheck after taking the spinlock, a different vCPU 2618 * may have since marked the page unsync. A false 2619 * positive on the unprotected check above is not 2620 * possible as clearing sp->unsync _must_ hold mmu_lock 2621 * for write, i.e. unsync cannot transition from 0->1 2622 * while this CPU holds mmu_lock for read (or write). 2623 */ 2624 if (READ_ONCE(sp->unsync)) 2625 continue; 2626 } 2627 2628 WARN_ON(sp->role.level != PG_LEVEL_4K); 2629 kvm_unsync_page(kvm, sp); 2630 } 2631 if (locked) 2632 spin_unlock(&kvm->arch.mmu_unsync_pages_lock); 2633 2634 /* 2635 * We need to ensure that the marking of unsync pages is visible 2636 * before the SPTE is updated to allow writes because 2637 * kvm_mmu_sync_roots() checks the unsync flags without holding 2638 * the MMU lock and so can race with this. If the SPTE was updated 2639 * before the page had been marked as unsync-ed, something like the 2640 * following could happen: 2641 * 2642 * CPU 1 CPU 2 2643 * --------------------------------------------------------------------- 2644 * 1.2 Host updates SPTE 2645 * to be writable 2646 * 2.1 Guest writes a GPTE for GVA X. 2647 * (GPTE being in the guest page table shadowed 2648 * by the SP from CPU 1.) 2649 * This reads SPTE during the page table walk. 2650 * Since SPTE.W is read as 1, there is no 2651 * fault. 2652 * 2653 * 2.2 Guest issues TLB flush. 2654 * That causes a VM Exit. 2655 * 2656 * 2.3 Walking of unsync pages sees sp->unsync is 2657 * false and skips the page. 2658 * 2659 * 2.4 Guest accesses GVA X. 2660 * Since the mapping in the SP was not updated, 2661 * so the old mapping for GVA X incorrectly 2662 * gets used. 2663 * 1.1 Host marks SP 2664 * as unsync 2665 * (sp->unsync = true) 2666 * 2667 * The write barrier below ensures that 1.1 happens before 1.2 and thus 2668 * the situation in 2.4 does not arise. It pairs with the read barrier 2669 * in is_unsync_root(), placed between 2.1's load of SPTE.W and 2.3. 2670 */ 2671 smp_wmb(); 2672 2673 return 0; 2674 } 2675 2676 static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, 2677 u64 *sptep, unsigned int pte_access, gfn_t gfn, 2678 kvm_pfn_t pfn, struct kvm_page_fault *fault) 2679 { 2680 struct kvm_mmu_page *sp = sptep_to_sp(sptep); 2681 int level = sp->role.level; 2682 int was_rmapped = 0; 2683 int ret = RET_PF_FIXED; 2684 bool flush = false; 2685 bool wrprot; 2686 u64 spte; 2687 2688 /* Prefetching always gets a writable pfn. */ 2689 bool host_writable = !fault || fault->map_writable; 2690 bool prefetch = !fault || fault->prefetch; 2691 bool write_fault = fault && fault->write; 2692 2693 pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__, 2694 *sptep, write_fault, gfn); 2695 2696 if (unlikely(is_noslot_pfn(pfn))) { 2697 mark_mmio_spte(vcpu, sptep, gfn, pte_access); 2698 return RET_PF_EMULATE; 2699 } 2700 2701 if (is_shadow_present_pte(*sptep)) { 2702 /* 2703 * If we overwrite a PTE page pointer with a 2MB PMD, unlink 2704 * the parent of the now unreachable PTE. 
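		 *
		 * In short, an already-present SPTE is handled as one of three
		 * cases below: a huge page replacing a page-table pointer
		 * (drop the child's parent link), a changed pfn (drop the old
		 * SPTE entirely), or the same pfn, in which case the SPTE was
		 * already rmapped and only needs to be updated.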
2705 */ 2706 if (level > PG_LEVEL_4K && !is_large_pte(*sptep)) { 2707 struct kvm_mmu_page *child; 2708 u64 pte = *sptep; 2709 2710 child = to_shadow_page(pte & PT64_BASE_ADDR_MASK); 2711 drop_parent_pte(child, sptep); 2712 flush = true; 2713 } else if (pfn != spte_to_pfn(*sptep)) { 2714 pgprintk("hfn old %llx new %llx\n", 2715 spte_to_pfn(*sptep), pfn); 2716 drop_spte(vcpu->kvm, sptep); 2717 flush = true; 2718 } else 2719 was_rmapped = 1; 2720 } 2721 2722 wrprot = make_spte(vcpu, sp, slot, pte_access, gfn, pfn, *sptep, prefetch, 2723 true, host_writable, &spte); 2724 2725 if (*sptep == spte) { 2726 ret = RET_PF_SPURIOUS; 2727 } else { 2728 trace_kvm_mmu_set_spte(level, gfn, sptep); 2729 flush |= mmu_spte_update(sptep, spte); 2730 } 2731 2732 if (wrprot) { 2733 if (write_fault) 2734 ret = RET_PF_EMULATE; 2735 } 2736 2737 if (flush) 2738 kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, 2739 KVM_PAGES_PER_HPAGE(level)); 2740 2741 pgprintk("%s: setting spte %llx\n", __func__, *sptep); 2742 2743 if (!was_rmapped) { 2744 WARN_ON_ONCE(ret == RET_PF_SPURIOUS); 2745 kvm_update_page_stats(vcpu->kvm, level, 1); 2746 rmap_add(vcpu, slot, sptep, gfn); 2747 } 2748 2749 return ret; 2750 } 2751 2752 static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, 2753 struct kvm_mmu_page *sp, 2754 u64 *start, u64 *end) 2755 { 2756 struct page *pages[PTE_PREFETCH_NUM]; 2757 struct kvm_memory_slot *slot; 2758 unsigned int access = sp->role.access; 2759 int i, ret; 2760 gfn_t gfn; 2761 2762 gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt); 2763 slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK); 2764 if (!slot) 2765 return -1; 2766 2767 ret = gfn_to_page_many_atomic(slot, gfn, pages, end - start); 2768 if (ret <= 0) 2769 return -1; 2770 2771 for (i = 0; i < ret; i++, gfn++, start++) { 2772 mmu_set_spte(vcpu, slot, start, access, gfn, 2773 page_to_pfn(pages[i]), NULL); 2774 put_page(pages[i]); 2775 } 2776 2777 return 0; 2778 } 2779 2780 static void __direct_pte_prefetch(struct kvm_vcpu *vcpu, 2781 struct kvm_mmu_page *sp, u64 *sptep) 2782 { 2783 u64 *spte, *start = NULL; 2784 int i; 2785 2786 WARN_ON(!sp->role.direct); 2787 2788 i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1); 2789 spte = sp->spt + i; 2790 2791 for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) { 2792 if (is_shadow_present_pte(*spte) || spte == sptep) { 2793 if (!start) 2794 continue; 2795 if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0) 2796 return; 2797 start = NULL; 2798 } else if (!start) 2799 start = spte; 2800 } 2801 if (start) 2802 direct_pte_prefetch_many(vcpu, sp, start, spte); 2803 } 2804 2805 static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) 2806 { 2807 struct kvm_mmu_page *sp; 2808 2809 sp = sptep_to_sp(sptep); 2810 2811 /* 2812 * Without accessed bits, there's no way to distinguish between 2813 * actually accessed translations and prefetched, so disable pte 2814 * prefetch if accessed bits aren't available. 2815 */ 2816 if (sp_ad_disabled(sp)) 2817 return; 2818 2819 if (sp->role.level > PG_LEVEL_4K) 2820 return; 2821 2822 /* 2823 * If addresses are being invalidated, skip prefetching to avoid 2824 * accidentally prefetching those addresses. 
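	 *
	 * The prefetch operates on an aligned window of PTE_PREFETCH_NUM
	 * sptes around the faulting one.  For example, with a window of 8, a
	 * fault on spte index 13 scans indices 8..15 and batch-maps each run
	 * of not-yet-present sptes via gfn_to_page_many_atomic().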
2825 */ 2826 if (unlikely(vcpu->kvm->mmu_notifier_count)) 2827 return; 2828 2829 __direct_pte_prefetch(vcpu, sp, sptep); 2830 } 2831 2832 static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, 2833 const struct kvm_memory_slot *slot) 2834 { 2835 unsigned long hva; 2836 pte_t *pte; 2837 int level; 2838 2839 if (!PageCompound(pfn_to_page(pfn)) && !kvm_is_zone_device_pfn(pfn)) 2840 return PG_LEVEL_4K; 2841 2842 /* 2843 * Note, using the already-retrieved memslot and __gfn_to_hva_memslot() 2844 * is not solely for performance, it's also necessary to avoid the 2845 * "writable" check in __gfn_to_hva_many(), which will always fail on 2846 * read-only memslots due to gfn_to_hva() assuming writes. Earlier 2847 * page fault steps have already verified the guest isn't writing a 2848 * read-only memslot. 2849 */ 2850 hva = __gfn_to_hva_memslot(slot, gfn); 2851 2852 pte = lookup_address_in_mm(kvm->mm, hva, &level); 2853 if (unlikely(!pte)) 2854 return PG_LEVEL_4K; 2855 2856 return level; 2857 } 2858 2859 int kvm_mmu_max_mapping_level(struct kvm *kvm, 2860 const struct kvm_memory_slot *slot, gfn_t gfn, 2861 kvm_pfn_t pfn, int max_level) 2862 { 2863 struct kvm_lpage_info *linfo; 2864 int host_level; 2865 2866 max_level = min(max_level, max_huge_page_level); 2867 for ( ; max_level > PG_LEVEL_4K; max_level--) { 2868 linfo = lpage_info_slot(gfn, slot, max_level); 2869 if (!linfo->disallow_lpage) 2870 break; 2871 } 2872 2873 if (max_level == PG_LEVEL_4K) 2874 return PG_LEVEL_4K; 2875 2876 host_level = host_pfn_mapping_level(kvm, gfn, pfn, slot); 2877 return min(host_level, max_level); 2878 } 2879 2880 void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) 2881 { 2882 struct kvm_memory_slot *slot = fault->slot; 2883 kvm_pfn_t mask; 2884 2885 fault->huge_page_disallowed = fault->exec && fault->nx_huge_page_workaround_enabled; 2886 2887 if (unlikely(fault->max_level == PG_LEVEL_4K)) 2888 return; 2889 2890 if (is_error_noslot_pfn(fault->pfn) || kvm_is_reserved_pfn(fault->pfn)) 2891 return; 2892 2893 if (kvm_slot_dirty_track_enabled(slot)) 2894 return; 2895 2896 /* 2897 * Enforce the iTLB multihit workaround after capturing the requested 2898 * level, which will be used to do precise, accurate accounting. 2899 */ 2900 fault->req_level = kvm_mmu_max_mapping_level(vcpu->kvm, slot, 2901 fault->gfn, fault->pfn, 2902 fault->max_level); 2903 if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed) 2904 return; 2905 2906 /* 2907 * mmu_notifier_retry() was successful and mmu_lock is held, so 2908 * the pmd can't be split from under us. 2909 */ 2910 fault->goal_level = fault->req_level; 2911 mask = KVM_PAGES_PER_HPAGE(fault->goal_level) - 1; 2912 VM_BUG_ON((fault->gfn & mask) != (fault->pfn & mask)); 2913 fault->pfn &= ~mask; 2914 } 2915 2916 void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level) 2917 { 2918 if (cur_level > PG_LEVEL_4K && 2919 cur_level == fault->goal_level && 2920 is_shadow_present_pte(spte) && 2921 !is_large_pte(spte)) { 2922 /* 2923 * A small SPTE exists for this pfn, but FNAME(fetch) 2924 * and __direct_map would like to create a large PTE 2925 * instead: just force them to go down another level, 2926 * patching back for them into pfn the next 9 bits of 2927 * the address. 
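		 *
		 * For example, when cur_level is the 2MB level, page_mask
		 * below is 512 - 1 = 511, i.e. the low 9 bits of the gfn, so
		 * the pfn is refined to the exact 4K page within the 2MB
		 * region and goal_level drops by one.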
2928 */ 2929 u64 page_mask = KVM_PAGES_PER_HPAGE(cur_level) - 2930 KVM_PAGES_PER_HPAGE(cur_level - 1); 2931 fault->pfn |= fault->gfn & page_mask; 2932 fault->goal_level--; 2933 } 2934 } 2935 2936 static int __direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) 2937 { 2938 struct kvm_shadow_walk_iterator it; 2939 struct kvm_mmu_page *sp; 2940 int ret; 2941 gfn_t base_gfn = fault->gfn; 2942 2943 kvm_mmu_hugepage_adjust(vcpu, fault); 2944 2945 trace_kvm_mmu_spte_requested(fault); 2946 for_each_shadow_entry(vcpu, fault->addr, it) { 2947 /* 2948 * We cannot overwrite existing page tables with an NX 2949 * large page, as the leaf could be executable. 2950 */ 2951 if (fault->nx_huge_page_workaround_enabled) 2952 disallowed_hugepage_adjust(fault, *it.sptep, it.level); 2953 2954 base_gfn = fault->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); 2955 if (it.level == fault->goal_level) 2956 break; 2957 2958 drop_large_spte(vcpu, it.sptep); 2959 if (is_shadow_present_pte(*it.sptep)) 2960 continue; 2961 2962 sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr, 2963 it.level - 1, true, ACC_ALL); 2964 2965 link_shadow_page(vcpu, it.sptep, sp); 2966 if (fault->is_tdp && fault->huge_page_disallowed && 2967 fault->req_level >= it.level) 2968 account_huge_nx_page(vcpu->kvm, sp); 2969 } 2970 2971 if (WARN_ON_ONCE(it.level != fault->goal_level)) 2972 return -EFAULT; 2973 2974 ret = mmu_set_spte(vcpu, fault->slot, it.sptep, ACC_ALL, 2975 base_gfn, fault->pfn, fault); 2976 if (ret == RET_PF_SPURIOUS) 2977 return ret; 2978 2979 direct_pte_prefetch(vcpu, it.sptep); 2980 ++vcpu->stat.pf_fixed; 2981 return ret; 2982 } 2983 2984 static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk) 2985 { 2986 send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, PAGE_SHIFT, tsk); 2987 } 2988 2989 static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn) 2990 { 2991 /* 2992 * Do not cache the mmio info caused by writing the readonly gfn 2993 * into the spte otherwise read access on readonly gfn also can 2994 * caused mmio page fault and treat it as mmio access. 2995 */ 2996 if (pfn == KVM_PFN_ERR_RO_FAULT) 2997 return RET_PF_EMULATE; 2998 2999 if (pfn == KVM_PFN_ERR_HWPOISON) { 3000 kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current); 3001 return RET_PF_RETRY; 3002 } 3003 3004 return -EFAULT; 3005 } 3006 3007 static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, 3008 unsigned int access, int *ret_val) 3009 { 3010 /* The pfn is invalid, report the error! */ 3011 if (unlikely(is_error_pfn(fault->pfn))) { 3012 *ret_val = kvm_handle_bad_page(vcpu, fault->gfn, fault->pfn); 3013 return true; 3014 } 3015 3016 if (unlikely(!fault->slot)) { 3017 gva_t gva = fault->is_tdp ? 0 : fault->addr; 3018 3019 vcpu_cache_mmio_info(vcpu, gva, fault->gfn, 3020 access & shadow_mmio_access_mask); 3021 /* 3022 * If MMIO caching is disabled, emulate immediately without 3023 * touching the shadow page tables as attempting to install an 3024 * MMIO SPTE will just be an expensive nop. 3025 */ 3026 if (unlikely(!shadow_mmio_value)) { 3027 *ret_val = RET_PF_EMULATE; 3028 return true; 3029 } 3030 } 3031 3032 return false; 3033 } 3034 3035 static bool page_fault_can_be_fast(struct kvm_page_fault *fault) 3036 { 3037 /* 3038 * Do not fix the mmio spte with invalid generation number which 3039 * need to be updated by slow page fault path. 
	 */
	if (fault->rsvd)
		return false;

	/* See if the page fault is due to an NX violation */
	if (unlikely(fault->exec && fault->present))
		return false;

	/*
	 * #PF can be fast if:
	 * 1. The shadow page table entry is not present, which could mean that
	 *    the fault is potentially caused by access tracking (if enabled).
	 * 2. The shadow page table entry is present and the fault is caused by
	 *    write-protect, which means we just need to change the W bit of
	 *    the spte, and that can be done out of mmu-lock.
	 *
	 * However, if access tracking is disabled we know that a non-present
	 * page must be a genuine page fault where we have to create a new SPTE.
	 * So, if access tracking is disabled, we return true only for write
	 * accesses to a present page.
	 */

	return shadow_acc_track_mask != 0 || (fault->write && fault->present);
}

/*
 * Returns true if the SPTE was fixed successfully. Otherwise,
 * someone else modified the SPTE from its original value.
 */
static bool
fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
			u64 *sptep, u64 old_spte, u64 new_spte)
{
	/*
	 * Theoretically we could also set dirty bit (and flush TLB) here in
	 * order to eliminate unnecessary PML logging. See comments in
	 * set_spte. But fast_page_fault is very unlikely to happen with PML
	 * enabled, so we do not do this. This might result in the same GPA
	 * to be logged in PML buffer again when the write really happens, and
	 * eventually to be called by mark_page_dirty twice. But it's also no
	 * harm. This also avoids the TLB flush needed after setting dirty bit
	 * so non-PML cases won't be impacted.
	 *
	 * Compare with set_spte where instead shadow_dirty_mask is set.
	 */
	if (cmpxchg64(sptep, old_spte, new_spte) != old_spte)
		return false;

	if (is_writable_pte(new_spte) && !is_writable_pte(old_spte))
		mark_page_dirty_in_slot(vcpu->kvm, fault->slot, fault->gfn);

	return true;
}

static bool is_access_allowed(struct kvm_page_fault *fault, u64 spte)
{
	if (fault->exec)
		return is_executable_pte(spte);

	if (fault->write)
		return is_writable_pte(spte);

	/* Fault was on Read access */
	return spte & PT_PRESENT_MASK;
}

/*
 * Returns the last level spte pointer of the shadow page walk for the given
 * gpa, and sets *spte to the spte value. This spte may be non-present. If no
 * walk could be performed, returns NULL and *spte does not contain valid data.
 *
 * Contract:
 *  - Must be called between walk_shadow_page_lockless_{begin,end}.
 *  - The returned sptep must not be used after walk_shadow_page_lockless_end.
 */
static u64 *fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gpa_t gpa, u64 *spte)
{
	struct kvm_shadow_walk_iterator iterator;
	u64 old_spte;
	u64 *sptep = NULL;

	for_each_shadow_entry_lockless(vcpu, gpa, iterator, old_spte) {
		sptep = iterator.sptep;
		*spte = old_spte;
	}

	return sptep;
}

/*
 * Returns one of RET_PF_INVALID, RET_PF_FIXED or RET_PF_SPURIOUS.
3131 */ 3132 static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) 3133 { 3134 struct kvm_mmu_page *sp; 3135 int ret = RET_PF_INVALID; 3136 u64 spte = 0ull; 3137 u64 *sptep = NULL; 3138 uint retry_count = 0; 3139 3140 if (!page_fault_can_be_fast(fault)) 3141 return ret; 3142 3143 walk_shadow_page_lockless_begin(vcpu); 3144 3145 do { 3146 u64 new_spte; 3147 3148 if (is_tdp_mmu(vcpu->arch.mmu)) 3149 sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, fault->addr, &spte); 3150 else 3151 sptep = fast_pf_get_last_sptep(vcpu, fault->addr, &spte); 3152 3153 if (!is_shadow_present_pte(spte)) 3154 break; 3155 3156 sp = sptep_to_sp(sptep); 3157 if (!is_last_spte(spte, sp->role.level)) 3158 break; 3159 3160 /* 3161 * Check whether the memory access that caused the fault would 3162 * still cause it if it were to be performed right now. If not, 3163 * then this is a spurious fault caused by TLB lazily flushed, 3164 * or some other CPU has already fixed the PTE after the 3165 * current CPU took the fault. 3166 * 3167 * Need not check the access of upper level table entries since 3168 * they are always ACC_ALL. 3169 */ 3170 if (is_access_allowed(fault, spte)) { 3171 ret = RET_PF_SPURIOUS; 3172 break; 3173 } 3174 3175 new_spte = spte; 3176 3177 if (is_access_track_spte(spte)) 3178 new_spte = restore_acc_track_spte(new_spte); 3179 3180 /* 3181 * Currently, to simplify the code, write-protection can 3182 * be removed in the fast path only if the SPTE was 3183 * write-protected for dirty-logging or access tracking. 3184 */ 3185 if (fault->write && 3186 spte_can_locklessly_be_made_writable(spte)) { 3187 new_spte |= PT_WRITABLE_MASK; 3188 3189 /* 3190 * Do not fix write-permission on the large spte when 3191 * dirty logging is enabled. Since we only dirty the 3192 * first page into the dirty-bitmap in 3193 * fast_pf_fix_direct_spte(), other pages are missed 3194 * if its slot has dirty logging enabled. 3195 * 3196 * Instead, we let the slow page fault path create a 3197 * normal spte to fix the access. 3198 */ 3199 if (sp->role.level > PG_LEVEL_4K && 3200 kvm_slot_dirty_track_enabled(fault->slot)) 3201 break; 3202 } 3203 3204 /* Verify that the fault can be handled in the fast path */ 3205 if (new_spte == spte || 3206 !is_access_allowed(fault, new_spte)) 3207 break; 3208 3209 /* 3210 * Currently, fast page fault only works for direct mapping 3211 * since the gfn is not stable for indirect shadow page. See 3212 * Documentation/virt/kvm/locking.rst to get more detail. 
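		 *
		 * Roughly, the whole fast path reads the leaf SPTE locklessly,
		 * computes new_spte outside of mmu_lock, and publishes it with
		 * the single cmpxchg64() in fast_pf_fix_direct_spte(); on
		 * contention the surrounding loop retries a few times before
		 * giving up with a one-time warning.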
3213 */ 3214 if (fast_pf_fix_direct_spte(vcpu, fault, sptep, spte, new_spte)) { 3215 ret = RET_PF_FIXED; 3216 break; 3217 } 3218 3219 if (++retry_count > 4) { 3220 printk_once(KERN_WARNING 3221 "kvm: Fast #PF retrying more than 4 times.\n"); 3222 break; 3223 } 3224 3225 } while (true); 3226 3227 trace_fast_page_fault(vcpu, fault, sptep, spte, ret); 3228 walk_shadow_page_lockless_end(vcpu); 3229 3230 return ret; 3231 } 3232 3233 static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, 3234 struct list_head *invalid_list) 3235 { 3236 struct kvm_mmu_page *sp; 3237 3238 if (!VALID_PAGE(*root_hpa)) 3239 return; 3240 3241 sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK); 3242 3243 if (is_tdp_mmu_page(sp)) 3244 kvm_tdp_mmu_put_root(kvm, sp, false); 3245 else if (!--sp->root_count && sp->role.invalid) 3246 kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); 3247 3248 *root_hpa = INVALID_PAGE; 3249 } 3250 3251 /* roots_to_free must be some combination of the KVM_MMU_ROOT_* flags */ 3252 void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, 3253 ulong roots_to_free) 3254 { 3255 struct kvm *kvm = vcpu->kvm; 3256 int i; 3257 LIST_HEAD(invalid_list); 3258 bool free_active_root = roots_to_free & KVM_MMU_ROOT_CURRENT; 3259 3260 BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS >= BITS_PER_LONG); 3261 3262 /* Before acquiring the MMU lock, see if we need to do any real work. */ 3263 if (!(free_active_root && VALID_PAGE(mmu->root_hpa))) { 3264 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) 3265 if ((roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) && 3266 VALID_PAGE(mmu->prev_roots[i].hpa)) 3267 break; 3268 3269 if (i == KVM_MMU_NUM_PREV_ROOTS) 3270 return; 3271 } 3272 3273 write_lock(&kvm->mmu_lock); 3274 3275 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) 3276 if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) 3277 mmu_free_root_page(kvm, &mmu->prev_roots[i].hpa, 3278 &invalid_list); 3279 3280 if (free_active_root) { 3281 if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL && 3282 (mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) { 3283 mmu_free_root_page(kvm, &mmu->root_hpa, &invalid_list); 3284 } else if (mmu->pae_root) { 3285 for (i = 0; i < 4; ++i) { 3286 if (!IS_VALID_PAE_ROOT(mmu->pae_root[i])) 3287 continue; 3288 3289 mmu_free_root_page(kvm, &mmu->pae_root[i], 3290 &invalid_list); 3291 mmu->pae_root[i] = INVALID_PAE_ROOT; 3292 } 3293 } 3294 mmu->root_hpa = INVALID_PAGE; 3295 mmu->root_pgd = 0; 3296 } 3297 3298 kvm_mmu_commit_zap_page(kvm, &invalid_list); 3299 write_unlock(&kvm->mmu_lock); 3300 } 3301 EXPORT_SYMBOL_GPL(kvm_mmu_free_roots); 3302 3303 void kvm_mmu_free_guest_mode_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) 3304 { 3305 unsigned long roots_to_free = 0; 3306 hpa_t root_hpa; 3307 int i; 3308 3309 /* 3310 * This should not be called while L2 is active, L2 can't invalidate 3311 * _only_ its own roots, e.g. INVVPID unconditionally exits. 
3312 */ 3313 WARN_ON_ONCE(mmu->mmu_role.base.guest_mode); 3314 3315 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { 3316 root_hpa = mmu->prev_roots[i].hpa; 3317 if (!VALID_PAGE(root_hpa)) 3318 continue; 3319 3320 if (!to_shadow_page(root_hpa) || 3321 to_shadow_page(root_hpa)->role.guest_mode) 3322 roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); 3323 } 3324 3325 kvm_mmu_free_roots(vcpu, mmu, roots_to_free); 3326 } 3327 EXPORT_SYMBOL_GPL(kvm_mmu_free_guest_mode_roots); 3328 3329 3330 static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn) 3331 { 3332 int ret = 0; 3333 3334 if (!kvm_vcpu_is_visible_gfn(vcpu, root_gfn)) { 3335 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 3336 ret = 1; 3337 } 3338 3339 return ret; 3340 } 3341 3342 static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva, 3343 u8 level, bool direct) 3344 { 3345 struct kvm_mmu_page *sp; 3346 3347 sp = kvm_mmu_get_page(vcpu, gfn, gva, level, direct, ACC_ALL); 3348 ++sp->root_count; 3349 3350 return __pa(sp->spt); 3351 } 3352 3353 static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) 3354 { 3355 struct kvm_mmu *mmu = vcpu->arch.mmu; 3356 u8 shadow_root_level = mmu->shadow_root_level; 3357 hpa_t root; 3358 unsigned i; 3359 int r; 3360 3361 write_lock(&vcpu->kvm->mmu_lock); 3362 r = make_mmu_pages_available(vcpu); 3363 if (r < 0) 3364 goto out_unlock; 3365 3366 if (is_tdp_mmu_enabled(vcpu->kvm)) { 3367 root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu); 3368 mmu->root_hpa = root; 3369 } else if (shadow_root_level >= PT64_ROOT_4LEVEL) { 3370 root = mmu_alloc_root(vcpu, 0, 0, shadow_root_level, true); 3371 mmu->root_hpa = root; 3372 } else if (shadow_root_level == PT32E_ROOT_LEVEL) { 3373 if (WARN_ON_ONCE(!mmu->pae_root)) { 3374 r = -EIO; 3375 goto out_unlock; 3376 } 3377 3378 for (i = 0; i < 4; ++i) { 3379 WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i])); 3380 3381 root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT), 3382 i << 30, PT32_ROOT_LEVEL, true); 3383 mmu->pae_root[i] = root | PT_PRESENT_MASK | 3384 shadow_me_mask; 3385 } 3386 mmu->root_hpa = __pa(mmu->pae_root); 3387 } else { 3388 WARN_ONCE(1, "Bad TDP root level = %d\n", shadow_root_level); 3389 r = -EIO; 3390 goto out_unlock; 3391 } 3392 3393 /* root_pgd is ignored for direct MMUs. */ 3394 mmu->root_pgd = 0; 3395 out_unlock: 3396 write_unlock(&vcpu->kvm->mmu_lock); 3397 return r; 3398 } 3399 3400 static int mmu_first_shadow_root_alloc(struct kvm *kvm) 3401 { 3402 struct kvm_memslots *slots; 3403 struct kvm_memory_slot *slot; 3404 int r = 0, i, bkt; 3405 3406 /* 3407 * Check if this is the first shadow root being allocated before 3408 * taking the lock. 3409 */ 3410 if (kvm_shadow_root_allocated(kvm)) 3411 return 0; 3412 3413 mutex_lock(&kvm->slots_arch_lock); 3414 3415 /* Recheck, under the lock, whether this is the first shadow root. */ 3416 if (kvm_shadow_root_allocated(kvm)) 3417 goto out_unlock; 3418 3419 /* 3420 * Check if anything actually needs to be allocated, e.g. all metadata 3421 * will be allocated upfront if TDP is disabled. 3422 */ 3423 if (kvm_memslots_have_rmaps(kvm) && 3424 kvm_page_track_write_tracking_enabled(kvm)) 3425 goto out_success; 3426 3427 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { 3428 slots = __kvm_memslots(kvm, i); 3429 kvm_for_each_memslot(slot, bkt, slots) { 3430 /* 3431 * Both of these functions are no-ops if the target is 3432 * already allocated, so unconditionally calling both 3433 * is safe. 
Intentionally do NOT free allocations on 3434 * failure to avoid having to track which allocations 3435 * were made now versus when the memslot was created. 3436 * The metadata is guaranteed to be freed when the slot 3437 * is freed, and will be kept/used if userspace retries 3438 * KVM_RUN instead of killing the VM. 3439 */ 3440 r = memslot_rmap_alloc(slot, slot->npages); 3441 if (r) 3442 goto out_unlock; 3443 r = kvm_page_track_write_tracking_alloc(slot); 3444 if (r) 3445 goto out_unlock; 3446 } 3447 } 3448 3449 /* 3450 * Ensure that shadow_root_allocated becomes true strictly after 3451 * all the related pointers are set. 3452 */ 3453 out_success: 3454 smp_store_release(&kvm->arch.shadow_root_allocated, true); 3455 3456 out_unlock: 3457 mutex_unlock(&kvm->slots_arch_lock); 3458 return r; 3459 } 3460 3461 static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) 3462 { 3463 struct kvm_mmu *mmu = vcpu->arch.mmu; 3464 u64 pdptrs[4], pm_mask; 3465 gfn_t root_gfn, root_pgd; 3466 hpa_t root; 3467 unsigned i; 3468 int r; 3469 3470 root_pgd = mmu->get_guest_pgd(vcpu); 3471 root_gfn = root_pgd >> PAGE_SHIFT; 3472 3473 if (mmu_check_root(vcpu, root_gfn)) 3474 return 1; 3475 3476 /* 3477 * On SVM, reading PDPTRs might access guest memory, which might fault 3478 * and thus might sleep. Grab the PDPTRs before acquiring mmu_lock. 3479 */ 3480 if (mmu->root_level == PT32E_ROOT_LEVEL) { 3481 for (i = 0; i < 4; ++i) { 3482 pdptrs[i] = mmu->get_pdptr(vcpu, i); 3483 if (!(pdptrs[i] & PT_PRESENT_MASK)) 3484 continue; 3485 3486 if (mmu_check_root(vcpu, pdptrs[i] >> PAGE_SHIFT)) 3487 return 1; 3488 } 3489 } 3490 3491 r = mmu_first_shadow_root_alloc(vcpu->kvm); 3492 if (r) 3493 return r; 3494 3495 write_lock(&vcpu->kvm->mmu_lock); 3496 r = make_mmu_pages_available(vcpu); 3497 if (r < 0) 3498 goto out_unlock; 3499 3500 /* 3501 * Do we shadow a long mode page table? If so we need to 3502 * write-protect the guests page table root. 3503 */ 3504 if (mmu->root_level >= PT64_ROOT_4LEVEL) { 3505 root = mmu_alloc_root(vcpu, root_gfn, 0, 3506 mmu->shadow_root_level, false); 3507 mmu->root_hpa = root; 3508 goto set_root_pgd; 3509 } 3510 3511 if (WARN_ON_ONCE(!mmu->pae_root)) { 3512 r = -EIO; 3513 goto out_unlock; 3514 } 3515 3516 /* 3517 * We shadow a 32 bit page table. This may be a legacy 2-level 3518 * or a PAE 3-level page table. In either case we need to be aware that 3519 * the shadow page table may be a PAE or a long mode page table. 
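	 *
	 * Roughly, the resulting chain is: root_hpa -> pml5_root (5-level
	 * hosts only) -> pml4_root -> pae_root -> the four per-PDPTE shadow
	 * roots allocated below, with pm_mask providing the present/writable/
	 * user/accessed bits for these synthetic upper levels.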
3520 */ 3521 pm_mask = PT_PRESENT_MASK | shadow_me_mask; 3522 if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL) { 3523 pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK; 3524 3525 if (WARN_ON_ONCE(!mmu->pml4_root)) { 3526 r = -EIO; 3527 goto out_unlock; 3528 } 3529 mmu->pml4_root[0] = __pa(mmu->pae_root) | pm_mask; 3530 3531 if (mmu->shadow_root_level == PT64_ROOT_5LEVEL) { 3532 if (WARN_ON_ONCE(!mmu->pml5_root)) { 3533 r = -EIO; 3534 goto out_unlock; 3535 } 3536 mmu->pml5_root[0] = __pa(mmu->pml4_root) | pm_mask; 3537 } 3538 } 3539 3540 for (i = 0; i < 4; ++i) { 3541 WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i])); 3542 3543 if (mmu->root_level == PT32E_ROOT_LEVEL) { 3544 if (!(pdptrs[i] & PT_PRESENT_MASK)) { 3545 mmu->pae_root[i] = INVALID_PAE_ROOT; 3546 continue; 3547 } 3548 root_gfn = pdptrs[i] >> PAGE_SHIFT; 3549 } 3550 3551 root = mmu_alloc_root(vcpu, root_gfn, i << 30, 3552 PT32_ROOT_LEVEL, false); 3553 mmu->pae_root[i] = root | pm_mask; 3554 } 3555 3556 if (mmu->shadow_root_level == PT64_ROOT_5LEVEL) 3557 mmu->root_hpa = __pa(mmu->pml5_root); 3558 else if (mmu->shadow_root_level == PT64_ROOT_4LEVEL) 3559 mmu->root_hpa = __pa(mmu->pml4_root); 3560 else 3561 mmu->root_hpa = __pa(mmu->pae_root); 3562 3563 set_root_pgd: 3564 mmu->root_pgd = root_pgd; 3565 out_unlock: 3566 write_unlock(&vcpu->kvm->mmu_lock); 3567 3568 return 0; 3569 } 3570 3571 static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu) 3572 { 3573 struct kvm_mmu *mmu = vcpu->arch.mmu; 3574 bool need_pml5 = mmu->shadow_root_level > PT64_ROOT_4LEVEL; 3575 u64 *pml5_root = NULL; 3576 u64 *pml4_root = NULL; 3577 u64 *pae_root; 3578 3579 /* 3580 * When shadowing 32-bit or PAE NPT with 64-bit NPT, the PML4 and PDP 3581 * tables are allocated and initialized at root creation as there is no 3582 * equivalent level in the guest's NPT to shadow. Allocate the tables 3583 * on demand, as running a 32-bit L1 VMM on 64-bit KVM is very rare. 3584 */ 3585 if (mmu->direct_map || mmu->root_level >= PT64_ROOT_4LEVEL || 3586 mmu->shadow_root_level < PT64_ROOT_4LEVEL) 3587 return 0; 3588 3589 /* 3590 * NPT, the only paging mode that uses this horror, uses a fixed number 3591 * of levels for the shadow page tables, e.g. all MMUs are 4-level or 3592 * all MMus are 5-level. Thus, this can safely require that pml5_root 3593 * is allocated if the other roots are valid and pml5 is needed, as any 3594 * prior MMU would also have required pml5. 3595 */ 3596 if (mmu->pae_root && mmu->pml4_root && (!need_pml5 || mmu->pml5_root)) 3597 return 0; 3598 3599 /* 3600 * The special roots should always be allocated in concert. Yell and 3601 * bail if KVM ends up in a state where only one of the roots is valid. 3602 */ 3603 if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->pml4_root || 3604 (need_pml5 && mmu->pml5_root))) 3605 return -EIO; 3606 3607 /* 3608 * Unlike 32-bit NPT, the PDP table doesn't need to be in low mem, and 3609 * doesn't need to be decrypted. 
3610 */ 3611 pae_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); 3612 if (!pae_root) 3613 return -ENOMEM; 3614 3615 #ifdef CONFIG_X86_64 3616 pml4_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); 3617 if (!pml4_root) 3618 goto err_pml4; 3619 3620 if (need_pml5) { 3621 pml5_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); 3622 if (!pml5_root) 3623 goto err_pml5; 3624 } 3625 #endif 3626 3627 mmu->pae_root = pae_root; 3628 mmu->pml4_root = pml4_root; 3629 mmu->pml5_root = pml5_root; 3630 3631 return 0; 3632 3633 #ifdef CONFIG_X86_64 3634 err_pml5: 3635 free_page((unsigned long)pml4_root); 3636 err_pml4: 3637 free_page((unsigned long)pae_root); 3638 return -ENOMEM; 3639 #endif 3640 } 3641 3642 static bool is_unsync_root(hpa_t root) 3643 { 3644 struct kvm_mmu_page *sp; 3645 3646 if (!VALID_PAGE(root)) 3647 return false; 3648 3649 /* 3650 * The read barrier orders the CPU's read of SPTE.W during the page table 3651 * walk before the reads of sp->unsync/sp->unsync_children here. 3652 * 3653 * Even if another CPU was marking the SP as unsync-ed simultaneously, 3654 * any guest page table changes are not guaranteed to be visible anyway 3655 * until this VCPU issues a TLB flush strictly after those changes are 3656 * made. We only need to ensure that the other CPU sets these flags 3657 * before any actual changes to the page tables are made. The comments 3658 * in mmu_try_to_unsync_pages() describe what could go wrong if this 3659 * requirement isn't satisfied. 3660 */ 3661 smp_rmb(); 3662 sp = to_shadow_page(root); 3663 if (sp->unsync || sp->unsync_children) 3664 return true; 3665 3666 return false; 3667 } 3668 3669 void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) 3670 { 3671 int i; 3672 struct kvm_mmu_page *sp; 3673 3674 if (vcpu->arch.mmu->direct_map) 3675 return; 3676 3677 if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) 3678 return; 3679 3680 vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY); 3681 3682 if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) { 3683 hpa_t root = vcpu->arch.mmu->root_hpa; 3684 sp = to_shadow_page(root); 3685 3686 if (!is_unsync_root(root)) 3687 return; 3688 3689 write_lock(&vcpu->kvm->mmu_lock); 3690 kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); 3691 3692 mmu_sync_children(vcpu, sp, true); 3693 3694 kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); 3695 write_unlock(&vcpu->kvm->mmu_lock); 3696 return; 3697 } 3698 3699 write_lock(&vcpu->kvm->mmu_lock); 3700 kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); 3701 3702 for (i = 0; i < 4; ++i) { 3703 hpa_t root = vcpu->arch.mmu->pae_root[i]; 3704 3705 if (IS_VALID_PAE_ROOT(root)) { 3706 root &= PT64_BASE_ADDR_MASK; 3707 sp = to_shadow_page(root); 3708 mmu_sync_children(vcpu, sp, true); 3709 } 3710 } 3711 3712 kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); 3713 write_unlock(&vcpu->kvm->mmu_lock); 3714 } 3715 3716 void kvm_mmu_sync_prev_roots(struct kvm_vcpu *vcpu) 3717 { 3718 unsigned long roots_to_free = 0; 3719 int i; 3720 3721 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) 3722 if (is_unsync_root(vcpu->arch.mmu->prev_roots[i].hpa)) 3723 roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); 3724 3725 /* sync prev_roots by simply freeing them */ 3726 kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free); 3727 } 3728 3729 static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, 3730 gpa_t vaddr, u32 access, 3731 struct x86_exception *exception) 3732 { 3733 if (exception) 3734 exception->error_code = 0; 3735 return kvm_translate_gpa(vcpu, mmu, vaddr, access, exception); 3736 } 3737 3738 static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct) 
3739 { 3740 /* 3741 * A nested guest cannot use the MMIO cache if it is using nested 3742 * page tables, because cr2 is a nGPA while the cache stores GPAs. 3743 */ 3744 if (mmu_is_nested(vcpu)) 3745 return false; 3746 3747 if (direct) 3748 return vcpu_match_mmio_gpa(vcpu, addr); 3749 3750 return vcpu_match_mmio_gva(vcpu, addr); 3751 } 3752 3753 /* 3754 * Return the level of the lowest level SPTE added to sptes. 3755 * That SPTE may be non-present. 3756 * 3757 * Must be called between walk_shadow_page_lockless_{begin,end}. 3758 */ 3759 static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level) 3760 { 3761 struct kvm_shadow_walk_iterator iterator; 3762 int leaf = -1; 3763 u64 spte; 3764 3765 for (shadow_walk_init(&iterator, vcpu, addr), 3766 *root_level = iterator.level; 3767 shadow_walk_okay(&iterator); 3768 __shadow_walk_next(&iterator, spte)) { 3769 leaf = iterator.level; 3770 spte = mmu_spte_get_lockless(iterator.sptep); 3771 3772 sptes[leaf] = spte; 3773 } 3774 3775 return leaf; 3776 } 3777 3778 /* return true if reserved bit(s) are detected on a valid, non-MMIO SPTE. */ 3779 static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) 3780 { 3781 u64 sptes[PT64_ROOT_MAX_LEVEL + 1]; 3782 struct rsvd_bits_validate *rsvd_check; 3783 int root, leaf, level; 3784 bool reserved = false; 3785 3786 walk_shadow_page_lockless_begin(vcpu); 3787 3788 if (is_tdp_mmu(vcpu->arch.mmu)) 3789 leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root); 3790 else 3791 leaf = get_walk(vcpu, addr, sptes, &root); 3792 3793 walk_shadow_page_lockless_end(vcpu); 3794 3795 if (unlikely(leaf < 0)) { 3796 *sptep = 0ull; 3797 return reserved; 3798 } 3799 3800 *sptep = sptes[leaf]; 3801 3802 /* 3803 * Skip reserved bits checks on the terminal leaf if it's not a valid 3804 * SPTE. Note, this also (intentionally) skips MMIO SPTEs, which, by 3805 * design, always have reserved bits set. The purpose of the checks is 3806 * to detect reserved bits on non-MMIO SPTEs. i.e. buggy SPTEs. 3807 */ 3808 if (!is_shadow_present_pte(sptes[leaf])) 3809 leaf++; 3810 3811 rsvd_check = &vcpu->arch.mmu->shadow_zero_check; 3812 3813 for (level = root; level >= leaf; level--) 3814 reserved |= is_rsvd_spte(rsvd_check, sptes[level], level); 3815 3816 if (reserved) { 3817 pr_err("%s: reserved bits set on MMU-present spte, addr 0x%llx, hierarchy:\n", 3818 __func__, addr); 3819 for (level = root; level >= leaf; level--) 3820 pr_err("------ spte = 0x%llx level = %d, rsvd bits = 0x%llx", 3821 sptes[level], level, 3822 get_rsvd_bits(rsvd_check, sptes[level], level)); 3823 } 3824 3825 return reserved; 3826 } 3827 3828 static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct) 3829 { 3830 u64 spte; 3831 bool reserved; 3832 3833 if (mmio_info_in_cache(vcpu, addr, direct)) 3834 return RET_PF_EMULATE; 3835 3836 reserved = get_mmio_spte(vcpu, addr, &spte); 3837 if (WARN_ON(reserved)) 3838 return -EINVAL; 3839 3840 if (is_mmio_spte(spte)) { 3841 gfn_t gfn = get_mmio_spte_gfn(spte); 3842 unsigned int access = get_mmio_spte_access(spte); 3843 3844 if (!check_mmio_spte(vcpu, spte)) 3845 return RET_PF_INVALID; 3846 3847 if (direct) 3848 addr = 0; 3849 3850 trace_handle_mmio_page_fault(addr, gfn, access); 3851 vcpu_cache_mmio_info(vcpu, addr, gfn, access); 3852 return RET_PF_EMULATE; 3853 } 3854 3855 /* 3856 * If the page table is zapped by other cpus, let CPU fault again on 3857 * the address. 
3858 */ 3859 return RET_PF_RETRY; 3860 } 3861 3862 static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu, 3863 struct kvm_page_fault *fault) 3864 { 3865 if (unlikely(fault->rsvd)) 3866 return false; 3867 3868 if (!fault->present || !fault->write) 3869 return false; 3870 3871 /* 3872 * guest is writing the page which is write tracked which can 3873 * not be fixed by page fault handler. 3874 */ 3875 if (kvm_slot_page_track_is_active(vcpu->kvm, fault->slot, fault->gfn, KVM_PAGE_TRACK_WRITE)) 3876 return true; 3877 3878 return false; 3879 } 3880 3881 static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr) 3882 { 3883 struct kvm_shadow_walk_iterator iterator; 3884 u64 spte; 3885 3886 walk_shadow_page_lockless_begin(vcpu); 3887 for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) 3888 clear_sp_write_flooding_count(iterator.sptep); 3889 walk_shadow_page_lockless_end(vcpu); 3890 } 3891 3892 static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, 3893 gfn_t gfn) 3894 { 3895 struct kvm_arch_async_pf arch; 3896 3897 arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id; 3898 arch.gfn = gfn; 3899 arch.direct_map = vcpu->arch.mmu->direct_map; 3900 arch.cr3 = vcpu->arch.mmu->get_guest_pgd(vcpu); 3901 3902 return kvm_setup_async_pf(vcpu, cr2_or_gpa, 3903 kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch); 3904 } 3905 3906 static bool kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, int *r) 3907 { 3908 struct kvm_memory_slot *slot = fault->slot; 3909 bool async; 3910 3911 /* 3912 * Retry the page fault if the gfn hit a memslot that is being deleted 3913 * or moved. This ensures any existing SPTEs for the old memslot will 3914 * be zapped before KVM inserts a new MMIO SPTE for the gfn. 3915 */ 3916 if (slot && (slot->flags & KVM_MEMSLOT_INVALID)) 3917 goto out_retry; 3918 3919 if (!kvm_is_visible_memslot(slot)) { 3920 /* Don't expose private memslots to L2. */ 3921 if (is_guest_mode(vcpu)) { 3922 fault->slot = NULL; 3923 fault->pfn = KVM_PFN_NOSLOT; 3924 fault->map_writable = false; 3925 return false; 3926 } 3927 /* 3928 * If the APIC access page exists but is disabled, go directly 3929 * to emulation without caching the MMIO access or creating a 3930 * MMIO SPTE. That way the cache doesn't need to be purged 3931 * when the AVIC is re-enabled. 3932 */ 3933 if (slot && slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT && 3934 !kvm_apicv_activated(vcpu->kvm)) { 3935 *r = RET_PF_EMULATE; 3936 return true; 3937 } 3938 } 3939 3940 async = false; 3941 fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, &async, 3942 fault->write, &fault->map_writable, 3943 &fault->hva); 3944 if (!async) 3945 return false; /* *pfn has correct page already */ 3946 3947 if (!fault->prefetch && kvm_can_do_async_pf(vcpu)) { 3948 trace_kvm_try_async_get_page(fault->addr, fault->gfn); 3949 if (kvm_find_async_pf_gfn(vcpu, fault->gfn)) { 3950 trace_kvm_async_pf_doublefault(fault->addr, fault->gfn); 3951 kvm_make_request(KVM_REQ_APF_HALT, vcpu); 3952 goto out_retry; 3953 } else if (kvm_arch_setup_async_pf(vcpu, fault->addr, fault->gfn)) 3954 goto out_retry; 3955 } 3956 3957 fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, NULL, 3958 fault->write, &fault->map_writable, 3959 &fault->hva); 3960 return false; 3961 3962 out_retry: 3963 *r = RET_PF_RETRY; 3964 return true; 3965 } 3966 3967 /* 3968 * Returns true if the page fault is stale and needs to be retried, i.e. if the 3969 * root was invalidated by a memslot update or a relevant mmu_notifier fired. 
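 *
 * Concretely, the checks below treat the fault as stale when the root shadow
 * page has been marked obsolete/invalid, when a root without a backing shadow
 * page (e.g. pae_root) has a pending KVM_REQ_MMU_RELOAD, or when
 * mmu_notifier_retry_hva() reports that an invalidation of the faulting hva
 * raced with this fault (mmu_seq is snapshotted in direct_page_fault() before
 * the pfn is grabbed).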
3970 */ 3971 static bool is_page_fault_stale(struct kvm_vcpu *vcpu, 3972 struct kvm_page_fault *fault, int mmu_seq) 3973 { 3974 struct kvm_mmu_page *sp = to_shadow_page(vcpu->arch.mmu->root_hpa); 3975 3976 /* Special roots, e.g. pae_root, are not backed by shadow pages. */ 3977 if (sp && is_obsolete_sp(vcpu->kvm, sp)) 3978 return true; 3979 3980 /* 3981 * Roots without an associated shadow page are considered invalid if 3982 * there is a pending request to free obsolete roots. The request is 3983 * only a hint that the current root _may_ be obsolete and needs to be 3984 * reloaded, e.g. if the guest frees a PGD that KVM is tracking as a 3985 * previous root, then __kvm_mmu_prepare_zap_page() signals all vCPUs 3986 * to reload even if no vCPU is actively using the root. 3987 */ 3988 if (!sp && kvm_test_request(KVM_REQ_MMU_RELOAD, vcpu)) 3989 return true; 3990 3991 return fault->slot && 3992 mmu_notifier_retry_hva(vcpu->kvm, mmu_seq, fault->hva); 3993 } 3994 3995 static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) 3996 { 3997 bool is_tdp_mmu_fault = is_tdp_mmu(vcpu->arch.mmu); 3998 3999 unsigned long mmu_seq; 4000 int r; 4001 4002 fault->gfn = fault->addr >> PAGE_SHIFT; 4003 fault->slot = kvm_vcpu_gfn_to_memslot(vcpu, fault->gfn); 4004 4005 if (page_fault_handle_page_track(vcpu, fault)) 4006 return RET_PF_EMULATE; 4007 4008 r = fast_page_fault(vcpu, fault); 4009 if (r != RET_PF_INVALID) 4010 return r; 4011 4012 r = mmu_topup_memory_caches(vcpu, false); 4013 if (r) 4014 return r; 4015 4016 mmu_seq = vcpu->kvm->mmu_notifier_seq; 4017 smp_rmb(); 4018 4019 if (kvm_faultin_pfn(vcpu, fault, &r)) 4020 return r; 4021 4022 if (handle_abnormal_pfn(vcpu, fault, ACC_ALL, &r)) 4023 return r; 4024 4025 r = RET_PF_RETRY; 4026 4027 if (is_tdp_mmu_fault) 4028 read_lock(&vcpu->kvm->mmu_lock); 4029 else 4030 write_lock(&vcpu->kvm->mmu_lock); 4031 4032 if (is_page_fault_stale(vcpu, fault, mmu_seq)) 4033 goto out_unlock; 4034 4035 r = make_mmu_pages_available(vcpu); 4036 if (r) 4037 goto out_unlock; 4038 4039 if (is_tdp_mmu_fault) 4040 r = kvm_tdp_mmu_map(vcpu, fault); 4041 else 4042 r = __direct_map(vcpu, fault); 4043 4044 out_unlock: 4045 if (is_tdp_mmu_fault) 4046 read_unlock(&vcpu->kvm->mmu_lock); 4047 else 4048 write_unlock(&vcpu->kvm->mmu_lock); 4049 kvm_release_pfn_clean(fault->pfn); 4050 return r; 4051 } 4052 4053 static int nonpaging_page_fault(struct kvm_vcpu *vcpu, 4054 struct kvm_page_fault *fault) 4055 { 4056 pgprintk("%s: gva %lx error %x\n", __func__, fault->addr, fault->error_code); 4057 4058 /* This path builds a PAE pagetable, we can map 2mb pages at maximum. */ 4059 fault->max_level = PG_LEVEL_2M; 4060 return direct_page_fault(vcpu, fault); 4061 } 4062 4063 int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, 4064 u64 fault_address, char *insn, int insn_len) 4065 { 4066 int r = 1; 4067 u32 flags = vcpu->arch.apf.host_apf_flags; 4068 4069 #ifndef CONFIG_X86_64 4070 /* A 64-bit CR2 should be impossible on 32-bit KVM. 
*/ 4071 if (WARN_ON_ONCE(fault_address >> 32)) 4072 return -EFAULT; 4073 #endif 4074 4075 vcpu->arch.l1tf_flush_l1d = true; 4076 if (!flags) { 4077 trace_kvm_page_fault(fault_address, error_code); 4078 4079 if (kvm_event_needs_reinjection(vcpu)) 4080 kvm_mmu_unprotect_page_virt(vcpu, fault_address); 4081 r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn, 4082 insn_len); 4083 } else if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) { 4084 vcpu->arch.apf.host_apf_flags = 0; 4085 local_irq_disable(); 4086 kvm_async_pf_task_wait_schedule(fault_address); 4087 local_irq_enable(); 4088 } else { 4089 WARN_ONCE(1, "Unexpected host async PF flags: %x\n", flags); 4090 } 4091 4092 return r; 4093 } 4094 EXPORT_SYMBOL_GPL(kvm_handle_page_fault); 4095 4096 int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) 4097 { 4098 while (fault->max_level > PG_LEVEL_4K) { 4099 int page_num = KVM_PAGES_PER_HPAGE(fault->max_level); 4100 gfn_t base = (fault->addr >> PAGE_SHIFT) & ~(page_num - 1); 4101 4102 if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num)) 4103 break; 4104 4105 --fault->max_level; 4106 } 4107 4108 return direct_page_fault(vcpu, fault); 4109 } 4110 4111 static void nonpaging_init_context(struct kvm_mmu *context) 4112 { 4113 context->page_fault = nonpaging_page_fault; 4114 context->gva_to_gpa = nonpaging_gva_to_gpa; 4115 context->sync_page = nonpaging_sync_page; 4116 context->invlpg = NULL; 4117 context->direct_map = true; 4118 } 4119 4120 static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t pgd, 4121 union kvm_mmu_page_role role) 4122 { 4123 return (role.direct || pgd == root->pgd) && 4124 VALID_PAGE(root->hpa) && to_shadow_page(root->hpa) && 4125 role.word == to_shadow_page(root->hpa)->role.word; 4126 } 4127 4128 /* 4129 * Find out if a previously cached root matching the new pgd/role is available. 4130 * The current root is also inserted into the cache. 4131 * If a matching root was found, it is assigned to kvm_mmu->root_hpa and true is 4132 * returned. 4133 * Otherwise, the LRU root from the cache is assigned to kvm_mmu->root_hpa and 4134 * false is returned. This root should now be freed by the caller. 4135 */ 4136 static bool cached_root_available(struct kvm_vcpu *vcpu, gpa_t new_pgd, 4137 union kvm_mmu_page_role new_role) 4138 { 4139 uint i; 4140 struct kvm_mmu_root_info root; 4141 struct kvm_mmu *mmu = vcpu->arch.mmu; 4142 4143 root.pgd = mmu->root_pgd; 4144 root.hpa = mmu->root_hpa; 4145 4146 if (is_root_usable(&root, new_pgd, new_role)) 4147 return true; 4148 4149 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { 4150 swap(root, mmu->prev_roots[i]); 4151 4152 if (is_root_usable(&root, new_pgd, new_role)) 4153 break; 4154 } 4155 4156 mmu->root_hpa = root.hpa; 4157 mmu->root_pgd = root.pgd; 4158 4159 return i < KVM_MMU_NUM_PREV_ROOTS; 4160 } 4161 4162 static bool fast_pgd_switch(struct kvm_vcpu *vcpu, gpa_t new_pgd, 4163 union kvm_mmu_page_role new_role) 4164 { 4165 struct kvm_mmu *mmu = vcpu->arch.mmu; 4166 4167 /* 4168 * For now, limit the fast switch to 64-bit hosts+VMs in order to avoid 4169 * having to deal with PDPTEs. We may add support for 32-bit hosts/VMs 4170 * later if necessary. 
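* With PAE paging the root is the special pae_root, which is not backed by a shadow page, and the guest's PDPTEs would have to be re-read and revalidated on every CR3 write, so there is little that could be reused cheaply.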
4171 */ 4172 if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL && 4173 mmu->root_level >= PT64_ROOT_4LEVEL) 4174 return cached_root_available(vcpu, new_pgd, new_role); 4175 4176 return false; 4177 } 4178 4179 static void __kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, 4180 union kvm_mmu_page_role new_role) 4181 { 4182 if (!fast_pgd_switch(vcpu, new_pgd, new_role)) { 4183 kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, KVM_MMU_ROOT_CURRENT); 4184 return; 4185 } 4186 4187 /* 4188 * It's possible that the cached previous root page is obsolete because 4189 * of a change in the MMU generation number. However, changing the 4190 * generation number is accompanied by KVM_REQ_MMU_RELOAD, which will 4191 * free the root set here and allocate a new one. 4192 */ 4193 kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu); 4194 4195 if (force_flush_and_sync_on_reuse) { 4196 kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); 4197 kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); 4198 } 4199 4200 /* 4201 * The last MMIO access's GVA and GPA are cached in the VCPU. When 4202 * switching to a new CR3, that GVA->GPA mapping may no longer be 4203 * valid. So clear any cached MMIO info even when we don't need to sync 4204 * the shadow page tables. 4205 */ 4206 vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY); 4207 4208 /* 4209 * If this is a direct root page, it doesn't have a write flooding 4210 * count. Otherwise, clear the write flooding count. 4211 */ 4212 if (!new_role.direct) 4213 __clear_sp_write_flooding_count( 4214 to_shadow_page(vcpu->arch.mmu->root_hpa)); 4215 } 4216 4217 void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd) 4218 { 4219 __kvm_mmu_new_pgd(vcpu, new_pgd, kvm_mmu_calc_root_page_role(vcpu)); 4220 } 4221 EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd); 4222 4223 static unsigned long get_cr3(struct kvm_vcpu *vcpu) 4224 { 4225 return kvm_read_cr3(vcpu); 4226 } 4227 4228 static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, 4229 unsigned int access) 4230 { 4231 if (unlikely(is_mmio_spte(*sptep))) { 4232 if (gfn != get_mmio_spte_gfn(*sptep)) { 4233 mmu_spte_clear_no_track(sptep); 4234 return true; 4235 } 4236 4237 mark_mmio_spte(vcpu, sptep, gfn, access); 4238 return true; 4239 } 4240 4241 return false; 4242 } 4243 4244 #define PTTYPE_EPT 18 /* arbitrary */ 4245 #define PTTYPE PTTYPE_EPT 4246 #include "paging_tmpl.h" 4247 #undef PTTYPE 4248 4249 #define PTTYPE 64 4250 #include "paging_tmpl.h" 4251 #undef PTTYPE 4252 4253 #define PTTYPE 32 4254 #include "paging_tmpl.h" 4255 #undef PTTYPE 4256 4257 static void 4258 __reset_rsvds_bits_mask(struct rsvd_bits_validate *rsvd_check, 4259 u64 pa_bits_rsvd, int level, bool nx, bool gbpages, 4260 bool pse, bool amd) 4261 { 4262 u64 gbpages_bit_rsvd = 0; 4263 u64 nonleaf_bit8_rsvd = 0; 4264 u64 high_bits_rsvd; 4265 4266 rsvd_check->bad_mt_xwr = 0; 4267 4268 if (!gbpages) 4269 gbpages_bit_rsvd = rsvd_bits(7, 7); 4270 4271 if (level == PT32E_ROOT_LEVEL) 4272 high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 62); 4273 else 4274 high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51); 4275 4276 /* Note, NX doesn't exist in PDPTEs, this is handled below. */ 4277 if (!nx) 4278 high_bits_rsvd |= rsvd_bits(63, 63); 4279 4280 /* 4281 * Non-leaf PML4Es and PDPEs reserve bit 8 (which would be the G bit for 4282 * leaf entries) on AMD CPUs only. 
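* The masks built below are indexed as rsvd_bits_mask[huge_page][level - 1]: index 0 covers non-leaf and 4K entries, index 1 covers large-page leaves at the same level.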
4283 */ 4284 if (amd) 4285 nonleaf_bit8_rsvd = rsvd_bits(8, 8); 4286 4287 switch (level) { 4288 case PT32_ROOT_LEVEL: 4289 /* no rsvd bits for 2 level 4K page table entries */ 4290 rsvd_check->rsvd_bits_mask[0][1] = 0; 4291 rsvd_check->rsvd_bits_mask[0][0] = 0; 4292 rsvd_check->rsvd_bits_mask[1][0] = 4293 rsvd_check->rsvd_bits_mask[0][0]; 4294 4295 if (!pse) { 4296 rsvd_check->rsvd_bits_mask[1][1] = 0; 4297 break; 4298 } 4299 4300 if (is_cpuid_PSE36()) 4301 /* 36bits PSE 4MB page */ 4302 rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(17, 21); 4303 else 4304 /* 32 bits PSE 4MB page */ 4305 rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(13, 21); 4306 break; 4307 case PT32E_ROOT_LEVEL: 4308 rsvd_check->rsvd_bits_mask[0][2] = rsvd_bits(63, 63) | 4309 high_bits_rsvd | 4310 rsvd_bits(5, 8) | 4311 rsvd_bits(1, 2); /* PDPTE */ 4312 rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd; /* PDE */ 4313 rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd; /* PTE */ 4314 rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | 4315 rsvd_bits(13, 20); /* large page */ 4316 rsvd_check->rsvd_bits_mask[1][0] = 4317 rsvd_check->rsvd_bits_mask[0][0]; 4318 break; 4319 case PT64_ROOT_5LEVEL: 4320 rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd | 4321 nonleaf_bit8_rsvd | 4322 rsvd_bits(7, 7); 4323 rsvd_check->rsvd_bits_mask[1][4] = 4324 rsvd_check->rsvd_bits_mask[0][4]; 4325 fallthrough; 4326 case PT64_ROOT_4LEVEL: 4327 rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd | 4328 nonleaf_bit8_rsvd | 4329 rsvd_bits(7, 7); 4330 rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd | 4331 gbpages_bit_rsvd; 4332 rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd; 4333 rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd; 4334 rsvd_check->rsvd_bits_mask[1][3] = 4335 rsvd_check->rsvd_bits_mask[0][3]; 4336 rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd | 4337 gbpages_bit_rsvd | 4338 rsvd_bits(13, 29); 4339 rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | 4340 rsvd_bits(13, 20); /* large page */ 4341 rsvd_check->rsvd_bits_mask[1][0] = 4342 rsvd_check->rsvd_bits_mask[0][0]; 4343 break; 4344 } 4345 } 4346 4347 static bool guest_can_use_gbpages(struct kvm_vcpu *vcpu) 4348 { 4349 /* 4350 * If TDP is enabled, let the guest use GBPAGES if they're supported in 4351 * hardware. The hardware page walker doesn't let KVM disable GBPAGES, 4352 * i.e. won't treat them as reserved, and KVM doesn't redo the GVA->GPA 4353 * walk for performance and complexity reasons. Not to mention KVM 4354 * _can't_ solve the problem because GVA->GPA walks aren't visible to 4355 * KVM once a TDP translation is installed. Mimic hardware behavior so 4356 * that KVM's is at least consistent, i.e. doesn't randomly inject #PF. 4357 */ 4358 return tdp_enabled ? 
boot_cpu_has(X86_FEATURE_GBPAGES) : 4359 guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES); 4360 } 4361 4362 static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, 4363 struct kvm_mmu *context) 4364 { 4365 __reset_rsvds_bits_mask(&context->guest_rsvd_check, 4366 vcpu->arch.reserved_gpa_bits, 4367 context->root_level, is_efer_nx(context), 4368 guest_can_use_gbpages(vcpu), 4369 is_cr4_pse(context), 4370 guest_cpuid_is_amd_or_hygon(vcpu)); 4371 } 4372 4373 static void 4374 __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check, 4375 u64 pa_bits_rsvd, bool execonly, int huge_page_level) 4376 { 4377 u64 high_bits_rsvd = pa_bits_rsvd & rsvd_bits(0, 51); 4378 u64 large_1g_rsvd = 0, large_2m_rsvd = 0; 4379 u64 bad_mt_xwr; 4380 4381 if (huge_page_level < PG_LEVEL_1G) 4382 large_1g_rsvd = rsvd_bits(7, 7); 4383 if (huge_page_level < PG_LEVEL_2M) 4384 large_2m_rsvd = rsvd_bits(7, 7); 4385 4386 rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd | rsvd_bits(3, 7); 4387 rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd | rsvd_bits(3, 7); 4388 rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd | rsvd_bits(3, 6) | large_1g_rsvd; 4389 rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd | rsvd_bits(3, 6) | large_2m_rsvd; 4390 rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd; 4391 4392 /* large page */ 4393 rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4]; 4394 rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3]; 4395 rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd | rsvd_bits(12, 29) | large_1g_rsvd; 4396 rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | rsvd_bits(12, 20) | large_2m_rsvd; 4397 rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0]; 4398 4399 bad_mt_xwr = 0xFFull << (2 * 8); /* bits 3..5 must not be 2 */ 4400 bad_mt_xwr |= 0xFFull << (3 * 8); /* bits 3..5 must not be 3 */ 4401 bad_mt_xwr |= 0xFFull << (7 * 8); /* bits 3..5 must not be 7 */ 4402 bad_mt_xwr |= REPEAT_BYTE(1ull << 2); /* bits 0..2 must not be 010 */ 4403 bad_mt_xwr |= REPEAT_BYTE(1ull << 6); /* bits 0..2 must not be 110 */ 4404 if (!execonly) { 4405 /* bits 0..2 must not be 100 unless VMX capabilities allow it */ 4406 bad_mt_xwr |= REPEAT_BYTE(1ull << 4); 4407 } 4408 rsvd_check->bad_mt_xwr = bad_mt_xwr; 4409 } 4410 4411 static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu, 4412 struct kvm_mmu *context, bool execonly, int huge_page_level) 4413 { 4414 __reset_rsvds_bits_mask_ept(&context->guest_rsvd_check, 4415 vcpu->arch.reserved_gpa_bits, execonly, 4416 huge_page_level); 4417 } 4418 4419 static inline u64 reserved_hpa_bits(void) 4420 { 4421 return rsvd_bits(shadow_phys_bits, 63); 4422 } 4423 4424 /* 4425 * The host page table built here is the shadow page table for the guest's 4426 * page table (or for an AMD nested guest's page table), so its MMU features 4427 * must follow the guest's features. 4428 */ 4429 static void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, 4430 struct kvm_mmu *context) 4431 { 4432 /* 4433 * KVM uses NX when TDP is disabled to handle a variety of scenarios, 4434 * notably for huge SPTEs if iTLB multi-hit mitigation is enabled and 4435 * to generate correct permissions for CR0.WP=0/CR4.SMEP=1/EFER.NX=0. 4436 * The iTLB multi-hit workaround can be toggled at any time, so assume 4437 * NX can be used by any non-nested shadow MMU to avoid having to reset 4438 * MMU contexts. Note, KVM forces EFER.NX=1 when TDP is disabled. 4439 */ 4440 bool uses_nx = is_efer_nx(context) || !tdp_enabled; 4441 4442 /* @amd adds a reserved-bit check on bit 8 of SPTEs, which KVM shouldn't use anyway.
*/ 4443 bool is_amd = true; 4444 /* KVM doesn't use 2-level page tables for the shadow MMU. */ 4445 bool is_pse = false; 4446 struct rsvd_bits_validate *shadow_zero_check; 4447 int i; 4448 4449 WARN_ON_ONCE(context->shadow_root_level < PT32E_ROOT_LEVEL); 4450 4451 shadow_zero_check = &context->shadow_zero_check; 4452 __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(), 4453 context->shadow_root_level, uses_nx, 4454 guest_can_use_gbpages(vcpu), is_pse, is_amd); 4455 4456 if (!shadow_me_mask) 4457 return; 4458 4459 for (i = context->shadow_root_level; --i >= 0;) { 4460 shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask; 4461 shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask; 4462 } 4463 4464 } 4465 4466 static inline bool boot_cpu_is_amd(void) 4467 { 4468 WARN_ON_ONCE(!tdp_enabled); 4469 return shadow_x_mask == 0; 4470 } 4471 4472 /* 4473 * These are the reserved-bit masks for the host's direct (TDP) page table; use 4474 * as many MMU features as possible. Note, KVM currently does not do execution-protection here. 4475 */ 4476 static void 4477 reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, 4478 struct kvm_mmu *context) 4479 { 4480 struct rsvd_bits_validate *shadow_zero_check; 4481 int i; 4482 4483 shadow_zero_check = &context->shadow_zero_check; 4484 4485 if (boot_cpu_is_amd()) 4486 __reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(), 4487 context->shadow_root_level, false, 4488 boot_cpu_has(X86_FEATURE_GBPAGES), 4489 false, true); 4490 else 4491 __reset_rsvds_bits_mask_ept(shadow_zero_check, 4492 reserved_hpa_bits(), false, 4493 max_huge_page_level); 4494 4495 if (!shadow_me_mask) 4496 return; 4497 4498 for (i = context->shadow_root_level; --i >= 0;) { 4499 shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask; 4500 shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask; 4501 } 4502 } 4503 4504 /* 4505 * Same as the comments in reset_shadow_zero_bits_mask(), except this is the 4506 * shadow EPT page table for an Intel nested guest. 4507 */ 4508 static void 4509 reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, 4510 struct kvm_mmu *context, bool execonly) 4511 { 4512 __reset_rsvds_bits_mask_ept(&context->shadow_zero_check, 4513 reserved_hpa_bits(), execonly, 4514 max_huge_page_level); 4515 } 4516 4517 #define BYTE_MASK(access) \ 4518 ((1 & (access) ? 2 : 0) | \ 4519 (2 & (access) ? 4 : 0) | \ 4520 (3 & (access) ? 8 : 0) | \ 4521 (4 & (access) ? 16 : 0) | \ 4522 (5 & (access) ? 32 : 0) | \ 4523 (6 & (access) ? 64 : 0) | \ 4524 (7 & (access) ? 128 : 0)) 4525 4526 4527 static void update_permission_bitmask(struct kvm_mmu *mmu, bool ept) 4528 { 4529 unsigned byte; 4530 4531 const u8 x = BYTE_MASK(ACC_EXEC_MASK); 4532 const u8 w = BYTE_MASK(ACC_WRITE_MASK); 4533 const u8 u = BYTE_MASK(ACC_USER_MASK); 4534 4535 bool cr4_smep = is_cr4_smep(mmu); 4536 bool cr4_smap = is_cr4_smap(mmu); 4537 bool cr0_wp = is_cr0_wp(mmu); 4538 bool efer_nx = is_efer_nx(mmu); 4539 4540 for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) { 4541 unsigned pfec = byte << 1; 4542 4543 /* 4544 * Each "*f" variable has a 1 bit for each UWX value 4545 * that causes a fault with the given PFEC. 4546 */ 4547 4548 /* Faults from writes to non-writable pages */ 4549 u8 wf = (pfec & PFERR_WRITE_MASK) ? (u8)~w : 0; 4550 /* Faults from user mode accesses to supervisor pages */ 4551 u8 uf = (pfec & PFERR_USER_MASK) ? (u8)~u : 0; 4552 /* Faults from fetches of non-executable pages */ 4553 u8 ff = (pfec & PFERR_FETCH_MASK) ?
(u8)~x : 0; 4554 /* Faults from kernel mode fetches of user pages */ 4555 u8 smepf = 0; 4556 /* Faults from kernel mode accesses of user pages */ 4557 u8 smapf = 0; 4558 4559 if (!ept) { 4560 /* Faults from kernel mode accesses to user pages */ 4561 u8 kf = (pfec & PFERR_USER_MASK) ? 0 : u; 4562 4563 /* Not really needed: !nx will cause pte.nx to fault */ 4564 if (!efer_nx) 4565 ff = 0; 4566 4567 /* Allow supervisor writes if !cr0.wp */ 4568 if (!cr0_wp) 4569 wf = (pfec & PFERR_USER_MASK) ? wf : 0; 4570 4571 /* Disallow supervisor fetches of user code if cr4.smep */ 4572 if (cr4_smep) 4573 smepf = (pfec & PFERR_FETCH_MASK) ? kf : 0; 4574 4575 /* 4576 * SMAP: kernel-mode data accesses from user-mode 4577 * mappings should fault. A fault is considered 4578 * as a SMAP violation if all of the following 4579 * conditions are true: 4580 * - X86_CR4_SMAP is set in CR4 4581 * - A user page is accessed 4582 * - The access is not a fetch 4583 * - Page fault in kernel mode 4584 * - if CPL = 3 or X86_EFLAGS_AC is clear 4585 * 4586 * Here, we cover the first four conditions. 4587 * The fifth is computed dynamically in permission_fault(); 4588 * PFERR_RSVD_MASK bit will be set in PFEC if the access is 4589 * *not* subject to SMAP restrictions. 4590 */ 4591 if (cr4_smap) 4592 smapf = (pfec & (PFERR_RSVD_MASK|PFERR_FETCH_MASK)) ? 0 : kf; 4593 } 4594 4595 mmu->permissions[byte] = ff | uf | wf | smepf | smapf; 4596 } 4597 } 4598 4599 /* 4600 * PKU is an additional mechanism by which the paging controls access to 4601 * user-mode addresses based on the value in the PKRU register. Protection 4602 * key violations are reported through a bit in the page fault error code. 4603 * Unlike other bits of the error code, the PK bit is not known at the 4604 * call site of e.g. gva_to_gpa; it must be computed directly in 4605 * permission_fault based on two bits of PKRU, on some machine state (CR4, 4606 * CR0, EFER, CPL), and on other bits of the error code and the page tables. 4607 * 4608 * In particular the following conditions come from the error code, the 4609 * page tables and the machine state: 4610 * - PK is always zero unless CR4.PKE=1 and EFER.LMA=1 4611 * - PK is always zero if RSVD=1 (reserved bit set) or F=1 (instruction fetch) 4612 * - PK is always zero if U=0 in the page tables 4613 * - PKRU.WD is ignored if CR0.WP=0 and the access is a supervisor access. 4614 * 4615 * The PKRU bitmask caches the result of these four conditions. The error 4616 * code (minus the P bit) and the page table's U bit form an index into the 4617 * PKRU bitmask. Two bits of the PKRU bitmask are then extracted and ANDed 4618 * with the two bits of the PKRU register corresponding to the protection key. 4619 * For the first three conditions above the bits will be 00, thus masking 4620 * away both AD and WD. For all reads or if the last condition holds, WD 4621 * only will be masked away. 4622 */ 4623 static void update_pkru_bitmask(struct kvm_mmu *mmu) 4624 { 4625 unsigned bit; 4626 bool wp; 4627 4628 mmu->pkru_mask = 0; 4629 4630 if (!is_cr4_pke(mmu)) 4631 return; 4632 4633 wp = is_cr0_wp(mmu); 4634 4635 for (bit = 0; bit < ARRAY_SIZE(mmu->permissions); ++bit) { 4636 unsigned pfec, pkey_bits; 4637 bool check_pkey, check_write, ff, uf, wf, pte_user; 4638 4639 pfec = bit << 1; 4640 ff = pfec & PFERR_FETCH_MASK; 4641 uf = pfec & PFERR_USER_MASK; 4642 wf = pfec & PFERR_WRITE_MASK; 4643 4644 /* PFEC.RSVD is replaced by ACC_USER_MASK.
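permission_fault() re-uses the PFERR_RSVD bit position in its index to carry the page table's U/S bit, since genuine reserved-bit faults never reach the protection-key check.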
*/ 4645 pte_user = pfec & PFERR_RSVD_MASK; 4646 4647 /* 4648 * Only need to check the access which is not an 4649 * instruction fetch and is to a user page. 4650 */ 4651 check_pkey = (!ff && pte_user); 4652 /* 4653 * write access is controlled by PKRU if it is a 4654 * user access or CR0.WP = 1. 4655 */ 4656 check_write = check_pkey && wf && (uf || wp); 4657 4658 /* PKRU.AD stops both read and write access. */ 4659 pkey_bits = !!check_pkey; 4660 /* PKRU.WD stops write access. */ 4661 pkey_bits |= (!!check_write) << 1; 4662 4663 mmu->pkru_mask |= (pkey_bits & 3) << pfec; 4664 } 4665 } 4666 4667 static void reset_guest_paging_metadata(struct kvm_vcpu *vcpu, 4668 struct kvm_mmu *mmu) 4669 { 4670 if (!is_cr0_pg(mmu)) 4671 return; 4672 4673 reset_rsvds_bits_mask(vcpu, mmu); 4674 update_permission_bitmask(mmu, false); 4675 update_pkru_bitmask(mmu); 4676 } 4677 4678 static void paging64_init_context(struct kvm_mmu *context) 4679 { 4680 context->page_fault = paging64_page_fault; 4681 context->gva_to_gpa = paging64_gva_to_gpa; 4682 context->sync_page = paging64_sync_page; 4683 context->invlpg = paging64_invlpg; 4684 context->direct_map = false; 4685 } 4686 4687 static void paging32_init_context(struct kvm_mmu *context) 4688 { 4689 context->page_fault = paging32_page_fault; 4690 context->gva_to_gpa = paging32_gva_to_gpa; 4691 context->sync_page = paging32_sync_page; 4692 context->invlpg = paging32_invlpg; 4693 context->direct_map = false; 4694 } 4695 4696 static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu, 4697 struct kvm_mmu_role_regs *regs) 4698 { 4699 union kvm_mmu_extended_role ext = {0}; 4700 4701 if (____is_cr0_pg(regs)) { 4702 ext.cr0_pg = 1; 4703 ext.cr4_pae = ____is_cr4_pae(regs); 4704 ext.cr4_smep = ____is_cr4_smep(regs); 4705 ext.cr4_smap = ____is_cr4_smap(regs); 4706 ext.cr4_pse = ____is_cr4_pse(regs); 4707 4708 /* PKEY and LA57 are active iff long mode is active. */ 4709 ext.cr4_pke = ____is_efer_lma(regs) && ____is_cr4_pke(regs); 4710 ext.cr4_la57 = ____is_efer_lma(regs) && ____is_cr4_la57(regs); 4711 ext.efer_lma = ____is_efer_lma(regs); 4712 } 4713 4714 ext.valid = 1; 4715 4716 return ext; 4717 } 4718 4719 static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu, 4720 struct kvm_mmu_role_regs *regs, 4721 bool base_only) 4722 { 4723 union kvm_mmu_role role = {0}; 4724 4725 role.base.access = ACC_ALL; 4726 if (____is_cr0_pg(regs)) { 4727 role.base.efer_nx = ____is_efer_nx(regs); 4728 role.base.cr0_wp = ____is_cr0_wp(regs); 4729 } 4730 role.base.smm = is_smm(vcpu); 4731 role.base.guest_mode = is_guest_mode(vcpu); 4732 4733 if (base_only) 4734 return role; 4735 4736 role.ext = kvm_calc_mmu_role_ext(vcpu, regs); 4737 4738 return role; 4739 } 4740 4741 static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu) 4742 { 4743 /* tdp_root_level is architecture forced level, use it if nonzero */ 4744 if (tdp_root_level) 4745 return tdp_root_level; 4746 4747 /* Use 5-level TDP if and only if it's useful/necessary. 
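A guest with MAXPHYADDR <= 48 fits entirely within a 4-level TDP tree; using 5 levels would only add an extra level to every hardware page walk.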
*/ 4748 if (max_tdp_level == 5 && cpuid_maxphyaddr(vcpu) <= 48) 4749 return 4; 4750 4751 return max_tdp_level; 4752 } 4753 4754 static union kvm_mmu_role 4755 kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, 4756 struct kvm_mmu_role_regs *regs, bool base_only) 4757 { 4758 union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, regs, base_only); 4759 4760 role.base.ad_disabled = (shadow_accessed_mask == 0); 4761 role.base.level = kvm_mmu_get_tdp_level(vcpu); 4762 role.base.direct = true; 4763 role.base.has_4_byte_gpte = false; 4764 4765 return role; 4766 } 4767 4768 static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) 4769 { 4770 struct kvm_mmu *context = &vcpu->arch.root_mmu; 4771 struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu); 4772 union kvm_mmu_role new_role = 4773 kvm_calc_tdp_mmu_root_page_role(vcpu, &regs, false); 4774 4775 if (new_role.as_u64 == context->mmu_role.as_u64) 4776 return; 4777 4778 context->mmu_role.as_u64 = new_role.as_u64; 4779 context->page_fault = kvm_tdp_page_fault; 4780 context->sync_page = nonpaging_sync_page; 4781 context->invlpg = NULL; 4782 context->shadow_root_level = kvm_mmu_get_tdp_level(vcpu); 4783 context->direct_map = true; 4784 context->get_guest_pgd = get_cr3; 4785 context->get_pdptr = kvm_pdptr_read; 4786 context->inject_page_fault = kvm_inject_page_fault; 4787 context->root_level = role_regs_to_root_level(&regs); 4788 4789 if (!is_cr0_pg(context)) 4790 context->gva_to_gpa = nonpaging_gva_to_gpa; 4791 else if (is_cr4_pae(context)) 4792 context->gva_to_gpa = paging64_gva_to_gpa; 4793 else 4794 context->gva_to_gpa = paging32_gva_to_gpa; 4795 4796 reset_guest_paging_metadata(vcpu, context); 4797 reset_tdp_shadow_zero_bits_mask(vcpu, context); 4798 } 4799 4800 static union kvm_mmu_role 4801 kvm_calc_shadow_root_page_role_common(struct kvm_vcpu *vcpu, 4802 struct kvm_mmu_role_regs *regs, bool base_only) 4803 { 4804 union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, regs, base_only); 4805 4806 role.base.smep_andnot_wp = role.ext.cr4_smep && !____is_cr0_wp(regs); 4807 role.base.smap_andnot_wp = role.ext.cr4_smap && !____is_cr0_wp(regs); 4808 role.base.has_4_byte_gpte = ____is_cr0_pg(regs) && !____is_cr4_pae(regs); 4809 4810 return role; 4811 } 4812 4813 static union kvm_mmu_role 4814 kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, 4815 struct kvm_mmu_role_regs *regs, bool base_only) 4816 { 4817 union kvm_mmu_role role = 4818 kvm_calc_shadow_root_page_role_common(vcpu, regs, base_only); 4819 4820 role.base.direct = !____is_cr0_pg(regs); 4821 4822 if (!____is_efer_lma(regs)) 4823 role.base.level = PT32E_ROOT_LEVEL; 4824 else if (____is_cr4_la57(regs)) 4825 role.base.level = PT64_ROOT_5LEVEL; 4826 else 4827 role.base.level = PT64_ROOT_4LEVEL; 4828 4829 return role; 4830 } 4831 4832 static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *context, 4833 struct kvm_mmu_role_regs *regs, 4834 union kvm_mmu_role new_role) 4835 { 4836 if (new_role.as_u64 == context->mmu_role.as_u64) 4837 return; 4838 4839 context->mmu_role.as_u64 = new_role.as_u64; 4840 4841 if (!is_cr0_pg(context)) 4842 nonpaging_init_context(context); 4843 else if (is_cr4_pae(context)) 4844 paging64_init_context(context); 4845 else 4846 paging32_init_context(context); 4847 context->root_level = role_regs_to_root_level(regs); 4848 4849 reset_guest_paging_metadata(vcpu, context); 4850 context->shadow_root_level = new_role.base.level; 4851 4852 reset_shadow_zero_bits_mask(vcpu, context); 4853 } 4854 4855 static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu,
4856 struct kvm_mmu_role_regs *regs) 4857 { 4858 struct kvm_mmu *context = &vcpu->arch.root_mmu; 4859 union kvm_mmu_role new_role = 4860 kvm_calc_shadow_mmu_root_page_role(vcpu, regs, false); 4861 4862 shadow_mmu_init_context(vcpu, context, regs, new_role); 4863 } 4864 4865 static union kvm_mmu_role 4866 kvm_calc_shadow_npt_root_page_role(struct kvm_vcpu *vcpu, 4867 struct kvm_mmu_role_regs *regs) 4868 { 4869 union kvm_mmu_role role = 4870 kvm_calc_shadow_root_page_role_common(vcpu, regs, false); 4871 4872 role.base.direct = false; 4873 role.base.level = kvm_mmu_get_tdp_level(vcpu); 4874 4875 return role; 4876 } 4877 4878 void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0, 4879 unsigned long cr4, u64 efer, gpa_t nested_cr3) 4880 { 4881 struct kvm_mmu *context = &vcpu->arch.guest_mmu; 4882 struct kvm_mmu_role_regs regs = { 4883 .cr0 = cr0, 4884 .cr4 = cr4 & ~X86_CR4_PKE, 4885 .efer = efer, 4886 }; 4887 union kvm_mmu_role new_role; 4888 4889 new_role = kvm_calc_shadow_npt_root_page_role(vcpu, &regs); 4890 4891 __kvm_mmu_new_pgd(vcpu, nested_cr3, new_role.base); 4892 4893 shadow_mmu_init_context(vcpu, context, &regs, new_role); 4894 } 4895 EXPORT_SYMBOL_GPL(kvm_init_shadow_npt_mmu); 4896 4897 static union kvm_mmu_role 4898 kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty, 4899 bool execonly, u8 level) 4900 { 4901 union kvm_mmu_role role = {0}; 4902 4903 /* SMM flag is inherited from root_mmu */ 4904 role.base.smm = vcpu->arch.root_mmu.mmu_role.base.smm; 4905 4906 role.base.level = level; 4907 role.base.has_4_byte_gpte = false; 4908 role.base.direct = false; 4909 role.base.ad_disabled = !accessed_dirty; 4910 role.base.guest_mode = true; 4911 role.base.access = ACC_ALL; 4912 4913 /* EPT, and thus nested EPT, does not consume CR0, CR4, nor EFER.
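so the role bits that would normally be derived from those registers are left at zero; only the EPT-specific execonly bit (and the valid bit) are set below.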
*/ 4914 role.ext.word = 0; 4915 role.ext.execonly = execonly; 4916 role.ext.valid = 1; 4917 4918 return role; 4919 } 4920 4921 void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, 4922 int huge_page_level, bool accessed_dirty, 4923 gpa_t new_eptp) 4924 { 4925 struct kvm_mmu *context = &vcpu->arch.guest_mmu; 4926 u8 level = vmx_eptp_page_walk_level(new_eptp); 4927 union kvm_mmu_role new_role = 4928 kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty, 4929 execonly, level); 4930 4931 __kvm_mmu_new_pgd(vcpu, new_eptp, new_role.base); 4932 4933 if (new_role.as_u64 == context->mmu_role.as_u64) 4934 return; 4935 4936 context->mmu_role.as_u64 = new_role.as_u64; 4937 4938 context->shadow_root_level = level; 4939 4940 context->ept_ad = accessed_dirty; 4941 context->page_fault = ept_page_fault; 4942 context->gva_to_gpa = ept_gva_to_gpa; 4943 context->sync_page = ept_sync_page; 4944 context->invlpg = ept_invlpg; 4945 context->root_level = level; 4946 context->direct_map = false; 4947 4948 update_permission_bitmask(context, true); 4949 context->pkru_mask = 0; 4950 reset_rsvds_bits_mask_ept(vcpu, context, execonly, huge_page_level); 4951 reset_ept_shadow_zero_bits_mask(vcpu, context, execonly); 4952 } 4953 EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu); 4954 4955 static void init_kvm_softmmu(struct kvm_vcpu *vcpu) 4956 { 4957 struct kvm_mmu *context = &vcpu->arch.root_mmu; 4958 struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu); 4959 4960 kvm_init_shadow_mmu(vcpu, &regs); 4961 4962 context->get_guest_pgd = get_cr3; 4963 context->get_pdptr = kvm_pdptr_read; 4964 context->inject_page_fault = kvm_inject_page_fault; 4965 } 4966 4967 static union kvm_mmu_role 4968 kvm_calc_nested_mmu_role(struct kvm_vcpu *vcpu, struct kvm_mmu_role_regs *regs) 4969 { 4970 union kvm_mmu_role role; 4971 4972 role = kvm_calc_shadow_root_page_role_common(vcpu, regs, false); 4973 4974 /* 4975 * Nested MMUs are used only for walking L2's gva->gpa, they never have 4976 * shadow pages of their own and so "direct" has no meaning. Set it 4977 * to "true" to try to detect bogus usage of the nested MMU. 4978 */ 4979 role.base.direct = true; 4980 role.base.level = role_regs_to_root_level(regs); 4981 return role; 4982 } 4983 4984 static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) 4985 { 4986 struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu); 4987 union kvm_mmu_role new_role = kvm_calc_nested_mmu_role(vcpu, &regs); 4988 struct kvm_mmu *g_context = &vcpu->arch.nested_mmu; 4989 4990 if (new_role.as_u64 == g_context->mmu_role.as_u64) 4991 return; 4992 4993 g_context->mmu_role.as_u64 = new_role.as_u64; 4994 g_context->get_guest_pgd = get_cr3; 4995 g_context->get_pdptr = kvm_pdptr_read; 4996 g_context->inject_page_fault = kvm_inject_page_fault; 4997 g_context->root_level = new_role.base.level; 4998 4999 /* 5000 * L2 page tables are never shadowed, so there is no need to sync 5001 * SPTEs. 5002 */ 5003 g_context->invlpg = NULL; 5004 5005 /* 5006 * Note that arch.mmu->gva_to_gpa translates l2_gpa to l1_gpa using 5007 * L1's nested page tables (e.g. EPT12). The nested translation 5008 * of l2_gva to l1_gpa is done by arch.nested_mmu.gva_to_gpa using 5009 * L2's page tables as the first level of translation and L1's 5010 * nested page tables as the second level of translation. Basically 5011 * the gva_to_gpa functions between mmu and nested_mmu are swapped.
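* The walker installed below reflects L2's own paging mode; note that PAE guests use paging64_gva_to_gpa because PAE GPTEs are 8 bytes wide.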
*/ 5013 if (!is_paging(vcpu)) 5014 g_context->gva_to_gpa = nonpaging_gva_to_gpa; 5015 else if (is_long_mode(vcpu)) 5016 g_context->gva_to_gpa = paging64_gva_to_gpa; 5017 else if (is_pae(vcpu)) 5018 g_context->gva_to_gpa = paging64_gva_to_gpa; 5019 else 5020 g_context->gva_to_gpa = paging32_gva_to_gpa; 5021 5022 reset_guest_paging_metadata(vcpu, g_context); 5023 } 5024 5025 void kvm_init_mmu(struct kvm_vcpu *vcpu) 5026 { 5027 if (mmu_is_nested(vcpu)) 5028 init_kvm_nested_mmu(vcpu); 5029 else if (tdp_enabled) 5030 init_kvm_tdp_mmu(vcpu); 5031 else 5032 init_kvm_softmmu(vcpu); 5033 } 5034 EXPORT_SYMBOL_GPL(kvm_init_mmu); 5035 5036 static union kvm_mmu_page_role 5037 kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu) 5038 { 5039 struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu); 5040 union kvm_mmu_role role; 5041 5042 if (tdp_enabled) 5043 role = kvm_calc_tdp_mmu_root_page_role(vcpu, &regs, true); 5044 else 5045 role = kvm_calc_shadow_mmu_root_page_role(vcpu, &regs, true); 5046 5047 return role.base; 5048 } 5049 5050 void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu) 5051 { 5052 /* 5053 * Invalidate all MMU roles to force them to reinitialize as CPUID 5054 * information is factored into reserved bit calculations. 5055 * 5056 * Correctly handling multiple vCPU models (with respect to paging and 5057 * physical address properties) in a single VM would require tracking 5058 * all relevant CPUID information in kvm_mmu_page_role. That is very 5059 * undesirable as it would increase the memory requirements for 5060 * gfn_track (see struct kvm_mmu_page_role comments). For now that 5061 * problem is swept under the rug; KVM's CPUID API is horrific and 5062 * it's all but impossible to solve it without introducing a new API. 5063 */ 5064 vcpu->arch.root_mmu.mmu_role.ext.valid = 0; 5065 vcpu->arch.guest_mmu.mmu_role.ext.valid = 0; 5066 vcpu->arch.nested_mmu.mmu_role.ext.valid = 0; 5067 kvm_mmu_reset_context(vcpu); 5068 5069 /* 5070 * Changing guest CPUID after KVM_RUN is forbidden; see the comment in 5071 * kvm_arch_vcpu_ioctl().
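* Since the vCPU has not yet entered the guest, last_vmentry_cpu is still -1, which is exactly what the assertion below checks.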
*/ 5073 KVM_BUG_ON(vcpu->arch.last_vmentry_cpu != -1, vcpu->kvm); 5074 } 5075 5076 void kvm_mmu_reset_context(struct kvm_vcpu *vcpu) 5077 { 5078 kvm_mmu_unload(vcpu); 5079 kvm_init_mmu(vcpu); 5080 } 5081 EXPORT_SYMBOL_GPL(kvm_mmu_reset_context); 5082 5083 int kvm_mmu_load(struct kvm_vcpu *vcpu) 5084 { 5085 int r; 5086 5087 r = mmu_topup_memory_caches(vcpu, !vcpu->arch.mmu->direct_map); 5088 if (r) 5089 goto out; 5090 r = mmu_alloc_special_roots(vcpu); 5091 if (r) 5092 goto out; 5093 if (vcpu->arch.mmu->direct_map) 5094 r = mmu_alloc_direct_roots(vcpu); 5095 else 5096 r = mmu_alloc_shadow_roots(vcpu); 5097 if (r) 5098 goto out; 5099 5100 kvm_mmu_sync_roots(vcpu); 5101 5102 kvm_mmu_load_pgd(vcpu); 5103 static_call(kvm_x86_tlb_flush_current)(vcpu); 5104 out: 5105 return r; 5106 } 5107 5108 void kvm_mmu_unload(struct kvm_vcpu *vcpu) 5109 { 5110 kvm_mmu_free_roots(vcpu, &vcpu->arch.root_mmu, KVM_MMU_ROOTS_ALL); 5111 WARN_ON(VALID_PAGE(vcpu->arch.root_mmu.root_hpa)); 5112 kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); 5113 WARN_ON(VALID_PAGE(vcpu->arch.guest_mmu.root_hpa)); 5114 } 5115 5116 static bool need_remote_flush(u64 old, u64 new) 5117 { 5118 if (!is_shadow_present_pte(old)) 5119 return false; 5120 if (!is_shadow_present_pte(new)) 5121 return true; 5122 if ((old ^ new) & PT64_BASE_ADDR_MASK) 5123 return true; 5124 old ^= shadow_nx_mask; 5125 new ^= shadow_nx_mask; 5126 return (old & ~new & PT64_PERM_MASK) != 0; 5127 } 5128 5129 static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa, 5130 int *bytes) 5131 { 5132 u64 gentry = 0; 5133 int r; 5134 5135 /* 5136 * Assume that the pte write is on a page table of the same type 5137 * as the current vcpu paging mode, since the sptes are only updated 5138 * when the modes match. 5139 */ 5140 if (is_pae(vcpu) && *bytes == 4) { 5141 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */ 5142 *gpa &= ~(gpa_t)7; 5143 *bytes = 8; 5144 } 5145 5146 if (*bytes == 4 || *bytes == 8) { 5147 r = kvm_vcpu_read_guest_atomic(vcpu, *gpa, &gentry, *bytes); 5148 if (r) 5149 gentry = 0; 5150 } 5151 5152 return gentry; 5153 } 5154 5155 /* 5156 * If we're seeing too many writes to a page, it may no longer be a page table, 5157 * or we may be forking, in which case it is better to unmap the page. 5158 */ 5159 static bool detect_write_flooding(struct kvm_mmu_page *sp) 5160 { 5161 /* 5162 * Skip write-flooding detection for SPs whose level is 1 (4K): they can 5163 * become unsync, in which case the guest page is no longer write-protected. 5164 */ 5165 if (sp->role.level == PG_LEVEL_4K) 5166 return false; 5167 5168 atomic_inc(&sp->write_flooding_count); 5169 return atomic_read(&sp->write_flooding_count) >= 3; 5170 } 5171 5172 /* 5173 * Misaligned accesses are too much trouble to fix up; also, they usually 5174 * indicate a page is not used as a page table. 5175 */ 5176 static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa, 5177 int bytes) 5178 { 5179 unsigned offset, pte_size, misaligned; 5180 5181 pgprintk("misaligned: gpa %llx bytes %d role %x\n", 5182 gpa, bytes, sp->role.word); 5183 5184 offset = offset_in_page(gpa); 5185 pte_size = sp->role.has_4_byte_gpte ? 4 : 8; 5186 5187 /* 5188 * Sometimes the OS writes just one byte of a pte to update status 5189 * bits; for example, Linux's clear_bit() uses an andb instruction.
5190 */ 5191 if (!(offset & (pte_size - 1)) && bytes == 1) 5192 return false; 5193 5194 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); 5195 misaligned |= bytes < 4; 5196 5197 return misaligned; 5198 } 5199 5200 static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte) 5201 { 5202 unsigned page_offset, quadrant; 5203 u64 *spte; 5204 int level; 5205 5206 page_offset = offset_in_page(gpa); 5207 level = sp->role.level; 5208 *nspte = 1; 5209 if (sp->role.has_4_byte_gpte) { 5210 page_offset <<= 1; /* 32->64 */ 5211 /* 5212 * A 32-bit pde maps 4MB while the shadow pdes map 5213 * only 2MB. So we need to double the offset again 5214 * and zap two pdes instead of one. 5215 */ 5216 if (level == PT32_ROOT_LEVEL) { 5217 page_offset &= ~7; /* kill rounding error */ 5218 page_offset <<= 1; 5219 *nspte = 2; 5220 } 5221 quadrant = page_offset >> PAGE_SHIFT; 5222 page_offset &= ~PAGE_MASK; 5223 if (quadrant != sp->role.quadrant) 5224 return NULL; 5225 } 5226 5227 spte = &sp->spt[page_offset / sizeof(*spte)]; 5228 return spte; 5229 } 5230 5231 static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, 5232 const u8 *new, int bytes, 5233 struct kvm_page_track_notifier_node *node) 5234 { 5235 gfn_t gfn = gpa >> PAGE_SHIFT; 5236 struct kvm_mmu_page *sp; 5237 LIST_HEAD(invalid_list); 5238 u64 entry, gentry, *spte; 5239 int npte; 5240 bool flush = false; 5241 5242 /* 5243 * If we don't have indirect shadow pages, it means no page is 5244 * write-protected, so we can exit simply. 5245 */ 5246 if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages)) 5247 return; 5248 5249 pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); 5250 5251 /* 5252 * No need to care whether allocation memory is successful 5253 * or not since pte prefetch is skipped if it does not have 5254 * enough objects in the cache. 
5255 */ 5256 mmu_topup_memory_caches(vcpu, true); 5257 5258 write_lock(&vcpu->kvm->mmu_lock); 5259 5260 gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes); 5261 5262 ++vcpu->kvm->stat.mmu_pte_write; 5263 kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE); 5264 5265 for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) { 5266 if (detect_write_misaligned(sp, gpa, bytes) || 5267 detect_write_flooding(sp)) { 5268 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); 5269 ++vcpu->kvm->stat.mmu_flooded; 5270 continue; 5271 } 5272 5273 spte = get_written_sptes(sp, gpa, &npte); 5274 if (!spte) 5275 continue; 5276 5277 while (npte--) { 5278 entry = *spte; 5279 mmu_page_zap_pte(vcpu->kvm, sp, spte, NULL); 5280 if (gentry && sp->role.level != PG_LEVEL_4K) 5281 ++vcpu->kvm->stat.mmu_pde_zapped; 5282 if (need_remote_flush(entry, *spte)) 5283 flush = true; 5284 ++spte; 5285 } 5286 } 5287 kvm_mmu_remote_flush_or_zap(vcpu->kvm, &invalid_list, flush); 5288 kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE); 5289 write_unlock(&vcpu->kvm->mmu_lock); 5290 } 5291 5292 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, 5293 void *insn, int insn_len) 5294 { 5295 int r, emulation_type = EMULTYPE_PF; 5296 bool direct = vcpu->arch.mmu->direct_map; 5297 5298 if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa))) 5299 return RET_PF_RETRY; 5300 5301 r = RET_PF_INVALID; 5302 if (unlikely(error_code & PFERR_RSVD_MASK)) { 5303 r = handle_mmio_page_fault(vcpu, cr2_or_gpa, direct); 5304 if (r == RET_PF_EMULATE) 5305 goto emulate; 5306 } 5307 5308 if (r == RET_PF_INVALID) { 5309 r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa, 5310 lower_32_bits(error_code), false); 5311 if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm)) 5312 return -EIO; 5313 } 5314 5315 if (r < 0) 5316 return r; 5317 if (r != RET_PF_EMULATE) 5318 return 1; 5319 5320 /* 5321 * Before emulating the instruction, check if the error code 5322 * was due to a RO violation while translating the guest page. 5323 * This can occur when using nested virtualization with nested 5324 * paging in both guests. If true, we simply unprotect the page 5325 * and resume the guest. 5326 */ 5327 if (vcpu->arch.mmu->direct_map && 5328 (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) { 5329 kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa)); 5330 return 1; 5331 } 5332 5333 /* 5334 * vcpu->arch.mmu.page_fault returned RET_PF_EMULATE, but we can still 5335 * optimistically try to just unprotect the page and let the processor 5336 * re-execute the instruction that caused the page fault. Do not allow 5337 * retrying MMIO emulation, as it's not only pointless but could also 5338 * cause us to enter an infinite loop because the processor will keep 5339 * faulting on the non-existent MMIO address. Retrying an instruction 5340 * from a nested guest is also pointless and dangerous as we are only 5341 * explicitly shadowing L1's page tables, i.e. unprotecting something 5342 * for L1 isn't going to magically fix whatever issue cause L2 to fail. 5343 */ 5344 if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu)) 5345 emulation_type |= EMULTYPE_ALLOW_RETRY_PF; 5346 emulate: 5347 return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn, 5348 insn_len); 5349 } 5350 EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); 5351 5352 void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, 5353 gva_t gva, hpa_t root_hpa) 5354 { 5355 int i; 5356 5357 /* It's actually a GPA for vcpu->arch.guest_mmu. 
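When shadowing L1's nested page tables the invalidated address is an nGPA rather than a GVA, so the GVA-tagged hardware TLB flush is skipped for guest_mmu below.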
*/ 5358 if (mmu != &vcpu->arch.guest_mmu) { 5359 /* INVLPG on a non-canonical address is a NOP according to the SDM. */ 5360 if (is_noncanonical_address(gva, vcpu)) 5361 return; 5362 5363 static_call(kvm_x86_tlb_flush_gva)(vcpu, gva); 5364 } 5365 5366 if (!mmu->invlpg) 5367 return; 5368 5369 if (root_hpa == INVALID_PAGE) { 5370 mmu->invlpg(vcpu, gva, mmu->root_hpa); 5371 5372 /* 5373 * INVLPG is required to invalidate any global mappings for the VA, 5374 * irrespective of PCID. Since it would take us roughly similar amount 5375 * of work to determine whether any of the prev_root mappings of the VA 5376 * is marked global, or to just sync it blindly, so we might as well 5377 * just always sync it. 5378 * 5379 * Mappings not reachable via the current cr3 or the prev_roots will be 5380 * synced when switching to that cr3, so nothing needs to be done here 5381 * for them. 5382 */ 5383 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) 5384 if (VALID_PAGE(mmu->prev_roots[i].hpa)) 5385 mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa); 5386 } else { 5387 mmu->invlpg(vcpu, gva, root_hpa); 5388 } 5389 } 5390 5391 void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) 5392 { 5393 kvm_mmu_invalidate_gva(vcpu, vcpu->arch.walk_mmu, gva, INVALID_PAGE); 5394 ++vcpu->stat.invlpg; 5395 } 5396 EXPORT_SYMBOL_GPL(kvm_mmu_invlpg); 5397 5398 5399 void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) 5400 { 5401 struct kvm_mmu *mmu = vcpu->arch.mmu; 5402 bool tlb_flush = false; 5403 uint i; 5404 5405 if (pcid == kvm_get_active_pcid(vcpu)) { 5406 mmu->invlpg(vcpu, gva, mmu->root_hpa); 5407 tlb_flush = true; 5408 } 5409 5410 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { 5411 if (VALID_PAGE(mmu->prev_roots[i].hpa) && 5412 pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd)) { 5413 mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa); 5414 tlb_flush = true; 5415 } 5416 } 5417 5418 if (tlb_flush) 5419 static_call(kvm_x86_tlb_flush_gva)(vcpu, gva); 5420 5421 ++vcpu->stat.invlpg; 5422 5423 /* 5424 * Mappings not reachable via the current cr3 or the prev_roots will be 5425 * synced when switching to that cr3, so nothing needs to be done here 5426 * for them. 5427 */ 5428 } 5429 5430 void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level, 5431 int tdp_max_root_level, int tdp_huge_page_level) 5432 { 5433 tdp_enabled = enable_tdp; 5434 tdp_root_level = tdp_forced_root_level; 5435 max_tdp_level = tdp_max_root_level; 5436 5437 /* 5438 * max_huge_page_level reflects KVM's MMU capabilities irrespective 5439 * of kernel support, e.g. KVM may be capable of using 1GB pages when 5440 * the kernel is not. But, KVM never creates a page size greater than 5441 * what is used by the kernel for any given HVA, i.e. the kernel's 5442 * capabilities are ultimately consulted by kvm_mmu_hugepage_adjust(). 5443 */ 5444 if (tdp_enabled) 5445 max_huge_page_level = tdp_huge_page_level; 5446 else if (boot_cpu_has(X86_FEATURE_GBPAGES)) 5447 max_huge_page_level = PG_LEVEL_1G; 5448 else 5449 max_huge_page_level = PG_LEVEL_2M; 5450 } 5451 EXPORT_SYMBOL_GPL(kvm_configure_mmu); 5452 5453 /* The return value indicates if tlb flush on all vcpus is needed. */ 5454 typedef bool (*slot_level_handler) (struct kvm *kvm, 5455 struct kvm_rmap_head *rmap_head, 5456 const struct kvm_memory_slot *slot); 5457 5458 /* The caller should hold mmu-lock before calling this function. 
*/ 5459 static __always_inline bool 5460 slot_handle_level_range(struct kvm *kvm, const struct kvm_memory_slot *memslot, 5461 slot_level_handler fn, int start_level, int end_level, 5462 gfn_t start_gfn, gfn_t end_gfn, bool flush_on_yield, 5463 bool flush) 5464 { 5465 struct slot_rmap_walk_iterator iterator; 5466 5467 for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn, 5468 end_gfn, &iterator) { 5469 if (iterator.rmap) 5470 flush |= fn(kvm, iterator.rmap, memslot); 5471 5472 if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { 5473 if (flush && flush_on_yield) { 5474 kvm_flush_remote_tlbs_with_address(kvm, 5475 start_gfn, 5476 iterator.gfn - start_gfn + 1); 5477 flush = false; 5478 } 5479 cond_resched_rwlock_write(&kvm->mmu_lock); 5480 } 5481 } 5482 5483 return flush; 5484 } 5485 5486 static __always_inline bool 5487 slot_handle_level(struct kvm *kvm, const struct kvm_memory_slot *memslot, 5488 slot_level_handler fn, int start_level, int end_level, 5489 bool flush_on_yield) 5490 { 5491 return slot_handle_level_range(kvm, memslot, fn, start_level, 5492 end_level, memslot->base_gfn, 5493 memslot->base_gfn + memslot->npages - 1, 5494 flush_on_yield, false); 5495 } 5496 5497 static __always_inline bool 5498 slot_handle_level_4k(struct kvm *kvm, const struct kvm_memory_slot *memslot, 5499 slot_level_handler fn, bool flush_on_yield) 5500 { 5501 return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K, 5502 PG_LEVEL_4K, flush_on_yield); 5503 } 5504 5505 static void free_mmu_pages(struct kvm_mmu *mmu) 5506 { 5507 if (!tdp_enabled && mmu->pae_root) 5508 set_memory_encrypted((unsigned long)mmu->pae_root, 1); 5509 free_page((unsigned long)mmu->pae_root); 5510 free_page((unsigned long)mmu->pml4_root); 5511 free_page((unsigned long)mmu->pml5_root); 5512 } 5513 5514 static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) 5515 { 5516 struct page *page; 5517 int i; 5518 5519 mmu->root_hpa = INVALID_PAGE; 5520 mmu->root_pgd = 0; 5521 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) 5522 mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; 5523 5524 /* vcpu->arch.guest_mmu isn't used when !tdp_enabled. */ 5525 if (!tdp_enabled && mmu == &vcpu->arch.guest_mmu) 5526 return 0; 5527 5528 /* 5529 * When using PAE paging, the four PDPTEs are treated as 'root' pages, 5530 * while the PDP table is a per-vCPU construct that's allocated at MMU 5531 * creation. When emulating 32-bit mode, cr3 is only 32 bits even on 5532 * x86_64. Therefore we need to allocate the PDP table in the first 5533 * 4GB of memory, which happens to fit the DMA32 zone. TDP paging 5534 * generally doesn't use PAE paging and can skip allocating the PDP 5535 * table. The main exception, handled here, is SVM's 32-bit NPT. The 5536 * other exception is for shadowing L1's 32-bit or PAE NPT on 64-bit 5537 * KVM; that horror is handled on-demand by mmu_alloc_special_roots(). 5538 */ 5539 if (tdp_enabled && kvm_mmu_get_tdp_level(vcpu) > PT32E_ROOT_LEVEL) 5540 return 0; 5541 5542 page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_DMA32); 5543 if (!page) 5544 return -ENOMEM; 5545 5546 mmu->pae_root = page_address(page); 5547 5548 /* 5549 * CR3 is only 32 bits when PAE paging is used, thus it's impossible to 5550 * get the CPU to treat the PDPTEs as encrypted. Decrypt the page so 5551 * that KVM's writes and the CPU's reads get along. 
Note, this is 5552 * only necessary when using shadow paging, as 64-bit NPT can get at 5553 * the C-bit even when shadowing 32-bit NPT, and SME isn't supported 5554 * by 32-bit kernels (when KVM itself uses 32-bit NPT). 5555 */ 5556 if (!tdp_enabled) 5557 set_memory_decrypted((unsigned long)mmu->pae_root, 1); 5558 else 5559 WARN_ON_ONCE(shadow_me_mask); 5560 5561 for (i = 0; i < 4; ++i) 5562 mmu->pae_root[i] = INVALID_PAE_ROOT; 5563 5564 return 0; 5565 } 5566 5567 int kvm_mmu_create(struct kvm_vcpu *vcpu) 5568 { 5569 int ret; 5570 5571 vcpu->arch.mmu_pte_list_desc_cache.kmem_cache = pte_list_desc_cache; 5572 vcpu->arch.mmu_pte_list_desc_cache.gfp_zero = __GFP_ZERO; 5573 5574 vcpu->arch.mmu_page_header_cache.kmem_cache = mmu_page_header_cache; 5575 vcpu->arch.mmu_page_header_cache.gfp_zero = __GFP_ZERO; 5576 5577 vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO; 5578 5579 vcpu->arch.mmu = &vcpu->arch.root_mmu; 5580 vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; 5581 5582 ret = __kvm_mmu_create(vcpu, &vcpu->arch.guest_mmu); 5583 if (ret) 5584 return ret; 5585 5586 ret = __kvm_mmu_create(vcpu, &vcpu->arch.root_mmu); 5587 if (ret) 5588 goto fail_allocate_root; 5589 5590 return ret; 5591 fail_allocate_root: 5592 free_mmu_pages(&vcpu->arch.guest_mmu); 5593 return ret; 5594 } 5595 5596 #define BATCH_ZAP_PAGES 10 5597 static void kvm_zap_obsolete_pages(struct kvm *kvm) 5598 { 5599 struct kvm_mmu_page *sp, *node; 5600 int nr_zapped, batch = 0; 5601 5602 restart: 5603 list_for_each_entry_safe_reverse(sp, node, 5604 &kvm->arch.active_mmu_pages, link) { 5605 /* 5606 * No obsolete valid page exists before a newly created page 5607 * since active_mmu_pages is a FIFO list. 5608 */ 5609 if (!is_obsolete_sp(kvm, sp)) 5610 break; 5611 5612 /* 5613 * Invalid pages should never land back on the list of active 5614 * pages. Skip the bogus page, otherwise we'll get stuck in an 5615 * infinite loop if the page gets put back on the list (again). 5616 */ 5617 if (WARN_ON(sp->role.invalid)) 5618 continue; 5619 5620 /* 5621 * No need to flush the TLB since we're only zapping shadow 5622 * pages with an obsolete generation number and all vCPUS have 5623 * loaded a new root, i.e. the shadow pages being zapped cannot 5624 * be in active use by the guest. 5625 */ 5626 if (batch >= BATCH_ZAP_PAGES && 5627 cond_resched_rwlock_write(&kvm->mmu_lock)) { 5628 batch = 0; 5629 goto restart; 5630 } 5631 5632 if (__kvm_mmu_prepare_zap_page(kvm, sp, 5633 &kvm->arch.zapped_obsolete_pages, &nr_zapped)) { 5634 batch += nr_zapped; 5635 goto restart; 5636 } 5637 } 5638 5639 /* 5640 * Trigger a remote TLB flush before freeing the page tables to ensure 5641 * KVM is not in the middle of a lockless shadow page table walk, which 5642 * may reference the pages. 5643 */ 5644 kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages); 5645 } 5646 5647 /* 5648 * Fast invalidate all shadow pages and use lock-break technique 5649 * to zap obsolete pages. 5650 * 5651 * It's required when memslot is being deleted or VM is being 5652 * destroyed, in these cases, we should ensure that KVM MMU does 5653 * not use any resource of the being-deleted slot or all slots 5654 * after calling the function. 5655 */ 5656 static void kvm_mmu_zap_all_fast(struct kvm *kvm) 5657 { 5658 lockdep_assert_held(&kvm->slots_lock); 5659 5660 write_lock(&kvm->mmu_lock); 5661 trace_kvm_mmu_zap_all_fast(kvm); 5662 5663 /* 5664 * Toggle mmu_valid_gen between '0' and '1'. 
Because slots_lock is 5665 * held for the entire duration of zapping obsolete pages, it's 5666 * impossible for there to be multiple invalid generations associated 5667 * with *valid* shadow pages at any given time, i.e. there is exactly 5668 * one valid generation and (at most) one invalid generation. 5669 */ 5670 kvm->arch.mmu_valid_gen = kvm->arch.mmu_valid_gen ? 0 : 1; 5671 5672 /* In order to ensure all threads see this change when 5673 * handling the MMU reload signal, this must happen in the 5674 * same critical section as kvm_reload_remote_mmus, and 5675 * before kvm_zap_obsolete_pages as kvm_zap_obsolete_pages 5676 * could drop the MMU lock and yield. 5677 */ 5678 if (is_tdp_mmu_enabled(kvm)) 5679 kvm_tdp_mmu_invalidate_all_roots(kvm); 5680 5681 /* 5682 * Notify all vcpus to reload its shadow page table and flush TLB. 5683 * Then all vcpus will switch to new shadow page table with the new 5684 * mmu_valid_gen. 5685 * 5686 * Note: we need to do this under the protection of mmu_lock, 5687 * otherwise, vcpu would purge shadow page but miss tlb flush. 5688 */ 5689 kvm_reload_remote_mmus(kvm); 5690 5691 kvm_zap_obsolete_pages(kvm); 5692 5693 write_unlock(&kvm->mmu_lock); 5694 5695 if (is_tdp_mmu_enabled(kvm)) { 5696 read_lock(&kvm->mmu_lock); 5697 kvm_tdp_mmu_zap_invalidated_roots(kvm); 5698 read_unlock(&kvm->mmu_lock); 5699 } 5700 } 5701 5702 static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm) 5703 { 5704 return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages)); 5705 } 5706 5707 static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm, 5708 struct kvm_memory_slot *slot, 5709 struct kvm_page_track_notifier_node *node) 5710 { 5711 kvm_mmu_zap_all_fast(kvm); 5712 } 5713 5714 void kvm_mmu_init_vm(struct kvm *kvm) 5715 { 5716 struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker; 5717 5718 spin_lock_init(&kvm->arch.mmu_unsync_pages_lock); 5719 5720 kvm_mmu_init_tdp_mmu(kvm); 5721 5722 node->track_write = kvm_mmu_pte_write; 5723 node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot; 5724 kvm_page_track_register_notifier(kvm, node); 5725 } 5726 5727 void kvm_mmu_uninit_vm(struct kvm *kvm) 5728 { 5729 struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker; 5730 5731 kvm_page_track_unregister_notifier(kvm, node); 5732 5733 kvm_mmu_uninit_tdp_mmu(kvm); 5734 } 5735 5736 static bool __kvm_zap_rmaps(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) 5737 { 5738 const struct kvm_memory_slot *memslot; 5739 struct kvm_memslots *slots; 5740 struct kvm_memslot_iter iter; 5741 bool flush = false; 5742 gfn_t start, end; 5743 int i; 5744 5745 if (!kvm_memslots_have_rmaps(kvm)) 5746 return flush; 5747 5748 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { 5749 slots = __kvm_memslots(kvm, i); 5750 5751 kvm_for_each_memslot_in_gfn_range(&iter, slots, gfn_start, gfn_end) { 5752 memslot = iter.slot; 5753 start = max(gfn_start, memslot->base_gfn); 5754 end = min(gfn_end, memslot->base_gfn + memslot->npages); 5755 if (WARN_ON_ONCE(start >= end)) 5756 continue; 5757 5758 flush = slot_handle_level_range(kvm, memslot, kvm_zap_rmapp, 5759 5760 PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, 5761 start, end - 1, true, flush); 5762 } 5763 } 5764 5765 return flush; 5766 } 5767 5768 /* 5769 * Invalidate (zap) SPTEs that cover GFNs from gfn_start and up to gfn_end 5770 * (not including it) 5771 */ 5772 void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) 5773 { 5774 bool flush; 5775 int i; 5776 5777 if (WARN_ON_ONCE(gfn_end <= gfn_start)) 5778 
return; 5779 5780 write_lock(&kvm->mmu_lock); 5781 5782 kvm_inc_notifier_count(kvm, gfn_start, gfn_end); 5783 5784 flush = __kvm_zap_rmaps(kvm, gfn_start, gfn_end); 5785 5786 if (is_tdp_mmu_enabled(kvm)) { 5787 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) 5788 flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, gfn_start, 5789 gfn_end, flush); 5790 } 5791 5792 if (flush) 5793 kvm_flush_remote_tlbs_with_address(kvm, gfn_start, 5794 gfn_end - gfn_start); 5795 5796 kvm_dec_notifier_count(kvm, gfn_start, gfn_end); 5797 5798 write_unlock(&kvm->mmu_lock); 5799 } 5800 5801 static bool slot_rmap_write_protect(struct kvm *kvm, 5802 struct kvm_rmap_head *rmap_head, 5803 const struct kvm_memory_slot *slot) 5804 { 5805 return __rmap_write_protect(kvm, rmap_head, false); 5806 } 5807 5808 void kvm_mmu_slot_remove_write_access(struct kvm *kvm, 5809 const struct kvm_memory_slot *memslot, 5810 int start_level) 5811 { 5812 bool flush = false; 5813 5814 if (kvm_memslots_have_rmaps(kvm)) { 5815 write_lock(&kvm->mmu_lock); 5816 flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect, 5817 start_level, KVM_MAX_HUGEPAGE_LEVEL, 5818 false); 5819 write_unlock(&kvm->mmu_lock); 5820 } 5821 5822 if (is_tdp_mmu_enabled(kvm)) { 5823 read_lock(&kvm->mmu_lock); 5824 flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, start_level); 5825 read_unlock(&kvm->mmu_lock); 5826 } 5827 5828 /* 5829 * Flush TLBs if any SPTEs had to be write-protected to ensure that 5830 * guest writes are reflected in the dirty bitmap before the memslot 5831 * update completes, i.e. before enabling dirty logging is visible to 5832 * userspace. 5833 * 5834 * Perform the TLB flush outside the mmu_lock to reduce the amount of 5835 * time the lock is held. However, this does mean that another CPU can 5836 * now grab mmu_lock and encounter a write-protected SPTE while CPUs 5837 * still have a writable mapping for the associated GFN in their TLB. 5838 * 5839 * This is safe but requires KVM to be careful when making decisions 5840 * based on the write-protection status of an SPTE. Specifically, KVM 5841 * also write-protects SPTEs to monitor changes to guest page tables 5842 * during shadow paging, and must guarantee no CPUs can write to those 5843 * pages before the lock is dropped. As mentioned in the previous 5844 * paragraph, a write-protected SPTE is no guarantee that CPUs cannot 5845 * perform writes. So to determine if a TLB flush is truly required, KVM 5846 * will clear a separate software-only bit (MMU-writable) and skip the 5847 * flush if-and-only-if this bit was already clear. 5848 * 5849 * See DEFAULT_SPTE_MMU_WRITEABLE for more details. 5850 */ 5851 if (flush) 5852 kvm_arch_flush_remote_tlbs_memslot(kvm, memslot); 5853 } 5854 5855 static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, 5856 struct kvm_rmap_head *rmap_head, 5857 const struct kvm_memory_slot *slot) 5858 { 5859 u64 *sptep; 5860 struct rmap_iterator iter; 5861 int need_tlb_flush = 0; 5862 kvm_pfn_t pfn; 5863 struct kvm_mmu_page *sp; 5864 5865 restart: 5866 for_each_rmap_spte(rmap_head, &iter, sptep) { 5867 sp = sptep_to_sp(sptep); 5868 pfn = spte_to_pfn(*sptep); 5869 5870 /* 5871 * Huge page mappings cannot be used for indirect shadow pages, 5872 * which are found on the last rmap (level = 1) when not using 5873 * tdp; such shadow pages are kept in sync with the guest page 5874 * table, and the guest page table uses 4K mappings if the 5875 * indirect sp has level = 1.
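* In other words, only SPTEs in direct shadow pages, backed by a non-reserved pfn and mapped at a level below what kvm_mmu_max_mapping_level() allows, are zapped here so that they can be recreated as huge mappings on a later fault.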
5876 */ 5877 if (sp->role.direct && !kvm_is_reserved_pfn(pfn) && 5878 sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn, 5879 pfn, PG_LEVEL_NUM)) { 5880 pte_list_remove(kvm, rmap_head, sptep); 5881 5882 if (kvm_available_flush_tlb_with_range()) 5883 kvm_flush_remote_tlbs_with_address(kvm, sp->gfn, 5884 KVM_PAGES_PER_HPAGE(sp->role.level)); 5885 else 5886 need_tlb_flush = 1; 5887 5888 goto restart; 5889 } 5890 } 5891 5892 return need_tlb_flush; 5893 } 5894 5895 void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, 5896 const struct kvm_memory_slot *slot) 5897 { 5898 if (kvm_memslots_have_rmaps(kvm)) { 5899 write_lock(&kvm->mmu_lock); 5900 /* 5901 * Zap only 4k SPTEs since the legacy MMU only supports dirty 5902 * logging at a 4k granularity and never creates collapsible 5903 * 2m SPTEs during dirty logging. 5904 */ 5905 if (slot_handle_level_4k(kvm, slot, kvm_mmu_zap_collapsible_spte, true)) 5906 kvm_arch_flush_remote_tlbs_memslot(kvm, slot); 5907 write_unlock(&kvm->mmu_lock); 5908 } 5909 5910 if (is_tdp_mmu_enabled(kvm)) { 5911 read_lock(&kvm->mmu_lock); 5912 kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot); 5913 read_unlock(&kvm->mmu_lock); 5914 } 5915 } 5916 5917 void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm, 5918 const struct kvm_memory_slot *memslot) 5919 { 5920 /* 5921 * All current use cases for flushing the TLBs for a specific memslot 5922 * are related to dirty logging, and many do the TLB flush out of mmu_lock. 5923 * The interactions between the various operations on the memslot must be 5924 * serialized by slots_lock to ensure the TLB flush from one operation 5925 * is observed by any other operation on the same memslot. 5926 */ 5927 lockdep_assert_held(&kvm->slots_lock); 5928 kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn, 5929 memslot->npages); 5930 } 5931 5932 void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, 5933 const struct kvm_memory_slot *memslot) 5934 { 5935 bool flush = false; 5936 5937 if (kvm_memslots_have_rmaps(kvm)) { 5938 write_lock(&kvm->mmu_lock); 5939 /* 5940 * Clear dirty bits only on 4k SPTEs since the legacy MMU only 5941 * supports dirty logging at a 4k granularity. 5942 */ 5943 flush = slot_handle_level_4k(kvm, memslot, __rmap_clear_dirty, false); 5944 write_unlock(&kvm->mmu_lock); 5945 } 5946 5947 if (is_tdp_mmu_enabled(kvm)) { 5948 read_lock(&kvm->mmu_lock); 5949 flush |= kvm_tdp_mmu_clear_dirty_slot(kvm, memslot); 5950 read_unlock(&kvm->mmu_lock); 5951 } 5952 5953 /* 5954 * It's also safe to flush TLBs out of mmu_lock here as currently this 5955 * function is only used for dirty logging, in which case flushing the TLB 5956 * out of mmu_lock also guarantees no dirty pages will be lost in the 5957 * dirty_bitmap.
5958 */ 5959 if (flush) 5960 kvm_arch_flush_remote_tlbs_memslot(kvm, memslot); 5961 } 5962 5963 void kvm_mmu_zap_all(struct kvm *kvm) 5964 { 5965 struct kvm_mmu_page *sp, *node; 5966 LIST_HEAD(invalid_list); 5967 int ign; 5968 5969 write_lock(&kvm->mmu_lock); 5970 restart: 5971 list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) { 5972 if (WARN_ON(sp->role.invalid)) 5973 continue; 5974 if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign)) 5975 goto restart; 5976 if (cond_resched_rwlock_write(&kvm->mmu_lock)) 5977 goto restart; 5978 } 5979 5980 kvm_mmu_commit_zap_page(kvm, &invalid_list); 5981 5982 if (is_tdp_mmu_enabled(kvm)) 5983 kvm_tdp_mmu_zap_all(kvm); 5984 5985 write_unlock(&kvm->mmu_lock); 5986 } 5987 5988 void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen) 5989 { 5990 WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS); 5991 5992 gen &= MMIO_SPTE_GEN_MASK; 5993 5994 /* 5995 * Generation numbers are incremented in multiples of the number of 5996 * address spaces in order to provide unique generations across all 5997 * address spaces. Strip what is effectively the address space 5998 * modifier prior to checking for a wrap of the MMIO generation so 5999 * that a wrap in any address space is detected. 6000 */ 6001 gen &= ~((u64)KVM_ADDRESS_SPACE_NUM - 1); 6002 6003 /* 6004 * The very rare case: if the MMIO generation number has wrapped, 6005 * zap all shadow pages. 6006 */ 6007 if (unlikely(gen == 0)) { 6008 kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n"); 6009 kvm_mmu_zap_all_fast(kvm); 6010 } 6011 } 6012 6013 static unsigned long 6014 mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) 6015 { 6016 struct kvm *kvm; 6017 int nr_to_scan = sc->nr_to_scan; 6018 unsigned long freed = 0; 6019 6020 mutex_lock(&kvm_lock); 6021 6022 list_for_each_entry(kvm, &vm_list, vm_list) { 6023 int idx; 6024 LIST_HEAD(invalid_list); 6025 6026 /* 6027 * Never scan more than sc->nr_to_scan VM instances. 6028 * In practice this condition is never hit, since we do not try 6029 * to shrink more than one VM and it is very unlikely to see 6030 * !n_used_mmu_pages so many times. 6031 */ 6032 if (!nr_to_scan--) 6033 break; 6034 /* 6035 * n_used_mmu_pages is accessed without holding kvm->mmu_lock 6036 * here. We may skip a VM instance erroneously, but we do not 6037 * want to shrink a VM that has only started to populate its MMU 6038 * anyway.
6039 */ 6040 if (!kvm->arch.n_used_mmu_pages && 6041 !kvm_has_zapped_obsolete_pages(kvm)) 6042 continue; 6043 6044 idx = srcu_read_lock(&kvm->srcu); 6045 write_lock(&kvm->mmu_lock); 6046 6047 if (kvm_has_zapped_obsolete_pages(kvm)) { 6048 kvm_mmu_commit_zap_page(kvm, 6049 &kvm->arch.zapped_obsolete_pages); 6050 goto unlock; 6051 } 6052 6053 freed = kvm_mmu_zap_oldest_mmu_pages(kvm, sc->nr_to_scan); 6054 6055 unlock: 6056 write_unlock(&kvm->mmu_lock); 6057 srcu_read_unlock(&kvm->srcu, idx); 6058 6059 /* 6060 * unfair on small ones 6061 * per-vm shrinkers cry out 6062 * sadness comes quickly 6063 */ 6064 list_move_tail(&kvm->vm_list, &vm_list); 6065 break; 6066 } 6067 6068 mutex_unlock(&kvm_lock); 6069 return freed; 6070 } 6071 6072 static unsigned long 6073 mmu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) 6074 { 6075 return percpu_counter_read_positive(&kvm_total_used_mmu_pages); 6076 } 6077 6078 static struct shrinker mmu_shrinker = { 6079 .count_objects = mmu_shrink_count, 6080 .scan_objects = mmu_shrink_scan, 6081 .seeks = DEFAULT_SEEKS * 10, 6082 }; 6083 6084 static void mmu_destroy_caches(void) 6085 { 6086 kmem_cache_destroy(pte_list_desc_cache); 6087 kmem_cache_destroy(mmu_page_header_cache); 6088 } 6089 6090 static bool get_nx_auto_mode(void) 6091 { 6092 /* Return true when CPU has the bug, and mitigations are ON */ 6093 return boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT) && !cpu_mitigations_off(); 6094 } 6095 6096 static void __set_nx_huge_pages(bool val) 6097 { 6098 nx_huge_pages = itlb_multihit_kvm_mitigation = val; 6099 } 6100 6101 static int set_nx_huge_pages(const char *val, const struct kernel_param *kp) 6102 { 6103 bool old_val = nx_huge_pages; 6104 bool new_val; 6105 6106 /* In "auto" mode deploy workaround only if CPU has the bug. */ 6107 if (sysfs_streq(val, "off")) 6108 new_val = 0; 6109 else if (sysfs_streq(val, "force")) 6110 new_val = 1; 6111 else if (sysfs_streq(val, "auto")) 6112 new_val = get_nx_auto_mode(); 6113 else if (strtobool(val, &new_val) < 0) 6114 return -EINVAL; 6115 6116 __set_nx_huge_pages(new_val); 6117 6118 if (new_val != old_val) { 6119 struct kvm *kvm; 6120 6121 mutex_lock(&kvm_lock); 6122 6123 list_for_each_entry(kvm, &vm_list, vm_list) { 6124 mutex_lock(&kvm->slots_lock); 6125 kvm_mmu_zap_all_fast(kvm); 6126 mutex_unlock(&kvm->slots_lock); 6127 6128 wake_up_process(kvm->arch.nx_lpage_recovery_thread); 6129 } 6130 mutex_unlock(&kvm_lock); 6131 } 6132 6133 return 0; 6134 } 6135 6136 int kvm_mmu_module_init(void) 6137 { 6138 int ret = -ENOMEM; 6139 6140 if (nx_huge_pages == -1) 6141 __set_nx_huge_pages(get_nx_auto_mode()); 6142 6143 /* 6144 * MMU roles use union aliasing which is, generally speaking, an 6145 * undefined behavior. However, we supposedly know how compilers behave 6146 * and the current status quo is unlikely to change. Guardians below are 6147 * supposed to let us know if the assumption becomes false. 
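* (The BUILD_BUG_ON()s below are those guardians: the build fails if kvm_mmu_page_role or kvm_mmu_extended_role stops being exactly 32 bits, or if kvm_mmu_role stops being 64 bits.)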
6148 */ 6149 BUILD_BUG_ON(sizeof(union kvm_mmu_page_role) != sizeof(u32)); 6150 BUILD_BUG_ON(sizeof(union kvm_mmu_extended_role) != sizeof(u32)); 6151 BUILD_BUG_ON(sizeof(union kvm_mmu_role) != sizeof(u64)); 6152 6153 kvm_mmu_reset_all_pte_masks(); 6154 6155 pte_list_desc_cache = kmem_cache_create("pte_list_desc", 6156 sizeof(struct pte_list_desc), 6157 0, SLAB_ACCOUNT, NULL); 6158 if (!pte_list_desc_cache) 6159 goto out; 6160 6161 mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header", 6162 sizeof(struct kvm_mmu_page), 6163 0, SLAB_ACCOUNT, NULL); 6164 if (!mmu_page_header_cache) 6165 goto out; 6166 6167 if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL)) 6168 goto out; 6169 6170 ret = register_shrinker(&mmu_shrinker); 6171 if (ret) 6172 goto out; 6173 6174 return 0; 6175 6176 out: 6177 mmu_destroy_caches(); 6178 return ret; 6179 } 6180 6181 void kvm_mmu_destroy(struct kvm_vcpu *vcpu) 6182 { 6183 kvm_mmu_unload(vcpu); 6184 free_mmu_pages(&vcpu->arch.root_mmu); 6185 free_mmu_pages(&vcpu->arch.guest_mmu); 6186 mmu_free_memory_caches(vcpu); 6187 } 6188 6189 void kvm_mmu_module_exit(void) 6190 { 6191 mmu_destroy_caches(); 6192 percpu_counter_destroy(&kvm_total_used_mmu_pages); 6193 unregister_shrinker(&mmu_shrinker); 6194 mmu_audit_disable(); 6195 } 6196 6197 /* 6198 * Calculate the effective recovery period, accounting for '0' meaning "let KVM 6199 * select a halving time of 1 hour". Returns true if recovery is enabled. 6200 */ 6201 static bool calc_nx_huge_pages_recovery_period(uint *period) 6202 { 6203 /* 6204 * Use READ_ONCE to get the params, this may be called outside of the 6205 * param setters, e.g. by the kthread to compute its next timeout. 6206 */ 6207 bool enabled = READ_ONCE(nx_huge_pages); 6208 uint ratio = READ_ONCE(nx_huge_pages_recovery_ratio); 6209 6210 if (!enabled || !ratio) 6211 return false; 6212 6213 *period = READ_ONCE(nx_huge_pages_recovery_period_ms); 6214 if (!*period) { 6215 /* Make sure the period is not less than one second. */ 6216 ratio = min(ratio, 3600u); 6217 *period = 60 * 60 * 1000 / ratio; 6218 } 6219 return true; 6220 } 6221 6222 static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp) 6223 { 6224 bool was_recovery_enabled, is_recovery_enabled; 6225 uint old_period, new_period; 6226 int err; 6227 6228 was_recovery_enabled = calc_nx_huge_pages_recovery_period(&old_period); 6229 6230 err = param_set_uint(val, kp); 6231 if (err) 6232 return err; 6233 6234 is_recovery_enabled = calc_nx_huge_pages_recovery_period(&new_period); 6235 6236 if (is_recovery_enabled && 6237 (!was_recovery_enabled || old_period > new_period)) { 6238 struct kvm *kvm; 6239 6240 mutex_lock(&kvm_lock); 6241 6242 list_for_each_entry(kvm, &vm_list, vm_list) 6243 wake_up_process(kvm->arch.nx_lpage_recovery_thread); 6244 6245 mutex_unlock(&kvm_lock); 6246 } 6247 6248 return err; 6249 } 6250 6251 static void kvm_recover_nx_lpages(struct kvm *kvm) 6252 { 6253 unsigned long nx_lpage_splits = kvm->stat.nx_lpage_splits; 6254 int rcu_idx; 6255 struct kvm_mmu_page *sp; 6256 unsigned int ratio; 6257 LIST_HEAD(invalid_list); 6258 bool flush = false; 6259 ulong to_zap; 6260 6261 rcu_idx = srcu_read_lock(&kvm->srcu); 6262 write_lock(&kvm->mmu_lock); 6263 6264 ratio = READ_ONCE(nx_huge_pages_recovery_ratio); 6265 to_zap = ratio ? 
DIV_ROUND_UP(nx_lpage_splits, ratio) : 0; 6266 for ( ; to_zap; --to_zap) { 6267 if (list_empty(&kvm->arch.lpage_disallowed_mmu_pages)) 6268 break; 6269 6270 /* 6271 * We use a separate list instead of just using active_mmu_pages 6272 * because the number of lpage_disallowed pages is expected to 6273 * be relatively small compared to the total. 6274 */ 6275 sp = list_first_entry(&kvm->arch.lpage_disallowed_mmu_pages, 6276 struct kvm_mmu_page, 6277 lpage_disallowed_link); 6278 WARN_ON_ONCE(!sp->lpage_disallowed); 6279 if (is_tdp_mmu_page(sp)) { 6280 flush |= kvm_tdp_mmu_zap_sp(kvm, sp); 6281 } else { 6282 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); 6283 WARN_ON_ONCE(sp->lpage_disallowed); 6284 } 6285 6286 if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { 6287 kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush); 6288 cond_resched_rwlock_write(&kvm->mmu_lock); 6289 flush = false; 6290 } 6291 } 6292 kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush); 6293 6294 write_unlock(&kvm->mmu_lock); 6295 srcu_read_unlock(&kvm->srcu, rcu_idx); 6296 } 6297 6298 static long get_nx_lpage_recovery_timeout(u64 start_time) 6299 { 6300 bool enabled; 6301 uint period; 6302 6303 enabled = calc_nx_huge_pages_recovery_period(&period); 6304 6305 return enabled ? start_time + msecs_to_jiffies(period) - get_jiffies_64() 6306 : MAX_SCHEDULE_TIMEOUT; 6307 } 6308 6309 static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data) 6310 { 6311 u64 start_time; 6312 long remaining_time; 6313 6314 while (true) { 6315 start_time = get_jiffies_64(); 6316 remaining_time = get_nx_lpage_recovery_timeout(start_time); 6317 6318 set_current_state(TASK_INTERRUPTIBLE); 6319 while (!kthread_should_stop() && remaining_time > 0) { 6320 schedule_timeout(remaining_time); 6321 remaining_time = get_nx_lpage_recovery_timeout(start_time); 6322 set_current_state(TASK_INTERRUPTIBLE); 6323 } 6324 6325 set_current_state(TASK_RUNNING); 6326 6327 if (kthread_should_stop()) 6328 return 0; 6329 6330 kvm_recover_nx_lpages(kvm); 6331 } 6332 } 6333 6334 int kvm_mmu_post_init_vm(struct kvm *kvm) 6335 { 6336 int err; 6337 6338 err = kvm_vm_create_worker_thread(kvm, kvm_nx_lpage_recovery_worker, 0, 6339 "kvm-nx-lpage-recovery", 6340 &kvm->arch.nx_lpage_recovery_thread); 6341 if (!err) 6342 kthread_unpark(kvm->arch.nx_lpage_recovery_thread); 6343 6344 return err; 6345 } 6346 6347 void kvm_mmu_pre_destroy_vm(struct kvm *kvm) 6348 { 6349 if (kvm->arch.nx_lpage_recovery_thread) 6350 kthread_stop(kvm->arch.nx_lpage_recovery_thread); 6351 } 6352
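/*
 * Illustrative note on the recovery parameters used above: with the default
 * nx_huge_pages_recovery_ratio of 60 and nx_huge_pages_recovery_period_ms
 * left at 0, calc_nx_huge_pages_recovery_period() computes a period of
 * 60 * 60 * 1000 / 60 = 60000 ms, so kvm_nx_lpage_recovery_worker() wakes up
 * roughly once per minute and each kvm_recover_nx_lpages() pass zaps
 * DIV_ROUND_UP(nx_lpage_splits, 60), i.e. about 1/60th of the huge pages
 * that were split because of the NX workaround.
 */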