// SPDX-License-Identifier: GPL-2.0
/*
 * Prefix every pr_*() message emitted from this file with the module name.
 * Defined ahead of the #includes so that it is already in place when the
 * printk helpers are pulled in.
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include "mmu.h"
#include "mmu_internal.h"
#include "mmutrace.h"
#include "tdp_iter.h"
#include "tdp_mmu.h"
#include "spte.h"

#include <asm/cmpxchg.h>
#include <trace/events/kvm.h>

/*
 * Initializes the TDP MMU state of the VM: the (empty) list of TDP MMU roots
 * and the tdp_mmu_pages_lock spinlock.
 */
void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
{
	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
}

/*
 * Asserts (via lockdep) that mmu_lock is held for read if @shared is true,
 * and for write otherwise.
 *
 * Arbitrarily returns true so that this may be used in if statements.
 */
static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
							     bool shared)
{
	if (shared)
		lockdep_assert_held_read(&kvm->mmu_lock);
	else
		lockdep_assert_held_write(&kvm->mmu_lock);

	return true;
}

/*
 * Tears down the TDP MMU state of the VM: invalidates and zaps every root,
 * sanity checks that nothing is left behind, and then waits for all pending
 * RCU callbacks so that no shadow page outlives the VM.
 */
void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
{
	/*
	 * Invalidate all roots, which besides the obvious, schedules all roots
	 * for zapping and thus puts the TDP MMU's reference to each root, i.e.
	 * ultimately frees all roots.
	 */
	kvm_tdp_mmu_invalidate_roots(kvm, KVM_VALID_ROOTS);
	kvm_tdp_mmu_zap_invalidated_roots(kvm, false);

	/* After the zap, no TDP MMU pages and no roots may remain. */
#ifdef CONFIG_KVM_PROVE_MMU
	KVM_MMU_WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages));
#endif
	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));

	/*
	 * Ensure that all the outstanding RCU callbacks to free shadow pages
	 * can run before the VM is torn down.  Putting the last reference to
	 * zapped roots will create new callbacks.
	 */
	rcu_barrier();
}

/*
 * Frees the two page-table pages referenced by @sp (external_spt and spt) and
 * then returns the shadow page header itself to mmu_page_header_cache.  @sp
 * must not be touched afterwards.
 */
static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
{
	free_page((unsigned long)sp->external_spt);
	free_page((unsigned long)sp->spt);
	kmem_cache_free(mmu_page_header_cache, sp);
}

/*
 * This is called through call_rcu in order to free TDP page table memory
 * safely with respect to other kernel threads that may be operating on
 * the memory.
67 * By only accessing TDP MMU page table memory in an RCU read critical 68 * section, and freeing it after a grace period, lockless access to that 69 * memory won't use it after it is freed. 70 */ 71 static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head) 72 { 73 struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page, 74 rcu_head); 75 76 tdp_mmu_free_sp(sp); 77 } 78 79 void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root) 80 { 81 if (!refcount_dec_and_test(&root->tdp_mmu_root_count)) 82 return; 83 84 /* 85 * The TDP MMU itself holds a reference to each root until the root is 86 * explicitly invalidated, i.e. the final reference should be never be 87 * put for a valid root. 88 */ 89 KVM_BUG_ON(!is_tdp_mmu_page(root) || !root->role.invalid, kvm); 90 91 spin_lock(&kvm->arch.tdp_mmu_pages_lock); 92 list_del_rcu(&root->link); 93 spin_unlock(&kvm->arch.tdp_mmu_pages_lock); 94 call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback); 95 } 96 97 static bool tdp_mmu_root_match(struct kvm_mmu_page *root, 98 enum kvm_tdp_mmu_root_types types) 99 { 100 if (WARN_ON_ONCE(!(types & KVM_VALID_ROOTS))) 101 return false; 102 103 if (root->role.invalid && !(types & KVM_INVALID_ROOTS)) 104 return false; 105 106 if (likely(!is_mirror_sp(root))) 107 return types & KVM_DIRECT_ROOTS; 108 return types & KVM_MIRROR_ROOTS; 109 } 110 111 /* 112 * Returns the next root after @prev_root (or the first root if @prev_root is 113 * NULL) that matches with @types. A reference to the returned root is 114 * acquired, and the reference to @prev_root is released (the caller obviously 115 * must hold a reference to @prev_root if it's non-NULL). 116 * 117 * Roots that doesn't match with @types are skipped. 118 * 119 * Returns NULL if the end of tdp_mmu_roots was reached. 
120 */ 121 static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, 122 struct kvm_mmu_page *prev_root, 123 enum kvm_tdp_mmu_root_types types) 124 { 125 struct kvm_mmu_page *next_root; 126 127 /* 128 * While the roots themselves are RCU-protected, fields such as 129 * role.invalid are protected by mmu_lock. 130 */ 131 lockdep_assert_held(&kvm->mmu_lock); 132 133 rcu_read_lock(); 134 135 if (prev_root) 136 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots, 137 &prev_root->link, 138 typeof(*prev_root), link); 139 else 140 next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots, 141 typeof(*next_root), link); 142 143 while (next_root) { 144 if (tdp_mmu_root_match(next_root, types) && 145 kvm_tdp_mmu_get_root(next_root)) 146 break; 147 148 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots, 149 &next_root->link, typeof(*next_root), link); 150 } 151 152 rcu_read_unlock(); 153 154 if (prev_root) 155 kvm_tdp_mmu_put_root(kvm, prev_root); 156 157 return next_root; 158 } 159 160 /* 161 * Note: this iterator gets and puts references to the roots it iterates over. 162 * This makes it safe to release the MMU lock and yield within the loop, but 163 * if exiting the loop early, the caller must drop the reference to the most 164 * recent root. (Unless keeping a live reference is desirable.) 165 * 166 * If shared is set, this function is operating under the MMU lock in read 167 * mode. 
168 */ 169 #define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _types) \ 170 for (_root = tdp_mmu_next_root(_kvm, NULL, _types); \ 171 ({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root; \ 172 _root = tdp_mmu_next_root(_kvm, _root, _types)) \ 173 if (_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) { \ 174 } else 175 176 #define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id) \ 177 __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, KVM_VALID_ROOTS) 178 179 #define for_each_tdp_mmu_root_yield_safe(_kvm, _root) \ 180 for (_root = tdp_mmu_next_root(_kvm, NULL, KVM_ALL_ROOTS); \ 181 ({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root; \ 182 _root = tdp_mmu_next_root(_kvm, _root, KVM_ALL_ROOTS)) 183 184 /* 185 * Iterate over all TDP MMU roots. Requires that mmu_lock be held for write, 186 * the implication being that any flow that holds mmu_lock for read is 187 * inherently yield-friendly and should use the yield-safe variant above. 188 * Holding mmu_lock for write obviates the need for RCU protection as the list 189 * is guaranteed to be stable. 190 */ 191 #define __for_each_tdp_mmu_root(_kvm, _root, _as_id, _types) \ 192 list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) \ 193 if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) && \ 194 ((_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) || \ 195 !tdp_mmu_root_match((_root), (_types)))) { \ 196 } else 197 198 /* 199 * Iterate over all TDP MMU roots in an RCU read-side critical section. 200 * It is safe to iterate over the SPTEs under the root, but their values will 201 * be unstable, so all writes must be atomic. As this routine is meant to be 202 * used without holding the mmu_lock at all, any bits that are flipped must 203 * be reflected in kvm_tdp_mmu_spte_need_atomic_write(). 
*/
#define for_each_tdp_mmu_root_rcu(_kvm, _root, _as_id, _types)			\
	list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link)		\
		if ((_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) ||	\
		    !tdp_mmu_root_match((_root), (_types))) {			\
		} else

/* Walk the valid roots of address space @_as_id, see __for_each_tdp_mmu_root(). */
#define for_each_valid_tdp_mmu_root(_kvm, _root, _as_id)		\
	__for_each_tdp_mmu_root(_kvm, _root, _as_id, KVM_VALID_ROOTS)

/*
 * Allocates a shadow page header from the vCPU's mmu_page_header_cache and
 * the page table it describes (sp->spt) from mmu_shadow_page_cache.  Neither
 * allocation is checked here; presumably the caches are topped up by the
 * caller before this is reached -- TODO confirm.
 */
static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu_page *sp;

	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);

	return sp;
}

/*
 * Initializes a freshly allocated shadow page as a TDP MMU page.
 *
 * @sp:    the shadow page to initialize, sp->spt must already be allocated
 * @sptep: pointer to the parent SPTE that will point at @sp, NULL for a root
 * @gfn:   the base GFN covered by @sp
 * @role:  the role to assign to @sp
 */
static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
			    gfn_t gfn, union kvm_mmu_page_role role)
{
	INIT_LIST_HEAD(&sp->possible_nx_huge_page_link);

	/* Stash a back-pointer to @sp in the struct page backing its table. */
	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);

	sp->role = role;
	sp->gfn = gfn;
	sp->ptep = sptep;
	sp->tdp_mmu_page = true;

	trace_kvm_mmu_get_page(sp, true);
}

/*
 * Initializes @child_sp as the child of the page table that contains the SPTE
 * @iter currently points at: the child inherits the parent's role, with the
 * level decremented by one, and covers @iter's GFN.
 */
static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
				  struct tdp_iter *iter)
{
	struct kvm_mmu_page *parent_sp;
	union kvm_mmu_page_role role;

	parent_sp = sptep_to_sp(rcu_dereference(iter->sptep));

	role = parent_sp->role;
	role.level--;

	tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
}

/*
 * Finds or creates a TDP MMU root whose role matches the vCPU's root role
 * (with is_mirror set if @mirror is true), takes a reference to it on behalf
 * of the vCPU, and records the physical address of its page table in
 * mmu->mirror_root_hpa (mirror) or mmu->root.hpa (direct).
 */
void kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu, bool mirror)
{
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	union kvm_mmu_page_role role = mmu->root_role;
	int as_id = kvm_mmu_role_as_id(role);
	struct kvm *kvm = vcpu->kvm;
	struct kvm_mmu_page *root;

	if (mirror)
		role.is_mirror = true;

	/*
	 * Check for an existing root before acquiring the pages lock to avoid
	 * unnecessary serialization if multiple vCPUs are loading a new root.
	 * E.g. when bringing up secondary vCPUs, KVM will already have created
	 * a valid root on behalf of the primary vCPU.
	 */
	read_lock(&kvm->mmu_lock);

	/*
	 * The yield-safe iterator holds a reference to the root it is
	 * currently visiting; jumping out of the loop keeps that reference,
	 * which becomes the vCPU's reference to the root.
	 */
	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, as_id) {
		if (root->role.word == role.word)
			goto out_read_unlock;
	}

	spin_lock(&kvm->arch.tdp_mmu_pages_lock);

	/*
	 * Recheck for an existing root after acquiring the pages lock, another
	 * vCPU may have raced ahead and created a new usable root.  Manually
	 * walk the list of roots as the standard macros assume that the pages
	 * lock is *not* held.  WARN if grabbing a reference to a usable root
	 * fails, as the last reference to a root can only be put *after* the
	 * root has been invalidated, which requires holding mmu_lock for write.
	 */
	list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
		if (root->role.word == role.word &&
		    !WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root)))
			goto out_spin_unlock;
	}

	root = tdp_mmu_alloc_sp(vcpu);
	tdp_mmu_init_sp(root, NULL, 0, role);

	/*
	 * TDP MMU roots are kept until they are explicitly invalidated, either
	 * by a memslot update or by the destruction of the VM.  Initialize the
	 * refcount to two; one reference for the vCPU, and one reference for
	 * the TDP MMU itself, which is held until the root is invalidated and
	 * is ultimately put by kvm_tdp_mmu_zap_invalidated_roots().
	 */
	refcount_set(&root->tdp_mmu_root_count, 2);
	list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);

out_spin_unlock:
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
out_read_unlock:
	read_unlock(&kvm->mmu_lock);
	/*
	 * Note, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS will prevent entering the guest
	 * and actually consuming the root if it's invalidated after dropping
	 * mmu_lock, and the root can't be freed as this vCPU holds a reference.
	 */
	if (mirror) {
		mmu->mirror_root_hpa = __pa(root->spt);
	} else {
		mmu->root.hpa = __pa(root->spt);
		mmu->root.pgd = 0;
	}
}

/* Forward declaration, handle_removed_pt() and handle_changed_spte() recurse. */
static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level,
				bool shared);

/*
 * Accounts one more page-table page for @sp and, if CONFIG_KVM_PROVE_MMU is
 * enabled, bumps the VM's count of TDP MMU pages.
 */
static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	kvm_account_pgtable_pages((void *)sp->spt, +1);
#ifdef CONFIG_KVM_PROVE_MMU
	atomic64_inc(&kvm->arch.tdp_mmu_pages);
#endif
}

/* Reverses tdp_account_mmu_page(). */
static void tdp_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	kvm_account_pgtable_pages((void *)sp->spt, -1);
#ifdef CONFIG_KVM_PROVE_MMU
	atomic64_dec(&kvm->arch.tdp_mmu_pages);
#endif
}

/**
 * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages
 *
 * @kvm: kvm instance
 * @sp: the page to be removed
 *
 * Unaccounts the page and, if the page is tracked as a possible NX huge page,
 * stops tracking it while holding tdp_mmu_pages_lock.
 */
static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	tdp_unaccount_mmu_page(kvm, sp);

	if (!sp->nx_huge_page_disallowed)
		return;

	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	sp->nx_huge_page_disallowed = false;
	untrack_possible_nx_huge_page(kvm, sp, KVM_TDP_MMU);
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}

/*
 * Propagates the removal of a leaf mirror SPTE to the external page table via
 * the remove_external_spte hook.  Does nothing for non-leaf SPTEs.  Requires
 * mmu_lock to be held for write, and marks the VM as bugged if the hook
 * fails.
 */
static void remove_external_spte(struct kvm *kvm, gfn_t gfn, u64 old_spte,
				 int level)
{
	kvm_pfn_t old_pfn = spte_to_pfn(old_spte);
	int ret;

	/*
	 * External (TDX) SPTEs are limited to PG_LEVEL_4K, and external
	 * PTs are removed in a special order, involving free_external_spt().
	 * But remove_external_spte() will be called on non-leaf PTEs via
	 * __tdp_mmu_zap_root(), so avoid the error the former would return
	 * in this case.
	 */
	if (!is_last_spte(old_spte, level))
		return;

	/* Zapping a leaf SPTE is allowed only when the write lock is held. */
	lockdep_assert_held_write(&kvm->mmu_lock);
	/* Because the write lock is held, the operation should succeed. */
	ret = kvm_x86_call(remove_external_spte)(kvm, gfn, level, old_pfn);
	KVM_BUG_ON(ret, kvm);
}

/**
 * handle_removed_pt() - handle a page table removed from the TDP structure
 *
 * @kvm: kvm instance
 * @pt: the page removed from the paging structure
 * @shared: This operation may not be running under the exclusive use
 *	    of the MMU lock and the operation must synchronize with other
 *	    threads that might be modifying SPTEs.
 *
 * Given a page table that has been removed from the TDP paging structure,
 * iterates through the page table to clear SPTEs and free child page tables.
 *
 * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
 * protection. Since this thread removed it from the paging structure,
 * this thread will be responsible for ensuring the page is freed. Hence the
 * early rcu_dereferences in the function.
 */
static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
{
	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
	int level = sp->role.level;
	gfn_t base_gfn = sp->gfn;
	int i;

	trace_kvm_mmu_prepare_zap_page(sp);

	tdp_mmu_unlink_sp(kvm, sp);

	for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
		tdp_ptep_t sptep = pt + i;
		gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
		u64 old_spte;

		if (shared) {
			/*
			 * Set the SPTE to a nonpresent value that other
			 * threads will not overwrite. If the SPTE was
			 * already marked as frozen then another thread
			 * handling a page fault could overwrite it, so
			 * set the SPTE until it is set from some other
			 * value to the frozen SPTE value.
			 */
			for (;;) {
				old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, FROZEN_SPTE);
				if (!is_frozen_spte(old_spte))
					break;
				cpu_relax();
			}
		} else {
			/*
			 * If the SPTE is not MMU-present, there is no backing
			 * page associated with the SPTE and so no side effects
			 * that need to be recorded, and exclusive ownership of
			 * mmu_lock ensures the SPTE can't be made present.
			 * Note, zapping MMIO SPTEs is also unnecessary as they
			 * are guarded by the memslots generation, not by being
			 * unreachable.
			 */
			old_spte = kvm_tdp_mmu_read_spte(sptep);
			if (!is_shadow_present_pte(old_spte))
				continue;

			/*
			 * Use the common helper instead of a raw WRITE_ONCE as
			 * the SPTE needs to be updated atomically if it can be
			 * modified by a different vCPU outside of mmu_lock.
			 * Even though the parent SPTE is !PRESENT, the TLB
			 * hasn't yet been flushed, and both Intel and AMD
			 * document that A/D assists can use upper-level PxE
			 * entries that are cached in the TLB, i.e. the CPU can
			 * still access the page and mark it dirty.
			 *
			 * No retry is needed in the atomic update path as the
			 * sole concern is dropping a Dirty bit, i.e. no other
			 * task can zap/remove the SPTE as mmu_lock is held for
			 * write.  Marking the SPTE as a frozen SPTE is not
			 * strictly necessary for the same reason, but using
			 * the frozen SPTE value keeps the shared/exclusive
			 * paths consistent and allows the handle_changed_spte()
			 * call below to hardcode the new value to FROZEN_SPTE.
			 *
			 * Note, even though dropping a Dirty bit is the only
			 * scenario where a non-atomic update could result in a
			 * functional bug, simply checking the Dirty bit isn't
			 * sufficient as a fast page fault could read the upper
			 * level SPTE before it is zapped, and then make this
			 * target SPTE writable, resume the guest, and set the
			 * Dirty bit between reading the SPTE above and writing
			 * it here.
			 */
			old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte,
							  FROZEN_SPTE, level);
		}
		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
				    old_spte, FROZEN_SPTE, level, shared);

		/* Mirror page tables may only be torn down under the write lock. */
		if (is_mirror_sp(sp)) {
			KVM_BUG_ON(shared, kvm);
			remove_external_spte(kvm, gfn, old_spte, level);
		}
	}

	if (is_mirror_sp(sp) &&
	    WARN_ON(kvm_x86_call(free_external_spt)(kvm, base_gfn, sp->role.level,
						    sp->external_spt))) {
		/*
		 * Failed to free page table page in mirror page table and
		 * there is nothing to do further.
		 * Intentionally leak the page to prevent the kernel from
		 * accessing the encrypted page.
		 */
		sp->external_spt = NULL;
	}

	/* Free the page table itself only after an RCU grace period. */
	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
}

/*
 * Returns the external page table of the child shadow page that @new_spte
 * points at, or NULL if @new_spte is not a shadow-present, non-leaf SPTE.
 * Warns (once) if the child's level is not @level - 1 or if its GFN is not
 * @gfn.
 */
static void *get_external_spt(gfn_t gfn, u64 new_spte, int level)
{
	if (is_shadow_present_pte(new_spte) && !is_last_spte(new_spte, level)) {
		struct kvm_mmu_page *sp = spte_to_child_sp(new_spte);

		WARN_ON_ONCE(sp->role.level + 1 != level);
		WARN_ON_ONCE(sp->gfn != gfn);
		return sp->external_spt;
	}

	return NULL;
}

/*
 * Installs a present mirror SPTE and mirrors the change into the external
 * page table.  The SPTE is frozen for the duration of the external update;
 * on success @new_spte is written, on failure @old_spte is restored.
 *
 * Returns -EBUSY if the SPTE could not be frozen because it no longer holds
 * @old_spte, otherwise the return value of the set_external_spte (leaf) or
 * link_external_spt (non-leaf) hook.
 *
 * NOTE(review): @old_spte is passed by value, so the value refreshed by a
 * failed try_cmpxchg64() is not visible to the caller -- confirm that callers
 * operating on mirror SPTEs don't rely on iter->old_spte being refreshed.
 */
static int __must_check set_external_spte_present(struct kvm *kvm, tdp_ptep_t sptep,
						   gfn_t gfn, u64 old_spte,
						   u64 new_spte, int level)
{
	bool was_present = is_shadow_present_pte(old_spte);
	bool is_present = is_shadow_present_pte(new_spte);
	bool is_leaf = is_present && is_last_spte(new_spte, level);
	kvm_pfn_t new_pfn = spte_to_pfn(new_spte);
	int ret = 0;

	KVM_BUG_ON(was_present, kvm);

	lockdep_assert_held(&kvm->mmu_lock);
	/*
	 * We need to lock out other updates to the SPTE until the external
	 * page table has been modified. Use FROZEN_SPTE similar to
	 * the zapping case.
	 */
	if (!try_cmpxchg64(rcu_dereference(sptep), &old_spte, FROZEN_SPTE))
		return -EBUSY;

	/*
	 * Use different call to either set up middle level
	 * external page table, or leaf.
	 */
	if (is_leaf) {
		ret = kvm_x86_call(set_external_spte)(kvm, gfn, level, new_pfn);
	} else {
		void *external_spt = get_external_spt(gfn, new_spte, level);

		KVM_BUG_ON(!external_spt, kvm);
		ret = kvm_x86_call(link_external_spt)(kvm, gfn, level, external_spt);
	}
	/* Unfreeze the SPTE: roll back on failure, commit on success. */
	if (ret)
		__kvm_tdp_mmu_write_spte(sptep, old_spte);
	else
		__kvm_tdp_mmu_write_spte(sptep, new_spte);
	return ret;
}

/**
 * handle_changed_spte - handle bookkeeping associated with an SPTE change
 * @kvm: kvm instance
 * @as_id: the address space of the paging structure the SPTE was a part of
 * @gfn: the base GFN that was mapped by the SPTE
 * @old_spte: The value of the SPTE before the change
 * @new_spte: The value of the SPTE after the change
 * @level: the level of the PT the SPTE is part of in the paging structure
 * @shared: This operation may not be running under the exclusive use of
 *	    the MMU lock and the operation must synchronize with other
 *	    threads that might be modifying SPTEs.
 *
 * Handle bookkeeping that might result from the modification of a SPTE.  Note,
 * dirty logging updates are handled in common code, not here (see make_spte()
 * and fast_pf_fix_direct_spte()).
*/
static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level,
				bool shared)
{
	bool was_present = is_shadow_present_pte(old_spte);
	bool is_present = is_shadow_present_pte(new_spte);
	bool was_leaf = was_present && is_last_spte(old_spte, level);
	bool is_leaf = is_present && is_last_spte(new_spte, level);
	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

	/* Sanity check the level and that @gfn is aligned to that level. */
	WARN_ON_ONCE(level > PT64_ROOT_MAX_LEVEL);
	WARN_ON_ONCE(level < PG_LEVEL_4K);
	WARN_ON_ONCE(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));

	/*
	 * If this warning were to trigger it would indicate that there was a
	 * missing MMU notifier or a race with some notifier handler.
	 * A present, leaf SPTE should never be directly replaced with another
	 * present leaf SPTE pointing to a different PFN. A notifier handler
	 * should be zapping the SPTE before the main MM's page table is
	 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
	 * thread before replacement.
	 *
	 * NOTE(review): the last fragment of the format string below (and of
	 * the one further down) has no trailing newline -- confirm whether
	 * that is intentional.
	 */
	if (was_leaf && is_leaf && pfn_changed) {
		pr_err("Invalid SPTE change: cannot replace a present leaf\n"
		       "SPTE with another present leaf SPTE mapping a\n"
		       "different PFN!\n"
		       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
		       as_id, gfn, old_spte, new_spte, level);

		/*
		 * Crash the host to prevent error propagation and guest data
		 * corruption.
		 */
		BUG();
	}

	/* Nothing changed, nothing to account. */
	if (old_spte == new_spte)
		return;

	trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);

	if (is_leaf)
		check_spte_writable_invariants(new_spte);

	/*
	 * The only times a SPTE should be changed from a non-present to
	 * non-present state is when an MMIO entry is installed/modified/
	 * removed. In that case, there is nothing to do here.
	 */
	if (!was_present && !is_present) {
		/*
		 * If this change does not involve a MMIO SPTE or frozen SPTE,
		 * it is unexpected. Log the change, though it should not
		 * impact the guest since both the former and current SPTEs
		 * are nonpresent.
		 */
		if (WARN_ON_ONCE(!is_mmio_spte(kvm, old_spte) &&
				 !is_mmio_spte(kvm, new_spte) &&
				 !is_frozen_spte(new_spte)))
			pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
			       "should not be replaced with another,\n"
			       "different nonpresent SPTE, unless one or both\n"
			       "are MMIO SPTEs, or the new SPTE is\n"
			       "a temporary frozen SPTE.\n"
			       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
			       as_id, gfn, old_spte, new_spte, level);
		return;
	}

	/* A leaf mapping was created (+1) or destroyed (-1) at this level. */
	if (is_leaf != was_leaf)
		kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);

	/*
	 * Recursively handle child PTs if the change removed a subtree from
	 * the paging structure.  Note the WARN on the PFN changing without the
	 * SPTE being converted to a hugepage (leaf) or being zapped.  Shadow
	 * pages are kernel allocations and should never be migrated.
	 */
	if (was_present && !was_leaf &&
	    (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
		handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
}

/*
 * Atomically replaces the SPTE @iter points at with @new_spte, without doing
 * any of the bookkeeping done by handle_changed_spte().
 *
 * For a mirror SPTE, unless @new_spte is the frozen SPTE value, the update is
 * routed through set_external_spte_present() so that the external page table
 * is updated as well; all other updates are a plain cmpxchg against
 * iter->old_spte.
 *
 * Returns 0 on success, -EBUSY if the SPTE no longer holds iter->old_spte (or
 * if a non-present value is being written to a mirror SPTE), or the error
 * returned by set_external_spte_present().
 */
static inline int __must_check __tdp_mmu_set_spte_atomic(struct kvm *kvm,
							 struct tdp_iter *iter,
							 u64 new_spte)
{
	/*
	 * The caller is responsible for ensuring the old SPTE is not a FROZEN
	 * SPTE.  KVM should never attempt to zap or manipulate a FROZEN SPTE,
	 * and pre-checking before inserting a new SPTE is advantageous as it
	 * avoids unnecessary work.
	 */
	WARN_ON_ONCE(iter->yielded || is_frozen_spte(iter->old_spte));

	if (is_mirror_sptep(iter->sptep) && !is_frozen_spte(new_spte)) {
		int ret;

		/*
		 * Users of atomic zapping don't operate on mirror roots,
		 * so don't handle it and bug the VM if it's seen.
		 */
		if (KVM_BUG_ON(!is_shadow_present_pte(new_spte), kvm))
			return -EBUSY;

		ret = set_external_spte_present(kvm, iter->sptep, iter->gfn,
						iter->old_spte, new_spte, iter->level);
		if (ret)
			return ret;
	} else {
		u64 *sptep = rcu_dereference(iter->sptep);

		/*
		 * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs
		 * and does not hold the mmu_lock.  On failure, i.e. if a
		 * different logical CPU modified the SPTE, try_cmpxchg64()
		 * updates iter->old_spte with the current value, so the caller
		 * operates on fresh data, e.g. if it retries
		 * tdp_mmu_set_spte_atomic()
		 */
		if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte))
			return -EBUSY;
	}

	return 0;
}

/*
 * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
 * and handle the associated bookkeeping.  Do not mark the page dirty
 * in KVM's dirty bitmaps.
 *
 * If setting the SPTE fails because it has changed, iter->old_spte will be
 * refreshed to the current value of the spte.
 *
 * @kvm: kvm instance
 * @iter: a tdp_iter instance currently on the SPTE that should be set
 * @new_spte: The value the SPTE should be set to
 * Return:
 * * 0      - If the SPTE was set.
 * * -EBUSY - If the SPTE cannot be set. In this case this function will have
 *            no side-effects other than setting iter->old_spte to the last
 *            known value of the spte.
*/
static inline int __must_check tdp_mmu_set_spte_atomic(struct kvm *kvm,
						       struct tdp_iter *iter,
						       u64 new_spte)
{
	int ret;

	lockdep_assert_held_read(&kvm->mmu_lock);

	ret = __tdp_mmu_set_spte_atomic(kvm, iter, new_spte);
	if (ret)
		return ret;

	/* The SPTE was updated, now do the bookkeeping for the change. */
	handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
			    new_spte, iter->level, true);

	return 0;
}

/*
 * tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
 * @kvm:	      KVM instance
 * @as_id:	      Address space ID, i.e. regular vs. SMM
 * @sptep:	      Pointer to the SPTE
 * @old_spte:	      The current value of the SPTE
 * @new_spte:	      The new value that will be set for the SPTE
 * @gfn:	      The base GFN that was (or will be) mapped by the SPTE
 * @level:	      The level _containing_ the SPTE (its parent PT's level)
 *
 * Must be called with mmu_lock held for write.
 *
 * Returns the old SPTE value, which _may_ be different than @old_spte if the
 * SPTE had volatile bits.
 */
static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
			    u64 old_spte, u64 new_spte, gfn_t gfn, int level)
{
	lockdep_assert_held_write(&kvm->mmu_lock);

	/*
	 * No thread should be using this function to set SPTEs to or from the
	 * temporary frozen SPTE value.
	 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
	 * should be used. If operating under the MMU lock in write mode, the
	 * use of the frozen SPTE should not be necessary.
	 */
	WARN_ON_ONCE(is_frozen_spte(old_spte) || is_frozen_spte(new_spte));

	old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);

	handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);

	/*
	 * Users that do non-atomic setting of PTEs don't install present
	 * SPTEs in mirror roots, so bug the VM if that is seen.  Removal of
	 * the old SPTE is propagated to the external page table.
	 */
	if (is_mirror_sptep(sptep)) {
		KVM_BUG_ON(is_shadow_present_pte(new_spte), kvm);
		remove_external_spte(kvm, gfn, old_spte, level);
	}

	return old_spte;
}

/*
 * Sets the SPTE @iter currently points at to @new_spte via tdp_mmu_set_spte()
 * and refreshes iter->old_spte with the value that was actually replaced.
 * Must not be used on an iterator that has yielded.
 */
static inline void tdp_mmu_iter_set_spte(struct kvm *kvm, struct tdp_iter *iter,
					 u64 new_spte)
{
	WARN_ON_ONCE(iter->yielded);
	iter->old_spte = tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep,
					  iter->old_spte, new_spte,
					  iter->gfn, iter->level);
}

/* Walk the SPTEs under @_root that map the GFN range [@_start, @_end). */
#define tdp_root_for_each_pte(_iter, _kvm, _root, _start, _end)	\
	for_each_tdp_pte(_iter, _kvm, _root, _start, _end)

/*
 * Same walk as tdp_root_for_each_pte(), but the loop body only runs for
 * shadow-present leaf SPTEs; everything else is skipped via "continue".
 */
#define tdp_root_for_each_leaf_pte(_iter, _kvm, _root, _start, _end)	\
	tdp_root_for_each_pte(_iter, _kvm, _root, _start, _end)		\
		if (!is_shadow_present_pte(_iter.old_spte) ||		\
		    !is_last_spte(_iter.old_spte, _iter.level))		\
			continue;					\
		else

/*
 * Returns true if the walk should yield: either a reschedule is needed or
 * mmu_lock is contended, and the iterator has made forward progress since it
 * last yielded.
 */
static inline bool __must_check tdp_mmu_iter_need_resched(struct kvm *kvm,
							  struct tdp_iter *iter)
{
	if (!need_resched() && !rwlock_needbreak(&kvm->mmu_lock))
		return false;

	/* Ensure forward progress has been made before yielding. */
	return iter->next_last_level_gfn != iter->yielded_gfn;
}

/*
 * Yield if the MMU lock is contended or this thread needs to return control
 * to the scheduler.
 *
 * If this function should yield and flush is set, it will perform a remote
 * TLB flush before yielding.
 *
 * If this function yields, iter->yielded is set and the caller must skip to
 * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk
 * over the paging structures to allow the iterator to continue its traversal
 * from the paging structure root.
 *
 * Returns true if this function yielded.
*/
static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
							  struct tdp_iter *iter,
							  bool flush, bool shared)
{
	KVM_MMU_WARN_ON(iter->yielded);

	if (!tdp_mmu_iter_need_resched(kvm, iter))
		return false;

	if (flush)
		kvm_flush_remote_tlbs(kvm);

	/* Leave the RCU read-side critical section while yielding. */
	rcu_read_unlock();

	if (shared)
		cond_resched_rwlock_read(&kvm->mmu_lock);
	else
		cond_resched_rwlock_write(&kvm->mmu_lock);

	rcu_read_lock();

	WARN_ON_ONCE(iter->gfn > iter->next_last_level_gfn);

	iter->yielded = true;
	return true;
}

/* Returns the exclusive upper bound, as a GFN, for TDP MMU walks. */
static inline gfn_t tdp_mmu_max_gfn_exclusive(void)
{
	/*
	 * Bound TDP MMU walks at host.MAXPHYADDR.  KVM disallows memslots with
	 * a gpa range that would exceed the max gfn, and KVM does not create
	 * MMIO SPTEs for "impossible" gfns, instead sending such accesses down
	 * the slow emulation path every time.
	 */
	return kvm_mmu_max_gfn() + 1;
}

/*
 * Makes one pass over @root and zaps every shadow-present SPTE that is at or
 * below @zap_level, yielding as needed.  With @shared set, SPTEs are zapped
 * atomically and a failed update is retried on the same entry; otherwise
 * mmu_lock is expected to be held for write.
 */
static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
			       bool shared, int zap_level)
{
	struct tdp_iter iter;

	for_each_tdp_pte_min_level_all(iter, root, zap_level) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
			continue;

		if (!is_shadow_present_pte(iter.old_spte))
			continue;

		if (iter.level > zap_level)
			continue;

		/*
		 * A failed atomic update leaves the iterator on the same
		 * SPTE, so simply try again.
		 */
		if (!shared)
			tdp_mmu_iter_set_spte(kvm, &iter, SHADOW_NONPRESENT_VALUE);
		else if (tdp_mmu_set_spte_atomic(kvm, &iter, SHADOW_NONPRESENT_VALUE))
			goto retry;
	}
}

/*
 * Zaps all SPTEs under @root, in multiple passes of decreasing granularity.
 * mmu_lock must be held for read if @shared is true, for write otherwise, and
 * the caller must hold a reference to @root.
 */
static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
			     bool shared)
{

	/*
	 * The root must have an elevated refcount so that it's reachable via
	 * mmu_notifier callbacks, which allows this path to yield and drop
	 * mmu_lock.  When handling an unmap/release mmu_notifier command, KVM
	 * must drop all references to relevant pages prior to completing the
	 * callback.  Dropping mmu_lock with an unreachable root would result
	 * in zapping SPTEs after a relevant mmu_notifier callback completes
	 * and lead to use-after-free as zapping a SPTE triggers "writeback" of
	 * dirty/accessed bits to the SPTE's associated struct page.
	 */
	WARN_ON_ONCE(!refcount_read(&root->tdp_mmu_root_count));

	kvm_lockdep_assert_mmu_lock_held(kvm, shared);

	rcu_read_lock();

	/*
	 * Zap roots in multiple passes of decreasing granularity, i.e. zap at
	 * 4KiB=>2MiB=>1GiB=>root, in order to better honor need_resched() (all
	 * preempt models) or mmu_lock contention (full or real-time models).
	 * Zapping at finer granularity marginally increases the total time of
	 * the zap, but in most cases the zap itself isn't latency sensitive.
	 *
	 * If KVM is configured to prove the MMU, skip the 4KiB and 2MiB zaps
	 * in order to mimic the page fault path, which can replace a 1GiB page
	 * table with an equivalent 1GiB hugepage, i.e. can get saddled with
	 * zapping a 1GiB region that's fully populated with 4KiB SPTEs.  This
	 * allows verifying that KVM can safely zap 1GiB regions, e.g. without
	 * inducing RCU stalls, without relying on a relatively rare event
	 * (zapping roots is orders of magnitude more common).  Note, because
	 * zapping a SP recurses on its children, stepping down to PG_LEVEL_4K
	 * in the iterator itself is unnecessary.
	 */
	if (!IS_ENABLED(CONFIG_KVM_PROVE_MMU)) {
		__tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_4K);
		__tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_2M);
	}
	__tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G);
	__tdp_mmu_zap_root(kvm, root, shared, root->role.level);

	rcu_read_unlock();
}

/*
 * Attempts to zap the SPTE that points at @sp, a possible NX huge page, while
 * holding mmu_lock for read.
 *
 * Returns true if the SPTE was zapped, false if @sp is not a TDP MMU page,
 * has no parent SPTE, or is no longer referenced by its parent SPTE.
 */
bool kvm_tdp_mmu_zap_possible_nx_huge_page(struct kvm *kvm,
					   struct kvm_mmu_page *sp)
{
	/*
	 * Hand-built iterator that sits on the parent SPTE of @sp, hence the
	 * level is one above @sp's own level.  sp->ptep is checked for NULL
	 * here as the initializer runs before the sanity checks below.
	 */
	struct tdp_iter iter = {
		.old_spte = sp->ptep ? kvm_tdp_mmu_read_spte(sp->ptep) : 0,
		.sptep = sp->ptep,
		.level = sp->role.level + 1,
		.gfn = sp->gfn,
		.as_id = kvm_mmu_page_as_id(sp),
	};

	lockdep_assert_held_read(&kvm->mmu_lock);

	if (WARN_ON_ONCE(!is_tdp_mmu_page(sp)))
		return false;

	/*
	 * Root shadow pages don't have a parent page table and thus no
	 * associated entry, but they can never be possible NX huge pages.
	 */
	if (WARN_ON_ONCE(!sp->ptep))
		return false;

	/*
	 * Since mmu_lock is held in read mode, it's possible another task has
	 * already modified the SPTE.  Zap the SPTE if and only if the SPTE
	 * points at the SP's page table, as checking shadow-present isn't
	 * sufficient, e.g. the SPTE could be replaced by a leaf SPTE, or even
	 * another SP.  Note, spte_to_child_pt() also checks that the SPTE is
	 * shadow-present, i.e. guards against zapping a frozen SPTE.
	 */
	if ((tdp_ptep_t)sp->spt != spte_to_child_pt(iter.old_spte, iter.level))
		return false;

	/*
	 * If a different task modified the SPTE, then it should be impossible
	 * for the SPTE to still be used for the to-be-zapped SP.  Non-leaf
	 * SPTEs don't have Dirty bits, KVM always sets the Accessed bit when
	 * creating non-leaf SPTEs, and all other bits are immutable for non-
	 * leaf SPTEs, i.e. the only legal operations for non-leaf SPTEs are
	 * zapping and replacement.
	 */
	if (tdp_mmu_set_spte_atomic(kvm, &iter, SHADOW_NONPRESENT_VALUE)) {
		WARN_ON_ONCE((tdp_ptep_t)sp->spt == spte_to_child_pt(iter.old_spte, iter.level));
		return false;
	}

	return true;
}

/*
 * If can_yield is true, will release the MMU lock and reschedule if the
 * scheduler needs the CPU or there is contention on the MMU lock. If this
 * function cannot yield, it will not release the MMU lock or reschedule and
 * the caller must ensure it does not supply too large a GFN range, or the
 * operation can cause a soft lockup.
 */
static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
			      gfn_t start, gfn_t end, bool can_yield, bool flush)
{
	struct tdp_iter iter;

	/* Never walk past the maximum GFN supported by the TDP MMU. */
	end = min(end, tdp_mmu_max_gfn_exclusive());

	lockdep_assert_held_write(&kvm->mmu_lock);

	rcu_read_lock();

	for_each_tdp_pte_min_level(iter, kvm, root, PG_LEVEL_4K, start, end) {
		/*
		 * The yield helper performs the pending flush before yielding,
		 * so nothing is left to flush once it reports a yield.
		 */
		if (can_yield &&
		    tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
			flush = false;
			continue;
		}

		/* Only present leaf SPTEs are zapped by this flow. */
		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		tdp_mmu_iter_set_spte(kvm, &iter, SHADOW_NONPRESENT_VALUE);

		/*
		 * Zapping SPTEs in invalid roots doesn't require a TLB flush,
		 * see kvm_tdp_mmu_zap_invalidated_roots() for details.
		 */
		if (!root->role.invalid)
			flush = true;
	}

	rcu_read_unlock();

	/*
	 * Because this flow zaps _only_ leaf SPTEs, the caller doesn't need
	 * to provide RCU protection as no 'struct kvm_mmu_page' will be freed.
	 */
	return flush;
}

/*
 * Zap leaf SPTEs for the range of gfns, [start, end), for all **VALID** roots.
 * Returns true if a TLB flush is needed before releasing the MMU lock, i.e. if
 * one or more SPTEs were zapped since the MMU lock was last acquired.
 */
bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush)
{
	struct kvm_mmu_page *root;

	lockdep_assert_held_write(&kvm->mmu_lock);
	/* NOTE(review): as_id -1 presumably selects every address space. */
	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, -1)
		flush = tdp_mmu_zap_leafs(kvm, root, start, end, true, flush);

	return flush;
}

/*
 * Zap every SPTE in all direct roots, valid or invalid.  mmu_lock must be held
 * for write.
 */
void kvm_tdp_mmu_zap_all(struct kvm *kvm)
{
	struct kvm_mmu_page *root;

	/*
	 * Zap all direct roots, including invalid direct roots, as all direct
	 * SPTEs must be dropped before returning to the caller.  For TDX, mirror
	 * roots don't need handling in response to the mmu notifier (the caller).
	 *
	 * Zap directly even if the root is also being zapped by a concurrent
	 * "fast zap".  Walking zapped top-level SPTEs isn't all that expensive
	 * and mmu_lock is already held, which means the other thread has yielded.
	 *
	 * A TLB flush is unnecessary, KVM zaps everything if and only if the VM
	 * is being destroyed or the userspace VMM has exited.  In both cases,
	 * KVM_RUN is unreachable, i.e. no vCPUs will ever service the request.
	 */
	lockdep_assert_held_write(&kvm->mmu_lock);
	__for_each_tdp_mmu_root_yield_safe(kvm, root, -1,
					   KVM_DIRECT_ROOTS | KVM_INVALID_ROOTS)
		tdp_mmu_zap_root(kvm, root, false);
}

/*
 * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast
 * zap" completes.  Acquires and releases mmu_lock itself, for read if @shared
 * is true and for write otherwise.
 */
void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm, bool shared)
{
	struct kvm_mmu_page *root;

	if (shared)
		read_lock(&kvm->mmu_lock);
	else
		write_lock(&kvm->mmu_lock);

	for_each_tdp_mmu_root_yield_safe(kvm, root) {
		/* Only roots flagged by kvm_tdp_mmu_invalidate_roots() are zapped. */
		if (!root->tdp_mmu_scheduled_root_to_zap)
			continue;

		/* Clear the flag so that the root is processed exactly once. */
		root->tdp_mmu_scheduled_root_to_zap = false;
		KVM_BUG_ON(!root->role.invalid, kvm);

		/*
		 * A TLB flush is not necessary as KVM performs a local TLB
		 * flush when allocating a new root (see kvm_mmu_load()), and
		 * when migrating a vCPU to a different pCPU.  Note, the local
		 * TLB flush on reuse also invalidates paging-structure-cache
		 * entries, i.e. TLB entries for intermediate paging structures,
		 * that may be zapped, as such entries are associated with the
		 * ASID on both VMX and SVM.
		 */
		tdp_mmu_zap_root(kvm, root, shared);

		/*
		 * The reference needs to be put *after* zapping the root, as
		 * the root must be reachable by mmu_notifiers while it's being
		 * zapped.
		 */
		kvm_tdp_mmu_put_root(kvm, root);
	}

	if (shared)
		read_unlock(&kvm->mmu_lock);
	else
		write_unlock(&kvm->mmu_lock);
}

/*
 * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
 * is about to be zapped, e.g. in response to a memslots update.  The actual
 * zapping is done separately so that it happens with mmu_lock held for read,
 * whereas invalidating roots must be done with mmu_lock held for write (unless
 * the VM is being destroyed).
 *
 * Note, kvm_tdp_mmu_zap_invalidated_roots() is gifted the TDP MMU's reference.
 * See kvm_tdp_mmu_alloc_root().
 */
void kvm_tdp_mmu_invalidate_roots(struct kvm *kvm,
				  enum kvm_tdp_mmu_root_types root_types)
{
	struct kvm_mmu_page *root;

	/*
	 * Invalidating invalid roots doesn't make sense, prevent developers from
	 * having to think about it.
	 */
	if (WARN_ON_ONCE(root_types & KVM_INVALID_ROOTS))
		root_types &= ~KVM_INVALID_ROOTS;

	/*
	 * mmu_lock must be held for write to ensure that a root doesn't become
	 * invalid while there are active readers (invalidating a root while
	 * there are active readers may or may not be problematic in practice,
	 * but it's uncharted territory and not supported).
	 *
	 * Waive the assertion if there are no users of @kvm, i.e. the VM is
	 * being destroyed after all references have been put, or if no vCPUs
	 * have been created (which means there are no roots), i.e. the VM is
	 * being destroyed in an error path of KVM_CREATE_VM.
	 */
	if (IS_ENABLED(CONFIG_PROVE_LOCKING) &&
	    refcount_read(&kvm->users_count) && kvm->created_vcpus)
		lockdep_assert_held_write(&kvm->mmu_lock);

	/*
	 * As above, mmu_lock isn't held when destroying the VM!  There can't
	 * be other references to @kvm, i.e. nothing else can invalidate roots
	 * or get/put references to roots.
	 */
	list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
		if (!tdp_mmu_root_match(root, root_types))
			continue;

		/*
		 * Note, invalid roots can outlive a memslot update!  Invalid
		 * roots must be *zapped* before the memslot update completes,
		 * but a different task can acquire a reference and keep the
		 * root alive after it's been zapped.
		 */
		if (!root->role.invalid) {
			root->tdp_mmu_scheduled_root_to_zap = true;
			root->role.invalid = true;
		}
	}
}

/*
 * Installs a last-level SPTE to handle a TDP page fault.
 * (NPT/EPT violation/misconfiguration)
 *
 * Returns RET_PF_RETRY if the iterator is not at the fault's goal level or if
 * the atomic SPTE update failed, RET_PF_SPURIOUS if an equivalent leaf SPTE is
 * already installed, RET_PF_EMULATE if an MMIO SPTE was created,
 * RET_PF_WRITE_PROTECTED if a write fault hit a write-protected mapping, and
 * RET_PF_FIXED otherwise.
 */
static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
					   struct kvm_page_fault *fault,
					   struct tdp_iter *iter)
{
	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
	u64 new_spte;
	int ret = RET_PF_FIXED;
	bool wrprot = false;

	if (WARN_ON_ONCE(sp->role.level != fault->goal_level))
		return RET_PF_RETRY;

	/* A usable leaf SPTE is already present, there is nothing to install. */
	if (is_shadow_present_pte(iter->old_spte) &&
	    (fault->prefetch || is_access_allowed(fault, iter->old_spte)) &&
	    is_last_spte(iter->old_spte, iter->level)) {
		WARN_ON_ONCE(fault->pfn != spte_to_pfn(iter->old_spte));
		return RET_PF_SPURIOUS;
	}

	/* Faults without a memslot get an MMIO SPTE instead of a mapping. */
	if (unlikely(!fault->slot))
		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
	else
		wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
				   fault->pfn, iter->old_spte, fault->prefetch,
				   false, fault->map_writable, &new_spte);

	/*
	 * Remote TLBs are flushed only when a present SPTE was replaced, and
	 * then only if the old SPTE was non-leaf or the leaf change requires
	 * a flush (the latter also triggers a warning).
	 */
	if (new_spte == iter->old_spte)
		ret = RET_PF_SPURIOUS;
	else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
		return RET_PF_RETRY;
	else if (is_shadow_present_pte(iter->old_spte) &&
		 (!is_last_spte(iter->old_spte, iter->level) ||
		  WARN_ON_ONCE(leaf_spte_change_needs_tlb_flush(iter->old_spte, new_spte))))
		kvm_flush_remote_tlbs_gfn(vcpu->kvm, iter->gfn, iter->level);

	/*
	 * If the page fault was caused by a write but the page is write
	 * protected, emulation is needed.  If the emulation was skipped,
	 * the vCPU would have the same fault again.
	 */
	if (wrprot && fault->write)
		ret = RET_PF_WRITE_PROTECTED;

	/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
	if (unlikely(is_mmio_spte(vcpu->kvm, new_spte))) {
		vcpu->stat.pf_mmio_spte_created++;
		trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
				     new_spte);
		ret = RET_PF_EMULATE;
	} else {
		trace_kvm_mmu_set_spte(iter->level, iter->gfn,
				       rcu_dereference(iter->sptep));
	}

	return ret;
}

/*
 * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the
 * provided page table.
 *
 * @kvm: kvm instance
 * @iter: a tdp_iter instance currently on the SPTE that should be set
 * @sp: The new TDP page table to install.
 * @shared: This operation is running under the MMU lock in read mode.
 *
 * Returns: 0 if the new page table was installed.  Non-0 if the page table
 *          could not be installed (e.g. the atomic compare-exchange failed).
 */
static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
			   struct kvm_mmu_page *sp, bool shared)
{
	u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled);
	int ret = 0;

	if (shared) {
		ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
		if (ret)
			return ret;
	} else {
		tdp_mmu_iter_set_spte(kvm, iter, spte);
	}

	/* Account the page only once it has actually been linked. */
	tdp_account_mmu_page(kvm, sp);

	return 0;
}

static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
				   struct kvm_mmu_page *sp, bool shared);

/*
 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
 * page tables and SPTEs to translate the faulting guest physical address.
 *
 * Returns RET_PF_RETRY if the walk is aborted before the goal level is
 * reached, otherwise the result of tdp_mmu_map_handle_target_level().
 */
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
	struct kvm_mmu_page *root = tdp_mmu_get_root_for_fault(vcpu, fault);
	struct kvm *kvm = vcpu->kvm;
	struct tdp_iter iter;
	struct kvm_mmu_page *sp;
	int ret = RET_PF_RETRY;

	kvm_mmu_hugepage_adjust(vcpu, fault);

	trace_kvm_mmu_spte_requested(fault);

	rcu_read_lock();

	for_each_tdp_pte(iter, kvm, root, fault->gfn, fault->gfn + 1) {
		int r;

		if (fault->nx_huge_page_workaround_enabled)
			disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);

		/*
		 * If SPTE has been frozen by another thread, just give up and
		 * retry, avoiding unnecessary page table allocation and free.
		 */
		if (is_frozen_spte(iter.old_spte))
			goto retry;

		if (iter.level == fault->goal_level)
			goto map_target_level;

		/* Step down into the lower level page table if it exists. */
		if (is_shadow_present_pte(iter.old_spte) &&
		    !is_large_pte(iter.old_spte))
			continue;

		/*
		 * The SPTE is either non-present or points to a huge page that
		 * needs to be split.
		 */
		sp = tdp_mmu_alloc_sp(vcpu);
		tdp_mmu_init_child_sp(sp, &iter);
		if (is_mirror_sp(sp))
			kvm_mmu_alloc_external_spt(vcpu, sp);

		sp->nx_huge_page_disallowed = fault->huge_page_disallowed;

		/* Both helpers are invoked with shared == true on this path. */
		if (is_shadow_present_pte(iter.old_spte)) {
			/* Don't support large page for mirrored roots (TDX) */
			KVM_BUG_ON(is_mirror_sptep(iter.sptep), vcpu->kvm);
			r = tdp_mmu_split_huge_page(kvm, &iter, sp, true);
		} else {
			r = tdp_mmu_link_sp(kvm, &iter, sp, true);
		}

		/*
		 * Force the guest to retry if installing an upper level SPTE
		 * failed, e.g. because a different task modified the SPTE.
		 */
		if (r) {
			tdp_mmu_free_sp(sp);
			goto retry;
		}

		/*
		 * nx_huge_page_disallowed is re-checked under
		 * tdp_mmu_pages_lock before the page is tracked.
		 */
		if (fault->huge_page_disallowed &&
		    fault->req_level >= iter.level) {
			spin_lock(&kvm->arch.tdp_mmu_pages_lock);
			if (sp->nx_huge_page_disallowed)
				track_possible_nx_huge_page(kvm, sp, KVM_TDP_MMU);
			spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
		}
	}

	/*
	 * The walk aborted before reaching the target level, e.g. because the
	 * iterator detected an upper level SPTE was frozen during traversal.
	 */
	WARN_ON_ONCE(iter.level == fault->goal_level);
	goto retry;

map_target_level:
	ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);

retry:
	rcu_read_unlock();
	return ret;
}

/*
 * Used by mmu notifier via kvm_unmap_gfn_range().  Zaps leaf SPTEs in
 * [range->start, range->end) for the roots selected by range->attr_filter,
 * including invalid roots, and returns whether a TLB flush is needed.
 */
bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
				 bool flush)
{
	enum kvm_tdp_mmu_root_types types;
	struct kvm_mmu_page *root;

	types = kvm_gfn_range_filter_to_root_types(kvm, range->attr_filter) | KVM_INVALID_ROOTS;

	/* Yielding is permitted only when the notifier allows blocking. */
	__for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, types)
		flush = tdp_mmu_zap_leafs(kvm, root, range->start, range->end,
					  range->may_block, flush);

	return flush;
}

/*
 * Clear the Accessed state of the SPTE that @iter points at.  SPTEs with A/D
 * bits enabled have shadow_accessed_mask cleared atomically; all other SPTEs
 * are converted with mark_spte_for_access_track() using a cmpxchg that is
 * allowed to fail.  Nothing is returned, the caller tracks whether any SPTE
 * was young.
 *
 * No need to mark the corresponding PFN as accessed as this call is coming
 * from the clear_young() or clear_flush_young() notifier, which uses the
 * return value of the range walk to determine if the page has been accessed.
 */
static void kvm_tdp_mmu_age_spte(struct kvm *kvm, struct tdp_iter *iter)
{
	u64 new_spte;

	if (spte_ad_enabled(iter->old_spte)) {
		iter->old_spte = tdp_mmu_clear_spte_bits_atomic(iter->sptep,
								shadow_accessed_mask);
		new_spte = iter->old_spte & ~shadow_accessed_mask;
	} else {
		new_spte = mark_spte_for_access_track(iter->old_spte);
		/*
		 * It is safe for the following cmpxchg to fail.  Leave the
		 * Accessed bit set, as the spte is most likely young anyway.
		 */
		if (__tdp_mmu_set_spte_atomic(kvm, iter, new_spte))
			return;
	}

	trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level,
				       iter->old_spte, new_spte);
}

/*
 * Walk the leaf SPTEs mapping [range->start, range->end) in the roots selected
 * by range->attr_filter.  Returns true if any of them is accessed.  If
 * @test_only is true the walk stops at the first accessed SPTE without
 * modifying anything, otherwise every accessed SPTE is aged.
 */
static bool __kvm_tdp_mmu_age_gfn_range(struct kvm *kvm,
					struct kvm_gfn_range *range,
					bool test_only)
{
	enum kvm_tdp_mmu_root_types types;
	struct kvm_mmu_page *root;
	struct tdp_iter iter;
	bool ret = false;

	types = kvm_gfn_range_filter_to_root_types(kvm, range->attr_filter);

	/*
	 * Don't support rescheduling, none of the MMU notifiers that funnel
	 * into this helper allow blocking; it'd be dead, wasteful code.  Note,
	 * this helper must NOT be used to unmap GFNs, as it processes only
	 * valid roots!
	 */
	WARN_ON(types & ~KVM_VALID_ROOTS);

	/* Scope-based RCU read lock, also covers the early return below. */
	guard(rcu)();
	for_each_tdp_mmu_root_rcu(kvm, root, range->slot->as_id, types) {
		tdp_root_for_each_leaf_pte(iter, kvm, root, range->start, range->end) {
			if (!is_accessed_spte(iter.old_spte))
				continue;

			if (test_only)
				return true;

			ret = true;
			kvm_tdp_mmu_age_spte(kvm, &iter);
		}
	}

	return ret;
}

/* Age all accessed SPTEs in @range, returns true if any SPTE was accessed. */
bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
	return __kvm_tdp_mmu_age_gfn_range(kvm, range, false);
}

/* Returns true if any SPTE in @range is accessed, without modifying SPTEs. */
bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	return __kvm_tdp_mmu_age_gfn_range(kvm, range, true);
}

/*
 * Remove write access from all SPTEs at or above min_level that map GFNs
 * [start, end).  Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			     gfn_t start, gfn_t end, int min_level)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	rcu_read_lock();

	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);

	for_each_tdp_pte_min_level(iter, kvm, root, min_level, start, end) {
retry:
		/* Yields with the shared (read) flavor and no pending flush. */
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
			continue;

		/* Only present, writable leaf SPTEs need to be changed. */
		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level) ||
		    !(iter.old_spte & PT_WRITABLE_MASK))
			continue;

		new_spte = iter.old_spte & ~PT_WRITABLE_MASK;

		/* A failed atomic update re-processes the same SPTE. */
		if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
			goto retry;

		spte_set = true;
	}

	rcu_read_unlock();
	return spte_set;
}

/*
 * Remove write access from all the SPTEs mapping GFNs in the memslot.  Will
 * only affect leaf SPTEs down to min_level.
 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
 */
bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
			     const struct kvm_memory_slot *slot, int min_level)
{
	struct kvm_mmu_page *root;
	bool spte_set = false;

	lockdep_assert_held_read(&kvm->mmu_lock);

	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
		spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
			     slot->base_gfn + slot->npages, min_level);

	return spte_set;
}

/*
 * Allocate a zeroed shadow page header plus a zeroed page for its page table,
 * both with GFP_KERNEL_ACCOUNT.  Returns NULL if either allocation fails, in
 * which case nothing is leaked.
 */
static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(void)
{
	struct kvm_mmu_page *sp;

	sp = kmem_cache_zalloc(mmu_page_header_cache, GFP_KERNEL_ACCOUNT);
	if (!sp)
		return NULL;

	sp->spt = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
	if (!sp->spt) {
		/* Undo the header allocation on failure. */
		kmem_cache_free(mmu_page_header_cache, sp);
		return NULL;
	}

	return sp;
}

/* Note, the caller is responsible for initializing @sp.
 */
static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
				   struct kvm_mmu_page *sp, bool shared)
{
	/* Snapshot the huge SPTE and its level before the iterator is updated. */
	const u64 huge_spte = iter->old_spte;
	const int level = iter->level;
	int ret, i;

	/*
	 * No need for atomics when writing to sp->spt since the page table has
	 * not been linked in yet and thus is not reachable from any other CPU.
	 */
	for (i = 0; i < SPTE_ENT_PER_PAGE; i++)
		sp->spt[i] = make_small_spte(kvm, huge_spte, sp->role, i);

	/*
	 * Replace the huge spte with a pointer to the populated lower level
	 * page table.  Since we are making this change without a TLB flush vCPUs
	 * will see a mix of the split mappings and the original huge mapping,
	 * depending on what's currently in their TLB.  This is fine from a
	 * correctness standpoint since the translation will be the same either
	 * way.
	 */
	ret = tdp_mmu_link_sp(kvm, iter, sp, shared);
	if (ret)
		goto out;

	/*
	 * tdp_mmu_link_sp() will handle subtracting the huge page we
	 * are overwriting from the page stats.  But we have to manually update
	 * the page stats with the new present child pages.
	 */
	kvm_update_page_stats(kvm, level - 1, SPTE_ENT_PER_PAGE);

out:
	/* The tracepoint fires for both success and failure. */
	trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret);
	return ret;
}

/*
 * Split every huge page in @root above @target_level that maps GFNs in
 * [start, end).  Called with mmu_lock held for read (@shared) or for write;
 * the lock is dropped and re-acquired around the allocation of each new page
 * table.
 *
 * Returns 0 on success, or -ENOMEM if a page table could not be allocated.
 */
static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
					 struct kvm_mmu_page *root,
					 gfn_t start, gfn_t end,
					 int target_level, bool shared)
{
	struct kvm_mmu_page *sp = NULL;
	struct tdp_iter iter;

	rcu_read_lock();

	/*
	 * Traverse the page table splitting all huge pages above the target
	 * level into one lower level.  For example, if we encounter a 1GB page
	 * we split it into 512 2MB pages.
	 *
	 * Since the TDP iterator uses a pre-order traversal, we are guaranteed
	 * to visit an SPTE before ever visiting its children, which means we
	 * will correctly recursively split huge pages that are more than one
	 * level above the target level (e.g. splitting a 1GB to 512 2MB pages,
	 * and then splitting each of those to 512 4KB pages).
	 */
	for_each_tdp_pte_min_level(iter, kvm, root, target_level + 1, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
			continue;

		if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
			continue;

		if (!sp) {
			/*
			 * The allocation uses GFP_KERNEL_ACCOUNT, so both RCU
			 * and mmu_lock are released around it.
			 */
			rcu_read_unlock();

			if (shared)
				read_unlock(&kvm->mmu_lock);
			else
				write_unlock(&kvm->mmu_lock);

			sp = tdp_mmu_alloc_sp_for_split();

			if (shared)
				read_lock(&kvm->mmu_lock);
			else
				write_lock(&kvm->mmu_lock);

			/* mmu_lock is held again here, RCU is not. */
			if (!sp) {
				trace_kvm_mmu_split_huge_page(iter.gfn,
							      iter.old_spte,
							      iter.level, -ENOMEM);
				return -ENOMEM;
			}

			rcu_read_lock();

			/*
			 * mmu_lock was dropped, so mark the iterator as having
			 * yielded, as tdp_mmu_iter_cond_resched() does after a
			 * yield.
			 */
			iter.yielded = true;
			continue;
		}

		tdp_mmu_init_child_sp(sp, &iter);

		/* On failure, retry the same SPTE and keep @sp for reuse. */
		if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
			goto retry;

		/* @sp is now linked into the page tables, allocate a new one. */
		sp = NULL;
	}

	rcu_read_unlock();

	/*
	 * It's possible to exit the loop having never used the last sp if, for
	 * example, a vCPU doing HugePage NX splitting wins the race and
	 * installs its own sp in place of the last sp we tried to split.
	 */
	if (sp)
		tdp_mmu_free_sp(sp);

	return 0;
}


/*
 * Try to split all huge pages mapped by the TDP MMU down to the target level.
 */
void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
				      const struct kvm_memory_slot *slot,
				      gfn_t start, gfn_t end,
				      int target_level, bool shared)
{
	struct kvm_mmu_page *root;
	int r = 0;

	kvm_lockdep_assert_mmu_lock_held(kvm, shared);
	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id) {
		r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared);
		if (r) {
			/*
			 * Leaving the loop early skips the iterator's own
			 * release of its reference to @root, so drop that
			 * reference here.
			 */
			kvm_tdp_mmu_put_root(kvm, root);
			break;
		}
	}
}

/*
 * Returns true if dirty logging for the tree containing @sp must clear the
 * Writable bit rather than the Dirty bit.
 */
static bool tdp_mmu_need_write_protect(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	/*
	 * All TDP MMU shadow pages share the same role as their root, aside
	 * from level, so it is valid to key off any shadow page to determine if
	 * write protection is needed for an entire tree.
	 */
	return kvm_mmu_page_ad_need_write_protect(kvm, sp) || !kvm_ad_enabled;
}

/*
 * Clear the dirty status of all leaf SPTEs in @root that map GFNs in
 * [start, end).  Depending on tdp_mmu_need_write_protect(), either the
 * Writable bit or the Dirty bit is cleared.
 */
static void clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
				  gfn_t start, gfn_t end)
{
	const u64 dbit = tdp_mmu_need_write_protect(kvm, root) ?
			 PT_WRITABLE_MASK : shadow_dirty_mask;
	struct tdp_iter iter;

	rcu_read_lock();

	tdp_root_for_each_pte(iter, kvm, root, start, end) {
retry:
		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		/* Yielding is only considered once a leaf SPTE is reached. */
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
			continue;

		KVM_MMU_WARN_ON(dbit == shadow_dirty_mask &&
				spte_ad_need_write_protect(iter.old_spte));

		if (!(iter.old_spte & dbit))
			continue;

		/* A failed atomic update re-processes the same SPTE. */
		if (tdp_mmu_set_spte_atomic(kvm, &iter, iter.old_spte & ~dbit))
			goto retry;
	}

	rcu_read_unlock();
}

/*
 * Clear the dirty status (D-bit or W-bit) of all the SPTEs mapping GFNs in the
 * memslot.
1705 */ 1706 void kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, 1707 const struct kvm_memory_slot *slot) 1708 { 1709 struct kvm_mmu_page *root; 1710 1711 lockdep_assert_held_read(&kvm->mmu_lock); 1712 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id) 1713 clear_dirty_gfn_range(kvm, root, slot->base_gfn, 1714 slot->base_gfn + slot->npages); 1715 } 1716 1717 static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root, 1718 gfn_t gfn, unsigned long mask, bool wrprot) 1719 { 1720 const u64 dbit = (wrprot || tdp_mmu_need_write_protect(kvm, root)) ? 1721 PT_WRITABLE_MASK : shadow_dirty_mask; 1722 struct tdp_iter iter; 1723 1724 lockdep_assert_held_write(&kvm->mmu_lock); 1725 1726 rcu_read_lock(); 1727 1728 tdp_root_for_each_leaf_pte(iter, kvm, root, gfn + __ffs(mask), 1729 gfn + BITS_PER_LONG) { 1730 if (!mask) 1731 break; 1732 1733 KVM_MMU_WARN_ON(dbit == shadow_dirty_mask && 1734 spte_ad_need_write_protect(iter.old_spte)); 1735 1736 if (iter.level > PG_LEVEL_4K || 1737 !(mask & (1UL << (iter.gfn - gfn)))) 1738 continue; 1739 1740 mask &= ~(1UL << (iter.gfn - gfn)); 1741 1742 if (!(iter.old_spte & dbit)) 1743 continue; 1744 1745 iter.old_spte = tdp_mmu_clear_spte_bits(iter.sptep, 1746 iter.old_spte, dbit, 1747 iter.level); 1748 1749 trace_kvm_tdp_mmu_spte_changed(iter.as_id, iter.gfn, iter.level, 1750 iter.old_spte, 1751 iter.old_spte & ~dbit); 1752 } 1753 1754 rcu_read_unlock(); 1755 } 1756 1757 /* 1758 * Clear the dirty status (D-bit or W-bit) of all the 4k SPTEs mapping GFNs for 1759 * which a bit is set in mask, starting at gfn. The given memslot is expected to 1760 * contain all the GFNs represented by set bits in the mask. 
 */
void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
				       struct kvm_memory_slot *slot,
				       gfn_t gfn, unsigned long mask,
				       bool wrprot)
{
	struct kvm_mmu_page *root;

	/* Apply the same mask to every valid root of the slot's address space. */
	for_each_valid_tdp_mmu_root(kvm, root, slot->as_id)
		clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
}

/*
 * Build a huge SPTE for the region covered by @parent from the first leaf SPTE
 * found beneath it, and store it in @huge_spte.
 *
 * Returns 0 on success, -EAGAIN if the walk needs to yield, and -ENOENT if no
 * leaf SPTE was found beneath @parent.
 */
static int tdp_mmu_make_huge_spte(struct kvm *kvm,
				  struct tdp_iter *parent,
				  u64 *huge_spte)
{
	/* The child page table of @parent serves as the root of the sub-walk. */
	struct kvm_mmu_page *root = spte_to_child_sp(parent->old_spte);
	gfn_t start = parent->gfn;
	gfn_t end = start + KVM_PAGES_PER_HPAGE(parent->level);
	struct tdp_iter iter;

	tdp_root_for_each_leaf_pte(iter, kvm, root, start, end) {
		/*
		 * Use the parent iterator when checking for forward progress so
		 * that KVM doesn't get stuck continuously trying to yield (i.e.
		 * returning -EAGAIN here and then failing the forward progress
		 * check in the caller ad nauseam).
		 */
		if (tdp_mmu_iter_need_resched(kvm, parent))
			return -EAGAIN;

		/* The first leaf SPTE is sufficient, the loop never iterates twice. */
		*huge_spte = make_huge_spte(kvm, iter.old_spte, parent->level);
		return 0;
	}

	return -ENOENT;
}

/*
 * Replace non-leaf SPTEs in @root that map GFNs in @slot with huge SPTEs
 * wherever the maximum mapping level allows it.  Remote TLBs for the memslot
 * are flushed at the end if any SPTE was replaced.  Does nothing, other than
 * warn, if dirty tracking is enabled for @slot.
 */
static void recover_huge_pages_range(struct kvm *kvm,
				     struct kvm_mmu_page *root,
				     const struct kvm_memory_slot *slot)
{
	gfn_t start = slot->base_gfn;
	gfn_t end = start + slot->npages;
	struct tdp_iter iter;
	int max_mapping_level;
	bool flush = false;
	u64 huge_spte;
	int r;

	if (WARN_ON_ONCE(kvm_slot_dirty_track_enabled(slot)))
		return;

	rcu_read_lock();

	for_each_tdp_pte_min_level(iter, kvm, root, PG_LEVEL_2M, start, end) {
retry:
		/* The yield helper performs the pending flush before yielding. */
		if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, true)) {
			flush = false;
			continue;
		}

		if (iter.level > KVM_MAX_HUGEPAGE_LEVEL ||
		    !is_shadow_present_pte(iter.old_spte))
			continue;

		/*
		 * Don't zap leaf SPTEs, if a leaf SPTE could be replaced with
		 * a large page size, then its parent would have been zapped
		 * instead of stepping down.
		 */
		if (is_last_spte(iter.old_spte, iter.level))
			continue;

		/*
		 * If iter.gfn resides outside of the slot, i.e. the page for
		 * the current level overlaps but is not contained by the slot,
		 * then the SPTE can't be made huge.  More importantly, trying
		 * to query that info from slot->arch.lpage_info will cause an
		 * out-of-bounds access.
		 */
		if (iter.gfn < start || iter.gfn >= end)
			continue;

		max_mapping_level = kvm_mmu_max_mapping_level(kvm, NULL, slot, iter.gfn);
		if (max_mapping_level < iter.level)
			continue;

		/* -EAGAIN retries this SPTE, any other error skips it. */
		r = tdp_mmu_make_huge_spte(kvm, &iter, &huge_spte);
		if (r == -EAGAIN)
			goto retry;
		else if (r)
			continue;

		/* A failed atomic update re-processes the same SPTE. */
		if (tdp_mmu_set_spte_atomic(kvm, &iter, huge_spte))
			goto retry;

		flush = true;
	}

	if (flush)
		kvm_flush_remote_tlbs_memslot(kvm, slot);

	rcu_read_unlock();
}

/*
 * Recover huge page mappings within the slot by replacing non-leaf SPTEs with
 * huge SPTEs where possible.
 */
void kvm_tdp_mmu_recover_huge_pages(struct kvm *kvm,
				    const struct kvm_memory_slot *slot)
{
	struct kvm_mmu_page *root;

	lockdep_assert_held_read(&kvm->mmu_lock);
	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
		recover_huge_pages_range(kvm, root, slot);
}

/*
 * Removes write access on the last level SPTE mapping this GFN and unsets the
 * MMU-writable bit to ensure future writes continue to be intercepted.
 * Returns true if an SPTE was set and a TLB flush is needed.
1885 */ 1886 static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root, 1887 gfn_t gfn, int min_level) 1888 { 1889 struct tdp_iter iter; 1890 u64 new_spte; 1891 bool spte_set = false; 1892 1893 BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL); 1894 1895 rcu_read_lock(); 1896 1897 for_each_tdp_pte_min_level(iter, kvm, root, min_level, gfn, gfn + 1) { 1898 if (!is_shadow_present_pte(iter.old_spte) || 1899 !is_last_spte(iter.old_spte, iter.level)) 1900 continue; 1901 1902 new_spte = iter.old_spte & 1903 ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask); 1904 1905 if (new_spte == iter.old_spte) 1906 break; 1907 1908 tdp_mmu_iter_set_spte(kvm, &iter, new_spte); 1909 spte_set = true; 1910 } 1911 1912 rcu_read_unlock(); 1913 1914 return spte_set; 1915 } 1916 1917 /* 1918 * Removes write access on the last level SPTE mapping this GFN and unsets the 1919 * MMU-writable bit to ensure future writes continue to be intercepted. 1920 * Returns true if an SPTE was set and a TLB flush is needed. 1921 */ 1922 bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, 1923 struct kvm_memory_slot *slot, gfn_t gfn, 1924 int min_level) 1925 { 1926 struct kvm_mmu_page *root; 1927 bool spte_set = false; 1928 1929 lockdep_assert_held_write(&kvm->mmu_lock); 1930 for_each_valid_tdp_mmu_root(kvm, root, slot->as_id) 1931 spte_set |= write_protect_gfn(kvm, root, gfn, min_level); 1932 1933 return spte_set; 1934 } 1935 1936 /* 1937 * Return the level of the lowest level SPTE added to sptes. 1938 * That SPTE may be non-present. 1939 * 1940 * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}. 
1941 */ 1942 static int __kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, 1943 struct kvm_mmu_page *root) 1944 { 1945 struct tdp_iter iter; 1946 gfn_t gfn = addr >> PAGE_SHIFT; 1947 int leaf = -1; 1948 1949 for_each_tdp_pte(iter, vcpu->kvm, root, gfn, gfn + 1) { 1950 leaf = iter.level; 1951 sptes[leaf] = iter.old_spte; 1952 } 1953 1954 return leaf; 1955 } 1956 1957 int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, 1958 int *root_level) 1959 { 1960 struct kvm_mmu_page *root = root_to_sp(vcpu->arch.mmu->root.hpa); 1961 *root_level = vcpu->arch.mmu->root_role.level; 1962 1963 return __kvm_tdp_mmu_get_walk(vcpu, addr, sptes, root); 1964 } 1965 1966 bool kvm_tdp_mmu_gpa_is_mapped(struct kvm_vcpu *vcpu, u64 gpa) 1967 { 1968 struct kvm *kvm = vcpu->kvm; 1969 bool is_direct = kvm_is_addr_direct(kvm, gpa); 1970 hpa_t root = is_direct ? vcpu->arch.mmu->root.hpa : 1971 vcpu->arch.mmu->mirror_root_hpa; 1972 u64 sptes[PT64_ROOT_MAX_LEVEL + 1], spte; 1973 int leaf; 1974 1975 lockdep_assert_held(&kvm->mmu_lock); 1976 rcu_read_lock(); 1977 leaf = __kvm_tdp_mmu_get_walk(vcpu, gpa, sptes, root_to_sp(root)); 1978 rcu_read_unlock(); 1979 if (leaf < 0) 1980 return false; 1981 1982 spte = sptes[leaf]; 1983 return is_shadow_present_pte(spte) && is_last_spte(spte, leaf); 1984 } 1985 EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_tdp_mmu_gpa_is_mapped); 1986 1987 /* 1988 * Returns the last level spte pointer of the shadow page walk for the given 1989 * gpa, and sets *spte to the spte value. This spte may be non-preset. If no 1990 * walk could be performed, returns NULL and *spte does not contain valid data. 1991 * 1992 * Contract: 1993 * - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}. 1994 * - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end. 1995 * 1996 * WARNING: This function is only intended to be called during fast_page_fault. 
1997 */ 1998 u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gfn_t gfn, 1999 u64 *spte) 2000 { 2001 /* Fast pf is not supported for mirrored roots */ 2002 struct kvm_mmu_page *root = tdp_mmu_get_root(vcpu, KVM_DIRECT_ROOTS); 2003 struct tdp_iter iter; 2004 tdp_ptep_t sptep = NULL; 2005 2006 for_each_tdp_pte(iter, vcpu->kvm, root, gfn, gfn + 1) { 2007 *spte = iter.old_spte; 2008 sptep = iter.sptep; 2009 } 2010 2011 /* 2012 * Perform the rcu_dereference to get the raw spte pointer value since 2013 * we are passing it up to fast_page_fault, which is shared with the 2014 * legacy MMU and thus does not retain the TDP MMU-specific __rcu 2015 * annotation. 2016 * 2017 * This is safe since fast_page_fault obeys the contracts of this 2018 * function as well as all TDP MMU contracts around modifying SPTEs 2019 * outside of mmu_lock. 2020 */ 2021 return rcu_dereference(sptep); 2022 } 2023