// SPDX-License-Identifier: GPL-2.0
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include "mmu.h"
#include "mmu_internal.h"
#include "mmutrace.h"
#include "tdp_iter.h"
#include "tdp_mmu.h"
#include "spte.h"

#include <asm/cmpxchg.h>
#include <trace/events/kvm.h>

/* Initializes the TDP MMU for the VM, if enabled. */
void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
{
        INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
        spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
}

/* Arbitrarily returns true so that this may be used in if statements. */
static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
                                                             bool shared)
{
        if (shared)
                lockdep_assert_held_read(&kvm->mmu_lock);
        else
                lockdep_assert_held_write(&kvm->mmu_lock);

        return true;
}

void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
{
        /*
         * Invalidate all roots, which besides the obvious, schedules all roots
         * for zapping and thus puts the TDP MMU's reference to each root, i.e.
         * ultimately frees all roots.
         */
        kvm_tdp_mmu_invalidate_roots(kvm, KVM_VALID_ROOTS);
        kvm_tdp_mmu_zap_invalidated_roots(kvm, false);

        WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages));
        WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));

        /*
         * Ensure that all the outstanding RCU callbacks to free shadow pages
         * can run before the VM is torn down.  Putting the last reference to
         * zapped roots will create new callbacks.
         */
        rcu_barrier();
}

static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
{
        free_page((unsigned long)sp->external_spt);
        free_page((unsigned long)sp->spt);
        kmem_cache_free(mmu_page_header_cache, sp);
}

/*
 * This is called through call_rcu in order to free TDP page table memory
 * safely with respect to other kernel threads that may be operating on
 * the memory.
 * By only accessing TDP MMU page table memory in an RCU read critical
 * section, and freeing it after a grace period, lockless access to that
 * memory won't use it after it is freed.
 */
static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
{
        struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
                                               rcu_head);

        tdp_mmu_free_sp(sp);
}

void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
{
        if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
                return;

        /*
         * The TDP MMU itself holds a reference to each root until the root is
         * explicitly invalidated, i.e. the final reference should never be
         * put for a valid root.
         */
        KVM_BUG_ON(!is_tdp_mmu_page(root) || !root->role.invalid, kvm);

        spin_lock(&kvm->arch.tdp_mmu_pages_lock);
        list_del_rcu(&root->link);
        spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
        call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
}

static bool tdp_mmu_root_match(struct kvm_mmu_page *root,
                               enum kvm_tdp_mmu_root_types types)
{
        if (WARN_ON_ONCE(!(types & KVM_VALID_ROOTS)))
                return false;

        if (root->role.invalid && !(types & KVM_INVALID_ROOTS))
                return false;

        if (likely(!is_mirror_sp(root)))
                return types & KVM_DIRECT_ROOTS;
        return types & KVM_MIRROR_ROOTS;
}

/*
 * Returns the next root after @prev_root (or the first root if @prev_root is
 * NULL) that matches @types.  A reference to the returned root is acquired,
 * and the reference to @prev_root is released (the caller obviously must hold
 * a reference to @prev_root if it's non-NULL).
 *
 * Roots that don't match @types are skipped.
 *
 * Returns NULL if the end of tdp_mmu_roots was reached.
 */
static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
                                              struct kvm_mmu_page *prev_root,
                                              enum kvm_tdp_mmu_root_types types)
{
        struct kvm_mmu_page *next_root;

        /*
         * While the roots themselves are RCU-protected, fields such as
         * role.invalid are protected by mmu_lock.
         */
        lockdep_assert_held(&kvm->mmu_lock);

        rcu_read_lock();

        if (prev_root)
                next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
                                                  &prev_root->link,
                                                  typeof(*prev_root), link);
        else
                next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
                                                   typeof(*next_root), link);

        while (next_root) {
                if (tdp_mmu_root_match(next_root, types) &&
                    kvm_tdp_mmu_get_root(next_root))
                        break;

                next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
                                &next_root->link, typeof(*next_root), link);
        }

        rcu_read_unlock();

        if (prev_root)
                kvm_tdp_mmu_put_root(kvm, prev_root);

        return next_root;
}

/*
 * Note: this iterator gets and puts references to the roots it iterates over.
 * This makes it safe to release the MMU lock and yield within the loop, but
 * if exiting the loop early, the caller must drop the reference to the most
 * recent root. (Unless keeping a live reference is desirable.)
 *
 * If shared is set, this function is operating under the MMU lock in read
 * mode.
 */
#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _types)        \
        for (_root = tdp_mmu_next_root(_kvm, NULL, _types);                    \
             ({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root;             \
             _root = tdp_mmu_next_root(_kvm, _root, _types))                   \
                if (_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) {      \
                } else

#define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id)            \
        __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, KVM_VALID_ROOTS)

#define for_each_tdp_mmu_root_yield_safe(_kvm, _root)                          \
        for (_root = tdp_mmu_next_root(_kvm, NULL, KVM_ALL_ROOTS);             \
             ({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root;             \
             _root = tdp_mmu_next_root(_kvm, _root, KVM_ALL_ROOTS))
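/*
 * Illustrative sketch (not a real caller): walking all valid roots with the
 * yield-safe iterator typically looks like the pattern below, with mmu_lock
 * held and the per-root work done inside the loop body.  See
 * kvm_tdp_mmu_zap_leafs() and kvm_tdp_mmu_wrprot_slot() below for real users.
 *
 *      lockdep_assert_held_write(&kvm->mmu_lock);
 *      for_each_valid_tdp_mmu_root_yield_safe(kvm, root, -1)
 *              flush = tdp_mmu_zap_leafs(kvm, root, start, end, true, flush);
 */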
/*
 * Iterate over all TDP MMU roots.  Requires that mmu_lock be held for write,
 * the implication being that any flow that holds mmu_lock for read is
 * inherently yield-friendly and should use the yield-safe variant above.
 * Holding mmu_lock for write obviates the need for RCU protection as the list
 * is guaranteed to be stable.
 */
#define __for_each_tdp_mmu_root(_kvm, _root, _as_id, _types)                   \
        list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)            \
                if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) &&           \
                    ((_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) ||   \
                     !tdp_mmu_root_match((_root), (_types)))) {                \
                } else

#define for_each_valid_tdp_mmu_root(_kvm, _root, _as_id)                       \
        __for_each_tdp_mmu_root(_kvm, _root, _as_id, KVM_VALID_ROOTS)

static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
{
        struct kvm_mmu_page *sp;

        sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
        sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);

        return sp;
}

static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
                            gfn_t gfn, union kvm_mmu_page_role role)
{
        INIT_LIST_HEAD(&sp->possible_nx_huge_page_link);

        set_page_private(virt_to_page(sp->spt), (unsigned long)sp);

        sp->role = role;
        sp->gfn = gfn;
        sp->ptep = sptep;
        sp->tdp_mmu_page = true;

        trace_kvm_mmu_get_page(sp, true);
}

static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
                                  struct tdp_iter *iter)
{
        struct kvm_mmu_page *parent_sp;
        union kvm_mmu_page_role role;

        parent_sp = sptep_to_sp(rcu_dereference(iter->sptep));

        role = parent_sp->role;
        role.level--;

        tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
}

void kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu, bool mirror)
{
        struct kvm_mmu *mmu = vcpu->arch.mmu;
        union kvm_mmu_page_role role = mmu->root_role;
        int as_id = kvm_mmu_role_as_id(role);
        struct kvm *kvm = vcpu->kvm;
        struct kvm_mmu_page *root;

        if (mirror)
                role.is_mirror = true;

        /*
         * Check for an existing root before acquiring the pages lock to avoid
         * unnecessary serialization if multiple vCPUs are loading a new root.
         * E.g. when bringing up secondary vCPUs, KVM will already have created
         * a valid root on behalf of the primary vCPU.
         */
        read_lock(&kvm->mmu_lock);

        for_each_valid_tdp_mmu_root_yield_safe(kvm, root, as_id) {
                if (root->role.word == role.word)
                        goto out_read_unlock;
        }

        spin_lock(&kvm->arch.tdp_mmu_pages_lock);

        /*
         * Recheck for an existing root after acquiring the pages lock, another
         * vCPU may have raced ahead and created a new usable root.  Manually
         * walk the list of roots as the standard macros assume that the pages
         * lock is *not* held.  WARN if grabbing a reference to a usable root
         * fails, as the last reference to a root can only be put *after* the
         * root has been invalidated, which requires holding mmu_lock for write.
         */
        list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
                if (root->role.word == role.word &&
                    !WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root)))
                        goto out_spin_unlock;
        }

        root = tdp_mmu_alloc_sp(vcpu);
        tdp_mmu_init_sp(root, NULL, 0, role);

        /*
         * TDP MMU roots are kept until they are explicitly invalidated, either
         * by a memslot update or by the destruction of the VM.  Initialize the
         * refcount to two; one reference for the vCPU, and one reference for
         * the TDP MMU itself, which is held until the root is invalidated and
         * is ultimately put by kvm_tdp_mmu_zap_invalidated_roots().
         */
        refcount_set(&root->tdp_mmu_root_count, 2);
        list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);

out_spin_unlock:
        spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
out_read_unlock:
        read_unlock(&kvm->mmu_lock);
        /*
         * Note, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS will prevent entering the guest
         * and actually consuming the root if it's invalidated after dropping
         * mmu_lock, and the root can't be freed as this vCPU holds a reference.
         */
        if (mirror) {
                mmu->mirror_root_hpa = __pa(root->spt);
        } else {
                mmu->root.hpa = __pa(root->spt);
                mmu->root.pgd = 0;
        }
}
static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
                                u64 old_spte, u64 new_spte, int level,
                                bool shared);

static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
        kvm_account_pgtable_pages((void *)sp->spt, +1);
        atomic64_inc(&kvm->arch.tdp_mmu_pages);
}

static void tdp_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
        kvm_account_pgtable_pages((void *)sp->spt, -1);
        atomic64_dec(&kvm->arch.tdp_mmu_pages);
}

/**
 * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages
 *
 * @kvm: kvm instance
 * @sp: the page to be removed
 */
static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
{
        tdp_unaccount_mmu_page(kvm, sp);

        if (!sp->nx_huge_page_disallowed)
                return;

        spin_lock(&kvm->arch.tdp_mmu_pages_lock);
        sp->nx_huge_page_disallowed = false;
        untrack_possible_nx_huge_page(kvm, sp);
        spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}

static void remove_external_spte(struct kvm *kvm, gfn_t gfn, u64 old_spte,
                                 int level)
{
        kvm_pfn_t old_pfn = spte_to_pfn(old_spte);
        int ret;

        /*
         * External (TDX) SPTEs are limited to PG_LEVEL_4K, and external
         * PTs are removed in a special order, involving free_external_spt().
         * But remove_external_spte() will be called on non-leaf PTEs via
         * __tdp_mmu_zap_root(), so avoid the error the former would return
         * in this case.
         */
        if (!is_last_spte(old_spte, level))
                return;

        /* Zapping a leaf SPTE is allowed only when the write lock is held. */
        lockdep_assert_held_write(&kvm->mmu_lock);
        /* Because the write lock is held, the operation should succeed. */
        ret = static_call(kvm_x86_remove_external_spte)(kvm, gfn, level, old_pfn);
        KVM_BUG_ON(ret, kvm);
}
/**
 * handle_removed_pt() - handle a page table removed from the TDP structure
 *
 * @kvm: kvm instance
 * @pt: the page removed from the paging structure
 * @shared: This operation may not be running under the exclusive use
 *          of the MMU lock and the operation must synchronize with other
 *          threads that might be modifying SPTEs.
 *
 * Given a page table that has been removed from the TDP paging structure,
 * iterates through the page table to clear SPTEs and free child page tables.
 *
 * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
 * protection. Since this thread removed it from the paging structure,
 * this thread will be responsible for ensuring the page is freed. Hence the
 * early rcu_dereferences in the function.
 */
static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
{
        struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
        int level = sp->role.level;
        gfn_t base_gfn = sp->gfn;
        int i;

        trace_kvm_mmu_prepare_zap_page(sp);

        tdp_mmu_unlink_sp(kvm, sp);

        for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
                tdp_ptep_t sptep = pt + i;
                gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
                u64 old_spte;

                if (shared) {
                        /*
                         * Set the SPTE to a nonpresent value that other
                         * threads will not overwrite. If the SPTE was
                         * already marked as frozen then another thread
                         * handling a page fault could overwrite it, so
                         * keep setting the SPTE until it is set from some
                         * other value to the frozen SPTE value.
                         */
                        for (;;) {
                                old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, FROZEN_SPTE);
                                if (!is_frozen_spte(old_spte))
                                        break;
                                cpu_relax();
                        }
                } else {
                        /*
                         * If the SPTE is not MMU-present, there is no backing
                         * page associated with the SPTE and so no side effects
                         * that need to be recorded, and exclusive ownership of
                         * mmu_lock ensures the SPTE can't be made present.
                         * Note, zapping MMIO SPTEs is also unnecessary as they
                         * are guarded by the memslots generation, not by being
                         * unreachable.
                         */
                        old_spte = kvm_tdp_mmu_read_spte(sptep);
                        if (!is_shadow_present_pte(old_spte))
                                continue;

                        /*
                         * Use the common helper instead of a raw WRITE_ONCE as
                         * the SPTE needs to be updated atomically if it can be
                         * modified by a different vCPU outside of mmu_lock.
                         * Even though the parent SPTE is !PRESENT, the TLB
                         * hasn't yet been flushed, and both Intel and AMD
                         * document that A/D assists can use upper-level PxE
                         * entries that are cached in the TLB, i.e. the CPU can
                         * still access the page and mark it dirty.
                         *
                         * No retry is needed in the atomic update path as the
                         * sole concern is dropping a Dirty bit, i.e. no other
                         * task can zap/remove the SPTE as mmu_lock is held for
                         * write.  Marking the SPTE as a frozen SPTE is not
                         * strictly necessary for the same reason, but using
                         * the frozen SPTE value keeps the shared/exclusive
                         * paths consistent and allows the handle_changed_spte()
                         * call below to hardcode the new value to FROZEN_SPTE.
                         *
                         * Note, even though dropping a Dirty bit is the only
                         * scenario where a non-atomic update could result in a
                         * functional bug, simply checking the Dirty bit isn't
                         * sufficient as a fast page fault could read the upper
                         * level SPTE before it is zapped, and then make this
                         * target SPTE writable, resume the guest, and set the
                         * Dirty bit between reading the SPTE above and writing
                         * it here.
                         */
                        old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte,
                                                          FROZEN_SPTE, level);
                }
                handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
                                    old_spte, FROZEN_SPTE, level, shared);

                if (is_mirror_sp(sp)) {
                        KVM_BUG_ON(shared, kvm);
                        remove_external_spte(kvm, gfn, old_spte, level);
                }
        }

        if (is_mirror_sp(sp) &&
            WARN_ON(static_call(kvm_x86_free_external_spt)(kvm, base_gfn, sp->role.level,
                                                           sp->external_spt))) {
                /*
                 * Failed to free the page table page in the mirror page table,
                 * and there is nothing further to do.  Intentionally leak the
                 * page to prevent the kernel from accessing the encrypted page.
                 */
                sp->external_spt = NULL;
        }

        call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
}
static void *get_external_spt(gfn_t gfn, u64 new_spte, int level)
{
        if (is_shadow_present_pte(new_spte) && !is_last_spte(new_spte, level)) {
                struct kvm_mmu_page *sp = spte_to_child_sp(new_spte);

                WARN_ON_ONCE(sp->role.level + 1 != level);
                WARN_ON_ONCE(sp->gfn != gfn);
                return sp->external_spt;
        }

        return NULL;
}

static int __must_check set_external_spte_present(struct kvm *kvm, tdp_ptep_t sptep,
                                                  gfn_t gfn, u64 old_spte,
                                                  u64 new_spte, int level)
{
        bool was_present = is_shadow_present_pte(old_spte);
        bool is_present = is_shadow_present_pte(new_spte);
        bool is_leaf = is_present && is_last_spte(new_spte, level);
        kvm_pfn_t new_pfn = spte_to_pfn(new_spte);
        int ret = 0;

        KVM_BUG_ON(was_present, kvm);

        lockdep_assert_held(&kvm->mmu_lock);
        /*
         * We need to lock out other updates to the SPTE until the external
         * page table has been modified.  Use FROZEN_SPTE similar to
         * the zapping case.
         */
        if (!try_cmpxchg64(rcu_dereference(sptep), &old_spte, FROZEN_SPTE))
                return -EBUSY;

        /*
         * Use a different call to either set up a middle level of the
         * external page table, or a leaf.
         */
        if (is_leaf) {
                ret = static_call(kvm_x86_set_external_spte)(kvm, gfn, level, new_pfn);
        } else {
                void *external_spt = get_external_spt(gfn, new_spte, level);

                KVM_BUG_ON(!external_spt, kvm);
                ret = static_call(kvm_x86_link_external_spt)(kvm, gfn, level, external_spt);
        }
        if (ret)
                __kvm_tdp_mmu_write_spte(sptep, old_spte);
        else
                __kvm_tdp_mmu_write_spte(sptep, new_spte);
        return ret;
}
/**
 * handle_changed_spte - handle bookkeeping associated with an SPTE change
 * @kvm: kvm instance
 * @as_id: the address space of the paging structure the SPTE was a part of
 * @gfn: the base GFN that was mapped by the SPTE
 * @old_spte: The value of the SPTE before the change
 * @new_spte: The value of the SPTE after the change
 * @level: the level of the PT the SPTE is part of in the paging structure
 * @shared: This operation may not be running under the exclusive use of
 *          the MMU lock and the operation must synchronize with other
 *          threads that might be modifying SPTEs.
 *
 * Handle bookkeeping that might result from the modification of a SPTE.  Note,
 * dirty logging updates are handled in common code, not here (see make_spte()
 * and fast_pf_fix_direct_spte()).
 */
static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
                                u64 old_spte, u64 new_spte, int level,
                                bool shared)
{
        bool was_present = is_shadow_present_pte(old_spte);
        bool is_present = is_shadow_present_pte(new_spte);
        bool was_leaf = was_present && is_last_spte(old_spte, level);
        bool is_leaf = is_present && is_last_spte(new_spte, level);
        bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

        WARN_ON_ONCE(level > PT64_ROOT_MAX_LEVEL);
        WARN_ON_ONCE(level < PG_LEVEL_4K);
        WARN_ON_ONCE(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));

        /*
         * If this warning were to trigger it would indicate that there was a
         * missing MMU notifier or a race with some notifier handler.
         * A present, leaf SPTE should never be directly replaced with another
         * present leaf SPTE pointing to a different PFN.  A notifier handler
         * should be zapping the SPTE before the main MM's page table is
         * changed, or the SPTE should be zeroed, and the TLBs flushed by the
         * thread before replacement.
         */
        if (was_leaf && is_leaf && pfn_changed) {
                pr_err("Invalid SPTE change: cannot replace a present leaf\n"
                       "SPTE with another present leaf SPTE mapping a\n"
                       "different PFN!\n"
                       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
                       as_id, gfn, old_spte, new_spte, level);

                /*
                 * Crash the host to prevent error propagation and guest data
                 * corruption.
                 */
                BUG();
        }

        if (old_spte == new_spte)
                return;

        trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);

        if (is_leaf)
                check_spte_writable_invariants(new_spte);

        /*
         * The only times a SPTE should be changed from a non-present to
         * non-present state is when an MMIO entry is installed/modified/
         * removed. In that case, there is nothing to do here.
         */
        if (!was_present && !is_present) {
                /*
                 * If this change does not involve a MMIO SPTE or frozen SPTE,
                 * it is unexpected. Log the change, though it should not
                 * impact the guest since both the former and current SPTEs
                 * are nonpresent.
                 */
                if (WARN_ON_ONCE(!is_mmio_spte(kvm, old_spte) &&
                                 !is_mmio_spte(kvm, new_spte) &&
                                 !is_frozen_spte(new_spte)))
                        pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
                               "should not be replaced with another,\n"
                               "different nonpresent SPTE, unless one or both\n"
                               "are MMIO SPTEs, or the new SPTE is\n"
                               "a temporary frozen SPTE.\n"
                               "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
                               as_id, gfn, old_spte, new_spte, level);
                return;
        }

        if (is_leaf != was_leaf)
                kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);

        /*
         * Recursively handle child PTs if the change removed a subtree from
         * the paging structure.  Note the WARN on the PFN changing without the
         * SPTE being converted to a hugepage (leaf) or being zapped.  Shadow
         * pages are kernel allocations and should never be migrated.
         */
        if (was_present && !was_leaf &&
            (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
                handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
}
static inline int __must_check __tdp_mmu_set_spte_atomic(struct kvm *kvm,
                                                         struct tdp_iter *iter,
                                                         u64 new_spte)
{
        /*
         * The caller is responsible for ensuring the old SPTE is not a FROZEN
         * SPTE.  KVM should never attempt to zap or manipulate a FROZEN SPTE,
         * and pre-checking before inserting a new SPTE is advantageous as it
         * avoids unnecessary work.
         */
        WARN_ON_ONCE(iter->yielded || is_frozen_spte(iter->old_spte));

        if (is_mirror_sptep(iter->sptep) && !is_frozen_spte(new_spte)) {
                int ret;

                /*
                 * Users of atomic zapping don't operate on mirror roots,
                 * so don't handle it and bug the VM if it's seen.
                 */
                if (KVM_BUG_ON(!is_shadow_present_pte(new_spte), kvm))
                        return -EBUSY;

                ret = set_external_spte_present(kvm, iter->sptep, iter->gfn,
                                                iter->old_spte, new_spte, iter->level);
                if (ret)
                        return ret;
        } else {
                u64 *sptep = rcu_dereference(iter->sptep);

                /*
                 * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs
                 * and does not hold the mmu_lock.  On failure, i.e. if a
                 * different logical CPU modified the SPTE, try_cmpxchg64()
                 * updates iter->old_spte with the current value, so the caller
                 * operates on fresh data, e.g. if it retries
                 * tdp_mmu_set_spte_atomic().
                 */
                if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte))
                        return -EBUSY;
        }

        return 0;
}

/*
 * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
 * and handle the associated bookkeeping.  Do not mark the page dirty
 * in KVM's dirty bitmaps.
 *
 * If setting the SPTE fails because it has changed, iter->old_spte will be
 * refreshed to the current value of the spte.
 *
 * @kvm: kvm instance
 * @iter: a tdp_iter instance currently on the SPTE that should be set
 * @new_spte: The value the SPTE should be set to
 * Return:
 * * 0      - If the SPTE was set.
 * * -EBUSY - If the SPTE cannot be set. In this case this function will have
 *            no side-effects other than setting iter->old_spte to the last
 *            known value of the spte.
 */
static inline int __must_check tdp_mmu_set_spte_atomic(struct kvm *kvm,
                                                       struct tdp_iter *iter,
                                                       u64 new_spte)
{
        int ret;

        lockdep_assert_held_read(&kvm->mmu_lock);

        ret = __tdp_mmu_set_spte_atomic(kvm, iter, new_spte);
        if (ret)
                return ret;

        handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
                            new_spte, iter->level, true);

        return 0;
}
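/*
 * Illustrative sketch (not a real caller): users of tdp_mmu_set_spte_atomic()
 * typically retry on -EBUSY, relying on iter->old_spte having been refreshed
 * with the current value of the SPTE.  See wrprot_gfn_range() and
 * clear_dirty_gfn_range() below for real users of this pattern.
 *
 *      retry:
 *              ...
 *              if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
 *                      goto retry;
 */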
/*
 * tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
 * @kvm:      KVM instance
 * @as_id:    Address space ID, i.e. regular vs. SMM
 * @sptep:    Pointer to the SPTE
 * @old_spte: The current value of the SPTE
 * @new_spte: The new value that will be set for the SPTE
 * @gfn:      The base GFN that was (or will be) mapped by the SPTE
 * @level:    The level _containing_ the SPTE (its parent PT's level)
 *
 * Returns the old SPTE value, which _may_ be different than @old_spte if the
 * SPTE had volatile bits.
 */
static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
                            u64 old_spte, u64 new_spte, gfn_t gfn, int level)
{
        lockdep_assert_held_write(&kvm->mmu_lock);

        /*
         * No thread should be using this function to set SPTEs to or from the
         * temporary frozen SPTE value.
         * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
         * should be used. If operating under the MMU lock in write mode, the
         * use of the frozen SPTE should not be necessary.
         */
        WARN_ON_ONCE(is_frozen_spte(old_spte) || is_frozen_spte(new_spte));

        old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);

        handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);

        /*
         * Users that do non-atomic setting of PTEs don't operate on mirror
         * roots, so don't handle it and bug the VM if it's seen.
         */
        if (is_mirror_sptep(sptep)) {
                KVM_BUG_ON(is_shadow_present_pte(new_spte), kvm);
                remove_external_spte(kvm, gfn, old_spte, level);
        }

        return old_spte;
}

static inline void tdp_mmu_iter_set_spte(struct kvm *kvm, struct tdp_iter *iter,
                                         u64 new_spte)
{
        WARN_ON_ONCE(iter->yielded);
        iter->old_spte = tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep,
                                          iter->old_spte, new_spte,
                                          iter->gfn, iter->level);
}

#define tdp_root_for_each_pte(_iter, _kvm, _root, _start, _end)        \
        for_each_tdp_pte(_iter, _kvm, _root, _start, _end)

#define tdp_root_for_each_leaf_pte(_iter, _kvm, _root, _start, _end)   \
        tdp_root_for_each_pte(_iter, _kvm, _root, _start, _end)        \
                if (!is_shadow_present_pte(_iter.old_spte) ||           \
                    !is_last_spte(_iter.old_spte, _iter.level))         \
                        continue;                                       \
                else

#define tdp_mmu_for_each_pte(_iter, _kvm, _root, _start, _end)         \
        for_each_tdp_pte(_iter, _kvm, _root, _start, _end)

static inline bool __must_check tdp_mmu_iter_need_resched(struct kvm *kvm,
                                                          struct tdp_iter *iter)
{
        if (!need_resched() && !rwlock_needbreak(&kvm->mmu_lock))
                return false;

        /* Ensure forward progress has been made before yielding. */
        return iter->next_last_level_gfn != iter->yielded_gfn;
}

/*
 * Yield if the MMU lock is contended or this thread needs to return control
 * to the scheduler.
 *
 * If this function should yield and flush is set, it will perform a remote
 * TLB flush before yielding.
 *
 * If this function yields, iter->yielded is set and the caller must skip to
 * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk
 * over the paging structures to allow the iterator to continue its traversal
 * from the paging structure root.
 *
 * Returns true if this function yielded.
 */
static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
                                                          struct tdp_iter *iter,
                                                          bool flush, bool shared)
{
        KVM_MMU_WARN_ON(iter->yielded);

        if (!tdp_mmu_iter_need_resched(kvm, iter))
                return false;

        if (flush)
                kvm_flush_remote_tlbs(kvm);

        rcu_read_unlock();

        if (shared)
                cond_resched_rwlock_read(&kvm->mmu_lock);
        else
                cond_resched_rwlock_write(&kvm->mmu_lock);

        rcu_read_lock();

        WARN_ON_ONCE(iter->gfn > iter->next_last_level_gfn);

        iter->yielded = true;
        return true;
}
static inline gfn_t tdp_mmu_max_gfn_exclusive(void)
{
        /*
         * Bound TDP MMU walks at host.MAXPHYADDR.  KVM disallows memslots with
         * a gpa range that would exceed the max gfn, and KVM does not create
         * MMIO SPTEs for "impossible" gfns, instead sending such accesses down
         * the slow emulation path every time.
         */
        return kvm_mmu_max_gfn() + 1;
}

static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
                               bool shared, int zap_level)
{
        struct tdp_iter iter;

        for_each_tdp_pte_min_level_all(iter, root, zap_level) {
retry:
                if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
                        continue;

                if (!is_shadow_present_pte(iter.old_spte))
                        continue;

                if (iter.level > zap_level)
                        continue;

                if (!shared)
                        tdp_mmu_iter_set_spte(kvm, &iter, SHADOW_NONPRESENT_VALUE);
                else if (tdp_mmu_set_spte_atomic(kvm, &iter, SHADOW_NONPRESENT_VALUE))
                        goto retry;
        }
}

static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
                             bool shared)
{

        /*
         * The root must have an elevated refcount so that it's reachable via
         * mmu_notifier callbacks, which allows this path to yield and drop
         * mmu_lock.  When handling an unmap/release mmu_notifier command, KVM
         * must drop all references to relevant pages prior to completing the
         * callback.  Dropping mmu_lock with an unreachable root would result
         * in zapping SPTEs after a relevant mmu_notifier callback completes
         * and lead to use-after-free as zapping a SPTE triggers "writeback" of
         * dirty accessed bits to the SPTE's associated struct page.
         */
        WARN_ON_ONCE(!refcount_read(&root->tdp_mmu_root_count));

        kvm_lockdep_assert_mmu_lock_held(kvm, shared);

        rcu_read_lock();

        /*
         * Zap roots in multiple passes of decreasing granularity, i.e. zap at
         * 4KiB=>2MiB=>1GiB=>root, in order to better honor need_resched() (all
         * preempt models) or mmu_lock contention (full or real-time models).
         * Zapping at finer granularity marginally increases the total time of
         * the zap, but in most cases the zap itself isn't latency sensitive.
         *
         * If KVM is configured to prove the MMU, skip the 4KiB and 2MiB zaps
         * in order to mimic the page fault path, which can replace a 1GiB page
         * table with an equivalent 1GiB hugepage, i.e. can get saddled with
         * zapping a 1GiB region that's fully populated with 4KiB SPTEs.  This
         * allows verifying that KVM can safely zap 1GiB regions, e.g. without
         * inducing RCU stalls, without relying on a relatively rare event
         * (zapping roots is orders of magnitude more common).  Note, because
         * zapping a SP recurses on its children, stepping down to PG_LEVEL_4K
         * in the iterator itself is unnecessary.
         */
        if (!IS_ENABLED(CONFIG_KVM_PROVE_MMU)) {
                __tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_4K);
                __tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_2M);
        }
        __tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G);
        __tdp_mmu_zap_root(kvm, root, shared, root->role.level);

        rcu_read_unlock();
}
bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
{
        u64 old_spte;

        /*
         * This helper intentionally doesn't allow zapping a root shadow page,
         * which doesn't have a parent page table and thus no associated entry.
         */
        if (WARN_ON_ONCE(!sp->ptep))
                return false;

        old_spte = kvm_tdp_mmu_read_spte(sp->ptep);
        if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
                return false;

        tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte,
                         SHADOW_NONPRESENT_VALUE, sp->gfn, sp->role.level + 1);

        return true;
}

/*
 * If can_yield is true, will release the MMU lock and reschedule if the
 * scheduler needs the CPU or there is contention on the MMU lock. If this
 * function cannot yield, it will not release the MMU lock or reschedule and
 * the caller must ensure it does not supply too large a GFN range, or the
 * operation can cause a soft lockup.
 */
static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
                              gfn_t start, gfn_t end, bool can_yield, bool flush)
{
        struct tdp_iter iter;

        end = min(end, tdp_mmu_max_gfn_exclusive());

        lockdep_assert_held_write(&kvm->mmu_lock);

        rcu_read_lock();

        for_each_tdp_pte_min_level(iter, kvm, root, PG_LEVEL_4K, start, end) {
                if (can_yield &&
                    tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
                        flush = false;
                        continue;
                }

                if (!is_shadow_present_pte(iter.old_spte) ||
                    !is_last_spte(iter.old_spte, iter.level))
                        continue;

                tdp_mmu_iter_set_spte(kvm, &iter, SHADOW_NONPRESENT_VALUE);

                /*
                 * Zapping SPTEs in invalid roots doesn't require a TLB flush,
                 * see kvm_tdp_mmu_zap_invalidated_roots() for details.
                 */
                if (!root->role.invalid)
                        flush = true;
        }

        rcu_read_unlock();

        /*
         * Because this flow zaps _only_ leaf SPTEs, the caller doesn't need
         * to provide RCU protection as no 'struct kvm_mmu_page' will be freed.
         */
        return flush;
}

/*
 * Zap leaf SPTEs for the range of gfns, [start, end), for all *VALID* roots.
 * Returns true if a TLB flush is needed before releasing the MMU lock, i.e. if
 * one or more SPTEs were zapped since the MMU lock was last acquired.
 */
bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush)
{
        struct kvm_mmu_page *root;

        lockdep_assert_held_write(&kvm->mmu_lock);
        for_each_valid_tdp_mmu_root_yield_safe(kvm, root, -1)
                flush = tdp_mmu_zap_leafs(kvm, root, start, end, true, flush);

        return flush;
}

void kvm_tdp_mmu_zap_all(struct kvm *kvm)
{
        struct kvm_mmu_page *root;

        /*
         * Zap all direct roots, including invalid direct roots, as all direct
         * SPTEs must be dropped before returning to the caller. For TDX, mirror
         * roots don't need handling in response to the mmu notifier (the caller).
         *
         * Zap directly even if the root is also being zapped by a concurrent
         * "fast zap".  Walking zapped top-level SPTEs isn't all that expensive
         * and mmu_lock is already held, which means the other thread has yielded.
         *
         * A TLB flush is unnecessary, KVM zaps everything if and only if the VM
         * is being destroyed or the userspace VMM has exited.  In both cases,
         * KVM_RUN is unreachable, i.e. no vCPUs will ever service the request.
         */
        lockdep_assert_held_write(&kvm->mmu_lock);
        __for_each_tdp_mmu_root_yield_safe(kvm, root, -1,
                                           KVM_DIRECT_ROOTS | KVM_INVALID_ROOTS)
                tdp_mmu_zap_root(kvm, root, false);
}
/*
 * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast
 * zap" completes.
 */
void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm, bool shared)
{
        struct kvm_mmu_page *root;

        if (shared)
                read_lock(&kvm->mmu_lock);
        else
                write_lock(&kvm->mmu_lock);

        for_each_tdp_mmu_root_yield_safe(kvm, root) {
                if (!root->tdp_mmu_scheduled_root_to_zap)
                        continue;

                root->tdp_mmu_scheduled_root_to_zap = false;
                KVM_BUG_ON(!root->role.invalid, kvm);

                /*
                 * A TLB flush is not necessary as KVM performs a local TLB
                 * flush when allocating a new root (see kvm_mmu_load()), and
                 * when migrating a vCPU to a different pCPU.  Note, the local
                 * TLB flush on reuse also invalidates paging-structure-cache
                 * entries, i.e. TLB entries for intermediate paging structures,
                 * that may be zapped, as such entries are associated with the
                 * ASID on both VMX and SVM.
                 */
                tdp_mmu_zap_root(kvm, root, shared);

                /*
                 * The reference needs to be put *after* zapping the root, as
                 * the root must be reachable by mmu_notifiers while it's being
                 * zapped.
                 */
                kvm_tdp_mmu_put_root(kvm, root);
        }

        if (shared)
                read_unlock(&kvm->mmu_lock);
        else
                write_unlock(&kvm->mmu_lock);
}

/*
 * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
 * is about to be zapped, e.g. in response to a memslots update.  The actual
 * zapping is done separately so that it happens with mmu_lock held for read,
 * whereas invalidating roots must be done with mmu_lock held for write (unless
 * the VM is being destroyed).
 *
 * Note, kvm_tdp_mmu_zap_invalidated_roots() is gifted the TDP MMU's reference.
 * See kvm_tdp_mmu_alloc_root().
 */
void kvm_tdp_mmu_invalidate_roots(struct kvm *kvm,
                                  enum kvm_tdp_mmu_root_types root_types)
{
        struct kvm_mmu_page *root;

        /*
         * Invalidating invalid roots doesn't make sense, prevent developers from
         * having to think about it.
         */
        if (WARN_ON_ONCE(root_types & KVM_INVALID_ROOTS))
                root_types &= ~KVM_INVALID_ROOTS;

        /*
         * mmu_lock must be held for write to ensure that a root doesn't become
         * invalid while there are active readers (invalidating a root while
         * there are active readers may or may not be problematic in practice,
         * but it's uncharted territory and not supported).
         *
         * Waive the assertion if there are no users of @kvm, i.e. the VM is
         * being destroyed after all references have been put, or if no vCPUs
         * have been created (which means there are no roots), i.e. the VM is
         * being destroyed in an error path of KVM_CREATE_VM.
         */
        if (IS_ENABLED(CONFIG_PROVE_LOCKING) &&
            refcount_read(&kvm->users_count) && kvm->created_vcpus)
                lockdep_assert_held_write(&kvm->mmu_lock);

        /*
         * As above, mmu_lock isn't held when destroying the VM!  There can't
         * be other references to @kvm, i.e. nothing else can invalidate roots
         * or get/put references to roots.
         */
        list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
                if (!tdp_mmu_root_match(root, root_types))
                        continue;

                /*
                 * Note, invalid roots can outlive a memslot update!  Invalid
                 * roots must be *zapped* before the memslot update completes,
                 * but a different task can acquire a reference and keep the
                 * root alive after it's been zapped.
                 */
                if (!root->role.invalid) {
                        root->tdp_mmu_scheduled_root_to_zap = true;
                        root->role.invalid = true;
                }
        }
}
/*
 * Installs a last-level SPTE to handle a TDP page fault.
 * (NPT/EPT violation/misconfiguration)
 */
static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
                                           struct kvm_page_fault *fault,
                                           struct tdp_iter *iter)
{
        struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
        u64 new_spte;
        int ret = RET_PF_FIXED;
        bool wrprot = false;

        if (WARN_ON_ONCE(sp->role.level != fault->goal_level))
                return RET_PF_RETRY;

        if (fault->prefetch && is_shadow_present_pte(iter->old_spte))
                return RET_PF_SPURIOUS;

        if (is_shadow_present_pte(iter->old_spte) &&
            is_access_allowed(fault, iter->old_spte) &&
            is_last_spte(iter->old_spte, iter->level))
                return RET_PF_SPURIOUS;

        if (unlikely(!fault->slot))
                new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
        else
                wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
                                   fault->pfn, iter->old_spte, fault->prefetch,
                                   false, fault->map_writable, &new_spte);

        if (new_spte == iter->old_spte)
                ret = RET_PF_SPURIOUS;
        else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
                return RET_PF_RETRY;
        else if (is_shadow_present_pte(iter->old_spte) &&
                 (!is_last_spte(iter->old_spte, iter->level) ||
                  WARN_ON_ONCE(leaf_spte_change_needs_tlb_flush(iter->old_spte, new_spte))))
                kvm_flush_remote_tlbs_gfn(vcpu->kvm, iter->gfn, iter->level);

        /*
         * If the page fault was caused by a write but the page is write
         * protected, emulation is needed. If the emulation was skipped,
         * the vCPU would have the same fault again.
         */
        if (wrprot && fault->write)
                ret = RET_PF_WRITE_PROTECTED;

        /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
        if (unlikely(is_mmio_spte(vcpu->kvm, new_spte))) {
                vcpu->stat.pf_mmio_spte_created++;
                trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
                                     new_spte);
                ret = RET_PF_EMULATE;
        } else {
                trace_kvm_mmu_set_spte(iter->level, iter->gfn,
                                       rcu_dereference(iter->sptep));
        }

        return ret;
}
/*
 * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the
 * provided page table.
 *
 * @kvm: kvm instance
 * @iter: a tdp_iter instance currently on the SPTE that should be set
 * @sp: The new TDP page table to install.
 * @shared: This operation is running under the MMU lock in read mode.
 *
 * Returns: 0 if the new page table was installed. Non-0 if the page table
 *          could not be installed (e.g. the atomic compare-exchange failed).
 */
static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
                           struct kvm_mmu_page *sp, bool shared)
{
        u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled);
        int ret = 0;

        if (shared) {
                ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
                if (ret)
                        return ret;
        } else {
                tdp_mmu_iter_set_spte(kvm, iter, spte);
        }

        tdp_account_mmu_page(kvm, sp);

        return 0;
}

static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
                                   struct kvm_mmu_page *sp, bool shared);

/*
 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
 * page tables and SPTEs to translate the faulting guest physical address.
 */
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
        struct kvm_mmu_page *root = tdp_mmu_get_root_for_fault(vcpu, fault);
        struct kvm *kvm = vcpu->kvm;
        struct tdp_iter iter;
        struct kvm_mmu_page *sp;
        int ret = RET_PF_RETRY;

        kvm_mmu_hugepage_adjust(vcpu, fault);

        trace_kvm_mmu_spte_requested(fault);

        rcu_read_lock();

        tdp_mmu_for_each_pte(iter, kvm, root, fault->gfn, fault->gfn + 1) {
                int r;

                if (fault->nx_huge_page_workaround_enabled)
                        disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);

                /*
                 * If SPTE has been frozen by another thread, just give up and
                 * retry, avoiding unnecessary page table allocation and free.
                 */
                if (is_frozen_spte(iter.old_spte))
                        goto retry;

                if (iter.level == fault->goal_level)
                        goto map_target_level;

                /* Step down into the lower level page table if it exists. */
                if (is_shadow_present_pte(iter.old_spte) &&
                    !is_large_pte(iter.old_spte))
                        continue;

                /*
                 * The SPTE is either non-present or points to a huge page that
                 * needs to be split.
                 */
                sp = tdp_mmu_alloc_sp(vcpu);
                tdp_mmu_init_child_sp(sp, &iter);
                if (is_mirror_sp(sp))
                        kvm_mmu_alloc_external_spt(vcpu, sp);

                sp->nx_huge_page_disallowed = fault->huge_page_disallowed;

                if (is_shadow_present_pte(iter.old_spte)) {
                        /* Don't support large page for mirrored roots (TDX) */
                        KVM_BUG_ON(is_mirror_sptep(iter.sptep), vcpu->kvm);
                        r = tdp_mmu_split_huge_page(kvm, &iter, sp, true);
                } else {
                        r = tdp_mmu_link_sp(kvm, &iter, sp, true);
                }

                /*
                 * Force the guest to retry if installing an upper level SPTE
                 * failed, e.g. because a different task modified the SPTE.
                 */
                if (r) {
                        tdp_mmu_free_sp(sp);
                        goto retry;
                }

                if (fault->huge_page_disallowed &&
                    fault->req_level >= iter.level) {
                        spin_lock(&kvm->arch.tdp_mmu_pages_lock);
                        if (sp->nx_huge_page_disallowed)
                                track_possible_nx_huge_page(kvm, sp);
                        spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
                }
        }

        /*
         * The walk aborted before reaching the target level, e.g. because the
         * iterator detected an upper level SPTE was frozen during traversal.
         */
        WARN_ON_ONCE(iter.level == fault->goal_level);
        goto retry;

map_target_level:
        ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);

retry:
        rcu_read_unlock();
        return ret;
}
/* Used by mmu notifier via kvm_unmap_gfn_range() */
bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
                                 bool flush)
{
        enum kvm_tdp_mmu_root_types types;
        struct kvm_mmu_page *root;

        types = kvm_gfn_range_filter_to_root_types(kvm, range->attr_filter) | KVM_INVALID_ROOTS;

        __for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, types)
                flush = tdp_mmu_zap_leafs(kvm, root, range->start, range->end,
                                          range->may_block, flush);

        return flush;
}

/*
 * Mark the SPTEs in the range of GFNs [start, end) unaccessed and return
 * non-zero if any of the GFNs in the range have been accessed.
 *
 * No need to mark the corresponding PFN as accessed as this call is coming
 * from the clear_young() or clear_flush_young() notifier, which uses the
 * return value to determine if the page has been accessed.
 */
static void kvm_tdp_mmu_age_spte(struct tdp_iter *iter)
{
        u64 new_spte;

        if (spte_ad_enabled(iter->old_spte)) {
                iter->old_spte = tdp_mmu_clear_spte_bits(iter->sptep,
                                                         iter->old_spte,
                                                         shadow_accessed_mask,
                                                         iter->level);
                new_spte = iter->old_spte & ~shadow_accessed_mask;
        } else {
                new_spte = mark_spte_for_access_track(iter->old_spte);
                iter->old_spte = kvm_tdp_mmu_write_spte(iter->sptep,
                                                        iter->old_spte, new_spte,
                                                        iter->level);
        }

        trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level,
                                       iter->old_spte, new_spte);
}

static bool __kvm_tdp_mmu_age_gfn_range(struct kvm *kvm,
                                        struct kvm_gfn_range *range,
                                        bool test_only)
{
        enum kvm_tdp_mmu_root_types types;
        struct kvm_mmu_page *root;
        struct tdp_iter iter;
        bool ret = false;

        types = kvm_gfn_range_filter_to_root_types(kvm, range->attr_filter);

        /*
         * Don't support rescheduling, none of the MMU notifiers that funnel
         * into this helper allow blocking; it'd be dead, wasteful code.  Note,
         * this helper must NOT be used to unmap GFNs, as it processes only
         * valid roots!
         */
        WARN_ON(types & ~KVM_VALID_ROOTS);
        __for_each_tdp_mmu_root(kvm, root, range->slot->as_id, types) {
                guard(rcu)();

                tdp_root_for_each_leaf_pte(iter, kvm, root, range->start, range->end) {
                        if (!is_accessed_spte(iter.old_spte))
                                continue;

                        if (test_only)
                                return true;

                        ret = true;
                        kvm_tdp_mmu_age_spte(&iter);
                }
        }

        return ret;
}

bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
        return __kvm_tdp_mmu_age_gfn_range(kvm, range, false);
}

bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
        return __kvm_tdp_mmu_age_gfn_range(kvm, range, true);
}
/*
 * Remove write access from all SPTEs at or above min_level that map GFNs
 * [start, end). Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
                             gfn_t start, gfn_t end, int min_level)
{
        struct tdp_iter iter;
        u64 new_spte;
        bool spte_set = false;

        rcu_read_lock();

        BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);

        for_each_tdp_pte_min_level(iter, kvm, root, min_level, start, end) {
retry:
                if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
                        continue;

                if (!is_shadow_present_pte(iter.old_spte) ||
                    !is_last_spte(iter.old_spte, iter.level) ||
                    !(iter.old_spte & PT_WRITABLE_MASK))
                        continue;

                new_spte = iter.old_spte & ~PT_WRITABLE_MASK;

                if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
                        goto retry;

                spte_set = true;
        }

        rcu_read_unlock();
        return spte_set;
}

/*
 * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
 * only affect leaf SPTEs down to min_level.
 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
 */
bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
                             const struct kvm_memory_slot *slot, int min_level)
{
        struct kvm_mmu_page *root;
        bool spte_set = false;

        lockdep_assert_held_read(&kvm->mmu_lock);

        for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
                spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
                                             slot->base_gfn + slot->npages, min_level);

        return spte_set;
}

static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(void)
{
        struct kvm_mmu_page *sp;

        sp = kmem_cache_zalloc(mmu_page_header_cache, GFP_KERNEL_ACCOUNT);
        if (!sp)
                return NULL;

        sp->spt = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
        if (!sp->spt) {
                kmem_cache_free(mmu_page_header_cache, sp);
                return NULL;
        }

        return sp;
}

/* Note, the caller is responsible for initializing @sp. */
static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
                                   struct kvm_mmu_page *sp, bool shared)
{
        const u64 huge_spte = iter->old_spte;
        const int level = iter->level;
        int ret, i;

        /*
         * No need for atomics when writing to sp->spt since the page table has
         * not been linked in yet and thus is not reachable from any other CPU.
         */
        for (i = 0; i < SPTE_ENT_PER_PAGE; i++)
                sp->spt[i] = make_small_spte(kvm, huge_spte, sp->role, i);

        /*
         * Replace the huge spte with a pointer to the populated lower level
         * page table. Since we are making this change without a TLB flush vCPUs
         * will see a mix of the split mappings and the original huge mapping,
         * depending on what's currently in their TLB. This is fine from a
         * correctness standpoint since the translation will be the same either
         * way.
         */
        ret = tdp_mmu_link_sp(kvm, iter, sp, shared);
        if (ret)
                goto out;

        /*
         * tdp_mmu_link_sp() will handle subtracting the huge page we are
         * overwriting from the page stats. But we have to manually update
         * the page stats with the new present child pages.
         */
        kvm_update_page_stats(kvm, level - 1, SPTE_ENT_PER_PAGE);

out:
        trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret);
        return ret;
}
static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
                                         struct kvm_mmu_page *root,
                                         gfn_t start, gfn_t end,
                                         int target_level, bool shared)
{
        struct kvm_mmu_page *sp = NULL;
        struct tdp_iter iter;

        rcu_read_lock();

        /*
         * Traverse the page table splitting all huge pages above the target
         * level into one lower level. For example, if we encounter a 1GB page
         * we split it into 512 2MB pages.
         *
         * Since the TDP iterator uses a pre-order traversal, we are guaranteed
         * to visit an SPTE before ever visiting its children, which means we
         * will correctly recursively split huge pages that are more than one
         * level above the target level (e.g. splitting a 1GB to 512 2MB pages,
         * and then splitting each of those to 512 4KB pages).
         */
        for_each_tdp_pte_min_level(iter, kvm, root, target_level + 1, start, end) {
retry:
                if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
                        continue;

                if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
                        continue;

                if (!sp) {
                        rcu_read_unlock();

                        if (shared)
                                read_unlock(&kvm->mmu_lock);
                        else
                                write_unlock(&kvm->mmu_lock);

                        sp = tdp_mmu_alloc_sp_for_split();

                        if (shared)
                                read_lock(&kvm->mmu_lock);
                        else
                                write_lock(&kvm->mmu_lock);

                        if (!sp) {
                                trace_kvm_mmu_split_huge_page(iter.gfn,
                                                              iter.old_spte,
                                                              iter.level, -ENOMEM);
                                return -ENOMEM;
                        }

                        rcu_read_lock();

                        iter.yielded = true;
                        continue;
                }

                tdp_mmu_init_child_sp(sp, &iter);

                if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
                        goto retry;

                sp = NULL;
        }

        rcu_read_unlock();

        /*
         * It's possible to exit the loop having never used the last sp if, for
         * example, a vCPU doing HugePage NX splitting wins the race and
         * installs its own sp in place of the last sp we tried to split.
         */
        if (sp)
                tdp_mmu_free_sp(sp);

        return 0;
}


/*
 * Try to split all huge pages mapped by the TDP MMU down to the target level.
 */
void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
                                      const struct kvm_memory_slot *slot,
                                      gfn_t start, gfn_t end,
                                      int target_level, bool shared)
{
        struct kvm_mmu_page *root;
        int r = 0;

        kvm_lockdep_assert_mmu_lock_held(kvm, shared);
        for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id) {
                r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared);
                if (r) {
                        kvm_tdp_mmu_put_root(kvm, root);
                        break;
                }
        }
}

static bool tdp_mmu_need_write_protect(struct kvm_mmu_page *sp)
{
        /*
         * All TDP MMU shadow pages share the same role as their root, aside
         * from level, so it is valid to key off any shadow page to determine if
         * write protection is needed for an entire tree.
         */
        return kvm_mmu_page_ad_need_write_protect(sp) || !kvm_ad_enabled;
}
static void clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
                                  gfn_t start, gfn_t end)
{
        const u64 dbit = tdp_mmu_need_write_protect(root) ? PT_WRITABLE_MASK :
                                                            shadow_dirty_mask;
        struct tdp_iter iter;

        rcu_read_lock();

        tdp_root_for_each_pte(iter, kvm, root, start, end) {
retry:
                if (!is_shadow_present_pte(iter.old_spte) ||
                    !is_last_spte(iter.old_spte, iter.level))
                        continue;

                if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
                        continue;

                KVM_MMU_WARN_ON(dbit == shadow_dirty_mask &&
                                spte_ad_need_write_protect(iter.old_spte));

                if (!(iter.old_spte & dbit))
                        continue;

                if (tdp_mmu_set_spte_atomic(kvm, &iter, iter.old_spte & ~dbit))
                        goto retry;
        }

        rcu_read_unlock();
}

/*
 * Clear the dirty status (D-bit or W-bit) of all the SPTEs mapping GFNs in the
 * memslot.
 */
void kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
                                  const struct kvm_memory_slot *slot)
{
        struct kvm_mmu_page *root;

        lockdep_assert_held_read(&kvm->mmu_lock);
        for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
                clear_dirty_gfn_range(kvm, root, slot->base_gfn,
                                      slot->base_gfn + slot->npages);
}

static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
                                  gfn_t gfn, unsigned long mask, bool wrprot)
{
        const u64 dbit = (wrprot || tdp_mmu_need_write_protect(root)) ? PT_WRITABLE_MASK :
                                                                        shadow_dirty_mask;
        struct tdp_iter iter;

        lockdep_assert_held_write(&kvm->mmu_lock);

        rcu_read_lock();

        tdp_root_for_each_leaf_pte(iter, kvm, root, gfn + __ffs(mask),
                                   gfn + BITS_PER_LONG) {
                if (!mask)
                        break;

                KVM_MMU_WARN_ON(dbit == shadow_dirty_mask &&
                                spte_ad_need_write_protect(iter.old_spte));

                if (iter.level > PG_LEVEL_4K ||
                    !(mask & (1UL << (iter.gfn - gfn))))
                        continue;

                mask &= ~(1UL << (iter.gfn - gfn));

                if (!(iter.old_spte & dbit))
                        continue;

                iter.old_spte = tdp_mmu_clear_spte_bits(iter.sptep,
                                                        iter.old_spte, dbit,
                                                        iter.level);

                trace_kvm_tdp_mmu_spte_changed(iter.as_id, iter.gfn, iter.level,
                                               iter.old_spte,
                                               iter.old_spte & ~dbit);
        }

        rcu_read_unlock();
}

/*
 * Clear the dirty status (D-bit or W-bit) of all the 4k SPTEs mapping GFNs for
 * which a bit is set in mask, starting at gfn. The given memslot is expected to
 * contain all the GFNs represented by set bits in the mask.
 */
void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
                                       struct kvm_memory_slot *slot,
                                       gfn_t gfn, unsigned long mask,
                                       bool wrprot)
{
        struct kvm_mmu_page *root;

        for_each_valid_tdp_mmu_root(kvm, root, slot->as_id)
                clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
}
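/*
 * For illustration: clear_dirty_pt_masked() maps each set bit in @mask to the
 * 4KiB SPTE at @gfn plus the bit's position.  E.g. with gfn == 0x1000 and
 * mask == 0x30 (bits 4 and 5 set), only the SPTEs for GFNs 0x1004 and 0x1005
 * are candidates, and each bit is cleared from the mask once its SPTE has
 * been visited.
 */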
static int tdp_mmu_make_huge_spte(struct kvm *kvm,
                                  struct tdp_iter *parent,
                                  u64 *huge_spte)
{
        struct kvm_mmu_page *root = spte_to_child_sp(parent->old_spte);
        gfn_t start = parent->gfn;
        gfn_t end = start + KVM_PAGES_PER_HPAGE(parent->level);
        struct tdp_iter iter;

        tdp_root_for_each_leaf_pte(iter, kvm, root, start, end) {
                /*
                 * Use the parent iterator when checking for forward progress so
                 * that KVM doesn't get stuck continuously trying to yield (i.e.
                 * returning -EAGAIN here and then failing the forward progress
                 * check in the caller ad nauseam).
                 */
                if (tdp_mmu_iter_need_resched(kvm, parent))
                        return -EAGAIN;

                *huge_spte = make_huge_spte(kvm, iter.old_spte, parent->level);
                return 0;
        }

        return -ENOENT;
}

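/*
 * Walk the slot's GFN range and attempt to replace non-leaf SPTEs with huge
 * SPTEs, starting at PG_LEVEL_2M.  Bails (with a WARN) if dirty logging is
 * enabled on the slot, and flushes remote TLBs for the memslot if any huge
 * SPTE is installed.
 */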
static void recover_huge_pages_range(struct kvm *kvm,
                                     struct kvm_mmu_page *root,
                                     const struct kvm_memory_slot *slot)
{
        gfn_t start = slot->base_gfn;
        gfn_t end = start + slot->npages;
        struct tdp_iter iter;
        int max_mapping_level;
        bool flush = false;
        u64 huge_spte;
        int r;

        if (WARN_ON_ONCE(kvm_slot_dirty_track_enabled(slot)))
                return;

        rcu_read_lock();

        for_each_tdp_pte_min_level(iter, kvm, root, PG_LEVEL_2M, start, end) {
retry:
                if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, true)) {
                        flush = false;
                        continue;
                }

                if (iter.level > KVM_MAX_HUGEPAGE_LEVEL ||
                    !is_shadow_present_pte(iter.old_spte))
                        continue;

                /*
                 * Don't zap leaf SPTEs; if a leaf SPTE could be replaced with
                 * a large page size, then its parent would have been zapped
                 * instead of stepping down.
                 */
                if (is_last_spte(iter.old_spte, iter.level))
                        continue;

                /*
                 * If iter.gfn resides outside of the slot, i.e. the page for
                 * the current level overlaps but is not contained by the slot,
                 * then the SPTE can't be made huge. More importantly, trying
                 * to query that info from slot->arch.lpage_info will cause an
                 * out-of-bounds access.
                 */
                if (iter.gfn < start || iter.gfn >= end)
                        continue;

                max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot, iter.gfn);
                if (max_mapping_level < iter.level)
                        continue;

                r = tdp_mmu_make_huge_spte(kvm, &iter, &huge_spte);
                if (r == -EAGAIN)
                        goto retry;
                else if (r)
                        continue;

                if (tdp_mmu_set_spte_atomic(kvm, &iter, huge_spte))
                        goto retry;

                flush = true;
        }

        if (flush)
                kvm_flush_remote_tlbs_memslot(kvm, slot);

        rcu_read_unlock();
}

/*
 * Recover huge page mappings within the slot by replacing non-leaf SPTEs with
 * huge SPTEs where possible.
 */
void kvm_tdp_mmu_recover_huge_pages(struct kvm *kvm,
                                    const struct kvm_memory_slot *slot)
{
        struct kvm_mmu_page *root;

        lockdep_assert_held_read(&kvm->mmu_lock);
        for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
                recover_huge_pages_range(kvm, root, slot);
}

/*
 * Removes write access on the last level SPTE mapping this GFN and unsets the
 * MMU-writable bit to ensure future writes continue to be intercepted.
 * Returns true if an SPTE was set and a TLB flush is needed.
 */
static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
                              gfn_t gfn, int min_level)
{
        struct tdp_iter iter;
        u64 new_spte;
        bool spte_set = false;

        BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);

        rcu_read_lock();

        for_each_tdp_pte_min_level(iter, kvm, root, min_level, gfn, gfn + 1) {
                if (!is_shadow_present_pte(iter.old_spte) ||
                    !is_last_spte(iter.old_spte, iter.level))
                        continue;

                new_spte = iter.old_spte &
                        ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);

                if (new_spte == iter.old_spte)
                        break;

                tdp_mmu_iter_set_spte(kvm, &iter, new_spte);
                spte_set = true;
        }

        rcu_read_unlock();

        return spte_set;
}

/*
 * Removes write access on the last level SPTE mapping this GFN and unsets the
 * MMU-writable bit to ensure future writes continue to be intercepted.
 * Returns true if an SPTE was set and a TLB flush is needed.
 */
bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
                                   struct kvm_memory_slot *slot, gfn_t gfn,
                                   int min_level)
{
        struct kvm_mmu_page *root;
        bool spte_set = false;

        lockdep_assert_held_write(&kvm->mmu_lock);
        for_each_valid_tdp_mmu_root(kvm, root, slot->as_id)
                spte_set |= write_protect_gfn(kvm, root, gfn, min_level);

        return spte_set;
}

/*
 * Return the level of the lowest level SPTE added to sptes.
 * That SPTE may be non-present.
 *
 * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
 */
int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
                         int *root_level)
{
        struct kvm_mmu_page *root = root_to_sp(vcpu->arch.mmu->root.hpa);
        struct tdp_iter iter;
        gfn_t gfn = addr >> PAGE_SHIFT;
        int leaf = -1;

        *root_level = vcpu->arch.mmu->root_role.level;

        tdp_mmu_for_each_pte(iter, vcpu->kvm, root, gfn, gfn + 1) {
                leaf = iter.level;
                sptes[leaf] = iter.old_spte;
        }

        return leaf;
}

/*
 * Returns the last level spte pointer of the shadow page walk for the given
 * gpa, and sets *spte to the spte value. This spte may be non-present. If no
 * walk could be performed, returns NULL and *spte does not contain valid data.
 *
 * Contract:
 *  - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
 *  - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
 *
 * WARNING: This function is only intended to be called during fast_page_fault.
 */
u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gfn_t gfn,
                                        u64 *spte)
{
        /* Fast pf is not supported for mirrored roots */
        struct kvm_mmu_page *root = tdp_mmu_get_root(vcpu, KVM_DIRECT_ROOTS);
        struct tdp_iter iter;
        tdp_ptep_t sptep = NULL;

        tdp_mmu_for_each_pte(iter, vcpu->kvm, root, gfn, gfn + 1) {
                *spte = iter.old_spte;
                sptep = iter.sptep;
        }

        /*
         * Perform the rcu_dereference to get the raw spte pointer value since
         * we are passing it up to fast_page_fault, which is shared with the
         * legacy MMU and thus does not retain the TDP MMU-specific __rcu
         * annotation.
         *
         * This is safe since fast_page_fault obeys the contracts of this
         * function as well as all TDP MMU contracts around modifying SPTEs
         * outside of mmu_lock.
         */
        return rcu_dereference(sptep);
}