// SPDX-License-Identifier: GPL-2.0
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include "mmu.h"
#include "mmu_internal.h"
#include "mmutrace.h"
#include "tdp_iter.h"
#include "tdp_mmu.h"
#include "spte.h"

#include <asm/cmpxchg.h>
#include <trace/events/kvm.h>

/* Initializes the TDP MMU for the VM, if enabled. */
void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
{
	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
}

/* Arbitrarily returns true so that this may be used in if statements. */
static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
							     bool shared)
{
	if (shared)
		lockdep_assert_held_read(&kvm->mmu_lock);
	else
		lockdep_assert_held_write(&kvm->mmu_lock);

	return true;
}

void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
{
	/*
	 * Invalidate all roots, which, besides the obvious, schedules all roots
	 * for zapping and thus puts the TDP MMU's reference to each root, i.e.
	 * ultimately frees all roots.
	 */
	kvm_tdp_mmu_invalidate_roots(kvm, KVM_VALID_ROOTS);
	kvm_tdp_mmu_zap_invalidated_roots(kvm, false);

	WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages));
	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));

	/*
	 * Ensure that all the outstanding RCU callbacks to free shadow pages
	 * can run before the VM is torn down.  Putting the last reference to
	 * zapped roots will create new callbacks.
	 */
	rcu_barrier();
}

static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
{
	free_page((unsigned long)sp->external_spt);
	free_page((unsigned long)sp->spt);
	kmem_cache_free(mmu_page_header_cache, sp);
}

/*
 * This is called through call_rcu in order to free TDP page table memory
 * safely with respect to other kernel threads that may be operating on
 * the memory.
 * By only accessing TDP MMU page table memory in an RCU read critical
 * section, and freeing it after a grace period, lockless access to that
 * memory won't use it after it is freed.
 */
static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
{
	struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
					       rcu_head);

	tdp_mmu_free_sp(sp);
}

void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
{
	if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
		return;

	/*
	 * The TDP MMU itself holds a reference to each root until the root is
	 * explicitly invalidated, i.e. the final reference should never be
	 * put for a valid root.
	 */
	KVM_BUG_ON(!is_tdp_mmu_page(root) || !root->role.invalid, kvm);

	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	list_del_rcu(&root->link);
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
	call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
}

static bool tdp_mmu_root_match(struct kvm_mmu_page *root,
			       enum kvm_tdp_mmu_root_types types)
{
	if (WARN_ON_ONCE(!(types & KVM_VALID_ROOTS)))
		return false;

	if (root->role.invalid && !(types & KVM_INVALID_ROOTS))
		return false;

	if (likely(!is_mirror_sp(root)))
		return types & KVM_DIRECT_ROOTS;
	return types & KVM_MIRROR_ROOTS;
}
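/*
 * Illustrative sketch: tdp_mmu_root_match() treats @types as a filter mask.
 * KVM_VALID_ROOTS accepts valid roots of either flavor (direct or mirror),
 * and adding KVM_INVALID_ROOTS also accepts roots that have been invalidated
 * but not yet freed.  A hypothetical caller that wants only valid direct
 * roots would filter a root list walk as:
 *
 *	if (!tdp_mmu_root_match(root, KVM_DIRECT_ROOTS))
 *		continue;
 *
 * while KVM_ALL_ROOTS, as used by for_each_tdp_mmu_root_yield_safe() below,
 * matches every root on the list.
 */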
/*
 * Returns the next root after @prev_root (or the first root if @prev_root is
 * NULL) that matches @types.  A reference to the returned root is acquired,
 * and the reference to @prev_root is released (the caller obviously must hold
 * a reference to @prev_root if it's non-NULL).
 *
 * Roots that don't match @types are skipped.
 *
 * Returns NULL if the end of tdp_mmu_roots was reached.
 */
static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
					      struct kvm_mmu_page *prev_root,
					      enum kvm_tdp_mmu_root_types types)
{
	struct kvm_mmu_page *next_root;

	/*
	 * While the roots themselves are RCU-protected, fields such as
	 * role.invalid are protected by mmu_lock.
	 */
	lockdep_assert_held(&kvm->mmu_lock);

	rcu_read_lock();

	if (prev_root)
		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						  &prev_root->link,
						  typeof(*prev_root), link);
	else
		next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						   typeof(*next_root), link);

	while (next_root) {
		if (tdp_mmu_root_match(next_root, types) &&
		    kvm_tdp_mmu_get_root(next_root))
			break;

		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						  &next_root->link,
						  typeof(*next_root), link);
	}

	rcu_read_unlock();

	if (prev_root)
		kvm_tdp_mmu_put_root(kvm, prev_root);

	return next_root;
}

/*
 * Note: this iterator gets and puts references to the roots it iterates over.
 * This makes it safe to release the MMU lock and yield within the loop, but
 * if exiting the loop early, the caller must drop the reference to the most
 * recent root. (Unless keeping a live reference is desirable.)
 *
 * If shared is set, this function is operating under the MMU lock in read
 * mode.
 */
#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _types)		\
	for (_root = tdp_mmu_next_root(_kvm, NULL, _types);			\
	     ({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root;		\
	     _root = tdp_mmu_next_root(_kvm, _root, _types))			\
		if (_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) {	\
		} else

#define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id)		\
	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, KVM_VALID_ROOTS)

#define for_each_tdp_mmu_root_yield_safe(_kvm, _root)				\
	for (_root = tdp_mmu_next_root(_kvm, NULL, KVM_ALL_ROOTS);		\
	     ({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root;		\
	     _root = tdp_mmu_next_root(_kvm, _root, KVM_ALL_ROOTS))

/*
 * Iterate over all TDP MMU roots.  Requires that mmu_lock be held for write,
 * the implication being that any flow that holds mmu_lock for read is
 * inherently yield-friendly and should use the yield-safe variant above.
 * Holding mmu_lock for write obviates the need for RCU protection as the list
 * is guaranteed to be stable.
 */
#define __for_each_tdp_mmu_root(_kvm, _root, _as_id, _types)			\
	list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)		\
		if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) &&		\
		    ((_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) ||	\
		     !tdp_mmu_root_match((_root), (_types)))) {			\
		} else
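/*
 * Illustrative sketch: because the yield-safe iterators above acquire a
 * reference to the current root, a caller that breaks out of the loop early
 * must drop that reference itself (unless it intends to keep the root live),
 * e.g. a hypothetical search over valid roots would look roughly like:
 *
 *	struct kvm_mmu_page *root;
 *
 *	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, -1) {
 *		if (found_what_we_wanted(root)) {
 *			kvm_tdp_mmu_put_root(kvm, root);
 *			break;
 *		}
 *	}
 *
 * Running the loop to completion needs no cleanup, as tdp_mmu_next_root()
 * puts the reference to the previous root on each step; see
 * kvm_tdp_mmu_try_split_huge_pages() for an in-tree example of the early
 * exit pattern.
 */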
/*
 * Iterate over all TDP MMU roots in an RCU read-side critical section.
 * It is safe to iterate over the SPTEs under the root, but their values will
 * be unstable, so all writes must be atomic. As this routine is meant to be
 * used without holding the mmu_lock at all, any bits that are flipped must
 * be reflected in kvm_tdp_mmu_spte_need_atomic_write().
 */
#define for_each_tdp_mmu_root_rcu(_kvm, _root, _as_id, _types)			\
	list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link)		\
		if ((_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) ||	\
		    !tdp_mmu_root_match((_root), (_types))) {			\
		} else

#define for_each_valid_tdp_mmu_root(_kvm, _root, _as_id)		\
	__for_each_tdp_mmu_root(_kvm, _root, _as_id, KVM_VALID_ROOTS)

static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu_page *sp;

	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);

	return sp;
}

static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
			    gfn_t gfn, union kvm_mmu_page_role role)
{
	INIT_LIST_HEAD(&sp->possible_nx_huge_page_link);

	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);

	sp->role = role;
	sp->gfn = gfn;
	sp->ptep = sptep;
	sp->tdp_mmu_page = true;

	trace_kvm_mmu_get_page(sp, true);
}

static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
				  struct tdp_iter *iter)
{
	struct kvm_mmu_page *parent_sp;
	union kvm_mmu_page_role role;

	parent_sp = sptep_to_sp(rcu_dereference(iter->sptep));

	role = parent_sp->role;
	role.level--;

	tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
}

void kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu, bool mirror)
{
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	union kvm_mmu_page_role role = mmu->root_role;
	int as_id = kvm_mmu_role_as_id(role);
	struct kvm *kvm = vcpu->kvm;
	struct kvm_mmu_page *root;

	if (mirror)
		role.is_mirror = true;

	/*
	 * Check for an existing root before acquiring the pages lock to avoid
	 * unnecessary serialization if multiple vCPUs are loading a new root.
	 * E.g. when bringing up secondary vCPUs, KVM will already have created
	 * a valid root on behalf of the primary vCPU.
	 */
	read_lock(&kvm->mmu_lock);

	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, as_id) {
		if (root->role.word == role.word)
			goto out_read_unlock;
	}

	spin_lock(&kvm->arch.tdp_mmu_pages_lock);

	/*
	 * Recheck for an existing root after acquiring the pages lock, another
	 * vCPU may have raced ahead and created a new usable root.  Manually
	 * walk the list of roots as the standard macros assume that the pages
	 * lock is *not* held.  WARN if grabbing a reference to a usable root
	 * fails, as the last reference to a root can only be put *after* the
	 * root has been invalidated, which requires holding mmu_lock for write.
	 */
	list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
		if (root->role.word == role.word &&
		    !WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root)))
			goto out_spin_unlock;
	}

	root = tdp_mmu_alloc_sp(vcpu);
	tdp_mmu_init_sp(root, NULL, 0, role);

	/*
	 * TDP MMU roots are kept until they are explicitly invalidated, either
	 * by a memslot update or by the destruction of the VM.  Initialize the
	 * refcount to two; one reference for the vCPU, and one reference for
	 * the TDP MMU itself, which is held until the root is invalidated and
	 * is ultimately put by kvm_tdp_mmu_zap_invalidated_roots().
	 */
	refcount_set(&root->tdp_mmu_root_count, 2);
	list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);

out_spin_unlock:
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
out_read_unlock:
	read_unlock(&kvm->mmu_lock);
	/*
	 * Note, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS will prevent entering the guest
	 * and actually consuming the root if it's invalidated after dropping
	 * mmu_lock, and the root can't be freed as this vCPU holds a reference.
	 */
	if (mirror) {
		mmu->mirror_root_hpa = __pa(root->spt);
	} else {
		mmu->root.hpa = __pa(root->spt);
		mmu->root.pgd = 0;
	}
}

static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level,
				bool shared);

static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	kvm_account_pgtable_pages((void *)sp->spt, +1);
	atomic64_inc(&kvm->arch.tdp_mmu_pages);
}

static void tdp_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	kvm_account_pgtable_pages((void *)sp->spt, -1);
	atomic64_dec(&kvm->arch.tdp_mmu_pages);
}

/**
 * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages
 *
 * @kvm: kvm instance
 * @sp: the page to be removed
 */
static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	tdp_unaccount_mmu_page(kvm, sp);

	if (!sp->nx_huge_page_disallowed)
		return;

	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	sp->nx_huge_page_disallowed = false;
	untrack_possible_nx_huge_page(kvm, sp);
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}

static void remove_external_spte(struct kvm *kvm, gfn_t gfn, u64 old_spte,
				 int level)
{
	kvm_pfn_t old_pfn = spte_to_pfn(old_spte);
	int ret;

	/*
	 * External (TDX) SPTEs are limited to PG_LEVEL_4K, and external
	 * PTs are removed in a special order, involving free_external_spt().
	 * But remove_external_spte() will be called on non-leaf PTEs via
	 * __tdp_mmu_zap_root(), so avoid the error the former would return
	 * in this case.
	 */
	if (!is_last_spte(old_spte, level))
		return;

	/* Zapping a leaf SPTE is allowed only when the write lock is held. */
	lockdep_assert_held_write(&kvm->mmu_lock);
	/* Because the write lock is held, the operation should succeed. */
	ret = static_call(kvm_x86_remove_external_spte)(kvm, gfn, level, old_pfn);
	KVM_BUG_ON(ret, kvm);
}

/**
 * handle_removed_pt() - handle a page table removed from the TDP structure
 *
 * @kvm: kvm instance
 * @pt: the page removed from the paging structure
 * @shared: This operation may not be running under the exclusive use
 *	    of the MMU lock and the operation must synchronize with other
 *	    threads that might be modifying SPTEs.
 *
 * Given a page table that has been removed from the TDP paging structure,
 * iterates through the page table to clear SPTEs and free child page tables.
 *
 * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
 * protection. Since this thread removed it from the paging structure,
 * this thread will be responsible for ensuring the page is freed. Hence the
 * early rcu_dereferences in the function.
 */
static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
{
	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
	int level = sp->role.level;
	gfn_t base_gfn = sp->gfn;
	int i;

	trace_kvm_mmu_prepare_zap_page(sp);

	tdp_mmu_unlink_sp(kvm, sp);

	for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
		tdp_ptep_t sptep = pt + i;
		gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
		u64 old_spte;

		if (shared) {
			/*
			 * Set the SPTE to a nonpresent value that other
			 * threads will not overwrite. If the SPTE was
			 * already marked as frozen then another thread
			 * handling a page fault could overwrite it, so
			 * set the SPTE until it is set from some other
			 * value to the frozen SPTE value.
			 */
			for (;;) {
				old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, FROZEN_SPTE);
				if (!is_frozen_spte(old_spte))
					break;
				cpu_relax();
			}
		} else {
			/*
			 * If the SPTE is not MMU-present, there is no backing
			 * page associated with the SPTE and so no side effects
			 * that need to be recorded, and exclusive ownership of
			 * mmu_lock ensures the SPTE can't be made present.
			 * Note, zapping MMIO SPTEs is also unnecessary as they
			 * are guarded by the memslots generation, not by being
			 * unreachable.
			 */
			old_spte = kvm_tdp_mmu_read_spte(sptep);
			if (!is_shadow_present_pte(old_spte))
				continue;

			/*
			 * Use the common helper instead of a raw WRITE_ONCE as
			 * the SPTE needs to be updated atomically if it can be
			 * modified by a different vCPU outside of mmu_lock.
			 * Even though the parent SPTE is !PRESENT, the TLB
			 * hasn't yet been flushed, and both Intel and AMD
			 * document that A/D assists can use upper-level PxE
			 * entries that are cached in the TLB, i.e. the CPU can
			 * still access the page and mark it dirty.
			 *
			 * No retry is needed in the atomic update path as the
			 * sole concern is dropping a Dirty bit, i.e. no other
			 * task can zap/remove the SPTE as mmu_lock is held for
			 * write.  Marking the SPTE as a frozen SPTE is not
			 * strictly necessary for the same reason, but using
			 * the frozen SPTE value keeps the shared/exclusive
			 * paths consistent and allows the handle_changed_spte()
			 * call below to hardcode the new value to FROZEN_SPTE.
			 *
			 * Note, even though dropping a Dirty bit is the only
			 * scenario where a non-atomic update could result in a
			 * functional bug, simply checking the Dirty bit isn't
			 * sufficient as a fast page fault could read the upper
			 * level SPTE before it is zapped, and then make this
			 * target SPTE writable, resume the guest, and set the
			 * Dirty bit between reading the SPTE above and writing
			 * it here.
			 */
			old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte,
							  FROZEN_SPTE, level);
		}
		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
				    old_spte, FROZEN_SPTE, level, shared);

		if (is_mirror_sp(sp)) {
			KVM_BUG_ON(shared, kvm);
			remove_external_spte(kvm, gfn, old_spte, level);
		}
	}

	if (is_mirror_sp(sp) &&
	    WARN_ON(static_call(kvm_x86_free_external_spt)(kvm, base_gfn, sp->role.level,
							   sp->external_spt))) {
		/*
		 * Failed to free page table page in mirror page table and
		 * there is nothing to do further.
		 * Intentionally leak the page to prevent the kernel from
		 * accessing the encrypted page.
		 */
		sp->external_spt = NULL;
	}

	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
}
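/*
 * Illustrative sketch: the FROZEN_SPTE handoff used above is a small
 * ownership protocol.  The thread tearing down a page table installs
 * FROZEN_SPTE with an atomic exchange, and any other thread that observes a
 * frozen value must back off and retry instead of modifying the entry,
 * roughly:
 *
 *	old_spte = kvm_tdp_mmu_read_spte(sptep);
 *	if (is_frozen_spte(old_spte))
 *		goto retry;	// another thread owns this SPTE right now
 *
 * which is the check the page fault path performs before touching an SPTE
 * (see kvm_tdp_mmu_map() below).
 */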
static void *get_external_spt(gfn_t gfn, u64 new_spte, int level)
{
	if (is_shadow_present_pte(new_spte) && !is_last_spte(new_spte, level)) {
		struct kvm_mmu_page *sp = spte_to_child_sp(new_spte);

		WARN_ON_ONCE(sp->role.level + 1 != level);
		WARN_ON_ONCE(sp->gfn != gfn);
		return sp->external_spt;
	}

	return NULL;
}

static int __must_check set_external_spte_present(struct kvm *kvm, tdp_ptep_t sptep,
						  gfn_t gfn, u64 old_spte,
						  u64 new_spte, int level)
{
	bool was_present = is_shadow_present_pte(old_spte);
	bool is_present = is_shadow_present_pte(new_spte);
	bool is_leaf = is_present && is_last_spte(new_spte, level);
	kvm_pfn_t new_pfn = spte_to_pfn(new_spte);
	int ret = 0;

	KVM_BUG_ON(was_present, kvm);

	lockdep_assert_held(&kvm->mmu_lock);
	/*
	 * We need to lock out other updates to the SPTE until the external
	 * page table has been modified.  Use FROZEN_SPTE similar to
	 * the zapping case.
	 */
	if (!try_cmpxchg64(rcu_dereference(sptep), &old_spte, FROZEN_SPTE))
		return -EBUSY;

	/*
	 * Use different call to either set up middle level
	 * external page table, or leaf.
	 */
	if (is_leaf) {
		ret = static_call(kvm_x86_set_external_spte)(kvm, gfn, level, new_pfn);
	} else {
		void *external_spt = get_external_spt(gfn, new_spte, level);

		KVM_BUG_ON(!external_spt, kvm);
		ret = static_call(kvm_x86_link_external_spt)(kvm, gfn, level, external_spt);
	}
	if (ret)
		__kvm_tdp_mmu_write_spte(sptep, old_spte);
	else
		__kvm_tdp_mmu_write_spte(sptep, new_spte);
	return ret;
}

/**
 * handle_changed_spte - handle bookkeeping associated with an SPTE change
 * @kvm: kvm instance
 * @as_id: the address space of the paging structure the SPTE was a part of
 * @gfn: the base GFN that was mapped by the SPTE
 * @old_spte: The value of the SPTE before the change
 * @new_spte: The value of the SPTE after the change
 * @level: the level of the PT the SPTE is part of in the paging structure
 * @shared: This operation may not be running under the exclusive use of
 *	    the MMU lock and the operation must synchronize with other
 *	    threads that might be modifying SPTEs.
 *
 * Handle bookkeeping that might result from the modification of a SPTE. Note,
 * dirty logging updates are handled in common code, not here (see make_spte()
 * and fast_pf_fix_direct_spte()).
 */
static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level,
				bool shared)
{
	bool was_present = is_shadow_present_pte(old_spte);
	bool is_present = is_shadow_present_pte(new_spte);
	bool was_leaf = was_present && is_last_spte(old_spte, level);
	bool is_leaf = is_present && is_last_spte(new_spte, level);
	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

	WARN_ON_ONCE(level > PT64_ROOT_MAX_LEVEL);
	WARN_ON_ONCE(level < PG_LEVEL_4K);
	WARN_ON_ONCE(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));

	/*
	 * If this warning were to trigger it would indicate that there was a
	 * missing MMU notifier or a race with some notifier handler.
	 * A present, leaf SPTE should never be directly replaced with another
	 * present leaf SPTE pointing to a different PFN. A notifier handler
	 * should be zapping the SPTE before the main MM's page table is
	 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
	 * thread before replacement.
	 */
	if (was_leaf && is_leaf && pfn_changed) {
		pr_err("Invalid SPTE change: cannot replace a present leaf\n"
		       "SPTE with another present leaf SPTE mapping a\n"
		       "different PFN!\n"
		       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
		       as_id, gfn, old_spte, new_spte, level);

		/*
		 * Crash the host to prevent error propagation and guest data
		 * corruption.
		 */
		BUG();
	}

	if (old_spte == new_spte)
		return;

	trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);

	if (is_leaf)
		check_spte_writable_invariants(new_spte);

	/*
	 * The only times a SPTE should be changed from a non-present to
	 * non-present state is when an MMIO entry is installed/modified/
	 * removed. In that case, there is nothing to do here.
	 */
	if (!was_present && !is_present) {
		/*
		 * If this change does not involve a MMIO SPTE or frozen SPTE,
		 * it is unexpected. Log the change, though it should not
		 * impact the guest since both the former and current SPTEs
		 * are nonpresent.
		 */
		if (WARN_ON_ONCE(!is_mmio_spte(kvm, old_spte) &&
				 !is_mmio_spte(kvm, new_spte) &&
				 !is_frozen_spte(new_spte)))
			pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
			       "should not be replaced with another,\n"
			       "different nonpresent SPTE, unless one or both\n"
			       "are MMIO SPTEs, or the new SPTE is\n"
			       "a temporary frozen SPTE.\n"
			       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
			       as_id, gfn, old_spte, new_spte, level);
		return;
	}

	if (is_leaf != was_leaf)
		kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);

	/*
	 * Recursively handle child PTs if the change removed a subtree from
	 * the paging structure.  Note the WARN on the PFN changing without the
	 * SPTE being converted to a hugepage (leaf) or being zapped.  Shadow
	 * pages are kernel allocations and should never be migrated.
	 */
	if (was_present && !was_leaf &&
	    (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
		handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
}

static inline int __must_check __tdp_mmu_set_spte_atomic(struct kvm *kvm,
							 struct tdp_iter *iter,
							 u64 new_spte)
{
	/*
	 * The caller is responsible for ensuring the old SPTE is not a FROZEN
	 * SPTE.  KVM should never attempt to zap or manipulate a FROZEN SPTE,
	 * and pre-checking before inserting a new SPTE is advantageous as it
	 * avoids unnecessary work.
	 */
	WARN_ON_ONCE(iter->yielded || is_frozen_spte(iter->old_spte));

	if (is_mirror_sptep(iter->sptep) && !is_frozen_spte(new_spte)) {
		int ret;

		/*
		 * Users of atomic zapping don't operate on mirror roots,
		 * so don't handle it and bug the VM if it's seen.
		 */
		if (KVM_BUG_ON(!is_shadow_present_pte(new_spte), kvm))
			return -EBUSY;

		ret = set_external_spte_present(kvm, iter->sptep, iter->gfn,
						iter->old_spte, new_spte, iter->level);
		if (ret)
			return ret;
	} else {
		u64 *sptep = rcu_dereference(iter->sptep);

		/*
		 * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs
		 * and does not hold the mmu_lock.  On failure, i.e. if a
		 * different logical CPU modified the SPTE, try_cmpxchg64()
		 * updates iter->old_spte with the current value, so the caller
		 * operates on fresh data, e.g. if it retries
		 * tdp_mmu_set_spte_atomic()
		 */
		if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte))
			return -EBUSY;
	}

	return 0;
}

/*
 * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
 * and handle the associated bookkeeping.  Do not mark the page dirty
 * in KVM's dirty bitmaps.
 *
 * If setting the SPTE fails because it has changed, iter->old_spte will be
 * refreshed to the current value of the spte.
 *
 * @kvm: kvm instance
 * @iter: a tdp_iter instance currently on the SPTE that should be set
 * @new_spte: The value the SPTE should be set to
 * Return:
 * * 0      - If the SPTE was set.
 * * -EBUSY - If the SPTE cannot be set. In this case this function will have
 *            no side-effects other than setting iter->old_spte to the last
 *            known value of the spte.
 */
static inline int __must_check tdp_mmu_set_spte_atomic(struct kvm *kvm,
							struct tdp_iter *iter,
							u64 new_spte)
{
	int ret;

	lockdep_assert_held_read(&kvm->mmu_lock);

	ret = __tdp_mmu_set_spte_atomic(kvm, iter, new_spte);
	if (ret)
		return ret;

	handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
			    new_spte, iter->level, true);

	return 0;
}

/*
 * tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
 * @kvm:	      KVM instance
 * @as_id:	      Address space ID, i.e. regular vs. SMM
 * @sptep:	      Pointer to the SPTE
 * @old_spte:	      The current value of the SPTE
 * @new_spte:	      The new value that will be set for the SPTE
 * @gfn:	      The base GFN that was (or will be) mapped by the SPTE
 * @level:	      The level _containing_ the SPTE (its parent PT's level)
 *
 * Returns the old SPTE value, which _may_ be different than @old_spte if the
 * SPTE had volatile bits.
 */
static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
			    u64 old_spte, u64 new_spte, gfn_t gfn, int level)
{
	lockdep_assert_held_write(&kvm->mmu_lock);

	/*
	 * No thread should be using this function to set SPTEs to or from the
	 * temporary frozen SPTE value.
	 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
	 * should be used. If operating under the MMU lock in write mode, the
	 * use of the frozen SPTE should not be necessary.
	 */
	WARN_ON_ONCE(is_frozen_spte(old_spte) || is_frozen_spte(new_spte));

	old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);

	handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);

	/*
	 * Users that do non-atomic setting of PTEs don't operate on mirror
	 * roots, so don't handle it and bug the VM if it's seen.
	 */
	if (is_mirror_sptep(sptep)) {
		KVM_BUG_ON(is_shadow_present_pte(new_spte), kvm);
		remove_external_spte(kvm, gfn, old_spte, level);
	}

	return old_spte;
}

static inline void tdp_mmu_iter_set_spte(struct kvm *kvm, struct tdp_iter *iter,
					 u64 new_spte)
{
	WARN_ON_ONCE(iter->yielded);
	iter->old_spte = tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep,
					  iter->old_spte, new_spte,
					  iter->gfn, iter->level);
}
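/*
 * Illustrative sketch: callers running with mmu_lock held for read typically
 * wrap tdp_mmu_set_spte_atomic() in a retry loop, relying on a failed update
 * having refreshed iter->old_spte.  The pattern used throughout this file is
 * roughly:
 *
 *	for_each_tdp_pte(iter, kvm, root, start, end) {
 * retry:
 *		// recompute new_spte from the now-current iter.old_spte
 *		if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
 *			goto retry;
 *	}
 *
 * whereas holders of mmu_lock for write use tdp_mmu_iter_set_spte(), which
 * cannot fail.
 */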
#define tdp_root_for_each_pte(_iter, _kvm, _root, _start, _end)	\
	for_each_tdp_pte(_iter, _kvm, _root, _start, _end)

#define tdp_root_for_each_leaf_pte(_iter, _kvm, _root, _start, _end)	\
	tdp_root_for_each_pte(_iter, _kvm, _root, _start, _end)		\
		if (!is_shadow_present_pte(_iter.old_spte) ||		\
		    !is_last_spte(_iter.old_spte, _iter.level))		\
			continue;					\
		else

static inline bool __must_check tdp_mmu_iter_need_resched(struct kvm *kvm,
							  struct tdp_iter *iter)
{
	if (!need_resched() && !rwlock_needbreak(&kvm->mmu_lock))
		return false;

	/* Ensure forward progress has been made before yielding. */
	return iter->next_last_level_gfn != iter->yielded_gfn;
}

/*
 * Yield if the MMU lock is contended or this thread needs to return control
 * to the scheduler.
 *
 * If this function should yield and flush is set, it will perform a remote
 * TLB flush before yielding.
 *
 * If this function yields, iter->yielded is set and the caller must skip to
 * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk
 * over the paging structures to allow the iterator to continue its traversal
 * from the paging structure root.
 *
 * Returns true if this function yielded.
 */
static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
							  struct tdp_iter *iter,
							  bool flush, bool shared)
{
	KVM_MMU_WARN_ON(iter->yielded);

	if (!tdp_mmu_iter_need_resched(kvm, iter))
		return false;

	if (flush)
		kvm_flush_remote_tlbs(kvm);

	rcu_read_unlock();

	if (shared)
		cond_resched_rwlock_read(&kvm->mmu_lock);
	else
		cond_resched_rwlock_write(&kvm->mmu_lock);

	rcu_read_lock();

	WARN_ON_ONCE(iter->gfn > iter->next_last_level_gfn);

	iter->yielded = true;
	return true;
}
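/*
 * Illustrative sketch: tdp_mmu_iter_cond_resched() drops mmu_lock and the RCU
 * read lock when it yields, so callers must treat a "true" return as "restart
 * this iteration" rather than keep using stale iterator state, e.g.:
 *
 *	for_each_tdp_pte(iter, kvm, root, start, end) {
 *		if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, shared)) {
 *			flush = false;	// the helper already flushed if asked
 *			continue;	// tdp_iter_next() restarts the walk
 *		}
 *		...
 *	}
 *
 * This mirrors how tdp_mmu_zap_leafs() and the zap/split loops below consume
 * the helper.
 */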
static inline gfn_t tdp_mmu_max_gfn_exclusive(void)
{
	/*
	 * Bound TDP MMU walks at host.MAXPHYADDR.  KVM disallows memslots with
	 * a gpa range that would exceed the max gfn, and KVM does not create
	 * MMIO SPTEs for "impossible" gfns, instead sending such accesses down
	 * the slow emulation path every time.
	 */
	return kvm_mmu_max_gfn() + 1;
}

static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
			       bool shared, int zap_level)
{
	struct tdp_iter iter;

	for_each_tdp_pte_min_level_all(iter, root, zap_level) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
			continue;

		if (!is_shadow_present_pte(iter.old_spte))
			continue;

		if (iter.level > zap_level)
			continue;

		if (!shared)
			tdp_mmu_iter_set_spte(kvm, &iter, SHADOW_NONPRESENT_VALUE);
		else if (tdp_mmu_set_spte_atomic(kvm, &iter, SHADOW_NONPRESENT_VALUE))
			goto retry;
	}
}

static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
			     bool shared)
{

	/*
	 * The root must have an elevated refcount so that it's reachable via
	 * mmu_notifier callbacks, which allows this path to yield and drop
	 * mmu_lock.  When handling an unmap/release mmu_notifier command, KVM
	 * must drop all references to relevant pages prior to completing the
	 * callback.  Dropping mmu_lock with an unreachable root would result
	 * in zapping SPTEs after a relevant mmu_notifier callback completes
	 * and lead to use-after-free as zapping a SPTE triggers "writeback" of
	 * dirty accessed bits to the SPTE's associated struct page.
	 */
	WARN_ON_ONCE(!refcount_read(&root->tdp_mmu_root_count));

	kvm_lockdep_assert_mmu_lock_held(kvm, shared);

	rcu_read_lock();

	/*
	 * Zap roots in multiple passes of decreasing granularity, i.e. zap at
	 * 4KiB=>2MiB=>1GiB=>root, in order to better honor need_resched() (all
	 * preempt models) or mmu_lock contention (full or real-time models).
	 * Zapping at finer granularity marginally increases the total time of
	 * the zap, but in most cases the zap itself isn't latency sensitive.
	 *
	 * If KVM is configured to prove the MMU, skip the 4KiB and 2MiB zaps
	 * in order to mimic the page fault path, which can replace a 1GiB page
	 * table with an equivalent 1GiB hugepage, i.e. can get saddled with
	 * zapping a 1GiB region that's fully populated with 4KiB SPTEs.  This
	 * allows verifying that KVM can safely zap 1GiB regions, e.g. without
	 * inducing RCU stalls, without relying on a relatively rare event
	 * (zapping roots is orders of magnitude more common).  Note, because
	 * zapping a SP recurses on its children, stepping down to PG_LEVEL_4K
	 * in the iterator itself is unnecessary.
	 */
	if (!IS_ENABLED(CONFIG_KVM_PROVE_MMU)) {
		__tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_4K);
		__tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_2M);
	}
	__tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G);
	__tdp_mmu_zap_root(kvm, root, shared, root->role.level);

	rcu_read_unlock();
}

bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	u64 old_spte;

	/*
	 * This helper intentionally doesn't allow zapping a root shadow page,
	 * which doesn't have a parent page table and thus no associated entry.
	 */
	if (WARN_ON_ONCE(!sp->ptep))
		return false;

	old_spte = kvm_tdp_mmu_read_spte(sp->ptep);
	if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
		return false;

	tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte,
			 SHADOW_NONPRESENT_VALUE, sp->gfn, sp->role.level + 1);

	return true;
}

/*
 * If can_yield is true, will release the MMU lock and reschedule if the
 * scheduler needs the CPU or there is contention on the MMU lock. If this
 * function cannot yield, it will not release the MMU lock or reschedule and
 * the caller must ensure it does not supply too large a GFN range, or the
 * operation can cause a soft lockup.
 */
static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
			      gfn_t start, gfn_t end, bool can_yield, bool flush)
{
	struct tdp_iter iter;

	end = min(end, tdp_mmu_max_gfn_exclusive());

	lockdep_assert_held_write(&kvm->mmu_lock);

	rcu_read_lock();

	for_each_tdp_pte_min_level(iter, kvm, root, PG_LEVEL_4K, start, end) {
		if (can_yield &&
		    tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
			flush = false;
			continue;
		}

		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		tdp_mmu_iter_set_spte(kvm, &iter, SHADOW_NONPRESENT_VALUE);

		/*
		 * Zapping SPTEs in invalid roots doesn't require a TLB flush,
		 * see kvm_tdp_mmu_zap_invalidated_roots() for details.
		 */
		if (!root->role.invalid)
			flush = true;
	}

	rcu_read_unlock();

	/*
	 * Because this flow zaps _only_ leaf SPTEs, the caller doesn't need
	 * to provide RCU protection as no 'struct kvm_mmu_page' will be freed.
	 */
	return flush;
}

/*
 * Zap leaf SPTEs for the range of gfns, [start, end), for all *valid* roots.
 * Returns true if a TLB flush is needed before releasing the MMU lock, i.e. if
 * one or more SPTEs were zapped since the MMU lock was last acquired.
 */
bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush)
{
	struct kvm_mmu_page *root;

	lockdep_assert_held_write(&kvm->mmu_lock);
	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, -1)
		flush = tdp_mmu_zap_leafs(kvm, root, start, end, true, flush);

	return flush;
}

void kvm_tdp_mmu_zap_all(struct kvm *kvm)
{
	struct kvm_mmu_page *root;

	/*
	 * Zap all direct roots, including invalid direct roots, as all direct
	 * SPTEs must be dropped before returning to the caller. For TDX, mirror
	 * roots don't need handling in response to the mmu notifier (the caller).
	 *
	 * Zap directly even if the root is also being zapped by a concurrent
	 * "fast zap".  Walking zapped top-level SPTEs isn't all that expensive
	 * and mmu_lock is already held, which means the other thread has yielded.
	 *
	 * A TLB flush is unnecessary, KVM zaps everything if and only if the VM
	 * is being destroyed or the userspace VMM has exited.  In both cases,
	 * KVM_RUN is unreachable, i.e. no vCPUs will ever service the request.
	 */
	lockdep_assert_held_write(&kvm->mmu_lock);
	__for_each_tdp_mmu_root_yield_safe(kvm, root, -1,
					   KVM_DIRECT_ROOTS | KVM_INVALID_ROOTS)
		tdp_mmu_zap_root(kvm, root, false);
}
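/*
 * Illustrative sketch: a "fast zap" is a two-phase operation; roots are first
 * marked invalid with mmu_lock held for write, and the actual zapping happens
 * afterwards, possibly with mmu_lock held only for read.
 * kvm_mmu_uninit_tdp_mmu() above is the simplest consumer of the pair:
 *
 *	kvm_tdp_mmu_invalidate_roots(kvm, KVM_VALID_ROOTS);
 *	kvm_tdp_mmu_zap_invalidated_roots(kvm, false);
 *
 * A memslot-update style flow would instead pass shared=true to the second
 * call so that vCPUs can fault in pages on their new roots concurrently.
 */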
/*
 * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast
 * zap" completes.
 */
void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm, bool shared)
{
	struct kvm_mmu_page *root;

	if (shared)
		read_lock(&kvm->mmu_lock);
	else
		write_lock(&kvm->mmu_lock);

	for_each_tdp_mmu_root_yield_safe(kvm, root) {
		if (!root->tdp_mmu_scheduled_root_to_zap)
			continue;

		root->tdp_mmu_scheduled_root_to_zap = false;
		KVM_BUG_ON(!root->role.invalid, kvm);

		/*
		 * A TLB flush is not necessary as KVM performs a local TLB
		 * flush when allocating a new root (see kvm_mmu_load()), and
		 * when migrating a vCPU to a different pCPU.  Note, the local
		 * TLB flush on reuse also invalidates paging-structure-cache
		 * entries, i.e. TLB entries for intermediate paging structures,
		 * that may be zapped, as such entries are associated with the
		 * ASID on both VMX and SVM.
		 */
		tdp_mmu_zap_root(kvm, root, shared);

		/*
		 * The reference needs to be put *after* zapping the root, as
		 * the root must be reachable by mmu_notifiers while it's being
		 * zapped.
		 */
		kvm_tdp_mmu_put_root(kvm, root);
	}

	if (shared)
		read_unlock(&kvm->mmu_lock);
	else
		write_unlock(&kvm->mmu_lock);
}

/*
 * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
 * is about to be zapped, e.g. in response to a memslots update.  The actual
 * zapping is done separately so that it happens with mmu_lock held for read,
 * whereas invalidating roots must be done with mmu_lock held for write (unless
 * the VM is being destroyed).
 *
 * Note, kvm_tdp_mmu_zap_invalidated_roots() is gifted the TDP MMU's reference.
 * See kvm_tdp_mmu_alloc_root().
 */
void kvm_tdp_mmu_invalidate_roots(struct kvm *kvm,
				  enum kvm_tdp_mmu_root_types root_types)
{
	struct kvm_mmu_page *root;

	/*
	 * Invalidating invalid roots doesn't make sense, prevent developers from
	 * having to think about it.
	 */
	if (WARN_ON_ONCE(root_types & KVM_INVALID_ROOTS))
		root_types &= ~KVM_INVALID_ROOTS;

	/*
	 * mmu_lock must be held for write to ensure that a root doesn't become
	 * invalid while there are active readers (invalidating a root while
	 * there are active readers may or may not be problematic in practice,
	 * but it's uncharted territory and not supported).
	 *
	 * Waive the assertion if there are no users of @kvm, i.e. the VM is
	 * being destroyed after all references have been put, or if no vCPUs
	 * have been created (which means there are no roots), i.e. the VM is
	 * being destroyed in an error path of KVM_CREATE_VM.
	 */
	if (IS_ENABLED(CONFIG_PROVE_LOCKING) &&
	    refcount_read(&kvm->users_count) && kvm->created_vcpus)
		lockdep_assert_held_write(&kvm->mmu_lock);

	/*
	 * As above, mmu_lock isn't held when destroying the VM!  There can't
	 * be other references to @kvm, i.e. nothing else can invalidate roots
	 * or get/put references to roots.
	 */
	list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
		if (!tdp_mmu_root_match(root, root_types))
			continue;

		/*
		 * Note, invalid roots can outlive a memslot update!  Invalid
		 * roots must be *zapped* before the memslot update completes,
		 * but a different task can acquire a reference and keep the
		 * root alive after it's been zapped.
		 */
		if (!root->role.invalid) {
			root->tdp_mmu_scheduled_root_to_zap = true;
			root->role.invalid = true;
		}
	}
}

/*
 * Installs a last-level SPTE to handle a TDP page fault.
 * (NPT/EPT violation/misconfiguration)
 */
static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
					   struct kvm_page_fault *fault,
					   struct tdp_iter *iter)
{
	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
	u64 new_spte;
	int ret = RET_PF_FIXED;
	bool wrprot = false;

	if (WARN_ON_ONCE(sp->role.level != fault->goal_level))
		return RET_PF_RETRY;

	if (fault->prefetch && is_shadow_present_pte(iter->old_spte))
		return RET_PF_SPURIOUS;

	if (is_shadow_present_pte(iter->old_spte) &&
	    is_access_allowed(fault, iter->old_spte) &&
	    is_last_spte(iter->old_spte, iter->level))
		return RET_PF_SPURIOUS;

	if (unlikely(!fault->slot))
		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
	else
		wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
				   fault->pfn, iter->old_spte, fault->prefetch,
				   false, fault->map_writable, &new_spte);

	if (new_spte == iter->old_spte)
		ret = RET_PF_SPURIOUS;
	else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
		return RET_PF_RETRY;
	else if (is_shadow_present_pte(iter->old_spte) &&
		 (!is_last_spte(iter->old_spte, iter->level) ||
		  WARN_ON_ONCE(leaf_spte_change_needs_tlb_flush(iter->old_spte, new_spte))))
		kvm_flush_remote_tlbs_gfn(vcpu->kvm, iter->gfn, iter->level);

	/*
	 * If the page fault was caused by a write but the page is write
	 * protected, emulation is needed. If the emulation was skipped,
	 * the vCPU would have the same fault again.
	 */
	if (wrprot && fault->write)
		ret = RET_PF_WRITE_PROTECTED;

	/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
	if (unlikely(is_mmio_spte(vcpu->kvm, new_spte))) {
		vcpu->stat.pf_mmio_spte_created++;
		trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
				     new_spte);
		ret = RET_PF_EMULATE;
	} else {
		trace_kvm_mmu_set_spte(iter->level, iter->gfn,
				       rcu_dereference(iter->sptep));
	}

	return ret;
}

/*
 * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the
 * provided page table.
 *
 * @kvm: kvm instance
 * @iter: a tdp_iter instance currently on the SPTE that should be set
 * @sp: The new TDP page table to install.
 * @shared: This operation is running under the MMU lock in read mode.
 *
 * Returns: 0 if the new page table was installed. Non-0 if the page table
 *	    could not be installed (e.g. the atomic compare-exchange failed).
 */
static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
			   struct kvm_mmu_page *sp, bool shared)
{
	u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled);
	int ret = 0;

	if (shared) {
		ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
		if (ret)
			return ret;
	} else {
		tdp_mmu_iter_set_spte(kvm, iter, spte);
	}

	tdp_account_mmu_page(kvm, sp);

	return 0;
}

static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
				   struct kvm_mmu_page *sp, bool shared);

/*
 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
 * page tables and SPTEs to translate the faulting guest physical address.
 */
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
	struct kvm_mmu_page *root = tdp_mmu_get_root_for_fault(vcpu, fault);
	struct kvm *kvm = vcpu->kvm;
	struct tdp_iter iter;
	struct kvm_mmu_page *sp;
	int ret = RET_PF_RETRY;

	kvm_mmu_hugepage_adjust(vcpu, fault);

	trace_kvm_mmu_spte_requested(fault);

	rcu_read_lock();

	for_each_tdp_pte(iter, kvm, root, fault->gfn, fault->gfn + 1) {
		int r;

		if (fault->nx_huge_page_workaround_enabled)
			disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);

		/*
		 * If SPTE has been frozen by another thread, just give up and
		 * retry, avoiding unnecessary page table allocation and free.
		 */
		if (is_frozen_spte(iter.old_spte))
			goto retry;

		if (iter.level == fault->goal_level)
			goto map_target_level;

		/* Step down into the lower level page table if it exists. */
		if (is_shadow_present_pte(iter.old_spte) &&
		    !is_large_pte(iter.old_spte))
			continue;

		/*
		 * The SPTE is either non-present or points to a huge page that
		 * needs to be split.
		 */
		sp = tdp_mmu_alloc_sp(vcpu);
		tdp_mmu_init_child_sp(sp, &iter);
		if (is_mirror_sp(sp))
			kvm_mmu_alloc_external_spt(vcpu, sp);

		sp->nx_huge_page_disallowed = fault->huge_page_disallowed;

		if (is_shadow_present_pte(iter.old_spte)) {
			/* Don't support large page for mirrored roots (TDX) */
			KVM_BUG_ON(is_mirror_sptep(iter.sptep), vcpu->kvm);
			r = tdp_mmu_split_huge_page(kvm, &iter, sp, true);
		} else {
			r = tdp_mmu_link_sp(kvm, &iter, sp, true);
		}

		/*
		 * Force the guest to retry if installing an upper level SPTE
		 * failed, e.g. because a different task modified the SPTE.
		 */
		if (r) {
			tdp_mmu_free_sp(sp);
			goto retry;
		}

		if (fault->huge_page_disallowed &&
		    fault->req_level >= iter.level) {
			spin_lock(&kvm->arch.tdp_mmu_pages_lock);
			if (sp->nx_huge_page_disallowed)
				track_possible_nx_huge_page(kvm, sp);
			spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
		}
	}

	/*
	 * The walk aborted before reaching the target level, e.g. because the
	 * iterator detected an upper level SPTE was frozen during traversal.
	 */
	WARN_ON_ONCE(iter.level == fault->goal_level);
	goto retry;

map_target_level:
	ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);

retry:
	rcu_read_unlock();
	return ret;
}

/* Used by mmu notifier via kvm_unmap_gfn_range() */
bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
				 bool flush)
{
	enum kvm_tdp_mmu_root_types types;
	struct kvm_mmu_page *root;

	types = kvm_gfn_range_filter_to_root_types(kvm, range->attr_filter) | KVM_INVALID_ROOTS;

	__for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, types)
		flush = tdp_mmu_zap_leafs(kvm, root, range->start, range->end,
					  range->may_block, flush);

	return flush;
}

/*
 * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero
 * if any of the GFNs in the range have been accessed.
 *
 * No need to mark the corresponding PFN as accessed as this call is coming
 * from the clear_young() or clear_flush_young() notifier, which uses the
 * return value to determine if the page has been accessed.
 */
static void kvm_tdp_mmu_age_spte(struct kvm *kvm, struct tdp_iter *iter)
{
	u64 new_spte;

	if (spte_ad_enabled(iter->old_spte)) {
		iter->old_spte = tdp_mmu_clear_spte_bits_atomic(iter->sptep,
								shadow_accessed_mask);
		new_spte = iter->old_spte & ~shadow_accessed_mask;
	} else {
		new_spte = mark_spte_for_access_track(iter->old_spte);
		/*
		 * It is safe for the following cmpxchg to fail. Leave the
		 * Accessed bit set, as the spte is most likely young anyway.
		 */
		if (__tdp_mmu_set_spte_atomic(kvm, iter, new_spte))
			return;
	}

	trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level,
				       iter->old_spte, new_spte);
}

static bool __kvm_tdp_mmu_age_gfn_range(struct kvm *kvm,
					struct kvm_gfn_range *range,
					bool test_only)
{
	enum kvm_tdp_mmu_root_types types;
	struct kvm_mmu_page *root;
	struct tdp_iter iter;
	bool ret = false;

	types = kvm_gfn_range_filter_to_root_types(kvm, range->attr_filter);

	/*
	 * Don't support rescheduling, none of the MMU notifiers that funnel
	 * into this helper allow blocking; it'd be dead, wasteful code.  Note,
	 * this helper must NOT be used to unmap GFNs, as it processes only
	 * valid roots!
	 */
	WARN_ON(types & ~KVM_VALID_ROOTS);

	guard(rcu)();
	for_each_tdp_mmu_root_rcu(kvm, root, range->slot->as_id, types) {
		tdp_root_for_each_leaf_pte(iter, kvm, root, range->start, range->end) {
			if (!is_accessed_spte(iter.old_spte))
				continue;

			if (test_only)
				return true;

			ret = true;
			kvm_tdp_mmu_age_spte(kvm, &iter);
		}
	}

	return ret;
}

bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
	return __kvm_tdp_mmu_age_gfn_range(kvm, range, false);
}

bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	return __kvm_tdp_mmu_age_gfn_range(kvm, range, true);
}

/*
 * Remove write access from all SPTEs at or above min_level that map GFNs
 * [start, end). Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			     gfn_t start, gfn_t end, int min_level)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	rcu_read_lock();

	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);

	for_each_tdp_pte_min_level(iter, kvm, root, min_level, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
			continue;

		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level) ||
		    !(iter.old_spte & PT_WRITABLE_MASK))
			continue;

		new_spte = iter.old_spte & ~PT_WRITABLE_MASK;

		if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
			goto retry;

		spte_set = true;
	}

	rcu_read_unlock();
	return spte_set;
}

/*
 * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
 * only affect leaf SPTEs down to min_level.
 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
 */
bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
			     const struct kvm_memory_slot *slot, int min_level)
{
	struct kvm_mmu_page *root;
	bool spte_set = false;

	lockdep_assert_held_read(&kvm->mmu_lock);

	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
		spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
					     slot->base_gfn + slot->npages, min_level);

	return spte_set;
}

static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(void)
{
	struct kvm_mmu_page *sp;

	sp = kmem_cache_zalloc(mmu_page_header_cache, GFP_KERNEL_ACCOUNT);
	if (!sp)
		return NULL;

	sp->spt = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
	if (!sp->spt) {
		kmem_cache_free(mmu_page_header_cache, sp);
		return NULL;
	}

	return sp;
}

/* Note, the caller is responsible for initializing @sp. */
static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
				   struct kvm_mmu_page *sp, bool shared)
{
	const u64 huge_spte = iter->old_spte;
	const int level = iter->level;
	int ret, i;

	/*
	 * No need for atomics when writing to sp->spt since the page table has
	 * not been linked in yet and thus is not reachable from any other CPU.
	 */
	for (i = 0; i < SPTE_ENT_PER_PAGE; i++)
		sp->spt[i] = make_small_spte(kvm, huge_spte, sp->role, i);

	/*
	 * Replace the huge spte with a pointer to the populated lower level
	 * page table. Since we are making this change without a TLB flush vCPUs
	 * will see a mix of the split mappings and the original huge mapping,
	 * depending on what's currently in their TLB. This is fine from a
	 * correctness standpoint since the translation will be the same either
	 * way.
	 */
	ret = tdp_mmu_link_sp(kvm, iter, sp, shared);
	if (ret)
		goto out;

	/*
	 * tdp_mmu_link_sp() will handle subtracting the huge page we are
	 * overwriting from the page stats. But we have to manually update
	 * the page stats with the new present child pages.
	 */
	kvm_update_page_stats(kvm, level - 1, SPTE_ENT_PER_PAGE);

out:
	trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret);
	return ret;
}
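/*
 * Illustrative note: each split replaces one huge SPTE with
 * SPTE_ENT_PER_PAGE (512 on x86) child SPTEs one level down, so the page
 * stats update above amounts to:
 *
 *	pages[level]     -= 1	(done when the huge SPTE is overwritten)
 *	pages[level - 1] += 512
 *
 * Fully splitting a single 1GiB mapping down to 4KiB therefore ends up with
 * 512 * 512 = 262144 leaf SPTEs held by 1 + 512 = 513 new page tables.
 */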
static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
					 struct kvm_mmu_page *root,
					 gfn_t start, gfn_t end,
					 int target_level, bool shared)
{
	struct kvm_mmu_page *sp = NULL;
	struct tdp_iter iter;

	rcu_read_lock();

	/*
	 * Traverse the page table splitting all huge pages above the target
	 * level into one lower level. For example, if we encounter a 1GB page
	 * we split it into 512 2MB pages.
	 *
	 * Since the TDP iterator uses a pre-order traversal, we are guaranteed
	 * to visit an SPTE before ever visiting its children, which means we
	 * will correctly recursively split huge pages that are more than one
	 * level above the target level (e.g. splitting a 1GB to 512 2MB pages,
	 * and then splitting each of those to 512 4KB pages).
	 */
	for_each_tdp_pte_min_level(iter, kvm, root, target_level + 1, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
			continue;

		if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
			continue;

		if (!sp) {
			rcu_read_unlock();

			if (shared)
				read_unlock(&kvm->mmu_lock);
			else
				write_unlock(&kvm->mmu_lock);

			sp = tdp_mmu_alloc_sp_for_split();

			if (shared)
				read_lock(&kvm->mmu_lock);
			else
				write_lock(&kvm->mmu_lock);

			if (!sp) {
				trace_kvm_mmu_split_huge_page(iter.gfn,
							      iter.old_spte,
							      iter.level, -ENOMEM);
				return -ENOMEM;
			}

			rcu_read_lock();

			iter.yielded = true;
			continue;
		}

		tdp_mmu_init_child_sp(sp, &iter);

		if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
			goto retry;

		sp = NULL;
	}

	rcu_read_unlock();

	/*
	 * It's possible to exit the loop having never used the last sp if, for
	 * example, a vCPU doing HugePage NX splitting wins the race and
	 * installs its own sp in place of the last sp we tried to split.
	 */
	if (sp)
		tdp_mmu_free_sp(sp);

	return 0;
}


/*
 * Try to split all huge pages mapped by the TDP MMU down to the target level.
 */
void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
				      const struct kvm_memory_slot *slot,
				      gfn_t start, gfn_t end,
				      int target_level, bool shared)
{
	struct kvm_mmu_page *root;
	int r = 0;

	kvm_lockdep_assert_mmu_lock_held(kvm, shared);
	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id) {
		r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared);
		if (r) {
			kvm_tdp_mmu_put_root(kvm, root);
			break;
		}
	}
}

static bool tdp_mmu_need_write_protect(struct kvm_mmu_page *sp)
{
	/*
	 * All TDP MMU shadow pages share the same role as their root, aside
	 * from level, so it is valid to key off any shadow page to determine if
	 * write protection is needed for an entire tree.
	 */
	return kvm_mmu_page_ad_need_write_protect(sp) || !kvm_ad_enabled;
}

static void clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
				  gfn_t start, gfn_t end)
{
	const u64 dbit = tdp_mmu_need_write_protect(root) ? PT_WRITABLE_MASK :
							    shadow_dirty_mask;
	struct tdp_iter iter;

	rcu_read_lock();

	tdp_root_for_each_pte(iter, kvm, root, start, end) {
retry:
		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
			continue;

		KVM_MMU_WARN_ON(dbit == shadow_dirty_mask &&
				spte_ad_need_write_protect(iter.old_spte));

		if (!(iter.old_spte & dbit))
			continue;

		if (tdp_mmu_set_spte_atomic(kvm, &iter, iter.old_spte & ~dbit))
			goto retry;
	}

	rcu_read_unlock();
}

/*
 * Clear the dirty status (D-bit or W-bit) of all the SPTEs mapping GFNs in the
 * memslot.
 */
void kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
				  const struct kvm_memory_slot *slot)
{
	struct kvm_mmu_page *root;

	lockdep_assert_held_read(&kvm->mmu_lock);
	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
		clear_dirty_gfn_range(kvm, root, slot->base_gfn,
				      slot->base_gfn + slot->npages);
}

static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
				  gfn_t gfn, unsigned long mask, bool wrprot)
{
	const u64 dbit = (wrprot || tdp_mmu_need_write_protect(root)) ? PT_WRITABLE_MASK :
									shadow_dirty_mask;
	struct tdp_iter iter;

	lockdep_assert_held_write(&kvm->mmu_lock);

	rcu_read_lock();

	tdp_root_for_each_leaf_pte(iter, kvm, root, gfn + __ffs(mask),
				   gfn + BITS_PER_LONG) {
		if (!mask)
			break;

		KVM_MMU_WARN_ON(dbit == shadow_dirty_mask &&
				spte_ad_need_write_protect(iter.old_spte));

		if (iter.level > PG_LEVEL_4K ||
		    !(mask & (1UL << (iter.gfn - gfn))))
			continue;

		mask &= ~(1UL << (iter.gfn - gfn));

		if (!(iter.old_spte & dbit))
			continue;

		iter.old_spte = tdp_mmu_clear_spte_bits(iter.sptep,
							iter.old_spte, dbit,
							iter.level);

		trace_kvm_tdp_mmu_spte_changed(iter.as_id, iter.gfn, iter.level,
					       iter.old_spte,
					       iter.old_spte & ~dbit);
	}

	rcu_read_unlock();
}

/*
 * Clear the dirty status (D-bit or W-bit) of all the 4k SPTEs mapping GFNs for
 * which a bit is set in mask, starting at gfn. The given memslot is expected to
 * contain all the GFNs represented by set bits in the mask.
 */
void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
				       struct kvm_memory_slot *slot,
				       gfn_t gfn, unsigned long mask,
				       bool wrprot)
{
	struct kvm_mmu_page *root;

	for_each_valid_tdp_mmu_root(kvm, root, slot->as_id)
		clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
}
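/*
 * Illustrative note: @mask in clear_dirty_pt_masked() is a 64-bit window of
 * dirty-log bits anchored at @gfn, i.e. bit N covers gfn + N.  For example,
 * with gfn = 0x1000 and mask = 0x5, only gfn 0x1000 (bit 0) and gfn 0x1002
 * (bit 2) are touched: the walk starts at gfn + __ffs(mask) = 0x1000 and
 * terminates early once every set bit has been consumed via
 * mask &= ~(1UL << (iter.gfn - gfn)).
 */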
static int tdp_mmu_make_huge_spte(struct kvm *kvm,
				  struct tdp_iter *parent,
				  u64 *huge_spte)
{
	struct kvm_mmu_page *root = spte_to_child_sp(parent->old_spte);
	gfn_t start = parent->gfn;
	gfn_t end = start + KVM_PAGES_PER_HPAGE(parent->level);
	struct tdp_iter iter;

	tdp_root_for_each_leaf_pte(iter, kvm, root, start, end) {
		/*
		 * Use the parent iterator when checking for forward progress so
		 * that KVM doesn't get stuck continuously trying to yield (i.e.
		 * returning -EAGAIN here and then failing the forward progress
		 * check in the caller ad nauseam).
		 */
		if (tdp_mmu_iter_need_resched(kvm, parent))
			return -EAGAIN;

		*huge_spte = make_huge_spte(kvm, iter.old_spte, parent->level);
		return 0;
	}

	return -ENOENT;
}

static void recover_huge_pages_range(struct kvm *kvm,
				     struct kvm_mmu_page *root,
				     const struct kvm_memory_slot *slot)
{
	gfn_t start = slot->base_gfn;
	gfn_t end = start + slot->npages;
	struct tdp_iter iter;
	int max_mapping_level;
	bool flush = false;
	u64 huge_spte;
	int r;

	if (WARN_ON_ONCE(kvm_slot_dirty_track_enabled(slot)))
		return;

	rcu_read_lock();

	for_each_tdp_pte_min_level(iter, kvm, root, PG_LEVEL_2M, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, true)) {
			flush = false;
			continue;
		}

		if (iter.level > KVM_MAX_HUGEPAGE_LEVEL ||
		    !is_shadow_present_pte(iter.old_spte))
			continue;

		/*
		 * Don't zap leaf SPTEs; if a leaf SPTE could be replaced with
		 * a large page size, then its parent would have been zapped
		 * instead of stepping down.
		 */
		if (is_last_spte(iter.old_spte, iter.level))
			continue;

		/*
		 * If iter.gfn resides outside of the slot, i.e. the page for
		 * the current level overlaps but is not contained by the slot,
		 * then the SPTE can't be made huge. More importantly, trying
		 * to query that info from slot->arch.lpage_info will cause an
		 * out-of-bounds access.
		 */
		if (iter.gfn < start || iter.gfn >= end)
			continue;

		max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot, iter.gfn);
		if (max_mapping_level < iter.level)
			continue;

		r = tdp_mmu_make_huge_spte(kvm, &iter, &huge_spte);
		if (r == -EAGAIN)
			goto retry;
		else if (r)
			continue;

		if (tdp_mmu_set_spte_atomic(kvm, &iter, huge_spte))
			goto retry;

		flush = true;
	}

	if (flush)
		kvm_flush_remote_tlbs_memslot(kvm, slot);

	rcu_read_unlock();
}

/*
 * Recover huge page mappings within the slot by replacing non-leaf SPTEs with
 * huge SPTEs where possible.
 */
void kvm_tdp_mmu_recover_huge_pages(struct kvm *kvm,
				    const struct kvm_memory_slot *slot)
{
	struct kvm_mmu_page *root;

	lockdep_assert_held_read(&kvm->mmu_lock);
	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
		recover_huge_pages_range(kvm, root, slot);
}
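/*
 * Editorial illustration, not kernel code: the slot-containment reasoning
 * behind the iter.gfn range check in recover_huge_pages_range() above,
 * written out as a full containment test for a naturally aligned block.
 * The in-kernel check only tests iter.gfn itself against the slot because
 * kvm_mmu_max_mapping_level() consults per-slot, per-level metadata that
 * is indexed relative to the slot. All names and numbers below are
 * invented for the sketch, and the block is compiled out.
 */
#if 0
#include <stdbool.h>
#include <stdio.h>

static bool block_contained(unsigned long long slot_base,
			    unsigned long long slot_npages,
			    unsigned long long gfn,
			    unsigned long long pages_per_block)
{
	unsigned long long slot_end = slot_base + slot_npages;
	unsigned long long block_start = gfn & ~(pages_per_block - 1);

	return block_start >= slot_base &&
	       block_start + pages_per_block <= slot_end;
}

int main(void)
{
	/* A 512-page (2MiB) block that straddles the end of the slot. */
	printf("contained: %d\n",
	       block_contained(0x1000, 0x300, 0x1200, 512));
	return 0;
}
#endif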
/*
 * Removes write access on the last level SPTE mapping this GFN and unsets the
 * MMU-writable bit to ensure future writes continue to be intercepted.
 * Returns true if an SPTE was set and a TLB flush is needed.
 */
static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
			      gfn_t gfn, int min_level)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);

	rcu_read_lock();

	for_each_tdp_pte_min_level(iter, kvm, root, min_level, gfn, gfn + 1) {
		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		new_spte = iter.old_spte &
			~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);

		if (new_spte == iter.old_spte)
			break;

		tdp_mmu_iter_set_spte(kvm, &iter, new_spte);
		spte_set = true;
	}

	rcu_read_unlock();

	return spte_set;
}

/*
 * Removes write access on the last level SPTE mapping this GFN and unsets the
 * MMU-writable bit to ensure future writes continue to be intercepted.
 * Returns true if an SPTE was set and a TLB flush is needed.
 */
bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
				   struct kvm_memory_slot *slot, gfn_t gfn,
				   int min_level)
{
	struct kvm_mmu_page *root;
	bool spte_set = false;

	lockdep_assert_held_write(&kvm->mmu_lock);
	for_each_valid_tdp_mmu_root(kvm, root, slot->as_id)
		spte_set |= write_protect_gfn(kvm, root, gfn, min_level);

	return spte_set;
}

/*
 * Return the level of the lowest level SPTE added to sptes.
 * That SPTE may be non-present.
 *
 * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
 */
int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
			 int *root_level)
{
	struct kvm_mmu_page *root = root_to_sp(vcpu->arch.mmu->root.hpa);
	struct tdp_iter iter;
	gfn_t gfn = addr >> PAGE_SHIFT;
	int leaf = -1;

	*root_level = vcpu->arch.mmu->root_role.level;

	for_each_tdp_pte(iter, vcpu->kvm, root, gfn, gfn + 1) {
		leaf = iter.level;
		sptes[leaf] = iter.old_spte;
	}

	return leaf;
}

/*
 * Returns the last level spte pointer of the shadow page walk for the given
 * gpa, and sets *spte to the spte value. This spte may be non-present. If no
 * walk could be performed, returns NULL and *spte does not contain valid data.
 *
 * Contract:
 *  - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
 *  - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
 *
 * WARNING: This function is only intended to be called during fast_page_fault.
 */
u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gfn_t gfn,
					u64 *spte)
{
	/* Fast page faults are not supported for mirrored roots. */
	struct kvm_mmu_page *root = tdp_mmu_get_root(vcpu, KVM_DIRECT_ROOTS);
	struct tdp_iter iter;
	tdp_ptep_t sptep = NULL;

	for_each_tdp_pte(iter, vcpu->kvm, root, gfn, gfn + 1) {
		*spte = iter.old_spte;
		sptep = iter.sptep;
	}

	/*
	 * Perform the rcu_dereference to get the raw spte pointer value since
	 * we are passing it up to fast_page_fault, which is shared with the
	 * legacy MMU and thus does not retain the TDP MMU-specific __rcu
	 * annotation.
	 *
	 * This is safe since fast_page_fault obeys the contracts of this
	 * function as well as all TDP MMU contracts around modifying SPTEs
	 * outside of mmu_lock.
	 */
	return rcu_dereference(sptep);
}
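/*
 * Editorial illustration, not kernel code: a hypothetical caller of
 * kvm_tdp_mmu_get_walk() above, showing how its results are laid out.
 * sptes[] is indexed by level, and only the entries from the returned
 * leaf level up to *root_level are populated. The function below is
 * invented for the sketch (real callers live in mmu.c) and the block is
 * compiled out; it assumes an array sized for the maximum root level
 * plus one.
 */
#if 0
static void dump_tdp_walk_example(struct kvm_vcpu *vcpu, u64 addr)
{
	u64 sptes[PT64_ROOT_MAX_LEVEL + 1];
	int root_level, level, leaf;

	kvm_tdp_mmu_walk_lockless_begin();
	leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root_level);
	kvm_tdp_mmu_walk_lockless_end();

	if (leaf < 0)
		return;		/* no SPTEs were recorded for this address */

	for (level = root_level; level >= leaf; level--)
		pr_info("level %d: spte 0x%llx\n", level, sptes[level]);
}
#endif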