// SPDX-License-Identifier: GPL-2.0
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include "mmu.h"
#include "mmu_internal.h"
#include "mmutrace.h"
#include "tdp_iter.h"
#include "tdp_mmu.h"
#include "spte.h"

#include <asm/cmpxchg.h>
#include <trace/events/kvm.h>

/* Initializes the TDP MMU for the VM, if enabled. */
void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
{
	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
}

/* Arbitrarily returns true so that this may be used in if statements. */
static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
							     bool shared)
{
	if (shared)
		lockdep_assert_held_read(&kvm->mmu_lock);
	else
		lockdep_assert_held_write(&kvm->mmu_lock);

	return true;
}

void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
{
	/*
	 * Invalidate all roots, which besides the obvious, schedules all roots
	 * for zapping and thus puts the TDP MMU's reference to each root, i.e.
	 * ultimately frees all roots.
	 */
	kvm_tdp_mmu_invalidate_all_roots(kvm);
	kvm_tdp_mmu_zap_invalidated_roots(kvm);

	WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages));
	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));

	/*
	 * Ensure that all the outstanding RCU callbacks to free shadow pages
	 * can run before the VM is torn down.  Putting the last reference to
	 * zapped roots will create new callbacks.
	 */
	rcu_barrier();
}

static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
{
	free_page((unsigned long)sp->spt);
	kmem_cache_free(mmu_page_header_cache, sp);
}

/*
 * This is called through call_rcu in order to free TDP page table memory
 * safely with respect to other kernel threads that may be operating on
 * the memory.
 * By only accessing TDP MMU page table memory in an RCU read critical
 * section, and freeing it after a grace period, lockless access to that
 * memory won't use it after it is freed.
 */
static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
{
	struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
					       rcu_head);

	tdp_mmu_free_sp(sp);
}

void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
{
	if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
		return;

	/*
	 * The TDP MMU itself holds a reference to each root until the root is
	 * explicitly invalidated, i.e. the final reference should never be
	 * put for a valid root.
	 */
	KVM_BUG_ON(!is_tdp_mmu_page(root) || !root->role.invalid, kvm);

	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	list_del_rcu(&root->link);
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
	call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
}
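
/*
 * Illustrative sketch (not from the original file): the usual lifetime of a
 * TDP MMU root's refcount, assuming the flows implemented in this file:
 *
 *	kvm_tdp_mmu_get_vcpu_root_hpa()      refcount = 2 (vCPU + TDP MMU)
 *	vCPU drops its reference             e.g. via kvm_mmu_free_roots()
 *	kvm_tdp_mmu_invalidate_all_roots()   marks the root invalid
 *	kvm_tdp_mmu_zap_invalidated_roots()  zaps the root, then puts the TDP
 *					     MMU's reference; the count hits
 *					     zero and kvm_tdp_mmu_put_root()
 *					     frees the root via call_rcu().
 */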

/*
 * Returns the next root after @prev_root (or the first root if @prev_root is
 * NULL).  A reference to the returned root is acquired, and the reference to
 * @prev_root is released (the caller obviously must hold a reference to
 * @prev_root if it's non-NULL).
 *
 * If @only_valid is true, invalid roots are skipped.
 *
 * Returns NULL if the end of tdp_mmu_roots was reached.
 */
static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
					      struct kvm_mmu_page *prev_root,
					      bool only_valid)
{
	struct kvm_mmu_page *next_root;

	/*
	 * While the roots themselves are RCU-protected, fields such as
	 * role.invalid are protected by mmu_lock.
	 */
	lockdep_assert_held(&kvm->mmu_lock);

	rcu_read_lock();

	if (prev_root)
		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						  &prev_root->link,
						  typeof(*prev_root), link);
	else
		next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						   typeof(*next_root), link);

	while (next_root) {
		if ((!only_valid || !next_root->role.invalid) &&
		    kvm_tdp_mmu_get_root(next_root))
			break;

		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						  &next_root->link,
						  typeof(*next_root), link);
	}

	rcu_read_unlock();

	if (prev_root)
		kvm_tdp_mmu_put_root(kvm, prev_root);

	return next_root;
}

/*
 * Note: this iterator gets and puts references to the roots it iterates over.
 * This makes it safe to release the MMU lock and yield within the loop, but
 * if exiting the loop early, the caller must drop the reference to the most
 * recent root. (Unless keeping a live reference is desirable.)
 *
 * If shared is set, this function is operating under the MMU lock in read
 * mode.
 */
#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _only_valid)\
	for (_root = tdp_mmu_next_root(_kvm, NULL, _only_valid);	\
	     ({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root;	\
	     _root = tdp_mmu_next_root(_kvm, _root, _only_valid))	\
		if (kvm_mmu_page_as_id(_root) != _as_id) {		\
		} else

#define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id)	\
	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, true)

#define for_each_tdp_mmu_root_yield_safe(_kvm, _root)			\
	for (_root = tdp_mmu_next_root(_kvm, NULL, false);		\
	     ({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root;	\
	     _root = tdp_mmu_next_root(_kvm, _root, false))
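
/*
 * Illustrative usage (a sketch mirroring callers later in this file, not an
 * additional API): a typical flow walks every valid root for a memslot's
 * address space while holding mmu_lock for read, yielding as needed:
 *
 *	lockdep_assert_held_read(&kvm->mmu_lock);
 *	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
 *		spte_set |= wrprot_gfn_range(kvm, root, start, end, min_level);
 */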

/*
 * Iterate over all TDP MMU roots.  Requires that mmu_lock be held for write,
 * the implication being that any flow that holds mmu_lock for read is
 * inherently yield-friendly and should use the yield-safe variant above.
 * Holding mmu_lock for write obviates the need for RCU protection as the list
 * is guaranteed to be stable.
 */
#define for_each_tdp_mmu_root(_kvm, _root, _as_id)			\
	list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)	\
		if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) &&	\
		    kvm_mmu_page_as_id(_root) != _as_id) {		\
		} else

static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu_page *sp;

	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);

	return sp;
}

static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
			    gfn_t gfn, union kvm_mmu_page_role role)
{
	INIT_LIST_HEAD(&sp->possible_nx_huge_page_link);

	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);

	sp->role = role;
	sp->gfn = gfn;
	sp->ptep = sptep;
	sp->tdp_mmu_page = true;

	trace_kvm_mmu_get_page(sp, true);
}

static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
				  struct tdp_iter *iter)
{
	struct kvm_mmu_page *parent_sp;
	union kvm_mmu_page_role role;

	parent_sp = sptep_to_sp(rcu_dereference(iter->sptep));

	role = parent_sp->role;
	role.level--;

	tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
}
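
/*
 * Note (illustrative): tdp_mmu_init_child_sp() derives the child's role from
 * the parent simply by decrementing role.level, e.g. the child of a level-2
 * (PMD) shadow page is a level-1 (PTE) shadow page whose 512 entries map
 * 4KiB pages.
 */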

hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
{
	union kvm_mmu_page_role role = vcpu->arch.mmu->root_role;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_mmu_page *root;

	lockdep_assert_held_write(&kvm->mmu_lock);

	/*
	 * Check for an existing root before allocating a new one.  Note, the
	 * role check prevents consuming an invalid root.
	 */
	for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
		if (root->role.word == role.word &&
		    kvm_tdp_mmu_get_root(root))
			goto out;
	}

	root = tdp_mmu_alloc_sp(vcpu);
	tdp_mmu_init_sp(root, NULL, 0, role);

	/*
	 * TDP MMU roots are kept until they are explicitly invalidated, either
	 * by a memslot update or by the destruction of the VM.  Initialize the
	 * refcount to two; one reference for the vCPU, and one reference for
	 * the TDP MMU itself, which is held until the root is invalidated and
	 * is ultimately put by kvm_tdp_mmu_zap_invalidated_roots().
	 */
	refcount_set(&root->tdp_mmu_root_count, 2);

	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);

out:
	return __pa(root->spt);
}

static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level,
				bool shared);

static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	kvm_account_pgtable_pages((void *)sp->spt, +1);
	atomic64_inc(&kvm->arch.tdp_mmu_pages);
}

static void tdp_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	kvm_account_pgtable_pages((void *)sp->spt, -1);
	atomic64_dec(&kvm->arch.tdp_mmu_pages);
}

/**
 * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages
 *
 * @kvm: kvm instance
 * @sp: the page to be removed
 */
static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	tdp_unaccount_mmu_page(kvm, sp);

	if (!sp->nx_huge_page_disallowed)
		return;

	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	sp->nx_huge_page_disallowed = false;
	untrack_possible_nx_huge_page(kvm, sp);
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}

/**
 * handle_removed_pt() - handle a page table removed from the TDP structure
 *
 * @kvm: kvm instance
 * @pt: the page removed from the paging structure
 * @shared: This operation may not be running under the exclusive use
 *	    of the MMU lock and the operation must synchronize with other
 *	    threads that might be modifying SPTEs.
 *
 * Given a page table that has been removed from the TDP paging structure,
 * iterates through the page table to clear SPTEs and free child page tables.
 *
 * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
 * protection. Since this thread removed it from the paging structure,
 * this thread will be responsible for ensuring the page is freed. Hence the
 * early rcu_dereferences in the function.
 */
static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
{
	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
	int level = sp->role.level;
	gfn_t base_gfn = sp->gfn;
	int i;

	trace_kvm_mmu_prepare_zap_page(sp);

	tdp_mmu_unlink_sp(kvm, sp);

	for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
		tdp_ptep_t sptep = pt + i;
		gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
		u64 old_spte;

		if (shared) {
			/*
			 * Set the SPTE to a nonpresent value that other
			 * threads will not overwrite.  If the SPTE was
			 * already marked as removed then another thread
			 * handling a page fault could overwrite it, so keep
			 * setting the SPTE until this thread observes a
			 * transition from some other value to the removed
			 * SPTE value.
			 */
			for (;;) {
				old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, REMOVED_SPTE);
				if (!is_removed_spte(old_spte))
					break;
				cpu_relax();
			}
		} else {
			/*
			 * If the SPTE is not MMU-present, there is no backing
			 * page associated with the SPTE and so no side effects
			 * that need to be recorded, and exclusive ownership of
			 * mmu_lock ensures the SPTE can't be made present.
			 * Note, zapping MMIO SPTEs is also unnecessary as they
			 * are guarded by the memslots generation, not by being
			 * unreachable.
			 */
			old_spte = kvm_tdp_mmu_read_spte(sptep);
			if (!is_shadow_present_pte(old_spte))
				continue;

			/*
			 * Use the common helper instead of a raw WRITE_ONCE as
			 * the SPTE needs to be updated atomically if it can be
			 * modified by a different vCPU outside of mmu_lock.
			 * Even though the parent SPTE is !PRESENT, the TLB
			 * hasn't yet been flushed, and both Intel and AMD
			 * document that A/D assists can use upper-level PxE
			 * entries that are cached in the TLB, i.e. the CPU can
			 * still access the page and mark it dirty.
			 *
			 * No retry is needed in the atomic update path as the
			 * sole concern is dropping a Dirty bit, i.e. no other
			 * task can zap/remove the SPTE as mmu_lock is held for
			 * write.  Marking the SPTE as a removed SPTE is not
			 * strictly necessary for the same reason, but using
			 * the removed SPTE value keeps the shared/exclusive
			 * paths consistent and allows the handle_changed_spte()
			 * call below to hardcode the new value to REMOVED_SPTE.
			 *
			 * Note, even though dropping a Dirty bit is the only
			 * scenario where a non-atomic update could result in a
			 * functional bug, simply checking the Dirty bit isn't
			 * sufficient as a fast page fault could read the upper
			 * level SPTE before it is zapped, and then make this
			 * target SPTE writable, resume the guest, and set the
			 * Dirty bit between reading the SPTE above and writing
			 * it here.
			 */
			old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte,
							  REMOVED_SPTE, level);
		}
		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
				    old_spte, REMOVED_SPTE, level, shared);
	}

	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
}

/**
 * handle_changed_spte - handle bookkeeping associated with an SPTE change
 * @kvm: kvm instance
 * @as_id: the address space of the paging structure the SPTE was a part of
 * @gfn: the base GFN that was mapped by the SPTE
 * @old_spte: The value of the SPTE before the change
 * @new_spte: The value of the SPTE after the change
 * @level: the level of the PT the SPTE is part of in the paging structure
 * @shared: This operation may not be running under the exclusive use of
 *	    the MMU lock and the operation must synchronize with other
 *	    threads that might be modifying SPTEs.
 *
 * Handle bookkeeping that might result from the modification of a SPTE. Note,
 * dirty logging updates are handled in common code, not here (see make_spte()
 * and fast_pf_fix_direct_spte()).
 */
static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level,
				bool shared)
{
	bool was_present = is_shadow_present_pte(old_spte);
	bool is_present = is_shadow_present_pte(new_spte);
	bool was_leaf = was_present && is_last_spte(old_spte, level);
	bool is_leaf = is_present && is_last_spte(new_spte, level);
	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

	WARN_ON_ONCE(level > PT64_ROOT_MAX_LEVEL);
	WARN_ON_ONCE(level < PG_LEVEL_4K);
	WARN_ON_ONCE(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));

	/*
	 * If this warning were to trigger it would indicate that there was a
	 * missing MMU notifier or a race with some notifier handler.
	 * A present, leaf SPTE should never be directly replaced with another
	 * present leaf SPTE pointing to a different PFN.  A notifier handler
	 * should be zapping the SPTE before the main MM's page table is
	 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
	 * thread before replacement.
	 */
	if (was_leaf && is_leaf && pfn_changed) {
		pr_err("Invalid SPTE change: cannot replace a present leaf\n"
		       "SPTE with another present leaf SPTE mapping a\n"
		       "different PFN!\n"
		       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
		       as_id, gfn, old_spte, new_spte, level);

		/*
		 * Crash the host to prevent error propagation and guest data
		 * corruption.
		 */
		BUG();
	}

	if (old_spte == new_spte)
		return;

	trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);

	if (is_leaf)
		check_spte_writable_invariants(new_spte);

	/*
	 * The only time a SPTE should be changed from a non-present to
	 * non-present state is when an MMIO entry is installed/modified/
	 * removed. In that case, there is nothing to do here.
	 */
	if (!was_present && !is_present) {
		/*
		 * If this change does not involve a MMIO SPTE or removed SPTE,
		 * it is unexpected. Log the change, though it should not
		 * impact the guest since both the former and current SPTEs
		 * are nonpresent.
		 */
		if (WARN_ON_ONCE(!is_mmio_spte(old_spte) &&
				 !is_mmio_spte(new_spte) &&
				 !is_removed_spte(new_spte)))
			pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
			       "should not be replaced with another,\n"
			       "different nonpresent SPTE, unless one or both\n"
			       "are MMIO SPTEs, or the new SPTE is\n"
			       "a temporary removed SPTE.\n"
			       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
			       as_id, gfn, old_spte, new_spte, level);
		return;
	}

	if (is_leaf != was_leaf)
		kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);

	if (was_leaf && is_dirty_spte(old_spte) &&
	    (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
		kvm_set_pfn_dirty(spte_to_pfn(old_spte));

	/*
	 * Recursively handle child PTs if the change removed a subtree from
	 * the paging structure.  Note the WARN on the PFN changing without the
	 * SPTE being converted to a hugepage (leaf) or being zapped.  Shadow
	 * pages are kernel allocations and should never be migrated.
	 */
	if (was_present && !was_leaf &&
	    (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
		handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);

	if (was_leaf && is_accessed_spte(old_spte) &&
	    (!is_present || !is_accessed_spte(new_spte) || pfn_changed))
		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
}
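
/*
 * Summary of the transitions handled above (an illustrative recap, not an
 * exhaustive specification):
 *
 *	present leaf -> present leaf, PFN changed	BUG(), never legal
 *	!present -> !present				only MMIO/removed SPTEs
 *	leaf <-> non-leaf				adjust page stats
 *	dirty leaf zapped or made clean			kvm_set_pfn_dirty()
 *	non-leaf made non-present			recurse via
 *							handle_removed_pt()
 *	accessed leaf zapped or made young		kvm_set_pfn_accessed()
 */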

/*
 * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
 * and handle the associated bookkeeping.  Do not mark the page dirty
 * in KVM's dirty bitmaps.
 *
 * If setting the SPTE fails because it has changed, iter->old_spte will be
 * refreshed to the current value of the spte.
 *
 * @kvm: kvm instance
 * @iter: a tdp_iter instance currently on the SPTE that should be set
 * @new_spte: The value the SPTE should be set to
 * Return:
 * * 0      - If the SPTE was set.
 * * -EBUSY - If the SPTE cannot be set. In this case this function will have
 *            no side-effects other than setting iter->old_spte to the last
 *            known value of the spte.
 */
static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
					  struct tdp_iter *iter,
					  u64 new_spte)
{
	u64 *sptep = rcu_dereference(iter->sptep);

	/*
	 * The caller is responsible for ensuring the old SPTE is not a REMOVED
	 * SPTE.  KVM should never attempt to zap or manipulate a REMOVED SPTE,
	 * and pre-checking before inserting a new SPTE is advantageous as it
	 * avoids unnecessary work.
	 */
	WARN_ON_ONCE(iter->yielded || is_removed_spte(iter->old_spte));

	lockdep_assert_held_read(&kvm->mmu_lock);

	/*
	 * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
	 * does not hold the mmu_lock.  On failure, i.e. if a different logical
	 * CPU modified the SPTE, try_cmpxchg64() updates iter->old_spte with
	 * the current value, so the caller operates on fresh data, e.g. if it
	 * retries tdp_mmu_set_spte_atomic().
	 */
	if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte))
		return -EBUSY;

	handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
			    new_spte, iter->level, true);

	return 0;
}

static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
					  struct tdp_iter *iter)
{
	int ret;

	/*
	 * Freeze the SPTE by setting it to a special,
	 * non-present value. This will stop other threads from
	 * immediately installing a present entry in its place
	 * before the TLBs are flushed.
	 */
	ret = tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE);
	if (ret)
		return ret;

	kvm_flush_remote_tlbs_gfn(kvm, iter->gfn, iter->level);

	/*
	 * No other thread can overwrite the removed SPTE as they must either
	 * wait on the MMU lock or use tdp_mmu_set_spte_atomic() which will not
	 * overwrite the special removed SPTE value.  No bookkeeping is needed
	 * here since the SPTE is going from non-present to non-present.  Use
	 * the raw write helper to avoid an unnecessary check on volatile bits.
	 */
	__kvm_tdp_mmu_write_spte(iter->sptep, 0);

	return 0;
}
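
/*
 * Illustrative caller pattern (a sketch, mirroring the write-protection and
 * zapping paths later in this file): on -EBUSY the iterator's old_spte has
 * already been refreshed, so the caller simply recomputes the new value and
 * retries:
 *
 *	retry:
 *		new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
 *		if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
 *			goto retry;
 */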

/*
 * tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
 * @kvm: KVM instance
 * @as_id: Address space ID, i.e. regular vs. SMM
 * @sptep: Pointer to the SPTE
 * @old_spte: The current value of the SPTE
 * @new_spte: The new value that will be set for the SPTE
 * @gfn: The base GFN that was (or will be) mapped by the SPTE
 * @level: The level _containing_ the SPTE (its parent PT's level)
 *
 * Returns the old SPTE value, which _may_ be different than @old_spte if the
 * SPTE had volatile bits.
 */
static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
			    u64 old_spte, u64 new_spte, gfn_t gfn, int level)
{
	lockdep_assert_held_write(&kvm->mmu_lock);

	/*
	 * No thread should be using this function to set SPTEs to or from the
	 * temporary removed SPTE value.
	 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
	 * should be used.  If operating under the MMU lock in write mode, the
	 * use of the removed SPTE should not be necessary.
	 */
	WARN_ON_ONCE(is_removed_spte(old_spte) || is_removed_spte(new_spte));

	old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);

	handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);
	return old_spte;
}

static inline void tdp_mmu_iter_set_spte(struct kvm *kvm, struct tdp_iter *iter,
					 u64 new_spte)
{
	WARN_ON_ONCE(iter->yielded);
	iter->old_spte = tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep,
					  iter->old_spte, new_spte,
					  iter->gfn, iter->level);
}

#define tdp_root_for_each_pte(_iter, _root, _start, _end) \
	for_each_tdp_pte(_iter, _root, _start, _end)

#define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)	\
	tdp_root_for_each_pte(_iter, _root, _start, _end)		\
		if (!is_shadow_present_pte(_iter.old_spte) ||		\
		    !is_last_spte(_iter.old_spte, _iter.level))		\
			continue;					\
		else

#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)		\
	for_each_tdp_pte(_iter, root_to_sp(_mmu->root.hpa), _start, _end)

/*
 * Yield if the MMU lock is contended or this thread needs to return control
 * to the scheduler.
 *
 * If this function should yield and flush is set, it will perform a remote
 * TLB flush before yielding.
 *
 * If this function yields, iter->yielded is set and the caller must skip to
 * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk
 * over the paging structures to allow the iterator to continue its traversal
 * from the paging structure root.
 *
 * Returns true if this function yielded.
 */
static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
							  struct tdp_iter *iter,
							  bool flush, bool shared)
{
	WARN_ON_ONCE(iter->yielded);

	/* Ensure forward progress has been made before yielding. */
	if (iter->next_last_level_gfn == iter->yielded_gfn)
		return false;

	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
		if (flush)
			kvm_flush_remote_tlbs(kvm);

		rcu_read_unlock();

		if (shared)
			cond_resched_rwlock_read(&kvm->mmu_lock);
		else
			cond_resched_rwlock_write(&kvm->mmu_lock);

		rcu_read_lock();

		WARN_ON_ONCE(iter->gfn > iter->next_last_level_gfn);

		iter->yielded = true;
	}

	return iter->yielded;
}

static inline gfn_t tdp_mmu_max_gfn_exclusive(void)
{
	/*
	 * Bound TDP MMU walks at host.MAXPHYADDR.  KVM disallows memslots with
	 * a gpa range that would exceed the max gfn, and KVM does not create
	 * MMIO SPTEs for "impossible" gfns, instead sending such accesses down
	 * the slow emulation path every time.
	 */
	return kvm_mmu_max_gfn() + 1;
}

static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
			       bool shared, int zap_level)
{
	struct tdp_iter iter;

	gfn_t end = tdp_mmu_max_gfn_exclusive();
	gfn_t start = 0;

	for_each_tdp_pte_min_level(iter, root, zap_level, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
			continue;

		if (!is_shadow_present_pte(iter.old_spte))
			continue;

		if (iter.level > zap_level)
			continue;

		if (!shared)
			tdp_mmu_iter_set_spte(kvm, &iter, 0);
		else if (tdp_mmu_set_spte_atomic(kvm, &iter, 0))
			goto retry;
	}
}

static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
			     bool shared)
{
	/*
	 * The root must have an elevated refcount so that it's reachable via
	 * mmu_notifier callbacks, which allows this path to yield and drop
	 * mmu_lock.  When handling an unmap/release mmu_notifier command, KVM
	 * must drop all references to relevant pages prior to completing the
	 * callback.  Dropping mmu_lock with an unreachable root would result
	 * in zapping SPTEs after a relevant mmu_notifier callback completes
	 * and lead to use-after-free as zapping a SPTE triggers "writeback" of
	 * dirty accessed bits to the SPTE's associated struct page.
	 */
	WARN_ON_ONCE(!refcount_read(&root->tdp_mmu_root_count));

	kvm_lockdep_assert_mmu_lock_held(kvm, shared);

	rcu_read_lock();

	/*
	 * To avoid RCU stalls due to recursively removing huge swaths of SPs,
	 * split the zap into two passes.  On the first pass, zap at the 1gb
	 * level, and then zap top-level SPs on the second pass.  "1gb" is not
	 * arbitrary, as KVM must be able to zap a 1gb shadow page without
	 * inducing a stall to allow in-place replacement with a 1gb hugepage.
	 *
	 * Because zapping a SP recurses on its children, stepping down to
	 * PG_LEVEL_4K in the iterator itself is unnecessary.
	 */
	__tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G);
	__tdp_mmu_zap_root(kvm, root, shared, root->role.level);

	rcu_read_unlock();
}

bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	u64 old_spte;

	/*
	 * This helper intentionally doesn't allow zapping a root shadow page,
	 * which doesn't have a parent page table and thus no associated entry.
	 */
	if (WARN_ON_ONCE(!sp->ptep))
		return false;

	old_spte = kvm_tdp_mmu_read_spte(sp->ptep);
	if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
		return false;

	tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0,
			 sp->gfn, sp->role.level + 1);

	return true;
}

/*
 * If can_yield is true, will release the MMU lock and reschedule if the
 * scheduler needs the CPU or there is contention on the MMU lock.  If this
 * function cannot yield, it will not release the MMU lock or reschedule and
 * the caller must ensure it does not supply too large a GFN range, or the
 * operation can cause a soft lockup.
 */
static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
			      gfn_t start, gfn_t end, bool can_yield, bool flush)
{
	struct tdp_iter iter;

	end = min(end, tdp_mmu_max_gfn_exclusive());

	lockdep_assert_held_write(&kvm->mmu_lock);

	rcu_read_lock();

	for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) {
		if (can_yield &&
		    tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
			flush = false;
			continue;
		}

		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		tdp_mmu_iter_set_spte(kvm, &iter, 0);
		flush = true;
	}

	rcu_read_unlock();

	/*
	 * Because this flow zaps _only_ leaf SPTEs, the caller doesn't need
	 * to provide RCU protection as no 'struct kvm_mmu_page' will be freed.
	 */
	return flush;
}

/*
 * Zap leaf SPTEs for the range of gfns, [start, end), for all roots.  Returns
 * true if a TLB flush is needed before releasing the MMU lock, i.e. if one or
 * more SPTEs were zapped since the MMU lock was last acquired.
 */
bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush)
{
	struct kvm_mmu_page *root;

	lockdep_assert_held_write(&kvm->mmu_lock);
	for_each_tdp_mmu_root_yield_safe(kvm, root)
		flush = tdp_mmu_zap_leafs(kvm, root, start, end, true, flush);

	return flush;
}

void kvm_tdp_mmu_zap_all(struct kvm *kvm)
{
	struct kvm_mmu_page *root;

	/*
	 * Zap all roots, including invalid roots, as all SPTEs must be dropped
	 * before returning to the caller.  Zap directly even if the root is
	 * also being zapped by a worker.  Walking zapped top-level SPTEs isn't
	 * all that expensive and mmu_lock is already held, which means the
	 * worker has yielded, i.e. flushing the work instead of zapping here
	 * isn't guaranteed to be any faster.
	 *
	 * A TLB flush is unnecessary, KVM zaps everything if and only if the
	 * VM is being destroyed or the userspace VMM has exited.  In both
	 * cases, KVM_RUN is unreachable, i.e. no vCPUs will ever service the
	 * request.
	 */
	lockdep_assert_held_write(&kvm->mmu_lock);
	for_each_tdp_mmu_root_yield_safe(kvm, root)
		tdp_mmu_zap_root(kvm, root, false);
}
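
/*
 * Illustrative usage (a sketch of the common-MMU caller pattern, not code
 * from this file): the returned "flush" is accumulated across zaps and the
 * TLBs are flushed once before mmu_lock is dropped:
 *
 *	write_lock(&kvm->mmu_lock);
 *	flush = kvm_tdp_mmu_zap_leafs(kvm, gfn_start, gfn_end, flush);
 *	if (flush)
 *		kvm_flush_remote_tlbs(kvm);
 *	write_unlock(&kvm->mmu_lock);
 */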

/*
 * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast
 * zap" completes.
 */
void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
{
	struct kvm_mmu_page *root;

	read_lock(&kvm->mmu_lock);

	for_each_tdp_mmu_root_yield_safe(kvm, root) {
		if (!root->tdp_mmu_scheduled_root_to_zap)
			continue;

		root->tdp_mmu_scheduled_root_to_zap = false;
		KVM_BUG_ON(!root->role.invalid, kvm);

		/*
		 * A TLB flush is not necessary as KVM performs a local TLB
		 * flush when allocating a new root (see kvm_mmu_load()), and
		 * when migrating a vCPU to a different pCPU.  Note, the local
		 * TLB flush on reuse also invalidates paging-structure-cache
		 * entries, i.e. TLB entries for intermediate paging structures,
		 * that may be zapped, as such entries are associated with the
		 * ASID on both VMX and SVM.
		 */
		tdp_mmu_zap_root(kvm, root, true);

		/*
		 * The reference needs to be put *after* zapping the root, as
		 * the root must be reachable by mmu_notifiers while it's being
		 * zapped.
		 */
		kvm_tdp_mmu_put_root(kvm, root);
	}

	read_unlock(&kvm->mmu_lock);
}

/*
 * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
 * is about to be zapped, e.g. in response to a memslots update.  The actual
 * zapping is done separately so that it happens with mmu_lock held for read,
 * whereas invalidating roots must be done with mmu_lock held for write (unless
 * the VM is being destroyed).
 *
 * Note, kvm_tdp_mmu_zap_invalidated_roots() is gifted the TDP MMU's reference.
 * See kvm_tdp_mmu_get_vcpu_root_hpa().
 */
void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
{
	struct kvm_mmu_page *root;

	/*
	 * mmu_lock must be held for write to ensure that a root doesn't become
	 * invalid while there are active readers (invalidating a root while
	 * there are active readers may or may not be problematic in practice,
	 * but it's uncharted territory and not supported).
	 *
	 * Waive the assertion if there are no users of @kvm, i.e. the VM is
	 * being destroyed after all references have been put, or if no vCPUs
	 * have been created (which means there are no roots), i.e. the VM is
	 * being destroyed in an error path of KVM_CREATE_VM.
	 */
	if (IS_ENABLED(CONFIG_PROVE_LOCKING) &&
	    refcount_read(&kvm->users_count) && kvm->created_vcpus)
		lockdep_assert_held_write(&kvm->mmu_lock);

	/*
	 * As above, mmu_lock isn't held when destroying the VM!  There can't
	 * be other references to @kvm, i.e. nothing else can invalidate roots
	 * or get/put references to roots.
	 */
	list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
		/*
		 * Note, invalid roots can outlive a memslot update!  Invalid
		 * roots must be *zapped* before the memslot update completes,
		 * but a different task can acquire a reference and keep the
		 * root alive after it's been zapped.
		 */
		if (!root->role.invalid) {
			root->tdp_mmu_scheduled_root_to_zap = true;
			root->role.invalid = true;
		}
	}
}

/*
 * Installs a last-level SPTE to handle a TDP page fault.
 * (NPT/EPT violation/misconfiguration)
 */
static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
					   struct kvm_page_fault *fault,
					   struct tdp_iter *iter)
{
	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
	u64 new_spte;
	int ret = RET_PF_FIXED;
	bool wrprot = false;

	if (WARN_ON_ONCE(sp->role.level != fault->goal_level))
		return RET_PF_RETRY;

	if (unlikely(!fault->slot))
		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
	else
		wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
				   fault->pfn, iter->old_spte, fault->prefetch,
				   true, fault->map_writable, &new_spte);

	if (new_spte == iter->old_spte)
		ret = RET_PF_SPURIOUS;
	else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
		return RET_PF_RETRY;
	else if (is_shadow_present_pte(iter->old_spte) &&
		 !is_last_spte(iter->old_spte, iter->level))
		kvm_flush_remote_tlbs_gfn(vcpu->kvm, iter->gfn, iter->level);

	/*
	 * If the page fault was caused by a write but the page is write
	 * protected, emulation is needed.  If the emulation was skipped,
	 * the vCPU would have the same fault again.
	 */
	if (wrprot && fault->write)
		ret = RET_PF_EMULATE;

	/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
	if (unlikely(is_mmio_spte(new_spte))) {
		vcpu->stat.pf_mmio_spte_created++;
		trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
				     new_spte);
		ret = RET_PF_EMULATE;
	} else {
		trace_kvm_mmu_set_spte(iter->level, iter->gfn,
				       rcu_dereference(iter->sptep));
	}

	return ret;
}

/*
 * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the
 * provided page table.
 *
 * @kvm: kvm instance
 * @iter: a tdp_iter instance currently on the SPTE that should be set
 * @sp: The new TDP page table to install.
 * @shared: This operation is running under the MMU lock in read mode.
 *
 * Returns: 0 if the new page table was installed.  Non-0 if the page table
 *	    could not be installed (e.g. the atomic compare-exchange failed).
 */
static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
			   struct kvm_mmu_page *sp, bool shared)
{
	u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled());
	int ret = 0;

	if (shared) {
		ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
		if (ret)
			return ret;
	} else {
		tdp_mmu_iter_set_spte(kvm, iter, spte);
	}

	tdp_account_mmu_page(kvm, sp);

	return 0;
}

static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
				   struct kvm_mmu_page *sp, bool shared);

/*
 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
 * page tables and SPTEs to translate the faulting guest physical address.
 */
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	struct kvm *kvm = vcpu->kvm;
	struct tdp_iter iter;
	struct kvm_mmu_page *sp;
	int ret = RET_PF_RETRY;

	kvm_mmu_hugepage_adjust(vcpu, fault);

	trace_kvm_mmu_spte_requested(fault);

	rcu_read_lock();

	tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
		int r;

		if (fault->nx_huge_page_workaround_enabled)
			disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);

		/*
		 * If SPTE has been frozen by another thread, just give up and
		 * retry, avoiding unnecessary page table allocation and free.
		 */
		if (is_removed_spte(iter.old_spte))
			goto retry;

		if (iter.level == fault->goal_level)
			goto map_target_level;

		/* Step down into the lower level page table if it exists. */
		if (is_shadow_present_pte(iter.old_spte) &&
		    !is_large_pte(iter.old_spte))
			continue;

		/*
		 * The SPTE is either non-present or points to a huge page that
		 * needs to be split.
		 */
		sp = tdp_mmu_alloc_sp(vcpu);
		tdp_mmu_init_child_sp(sp, &iter);

		sp->nx_huge_page_disallowed = fault->huge_page_disallowed;

		if (is_shadow_present_pte(iter.old_spte))
			r = tdp_mmu_split_huge_page(kvm, &iter, sp, true);
		else
			r = tdp_mmu_link_sp(kvm, &iter, sp, true);

		/*
		 * Force the guest to retry if installing an upper level SPTE
		 * failed, e.g. because a different task modified the SPTE.
		 */
		if (r) {
			tdp_mmu_free_sp(sp);
			goto retry;
		}

		if (fault->huge_page_disallowed &&
		    fault->req_level >= iter.level) {
			spin_lock(&kvm->arch.tdp_mmu_pages_lock);
			if (sp->nx_huge_page_disallowed)
				track_possible_nx_huge_page(kvm, sp);
			spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
		}
	}

	/*
	 * The walk aborted before reaching the target level, e.g. because the
	 * iterator detected an upper level SPTE was frozen during traversal.
	 */
	WARN_ON_ONCE(iter.level == fault->goal_level);
	goto retry;

map_target_level:
	ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);

retry:
	rcu_read_unlock();
	return ret;
}

bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
				 bool flush)
{
	struct kvm_mmu_page *root;

	__for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, false)
		flush = tdp_mmu_zap_leafs(kvm, root, range->start, range->end,
					  range->may_block, flush);

	return flush;
}

typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
			      struct kvm_gfn_range *range);

static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
						   struct kvm_gfn_range *range,
						   tdp_handler_t handler)
{
	struct kvm_mmu_page *root;
	struct tdp_iter iter;
	bool ret = false;

	/*
	 * Don't support rescheduling, none of the MMU notifiers that funnel
	 * into this helper allow blocking; it'd be dead, wasteful code.
	 */
	for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
		rcu_read_lock();

		tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
			ret |= handler(kvm, &iter, range);

		rcu_read_unlock();
	}

	return ret;
}
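
/*
 * Illustrative sketch of a tdp_handler_t (hypothetical, not part of this
 * file): a handler only inspects or updates the leaf SPTE the iterator is
 * currently on and reports whether anything of interest was found, e.g.:
 *
 *	static bool example_handler(struct kvm *kvm, struct tdp_iter *iter,
 *				    struct kvm_gfn_range *range)
 *	{
 *		return is_accessed_spte(iter->old_spte);
 *	}
 *
 * which is essentially test_age_gfn() below.
 */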

/*
 * Mark the SPTEs in the range of GFNs [start, end) unaccessed and return
 * non-zero if any of the GFNs in the range have been accessed.
 *
 * No need to mark the corresponding PFN as accessed as this call is coming
 * from the clear_young() or clear_flush_young() notifier, which uses the
 * return value to determine if the page has been accessed.
 */
static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
			  struct kvm_gfn_range *range)
{
	u64 new_spte;

	/* If we have a non-accessed entry we don't need to change the pte. */
	if (!is_accessed_spte(iter->old_spte))
		return false;

	if (spte_ad_enabled(iter->old_spte)) {
		iter->old_spte = tdp_mmu_clear_spte_bits(iter->sptep,
							 iter->old_spte,
							 shadow_accessed_mask,
							 iter->level);
		new_spte = iter->old_spte & ~shadow_accessed_mask;
	} else {
		/*
		 * Capture the dirty status of the page, so that it doesn't get
		 * lost when the SPTE is marked for access tracking.
		 */
		if (is_writable_pte(iter->old_spte))
			kvm_set_pfn_dirty(spte_to_pfn(iter->old_spte));

		new_spte = mark_spte_for_access_track(iter->old_spte);
		iter->old_spte = kvm_tdp_mmu_write_spte(iter->sptep,
							iter->old_spte, new_spte,
							iter->level);
	}

	trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level,
				       iter->old_spte, new_spte);
	return true;
}

bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
	return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
}

static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
			 struct kvm_gfn_range *range)
{
	return is_accessed_spte(iter->old_spte);
}

bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
}

static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
			 struct kvm_gfn_range *range)
{
	u64 new_spte;

	/* Huge pages aren't expected to be modified without first being zapped. */
	WARN_ON_ONCE(pte_huge(range->arg.pte) || range->start + 1 != range->end);

	if (iter->level != PG_LEVEL_4K ||
	    !is_shadow_present_pte(iter->old_spte))
		return false;

	/*
	 * Note, when changing a read-only SPTE, it's not strictly necessary to
	 * zero the SPTE before setting the new PFN, but doing so preserves the
	 * invariant that the PFN of a present leaf SPTE can never change.
	 * See handle_changed_spte().
	 */
	tdp_mmu_iter_set_spte(kvm, iter, 0);

	if (!pte_write(range->arg.pte)) {
		new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
								  pte_pfn(range->arg.pte));

		tdp_mmu_iter_set_spte(kvm, iter, new_spte);
	}

	return true;
}

/*
 * Handle the changed_pte MMU notifier for the TDP MMU.
 * data is a pointer to the new pte_t mapping the HVA specified by the MMU
 * notifier.
 * Returns non-zero if a flush is needed before releasing the MMU lock.
 */
bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	/*
	 * No need to handle the remote TLB flush under RCU protection, the
	 * target SPTE _must_ be a leaf SPTE, i.e. cannot result in freeing a
	 * shadow page.  See the WARN on pfn_changed in handle_changed_spte().
	 */
	return kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
}

/*
 * Remove write access from all SPTEs at or above min_level that map GFNs
 * [start, end).  Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			     gfn_t start, gfn_t end, int min_level)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	rcu_read_lock();

	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);

	for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
			continue;

		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level) ||
		    !(iter.old_spte & PT_WRITABLE_MASK))
			continue;

		new_spte = iter.old_spte & ~PT_WRITABLE_MASK;

		if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
			goto retry;

		spte_set = true;
	}

	rcu_read_unlock();
	return spte_set;
}

/*
 * Remove write access from all the SPTEs mapping GFNs in the memslot.  Will
 * only affect leaf SPTEs down to min_level.
 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
 */
bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
			     const struct kvm_memory_slot *slot, int min_level)
{
	struct kvm_mmu_page *root;
	bool spte_set = false;

	lockdep_assert_held_read(&kvm->mmu_lock);

	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
		spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
					     slot->base_gfn + slot->npages,
					     min_level);

	return spte_set;
}

static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
{
	struct kvm_mmu_page *sp;

	gfp |= __GFP_ZERO;

	sp = kmem_cache_alloc(mmu_page_header_cache, gfp);
	if (!sp)
		return NULL;

	sp->spt = (void *)__get_free_page(gfp);
	if (!sp->spt) {
		kmem_cache_free(mmu_page_header_cache, sp);
		return NULL;
	}

	return sp;
}

static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
						       struct tdp_iter *iter,
						       bool shared)
{
	struct kvm_mmu_page *sp;

	kvm_lockdep_assert_mmu_lock_held(kvm, shared);

	/*
	 * Since we are allocating while under the MMU lock we have to be
	 * careful about GFP flags.  Use GFP_NOWAIT to avoid blocking on direct
	 * reclaim and to avoid making any filesystem callbacks (which can end
	 * up invoking KVM MMU notifiers, resulting in a deadlock).
	 *
	 * If this allocation fails we drop the lock and retry with reclaim
	 * allowed.
	 */
	sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT);
	if (sp)
		return sp;

	rcu_read_unlock();

	if (shared)
		read_unlock(&kvm->mmu_lock);
	else
		write_unlock(&kvm->mmu_lock);

	iter->yielded = true;
	sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT);

	if (shared)
		read_lock(&kvm->mmu_lock);
	else
		write_lock(&kvm->mmu_lock);

	rcu_read_lock();

	return sp;
}
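
/*
 * Note, like tdp_mmu_iter_cond_resched(), dropping mmu_lock above invalidates
 * the in-progress walk, which is why iter->yielded is set; the caller is
 * expected to restart the walk and keep the freshly allocated page for a
 * subsequent iteration (see tdp_mmu_split_huge_pages_root()).
 */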

/* Note, the caller is responsible for initializing @sp. */
static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
				   struct kvm_mmu_page *sp, bool shared)
{
	const u64 huge_spte = iter->old_spte;
	const int level = iter->level;
	int ret, i;

	/*
	 * No need for atomics when writing to sp->spt since the page table has
	 * not been linked in yet and thus is not reachable from any other CPU.
	 */
	for (i = 0; i < SPTE_ENT_PER_PAGE; i++)
		sp->spt[i] = make_huge_page_split_spte(kvm, huge_spte, sp->role, i);

	/*
	 * Replace the huge spte with a pointer to the populated lower level
	 * page table.  Since we are making this change without a TLB flush,
	 * vCPUs will see a mix of the split mappings and the original huge
	 * mapping, depending on what's currently in their TLB.  This is fine
	 * from a correctness standpoint since the translation will be the same
	 * either way.
	 */
	ret = tdp_mmu_link_sp(kvm, iter, sp, shared);
	if (ret)
		goto out;

	/*
	 * tdp_mmu_link_sp() will handle subtracting the huge page we are
	 * overwriting from the page stats.  But we have to manually update
	 * the page stats with the new present child pages.
	 */
	kvm_update_page_stats(kvm, level - 1, SPTE_ENT_PER_PAGE);

out:
	trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret);
	return ret;
}

static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
					 struct kvm_mmu_page *root,
					 gfn_t start, gfn_t end,
					 int target_level, bool shared)
{
	struct kvm_mmu_page *sp = NULL;
	struct tdp_iter iter;
	int ret = 0;

	rcu_read_lock();

	/*
	 * Traverse the page table splitting all huge pages above the target
	 * level into one lower level.  For example, if we encounter a 1GB page
	 * we split it into 512 2MB pages.
	 *
	 * Since the TDP iterator uses a pre-order traversal, we are guaranteed
	 * to visit an SPTE before ever visiting its children, which means we
	 * will correctly recursively split huge pages that are more than one
	 * level above the target level (e.g. splitting a 1GB page into 512
	 * 2MB pages, and then splitting each of those into 512 4KB pages).
	 */
	for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
			continue;

		if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
			continue;

		if (!sp) {
			sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared);
			if (!sp) {
				ret = -ENOMEM;
				trace_kvm_mmu_split_huge_page(iter.gfn,
							      iter.old_spte,
							      iter.level, ret);
				break;
			}

			if (iter.yielded)
				continue;
		}

		tdp_mmu_init_child_sp(sp, &iter);

		if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
			goto retry;

		sp = NULL;
	}

	rcu_read_unlock();

	/*
	 * It's possible to exit the loop having never used the last sp if, for
	 * example, a vCPU doing HugePage NX splitting wins the race and
	 * installs its own sp in place of the last sp we tried to split.
	 */
	if (sp)
		tdp_mmu_free_sp(sp);

	return ret;
}

/*
 * Try to split all huge pages mapped by the TDP MMU down to the target level.
 */
void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
				      const struct kvm_memory_slot *slot,
				      gfn_t start, gfn_t end,
				      int target_level, bool shared)
{
	struct kvm_mmu_page *root;
	int r = 0;

	kvm_lockdep_assert_mmu_lock_held(kvm, shared);
	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id) {
		r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared);
		if (r) {
			kvm_tdp_mmu_put_root(kvm, root);
			break;
		}
	}
}

/*
 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot.  If
 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
 * If AD bits are not enabled, this will require clearing the writable bit on
 * each SPTE.  Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
				  gfn_t start, gfn_t end)
{
	u64 dbit = kvm_ad_enabled() ? shadow_dirty_mask : PT_WRITABLE_MASK;
	struct tdp_iter iter;
	bool spte_set = false;

	rcu_read_lock();

	tdp_root_for_each_pte(iter, root, start, end) {
retry:
		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
			continue;

		KVM_MMU_WARN_ON(kvm_ad_enabled() &&
				spte_ad_need_write_protect(iter.old_spte));

		if (!(iter.old_spte & dbit))
			continue;

		if (tdp_mmu_set_spte_atomic(kvm, &iter, iter.old_spte & ~dbit))
			goto retry;

		spte_set = true;
	}

	rcu_read_unlock();
	return spte_set;
}

/*
 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot.  If
 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
 * If AD bits are not enabled, this will require clearing the writable bit on
 * each SPTE.  Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
				  const struct kvm_memory_slot *slot)
{
	struct kvm_mmu_page *root;
	bool spte_set = false;

	lockdep_assert_held_read(&kvm->mmu_lock);
	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
		spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
						  slot->base_gfn + slot->npages);

	return spte_set;
}
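
/*
 * Illustrative flow (a sketch; the actual call sites live in common MMU
 * code): dirty logging clears D-bits (or W-bits) in two stages, first for
 * the whole slot when dirty logging is enabled, then for the specific 4K
 * GFNs reported in each dirty-log round:
 *
 *	if (kvm_tdp_mmu_clear_dirty_slot(kvm, slot))
 *		kvm_flush_remote_tlbs(kvm);
 *	...
 *	kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot, gfn, mask, wrprot);
 */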

/*
 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
 * set in mask, starting at gfn.  The given memslot is expected to contain all
 * the GFNs represented by set bits in the mask.  If AD bits are enabled,
 * clearing the dirty status will involve clearing the dirty bit on each SPTE
 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
 */
static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
				  gfn_t gfn, unsigned long mask, bool wrprot)
{
	u64 dbit = (wrprot || !kvm_ad_enabled()) ? PT_WRITABLE_MASK :
						   shadow_dirty_mask;
	struct tdp_iter iter;

	lockdep_assert_held_write(&kvm->mmu_lock);

	rcu_read_lock();

	tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
				   gfn + BITS_PER_LONG) {
		if (!mask)
			break;

		KVM_MMU_WARN_ON(kvm_ad_enabled() &&
				spte_ad_need_write_protect(iter.old_spte));

		if (iter.level > PG_LEVEL_4K ||
		    !(mask & (1UL << (iter.gfn - gfn))))
			continue;

		mask &= ~(1UL << (iter.gfn - gfn));

		if (!(iter.old_spte & dbit))
			continue;

		iter.old_spte = tdp_mmu_clear_spte_bits(iter.sptep,
							iter.old_spte, dbit,
							iter.level);

		trace_kvm_tdp_mmu_spte_changed(iter.as_id, iter.gfn, iter.level,
					       iter.old_spte,
					       iter.old_spte & ~dbit);
		kvm_set_pfn_dirty(spte_to_pfn(iter.old_spte));
	}

	rcu_read_unlock();
}

/*
 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
 * set in mask, starting at gfn.  The given memslot is expected to contain all
 * the GFNs represented by set bits in the mask.  If AD bits are enabled,
 * clearing the dirty status will involve clearing the dirty bit on each SPTE
 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
 */
void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
				       struct kvm_memory_slot *slot,
				       gfn_t gfn, unsigned long mask,
				       bool wrprot)
{
	struct kvm_mmu_page *root;

	for_each_tdp_mmu_root(kvm, root, slot->as_id)
		clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
}

static void zap_collapsible_spte_range(struct kvm *kvm,
				       struct kvm_mmu_page *root,
				       const struct kvm_memory_slot *slot)
{
	gfn_t start = slot->base_gfn;
	gfn_t end = start + slot->npages;
	struct tdp_iter iter;
	int max_mapping_level;

	rcu_read_lock();

	for_each_tdp_pte_min_level(iter, root, PG_LEVEL_2M, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
			continue;

		if (iter.level > KVM_MAX_HUGEPAGE_LEVEL ||
		    !is_shadow_present_pte(iter.old_spte))
			continue;

		/*
		 * Don't zap leaf SPTEs; if a leaf SPTE could be replaced with
		 * a large page, its parent would have been zapped instead of
		 * stepping down.
		 */
		if (is_last_spte(iter.old_spte, iter.level))
			continue;

		/*
		 * If iter.gfn resides outside of the slot, i.e. the page for
		 * the current level overlaps but is not contained by the slot,
		 * then the SPTE can't be made huge.  More importantly, trying
		 * to query that info from slot->arch.lpage_info will cause an
		 * out-of-bounds access.
		 */
		if (iter.gfn < start || iter.gfn >= end)
			continue;

		max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot,
							      iter.gfn, PG_LEVEL_NUM);
		if (max_mapping_level < iter.level)
			continue;

		/* Note, a successful atomic zap also does a remote TLB flush. */
		if (tdp_mmu_zap_spte_atomic(kvm, &iter))
			goto retry;
	}

	rcu_read_unlock();
}

/*
 * Zap non-leaf SPTEs (and free their associated page tables) which could
 * be replaced by huge pages, for GFNs within the slot.
 */
void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
				       const struct kvm_memory_slot *slot)
{
	struct kvm_mmu_page *root;

	lockdep_assert_held_read(&kvm->mmu_lock);
	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
		zap_collapsible_spte_range(kvm, root, slot);
}

/*
 * Removes write access on the last level SPTE mapping this GFN and unsets the
 * MMU-writable bit to ensure future writes continue to be intercepted.
 * Returns true if an SPTE was set and a TLB flush is needed.
 */
static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
			      gfn_t gfn, int min_level)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);

	rcu_read_lock();

	for_each_tdp_pte_min_level(iter, root, min_level, gfn, gfn + 1) {
		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		new_spte = iter.old_spte &
			~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);

		if (new_spte == iter.old_spte)
			break;

		tdp_mmu_iter_set_spte(kvm, &iter, new_spte);
		spte_set = true;
	}

	rcu_read_unlock();

	return spte_set;
}

/*
 * Removes write access on the last level SPTE mapping this GFN and unsets the
 * MMU-writable bit to ensure future writes continue to be intercepted.
 * Returns true if an SPTE was set and a TLB flush is needed.
 */
bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
				   struct kvm_memory_slot *slot, gfn_t gfn,
				   int min_level)
{
	struct kvm_mmu_page *root;
	bool spte_set = false;

	lockdep_assert_held_write(&kvm->mmu_lock);
	for_each_tdp_mmu_root(kvm, root, slot->as_id)
		spte_set |= write_protect_gfn(kvm, root, gfn, min_level);

	return spte_set;
}

/*
 * Return the level of the lowest level SPTE added to sptes.
 * That SPTE may be non-present.
 *
 * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
 */
int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
			 int *root_level)
{
	struct tdp_iter iter;
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	gfn_t gfn = addr >> PAGE_SHIFT;
	int leaf = -1;

	*root_level = vcpu->arch.mmu->root_role.level;

	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
		leaf = iter.level;
		sptes[leaf] = iter.old_spte;
	}

	return leaf;
}
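
/*
 * Illustrative usage (a sketch): per the contract above, callers wrap the
 * walk in the lockless helpers, e.g. to capture the SPTEs for a faulting
 * address:
 *
 *	kvm_tdp_mmu_walk_lockless_begin();
 *	leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root_level);
 *	kvm_tdp_mmu_walk_lockless_end();
 */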

/*
 * Returns the last level spte pointer of the shadow page walk for the given
 * gpa, and sets *spte to the spte value.  This spte may be non-present.  If
 * no walk could be performed, returns NULL and *spte does not contain valid
 * data.
 *
 * Contract:
 *  - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
 *  - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
 *
 * WARNING: This function is only intended to be called during fast_page_fault.
 */
u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
					u64 *spte)
{
	struct tdp_iter iter;
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	gfn_t gfn = addr >> PAGE_SHIFT;
	tdp_ptep_t sptep = NULL;

	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
		*spte = iter.old_spte;
		sptep = iter.sptep;
	}

	/*
	 * Perform the rcu_dereference to get the raw spte pointer value since
	 * we are passing it up to fast_page_fault, which is shared with the
	 * legacy MMU and thus does not retain the TDP MMU-specific __rcu
	 * annotation.
	 *
	 * This is safe since fast_page_fault obeys the contracts of this
	 * function as well as all TDP MMU contracts around modifying SPTEs
	 * outside of mmu_lock.
	 */
	return rcu_dereference(sptep);
}