// SPDX-License-Identifier: GPL-2.0

#include "mmu.h"
#include "mmu_internal.h"
#include "mmutrace.h"
#include "tdp_iter.h"
#include "tdp_mmu.h"
#include "spte.h"

#include <asm/cmpxchg.h>
#include <trace/events/kvm.h>

static bool __read_mostly tdp_mmu_enabled = true;
module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);

/* Initializes the TDP MMU for the VM, if enabled. */
int kvm_mmu_init_tdp_mmu(struct kvm *kvm)
{
	struct workqueue_struct *wq;

	if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
		return 0;

	wq = alloc_workqueue("kvm", WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 0);
	if (!wq)
		return -ENOMEM;

	/* This should not be changed for the lifetime of the VM. */
	kvm->arch.tdp_mmu_enabled = true;
	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
	kvm->arch.tdp_mmu_zap_wq = wq;
	return 1;
}

/* Arbitrarily returns true so that this may be used in if statements. */
static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
							     bool shared)
{
	if (shared)
		lockdep_assert_held_read(&kvm->mmu_lock);
	else
		lockdep_assert_held_write(&kvm->mmu_lock);

	return true;
}

void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
{
	if (!kvm->arch.tdp_mmu_enabled)
		return;

	/* Also waits for any queued work items. */
	destroy_workqueue(kvm->arch.tdp_mmu_zap_wq);

	WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages));
	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));

	/*
	 * Ensure that all the outstanding RCU callbacks to free shadow pages
	 * can run before the VM is torn down.  Work items on tdp_mmu_zap_wq
	 * can call kvm_tdp_mmu_put_root and create new callbacks.
	 */
	rcu_barrier();
}

static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
{
	free_page((unsigned long)sp->spt);
	kmem_cache_free(mmu_page_header_cache, sp);
}

/*
 * This is called through call_rcu in order to free TDP page table memory
 * safely with respect to other kernel threads that may be operating on
 * the memory.
 * By only accessing TDP MMU page table memory in an RCU read critical
 * section, and freeing it after a grace period, lockless access to that
 * memory won't use it after it is freed.
 */
static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
{
	struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
					       rcu_head);

	tdp_mmu_free_sp(sp);
}

static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
			     bool shared);

static void tdp_mmu_zap_root_work(struct work_struct *work)
{
	struct kvm_mmu_page *root = container_of(work, struct kvm_mmu_page,
						 tdp_mmu_async_work);
	struct kvm *kvm = root->tdp_mmu_async_data;

	read_lock(&kvm->mmu_lock);

	/*
	 * A TLB flush is not necessary as KVM performs a local TLB flush when
	 * allocating a new root (see kvm_mmu_load()), and when migrating a
	 * vCPU to a different pCPU.  Note, the local TLB flush on reuse also
	 * invalidates any paging-structure-cache entries, i.e. TLB entries for
	 * intermediate paging structures, that may be zapped, as such entries
	 * are associated with the ASID on both VMX and SVM.
	 */
	tdp_mmu_zap_root(kvm, root, true);

	/*
	 * Drop the refcount using kvm_tdp_mmu_put_root() to test its logic for
	 * avoiding an infinite loop.
	 * By design, the root is reachable while it's being asynchronously
	 * zapped, thus a different task can put its last reference, i.e.
	 * flowing through kvm_tdp_mmu_put_root() for an asynchronously zapped
	 * root is unavoidable.
	 */
	kvm_tdp_mmu_put_root(kvm, root, true);

	read_unlock(&kvm->mmu_lock);
}

static void tdp_mmu_schedule_zap_root(struct kvm *kvm, struct kvm_mmu_page *root)
{
	root->tdp_mmu_async_data = kvm;
	INIT_WORK(&root->tdp_mmu_async_work, tdp_mmu_zap_root_work);
	queue_work(kvm->arch.tdp_mmu_zap_wq, &root->tdp_mmu_async_work);
}

static inline bool kvm_tdp_root_mark_invalid(struct kvm_mmu_page *page)
{
	union kvm_mmu_page_role role = page->role;
	role.invalid = true;

	/* No need to use cmpxchg, only the invalid bit can change. */
	role.word = xchg(&page->role.word, role.word);
	return role.invalid;
}

void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
			  bool shared)
{
	kvm_lockdep_assert_mmu_lock_held(kvm, shared);

	if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
		return;

	WARN_ON(!root->tdp_mmu_page);

	/*
	 * The root now has refcount=0.  It is valid, but readers already
	 * cannot acquire a reference to it because kvm_tdp_mmu_get_root()
	 * rejects it.  This remains true for the rest of the execution
	 * of this function, because readers visit valid roots only
	 * (except for tdp_mmu_zap_root_work(), which however
	 * does not acquire any reference itself).
	 *
	 * Even though there are flows that need to visit all roots for
	 * correctness, they all take mmu_lock for write, so they cannot yet
	 * run concurrently.  The same is true after kvm_tdp_root_mark_invalid,
	 * since the root still has refcount=0.
	 *
	 * However, tdp_mmu_zap_root can yield, and writers do not expect to
	 * see refcount=0 (see for example kvm_tdp_mmu_invalidate_all_roots()).
	 * So the root temporarily gets an extra reference, going to refcount=1
	 * while staying invalid.  Readers still cannot acquire any reference;
	 * but writers are now allowed to run if tdp_mmu_zap_root yields and
	 * they might take an extra reference if they themselves yield.
	 * Therefore, when the reference is given back by the worker,
	 * there is no guarantee that the refcount is still 1.  If not, whoever
	 * puts the last reference will free the page, but they will not have
	 * to zap the root because a root cannot go from invalid to valid.
	 */
	if (!kvm_tdp_root_mark_invalid(root)) {
		refcount_set(&root->tdp_mmu_root_count, 1);

		/*
		 * Zapping the root in a worker is not just "nice to have";
		 * it is required because kvm_tdp_mmu_invalidate_all_roots()
		 * skips already-invalid roots.  If kvm_tdp_mmu_put_root() did
		 * not add the root to the workqueue, kvm_tdp_mmu_zap_all_fast()
		 * might return with some roots not zapped yet.
		 */
		tdp_mmu_schedule_zap_root(kvm, root);
		return;
	}

	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	list_del_rcu(&root->link);
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
	call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
}

/*
 * Returns the next root after @prev_root (or the first root if @prev_root is
 * NULL).  A reference to the returned root is acquired, and the reference to
 * @prev_root is released (the caller obviously must hold a reference to
 * @prev_root if it's non-NULL).
 *
 * If @only_valid is true, invalid roots are skipped.
 *
 * Returns NULL if the end of tdp_mmu_roots was reached.
 */
static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
					      struct kvm_mmu_page *prev_root,
					      bool shared, bool only_valid)
{
	struct kvm_mmu_page *next_root;

	rcu_read_lock();

	if (prev_root)
		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						  &prev_root->link,
						  typeof(*prev_root), link);
	else
		next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						   typeof(*next_root), link);

	while (next_root) {
		if ((!only_valid || !next_root->role.invalid) &&
		    kvm_tdp_mmu_get_root(next_root))
			break;

		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
				&next_root->link, typeof(*next_root), link);
	}

	rcu_read_unlock();

	if (prev_root)
		kvm_tdp_mmu_put_root(kvm, prev_root, shared);

	return next_root;
}

/*
 * Note: this iterator gets and puts references to the roots it iterates over.
 * This makes it safe to release the MMU lock and yield within the loop, but
 * if exiting the loop early, the caller must drop the reference to the most
 * recent root. (Unless keeping a live reference is desirable.)
 *
 * If shared is set, this function is operating under the MMU lock in read
 * mode. In the unlikely event that this thread must free a root, the lock
 * will be temporarily dropped and reacquired in write mode.
 */
#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, _only_valid)\
	for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, _only_valid);	\
	     _root;								\
	     _root = tdp_mmu_next_root(_kvm, _root, _shared, _only_valid))	\
		if (kvm_lockdep_assert_mmu_lock_held(_kvm, _shared) &&		\
		    kvm_mmu_page_as_id(_root) != _as_id) {			\
		} else

#define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared)	\
	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true)

#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id)			\
	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, false, false)

/*
 * Iterate over all TDP MMU roots.  Requires that mmu_lock be held for write,
 * the implication being that any flow that holds mmu_lock for read is
 * inherently yield-friendly and should use the yield-safe variant above.
 * Holding mmu_lock for write obviates the need for RCU protection as the list
 * is guaranteed to be stable.
 */
#define for_each_tdp_mmu_root(_kvm, _root, _as_id)			\
	list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)	\
		if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) &&	\
		    kvm_mmu_page_as_id(_root) != _as_id) {		\
		} else

static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu_page *sp;

	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);

	return sp;
}

static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
			    gfn_t gfn, union kvm_mmu_page_role role)
{
	INIT_LIST_HEAD(&sp->possible_nx_huge_page_link);

	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);

	sp->role = role;
	sp->gfn = gfn;
	sp->ptep = sptep;
	sp->tdp_mmu_page = true;

	trace_kvm_mmu_get_page(sp, true);
}

static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
				  struct tdp_iter *iter)
{
	struct kvm_mmu_page *parent_sp;
	union kvm_mmu_page_role role;

	parent_sp = sptep_to_sp(rcu_dereference(iter->sptep));

	role = parent_sp->role;
	role.level--;

	tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
}

hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
{
	union kvm_mmu_page_role role = vcpu->arch.mmu->root_role;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_mmu_page *root;

	lockdep_assert_held_write(&kvm->mmu_lock);

	/*
	 * Check for an existing root before allocating a new one.  Note, the
	 * role check prevents consuming an invalid root.
	 */
	for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
		if (root->role.word == role.word &&
		    kvm_tdp_mmu_get_root(root))
			goto out;
	}

	root = tdp_mmu_alloc_sp(vcpu);
	tdp_mmu_init_sp(root, NULL, 0, role);

	refcount_set(&root->tdp_mmu_root_count, 1);

	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);

out:
	return __pa(root->spt);
}

static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level,
				bool shared);

static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
{
	if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
		return;

	if (is_accessed_spte(old_spte) &&
	    (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) ||
	     spte_to_pfn(old_spte) != spte_to_pfn(new_spte)))
		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
}

static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
					  u64 old_spte, u64 new_spte, int level)
{
	bool pfn_changed;
	struct kvm_memory_slot *slot;

	if (level > PG_LEVEL_4K)
		return;

	pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

	if ((!is_writable_pte(old_spte) || pfn_changed) &&
	    is_writable_pte(new_spte)) {
		slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
		mark_page_dirty_in_slot(kvm, slot, gfn);
	}
}

static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	kvm_account_pgtable_pages((void *)sp->spt, +1);
	atomic64_inc(&kvm->arch.tdp_mmu_pages);
}

static void tdp_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	kvm_account_pgtable_pages((void *)sp->spt, -1);
	atomic64_dec(&kvm->arch.tdp_mmu_pages);
}

/**
 * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages
 *
 * @kvm: kvm instance
 * @sp: the page to be removed
 * @shared: This operation may not be running under the exclusive use of
 *	    the MMU lock and the operation must synchronize with other
 *	    threads that might be adding or removing pages.
 */
static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
			      bool shared)
{
	tdp_unaccount_mmu_page(kvm, sp);

	if (!sp->nx_huge_page_disallowed)
		return;

	if (shared)
		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	else
		lockdep_assert_held_write(&kvm->mmu_lock);

	sp->nx_huge_page_disallowed = false;
	untrack_possible_nx_huge_page(kvm, sp);

	if (shared)
		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}

/**
 * handle_removed_pt() - handle a page table removed from the TDP structure
 *
 * @kvm: kvm instance
 * @pt: the page removed from the paging structure
 * @shared: This operation may not be running under the exclusive use
 *	    of the MMU lock and the operation must synchronize with other
 *	    threads that might be modifying SPTEs.
 *
 * Given a page table that has been removed from the TDP paging structure,
 * iterates through the page table to clear SPTEs and free child page tables.
 *
 * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
 * protection. Since this thread removed it from the paging structure,
 * this thread will be responsible for ensuring the page is freed. Hence the
 * early rcu_dereferences in the function.
 */
static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
{
	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
	int level = sp->role.level;
	gfn_t base_gfn = sp->gfn;
	int i;

	trace_kvm_mmu_prepare_zap_page(sp);

	tdp_mmu_unlink_sp(kvm, sp, shared);

	for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
		tdp_ptep_t sptep = pt + i;
		gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
		u64 old_spte;

		if (shared) {
			/*
			 * Set the SPTE to a nonpresent value that other
			 * threads will not overwrite. If the SPTE was
			 * already marked as removed then another thread
			 * handling a page fault could overwrite it, so
			 * retry setting the SPTE until it is set from some
			 * other value to the removed SPTE value.
			 */
			for (;;) {
				old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, REMOVED_SPTE);
				if (!is_removed_spte(old_spte))
					break;
				cpu_relax();
			}
		} else {
			/*
			 * If the SPTE is not MMU-present, there is no backing
			 * page associated with the SPTE and so no side effects
			 * that need to be recorded, and exclusive ownership of
			 * mmu_lock ensures the SPTE can't be made present.
			 * Note, zapping MMIO SPTEs is also unnecessary as they
			 * are guarded by the memslots generation, not by being
			 * unreachable.
			 */
			old_spte = kvm_tdp_mmu_read_spte(sptep);
			if (!is_shadow_present_pte(old_spte))
				continue;

			/*
			 * Use the common helper instead of a raw WRITE_ONCE as
			 * the SPTE needs to be updated atomically if it can be
			 * modified by a different vCPU outside of mmu_lock.
			 * Even though the parent SPTE is !PRESENT, the TLB
			 * hasn't yet been flushed, and both Intel and AMD
			 * document that A/D assists can use upper-level PxE
			 * entries that are cached in the TLB, i.e. the CPU can
			 * still access the page and mark it dirty.
			 *
			 * No retry is needed in the atomic update path as the
			 * sole concern is dropping a Dirty bit, i.e. no other
			 * task can zap/remove the SPTE as mmu_lock is held for
			 * write.  Marking the SPTE as a removed SPTE is not
			 * strictly necessary for the same reason, but using
			 * the removed SPTE value keeps the shared/exclusive
			 * paths consistent and allows the handle_changed_spte()
			 * call below to hardcode the new value to REMOVED_SPTE.
			 *
			 * Note, even though dropping a Dirty bit is the only
			 * scenario where a non-atomic update could result in a
			 * functional bug, simply checking the Dirty bit isn't
			 * sufficient as a fast page fault could read the upper
			 * level SPTE before it is zapped, and then make this
			 * target SPTE writable, resume the guest, and set the
			 * Dirty bit between reading the SPTE above and writing
			 * it here.
			 */
			old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte,
							  REMOVED_SPTE, level);
		}
		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
				    old_spte, REMOVED_SPTE, level, shared);
	}

	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
}

/**
 * __handle_changed_spte - handle bookkeeping associated with an SPTE change
 * @kvm: kvm instance
 * @as_id: the address space of the paging structure the SPTE was a part of
 * @gfn: the base GFN that was mapped by the SPTE
 * @old_spte: The value of the SPTE before the change
 * @new_spte: The value of the SPTE after the change
 * @level: the level of the PT the SPTE is part of in the paging structure
 * @shared: This operation may not be running under the exclusive use of
 *	    the MMU lock and the operation must synchronize with other
 *	    threads that might be modifying SPTEs.
 *
 * Handle bookkeeping that might result from the modification of a SPTE.
 * This function must be called for all TDP SPTE modifications.
 */
static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				  u64 old_spte, u64 new_spte, int level,
				  bool shared)
{
	bool was_present = is_shadow_present_pte(old_spte);
	bool is_present = is_shadow_present_pte(new_spte);
	bool was_leaf = was_present && is_last_spte(old_spte, level);
	bool is_leaf = is_present && is_last_spte(new_spte, level);
	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

	WARN_ON(level > PT64_ROOT_MAX_LEVEL);
	WARN_ON(level < PG_LEVEL_4K);
	WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));

	/*
	 * If this warning were to trigger it would indicate that there was a
	 * missing MMU notifier or a race with some notifier handler.
	 * A present, leaf SPTE should never be directly replaced with another
	 * present leaf SPTE pointing to a different PFN. A notifier handler
	 * should be zapping the SPTE before the main MM's page table is
	 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
	 * thread before replacement.
	 */
	if (was_leaf && is_leaf && pfn_changed) {
		pr_err("Invalid SPTE change: cannot replace a present leaf\n"
		       "SPTE with another present leaf SPTE mapping a\n"
		       "different PFN!\n"
		       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
		       as_id, gfn, old_spte, new_spte, level);

		/*
		 * Crash the host to prevent error propagation and guest data
		 * corruption.
		 */
		BUG();
	}

	if (old_spte == new_spte)
		return;

	trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);

	if (is_leaf)
		check_spte_writable_invariants(new_spte);

	/*
	 * The only times a SPTE should be changed from a non-present to
	 * non-present state is when an MMIO entry is installed/modified/
	 * removed. In that case, there is nothing to do here.
	 */
	if (!was_present && !is_present) {
		/*
		 * If this change does not involve a MMIO SPTE or removed SPTE,
		 * it is unexpected. Log the change, though it should not
		 * impact the guest since both the former and current SPTEs
		 * are nonpresent.
		 */
		if (WARN_ON(!is_mmio_spte(old_spte) &&
			    !is_mmio_spte(new_spte) &&
			    !is_removed_spte(new_spte)))
			pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
			       "should not be replaced with another,\n"
			       "different nonpresent SPTE, unless one or both\n"
			       "are MMIO SPTEs, or the new SPTE is\n"
			       "a temporary removed SPTE.\n"
			       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
			       as_id, gfn, old_spte, new_spte, level);
		return;
	}

	if (is_leaf != was_leaf)
		kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);

	if (was_leaf && is_dirty_spte(old_spte) &&
	    (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
		kvm_set_pfn_dirty(spte_to_pfn(old_spte));

	/*
	 * Recursively handle child PTs if the change removed a subtree from
	 * the paging structure.  Note the WARN on the PFN changing without the
	 * SPTE being converted to a hugepage (leaf) or being zapped.  Shadow
	 * pages are kernel allocations and should never be migrated.
	 */
	if (was_present && !was_leaf &&
	    (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
		handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
}

static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level,
				bool shared)
{
	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
			      shared);
	handle_changed_spte_acc_track(old_spte, new_spte, level);
	handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
				      new_spte, level);
}

/*
 * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
 * and handle the associated bookkeeping.  Do not mark the page dirty
 * in KVM's dirty bitmaps.
 *
 * If setting the SPTE fails because it has changed, iter->old_spte will be
 * refreshed to the current value of the spte.
 *
 * @kvm: kvm instance
 * @iter: a tdp_iter instance currently on the SPTE that should be set
 * @new_spte: The value the SPTE should be set to
 * Return:
 * * 0      - If the SPTE was set.
 * * -EBUSY - If the SPTE cannot be set. In this case this function will have
 *            no side-effects other than setting iter->old_spte to the last
 *            known value of the spte.
 */
static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
					  struct tdp_iter *iter,
					  u64 new_spte)
{
	u64 *sptep = rcu_dereference(iter->sptep);

	/*
	 * The caller is responsible for ensuring the old SPTE is not a REMOVED
	 * SPTE.  KVM should never attempt to zap or manipulate a REMOVED SPTE,
	 * and pre-checking before inserting a new SPTE is advantageous as it
	 * avoids unnecessary work.
	 */
	WARN_ON_ONCE(iter->yielded || is_removed_spte(iter->old_spte));

	lockdep_assert_held_read(&kvm->mmu_lock);

	/*
	 * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
	 * does not hold the mmu_lock.
	 */
	if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte))
		return -EBUSY;

	__handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
			      new_spte, iter->level, true);
	handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);

	return 0;
}

static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
					  struct tdp_iter *iter)
{
	int ret;

	/*
	 * Freeze the SPTE by setting it to a special,
	 * non-present value. This will stop other threads from
	 * immediately installing a present entry in its place
	 * before the TLBs are flushed.
	 */
	ret = tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE);
	if (ret)
		return ret;

	kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
					   KVM_PAGES_PER_HPAGE(iter->level));

	/*
	 * No other thread can overwrite the removed SPTE as they must either
	 * wait on the MMU lock or use tdp_mmu_set_spte_atomic() which will not
	 * overwrite the special removed SPTE value. No bookkeeping is needed
	 * here since the SPTE is going from non-present to non-present.  Use
	 * the raw write helper to avoid an unnecessary check on volatile bits.
	 */
	__kvm_tdp_mmu_write_spte(iter->sptep, 0);

	return 0;
}


/*
 * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
 * @kvm:	      KVM instance
 * @as_id:	      Address space ID, i.e. regular vs. SMM
 * @sptep:	      Pointer to the SPTE
 * @old_spte:	      The current value of the SPTE
 * @new_spte:	      The new value that will be set for the SPTE
 * @gfn:	      The base GFN that was (or will be) mapped by the SPTE
 * @level:	      The level _containing_ the SPTE (its parent PT's level)
 * @record_acc_track: Notify the MM subsystem of changes to the accessed state
 *		      of the page.  Should be set unless handling an MMU
 *		      notifier for access tracking.  Leaving record_acc_track
 *		      unset in that case prevents page accesses from being
 *		      double counted.
 * @record_dirty_log: Record the page as dirty in the dirty bitmap if
 *		      appropriate for the change being made.  Should be set
 *		      unless performing certain dirty logging operations.
 *		      Leaving record_dirty_log unset in that case prevents page
 *		      writes from being double counted.
 *
 * Returns the old SPTE value, which _may_ be different than @old_spte if the
 * SPTE had volatile bits.
 */
static u64 __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
			      u64 old_spte, u64 new_spte, gfn_t gfn, int level,
			      bool record_acc_track, bool record_dirty_log)
{
	lockdep_assert_held_write(&kvm->mmu_lock);

	/*
	 * No thread should be using this function to set SPTEs to or from the
	 * temporary removed SPTE value.
	 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
	 * should be used.
	 * If operating under the MMU lock in write mode, the
	 * use of the removed SPTE should not be necessary.
	 */
	WARN_ON(is_removed_spte(old_spte) || is_removed_spte(new_spte));

	old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);

	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);

	if (record_acc_track)
		handle_changed_spte_acc_track(old_spte, new_spte, level);
	if (record_dirty_log)
		handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
					      new_spte, level);
	return old_spte;
}

static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
				     u64 new_spte, bool record_acc_track,
				     bool record_dirty_log)
{
	WARN_ON_ONCE(iter->yielded);

	iter->old_spte = __tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep,
					    iter->old_spte, new_spte,
					    iter->gfn, iter->level,
					    record_acc_track, record_dirty_log);
}

static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
				    u64 new_spte)
{
	_tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
}

static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
						 struct tdp_iter *iter,
						 u64 new_spte)
{
	_tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
}

static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
						 struct tdp_iter *iter,
						 u64 new_spte)
{
	_tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
}

#define tdp_root_for_each_pte(_iter, _root, _start, _end)		\
	for_each_tdp_pte(_iter, _root, _start, _end)

#define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)		\
	tdp_root_for_each_pte(_iter, _root, _start, _end)		\
		if (!is_shadow_present_pte(_iter.old_spte) ||		\
		    !is_last_spte(_iter.old_spte, _iter.level))		\
			continue;					\
		else

#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)			\
	for_each_tdp_pte(_iter, to_shadow_page(_mmu->root.hpa), _start, _end)

/*
 * Yield if the MMU lock is contended or this thread needs to return control
 * to the scheduler.
 *
 * If this function should yield and flush is set, it will perform a remote
 * TLB flush before yielding.
 *
 * If this function yields, iter->yielded is set and the caller must skip to
 * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk
 * over the paging structures to allow the iterator to continue its traversal
 * from the paging structure root.
 *
 * Returns true if this function yielded.
 */
static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
							  struct tdp_iter *iter,
							  bool flush, bool shared)
{
	WARN_ON(iter->yielded);

	/* Ensure forward progress has been made before yielding. */
	if (iter->next_last_level_gfn == iter->yielded_gfn)
		return false;

	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
		if (flush)
			kvm_flush_remote_tlbs(kvm);

		rcu_read_unlock();

		if (shared)
			cond_resched_rwlock_read(&kvm->mmu_lock);
		else
			cond_resched_rwlock_write(&kvm->mmu_lock);

		rcu_read_lock();

		WARN_ON(iter->gfn > iter->next_last_level_gfn);

		iter->yielded = true;
	}

	return iter->yielded;
}

static inline gfn_t tdp_mmu_max_gfn_exclusive(void)
{
	/*
	 * Bound TDP MMU walks at host.MAXPHYADDR.
	 * KVM disallows memslots with a gpa range that would exceed the max
	 * gfn, and KVM does not create MMIO SPTEs for "impossible" gfns,
	 * instead sending such accesses down the slow emulation path every
	 * time.
	 */
	return kvm_mmu_max_gfn() + 1;
}

static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
			       bool shared, int zap_level)
{
	struct tdp_iter iter;

	gfn_t end = tdp_mmu_max_gfn_exclusive();
	gfn_t start = 0;

	for_each_tdp_pte_min_level(iter, root, zap_level, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
			continue;

		if (!is_shadow_present_pte(iter.old_spte))
			continue;

		if (iter.level > zap_level)
			continue;

		if (!shared)
			tdp_mmu_set_spte(kvm, &iter, 0);
		else if (tdp_mmu_set_spte_atomic(kvm, &iter, 0))
			goto retry;
	}
}

static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
			     bool shared)
{

	/*
	 * The root must have an elevated refcount so that it's reachable via
	 * mmu_notifier callbacks, which allows this path to yield and drop
	 * mmu_lock. When handling an unmap/release mmu_notifier command, KVM
	 * must drop all references to relevant pages prior to completing the
	 * callback. Dropping mmu_lock with an unreachable root would result
	 * in zapping SPTEs after a relevant mmu_notifier callback completes
	 * and lead to use-after-free as zapping a SPTE triggers "writeback" of
	 * dirty accessed bits to the SPTE's associated struct page.
	 */
	WARN_ON_ONCE(!refcount_read(&root->tdp_mmu_root_count));

	kvm_lockdep_assert_mmu_lock_held(kvm, shared);

	rcu_read_lock();

	/*
	 * To avoid RCU stalls due to recursively removing huge swaths of SPs,
	 * split the zap into two passes.  On the first pass, zap at the 1gb
	 * level, and then zap top-level SPs on the second pass.  "1gb" is not
	 * arbitrary, as KVM must be able to zap a 1gb shadow page without
	 * inducing a stall to allow in-place replacement with a 1gb hugepage.
	 *
	 * Because zapping a SP recurses on its children, stepping down to
	 * PG_LEVEL_4K in the iterator itself is unnecessary.
	 */
	__tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G);
	__tdp_mmu_zap_root(kvm, root, shared, root->role.level);

	rcu_read_unlock();
}

bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	u64 old_spte;

	/*
	 * This helper intentionally doesn't allow zapping a root shadow page,
	 * which doesn't have a parent page table and thus no associated entry.
	 */
	if (WARN_ON_ONCE(!sp->ptep))
		return false;

	old_spte = kvm_tdp_mmu_read_spte(sp->ptep);
	if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
		return false;

	__tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0,
			   sp->gfn, sp->role.level + 1, true, true);

	return true;
}

/*
 * If can_yield is true, will release the MMU lock and reschedule if the
 * scheduler needs the CPU or there is contention on the MMU lock. If this
 * function cannot yield, it will not release the MMU lock or reschedule and
 * the caller must ensure it does not supply too large a GFN range, or the
 * operation can cause a soft lockup.
 */
static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
			      gfn_t start, gfn_t end, bool can_yield, bool flush)
{
	struct tdp_iter iter;

	end = min(end, tdp_mmu_max_gfn_exclusive());

	lockdep_assert_held_write(&kvm->mmu_lock);

	rcu_read_lock();

	for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) {
		if (can_yield &&
		    tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
			flush = false;
			continue;
		}

		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		tdp_mmu_set_spte(kvm, &iter, 0);
		flush = true;
	}

	rcu_read_unlock();

	/*
	 * Because this flow zaps _only_ leaf SPTEs, the caller doesn't need
	 * to provide RCU protection as no 'struct kvm_mmu_page' will be freed.
	 */
	return flush;
}

/*
 * Zap leaf SPTEs for the range of gfns, [start, end), for all roots. Returns
 * true if a TLB flush is needed before releasing the MMU lock, i.e. if one or
 * more SPTEs were zapped since the MMU lock was last acquired.
 */
bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end,
			   bool can_yield, bool flush)
{
	struct kvm_mmu_page *root;

	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id)
		flush = tdp_mmu_zap_leafs(kvm, root, start, end, can_yield, flush);

	return flush;
}

void kvm_tdp_mmu_zap_all(struct kvm *kvm)
{
	struct kvm_mmu_page *root;
	int i;

	/*
	 * Zap all roots, including invalid roots, as all SPTEs must be dropped
	 * before returning to the caller.  Zap directly even if the root is
	 * also being zapped by a worker.  Walking zapped top-level SPTEs isn't
	 * all that expensive and mmu_lock is already held, which means the
	 * worker has yielded, i.e. flushing the work instead of zapping here
	 * isn't guaranteed to be any faster.
	 *
	 * A TLB flush is unnecessary, KVM zaps everything if and only if the
	 * VM is being destroyed or the userspace VMM has exited.  In both
	 * cases, KVM_RUN is unreachable, i.e. no vCPUs will ever service the
	 * request.
	 */
	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
		for_each_tdp_mmu_root_yield_safe(kvm, root, i)
			tdp_mmu_zap_root(kvm, root, false);
	}
}

/*
 * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast
 * zap" completes.
 */
void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
{
	flush_workqueue(kvm->arch.tdp_mmu_zap_wq);
}

/*
 * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
 * is about to be zapped, e.g. in response to a memslots update.  The actual
 * zapping is performed asynchronously, so a reference is taken on all roots.
 * Using a separate workqueue makes it easy to ensure that the destruction is
 * performed before the "fast zap" completes, without keeping a separate list
 * of invalidated roots; the list is effectively the list of work items in
 * the workqueue.
 *
 * Get a reference even if the root is already invalid, the asynchronous worker
 * assumes it was gifted a reference to the root it processes.  Because mmu_lock
 * is held for write, it should be impossible to observe a root with zero
 * refcount, i.e. the list of roots cannot be stale.
 *
 * This has essentially the same effect for the TDP MMU
 * as updating mmu_valid_gen does for the shadow MMU.
 */
void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
{
	struct kvm_mmu_page *root;

	lockdep_assert_held_write(&kvm->mmu_lock);
	list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
		if (!root->role.invalid &&
		    !WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root))) {
			root->role.invalid = true;
			tdp_mmu_schedule_zap_root(kvm, root);
		}
	}
}

/*
 * Installs a last-level SPTE to handle a TDP page fault.
 * (NPT/EPT violation/misconfiguration)
 */
static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
					   struct kvm_page_fault *fault,
					   struct tdp_iter *iter)
{
	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
	u64 new_spte;
	int ret = RET_PF_FIXED;
	bool wrprot = false;

	WARN_ON(sp->role.level != fault->goal_level);
	if (unlikely(!fault->slot))
		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
	else
		wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
				   fault->pfn, iter->old_spte, fault->prefetch, true,
				   fault->map_writable, &new_spte);

	if (new_spte == iter->old_spte)
		ret = RET_PF_SPURIOUS;
	else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
		return RET_PF_RETRY;
	else if (is_shadow_present_pte(iter->old_spte) &&
		 !is_last_spte(iter->old_spte, iter->level))
		kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
						   KVM_PAGES_PER_HPAGE(iter->level + 1));

	/*
	 * If the page fault was caused by a write but the page is write
	 * protected, emulation is needed. If the emulation was skipped,
	 * the vCPU would have the same fault again.
	 */
	if (wrprot) {
		if (fault->write)
			ret = RET_PF_EMULATE;
	}

	/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
	if (unlikely(is_mmio_spte(new_spte))) {
		vcpu->stat.pf_mmio_spte_created++;
		trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
				     new_spte);
		ret = RET_PF_EMULATE;
	} else {
		trace_kvm_mmu_set_spte(iter->level, iter->gfn,
				       rcu_dereference(iter->sptep));
	}

	return ret;
}

/*
 * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the
 * provided page table.
 *
 * @kvm: kvm instance
 * @iter: a tdp_iter instance currently on the SPTE that should be set
 * @sp: The new TDP page table to install.
 * @shared: This operation is running under the MMU lock in read mode.
 *
 * Returns: 0 if the new page table was installed. Non-0 if the page table
 *          could not be installed (e.g. the atomic compare-exchange failed).
 */
static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
			   struct kvm_mmu_page *sp, bool shared)
{
	u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled());
	int ret = 0;

	if (shared) {
		ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
		if (ret)
			return ret;
	} else {
		tdp_mmu_set_spte(kvm, iter, spte);
	}

	tdp_account_mmu_page(kvm, sp);

	return 0;
}

static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
				   struct kvm_mmu_page *sp, bool shared);

/*
 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
 * page tables and SPTEs to translate the faulting guest physical address.
 */
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	struct kvm *kvm = vcpu->kvm;
	struct tdp_iter iter;
	struct kvm_mmu_page *sp;
	int ret = RET_PF_RETRY;

	kvm_mmu_hugepage_adjust(vcpu, fault);

	trace_kvm_mmu_spte_requested(fault);

	rcu_read_lock();

	tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
		int r;

		if (fault->nx_huge_page_workaround_enabled)
			disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);

		if (iter.level == fault->goal_level)
			break;

		/*
		 * If SPTE has been frozen by another thread, just give up and
		 * retry, avoiding unnecessary page table allocation and free.
		 */
		if (is_removed_spte(iter.old_spte))
			goto retry;

		/* Step down into the lower level page table if it exists. */
		if (is_shadow_present_pte(iter.old_spte) &&
		    !is_large_pte(iter.old_spte))
			continue;

		/*
		 * The SPTE is either non-present or points to a huge page that
		 * needs to be split.
		 */
		sp = tdp_mmu_alloc_sp(vcpu);
		tdp_mmu_init_child_sp(sp, &iter);

		sp->nx_huge_page_disallowed = fault->huge_page_disallowed;

		if (is_shadow_present_pte(iter.old_spte))
			r = tdp_mmu_split_huge_page(kvm, &iter, sp, true);
		else
			r = tdp_mmu_link_sp(kvm, &iter, sp, true);

		/*
		 * Also force the guest to retry the access if the upper level
		 * SPTEs aren't in place.
		 */
		if (r) {
			tdp_mmu_free_sp(sp);
			goto retry;
		}

		if (fault->huge_page_disallowed &&
		    fault->req_level >= iter.level) {
			spin_lock(&kvm->arch.tdp_mmu_pages_lock);
			track_possible_nx_huge_page(kvm, sp);
			spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
		}
	}

	ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);

retry:
	rcu_read_unlock();
	return ret;
}

bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
				 bool flush)
{
	return kvm_tdp_mmu_zap_leafs(kvm, range->slot->as_id, range->start,
				     range->end, range->may_block, flush);
}

typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
			      struct kvm_gfn_range *range);

static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
						   struct kvm_gfn_range *range,
						   tdp_handler_t handler)
{
	struct kvm_mmu_page *root;
	struct tdp_iter iter;
	bool ret = false;

	/*
	 * Don't support rescheduling, none of the MMU notifiers that funnel
	 * into this helper allow blocking; it'd be dead, wasteful code.
	 */
	for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
		rcu_read_lock();

		tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
			ret |= handler(kvm, &iter, range);

		rcu_read_unlock();
	}

	return ret;
}

/*
 * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero
 * if any of the GFNs in the range have been accessed.
 */
static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
			  struct kvm_gfn_range *range)
{
	u64 new_spte = 0;

	/* If we have a non-accessed entry we don't need to change the pte. */
	if (!is_accessed_spte(iter->old_spte))
		return false;

	new_spte = iter->old_spte;

	if (spte_ad_enabled(new_spte)) {
		new_spte &= ~shadow_accessed_mask;
	} else {
		/*
		 * Capture the dirty status of the page, so that it doesn't get
		 * lost when the SPTE is marked for access tracking.
		 */
		if (is_writable_pte(new_spte))
			kvm_set_pfn_dirty(spte_to_pfn(new_spte));

		new_spte = mark_spte_for_access_track(new_spte);
	}

	tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte);

	return true;
}

bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
	return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
}

static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
			 struct kvm_gfn_range *range)
{
	return is_accessed_spte(iter->old_spte);
}

bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
}

static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
			 struct kvm_gfn_range *range)
{
	u64 new_spte;

	/* Huge pages aren't expected to be modified without first being zapped. */
	WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end);

	if (iter->level != PG_LEVEL_4K ||
	    !is_shadow_present_pte(iter->old_spte))
		return false;

	/*
	 * Note, when changing a read-only SPTE, it's not strictly necessary to
	 * zero the SPTE before setting the new PFN, but doing so preserves the
	 * invariant that the PFN of a present leaf SPTE can never change.
	 * See __handle_changed_spte().
	 */
	tdp_mmu_set_spte(kvm, iter, 0);

	if (!pte_write(range->pte)) {
		new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
								  pte_pfn(range->pte));

		tdp_mmu_set_spte(kvm, iter, new_spte);
	}

	return true;
}

/*
 * Handle the changed_pte MMU notifier for the TDP MMU.
 * data is a pointer to the new pte_t mapping the HVA specified by the MMU
 * notifier.
 * Returns non-zero if a flush is needed before releasing the MMU lock.
 */
bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	/*
	 * No need to handle the remote TLB flush under RCU protection, the
	 * target SPTE _must_ be a leaf SPTE, i.e. cannot result in freeing a
	 * shadow page.  See the WARN on pfn_changed in __handle_changed_spte().
	 */
	return kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
}

/*
 * Remove write access from all SPTEs at or above min_level that map GFNs
 * [start, end). Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			     gfn_t start, gfn_t end, int min_level)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	rcu_read_lock();

	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);

	for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
			continue;

		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level) ||
		    !(iter.old_spte & PT_WRITABLE_MASK))
			continue;

		new_spte = iter.old_spte & ~PT_WRITABLE_MASK;

		if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
			goto retry;

		spte_set = true;
	}

	rcu_read_unlock();
	return spte_set;
}

/*
 * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
 * only affect leaf SPTEs down to min_level.
 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
 */
bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
			     const struct kvm_memory_slot *slot, int min_level)
{
	struct kvm_mmu_page *root;
	bool spte_set = false;

	lockdep_assert_held_read(&kvm->mmu_lock);

	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
		spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
					     slot->base_gfn + slot->npages, min_level);

	return spte_set;
}

static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
{
	struct kvm_mmu_page *sp;

	gfp |= __GFP_ZERO;

	sp = kmem_cache_alloc(mmu_page_header_cache, gfp);
	if (!sp)
		return NULL;

	sp->spt = (void *)__get_free_page(gfp);
	if (!sp->spt) {
		kmem_cache_free(mmu_page_header_cache, sp);
		return NULL;
	}

	return sp;
}

static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
						       struct tdp_iter *iter,
						       bool shared)
{
	struct kvm_mmu_page *sp;

	/*
	 * Since we are allocating while under the MMU lock we have to be
	 * careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct
	 * reclaim and to avoid making any filesystem callbacks (which can end
	 * up invoking KVM MMU notifiers, resulting in a deadlock).
	 *
	 * If this allocation fails we drop the lock and retry with reclaim
	 * allowed.
	 */
	sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT);
	if (sp)
		return sp;

	rcu_read_unlock();

	if (shared)
		read_unlock(&kvm->mmu_lock);
	else
		write_unlock(&kvm->mmu_lock);

	iter->yielded = true;
	sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT);

	if (shared)
		read_lock(&kvm->mmu_lock);
	else
		write_lock(&kvm->mmu_lock);

	rcu_read_lock();

	return sp;
}

/* Note, the caller is responsible for initializing @sp. */
static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
				   struct kvm_mmu_page *sp, bool shared)
{
	const u64 huge_spte = iter->old_spte;
	const int level = iter->level;
	int ret, i;

	/*
	 * No need for atomics when writing to sp->spt since the page table has
	 * not been linked in yet and thus is not reachable from any other CPU.
	 */
	for (i = 0; i < SPTE_ENT_PER_PAGE; i++)
		sp->spt[i] = make_huge_page_split_spte(kvm, huge_spte, sp->role, i);

	/*
	 * Replace the huge spte with a pointer to the populated lower level
	 * page table. Since we are making this change without a TLB flush vCPUs
	 * will see a mix of the split mappings and the original huge mapping,
	 * depending on what's currently in their TLB. This is fine from a
	 * correctness standpoint since the translation will be the same either
	 * way.
	 */
	ret = tdp_mmu_link_sp(kvm, iter, sp, shared);
	if (ret)
		goto out;

	/*
	 * tdp_mmu_link_sp() will handle subtracting the huge page we are
	 * overwriting from the page stats. But we have to manually update
	 * the page stats with the new present child pages.
	 */
	kvm_update_page_stats(kvm, level - 1, SPTE_ENT_PER_PAGE);

out:
	trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret);
	return ret;
}

static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
					 struct kvm_mmu_page *root,
					 gfn_t start, gfn_t end,
					 int target_level, bool shared)
{
	struct kvm_mmu_page *sp = NULL;
	struct tdp_iter iter;
	int ret = 0;

	rcu_read_lock();

	/*
	 * Traverse the page table splitting all huge pages above the target
	 * level into one lower level. For example, if we encounter a 1GB page
	 * we split it into 512 2MB pages.
	 *
	 * Since the TDP iterator uses a pre-order traversal, we are guaranteed
	 * to visit an SPTE before ever visiting its children, which means we
	 * will correctly recursively split huge pages that are more than one
	 * level above the target level (e.g. splitting a 1GB to 512 2MB pages,
	 * and then splitting each of those to 512 4KB pages).
	 */
	for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
			continue;

		if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
			continue;

		if (!sp) {
			sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared);
			if (!sp) {
				ret = -ENOMEM;
				trace_kvm_mmu_split_huge_page(iter.gfn,
							      iter.old_spte,
							      iter.level, ret);
				break;
			}

			if (iter.yielded)
				continue;
		}

		tdp_mmu_init_child_sp(sp, &iter);

		if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
			goto retry;

		sp = NULL;
	}

	rcu_read_unlock();

	/*
	 * It's possible to exit the loop having never used the last sp if, for
	 * example, a vCPU doing HugePage NX splitting wins the race and
	 * installs its own sp in place of the last sp we tried to split.
	 */
	if (sp)
		tdp_mmu_free_sp(sp);

	return ret;
}


/*
 * Try to split all huge pages mapped by the TDP MMU down to the target level.
 */
void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
				      const struct kvm_memory_slot *slot,
				      gfn_t start, gfn_t end,
				      int target_level, bool shared)
{
	struct kvm_mmu_page *root;
	int r = 0;

	kvm_lockdep_assert_mmu_lock_held(kvm, shared);

	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, shared) {
		r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared);
		if (r) {
			kvm_tdp_mmu_put_root(kvm, root, shared);
			break;
		}
	}
}

/*
 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
 * If AD bits are not enabled, this will require clearing the writable bit on
 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
				  gfn_t start, gfn_t end)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	rcu_read_lock();

	tdp_root_for_each_leaf_pte(iter, root, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
			continue;

		if (!is_shadow_present_pte(iter.old_spte))
			continue;

		if (spte_ad_need_write_protect(iter.old_spte)) {
			if (is_writable_pte(iter.old_spte))
				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
			else
				continue;
		} else {
			if (iter.old_spte & shadow_dirty_mask)
				new_spte = iter.old_spte & ~shadow_dirty_mask;
			else
				continue;
		}

		if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
			goto retry;

		spte_set = true;
	}

	rcu_read_unlock();
	return spte_set;
}

/*
 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
 * If AD bits are not enabled, this will require clearing the writable bit on
 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
				  const struct kvm_memory_slot *slot)
{
	struct kvm_mmu_page *root;
	bool spte_set = false;

	lockdep_assert_held_read(&kvm->mmu_lock);

	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
		spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
						  slot->base_gfn + slot->npages);

	return spte_set;
}

/*
 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
 * set in mask, starting at gfn. The given memslot is expected to contain all
 * the GFNs represented by set bits in the mask. If AD bits are enabled,
 * clearing the dirty status will involve clearing the dirty bit on each SPTE
 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
 */
static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
				  gfn_t gfn, unsigned long mask, bool wrprot)
{
	struct tdp_iter iter;
	u64 new_spte;

	rcu_read_lock();

	tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
				   gfn + BITS_PER_LONG) {
		if (!mask)
			break;

		if (iter.level > PG_LEVEL_4K ||
		    !(mask & (1UL << (iter.gfn - gfn))))
			continue;

		mask &= ~(1UL << (iter.gfn - gfn));

		if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
			if (is_writable_pte(iter.old_spte))
				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
			else
				continue;
		} else {
			if (iter.old_spte & shadow_dirty_mask)
				new_spte = iter.old_spte & ~shadow_dirty_mask;
			else
				continue;
		}

		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
	}

	rcu_read_unlock();
}

/*
 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
 * set in mask, starting at gfn. The given memslot is expected to contain all
 * the GFNs represented by set bits in the mask. If AD bits are enabled,
 * clearing the dirty status will involve clearing the dirty bit on each SPTE
 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
 */
void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
				       struct kvm_memory_slot *slot,
				       gfn_t gfn, unsigned long mask,
				       bool wrprot)
{
	struct kvm_mmu_page *root;

	lockdep_assert_held_write(&kvm->mmu_lock);
	for_each_tdp_mmu_root(kvm, root, slot->as_id)
		clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
}

static void zap_collapsible_spte_range(struct kvm *kvm,
				       struct kvm_mmu_page *root,
				       const struct kvm_memory_slot *slot)
{
	gfn_t start = slot->base_gfn;
	gfn_t end = start + slot->npages;
	struct tdp_iter iter;
	int max_mapping_level;

	rcu_read_lock();

	for_each_tdp_pte_min_level(iter, root, PG_LEVEL_2M, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
			continue;

		if (iter.level > KVM_MAX_HUGEPAGE_LEVEL ||
		    !is_shadow_present_pte(iter.old_spte))
			continue;

		/*
		 * Don't zap leaf SPTEs, if a leaf SPTE could be replaced with
		 * a large page size, then its parent would have been zapped
		 * instead of stepping down.
		 */
		if (is_last_spte(iter.old_spte, iter.level))
			continue;

		/*
		 * If iter.gfn resides outside of the slot, i.e. the page for
		 * the current level overlaps but is not contained by the slot,
		 * then the SPTE can't be made huge.  More importantly, trying
		 * to query that info from slot->arch.lpage_info will cause an
		 * out-of-bounds access.
		 */
		if (iter.gfn < start || iter.gfn >= end)
			continue;

		max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot,
							      iter.gfn, PG_LEVEL_NUM);
		if (max_mapping_level < iter.level)
			continue;

		/* Note, a successful atomic zap also does a remote TLB flush. */
		if (tdp_mmu_zap_spte_atomic(kvm, &iter))
			goto retry;
	}

	rcu_read_unlock();
}

/*
 * Zap non-leaf SPTEs (and free their associated page tables) which could
 * be replaced by huge pages, for GFNs within the slot.
 */
void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
				       const struct kvm_memory_slot *slot)
{
	struct kvm_mmu_page *root;

	lockdep_assert_held_read(&kvm->mmu_lock);

	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
		zap_collapsible_spte_range(kvm, root, slot);
}

/*
 * Removes write access on the last level SPTE mapping this GFN and unsets the
 * MMU-writable bit to ensure future writes continue to be intercepted.
 * Returns true if an SPTE was set and a TLB flush is needed.
 */
static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
			      gfn_t gfn, int min_level)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);

	rcu_read_lock();

	for_each_tdp_pte_min_level(iter, root, min_level, gfn, gfn + 1) {
		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		new_spte = iter.old_spte &
			~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);

		if (new_spte == iter.old_spte)
			break;

		tdp_mmu_set_spte(kvm, &iter, new_spte);
		spte_set = true;
	}

	rcu_read_unlock();

	return spte_set;
}

/*
 * Removes write access on the last level SPTE mapping this GFN and unsets the
 * MMU-writable bit to ensure future writes continue to be intercepted.
 * Returns true if an SPTE was set and a TLB flush is needed.
 */
bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
				   struct kvm_memory_slot *slot, gfn_t gfn,
				   int min_level)
{
	struct kvm_mmu_page *root;
	bool spte_set = false;

	lockdep_assert_held_write(&kvm->mmu_lock);
	for_each_tdp_mmu_root(kvm, root, slot->as_id)
		spte_set |= write_protect_gfn(kvm, root, gfn, min_level);

	return spte_set;
}

/*
 * Return the level of the lowest level SPTE added to sptes.
 * That SPTE may be non-present.
 *
 * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
 */
int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
			 int *root_level)
{
	struct tdp_iter iter;
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	gfn_t gfn = addr >> PAGE_SHIFT;
	int leaf = -1;

	*root_level = vcpu->arch.mmu->root_role.level;

	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
		leaf = iter.level;
		sptes[leaf] = iter.old_spte;
	}

	return leaf;
}

/*
 * Returns the last level spte pointer of the shadow page walk for the given
 * gpa, and sets *spte to the spte value. This spte may be non-present. If no
 * walk could be performed, returns NULL and *spte does not contain valid data.
 *
 * Contract:
 *  - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
 *  - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
 *
 * WARNING: This function is only intended to be called during fast_page_fault.
 */
u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
					u64 *spte)
{
	struct tdp_iter iter;
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	gfn_t gfn = addr >> PAGE_SHIFT;
	tdp_ptep_t sptep = NULL;

	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
		*spte = iter.old_spte;
		sptep = iter.sptep;
	}

	/*
	 * Perform the rcu_dereference to get the raw spte pointer value since
	 * we are passing it up to fast_page_fault, which is shared with the
	 * legacy MMU and thus does not retain the TDP MMU-specific __rcu
	 * annotation.
	 *
	 * This is safe since fast_page_fault obeys the contracts of this
	 * function as well as all TDP MMU contracts around modifying SPTEs
	 * outside of mmu_lock.
	 */
	return rcu_dereference(sptep);
}