// SPDX-License-Identifier: GPL-2.0

#include "mmu.h"
#include "mmu_internal.h"
#include "mmutrace.h"
#include "tdp_iter.h"
#include "tdp_mmu.h"
#include "spte.h"

#include <asm/cmpxchg.h>
#include <trace/events/kvm.h>

/* Module-wide switch for the TDP MMU; read once per VM at init time. */
static bool __read_mostly tdp_mmu_enabled = true;
module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);

/* Initializes the TDP MMU for the VM, if enabled. */
int kvm_mmu_init_tdp_mmu(struct kvm *kvm)
{
	struct workqueue_struct *wq;

	if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
		return 0;

	/* Workqueue on which invalidated roots are zapped asynchronously. */
	wq = alloc_workqueue("kvm", WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 0);
	if (!wq)
		return -ENOMEM;

	/* This should not be changed for the lifetime of the VM. */
	kvm->arch.tdp_mmu_enabled = true;
	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
	kvm->arch.tdp_mmu_zap_wq = wq;
	/* Note, returns 1 (not 0) to distinguish "TDP MMU in use" from the
	 * "disabled" 0 return above. */
	return 1;
}

/* Arbitrarily returns true so that this may be used in if statements. */
static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
							     bool shared)
{
	if (shared)
		lockdep_assert_held_read(&kvm->mmu_lock);
	else
		lockdep_assert_held_write(&kvm->mmu_lock);

	return true;
}

/* Tears down TDP MMU state at VM destruction. */
void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
{
	if (!kvm->arch.tdp_mmu_enabled)
		return;

	/* Also waits for any queued work items. */
	destroy_workqueue(kvm->arch.tdp_mmu_zap_wq);

	WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages));
	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));

	/*
	 * Ensure that all the outstanding RCU callbacks to free shadow pages
	 * can run before the VM is torn down.  Work items on tdp_mmu_zap_wq
	 * can call kvm_tdp_mmu_put_root and create new callbacks.
	 */
	rcu_barrier();
}

/* Frees a shadow page's backing page-table page and its header struct. */
static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
{
	free_page((unsigned long)sp->spt);
	kmem_cache_free(mmu_page_header_cache, sp);
}

/*
 * This is called through call_rcu in order to free TDP page table memory
 * safely with respect to other kernel threads that may be operating on
 * the memory.
 * By only accessing TDP MMU page table memory in an RCU read critical
 * section, and freeing it after a grace period, lockless access to that
 * memory won't use it after it is freed.
 */
static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
{
	struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
					       rcu_head);

	tdp_mmu_free_sp(sp);
}

static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
			     bool shared);

/* Workqueue callback: zap an invalidated root, then drop its reference. */
static void tdp_mmu_zap_root_work(struct work_struct *work)
{
	struct kvm_mmu_page *root = container_of(work, struct kvm_mmu_page,
						 tdp_mmu_async_work);
	struct kvm *kvm = root->tdp_mmu_async_data;

	read_lock(&kvm->mmu_lock);

	/*
	 * A TLB flush is not necessary as KVM performs a local TLB flush when
	 * allocating a new root (see kvm_mmu_load()), and when migrating vCPU
	 * to a different pCPU.  Note, the local TLB flush on reuse also
	 * invalidates any paging-structure-cache entries, i.e. TLB entries for
	 * intermediate paging structures, that may be zapped, as such entries
	 * are associated with the ASID on both VMX and SVM.
	 */
	tdp_mmu_zap_root(kvm, root, true);

	/*
	 * Drop the refcount using kvm_tdp_mmu_put_root() to test its logic for
	 * avoiding an infinite loop.  By design, the root is reachable while
	 * it's being asynchronously zapped, thus a different task can put its
	 * last reference, i.e. flowing through kvm_tdp_mmu_put_root() for an
	 * asynchronously zapped root is unavoidable.
	 */
	kvm_tdp_mmu_put_root(kvm, root, true);

	read_unlock(&kvm->mmu_lock);
}

/*
 * Queues @root on the zap workqueue; the worker consumes the reference the
 * caller holds on @root (see tdp_mmu_zap_root_work()).
 */
static void tdp_mmu_schedule_zap_root(struct kvm *kvm, struct kvm_mmu_page *root)
{
	root->tdp_mmu_async_data = kvm;
	INIT_WORK(&root->tdp_mmu_async_work, tdp_mmu_zap_root_work);
	queue_work(kvm->arch.tdp_mmu_zap_wq, &root->tdp_mmu_async_work);
}

/* Atomically sets role.invalid; returns the *previous* invalid state. */
static inline bool kvm_tdp_root_mark_invalid(struct kvm_mmu_page *page)
{
	union kvm_mmu_page_role role = page->role;
	role.invalid = true;

	/* No need to use cmpxchg, only the invalid bit can change. */
	role.word = xchg(&page->role.word, role.word);
	return role.invalid;
}

void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
			  bool shared)
{
	kvm_lockdep_assert_mmu_lock_held(kvm, shared);

	if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
		return;

	WARN_ON(!root->tdp_mmu_page);

	/*
	 * The root now has refcount=0.  It is valid, but readers already
	 * cannot acquire a reference to it because kvm_tdp_mmu_get_root()
	 * rejects it.  This remains true for the rest of the execution
	 * of this function, because readers visit valid roots only
	 * (except for tdp_mmu_zap_root_work(), which however
	 * does not acquire any reference itself).
	 *
	 * Even though there are flows that need to visit all roots for
	 * correctness, they all take mmu_lock for write, so they cannot yet
	 * run concurrently.  The same is true after kvm_tdp_root_mark_invalid,
	 * since the root still has refcount=0.
	 *
	 * However, tdp_mmu_zap_root can yield, and writers do not expect to
	 * see refcount=0 (see for example kvm_tdp_mmu_invalidate_all_roots()).
	 * So the root temporarily gets an extra reference, going to refcount=1
	 * while staying invalid.  Readers still cannot acquire any reference;
	 * but writers are now allowed to run if tdp_mmu_zap_root yields and
	 * they might take an extra reference if they themselves yield.
	 * Therefore, when the reference is given back by the worker,
	 * there is no guarantee that the refcount is still 1.  If not, whoever
	 * puts the last reference will free the page, but they will not have to
	 * zap the root because a root cannot go from invalid to valid.
	 */
	if (!kvm_tdp_root_mark_invalid(root)) {
		refcount_set(&root->tdp_mmu_root_count, 1);

		/*
		 * Zapping the root in a worker is not just "nice to have";
		 * it is required because kvm_tdp_mmu_invalidate_all_roots()
		 * skips already-invalid roots.  If kvm_tdp_mmu_put_root() did
		 * not add the root to the workqueue, kvm_tdp_mmu_zap_all_fast()
		 * might return with some roots not zapped yet.
		 */
		tdp_mmu_schedule_zap_root(kvm, root);
		return;
	}

	/* Already-invalid root with no references left: unlink and free it. */
	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	list_del_rcu(&root->link);
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
	call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
}

/*
 * Returns the next root after @prev_root (or the first root if @prev_root is
 * NULL).  A reference to the returned root is acquired, and the reference to
 * @prev_root is released (the caller obviously must hold a reference to
 * @prev_root if it's non-NULL).
 *
 * If @only_valid is true, invalid roots are skipped.
 *
 * Returns NULL if the end of tdp_mmu_roots was reached.
 */
static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
					      struct kvm_mmu_page *prev_root,
					      bool shared, bool only_valid)
{
	struct kvm_mmu_page *next_root;

	rcu_read_lock();

	if (prev_root)
		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						  &prev_root->link,
						  typeof(*prev_root), link);
	else
		next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						   typeof(*next_root), link);

	/*
	 * Skip roots that are invalid (if @only_valid) or for which a
	 * reference cannot be acquired via kvm_tdp_mmu_get_root().
	 */
	while (next_root) {
		if ((!only_valid || !next_root->role.invalid) &&
		    kvm_tdp_mmu_get_root(next_root))
			break;

		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
				&next_root->link, typeof(*next_root), link);
	}

	rcu_read_unlock();

	if (prev_root)
		kvm_tdp_mmu_put_root(kvm, prev_root, shared);

	return next_root;
}

/*
 * Note: this iterator gets and puts references to the roots it iterates over.
 * This makes it safe to release the MMU lock and yield within the loop, but
 * if exiting the loop early, the caller must drop the reference to the most
 * recent root. (Unless keeping a live reference is desirable.)
 *
 * If shared is set, this function is operating under the MMU lock in read
 * mode. In the unlikely event that this thread must free a root, the lock
 * will be temporarily dropped and reacquired in write mode.
 */
#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, _only_valid)\
	for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, _only_valid);	\
	     _root;								\
	     _root = tdp_mmu_next_root(_kvm, _root, _shared, _only_valid))	\
		if (kvm_lockdep_assert_mmu_lock_held(_kvm, _shared) &&		\
		    kvm_mmu_page_as_id(_root) != _as_id) {			\
		} else

#define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared)	\
	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true)

#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id)			\
	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, false, false)

/*
 * Iterate over all TDP MMU roots.  Requires that mmu_lock be held for write,
 * the implication being that any flow that holds mmu_lock for read is
 * inherently yield-friendly and should use the yield-safe variant above.
 * Holding mmu_lock for write obviates the need for RCU protection as the list
 * is guaranteed to be stable.
 */
#define for_each_tdp_mmu_root(_kvm, _root, _as_id)			\
	list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)	\
		if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) &&	\
		    kvm_mmu_page_as_id(_root) != _as_id) {		\
		} else

/* Allocates a shadow page (header + table page) from the vCPU's MMU caches. */
static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu_page *sp;

	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);

	return sp;
}

/*
 * Initializes a freshly allocated shadow page and stashes a backpointer to
 * the header in the table page's page_private.
 */
static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
			    gfn_t gfn, union kvm_mmu_page_role role)
{
	INIT_LIST_HEAD(&sp->possible_nx_huge_page_link);

	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);

	sp->role = role;
	sp->gfn = gfn;
	sp->ptep = sptep;
	sp->tdp_mmu_page = true;

	trace_kvm_mmu_get_page(sp, true);
}

/* Initializes @child_sp one level below the page containing @iter->sptep. */
static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
				  struct tdp_iter *iter)
{
	struct kvm_mmu_page *parent_sp;
	union kvm_mmu_page_role role;

	parent_sp = sptep_to_sp(rcu_dereference(iter->sptep));

	role = parent_sp->role;
	role.level--;

	tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
}

/*
 * Returns the physical address of a TDP MMU root matching the vCPU's current
 * role, reusing an existing root when possible, allocating otherwise.
 */
hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
{
	union kvm_mmu_page_role role = vcpu->arch.mmu->root_role;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_mmu_page *root;

	lockdep_assert_held_write(&kvm->mmu_lock);

	/*
	 * Check for an existing root before allocating a new one.  Note, the
	 * role check prevents consuming an invalid root.
	 */
	for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
		if (root->role.word == role.word &&
		    kvm_tdp_mmu_get_root(root))
			goto out;
	}

	root = tdp_mmu_alloc_sp(vcpu);
	tdp_mmu_init_sp(root, NULL, 0, role);

	refcount_set(&root->tdp_mmu_root_count, 1);

	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);

out:
	return __pa(root->spt);
}

static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level,
				bool shared);

/*
 * Calls kvm_set_pfn_accessed() when a change to a present leaf SPTE drops
 * its Accessed state (zap, clear of the A bit, or PFN change).
 */
static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
{
	if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
		return;

	if (is_accessed_spte(old_spte) &&
	    (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) ||
	     spte_to_pfn(old_spte) != spte_to_pfn(new_spte)))
		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
}

/*
 * Marks the gfn dirty in its memslot when a 4K SPTE transitions to writable
 * (including a writable SPTE installed for a new PFN).
 */
static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
					  u64 old_spte, u64 new_spte, int level)
{
	bool pfn_changed;
	struct kvm_memory_slot *slot;

	if (level > PG_LEVEL_4K)
		return;

	pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

	if ((!is_writable_pte(old_spte) || pfn_changed) &&
	    is_writable_pte(new_spte)) {
		slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
		mark_page_dirty_in_slot(kvm, slot, gfn);
	}
}

/* Accounts a newly linked TDP MMU page table page. */
static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	kvm_account_pgtable_pages((void *)sp->spt, +1);
	atomic64_inc(&kvm->arch.tdp_mmu_pages);
}

/* Reverses tdp_account_mmu_page(). */
static void tdp_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	kvm_account_pgtable_pages((void *)sp->spt, -1);
	atomic64_dec(&kvm->arch.tdp_mmu_pages);
}

/**
 * tdp_mmu_unlink_sp() - Remove a
 *	shadow page from the list of used pages
 *
 * @kvm: kvm instance
 * @sp: the page to be removed
 * @shared: This operation may not be running under the exclusive use of
 *	    the MMU lock and the operation must synchronize with other
 *	    threads that might be adding or removing pages.
 */
static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
			      bool shared)
{
	tdp_unaccount_mmu_page(kvm, sp);

	if (!sp->nx_huge_page_disallowed)
		return;

	/*
	 * When running shared (mmu_lock held for read), tdp_mmu_pages_lock
	 * stands in for exclusive mmu_lock to protect the NX huge page state.
	 */
	if (shared)
		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	else
		lockdep_assert_held_write(&kvm->mmu_lock);

	sp->nx_huge_page_disallowed = false;
	untrack_possible_nx_huge_page(kvm, sp);

	if (shared)
		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}

/**
 * handle_removed_pt() - handle a page table removed from the TDP structure
 *
 * @kvm: kvm instance
 * @pt: the page removed from the paging structure
 * @shared: This operation may not be running under the exclusive use
 *	    of the MMU lock and the operation must synchronize with other
 *	    threads that might be modifying SPTEs.
 *
 * Given a page table that has been removed from the TDP paging structure,
 * iterates through the page table to clear SPTEs and free child page tables.
 *
 * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
 * protection. Since this thread removed it from the paging structure,
 * this thread will be responsible for ensuring the page is freed. Hence the
 * early rcu_dereferences in the function.
 */
static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
{
	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
	int level = sp->role.level;
	gfn_t base_gfn = sp->gfn;
	int i;

	trace_kvm_mmu_prepare_zap_page(sp);

	tdp_mmu_unlink_sp(kvm, sp, shared);

	for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
		tdp_ptep_t sptep = pt + i;
		gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
		u64 old_spte;

		if (shared) {
			/*
			 * Set the SPTE to a nonpresent value that other
			 * threads will not overwrite. If the SPTE was
			 * already marked as removed then another thread
			 * handling a page fault could overwrite it, so
			 * set the SPTE until it is set from some other
			 * value to the removed SPTE value.
			 */
			for (;;) {
				old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, REMOVED_SPTE);
				if (!is_removed_spte(old_spte))
					break;
				cpu_relax();
			}
		} else {
			/*
			 * If the SPTE is not MMU-present, there is no backing
			 * page associated with the SPTE and so no side effects
			 * that need to be recorded, and exclusive ownership of
			 * mmu_lock ensures the SPTE can't be made present.
			 * Note, zapping MMIO SPTEs is also unnecessary as they
			 * are guarded by the memslots generation, not by being
			 * unreachable.
			 */
			old_spte = kvm_tdp_mmu_read_spte(sptep);
			if (!is_shadow_present_pte(old_spte))
				continue;

			/*
			 * Use the common helper instead of a raw WRITE_ONCE as
			 * the SPTE needs to be updated atomically if it can be
			 * modified by a different vCPU outside of mmu_lock.
			 * Even though the parent SPTE is !PRESENT, the TLB
			 * hasn't yet been flushed, and both Intel and AMD
			 * document that A/D assists can use upper-level PxE
			 * entries that are cached in the TLB, i.e. the CPU can
			 * still access the page and mark it dirty.
			 *
			 * No retry is needed in the atomic update path as the
			 * sole concern is dropping a Dirty bit, i.e. no other
			 * task can zap/remove the SPTE as mmu_lock is held for
			 * write.  Marking the SPTE as a removed SPTE is not
			 * strictly necessary for the same reason, but using
			 * the remove SPTE value keeps the shared/exclusive
			 * paths consistent and allows the handle_changed_spte()
			 * call below to hardcode the new value to REMOVED_SPTE.
			 *
			 * Note, even though dropping a Dirty bit is the only
			 * scenario where a non-atomic update could result in a
			 * functional bug, simply checking the Dirty bit isn't
			 * sufficient as a fast page fault could read the upper
			 * level SPTE before it is zapped, and then make this
			 * target SPTE writable, resume the guest, and set the
			 * Dirty bit between reading the SPTE above and writing
			 * it here.
			 */
			old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte,
							  REMOVED_SPTE, level);
		}
		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
				    old_spte, REMOVED_SPTE, level, shared);
	}

	/* Free the table page only after an RCU grace period. */
	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
}

/**
 * __handle_changed_spte - handle bookkeeping associated with an SPTE change
 * @kvm: kvm instance
 * @as_id: the address space of the paging structure the SPTE was a part of
 * @gfn: the base GFN that was mapped by the SPTE
 * @old_spte: The value of the SPTE before the change
 * @new_spte: The value of the SPTE after the change
 * @level: the level of the PT the SPTE is part of in the paging structure
 * @shared: This operation may not be running under the exclusive use of
 *	    the MMU lock and the operation must synchronize with other
 *	    threads that might be modifying SPTEs.
 *
 * Handle bookkeeping that might result from the modification of a SPTE.
 * This function must be called for all TDP SPTE modifications.
 */
static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				  u64 old_spte, u64 new_spte, int level,
				  bool shared)
{
	bool was_present = is_shadow_present_pte(old_spte);
	bool is_present = is_shadow_present_pte(new_spte);
	bool was_leaf = was_present && is_last_spte(old_spte, level);
	bool is_leaf = is_present && is_last_spte(new_spte, level);
	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

	WARN_ON(level > PT64_ROOT_MAX_LEVEL);
	WARN_ON(level < PG_LEVEL_4K);
	WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));

	/*
	 * If this warning were to trigger it would indicate that there was a
	 * missing MMU notifier or a race with some notifier handler.
	 * A present, leaf SPTE should never be directly replaced with another
	 * present leaf SPTE pointing to a different PFN. A notifier handler
	 * should be zapping the SPTE before the main MM's page table is
	 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
	 * thread before replacement.
	 */
	if (was_leaf && is_leaf && pfn_changed) {
		pr_err("Invalid SPTE change: cannot replace a present leaf\n"
		       "SPTE with another present leaf SPTE mapping a\n"
		       "different PFN!\n"
		       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
		       as_id, gfn, old_spte, new_spte, level);

		/*
		 * Crash the host to prevent error propagation and guest data
		 * corruption.
		 */
		BUG();
	}

	if (old_spte == new_spte)
		return;

	trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);

	if (is_leaf)
		check_spte_writable_invariants(new_spte);

	/*
	 * The only times a SPTE should be changed from a non-present to
	 * non-present state is when an MMIO entry is installed/modified/
	 * removed. In that case, there is nothing to do here.
	 */
	if (!was_present && !is_present) {
		/*
		 * If this change does not involve a MMIO SPTE or removed SPTE,
		 * it is unexpected. Log the change, though it should not
		 * impact the guest since both the former and current SPTEs
		 * are nonpresent.
		 */
		if (WARN_ON(!is_mmio_spte(old_spte) &&
			    !is_mmio_spte(new_spte) &&
			    !is_removed_spte(new_spte)))
			pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
			       "should not be replaced with another,\n"
			       "different nonpresent SPTE, unless one or both\n"
			       "are MMIO SPTEs, or the new SPTE is\n"
			       "a temporary removed SPTE.\n"
			       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
			       as_id, gfn, old_spte, new_spte, level);
		return;
	}

	if (is_leaf != was_leaf)
		kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);

	if (was_leaf && is_dirty_spte(old_spte) &&
	    (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
		kvm_set_pfn_dirty(spte_to_pfn(old_spte));

	/*
	 * Recursively handle child PTs if the change removed a subtree from
	 * the paging structure. Note the WARN on the PFN changing without the
	 * SPTE being converted to a hugepage (leaf) or being zapped.  Shadow
	 * pages are kernel allocations and should never be migrated.
	 */
	if (was_present && !was_leaf &&
	    (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
		handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
}

/*
 * Full bookkeeping for an SPTE change: core handling, Accessed-bit
 * propagation, and dirty logging.
 */
static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level,
				bool shared)
{
	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
			      shared);
	handle_changed_spte_acc_track(old_spte, new_spte, level);
	handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
				      new_spte, level);
}

/*
 * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
 * and handle the associated bookkeeping.
 * Do not mark the page dirty
 * in KVM's dirty bitmaps.
 *
 * If setting the SPTE fails because it has changed, iter->old_spte will be
 * refreshed to the current value of the spte.
 *
 * @kvm: kvm instance
 * @iter: a tdp_iter instance currently on the SPTE that should be set
 * @new_spte: The value the SPTE should be set to
 * Return:
 * * 0      - If the SPTE was set.
 * * -EBUSY - If the SPTE cannot be set. In this case this function will have
 *            no side-effects other than setting iter->old_spte to the last
 *            known value of the spte.
 */
static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
					  struct tdp_iter *iter,
					  u64 new_spte)
{
	u64 *sptep = rcu_dereference(iter->sptep);

	/*
	 * The caller is responsible for ensuring the old SPTE is not a REMOVED
	 * SPTE.  KVM should never attempt to zap or manipulate a REMOVED SPTE,
	 * and pre-checking before inserting a new SPTE is advantageous as it
	 * avoids unnecessary work.
	 */
	WARN_ON_ONCE(iter->yielded || is_removed_spte(iter->old_spte));

	lockdep_assert_held_read(&kvm->mmu_lock);

	/*
	 * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
	 * does not hold the mmu_lock.
	 */
	if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte))
		return -EBUSY;

	__handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
			      new_spte, iter->level, true);
	handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);

	return 0;
}

/*
 * Atomically zaps the SPTE at @iter under the read lock: freeze it as
 * REMOVED_SPTE, flush TLBs for the range, then clear it to zero.
 */
static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
					  struct tdp_iter *iter)
{
	int ret;

	/*
	 * Freeze the SPTE by setting it to a special,
	 * non-present value. This will stop other threads from
	 * immediately installing a present entry in its place
	 * before the TLBs are flushed.
	 */
	ret = tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE);
	if (ret)
		return ret;

	kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
					   KVM_PAGES_PER_HPAGE(iter->level));

	/*
	 * No other thread can overwrite the removed SPTE as they must either
	 * wait on the MMU lock or use tdp_mmu_set_spte_atomic() which will not
	 * overwrite the special removed SPTE value. No bookkeeping is needed
	 * here since the SPTE is going from non-present to non-present. Use
	 * the raw write helper to avoid an unnecessary check on volatile bits.
	 */
	__kvm_tdp_mmu_write_spte(iter->sptep, 0);

	return 0;
}


/*
 * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
 * @kvm:	      KVM instance
 * @as_id:	      Address space ID, i.e. regular vs. SMM
 * @sptep:	      Pointer to the SPTE
 * @old_spte:	      The current value of the SPTE
 * @new_spte:	      The new value that will be set for the SPTE
 * @gfn:	      The base GFN that was (or will be) mapped by the SPTE
 * @level:	      The level _containing_ the SPTE (its parent PT's level)
 * @record_acc_track: Notify the MM subsystem of changes to the accessed state
 *		      of the page.  Should be set unless handling an MMU
 *		      notifier for access tracking.  Leaving record_acc_track
 *		      unset in that case prevents page accesses from being
 *		      double counted.
 * @record_dirty_log: Record the page as dirty in the dirty bitmap if
 *		      appropriate for the change being made.  Should be set
 *		      unless performing certain dirty logging operations.
 *		      Leaving record_dirty_log unset in that case prevents page
 *		      writes from being double counted.
 *
 * Returns the old SPTE value, which _may_ be different than @old_spte if the
 * SPTE had volatile bits.
 */
static u64 __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
			      u64 old_spte, u64 new_spte, gfn_t gfn, int level,
			      bool record_acc_track, bool record_dirty_log)
{
	lockdep_assert_held_write(&kvm->mmu_lock);

	/*
	 * No thread should be using this function to set SPTEs to or from the
	 * temporary removed SPTE value.
	 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
	 * should be used. If operating under the MMU lock in write mode, the
	 * use of the removed SPTE should not be necessary.
	 */
	WARN_ON(is_removed_spte(old_spte) || is_removed_spte(new_spte));

	old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);

	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);

	if (record_acc_track)
		handle_changed_spte_acc_track(old_spte, new_spte, level);
	if (record_dirty_log)
		handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
					      new_spte, level);
	return old_spte;
}

/*
 * Iterator-based wrapper for __tdp_mmu_set_spte(); refreshes iter->old_spte
 * with the value actually present in the SPTE before the write.
 */
static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
				     u64 new_spte, bool record_acc_track,
				     bool record_dirty_log)
{
	WARN_ON_ONCE(iter->yielded);

	iter->old_spte = __tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep,
					    iter->old_spte, new_spte,
					    iter->gfn, iter->level,
					    record_acc_track, record_dirty_log);
}

static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
				    u64 new_spte)
{
	_tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
}

static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
						 struct tdp_iter *iter,
						 u64 new_spte)
{
	_tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
}

static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
						 struct tdp_iter *iter,
						 u64 new_spte)
{
	_tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
}

#define tdp_root_for_each_pte(_iter, _root, _start, _end) \
	for_each_tdp_pte(_iter, _root, _start, _end)

#define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)	\
	tdp_root_for_each_pte(_iter, _root, _start, _end)	\
		if (!is_shadow_present_pte(_iter.old_spte) ||	\
		    !is_last_spte(_iter.old_spte, _iter.level))	\
			continue;				\
		else

#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)		\
	for_each_tdp_pte(_iter, to_shadow_page(_mmu->root.hpa), _start, _end)

/*
 * Yield if the MMU lock is contended or this thread needs to return control
 * to the scheduler.
 *
 * If this function should yield and flush is set, it will perform a remote
 * TLB flush before yielding.
 *
 * If this function yields, iter->yielded is set and the caller must skip to
 * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk
 * over the paging structures to allow the iterator to continue its traversal
 * from the paging structure root.
 *
 * Returns true if this function yielded.
 */
static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
							  struct tdp_iter *iter,
							  bool flush, bool shared)
{
	WARN_ON(iter->yielded);

	/* Ensure forward progress has been made before yielding. */
	if (iter->next_last_level_gfn == iter->yielded_gfn)
		return false;

	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
		if (flush)
			kvm_flush_remote_tlbs(kvm);

		/*
		 * Exit the RCU read-side critical section while blocked; page
		 * table memory is accessed only under rcu_read_lock (see
		 * tdp_mmu_free_sp_rcu_callback()).
		 */
		rcu_read_unlock();

		if (shared)
			cond_resched_rwlock_read(&kvm->mmu_lock);
		else
			cond_resched_rwlock_write(&kvm->mmu_lock);

		rcu_read_lock();

		WARN_ON(iter->gfn > iter->next_last_level_gfn);

		iter->yielded = true;
	}

	return iter->yielded;
}

static inline gfn_t tdp_mmu_max_gfn_exclusive(void)
{
	/*
	 * Bound TDP MMU walks at host.MAXPHYADDR.
	 * KVM disallows memslots with
	 * a gpa range that would exceed the max gfn, and KVM does not create
	 * MMIO SPTEs for "impossible" gfns, instead sending such accesses down
	 * the slow emulation path every time.
	 */
	return kvm_mmu_max_gfn() + 1;
}

/* Zaps all present SPTEs at exactly @zap_level within @root. */
static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
			       bool shared, int zap_level)
{
	struct tdp_iter iter;

	gfn_t end = tdp_mmu_max_gfn_exclusive();
	gfn_t start = 0;

	for_each_tdp_pte_min_level(iter, root, zap_level, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
			continue;

		if (!is_shadow_present_pte(iter.old_spte))
			continue;

		/* The iterator also visits levels above @zap_level; skip them. */
		if (iter.level > zap_level)
			continue;

		if (!shared)
			tdp_mmu_set_spte(kvm, &iter, 0);
		else if (tdp_mmu_set_spte_atomic(kvm, &iter, 0))
			goto retry;
	}
}

static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
			     bool shared)
{

	/*
	 * The root must have an elevated refcount so that it's reachable via
	 * mmu_notifier callbacks, which allows this path to yield and drop
	 * mmu_lock. When handling an unmap/release mmu_notifier command, KVM
	 * must drop all references to relevant pages prior to completing the
	 * callback. Dropping mmu_lock with an unreachable root would result
	 * in zapping SPTEs after a relevant mmu_notifier callback completes
	 * and lead to use-after-free as zapping a SPTE triggers "writeback" of
	 * dirty accessed bits to the SPTE's associated struct page.
	 */
	WARN_ON_ONCE(!refcount_read(&root->tdp_mmu_root_count));

	kvm_lockdep_assert_mmu_lock_held(kvm, shared);

	rcu_read_lock();

	/*
	 * To avoid RCU stalls due to recursively removing huge swaths of SPs,
	 * split the zap into two passes. On the first pass, zap at the 1gb
	 * level, and then zap top-level SPs on the second pass. "1gb" is not
	 * arbitrary, as KVM must be able to zap a 1gb shadow page without
	 * inducing a stall to allow in-place replacement with a 1gb hugepage.
	 *
	 * Because zapping a SP recurses on its children, stepping down to
	 * PG_LEVEL_4K in the iterator itself is unnecessary.
	 */
	__tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G);
	__tdp_mmu_zap_root(kvm, root, shared, root->role.level);

	rcu_read_unlock();
}

/* Zaps the single (non-root) shadow page @sp by clearing its parent entry. */
bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	u64 old_spte;

	/*
	 * This helper intentionally doesn't allow zapping a root shadow page,
	 * which doesn't have a parent page table and thus no associated entry.
	 */
	if (WARN_ON_ONCE(!sp->ptep))
		return false;

	old_spte = kvm_tdp_mmu_read_spte(sp->ptep);
	if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
		return false;

	__tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0,
			   sp->gfn, sp->role.level + 1, true, true);

	return true;
}

/*
 * If can_yield is true, will release the MMU lock and reschedule if the
 * scheduler needs the CPU or there is contention on the MMU lock. If this
 * function cannot yield, it will not release the MMU lock or reschedule and
 * the caller must ensure it does not supply too large a GFN range, or the
 * operation can cause a soft lockup.
 */
static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
			      gfn_t start, gfn_t end, bool can_yield, bool flush)
{
	struct tdp_iter iter;

	/* Clamp the walk to the max gfn the TDP MMU can possibly map. */
	end = min(end, tdp_mmu_max_gfn_exclusive());

	lockdep_assert_held_write(&kvm->mmu_lock);

	rcu_read_lock();

	for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) {
		/* Yielding drops mmu_lock; a pending flush must happen first. */
		if (can_yield &&
		    tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
			flush = false;
			continue;
		}

		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		tdp_mmu_set_spte(kvm, &iter, 0);
		flush = true;
	}

	rcu_read_unlock();

	/*
	 * Because this flow zaps _only_ leaf SPTEs, the caller doesn't need
	 * to provide RCU protection as no 'struct kvm_mmu_page' will be freed.
	 */
	return flush;
}

/*
 * Zap leaf SPTEs for the range of gfns, [start, end), for all roots. Returns
 * true if a TLB flush is needed before releasing the MMU lock, i.e. if one or
 * more SPTEs were zapped since the MMU lock was last acquired.
 */
bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end,
			   bool can_yield, bool flush)
{
	struct kvm_mmu_page *root;

	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id)
		flush = tdp_mmu_zap_leafs(kvm, root, start, end, can_yield, flush);

	return flush;
}

/* Drop every SPTE in every root, e.g. at VM teardown. */
void kvm_tdp_mmu_zap_all(struct kvm *kvm)
{
	struct kvm_mmu_page *root;
	int i;

	/*
	 * Zap all roots, including invalid roots, as all SPTEs must be dropped
	 * before returning to the caller. Zap directly even if the root is
	 * also being zapped by a worker. Walking zapped top-level SPTEs isn't
	 * all that expensive and mmu_lock is already held, which means the
	 * worker has yielded, i.e. flushing the work instead of zapping here
	 * isn't guaranteed to be any faster.
	 *
	 * A TLB flush is unnecessary, KVM zaps everything if and only if the VM
	 * is being destroyed or the userspace VMM has exited. In both cases,
	 * KVM_RUN is unreachable, i.e. no vCPUs will ever service the request.
	 */
	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
		for_each_tdp_mmu_root_yield_safe(kvm, root, i)
			tdp_mmu_zap_root(kvm, root, false);
	}
}

/*
 * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast
 * zap" completes.
 */
void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
{
	/* Waits for all queued tdp_mmu_zap_root_work items to finish. */
	flush_workqueue(kvm->arch.tdp_mmu_zap_wq);
}

/*
 * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
 * is about to be zapped, e.g. in response to a memslots update. The actual
 * zapping is performed asynchronously, so a reference is taken on all roots.
 * Using a separate workqueue makes it easy to ensure that the destruction is
 * performed before the "fast zap" completes, without keeping a separate list
 * of invalidated roots; the list is effectively the list of work items in
 * the workqueue.
 *
 * Get a reference even if the root is already invalid, the asynchronous worker
 * assumes it was gifted a reference to the root it processes. Because mmu_lock
 * is held for write, it should be impossible to observe a root with zero refcount,
 * i.e. the list of roots cannot be stale.
 *
 * This has essentially the same effect for the TDP MMU
 * as updating mmu_valid_gen does for the shadow MMU.
 */
void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
{
	struct kvm_mmu_page *root;

	lockdep_assert_held_write(&kvm->mmu_lock);
	list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
		/*
		 * Skip roots already invalidated (their zap is queued), and
		 * WARN if getting a reference fails, which shouldn't happen
		 * while holding mmu_lock for write (see comment above).
		 */
		if (!root->role.invalid &&
		    !WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root))) {
			root->role.invalid = true;
			tdp_mmu_schedule_zap_root(kvm, root);
		}
	}
}

/*
 * Installs a last-level SPTE to handle a TDP page fault.
 * (NPT/EPT violation/misconfiguration)
 */
static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
					   struct kvm_page_fault *fault,
					   struct tdp_iter *iter)
{
	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
	u64 new_spte;
	int ret = RET_PF_FIXED;
	bool wrprot = false;

	if (WARN_ON_ONCE(sp->role.level != fault->goal_level))
		return RET_PF_RETRY;

	if (unlikely(!fault->slot))
		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
	else
		/* wrprot is set if make_spte() write-protected the new SPTE. */
		wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
				   fault->pfn, iter->old_spte, fault->prefetch, true,
				   fault->map_writable, &new_spte);

	if (new_spte == iter->old_spte)
		ret = RET_PF_SPURIOUS;
	else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
		return RET_PF_RETRY;
	else if (is_shadow_present_pte(iter->old_spte) &&
		 !is_last_spte(iter->old_spte, iter->level))
		kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
						   KVM_PAGES_PER_HPAGE(iter->level + 1));

	/*
	 * If the page fault was caused by a write but the page is write
	 * protected, emulation is needed. If the emulation was skipped,
	 * the vCPU would have the same fault again.
	 */
	if (wrprot) {
		if (fault->write)
			ret = RET_PF_EMULATE;
	}

	/* If a MMIO SPTE is installed, the MMIO will need to be emulated.
 */
	if (unlikely(is_mmio_spte(new_spte))) {
		vcpu->stat.pf_mmio_spte_created++;
		trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
				     new_spte);
		ret = RET_PF_EMULATE;
	} else {
		trace_kvm_mmu_set_spte(iter->level, iter->gfn,
				       rcu_dereference(iter->sptep));
	}

	return ret;
}

/*
 * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the
 * provided page table.
 *
 * @kvm: kvm instance
 * @iter: a tdp_iter instance currently on the SPTE that should be set
 * @sp: The new TDP page table to install.
 * @shared: This operation is running under the MMU lock in read mode.
 *
 * Returns: 0 if the new page table was installed. Non-0 if the page table
 *	    could not be installed (e.g. the atomic compare-exchange failed).
 */
static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
			   struct kvm_mmu_page *sp, bool shared)
{
	u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled());
	int ret = 0;

	if (shared) {
		ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
		if (ret)
			return ret;
	} else {
		tdp_mmu_set_spte(kvm, iter, spte);
	}

	/* Account the new page only after it is reachable/installed. */
	tdp_account_mmu_page(kvm, sp);

	return 0;
}

static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
				   struct kvm_mmu_page *sp, bool shared);

/*
 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
 * page tables and SPTEs to translate the faulting guest physical address.
 */
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	struct kvm *kvm = vcpu->kvm;
	struct tdp_iter iter;
	struct kvm_mmu_page *sp;
	int ret = RET_PF_RETRY;

	kvm_mmu_hugepage_adjust(vcpu, fault);

	trace_kvm_mmu_spte_requested(fault);

	rcu_read_lock();

	/* Walk from the root toward the faulting gfn's target level. */
	tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
		int r;

		if (fault->nx_huge_page_workaround_enabled)
			disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);

		/*
		 * If SPTE has been frozen by another thread, just give up and
		 * retry, avoiding unnecessary page table allocation and free.
		 */
		if (is_removed_spte(iter.old_spte))
			goto retry;

		if (iter.level == fault->goal_level)
			goto map_target_level;

		/* Step down into the lower level page table if it exists. */
		if (is_shadow_present_pte(iter.old_spte) &&
		    !is_large_pte(iter.old_spte))
			continue;

		/*
		 * The SPTE is either non-present or points to a huge page that
		 * needs to be split.
		 */
		sp = tdp_mmu_alloc_sp(vcpu);
		tdp_mmu_init_child_sp(sp, &iter);

		sp->nx_huge_page_disallowed = fault->huge_page_disallowed;

		if (is_shadow_present_pte(iter.old_spte))
			r = tdp_mmu_split_huge_page(kvm, &iter, sp, true);
		else
			r = tdp_mmu_link_sp(kvm, &iter, sp, true);

		/*
		 * Force the guest to retry if installing an upper level SPTE
		 * failed, e.g. because a different task modified the SPTE.
 */
		if (r) {
			tdp_mmu_free_sp(sp);
			goto retry;
		}

		if (fault->huge_page_disallowed &&
		    fault->req_level >= iter.level) {
			spin_lock(&kvm->arch.tdp_mmu_pages_lock);
			if (sp->nx_huge_page_disallowed)
				track_possible_nx_huge_page(kvm, sp);
			spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
		}
	}

	/*
	 * The walk aborted before reaching the target level, e.g. because the
	 * iterator detected an upper level SPTE was frozen during traversal.
	 */
	WARN_ON_ONCE(iter.level == fault->goal_level);
	goto retry;

map_target_level:
	ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);

retry:
	rcu_read_unlock();
	return ret;
}

/* MMU notifier unmap hook: zap leaf SPTEs covering the notifier range. */
bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
				 bool flush)
{
	return kvm_tdp_mmu_zap_leafs(kvm, range->slot->as_id, range->start,
				     range->end, range->may_block, flush);
}

typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
			      struct kvm_gfn_range *range);

/*
 * Invoke @handler on every leaf SPTE in @range, for every root in the
 * range's address space. Returns the OR of all handler return values.
 */
static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
						   struct kvm_gfn_range *range,
						   tdp_handler_t handler)
{
	struct kvm_mmu_page *root;
	struct tdp_iter iter;
	bool ret = false;

	/*
	 * Don't support rescheduling, none of the MMU notifiers that funnel
	 * into this helper allow blocking; it'd be dead, wasteful code.
	 */
	for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
		rcu_read_lock();

		tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
			ret |= handler(kvm, &iter, range);

		rcu_read_unlock();
	}

	return ret;
}

/*
 * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero
 * if any of the GFNs in the range have been accessed.
 */
static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
			  struct kvm_gfn_range *range)
{
	u64 new_spte = 0;

	/* If we have a non-accessed entry we don't need to change the pte. */
	if (!is_accessed_spte(iter->old_spte))
		return false;

	new_spte = iter->old_spte;

	if (spte_ad_enabled(new_spte)) {
		/* Hardware A/D bits in use: just clear the accessed bit. */
		new_spte &= ~shadow_accessed_mask;
	} else {
		/*
		 * Capture the dirty status of the page, so that it doesn't get
		 * lost when the SPTE is marked for access tracking.
		 */
		if (is_writable_pte(new_spte))
			kvm_set_pfn_dirty(spte_to_pfn(new_spte));

		new_spte = mark_spte_for_access_track(new_spte);
	}

	tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte);

	return true;
}

/* MMU notifier "clear young" hook for the TDP MMU. */
bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
	return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
}

static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
			 struct kvm_gfn_range *range)
{
	return is_accessed_spte(iter->old_spte);
}

/* MMU notifier "test young" hook: report, but don't clear, accessed state. */
bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
}

static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
			 struct kvm_gfn_range *range)
{
	u64 new_spte;

	/* Huge pages aren't expected to be modified without first being zapped. */
	WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end);

	if (iter->level != PG_LEVEL_4K ||
	    !is_shadow_present_pte(iter->old_spte))
		return false;

	/*
	 * Note, when changing a read-only SPTE, it's not strictly necessary to
	 * zero the SPTE before setting the new PFN, but doing so preserves the
	 * invariant that the PFN of a present leaf SPTE can never change.
	 * See __handle_changed_spte().
 */
	tdp_mmu_set_spte(kvm, iter, 0);

	if (!pte_write(range->pte)) {
		/* Re-install a read-only SPTE pointing at the new pfn. */
		new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
								  pte_pfn(range->pte));

		tdp_mmu_set_spte(kvm, iter, new_spte);
	}

	return true;
}

/*
 * Handle the changed_pte MMU notifier for the TDP MMU.
 * data is a pointer to the new pte_t mapping the HVA specified by the MMU
 * notifier.
 * Returns non-zero if a flush is needed before releasing the MMU lock.
 */
bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	/*
	 * No need to handle the remote TLB flush under RCU protection, the
	 * target SPTE _must_ be a leaf SPTE, i.e. cannot result in freeing a
	 * shadow page. See the WARN on pfn_changed in __handle_changed_spte().
	 */
	return kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
}

/*
 * Remove write access from all SPTEs at or above min_level that map GFNs
 * [start, end). Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			     gfn_t start, gfn_t end, int min_level)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	rcu_read_lock();

	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);

	for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
			continue;

		/* Skip non-present, non-leaf, and already read-only SPTEs. */
		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level) ||
		    !(iter.old_spte & PT_WRITABLE_MASK))
			continue;

		new_spte = iter.old_spte & ~PT_WRITABLE_MASK;

		/* mmu_lock is held for read here, so update atomically. */
		if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
			goto retry;

		spte_set = true;
	}

	rcu_read_unlock();
	return spte_set;
}

/*
 * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
 * only affect leaf SPTEs down to min_level.
 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
 */
bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
			     const struct kvm_memory_slot *slot, int min_level)
{
	struct kvm_mmu_page *root;
	bool spte_set = false;

	lockdep_assert_held_read(&kvm->mmu_lock);

	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
		spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
					     slot->base_gfn + slot->npages, min_level);

	return spte_set;
}

/*
 * Allocate a zeroed shadow page (header + table page) for huge page
 * splitting, using the caller-supplied GFP flags. Returns NULL on failure.
 */
static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
{
	struct kvm_mmu_page *sp;

	gfp |= __GFP_ZERO;

	sp = kmem_cache_alloc(mmu_page_header_cache, gfp);
	if (!sp)
		return NULL;

	sp->spt = (void *)__get_free_page(gfp);
	if (!sp->spt) {
		kmem_cache_free(mmu_page_header_cache, sp);
		return NULL;
	}

	return sp;
}

static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
						       struct tdp_iter *iter,
						       bool shared)
{
	struct kvm_mmu_page *sp;

	/*
	 * Since we are allocating while under the MMU lock we have to be
	 * careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct
	 * reclaim and to avoid making any filesystem callbacks (which can end
	 * up invoking KVM MMU notifiers, resulting in a deadlock).
	 *
	 * If this allocation fails we drop the lock and retry with reclaim
	 * allowed.
	 */
	sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT);
	if (sp)
		return sp;

	rcu_read_unlock();

	if (shared)
		read_unlock(&kvm->mmu_lock);
	else
		write_unlock(&kvm->mmu_lock);

	/* Dropping mmu_lock invalidates the walk; tell the iterator to restart. */
	iter->yielded = true;
	sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT);

	if (shared)
		read_lock(&kvm->mmu_lock);
	else
		write_lock(&kvm->mmu_lock);

	rcu_read_lock();

	return sp;
}

/* Note, the caller is responsible for initializing @sp.
 */
static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
				   struct kvm_mmu_page *sp, bool shared)
{
	const u64 huge_spte = iter->old_spte;
	const int level = iter->level;
	int ret, i;

	/*
	 * No need for atomics when writing to sp->spt since the page table has
	 * not been linked in yet and thus is not reachable from any other CPU.
	 */
	for (i = 0; i < SPTE_ENT_PER_PAGE; i++)
		sp->spt[i] = make_huge_page_split_spte(kvm, huge_spte, sp->role, i);

	/*
	 * Replace the huge spte with a pointer to the populated lower level
	 * page table. Since we are making this change without a TLB flush vCPUs
	 * will see a mix of the split mappings and the original huge mapping,
	 * depending on what's currently in their TLB. This is fine from a
	 * correctness standpoint since the translation will be the same either
	 * way.
	 */
	ret = tdp_mmu_link_sp(kvm, iter, sp, shared);
	if (ret)
		goto out;

	/*
	 * tdp_mmu_link_sp() will handle subtracting the huge page we
	 * are overwriting from the page stats. But we have to manually update
	 * the page stats with the new present child pages.
	 */
	kvm_update_page_stats(kvm, level - 1, SPTE_ENT_PER_PAGE);

out:
	trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret);
	return ret;
}

static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
					 struct kvm_mmu_page *root,
					 gfn_t start, gfn_t end,
					 int target_level, bool shared)
{
	struct kvm_mmu_page *sp = NULL;
	struct tdp_iter iter;
	int ret = 0;

	rcu_read_lock();

	/*
	 * Traverse the page table splitting all huge pages above the target
	 * level into one lower level. For example, if we encounter a 1GB page
	 * we split it into 512 2MB pages.
	 *
	 * Since the TDP iterator uses a pre-order traversal, we are guaranteed
	 * to visit an SPTE before ever visiting its children, which means we
	 * will correctly recursively split huge pages that are more than one
	 * level above the target level (e.g. splitting a 1GB to 512 2MB pages,
	 * and then splitting each of those to 512 4KB pages).
	 */
	for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
			continue;

		if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
			continue;

		/* Allocate a new sp lazily, and reuse one left by a lost race. */
		if (!sp) {
			sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared);
			if (!sp) {
				ret = -ENOMEM;
				trace_kvm_mmu_split_huge_page(iter.gfn,
							      iter.old_spte,
							      iter.level, ret);
				break;
			}

			/* The allocation dropped mmu_lock; restart the walk. */
			if (iter.yielded)
				continue;
		}

		tdp_mmu_init_child_sp(sp, &iter);

		if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
			goto retry;

		sp = NULL;
	}

	rcu_read_unlock();

	/*
	 * It's possible to exit the loop having never used the last sp if, for
	 * example, a vCPU doing HugePage NX splitting wins the race and
	 * installs its own sp in place of the last sp we tried to split.
	 */
	if (sp)
		tdp_mmu_free_sp(sp);

	return ret;
}


/*
 * Try to split all huge pages mapped by the TDP MMU down to the target level.
 */
void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
				      const struct kvm_memory_slot *slot,
				      gfn_t start, gfn_t end,
				      int target_level, bool shared)
{
	struct kvm_mmu_page *root;
	int r = 0;

	kvm_lockdep_assert_mmu_lock_held(kvm, shared);

	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, shared) {
		r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared);
		if (r) {
			/* Drop the reference taken by the yield-safe iterator. */
			kvm_tdp_mmu_put_root(kvm, root, shared);
			break;
		}
	}
}

/*
 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
 * If AD bits are not enabled, this will require clearing the writable bit on
 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
				  gfn_t start, gfn_t end)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	rcu_read_lock();

	tdp_root_for_each_leaf_pte(iter, root, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
			continue;

		if (!is_shadow_present_pte(iter.old_spte))
			continue;

		if (spte_ad_need_write_protect(iter.old_spte)) {
			/* No usable dirty bit: track dirtiness via write access. */
			if (is_writable_pte(iter.old_spte))
				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
			else
				continue;
		} else {
			if (iter.old_spte & shadow_dirty_mask)
				new_spte = iter.old_spte & ~shadow_dirty_mask;
			else
				continue;
		}

		if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
			goto retry;

		spte_set = true;
	}

	rcu_read_unlock();
	return spte_set;
}

/*
 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot.
If 1660 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE. 1661 * If AD bits are not enabled, this will require clearing the writable bit on 1662 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to 1663 * be flushed. 1664 */ 1665 bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, 1666 const struct kvm_memory_slot *slot) 1667 { 1668 struct kvm_mmu_page *root; 1669 bool spte_set = false; 1670 1671 lockdep_assert_held_read(&kvm->mmu_lock); 1672 1673 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) 1674 spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn, 1675 slot->base_gfn + slot->npages); 1676 1677 return spte_set; 1678 } 1679 1680 /* 1681 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is 1682 * set in mask, starting at gfn. The given memslot is expected to contain all 1683 * the GFNs represented by set bits in the mask. If AD bits are enabled, 1684 * clearing the dirty status will involve clearing the dirty bit on each SPTE 1685 * or, if AD bits are not enabled, clearing the writable bit on each SPTE. 
 */
static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
				  gfn_t gfn, unsigned long mask, bool wrprot)
{
	struct tdp_iter iter;
	u64 new_spte;

	rcu_read_lock();

	/* Start at the first set bit; the range spans at most BITS_PER_LONG gfns. */
	tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
				   gfn + BITS_PER_LONG) {
		/* Stop early once every requested gfn has been handled. */
		if (!mask)
			break;

		if (iter.level > PG_LEVEL_4K ||
		    !(mask & (1UL << (iter.gfn - gfn))))
			continue;

		mask &= ~(1UL << (iter.gfn - gfn));

		if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
			if (is_writable_pte(iter.old_spte))
				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
			else
				continue;
		} else {
			if (iter.old_spte & shadow_dirty_mask)
				new_spte = iter.old_spte & ~shadow_dirty_mask;
			else
				continue;
		}

		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
	}

	rcu_read_unlock();
}

/*
 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
 * set in mask, starting at gfn. The given memslot is expected to contain all
 * the GFNs represented by set bits in the mask. If AD bits are enabled,
 * clearing the dirty status will involve clearing the dirty bit on each SPTE
 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
 */
void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
				       struct kvm_memory_slot *slot,
				       gfn_t gfn, unsigned long mask,
				       bool wrprot)
{
	struct kvm_mmu_page *root;

	lockdep_assert_held_write(&kvm->mmu_lock);
	for_each_tdp_mmu_root(kvm, root, slot->as_id)
		clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
}

/*
 * Zap non-leaf SPTEs in @root covering @slot that could be replaced by a
 * huge page, so the next fault can map the region with a larger page size.
 */
static void zap_collapsible_spte_range(struct kvm *kvm,
				       struct kvm_mmu_page *root,
				       const struct kvm_memory_slot *slot)
{
	gfn_t start = slot->base_gfn;
	gfn_t end = start + slot->npages;
	struct tdp_iter iter;
	int max_mapping_level;

	rcu_read_lock();

	for_each_tdp_pte_min_level(iter, root, PG_LEVEL_2M, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
			continue;

		if (iter.level > KVM_MAX_HUGEPAGE_LEVEL ||
		    !is_shadow_present_pte(iter.old_spte))
			continue;

		/*
		 * Don't zap leaf SPTEs, if a leaf SPTE could be replaced with
		 * a large page size, then its parent would have been zapped
		 * instead of stepping down.
		 */
		if (is_last_spte(iter.old_spte, iter.level))
			continue;

		/*
		 * If iter.gfn resides outside of the slot, i.e. the page for
		 * the current level overlaps but is not contained by the slot,
		 * then the SPTE can't be made huge. More importantly, trying
		 * to query that info from slot->arch.lpage_info will cause an
		 * out-of-bounds access.
		 */
		if (iter.gfn < start || iter.gfn >= end)
			continue;

		max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot,
							      iter.gfn, PG_LEVEL_NUM);
		if (max_mapping_level < iter.level)
			continue;

		/* Note, a successful atomic zap also does a remote TLB flush.
 */
		if (tdp_mmu_zap_spte_atomic(kvm, &iter))
			goto retry;
	}

	rcu_read_unlock();
}

/*
 * Zap non-leaf SPTEs (and free their associated page tables) which could
 * be replaced by huge pages, for GFNs within the slot.
 */
void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
				       const struct kvm_memory_slot *slot)
{
	struct kvm_mmu_page *root;

	lockdep_assert_held_read(&kvm->mmu_lock);

	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
		zap_collapsible_spte_range(kvm, root, slot);
}

/*
 * Removes write access on the last level SPTE mapping this GFN and unsets the
 * MMU-writable bit to ensure future writes continue to be intercepted.
 * Returns true if an SPTE was set and a TLB flush is needed.
 */
static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
			      gfn_t gfn, int min_level)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);

	rcu_read_lock();

	for_each_tdp_pte_min_level(iter, root, min_level, gfn, gfn + 1) {
		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		new_spte = iter.old_spte &
			~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);

		/* Already write-protected; nothing more to do for this gfn. */
		if (new_spte == iter.old_spte)
			break;

		tdp_mmu_set_spte(kvm, &iter, new_spte);
		spte_set = true;
	}

	rcu_read_unlock();

	return spte_set;
}

/*
 * Removes write access on the last level SPTE mapping this GFN and unsets the
 * MMU-writable bit to ensure future writes continue to be intercepted.
 * Returns true if an SPTE was set and a TLB flush is needed.
 */
bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
				   struct kvm_memory_slot *slot, gfn_t gfn,
				   int min_level)
{
	struct kvm_mmu_page *root;
	bool spte_set = false;

	lockdep_assert_held_write(&kvm->mmu_lock);
	for_each_tdp_mmu_root(kvm, root, slot->as_id)
		spte_set |= write_protect_gfn(kvm, root, gfn, min_level);

	return spte_set;
}

/*
 * Return the level of the lowest level SPTE added to sptes.
 * That SPTE may be non-present.
 *
 * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
 */
int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
			 int *root_level)
{
	struct tdp_iter iter;
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	gfn_t gfn = addr >> PAGE_SHIFT;
	int leaf = -1;

	*root_level = vcpu->arch.mmu->root_role.level;

	/* Record every SPTE on the walk; the last one visited is the leaf. */
	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
		leaf = iter.level;
		sptes[leaf] = iter.old_spte;
	}

	return leaf;
}

/*
 * Returns the last level spte pointer of the shadow page walk for the given
 * gpa, and sets *spte to the spte value. This spte may be non-present. If no
 * walk could be performed, returns NULL and *spte does not contain valid data.
 *
 * Contract:
 *  - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
 *  - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
 *
 * WARNING: This function is only intended to be called during fast_page_fault.
1898 */ 1899 u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr, 1900 u64 *spte) 1901 { 1902 struct tdp_iter iter; 1903 struct kvm_mmu *mmu = vcpu->arch.mmu; 1904 gfn_t gfn = addr >> PAGE_SHIFT; 1905 tdp_ptep_t sptep = NULL; 1906 1907 tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { 1908 *spte = iter.old_spte; 1909 sptep = iter.sptep; 1910 } 1911 1912 /* 1913 * Perform the rcu_dereference to get the raw spte pointer value since 1914 * we are passing it up to fast_page_fault, which is shared with the 1915 * legacy MMU and thus does not retain the TDP MMU-specific __rcu 1916 * annotation. 1917 * 1918 * This is safe since fast_page_fault obeys the contracts of this 1919 * function as well as all TDP MMU contracts around modifying SPTEs 1920 * outside of mmu_lock. 1921 */ 1922 return rcu_dereference(sptep); 1923 } 1924