// SPDX-License-Identifier: GPL-2.0

#include "mmu.h"
#include "mmu_internal.h"
#include "mmutrace.h"
#include "tdp_iter.h"
#include "tdp_mmu.h"
#include "spte.h"

#include <asm/cmpxchg.h>
#include <trace/events/kvm.h>

static bool __read_mostly tdp_mmu_enabled = true;
module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);

/*
 * Initializes the TDP MMU for the VM, if enabled.
 *
 * Returns 0 if the TDP MMU is not used for this VM (nothing is initialized),
 * -ENOMEM if the zap workqueue cannot be allocated, and 1 once the per-VM
 * TDP MMU state has been set up.
 */
int kvm_mmu_init_tdp_mmu(struct kvm *kvm)
{
	struct workqueue_struct *wq;

	if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
		return 0;

	wq = alloc_workqueue("kvm", WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 0);
	if (!wq)
		return -ENOMEM;

	/* This should not be changed for the lifetime of the VM. */
	kvm->arch.tdp_mmu_enabled = true;
	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
	kvm->arch.tdp_mmu_zap_wq = wq;
	return 1;
}

/*
 * Asserts (via lockdep) that mmu_lock is held for read if @shared is true and
 * for write otherwise.
 *
 * Arbitrarily returns true so that this may be used in if statements.
 */
static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
							     bool shared)
{
	if (shared)
		lockdep_assert_held_read(&kvm->mmu_lock);
	else
		lockdep_assert_held_write(&kvm->mmu_lock);

	return true;
}

/*
 * Tears down the per-VM TDP MMU state set up by kvm_mmu_init_tdp_mmu().  Does
 * nothing if the TDP MMU was not enabled for this VM.
 */
void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
{
	if (!kvm->arch.tdp_mmu_enabled)
		return;

	/* Also waits for any queued work items. */
	destroy_workqueue(kvm->arch.tdp_mmu_zap_wq);

	/* Every page and every root must have been unlinked by now. */
	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages));
	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));

	/*
	 * Ensure that all the outstanding RCU callbacks to free shadow pages
	 * can run before the VM is torn down. Work items on tdp_mmu_zap_wq
	 * can call kvm_tdp_mmu_put_root and create new callbacks.
	 */
	rcu_barrier();
}

/* Frees the page table page backing @sp and then the @sp header itself. */
static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
{
	free_page((unsigned long)sp->spt);
	kmem_cache_free(mmu_page_header_cache, sp);
}

/*
 * This is called through call_rcu in order to free TDP page table memory
 * safely with respect to other kernel threads that may be operating on
 * the memory.
 * By only accessing TDP MMU page table memory in an RCU read critical
 * section, and freeing it after a grace period, lockless access to that
 * memory won't use it after it is freed.
 */
static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
{
	struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
					       rcu_head);

	tdp_mmu_free_sp(sp);
}

static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
			     bool shared);

/*
 * Work item queued by tdp_mmu_schedule_zap_root(): zaps the root that embeds
 * @work while holding mmu_lock for read, then puts one reference to it.
 */
static void tdp_mmu_zap_root_work(struct work_struct *work)
{
	struct kvm_mmu_page *root = container_of(work, struct kvm_mmu_page,
						 tdp_mmu_async_work);
	struct kvm *kvm = root->tdp_mmu_async_data;

	read_lock(&kvm->mmu_lock);

	/*
	 * A TLB flush is not necessary as KVM performs a local TLB flush when
	 * allocating a new root (see kvm_mmu_load()), and when migrating vCPU
	 * to a different pCPU. Note, the local TLB flush on reuse also
	 * invalidates any paging-structure-cache entries, i.e. TLB entries for
	 * intermediate paging structures, that may be zapped, as such entries
	 * are associated with the ASID on both VMX and SVM.
	 */
	tdp_mmu_zap_root(kvm, root, true);

	/*
	 * Drop the refcount using kvm_tdp_mmu_put_root() to test its logic for
	 * avoiding an infinite loop. By design, the root is reachable while
	 * it's being asynchronously zapped, thus a different task can put its
	 * last reference, i.e. flowing through kvm_tdp_mmu_put_root() for an
	 * asynchronously zapped root is unavoidable.
	 */
	kvm_tdp_mmu_put_root(kvm, root, true);

	read_unlock(&kvm->mmu_lock);
}

/*
 * Queues @root on the VM's zap workqueue.  The owning @kvm is stashed in the
 * root so that tdp_mmu_zap_root_work() can recover it from the work item.
 */
static void tdp_mmu_schedule_zap_root(struct kvm *kvm, struct kvm_mmu_page *root)
{
	root->tdp_mmu_async_data = kvm;
	INIT_WORK(&root->tdp_mmu_async_work, tdp_mmu_zap_root_work);
	queue_work(kvm->arch.tdp_mmu_zap_wq, &root->tdp_mmu_async_work);
}

/*
 * Atomically sets the invalid bit in @page's role.
 *
 * Returns the previous value of the invalid bit, i.e. true if the page was
 * already marked invalid before this call.
 */
static inline bool kvm_tdp_root_mark_invalid(struct kvm_mmu_page *page)
{
	union kvm_mmu_page_role role = page->role;
	role.invalid = true;

	/* No need to use cmpxchg, only the invalid bit can change. */
	role.word = xchg(&page->role.word, role.word);
	return role.invalid;
}

/*
 * Drops one reference to @root.  mmu_lock must be held, for read if @shared
 * is true and for write otherwise.
 *
 * When the last reference goes away, a root that was still valid is marked
 * invalid, given a fresh reference and handed to the zap workqueue; a root
 * that was already invalid is unlinked from the list of roots and freed after
 * an RCU grace period.
 */
void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
			  bool shared)
{
	kvm_lockdep_assert_mmu_lock_held(kvm, shared);

	if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
		return;

	WARN_ON(!root->tdp_mmu_page);

	/*
	 * The root now has refcount=0. It is valid, but readers already
	 * cannot acquire a reference to it because kvm_tdp_mmu_get_root()
	 * rejects it. This remains true for the rest of the execution
	 * of this function, because readers visit valid roots only
	 * (except for tdp_mmu_zap_root_work(), which however
	 * does not acquire any reference itself).
	 *
	 * Even though there are flows that need to visit all roots for
	 * correctness, they all take mmu_lock for write, so they cannot yet
	 * run concurrently. The same is true after kvm_tdp_root_mark_invalid,
	 * since the root still has refcount=0.
	 *
	 * However, tdp_mmu_zap_root can yield, and writers do not expect to
	 * see refcount=0 (see for example kvm_tdp_mmu_invalidate_all_roots()).
	 * So the root temporarily gets an extra reference, going to refcount=1
	 * while staying invalid. Readers still cannot acquire any reference;
	 * but writers are now allowed to run if tdp_mmu_zap_root yields and
	 * they might take an extra reference if they themselves yield.
	 * Therefore, when the reference is given back by the worker,
	 * there is no guarantee that the refcount is still 1. If not, whoever
	 * puts the last reference will free the page, but they will not have to
	 * zap the root because a root cannot go from invalid to valid.
	 */
	if (!kvm_tdp_root_mark_invalid(root)) {
		refcount_set(&root->tdp_mmu_root_count, 1);

		/*
		 * Zapping the root in a worker is not just "nice to have";
		 * it is required because kvm_tdp_mmu_invalidate_all_roots()
		 * skips already-invalid roots. If kvm_tdp_mmu_put_root() did
		 * not add the root to the workqueue, kvm_tdp_mmu_zap_all_fast()
		 * might return with some roots not zapped yet.
		 */
		tdp_mmu_schedule_zap_root(kvm, root);
		return;
	}

	/* The root was already invalid: unlink it and free it after a grace period. */
	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	list_del_rcu(&root->link);
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
	call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
}

/*
 * Returns the next root after @prev_root (or the first root if @prev_root is
 * NULL). A reference to the returned root is acquired, and the reference to
 * @prev_root is released (the caller obviously must hold a reference to
 * @prev_root if it's non-NULL).
 *
 * If @only_valid is true, invalid roots are skipped.
 *
 * Returns NULL if the end of tdp_mmu_roots was reached.
203 */ 204 static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, 205 struct kvm_mmu_page *prev_root, 206 bool shared, bool only_valid) 207 { 208 struct kvm_mmu_page *next_root; 209 210 rcu_read_lock(); 211 212 if (prev_root) 213 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots, 214 &prev_root->link, 215 typeof(*prev_root), link); 216 else 217 next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots, 218 typeof(*next_root), link); 219 220 while (next_root) { 221 if ((!only_valid || !next_root->role.invalid) && 222 kvm_tdp_mmu_get_root(next_root)) 223 break; 224 225 next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots, 226 &next_root->link, typeof(*next_root), link); 227 } 228 229 rcu_read_unlock(); 230 231 if (prev_root) 232 kvm_tdp_mmu_put_root(kvm, prev_root, shared); 233 234 return next_root; 235 } 236 237 /* 238 * Note: this iterator gets and puts references to the roots it iterates over. 239 * This makes it safe to release the MMU lock and yield within the loop, but 240 * if exiting the loop early, the caller must drop the reference to the most 241 * recent root. (Unless keeping a live reference is desirable.) 242 * 243 * If shared is set, this function is operating under the MMU lock in read 244 * mode. In the unlikely event that this thread must free a root, the lock 245 * will be temporarily dropped and reacquired in write mode. 
 */
#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, _only_valid)\
	for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, _only_valid);	\
	     _root;								\
	     _root = tdp_mmu_next_root(_kvm, _root, _shared, _only_valid))	\
		if (kvm_lockdep_assert_mmu_lock_held(_kvm, _shared) &&		\
		    kvm_mmu_page_as_id(_root) != _as_id) {			\
		} else

/* Yield-safe walk over the valid roots of address space @_as_id only. */
#define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared)	\
	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true)

/*
 * Yield-safe walk over all roots of address space @_as_id, invalid roots
 * included.  Asserts that mmu_lock is held for write.
 */
#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id)			\
	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, false, false)

/*
 * Iterate over all TDP MMU roots. Requires that mmu_lock be held for write,
 * the implication being that any flow that holds mmu_lock for read is
 * inherently yield-friendly and should use the yield-safe variant above.
 * Holding mmu_lock for write obviates the need for RCU protection as the list
 * is guaranteed to be stable.
267 */ 268 #define for_each_tdp_mmu_root(_kvm, _root, _as_id) \ 269 list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) \ 270 if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) && \ 271 kvm_mmu_page_as_id(_root) != _as_id) { \ 272 } else 273 274 static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu) 275 { 276 struct kvm_mmu_page *sp; 277 278 sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache); 279 sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache); 280 281 return sp; 282 } 283 284 static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep, 285 gfn_t gfn, union kvm_mmu_page_role role) 286 { 287 set_page_private(virt_to_page(sp->spt), (unsigned long)sp); 288 289 sp->role = role; 290 sp->gfn = gfn; 291 sp->ptep = sptep; 292 sp->tdp_mmu_page = true; 293 294 trace_kvm_mmu_get_page(sp, true); 295 } 296 297 static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp, 298 struct tdp_iter *iter) 299 { 300 struct kvm_mmu_page *parent_sp; 301 union kvm_mmu_page_role role; 302 303 parent_sp = sptep_to_sp(rcu_dereference(iter->sptep)); 304 305 role = parent_sp->role; 306 role.level--; 307 308 tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role); 309 } 310 311 hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu) 312 { 313 union kvm_mmu_page_role role = vcpu->arch.mmu->root_role; 314 struct kvm *kvm = vcpu->kvm; 315 struct kvm_mmu_page *root; 316 317 lockdep_assert_held_write(&kvm->mmu_lock); 318 319 /* 320 * Check for an existing root before allocating a new one. Note, the 321 * role check prevents consuming an invalid root. 
322 */ 323 for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) { 324 if (root->role.word == role.word && 325 kvm_tdp_mmu_get_root(root)) 326 goto out; 327 } 328 329 root = tdp_mmu_alloc_sp(vcpu); 330 tdp_mmu_init_sp(root, NULL, 0, role); 331 332 refcount_set(&root->tdp_mmu_root_count, 1); 333 334 spin_lock(&kvm->arch.tdp_mmu_pages_lock); 335 list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots); 336 spin_unlock(&kvm->arch.tdp_mmu_pages_lock); 337 338 out: 339 return __pa(root->spt); 340 } 341 342 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, 343 u64 old_spte, u64 new_spte, int level, 344 bool shared); 345 346 static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level) 347 { 348 if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level)) 349 return; 350 351 if (is_accessed_spte(old_spte) && 352 (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) || 353 spte_to_pfn(old_spte) != spte_to_pfn(new_spte))) 354 kvm_set_pfn_accessed(spte_to_pfn(old_spte)); 355 } 356 357 static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn, 358 u64 old_spte, u64 new_spte, int level) 359 { 360 bool pfn_changed; 361 struct kvm_memory_slot *slot; 362 363 if (level > PG_LEVEL_4K) 364 return; 365 366 pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte); 367 368 if ((!is_writable_pte(old_spte) || pfn_changed) && 369 is_writable_pte(new_spte)) { 370 slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn); 371 mark_page_dirty_in_slot(kvm, slot, gfn); 372 } 373 } 374 375 static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp) 376 { 377 kvm_account_pgtable_pages((void *)sp->spt, +1); 378 } 379 380 static void tdp_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp) 381 { 382 kvm_account_pgtable_pages((void *)sp->spt, -1); 383 } 384 385 /** 386 * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages 387 * 388 * @kvm: kvm instance 389 * @sp: the page 
to be removed 390 * @shared: This operation may not be running under the exclusive use of 391 * the MMU lock and the operation must synchronize with other 392 * threads that might be adding or removing pages. 393 */ 394 static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp, 395 bool shared) 396 { 397 tdp_unaccount_mmu_page(kvm, sp); 398 if (shared) 399 spin_lock(&kvm->arch.tdp_mmu_pages_lock); 400 else 401 lockdep_assert_held_write(&kvm->mmu_lock); 402 403 list_del(&sp->link); 404 if (sp->lpage_disallowed) 405 unaccount_huge_nx_page(kvm, sp); 406 407 if (shared) 408 spin_unlock(&kvm->arch.tdp_mmu_pages_lock); 409 } 410 411 /** 412 * handle_removed_pt() - handle a page table removed from the TDP structure 413 * 414 * @kvm: kvm instance 415 * @pt: the page removed from the paging structure 416 * @shared: This operation may not be running under the exclusive use 417 * of the MMU lock and the operation must synchronize with other 418 * threads that might be modifying SPTEs. 419 * 420 * Given a page table that has been removed from the TDP paging structure, 421 * iterates through the page table to clear SPTEs and free child page tables. 422 * 423 * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU 424 * protection. Since this thread removed it from the paging structure, 425 * this thread will be responsible for ensuring the page is freed. Hence the 426 * early rcu_dereferences in the function. 
 */
static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
{
	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
	int level = sp->role.level;
	gfn_t base_gfn = sp->gfn;
	int i;

	trace_kvm_mmu_prepare_zap_page(sp);

	tdp_mmu_unlink_sp(kvm, sp, shared);

	/* Retire every entry of the removed table, one SPTE at a time. */
	for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
		tdp_ptep_t sptep = pt + i;
		gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
		u64 old_spte;

		if (shared) {
			/*
			 * Set the SPTE to a nonpresent value that other
			 * threads will not overwrite. If the SPTE was
			 * already marked as removed then another thread
			 * handling a page fault could overwrite it, so
			 * set the SPTE until it is set from some other
			 * value to the removed SPTE value.
			 */
			for (;;) {
				old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, REMOVED_SPTE);
				if (!is_removed_spte(old_spte))
					break;
				cpu_relax();
			}
		} else {
			/*
			 * If the SPTE is not MMU-present, there is no backing
			 * page associated with the SPTE and so no side effects
			 * that need to be recorded, and exclusive ownership of
			 * mmu_lock ensures the SPTE can't be made present.
			 * Note, zapping MMIO SPTEs is also unnecessary as they
			 * are guarded by the memslots generation, not by being
			 * unreachable.
			 */
			old_spte = kvm_tdp_mmu_read_spte(sptep);
			if (!is_shadow_present_pte(old_spte))
				continue;

			/*
			 * Use the common helper instead of a raw WRITE_ONCE as
			 * the SPTE needs to be updated atomically if it can be
			 * modified by a different vCPU outside of mmu_lock.
			 * Even though the parent SPTE is !PRESENT, the TLB
			 * hasn't yet been flushed, and both Intel and AMD
			 * document that A/D assists can use upper-level PxE
			 * entries that are cached in the TLB, i.e. the CPU can
			 * still access the page and mark it dirty.
			 *
			 * No retry is needed in the atomic update path as the
			 * sole concern is dropping a Dirty bit, i.e. no other
			 * task can zap/remove the SPTE as mmu_lock is held for
			 * write. Marking the SPTE as a removed SPTE is not
			 * strictly necessary for the same reason, but using
			 * the remove SPTE value keeps the shared/exclusive
			 * paths consistent and allows the handle_changed_spte()
			 * call below to hardcode the new value to REMOVED_SPTE.
			 *
			 * Note, even though dropping a Dirty bit is the only
			 * scenario where a non-atomic update could result in a
			 * functional bug, simply checking the Dirty bit isn't
			 * sufficient as a fast page fault could read the upper
			 * level SPTE before it is zapped, and then make this
			 * target SPTE writable, resume the guest, and set the
			 * Dirty bit between reading the SPTE above and writing
			 * it here.
			 */
			old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte,
							  REMOVED_SPTE, level);
		}
		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
				    old_spte, REMOVED_SPTE, level, shared);
	}

	/* Defer freeing the page until after an RCU grace period. */
	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
}

/**
 * __handle_changed_spte - handle bookkeeping associated with an SPTE change
 * @kvm: kvm instance
 * @as_id: the address space of the paging structure the SPTE was a part of
 * @gfn: the base GFN that was mapped by the SPTE
 * @old_spte: The value of the SPTE before the change
 * @new_spte: The value of the SPTE after the change
 * @level: the level of the PT the SPTE is part of in the paging structure
 * @shared: This operation may not be running under the exclusive use of
 *	    the MMU lock and the operation must synchronize with other
 *	    threads that might be modifying SPTEs.
 *
 * Handle bookkeeping that might result from the modification of a SPTE.
 * This function must be called for all TDP SPTE modifications.
 */
static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				  u64 old_spte, u64 new_spte, int level,
				  bool shared)
{
	bool was_present = is_shadow_present_pte(old_spte);
	bool is_present = is_shadow_present_pte(new_spte);
	bool was_leaf = was_present && is_last_spte(old_spte, level);
	bool is_leaf = is_present && is_last_spte(new_spte, level);
	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

	/* Sanity check the level and the alignment of the GFN for that level. */
	WARN_ON(level > PT64_ROOT_MAX_LEVEL);
	WARN_ON(level < PG_LEVEL_4K);
	WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));

	/*
	 * If this warning were to trigger it would indicate that there was a
	 * missing MMU notifier or a race with some notifier handler.
	 * A present, leaf SPTE should never be directly replaced with another
	 * present leaf SPTE pointing to a different PFN. A notifier handler
	 * should be zapping the SPTE before the main MM's page table is
	 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
	 * thread before replacement.
	 */
	if (was_leaf && is_leaf && pfn_changed) {
		pr_err("Invalid SPTE change: cannot replace a present leaf\n"
		       "SPTE with another present leaf SPTE mapping a\n"
		       "different PFN!\n"
		       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
		       as_id, gfn, old_spte, new_spte, level);

		/*
		 * Crash the host to prevent error propagation and guest data
		 * corruption.
		 */
		BUG();
	}

	/* Nothing changed, nothing to account. */
	if (old_spte == new_spte)
		return;

	trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);

	if (is_leaf)
		check_spte_writable_invariants(new_spte);

	/*
	 * The only times a SPTE should be changed from a non-present to
	 * non-present state is when an MMIO entry is installed/modified/
	 * removed. In that case, there is nothing to do here.
	 */
	if (!was_present && !is_present) {
		/*
		 * If this change does not involve a MMIO SPTE or removed SPTE,
		 * it is unexpected. Log the change, though it should not
		 * impact the guest since both the former and current SPTEs
		 * are nonpresent.
		 */
		if (WARN_ON(!is_mmio_spte(old_spte) &&
			    !is_mmio_spte(new_spte) &&
			    !is_removed_spte(new_spte)))
			pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
			       "should not be replaced with another,\n"
			       "different nonpresent SPTE, unless one or both\n"
			       "are MMIO SPTEs, or the new SPTE is\n"
			       "a temporary removed SPTE.\n"
			       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
			       as_id, gfn, old_spte, new_spte, level);
		return;
	}

	/* A leaf mapping appeared or disappeared at this level: update the stats. */
	if (is_leaf != was_leaf)
		kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);

	/*
	 * Report the old PFN as dirty if the old leaf was dirty and the new
	 * SPTE is not a dirty mapping of the same PFN.
	 */
	if (was_leaf && is_dirty_spte(old_spte) &&
	    (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
		kvm_set_pfn_dirty(spte_to_pfn(old_spte));

	/*
	 * Recursively handle child PTs if the change removed a subtree from
	 * the paging structure. Note the WARN on the PFN changing without the
	 * SPTE being converted to a hugepage (leaf) or being zapped. Shadow
	 * pages are kernel allocations and should never be migrated.
	 */
	if (was_present && !was_leaf &&
	    (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
		handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
}

/*
 * Full bookkeeping for an SPTE change: the common handling in
 * __handle_changed_spte() followed by accessed tracking and dirty logging.
 */
static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level,
				bool shared)
{
	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
			      shared);
	handle_changed_spte_acc_track(old_spte, new_spte, level);
	handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
				      new_spte, level);
}

/*
 * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
 * and handle the associated bookkeeping.
 * Do not mark the page dirty
 * in KVM's dirty bitmaps.
 *
 * If setting the SPTE fails because it has changed, iter->old_spte will be
 * refreshed to the current value of the spte.
 *
 * @kvm: kvm instance
 * @iter: a tdp_iter instance currently on the SPTE that should be set
 * @new_spte: The value the SPTE should be set to
 * Return:
 * * 0      - If the SPTE was set.
 * * -EBUSY - If the SPTE cannot be set. In this case this function will have
 *            no side-effects other than setting iter->old_spte to the last
 *            known value of the spte.
 */
static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
					  struct tdp_iter *iter,
					  u64 new_spte)
{
	u64 *sptep = rcu_dereference(iter->sptep);

	/*
	 * The caller is responsible for ensuring the old SPTE is not a REMOVED
	 * SPTE. KVM should never attempt to zap or manipulate a REMOVED SPTE,
	 * and pre-checking before inserting a new SPTE is advantageous as it
	 * avoids unnecessary work.
	 */
	WARN_ON_ONCE(iter->yielded || is_removed_spte(iter->old_spte));

	lockdep_assert_held_read(&kvm->mmu_lock);

	/*
	 * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
	 * does not hold the mmu_lock.
	 */
	if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte))
		return -EBUSY;

	/* Dirty logging is deliberately not performed here. */
	__handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
			      new_spte, iter->level, true);
	handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);

	return 0;
}

/*
 * Atomically zaps the SPTE the iterator points at: freezes it with the removed
 * SPTE value, flushes remote TLBs for the range it covered, then clears it.
 *
 * Returns 0 on success, or the error from tdp_mmu_set_spte_atomic() if the
 * SPTE could not be frozen.
 */
static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
					  struct tdp_iter *iter)
{
	int ret;

	/*
	 * Freeze the SPTE by setting it to a special,
	 * non-present value. This will stop other threads from
	 * immediately installing a present entry in its place
	 * before the TLBs are flushed.
	 */
	ret = tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE);
	if (ret)
		return ret;

	kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
					   KVM_PAGES_PER_HPAGE(iter->level));

	/*
	 * No other thread can overwrite the removed SPTE as they must either
	 * wait on the MMU lock or use tdp_mmu_set_spte_atomic() which will not
	 * overwrite the special removed SPTE value. No bookkeeping is needed
	 * here since the SPTE is going from non-present to non-present. Use
	 * the raw write helper to avoid an unnecessary check on volatile bits.
	 */
	__kvm_tdp_mmu_write_spte(iter->sptep, 0);

	return 0;
}


/*
 * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
 * @kvm:	      KVM instance
 * @as_id:	      Address space ID, i.e. regular vs. SMM
 * @sptep:	      Pointer to the SPTE
 * @old_spte:	      The current value of the SPTE
 * @new_spte:	      The new value that will be set for the SPTE
 * @gfn:	      The base GFN that was (or will be) mapped by the SPTE
 * @level:	      The level _containing_ the SPTE (its parent PT's level)
 * @record_acc_track: Notify the MM subsystem of changes to the accessed state
 *		      of the page. Should be set unless handling an MMU
 *		      notifier for access tracking. Leaving record_acc_track
 *		      unset in that case prevents page accesses from being
 *		      double counted.
 * @record_dirty_log: Record the page as dirty in the dirty bitmap if
 *		      appropriate for the change being made. Should be set
 *		      unless performing certain dirty logging operations.
 *		      Leaving record_dirty_log unset in that case prevents page
 *		      writes from being double counted.
 *
 * Returns the old SPTE value, which _may_ be different than @old_spte if the
 * SPTE had volatile bits.
 */
static u64 __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
			      u64 old_spte, u64 new_spte, gfn_t gfn, int level,
			      bool record_acc_track, bool record_dirty_log)
{
	lockdep_assert_held_write(&kvm->mmu_lock);

	/*
	 * No thread should be using this function to set SPTEs to or from the
	 * temporary removed SPTE value.
	 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
	 * should be used. If operating under the MMU lock in write mode, the
	 * use of the removed SPTE should not be necessary.
	 */
	WARN_ON(is_removed_spte(old_spte) || is_removed_spte(new_spte));

	old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);

	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);

	if (record_acc_track)
		handle_changed_spte_acc_track(old_spte, new_spte, level);
	if (record_dirty_log)
		handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
					      new_spte, level);
	return old_spte;
}

/*
 * Iterator flavor of __tdp_mmu_set_spte(): writes @new_spte at the iterator's
 * current position and stores the returned old value back in iter->old_spte.
 * Must not be called on an iterator that has yielded.
 */
static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
				     u64 new_spte, bool record_acc_track,
				     bool record_dirty_log)
{
	WARN_ON_ONCE(iter->yielded);

	iter->old_spte = __tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep,
					    iter->old_spte, new_spte,
					    iter->gfn, iter->level,
					    record_acc_track, record_dirty_log);
}

/* Sets the SPTE with both accessed tracking and dirty logging enabled. */
static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
				    u64 new_spte)
{
	_tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
}

/* Sets the SPTE without notifying accessed tracking; dirty logging still runs. */
static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
						 struct tdp_iter *iter,
						 u64 new_spte)
{
	_tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
}

/* Sets the SPTE without dirty logging; accessed tracking still runs. */
static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
						 struct tdp_iter *iter,
						 u64 new_spte)
{
	_tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
}

/* Walks every SPTE under @_root in the GFN range [_start, _end). */
#define tdp_root_for_each_pte(_iter, _root, _start, _end) \
	for_each_tdp_pte(_iter, _root, _start, _end)

/* Same walk, but skips SPTEs that are not present leaf entries. */
#define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)	\
	tdp_root_for_each_pte(_iter, _root, _start, _end)		\
		if (!is_shadow_present_pte(_iter.old_spte) ||		\
		    !is_last_spte(_iter.old_spte, _iter.level))		\
			continue;					\
		else

/* Walks the SPTEs under the root currently loaded in @_mmu. */
#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)		\
	for_each_tdp_pte(_iter, to_shadow_page(_mmu->root.hpa), _start, _end)

/*
 * Yield if the MMU lock is contended or this thread needs to return control
 * to the scheduler.
 *
 * If this function should yield and flush is set, it will perform a remote
 * TLB flush before yielding.
 *
 * If this function yields, iter->yielded is set and the caller must skip to
 * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk
 * over the paging structures to allow the iterator to continue its traversal
 * from the paging structure root.
 *
 * Returns true if this function yielded.
 */
static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
							  struct tdp_iter *iter,
							  bool flush, bool shared)
{
	WARN_ON(iter->yielded);

	/* Ensure forward progress has been made before yielding. */
	if (iter->next_last_level_gfn == iter->yielded_gfn)
		return false;

	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
		if (flush)
			kvm_flush_remote_tlbs(kvm);

		/* Leave the RCU read-side critical section around the reschedule. */
		rcu_read_unlock();

		if (shared)
			cond_resched_rwlock_read(&kvm->mmu_lock);
		else
			cond_resched_rwlock_write(&kvm->mmu_lock);

		rcu_read_lock();

		WARN_ON(iter->gfn > iter->next_last_level_gfn);

		iter->yielded = true;
	}

	return iter->yielded;
}

/* Returns the exclusive upper bound on GFNs for TDP MMU walks. */
static inline gfn_t tdp_mmu_max_gfn_exclusive(void)
{
	/*
	 * Bound TDP MMU walks at host.MAXPHYADDR. KVM disallows memslots with
	 * a gpa range that would exceed the max gfn, and KVM does not create
	 * MMIO SPTEs for "impossible" gfns, instead sending such accesses down
	 * the slow emulation path every time.
	 */
	return kvm_mmu_max_gfn() + 1;
}

/*
 * Zaps every present SPTE at level @zap_level or below that the walk visits
 * under @root, covering the whole GFN space.  The walk does not descend below
 * @zap_level.  If @shared, the SPTEs are cleared atomically and a failed
 * update is retried on the same entry.
 */
static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
			       bool shared, int zap_level)
{
	struct tdp_iter iter;

	gfn_t end = tdp_mmu_max_gfn_exclusive();
	gfn_t start = 0;

	for_each_tdp_pte_min_level(iter, root, zap_level, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
			continue;

		if (!is_shadow_present_pte(iter.old_spte))
			continue;

		if (iter.level > zap_level)
			continue;

		if (!shared)
			tdp_mmu_set_spte(kvm, &iter, 0);
		else if (tdp_mmu_set_spte_atomic(kvm, &iter, 0))
			goto retry;
	}
}

/*
 * Zaps everything under @root in two passes.  mmu_lock must be held, for read
 * if @shared is true and for write otherwise, and the caller must hold a
 * reference to @root.
 */
static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
			     bool shared)
{

	/*
	 * The root must have an elevated refcount so that it's reachable via
	 * mmu_notifier callbacks, which allows this path to yield and drop
	 * mmu_lock. When handling an unmap/release mmu_notifier command, KVM
	 * must drop all references to relevant pages prior to completing the
	 * callback. Dropping mmu_lock with an unreachable root would result
	 * in zapping SPTEs after a relevant mmu_notifier callback completes
	 * and lead to use-after-free as zapping a SPTE triggers "writeback" of
	 * dirty accessed bits to the SPTE's associated struct page.
	 */
	WARN_ON_ONCE(!refcount_read(&root->tdp_mmu_root_count));

	kvm_lockdep_assert_mmu_lock_held(kvm, shared);

	rcu_read_lock();

	/*
	 * To avoid RCU stalls due to recursively removing huge swaths of SPs,
	 * split the zap into two passes. On the first pass, zap at the 1gb
	 * level, and then zap top-level SPs on the second pass. "1gb" is not
	 * arbitrary, as KVM must be able to zap a 1gb shadow page without
	 * inducing a stall to allow in-place replacement with a 1gb hugepage.
	 *
	 * Because zapping a SP recurses on its children, stepping down to
	 * PG_LEVEL_4K in the iterator itself is unnecessary.
	 */
	__tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G);
	__tdp_mmu_zap_root(kvm, root, shared, root->role.level);

	rcu_read_unlock();
}

/*
 * Zaps the SPTE in the parent page table that points at @sp.
 *
 * Returns true if the SPTE was zapped, false (after a one-time warning) if
 * @sp has no parent SPTE or that SPTE is not present.
 */
bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	u64 old_spte;

	/*
	 * This helper intentionally doesn't allow zapping a root shadow page,
	 * which doesn't have a parent page table and thus no associated entry.
	 */
	if (WARN_ON_ONCE(!sp->ptep))
		return false;

	old_spte = kvm_tdp_mmu_read_spte(sp->ptep);
	if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
		return false;

	/* The SPTE lives in the parent table, one level above @sp. */
	__tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0,
			   sp->gfn, sp->role.level + 1, true, true);

	return true;
}

/*
 * If can_yield is true, will release the MMU lock and reschedule if the
 * scheduler needs the CPU or there is contention on the MMU lock. If this
 * function cannot yield, it will not release the MMU lock or reschedule and
 * the caller must ensure it does not supply too large a GFN range, or the
 * operation can cause a soft lockup.
943 */ 944 static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root, 945 gfn_t start, gfn_t end, bool can_yield, bool flush) 946 { 947 struct tdp_iter iter; 948 949 end = min(end, tdp_mmu_max_gfn_exclusive()); 950 951 lockdep_assert_held_write(&kvm->mmu_lock); 952 953 rcu_read_lock(); 954 955 for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) { 956 if (can_yield && 957 tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) { 958 flush = false; 959 continue; 960 } 961 962 if (!is_shadow_present_pte(iter.old_spte) || 963 !is_last_spte(iter.old_spte, iter.level)) 964 continue; 965 966 tdp_mmu_set_spte(kvm, &iter, 0); 967 flush = true; 968 } 969 970 rcu_read_unlock(); 971 972 /* 973 * Because this flow zaps _only_ leaf SPTEs, the caller doesn't need 974 * to provide RCU protection as no 'struct kvm_mmu_page' will be freed. 975 */ 976 return flush; 977 } 978 979 /* 980 * Zap leaf SPTEs for the range of gfns, [start, end), for all roots. Returns 981 * true if a TLB flush is needed before releasing the MMU lock, i.e. if one or 982 * more SPTEs were zapped since the MMU lock was last acquired. 983 */ 984 bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end, 985 bool can_yield, bool flush) 986 { 987 struct kvm_mmu_page *root; 988 989 for_each_tdp_mmu_root_yield_safe(kvm, root, as_id) 990 flush = tdp_mmu_zap_leafs(kvm, root, start, end, can_yield, flush); 991 992 return flush; 993 } 994 995 void kvm_tdp_mmu_zap_all(struct kvm *kvm) 996 { 997 struct kvm_mmu_page *root; 998 int i; 999 1000 /* 1001 * Zap all roots, including invalid roots, as all SPTEs must be dropped 1002 * before returning to the caller. Zap directly even if the root is 1003 * also being zapped by a worker. Walking zapped top-level SPTEs isn't 1004 * all that expensive and mmu_lock is already held, which means the 1005 * worker has yielded, i.e. flushing the work instead of zapping here 1006 * isn't guaranteed to be any faster. 
1007 * 1008 * A TLB flush is unnecessary, KVM zaps everything if and only the VM 1009 * is being destroyed or the userspace VMM has exited. In both cases, 1010 * KVM_RUN is unreachable, i.e. no vCPUs will ever service the request. 1011 */ 1012 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { 1013 for_each_tdp_mmu_root_yield_safe(kvm, root, i) 1014 tdp_mmu_zap_root(kvm, root, false); 1015 } 1016 } 1017 1018 /* 1019 * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast 1020 * zap" completes. 1021 */ 1022 void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm) 1023 { 1024 flush_workqueue(kvm->arch.tdp_mmu_zap_wq); 1025 } 1026 1027 /* 1028 * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that 1029 * is about to be zapped, e.g. in response to a memslots update. The actual 1030 * zapping is performed asynchronously, so a reference is taken on all roots. 1031 * Using a separate workqueue makes it easy to ensure that the destruction is 1032 * performed before the "fast zap" completes, without keeping a separate list 1033 * of invalidated roots; the list is effectively the list of work items in 1034 * the workqueue. 1035 * 1036 * Get a reference even if the root is already invalid, the asynchronous worker 1037 * assumes it was gifted a reference to the root it processes. Because mmu_lock 1038 * is held for write, it should be impossible to observe a root with zero refcount, 1039 * i.e. the list of roots cannot be stale. 1040 * 1041 * This has essentially the same effect for the TDP MMU 1042 * as updating mmu_valid_gen does for the shadow MMU. 
1043 */ 1044 void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm) 1045 { 1046 struct kvm_mmu_page *root; 1047 1048 lockdep_assert_held_write(&kvm->mmu_lock); 1049 list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) { 1050 if (!root->role.invalid && 1051 !WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root))) { 1052 root->role.invalid = true; 1053 tdp_mmu_schedule_zap_root(kvm, root); 1054 } 1055 } 1056 } 1057 1058 /* 1059 * Installs a last-level SPTE to handle a TDP page fault. 1060 * (NPT/EPT violation/misconfiguration) 1061 */ 1062 static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, 1063 struct kvm_page_fault *fault, 1064 struct tdp_iter *iter) 1065 { 1066 struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep)); 1067 u64 new_spte; 1068 int ret = RET_PF_FIXED; 1069 bool wrprot = false; 1070 1071 WARN_ON(sp->role.level != fault->goal_level); 1072 if (unlikely(!fault->slot)) 1073 new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL); 1074 else 1075 wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn, 1076 fault->pfn, iter->old_spte, fault->prefetch, true, 1077 fault->map_writable, &new_spte); 1078 1079 if (new_spte == iter->old_spte) 1080 ret = RET_PF_SPURIOUS; 1081 else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte)) 1082 return RET_PF_RETRY; 1083 else if (is_shadow_present_pte(iter->old_spte) && 1084 !is_last_spte(iter->old_spte, iter->level)) 1085 kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn, 1086 KVM_PAGES_PER_HPAGE(iter->level + 1)); 1087 1088 /* 1089 * If the page fault was caused by a write but the page is write 1090 * protected, emulation is needed. If the emulation was skipped, 1091 * the vCPU would have the same fault again. 1092 */ 1093 if (wrprot) { 1094 if (fault->write) 1095 ret = RET_PF_EMULATE; 1096 } 1097 1098 /* If a MMIO SPTE is installed, the MMIO will need to be emulated. 
*/ 1099 if (unlikely(is_mmio_spte(new_spte))) { 1100 vcpu->stat.pf_mmio_spte_created++; 1101 trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn, 1102 new_spte); 1103 ret = RET_PF_EMULATE; 1104 } else { 1105 trace_kvm_mmu_set_spte(iter->level, iter->gfn, 1106 rcu_dereference(iter->sptep)); 1107 } 1108 1109 return ret; 1110 } 1111 1112 /* 1113 * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the 1114 * provided page table. 1115 * 1116 * @kvm: kvm instance 1117 * @iter: a tdp_iter instance currently on the SPTE that should be set 1118 * @sp: The new TDP page table to install. 1119 * @account_nx: True if this page table is being installed to split a 1120 * non-executable huge page. 1121 * @shared: This operation is running under the MMU lock in read mode. 1122 * 1123 * Returns: 0 if the new page table was installed. Non-0 if the page table 1124 * could not be installed (e.g. the atomic compare-exchange failed). 1125 */ 1126 static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter, 1127 struct kvm_mmu_page *sp, bool account_nx, 1128 bool shared) 1129 { 1130 u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled()); 1131 int ret = 0; 1132 1133 if (shared) { 1134 ret = tdp_mmu_set_spte_atomic(kvm, iter, spte); 1135 if (ret) 1136 return ret; 1137 } else { 1138 tdp_mmu_set_spte(kvm, iter, spte); 1139 } 1140 1141 spin_lock(&kvm->arch.tdp_mmu_pages_lock); 1142 list_add(&sp->link, &kvm->arch.tdp_mmu_pages); 1143 if (account_nx) 1144 account_huge_nx_page(kvm, sp); 1145 spin_unlock(&kvm->arch.tdp_mmu_pages_lock); 1146 tdp_account_mmu_page(kvm, sp); 1147 1148 return 0; 1149 } 1150 1151 /* 1152 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing 1153 * page tables and SPTEs to translate the faulting guest physical address. 
1154 */ 1155 int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) 1156 { 1157 struct kvm_mmu *mmu = vcpu->arch.mmu; 1158 struct tdp_iter iter; 1159 struct kvm_mmu_page *sp; 1160 int ret; 1161 1162 kvm_mmu_hugepage_adjust(vcpu, fault); 1163 1164 trace_kvm_mmu_spte_requested(fault); 1165 1166 rcu_read_lock(); 1167 1168 tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) { 1169 if (fault->nx_huge_page_workaround_enabled) 1170 disallowed_hugepage_adjust(fault, iter.old_spte, iter.level); 1171 1172 if (iter.level == fault->goal_level) 1173 break; 1174 1175 /* 1176 * If there is an SPTE mapping a large page at a higher level 1177 * than the target, that SPTE must be cleared and replaced 1178 * with a non-leaf SPTE. 1179 */ 1180 if (is_shadow_present_pte(iter.old_spte) && 1181 is_large_pte(iter.old_spte)) { 1182 if (tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter)) 1183 break; 1184 1185 /* 1186 * The iter must explicitly re-read the spte here 1187 * because the new value informs the !present 1188 * path below. 1189 */ 1190 iter.old_spte = kvm_tdp_mmu_read_spte(iter.sptep); 1191 } 1192 1193 if (!is_shadow_present_pte(iter.old_spte)) { 1194 bool account_nx = fault->huge_page_disallowed && 1195 fault->req_level >= iter.level; 1196 1197 /* 1198 * If SPTE has been frozen by another thread, just 1199 * give up and retry, avoiding unnecessary page table 1200 * allocation and free. 1201 */ 1202 if (is_removed_spte(iter.old_spte)) 1203 break; 1204 1205 sp = tdp_mmu_alloc_sp(vcpu); 1206 tdp_mmu_init_child_sp(sp, &iter); 1207 1208 if (tdp_mmu_link_sp(vcpu->kvm, &iter, sp, account_nx, true)) { 1209 tdp_mmu_free_sp(sp); 1210 break; 1211 } 1212 } 1213 } 1214 1215 /* 1216 * Force the guest to retry the access if the upper level SPTEs aren't 1217 * in place, or if the target leaf SPTE is frozen by another CPU. 
1218 */ 1219 if (iter.level != fault->goal_level || is_removed_spte(iter.old_spte)) { 1220 rcu_read_unlock(); 1221 return RET_PF_RETRY; 1222 } 1223 1224 ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter); 1225 rcu_read_unlock(); 1226 1227 return ret; 1228 } 1229 1230 bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range, 1231 bool flush) 1232 { 1233 return kvm_tdp_mmu_zap_leafs(kvm, range->slot->as_id, range->start, 1234 range->end, range->may_block, flush); 1235 } 1236 1237 typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter, 1238 struct kvm_gfn_range *range); 1239 1240 static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm, 1241 struct kvm_gfn_range *range, 1242 tdp_handler_t handler) 1243 { 1244 struct kvm_mmu_page *root; 1245 struct tdp_iter iter; 1246 bool ret = false; 1247 1248 /* 1249 * Don't support rescheduling, none of the MMU notifiers that funnel 1250 * into this helper allow blocking; it'd be dead, wasteful code. 1251 */ 1252 for_each_tdp_mmu_root(kvm, root, range->slot->as_id) { 1253 rcu_read_lock(); 1254 1255 tdp_root_for_each_leaf_pte(iter, root, range->start, range->end) 1256 ret |= handler(kvm, &iter, range); 1257 1258 rcu_read_unlock(); 1259 } 1260 1261 return ret; 1262 } 1263 1264 /* 1265 * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero 1266 * if any of the GFNs in the range have been accessed. 1267 */ 1268 static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter, 1269 struct kvm_gfn_range *range) 1270 { 1271 u64 new_spte = 0; 1272 1273 /* If we have a non-accessed entry we don't need to change the pte. */ 1274 if (!is_accessed_spte(iter->old_spte)) 1275 return false; 1276 1277 new_spte = iter->old_spte; 1278 1279 if (spte_ad_enabled(new_spte)) { 1280 new_spte &= ~shadow_accessed_mask; 1281 } else { 1282 /* 1283 * Capture the dirty status of the page, so that it doesn't get 1284 * lost when the SPTE is marked for access tracking. 
1285 */ 1286 if (is_writable_pte(new_spte)) 1287 kvm_set_pfn_dirty(spte_to_pfn(new_spte)); 1288 1289 new_spte = mark_spte_for_access_track(new_spte); 1290 } 1291 1292 tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte); 1293 1294 return true; 1295 } 1296 1297 bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) 1298 { 1299 return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range); 1300 } 1301 1302 static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter, 1303 struct kvm_gfn_range *range) 1304 { 1305 return is_accessed_spte(iter->old_spte); 1306 } 1307 1308 bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) 1309 { 1310 return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn); 1311 } 1312 1313 static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter, 1314 struct kvm_gfn_range *range) 1315 { 1316 u64 new_spte; 1317 1318 /* Huge pages aren't expected to be modified without first being zapped. */ 1319 WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end); 1320 1321 if (iter->level != PG_LEVEL_4K || 1322 !is_shadow_present_pte(iter->old_spte)) 1323 return false; 1324 1325 /* 1326 * Note, when changing a read-only SPTE, it's not strictly necessary to 1327 * zero the SPTE before setting the new PFN, but doing so preserves the 1328 * invariant that the PFN of a present * leaf SPTE can never change. 1329 * See __handle_changed_spte(). 1330 */ 1331 tdp_mmu_set_spte(kvm, iter, 0); 1332 1333 if (!pte_write(range->pte)) { 1334 new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte, 1335 pte_pfn(range->pte)); 1336 1337 tdp_mmu_set_spte(kvm, iter, new_spte); 1338 } 1339 1340 return true; 1341 } 1342 1343 /* 1344 * Handle the changed_pte MMU notifier for the TDP MMU. 1345 * data is a pointer to the new pte_t mapping the HVA specified by the MMU 1346 * notifier. 1347 * Returns non-zero if a flush is needed before releasing the MMU lock. 
1348 */ 1349 bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) 1350 { 1351 /* 1352 * No need to handle the remote TLB flush under RCU protection, the 1353 * target SPTE _must_ be a leaf SPTE, i.e. cannot result in freeing a 1354 * shadow page. See the WARN on pfn_changed in __handle_changed_spte(). 1355 */ 1356 return kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn); 1357 } 1358 1359 /* 1360 * Remove write access from all SPTEs at or above min_level that map GFNs 1361 * [start, end). Returns true if an SPTE has been changed and the TLBs need to 1362 * be flushed. 1363 */ 1364 static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, 1365 gfn_t start, gfn_t end, int min_level) 1366 { 1367 struct tdp_iter iter; 1368 u64 new_spte; 1369 bool spte_set = false; 1370 1371 rcu_read_lock(); 1372 1373 BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL); 1374 1375 for_each_tdp_pte_min_level(iter, root, min_level, start, end) { 1376 retry: 1377 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true)) 1378 continue; 1379 1380 if (!is_shadow_present_pte(iter.old_spte) || 1381 !is_last_spte(iter.old_spte, iter.level) || 1382 !(iter.old_spte & PT_WRITABLE_MASK)) 1383 continue; 1384 1385 new_spte = iter.old_spte & ~PT_WRITABLE_MASK; 1386 1387 if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte)) 1388 goto retry; 1389 1390 spte_set = true; 1391 } 1392 1393 rcu_read_unlock(); 1394 return spte_set; 1395 } 1396 1397 /* 1398 * Remove write access from all the SPTEs mapping GFNs in the memslot. Will 1399 * only affect leaf SPTEs down to min_level. 1400 * Returns true if an SPTE has been changed and the TLBs need to be flushed. 
1401 */ 1402 bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, 1403 const struct kvm_memory_slot *slot, int min_level) 1404 { 1405 struct kvm_mmu_page *root; 1406 bool spte_set = false; 1407 1408 lockdep_assert_held_read(&kvm->mmu_lock); 1409 1410 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) 1411 spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn, 1412 slot->base_gfn + slot->npages, min_level); 1413 1414 return spte_set; 1415 } 1416 1417 static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp) 1418 { 1419 struct kvm_mmu_page *sp; 1420 1421 gfp |= __GFP_ZERO; 1422 1423 sp = kmem_cache_alloc(mmu_page_header_cache, gfp); 1424 if (!sp) 1425 return NULL; 1426 1427 sp->spt = (void *)__get_free_page(gfp); 1428 if (!sp->spt) { 1429 kmem_cache_free(mmu_page_header_cache, sp); 1430 return NULL; 1431 } 1432 1433 return sp; 1434 } 1435 1436 static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm, 1437 struct tdp_iter *iter, 1438 bool shared) 1439 { 1440 struct kvm_mmu_page *sp; 1441 1442 /* 1443 * Since we are allocating while under the MMU lock we have to be 1444 * careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct 1445 * reclaim and to avoid making any filesystem callbacks (which can end 1446 * up invoking KVM MMU notifiers, resulting in a deadlock). 1447 * 1448 * If this allocation fails we drop the lock and retry with reclaim 1449 * allowed. 
1450 */ 1451 sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT); 1452 if (sp) 1453 return sp; 1454 1455 rcu_read_unlock(); 1456 1457 if (shared) 1458 read_unlock(&kvm->mmu_lock); 1459 else 1460 write_unlock(&kvm->mmu_lock); 1461 1462 iter->yielded = true; 1463 sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT); 1464 1465 if (shared) 1466 read_lock(&kvm->mmu_lock); 1467 else 1468 write_lock(&kvm->mmu_lock); 1469 1470 rcu_read_lock(); 1471 1472 return sp; 1473 } 1474 1475 static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter, 1476 struct kvm_mmu_page *sp, bool shared) 1477 { 1478 const u64 huge_spte = iter->old_spte; 1479 const int level = iter->level; 1480 int ret, i; 1481 1482 tdp_mmu_init_child_sp(sp, iter); 1483 1484 /* 1485 * No need for atomics when writing to sp->spt since the page table has 1486 * not been linked in yet and thus is not reachable from any other CPU. 1487 */ 1488 for (i = 0; i < SPTE_ENT_PER_PAGE; i++) 1489 sp->spt[i] = make_huge_page_split_spte(kvm, huge_spte, sp->role, i); 1490 1491 /* 1492 * Replace the huge spte with a pointer to the populated lower level 1493 * page table. Since we are making this change without a TLB flush vCPUs 1494 * will see a mix of the split mappings and the original huge mapping, 1495 * depending on what's currently in their TLB. This is fine from a 1496 * correctness standpoint since the translation will be the same either 1497 * way. 1498 */ 1499 ret = tdp_mmu_link_sp(kvm, iter, sp, false, shared); 1500 if (ret) 1501 goto out; 1502 1503 /* 1504 * tdp_mmu_link_sp_atomic() will handle subtracting the huge page we 1505 * are overwriting from the page stats. But we have to manually update 1506 * the page stats with the new present child pages. 
1507 */ 1508 kvm_update_page_stats(kvm, level - 1, SPTE_ENT_PER_PAGE); 1509 1510 out: 1511 trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret); 1512 return ret; 1513 } 1514 1515 static int tdp_mmu_split_huge_pages_root(struct kvm *kvm, 1516 struct kvm_mmu_page *root, 1517 gfn_t start, gfn_t end, 1518 int target_level, bool shared) 1519 { 1520 struct kvm_mmu_page *sp = NULL; 1521 struct tdp_iter iter; 1522 int ret = 0; 1523 1524 rcu_read_lock(); 1525 1526 /* 1527 * Traverse the page table splitting all huge pages above the target 1528 * level into one lower level. For example, if we encounter a 1GB page 1529 * we split it into 512 2MB pages. 1530 * 1531 * Since the TDP iterator uses a pre-order traversal, we are guaranteed 1532 * to visit an SPTE before ever visiting its children, which means we 1533 * will correctly recursively split huge pages that are more than one 1534 * level above the target level (e.g. splitting a 1GB to 512 2MB pages, 1535 * and then splitting each of those to 512 4KB pages). 1536 */ 1537 for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) { 1538 retry: 1539 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared)) 1540 continue; 1541 1542 if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte)) 1543 continue; 1544 1545 if (!sp) { 1546 sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared); 1547 if (!sp) { 1548 ret = -ENOMEM; 1549 trace_kvm_mmu_split_huge_page(iter.gfn, 1550 iter.old_spte, 1551 iter.level, ret); 1552 break; 1553 } 1554 1555 if (iter.yielded) 1556 continue; 1557 } 1558 1559 if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared)) 1560 goto retry; 1561 1562 sp = NULL; 1563 } 1564 1565 rcu_read_unlock(); 1566 1567 /* 1568 * It's possible to exit the loop having never used the last sp if, for 1569 * example, a vCPU doing HugePage NX splitting wins the race and 1570 * installs its own sp in place of the last sp we tried to split. 
1571 */ 1572 if (sp) 1573 tdp_mmu_free_sp(sp); 1574 1575 return ret; 1576 } 1577 1578 1579 /* 1580 * Try to split all huge pages mapped by the TDP MMU down to the target level. 1581 */ 1582 void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm, 1583 const struct kvm_memory_slot *slot, 1584 gfn_t start, gfn_t end, 1585 int target_level, bool shared) 1586 { 1587 struct kvm_mmu_page *root; 1588 int r = 0; 1589 1590 kvm_lockdep_assert_mmu_lock_held(kvm, shared); 1591 1592 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, shared) { 1593 r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared); 1594 if (r) { 1595 kvm_tdp_mmu_put_root(kvm, root, shared); 1596 break; 1597 } 1598 } 1599 } 1600 1601 /* 1602 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If 1603 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE. 1604 * If AD bits are not enabled, this will require clearing the writable bit on 1605 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to 1606 * be flushed. 
1607 */ 1608 static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, 1609 gfn_t start, gfn_t end) 1610 { 1611 struct tdp_iter iter; 1612 u64 new_spte; 1613 bool spte_set = false; 1614 1615 rcu_read_lock(); 1616 1617 tdp_root_for_each_leaf_pte(iter, root, start, end) { 1618 retry: 1619 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true)) 1620 continue; 1621 1622 if (!is_shadow_present_pte(iter.old_spte)) 1623 continue; 1624 1625 if (spte_ad_need_write_protect(iter.old_spte)) { 1626 if (is_writable_pte(iter.old_spte)) 1627 new_spte = iter.old_spte & ~PT_WRITABLE_MASK; 1628 else 1629 continue; 1630 } else { 1631 if (iter.old_spte & shadow_dirty_mask) 1632 new_spte = iter.old_spte & ~shadow_dirty_mask; 1633 else 1634 continue; 1635 } 1636 1637 if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte)) 1638 goto retry; 1639 1640 spte_set = true; 1641 } 1642 1643 rcu_read_unlock(); 1644 return spte_set; 1645 } 1646 1647 /* 1648 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If 1649 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE. 1650 * If AD bits are not enabled, this will require clearing the writable bit on 1651 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to 1652 * be flushed. 1653 */ 1654 bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, 1655 const struct kvm_memory_slot *slot) 1656 { 1657 struct kvm_mmu_page *root; 1658 bool spte_set = false; 1659 1660 lockdep_assert_held_read(&kvm->mmu_lock); 1661 1662 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) 1663 spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn, 1664 slot->base_gfn + slot->npages); 1665 1666 return spte_set; 1667 } 1668 1669 /* 1670 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is 1671 * set in mask, starting at gfn. The given memslot is expected to contain all 1672 * the GFNs represented by set bits in the mask. 
If AD bits are enabled,
 * clearing the dirty status will involve clearing the dirty bit on each SPTE
 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
 */
static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
				  gfn_t gfn, unsigned long mask, bool wrprot)
{
	struct tdp_iter iter;
	unsigned long gfn_bit;
	u64 clean_spte;

	rcu_read_lock();

	/* Start the walk at the first GFN that has its bit set in the mask. */
	tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
				   gfn + BITS_PER_LONG) {
		/* Every requested GFN has been handled. */
		if (!mask)
			break;

		if (iter.level > PG_LEVEL_4K)
			continue;

		/* Only compute the bit once the entry is known to be a 4K one. */
		gfn_bit = 1UL << (iter.gfn - gfn);
		if (!(mask & gfn_bit))
			continue;

		mask &= ~gfn_bit;

		if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
			if (!is_writable_pte(iter.old_spte))
				continue;

			clean_spte = iter.old_spte & ~PT_WRITABLE_MASK;
		} else {
			if (!(iter.old_spte & shadow_dirty_mask))
				continue;

			clean_spte = iter.old_spte & ~shadow_dirty_mask;
		}

		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, clean_spte);
	}

	rcu_read_unlock();
}

/*
 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
 * set in mask, starting at gfn. The given memslot is expected to contain all
 * the GFNs represented by set bits in the mask. If AD bits are enabled,
 * clearing the dirty status will involve clearing the dirty bit on each SPTE
 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1719 */ 1720 void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, 1721 struct kvm_memory_slot *slot, 1722 gfn_t gfn, unsigned long mask, 1723 bool wrprot) 1724 { 1725 struct kvm_mmu_page *root; 1726 1727 lockdep_assert_held_write(&kvm->mmu_lock); 1728 for_each_tdp_mmu_root(kvm, root, slot->as_id) 1729 clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot); 1730 } 1731 1732 static void zap_collapsible_spte_range(struct kvm *kvm, 1733 struct kvm_mmu_page *root, 1734 const struct kvm_memory_slot *slot) 1735 { 1736 gfn_t start = slot->base_gfn; 1737 gfn_t end = start + slot->npages; 1738 struct tdp_iter iter; 1739 int max_mapping_level; 1740 1741 rcu_read_lock(); 1742 1743 for_each_tdp_pte_min_level(iter, root, PG_LEVEL_2M, start, end) { 1744 retry: 1745 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true)) 1746 continue; 1747 1748 if (iter.level > KVM_MAX_HUGEPAGE_LEVEL || 1749 !is_shadow_present_pte(iter.old_spte)) 1750 continue; 1751 1752 /* 1753 * Don't zap leaf SPTEs, if a leaf SPTE could be replaced with 1754 * a large page size, then its parent would have been zapped 1755 * instead of stepping down. 1756 */ 1757 if (is_last_spte(iter.old_spte, iter.level)) 1758 continue; 1759 1760 /* 1761 * If iter.gfn resides outside of the slot, i.e. the page for 1762 * the current level overlaps but is not contained by the slot, 1763 * then the SPTE can't be made huge. More importantly, trying 1764 * to query that info from slot->arch.lpage_info will cause an 1765 * out-of-bounds access. 1766 */ 1767 if (iter.gfn < start || iter.gfn >= end) 1768 continue; 1769 1770 max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot, 1771 iter.gfn, PG_LEVEL_NUM); 1772 if (max_mapping_level < iter.level) 1773 continue; 1774 1775 /* Note, a successful atomic zap also does a remote TLB flush. 
*/ 1776 if (tdp_mmu_zap_spte_atomic(kvm, &iter)) 1777 goto retry; 1778 } 1779 1780 rcu_read_unlock(); 1781 } 1782 1783 /* 1784 * Zap non-leaf SPTEs (and free their associated page tables) which could 1785 * be replaced by huge pages, for GFNs within the slot. 1786 */ 1787 void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, 1788 const struct kvm_memory_slot *slot) 1789 { 1790 struct kvm_mmu_page *root; 1791 1792 lockdep_assert_held_read(&kvm->mmu_lock); 1793 1794 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) 1795 zap_collapsible_spte_range(kvm, root, slot); 1796 } 1797 1798 /* 1799 * Removes write access on the last level SPTE mapping this GFN and unsets the 1800 * MMU-writable bit to ensure future writes continue to be intercepted. 1801 * Returns true if an SPTE was set and a TLB flush is needed. 1802 */ 1803 static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root, 1804 gfn_t gfn, int min_level) 1805 { 1806 struct tdp_iter iter; 1807 u64 new_spte; 1808 bool spte_set = false; 1809 1810 BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL); 1811 1812 rcu_read_lock(); 1813 1814 for_each_tdp_pte_min_level(iter, root, min_level, gfn, gfn + 1) { 1815 if (!is_shadow_present_pte(iter.old_spte) || 1816 !is_last_spte(iter.old_spte, iter.level)) 1817 continue; 1818 1819 new_spte = iter.old_spte & 1820 ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask); 1821 1822 if (new_spte == iter.old_spte) 1823 break; 1824 1825 tdp_mmu_set_spte(kvm, &iter, new_spte); 1826 spte_set = true; 1827 } 1828 1829 rcu_read_unlock(); 1830 1831 return spte_set; 1832 } 1833 1834 /* 1835 * Removes write access on the last level SPTE mapping this GFN and unsets the 1836 * MMU-writable bit to ensure future writes continue to be intercepted. 1837 * Returns true if an SPTE was set and a TLB flush is needed. 
1838 */ 1839 bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, 1840 struct kvm_memory_slot *slot, gfn_t gfn, 1841 int min_level) 1842 { 1843 struct kvm_mmu_page *root; 1844 bool spte_set = false; 1845 1846 lockdep_assert_held_write(&kvm->mmu_lock); 1847 for_each_tdp_mmu_root(kvm, root, slot->as_id) 1848 spte_set |= write_protect_gfn(kvm, root, gfn, min_level); 1849 1850 return spte_set; 1851 } 1852 1853 /* 1854 * Return the level of the lowest level SPTE added to sptes. 1855 * That SPTE may be non-present. 1856 * 1857 * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}. 1858 */ 1859 int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, 1860 int *root_level) 1861 { 1862 struct tdp_iter iter; 1863 struct kvm_mmu *mmu = vcpu->arch.mmu; 1864 gfn_t gfn = addr >> PAGE_SHIFT; 1865 int leaf = -1; 1866 1867 *root_level = vcpu->arch.mmu->root_role.level; 1868 1869 tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { 1870 leaf = iter.level; 1871 sptes[leaf] = iter.old_spte; 1872 } 1873 1874 return leaf; 1875 } 1876 1877 /* 1878 * Returns the last level spte pointer of the shadow page walk for the given 1879 * gpa, and sets *spte to the spte value. This spte may be non-preset. If no 1880 * walk could be performed, returns NULL and *spte does not contain valid data. 1881 * 1882 * Contract: 1883 * - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}. 1884 * - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end. 1885 * 1886 * WARNING: This function is only intended to be called during fast_page_fault. 
1887 */ 1888 u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr, 1889 u64 *spte) 1890 { 1891 struct tdp_iter iter; 1892 struct kvm_mmu *mmu = vcpu->arch.mmu; 1893 gfn_t gfn = addr >> PAGE_SHIFT; 1894 tdp_ptep_t sptep = NULL; 1895 1896 tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { 1897 *spte = iter.old_spte; 1898 sptep = iter.sptep; 1899 } 1900 1901 /* 1902 * Perform the rcu_dereference to get the raw spte pointer value since 1903 * we are passing it up to fast_page_fault, which is shared with the 1904 * legacy MMU and thus does not retain the TDP MMU-specific __rcu 1905 * annotation. 1906 * 1907 * This is safe since fast_page_fault obeys the contracts of this 1908 * function as well as all TDP MMU contracts around modifying SPTEs 1909 * outside of mmu_lock. 1910 */ 1911 return rcu_dereference(sptep); 1912 } 1913