// SPDX-License-Identifier: GPL-2.0

#include "mmu.h"
#include "mmu_internal.h"
#include "mmutrace.h"
#include "tdp_iter.h"
#include "tdp_mmu.h"
#include "spte.h"

#include <asm/cmpxchg.h>
#include <trace/events/kvm.h>

static bool __read_mostly tdp_mmu_enabled = false;
module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);

/* Initializes the TDP MMU for the VM, if enabled. */
bool kvm_mmu_init_tdp_mmu(struct kvm *kvm)
{
	if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
		return false;

	/* This should not be changed for the lifetime of the VM. */
	kvm->arch.tdp_mmu_enabled = true;

	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);

	return true;
}

static __always_inline void kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
							     bool shared)
{
	if (shared)
		lockdep_assert_held_read(&kvm->mmu_lock);
	else
		lockdep_assert_held_write(&kvm->mmu_lock);
}

void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
{
	if (!kvm->arch.tdp_mmu_enabled)
		return;

	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));

	/*
	 * Ensure that all the outstanding RCU callbacks to free shadow pages
	 * can run before the VM is torn down.
	 */
	rcu_barrier();
}

static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			  gfn_t start, gfn_t end, bool can_yield, bool flush,
			  bool shared);

static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
{
	free_page((unsigned long)sp->spt);
	kmem_cache_free(mmu_page_header_cache, sp);
}

/*
 * This is called through call_rcu in order to free TDP page table memory
 * safely with respect to other kernel threads that may be operating on
 * the memory.
 * By only accessing TDP MMU page table memory in an RCU read critical
 * section, and freeing it after a grace period, lockless access to that
 * memory won't use it after it is freed.
 */
static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
{
	struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
					       rcu_head);

	tdp_mmu_free_sp(sp);
}

void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
			  bool shared)
{
	gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);

	kvm_lockdep_assert_mmu_lock_held(kvm, shared);

	if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
		return;

	WARN_ON(!root->tdp_mmu_page);

	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	list_del_rcu(&root->link);
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);

	zap_gfn_range(kvm, root, 0, max_gfn, false, false, shared);

	call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
}

/*
 * Finds the next valid root after root (or the first valid root if root
 * is NULL), takes a reference on it, and returns that next root. If root
 * is not NULL, this thread should have already taken a reference on it, and
 * that reference will be dropped. If no valid root is found, this
 * function will return NULL.
 */
static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
					      struct kvm_mmu_page *prev_root,
					      bool shared)
{
	struct kvm_mmu_page *next_root;

	rcu_read_lock();

	if (prev_root)
		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						  &prev_root->link,
						  typeof(*prev_root), link);
	else
		next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						   typeof(*next_root), link);

	while (next_root && !kvm_tdp_mmu_get_root(kvm, next_root))
		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
				&next_root->link, typeof(*next_root), link);

	rcu_read_unlock();

	if (prev_root)
		kvm_tdp_mmu_put_root(kvm, prev_root, shared);

	return next_root;
}

/*
 * Note: this iterator gets and puts references to the roots it iterates over.
 * This makes it safe to release the MMU lock and yield within the loop, but
 * if exiting the loop early, the caller must drop the reference to the most
 * recent root. (Unless keeping a live reference is desirable.)
 *
 * If shared is set, this function is operating under the MMU lock in read
 * mode. In the unlikely event that this thread must free a root, the lock
 * will be temporarily dropped and reacquired in write mode.
 */
#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared)	\
	for (_root = tdp_mmu_next_root(_kvm, NULL, _shared);		\
	     _root;							\
	     _root = tdp_mmu_next_root(_kvm, _root, _shared))		\
		if (kvm_mmu_page_as_id(_root) != _as_id) {		\
		} else

#define for_each_tdp_mmu_root(_kvm, _root, _as_id)			\
	list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link,	\
				lockdep_is_held_type(&kvm->mmu_lock, 0) ||	\
				lockdep_is_held(&kvm->arch.tdp_mmu_pages_lock))	\
		if (kvm_mmu_page_as_id(_root) != _as_id) {		\
		} else

/* Compute the page role for a new TDP MMU page at the given level. */
static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
						   int level)
{
	union kvm_mmu_page_role role;

	role = vcpu->arch.mmu->mmu_role.base;
	role.level = level;
	role.direct = true;
	role.gpte_is_8_bytes = true;
	role.access = ACC_ALL;

	return role;
}

/* Allocate and initialize a new TDP MMU page from the vCPU's memory caches. */
static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
					       int level)
{
	struct kvm_mmu_page *sp;

	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);

	sp->role.word = page_role_for_level(vcpu, level).word;
	sp->gfn = gfn;
	sp->tdp_mmu_page = true;

	trace_kvm_mmu_get_page(sp, true);

	return sp;
}

hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
{
	union kvm_mmu_page_role role;
	struct kvm *kvm = vcpu->kvm;
	struct kvm_mmu_page *root;

	lockdep_assert_held_write(&kvm->mmu_lock);

	role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);

	/* Check for an existing root before allocating a new one. */
	for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
		if (root->role.word == role.word &&
		    kvm_tdp_mmu_get_root(kvm, root))
			goto out;
	}

	root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
	refcount_set(&root->tdp_mmu_root_count, 1);

	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);

out:
	return __pa(root->spt);
}

static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level,
				bool shared);

/*
 * Mark the PFN mapped by the old SPTE as accessed if the SPTE change drops
 * the accessed state (or the mapping itself).
 */
static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
{
	if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
		return;

	if (is_accessed_spte(old_spte) &&
	    (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) ||
	     spte_to_pfn(old_spte) != spte_to_pfn(new_spte)))
		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
}

/*
 * Mark the GFN dirty in the memslot's dirty bitmap if the SPTE change makes
 * the mapping writable.
 */
static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
					  u64 old_spte, u64 new_spte, int level)
{
	bool pfn_changed;
	struct kvm_memory_slot *slot;

	if (level > PG_LEVEL_4K)
		return;

	pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

	if ((!is_writable_pte(old_spte) || pfn_changed) &&
	    is_writable_pte(new_spte)) {
		slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
		mark_page_dirty_in_slot(kvm, slot, gfn);
	}
}

/**
 * tdp_mmu_link_page - Add a new page to the list of pages used by the TDP MMU
 *
 * @kvm: kvm instance
 * @sp: the new page
 * @shared: This operation may not be running under the exclusive use of
 *	    the MMU lock and the operation must synchronize with other
 *	    threads that might be adding or removing pages.
 * @account_nx: This page replaces a NX large page and should be marked for
 *		eventual reclaim.
 */
static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp,
			      bool shared, bool account_nx)
{
	if (shared)
		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	else
		lockdep_assert_held_write(&kvm->mmu_lock);

	list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
	if (account_nx)
		account_huge_nx_page(kvm, sp);

	if (shared)
		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}

/**
 * tdp_mmu_unlink_page - Remove page from the list of pages used by the TDP MMU
 *
 * @kvm: kvm instance
 * @sp: the page to be removed
 * @shared: This operation may not be running under the exclusive use of
 *	    the MMU lock and the operation must synchronize with other
 *	    threads that might be adding or removing pages.
 */
static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
				bool shared)
{
	if (shared)
		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	else
		lockdep_assert_held_write(&kvm->mmu_lock);

	list_del(&sp->link);
	if (sp->lpage_disallowed)
		unaccount_huge_nx_page(kvm, sp);

	if (shared)
		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}

/**
 * handle_removed_tdp_mmu_page - handle a pt removed from the TDP structure
 *
 * @kvm: kvm instance
 * @pt: the page removed from the paging structure
 * @shared: This operation may not be running under the exclusive use
 *	    of the MMU lock and the operation must synchronize with other
 *	    threads that might be modifying SPTEs.
 *
 * Given a page table that has been removed from the TDP paging structure,
 * iterates through the page table to clear SPTEs and free child page tables.
 *
 * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
 * protection. Since this thread removed it from the paging structure,
 * this thread will be responsible for ensuring the page is freed. Hence the
 * early rcu_dereferences in the function.
 */
static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
					bool shared)
{
	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
	int level = sp->role.level;
	gfn_t base_gfn = sp->gfn;
	u64 old_child_spte;
	u64 *sptep;
	gfn_t gfn;
	int i;

	trace_kvm_mmu_prepare_zap_page(sp);

	tdp_mmu_unlink_page(kvm, sp, shared);

	for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
		sptep = rcu_dereference(pt) + i;
		gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);

		if (shared) {
			/*
			 * Set the SPTE to a nonpresent value that other
			 * threads will not overwrite. If the SPTE was
			 * already marked as removed then another thread
			 * handling a page fault could overwrite it, so
			 * set the SPTE until it is set from some other
			 * value to the removed SPTE value.
			 */
			for (;;) {
				old_child_spte = xchg(sptep, REMOVED_SPTE);
				if (!is_removed_spte(old_child_spte))
					break;
				cpu_relax();
			}
		} else {
			/*
			 * If the SPTE is not MMU-present, there is no backing
			 * page associated with the SPTE and so no side effects
			 * that need to be recorded, and exclusive ownership of
			 * mmu_lock ensures the SPTE can't be made present.
			 * Note, zapping MMIO SPTEs is also unnecessary as they
			 * are guarded by the memslots generation, not by being
			 * unreachable.
			 */
			old_child_spte = READ_ONCE(*sptep);
			if (!is_shadow_present_pte(old_child_spte))
				continue;

			/*
			 * Marking the SPTE as a removed SPTE is not
			 * strictly necessary here as the MMU lock will
			 * stop other threads from concurrently modifying
			 * this SPTE. Using the removed SPTE value keeps
			 * the two branches consistent and simplifies
			 * the function.
			 */
			WRITE_ONCE(*sptep, REMOVED_SPTE);
		}
		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
				    old_child_spte, REMOVED_SPTE, level,
				    shared);
	}

	kvm_flush_remote_tlbs_with_address(kvm, gfn,
					   KVM_PAGES_PER_HPAGE(level + 1));

	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
}

/**
 * __handle_changed_spte - handle bookkeeping associated with an SPTE change
 * @kvm: kvm instance
 * @as_id: the address space of the paging structure the SPTE was a part of
 * @gfn: the base GFN that was mapped by the SPTE
 * @old_spte: The value of the SPTE before the change
 * @new_spte: The value of the SPTE after the change
 * @level: the level of the PT the SPTE is part of in the paging structure
 * @shared: This operation may not be running under the exclusive use of
 *	    the MMU lock and the operation must synchronize with other
 *	    threads that might be modifying SPTEs.
 *
 * Handle bookkeeping that might result from the modification of a SPTE.
 * This function must be called for all TDP SPTE modifications.
 */
static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				  u64 old_spte, u64 new_spte, int level,
				  bool shared)
{
	bool was_present = is_shadow_present_pte(old_spte);
	bool is_present = is_shadow_present_pte(new_spte);
	bool was_leaf = was_present && is_last_spte(old_spte, level);
	bool is_leaf = is_present && is_last_spte(new_spte, level);
	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

	WARN_ON(level > PT64_ROOT_MAX_LEVEL);
	WARN_ON(level < PG_LEVEL_4K);
	WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));

	/*
	 * If this warning were to trigger it would indicate that there was a
	 * missing MMU notifier or a race with some notifier handler.
	 * A present, leaf SPTE should never be directly replaced with another
	 * present leaf SPTE pointing to a different PFN. A notifier handler
	 * should be zapping the SPTE before the main MM's page table is
	 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
	 * thread before replacement.
	 */
	if (was_leaf && is_leaf && pfn_changed) {
		pr_err("Invalid SPTE change: cannot replace a present leaf\n"
		       "SPTE with another present leaf SPTE mapping a\n"
		       "different PFN!\n"
		       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
		       as_id, gfn, old_spte, new_spte, level);

		/*
		 * Crash the host to prevent error propagation and guest data
		 * corruption.
		 */
		BUG();
	}

	if (old_spte == new_spte)
		return;

	trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);

	if (is_large_pte(old_spte) != is_large_pte(new_spte)) {
		if (is_large_pte(old_spte))
			atomic64_sub(1, (atomic64_t*)&kvm->stat.lpages);
		else
			atomic64_add(1, (atomic64_t*)&kvm->stat.lpages);
	}

	/*
	 * The only times a SPTE should be changed from a non-present to
	 * non-present state is when an MMIO entry is installed/modified/
	 * removed. In that case, there is nothing to do here.
	 */
	if (!was_present && !is_present) {
		/*
		 * If this change does not involve a MMIO SPTE or removed SPTE,
		 * it is unexpected. Log the change, though it should not
		 * impact the guest since both the former and current SPTEs
		 * are nonpresent.
		 */
		if (WARN_ON(!is_mmio_spte(old_spte) &&
			    !is_mmio_spte(new_spte) &&
			    !is_removed_spte(new_spte)))
			pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
			       "should not be replaced with another,\n"
			       "different nonpresent SPTE, unless one or both\n"
			       "are MMIO SPTEs, or the new SPTE is\n"
			       "a temporary removed SPTE.\n"
			       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
			       as_id, gfn, old_spte, new_spte, level);
		return;
	}

	if (was_leaf && is_dirty_spte(old_spte) &&
	    (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
		kvm_set_pfn_dirty(spte_to_pfn(old_spte));

	/*
	 * Recursively handle child PTs if the change removed a subtree from
	 * the paging structure.
	 */
	if (was_present && !was_leaf && (pfn_changed || !is_present))
		handle_removed_tdp_mmu_page(kvm,
				spte_to_child_pt(old_spte, level), shared);
}

/*
 * Wrapper around __handle_changed_spte() that also performs the accessed and
 * dirty logging bookkeeping for the change.
 */
static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level,
				bool shared)
{
	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
			      shared);
	handle_changed_spte_acc_track(old_spte, new_spte, level);
	handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
				      new_spte, level);
}

/*
 * tdp_mmu_set_spte_atomic_no_dirty_log - Set a TDP MMU SPTE atomically
 * and handle the associated bookkeeping, but do not mark the page dirty
 * in KVM's dirty bitmaps.
 *
 * @kvm: kvm instance
 * @iter: a tdp_iter instance currently on the SPTE that should be set
 * @new_spte: The value the SPTE should be set to
 * Returns: true if the SPTE was set, false if it was not. If false is returned,
 *	    this function will have no side-effects.
 */
static inline bool tdp_mmu_set_spte_atomic_no_dirty_log(struct kvm *kvm,
							struct tdp_iter *iter,
							u64 new_spte)
{
	lockdep_assert_held_read(&kvm->mmu_lock);

	/*
	 * Do not change removed SPTEs. Only the thread that froze the SPTE
	 * may modify it.
	 */
	if (is_removed_spte(iter->old_spte))
		return false;

	if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte,
		      new_spte) != iter->old_spte)
		return false;

	__handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
			      new_spte, iter->level, true);
	handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);

	return true;
}

static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
					   struct tdp_iter *iter,
					   u64 new_spte)
{
	if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, iter, new_spte))
		return false;

	handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn,
				      iter->old_spte, new_spte, iter->level);
	return true;
}

static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
					   struct tdp_iter *iter)
{
	/*
	 * Freeze the SPTE by setting it to a special,
	 * non-present value. This will stop other threads from
	 * immediately installing a present entry in its place
	 * before the TLBs are flushed.
	 */
	if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE))
		return false;

	kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
					   KVM_PAGES_PER_HPAGE(iter->level));

	/*
	 * No other thread can overwrite the removed SPTE as they
	 * must either wait on the MMU lock or use
	 * tdp_mmu_set_spte_atomic which will not overwrite the
	 * special removed SPTE value. No bookkeeping is needed
	 * here since the SPTE is going from non-present
	 * to non-present.
	 */
	WRITE_ONCE(*rcu_dereference(iter->sptep), 0);

	return true;
}

/*
 * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
 * @kvm: kvm instance
 * @iter: a tdp_iter instance currently on the SPTE that should be set
 * @new_spte: The value the SPTE should be set to
 * @record_acc_track: Notify the MM subsystem of changes to the accessed state
 *		      of the page. Should be set unless handling an MMU
 *		      notifier for access tracking. Leaving record_acc_track
 *		      unset in that case prevents page accesses from being
 *		      double counted.
 * @record_dirty_log: Record the page as dirty in the dirty bitmap if
 *		      appropriate for the change being made. Should be set
 *		      unless performing certain dirty logging operations.
 *		      Leaving record_dirty_log unset in that case prevents page
 *		      writes from being double counted.
 */
static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
				      u64 new_spte, bool record_acc_track,
				      bool record_dirty_log)
{
	lockdep_assert_held_write(&kvm->mmu_lock);

	/*
	 * No thread should be using this function to set SPTEs to the
	 * temporary removed SPTE value.
	 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
	 * should be used. If operating under the MMU lock in write mode, the
	 * use of the removed SPTE should not be necessary.
	 */
	WARN_ON(is_removed_spte(iter->old_spte));

	WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);

	__handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
			      new_spte, iter->level, false);
	if (record_acc_track)
		handle_changed_spte_acc_track(iter->old_spte, new_spte,
					      iter->level);
	if (record_dirty_log)
		handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn,
					      iter->old_spte, new_spte,
					      iter->level);
}

static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
				    u64 new_spte)
{
	__tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
}

static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
						 struct tdp_iter *iter,
						 u64 new_spte)
{
	__tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
}

static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
						 struct tdp_iter *iter,
						 u64 new_spte)
{
	__tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
}

/* Iterate over SPTEs mapping GFNs [_start, _end) under the given root. */
#define tdp_root_for_each_pte(_iter, _root, _start, _end) \
	for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end)

/* As above, but visit only present leaf SPTEs. */
#define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)	\
	tdp_root_for_each_pte(_iter, _root, _start, _end)		\
		if (!is_shadow_present_pte(_iter.old_spte) ||		\
		    !is_last_spte(_iter.old_spte, _iter.level))		\
			continue;					\
		else

/* Iterate over SPTEs in the paging structure currently used by _mmu. */
#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)		\
	for_each_tdp_pte(_iter, __va(_mmu->root_hpa),		\
			 _mmu->shadow_root_level, _start, _end)

/*
 * Yield if the MMU lock is contended or this thread needs to return control
 * to the scheduler.
 *
 * If this function should yield and flush is set, it will perform a remote
 * TLB flush before yielding.
 *
 * If this function yields, it will also reset the tdp_iter's walk over the
 * paging structure and the calling function should skip to the next
 * iteration to allow the iterator to continue its traversal from the
 * paging structure root.
 *
 * Return true if this function yielded and the iterator's traversal was reset.
 * Return false if a yield was not needed.
 */
static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm,
					     struct tdp_iter *iter, bool flush,
					     bool shared)
{
	/* Ensure forward progress has been made before yielding. */
	if (iter->next_last_level_gfn == iter->yielded_gfn)
		return false;

	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
		rcu_read_unlock();

		if (flush)
			kvm_flush_remote_tlbs(kvm);

		if (shared)
			cond_resched_rwlock_read(&kvm->mmu_lock);
		else
			cond_resched_rwlock_write(&kvm->mmu_lock);

		rcu_read_lock();

		WARN_ON(iter->gfn > iter->next_last_level_gfn);

		tdp_iter_restart(iter);

		return true;
	}

	return false;
}

/*
 * Tears down the mappings for the range of gfns, [start, end), and frees the
 * non-root pages mapping GFNs strictly within that range. Returns true if
 * SPTEs have been cleared and a TLB flush is needed before releasing the
 * MMU lock.
 *
 * If can_yield is true, will release the MMU lock and reschedule if the
 * scheduler needs the CPU or there is contention on the MMU lock. If this
 * function cannot yield, it will not release the MMU lock or reschedule and
 * the caller must ensure it does not supply too large a GFN range, or the
 * operation can cause a soft lockup.
 *
 * If shared is true, this thread holds the MMU lock in read mode and must
 * account for the possibility that other threads are modifying the paging
 * structures concurrently. If shared is false, this thread should hold the
 * MMU lock in write mode.
 */
static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			  gfn_t start, gfn_t end, bool can_yield, bool flush,
			  bool shared)
{
	struct tdp_iter iter;

	kvm_lockdep_assert_mmu_lock_held(kvm, shared);

	rcu_read_lock();

	tdp_root_for_each_pte(iter, root, start, end) {
retry:
		if (can_yield &&
		    tdp_mmu_iter_cond_resched(kvm, &iter, flush, shared)) {
			flush = false;
			continue;
		}

		if (!is_shadow_present_pte(iter.old_spte))
			continue;

		/*
		 * If this is a non-last-level SPTE that covers a larger range
		 * than should be zapped, continue, and zap the mappings at a
		 * lower level.
		 */
		if ((iter.gfn < start ||
		     iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		if (!shared) {
			tdp_mmu_set_spte(kvm, &iter, 0);
			flush = true;
		} else if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
			/*
			 * The iter must explicitly re-read the SPTE because
			 * the atomic cmpxchg failed.
			 */
			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
			goto retry;
		}
	}

	rcu_read_unlock();
	return flush;
}

/*
 * Tears down the mappings for the range of gfns, [start, end), and frees the
 * non-root pages mapping GFNs strictly within that range. Returns true if
 * SPTEs have been cleared and a TLB flush is needed before releasing the
 * MMU lock.
 *
 * If shared is true, this thread holds the MMU lock in read mode and must
 * account for the possibility that other threads are modifying the paging
 * structures concurrently. If shared is false, this thread should hold the
 * MMU lock in write mode.
 */
bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
				 gfn_t end, bool can_yield, bool flush,
				 bool shared)
{
	struct kvm_mmu_page *root;

	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id, shared)
		flush = zap_gfn_range(kvm, root, start, end, can_yield, flush,
				      shared);

	return flush;
}

void kvm_tdp_mmu_zap_all(struct kvm *kvm)
{
	gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
	bool flush = false;
	int i;

	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
		flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, max_gfn,
						  flush, false);

	if (flush)
		kvm_flush_remote_tlbs(kvm);
}

static struct kvm_mmu_page *next_invalidated_root(struct kvm *kvm,
						  struct kvm_mmu_page *prev_root)
{
	struct kvm_mmu_page *next_root;

	if (prev_root)
		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						  &prev_root->link,
						  typeof(*prev_root), link);
	else
		next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						   typeof(*next_root), link);

	while (next_root && !(next_root->role.invalid &&
			      refcount_read(&next_root->tdp_mmu_root_count)))
		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						  &next_root->link,
						  typeof(*next_root), link);

	return next_root;
}

/*
 * Since kvm_tdp_mmu_zap_all_fast has acquired a reference to each
 * invalidated root, they will not be freed until this function drops the
 * reference. Before dropping that reference, tear down the paging
 * structure so that whichever thread does drop the last reference
 * only has to do a trivial amount of work. Since the roots are invalid,
 * no new SPTEs should be created under them.
 */
void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
{
	gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
	struct kvm_mmu_page *next_root;
	struct kvm_mmu_page *root;
	bool flush = false;

	lockdep_assert_held_read(&kvm->mmu_lock);

	rcu_read_lock();

	root = next_invalidated_root(kvm, NULL);

	while (root) {
		next_root = next_invalidated_root(kvm, root);

		rcu_read_unlock();

		flush = zap_gfn_range(kvm, root, 0, max_gfn, true, flush,
				      true);

		/*
		 * Put the reference acquired in
		 * kvm_tdp_mmu_invalidate_all_roots().
		 */
		kvm_tdp_mmu_put_root(kvm, root, true);

		root = next_root;

		rcu_read_lock();
	}

	rcu_read_unlock();

	if (flush)
		kvm_flush_remote_tlbs(kvm);
}

/*
 * Mark each TDP MMU root as invalid so that other threads
 * will drop their references and allow the root count to
 * go to 0.
 *
 * Also take a reference on all roots so that this thread
 * can do the bulk of the work required to free the roots
 * once they are invalidated. Without this reference, a
 * vCPU thread might drop the last reference to a root and
 * get stuck with tearing down the entire paging structure.
 *
 * Roots which have a zero refcount should be skipped as
 * they're already being torn down.
 * Already invalid roots should be referenced again so that
 * they aren't freed before kvm_tdp_mmu_zap_all_fast is
 * done with them.
 *
 * This has essentially the same effect for the TDP MMU
 * as updating mmu_valid_gen does for the shadow MMU.
 */
void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
{
	struct kvm_mmu_page *root;

	lockdep_assert_held_write(&kvm->mmu_lock);
	list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link)
		if (refcount_inc_not_zero(&root->tdp_mmu_root_count))
			root->role.invalid = true;
}

/*
 * Installs a last-level SPTE to handle a TDP page fault.
 * (NPT/EPT violation/misconfiguration)
 */
static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
					   int map_writable,
					   struct tdp_iter *iter,
					   kvm_pfn_t pfn, bool prefault)
{
	u64 new_spte;
	int ret = RET_PF_FIXED;
	int make_spte_ret = 0;

	if (unlikely(is_noslot_pfn(pfn)))
		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
	else
		make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn,
					  pfn, iter->old_spte, prefault, true,
					  map_writable, !shadow_accessed_mask,
					  &new_spte);

	if (new_spte == iter->old_spte)
		ret = RET_PF_SPURIOUS;
	else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
		return RET_PF_RETRY;

	/*
	 * If the page fault was caused by a write but the page is write
	 * protected, emulation is needed. If the emulation was skipped,
	 * the vCPU would have the same fault again.
	 */
	if (make_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
		if (write)
			ret = RET_PF_EMULATE;
		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
	}

	/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
	if (unlikely(is_mmio_spte(new_spte))) {
		trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
				     new_spte);
		ret = RET_PF_EMULATE;
	} else {
		trace_kvm_mmu_set_spte(iter->level, iter->gfn,
				       rcu_dereference(iter->sptep));
	}

	/*
	 * Increase pf_fixed in both RET_PF_EMULATE and RET_PF_FIXED to be
	 * consistent with legacy MMU behavior.
	 */
	if (ret != RET_PF_SPURIOUS)
		vcpu->stat.pf_fixed++;

	return ret;
}

/*
 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
 * page tables and SPTEs to translate the faulting guest physical address.
 */
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
		    int map_writable, int max_level, kvm_pfn_t pfn,
		    bool prefault)
{
	bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
	bool write = error_code & PFERR_WRITE_MASK;
	bool exec = error_code & PFERR_FETCH_MASK;
	bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	struct tdp_iter iter;
	struct kvm_mmu_page *sp;
	u64 *child_pt;
	u64 new_spte;
	int ret;
	gfn_t gfn = gpa >> PAGE_SHIFT;
	int level;
	int req_level;

	level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn,
					huge_page_disallowed, &req_level);

	trace_kvm_mmu_spte_requested(gpa, level, pfn);

	rcu_read_lock();

	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
		if (nx_huge_page_workaround_enabled)
			disallowed_hugepage_adjust(iter.old_spte, gfn,
						   iter.level, &pfn, &level);

		if (iter.level == level)
			break;

		/*
		 * If there is an SPTE mapping a large page at a higher level
		 * than the target, that SPTE must be cleared and replaced
		 * with a non-leaf SPTE.
		 */
		if (is_shadow_present_pte(iter.old_spte) &&
		    is_large_pte(iter.old_spte)) {
			if (!tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
				break;

			/*
			 * The iter must explicitly re-read the spte here
			 * because the new value informs the !present
			 * path below.
			 */
			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
		}

		if (!is_shadow_present_pte(iter.old_spte)) {
			/*
			 * If SPTE has been frozen by another thread, just
			 * give up and retry, avoiding unnecessary page table
			 * allocation and free.
			 */
			if (is_removed_spte(iter.old_spte))
				break;

			sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level - 1);
			child_pt = sp->spt;

			new_spte = make_nonleaf_spte(child_pt,
						     !shadow_accessed_mask);

			if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter,
						    new_spte)) {
				tdp_mmu_link_page(vcpu->kvm, sp, true,
						  huge_page_disallowed &&
						  req_level >= iter.level);

				trace_kvm_mmu_get_page(sp, true);
			} else {
				tdp_mmu_free_sp(sp);
				break;
			}
		}
	}

	if (iter.level != level) {
		rcu_read_unlock();
		return RET_PF_RETRY;
	}

	ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter,
					      pfn, prefault);
	rcu_read_unlock();

	return ret;
}

bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
				 bool flush)
{
	struct kvm_mmu_page *root;

	for_each_tdp_mmu_root(kvm, root, range->slot->as_id)
		flush |= zap_gfn_range(kvm, root, range->start, range->end,
				       range->may_block, flush, false);

	return flush;
}

typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
			      struct kvm_gfn_range *range);

static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
						   struct kvm_gfn_range *range,
						   tdp_handler_t handler)
{
	struct kvm_mmu_page *root;
	struct tdp_iter iter;
	bool ret = false;

	rcu_read_lock();

	/*
	 * Don't support rescheduling, none of the MMU notifiers that funnel
	 * into this helper allow blocking; it'd be dead, wasteful code.
	 */
	for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
		tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
			ret |= handler(kvm, &iter, range);
	}

	rcu_read_unlock();

	return ret;
}

/*
 * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero
 * if any of the GFNs in the range have been accessed.
 */
static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
			  struct kvm_gfn_range *range)
{
	u64 new_spte = 0;

	/* If we have a non-accessed entry we don't need to change the pte. */
	if (!is_accessed_spte(iter->old_spte))
		return false;

	new_spte = iter->old_spte;

	if (spte_ad_enabled(new_spte)) {
		new_spte &= ~shadow_accessed_mask;
	} else {
		/*
		 * Capture the dirty status of the page, so that it doesn't get
		 * lost when the SPTE is marked for access tracking.
		 */
		if (is_writable_pte(new_spte))
			kvm_set_pfn_dirty(spte_to_pfn(new_spte));

		new_spte = mark_spte_for_access_track(new_spte);
	}

	tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte);

	return true;
}

bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
	return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
}

static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
			 struct kvm_gfn_range *range)
{
	return is_accessed_spte(iter->old_spte);
}

bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
}

static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
			 struct kvm_gfn_range *range)
{
	u64 new_spte;

	/* Huge pages aren't expected to be modified without first being zapped. */
	WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end);

	if (iter->level != PG_LEVEL_4K ||
	    !is_shadow_present_pte(iter->old_spte))
		return false;

	/*
	 * Note, when changing a read-only SPTE, it's not strictly necessary to
	 * zero the SPTE before setting the new PFN, but doing so preserves the
	 * invariant that the PFN of a present leaf SPTE can never change.
	 * See __handle_changed_spte().
	 */
	tdp_mmu_set_spte(kvm, iter, 0);

	if (!pte_write(range->pte)) {
		new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
								  pte_pfn(range->pte));

		tdp_mmu_set_spte(kvm, iter, new_spte);
	}

	return true;
}

/*
 * Handle the changed_pte MMU notifier for the TDP MMU.
 * data is a pointer to the new pte_t mapping the HVA specified by the MMU
 * notifier.
 * Returns non-zero if a flush is needed before releasing the MMU lock.
 */
bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	bool flush = kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);

	/* FIXME: return 'flush' instead of flushing here. */
	if (flush)
		kvm_flush_remote_tlbs_with_address(kvm, range->start, 1);

	return false;
}

/*
 * Remove write access from all SPTEs at or above min_level that map GFNs
 * [start, end). Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			     gfn_t start, gfn_t end, int min_level)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	rcu_read_lock();

	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);

	for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
				   min_level, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
			continue;

		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level) ||
		    !(iter.old_spte & PT_WRITABLE_MASK))
			continue;

		new_spte = iter.old_spte & ~PT_WRITABLE_MASK;

		if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, &iter,
							  new_spte)) {
			/*
			 * The iter must explicitly re-read the SPTE because
			 * the atomic cmpxchg failed.
			 */
			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
			goto retry;
		}
		spte_set = true;
	}

	rcu_read_unlock();
	return spte_set;
}

/*
 * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
 * only affect leaf SPTEs down to min_level.
 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
 */
bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
			     int min_level)
{
	struct kvm_mmu_page *root;
	bool spte_set = false;

	lockdep_assert_held_read(&kvm->mmu_lock);

	for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
		spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
			     slot->base_gfn + slot->npages, min_level);

	return spte_set;
}

/*
 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
 * If AD bits are not enabled, this will require clearing the writable bit on
 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
				  gfn_t start, gfn_t end)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	rcu_read_lock();

	tdp_root_for_each_leaf_pte(iter, root, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
			continue;

		if (spte_ad_need_write_protect(iter.old_spte)) {
			if (is_writable_pte(iter.old_spte))
				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
			else
				continue;
		} else {
			if (iter.old_spte & shadow_dirty_mask)
				new_spte = iter.old_spte & ~shadow_dirty_mask;
			else
				continue;
		}

		if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, &iter,
							  new_spte)) {
			/*
			 * The iter must explicitly re-read the SPTE because
			 * the atomic cmpxchg failed.
			 */
			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
			goto retry;
		}
		spte_set = true;
	}

	rcu_read_unlock();
	return spte_set;
}

/*
 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
 * If AD bits are not enabled, this will require clearing the writable bit on
 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
{
	struct kvm_mmu_page *root;
	bool spte_set = false;

	lockdep_assert_held_read(&kvm->mmu_lock);

	for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
		spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
				  slot->base_gfn + slot->npages);

	return spte_set;
}

/*
 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
 * set in mask, starting at gfn. The given memslot is expected to contain all
 * the GFNs represented by set bits in the mask. If AD bits are enabled,
 * clearing the dirty status will involve clearing the dirty bit on each SPTE
 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
 */
static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
				  gfn_t gfn, unsigned long mask, bool wrprot)
{
	struct tdp_iter iter;
	u64 new_spte;

	rcu_read_lock();

	tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
				   gfn + BITS_PER_LONG) {
		if (!mask)
			break;

		if (iter.level > PG_LEVEL_4K ||
		    !(mask & (1UL << (iter.gfn - gfn))))
			continue;

		mask &= ~(1UL << (iter.gfn - gfn));

		if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
			if (is_writable_pte(iter.old_spte))
				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
			else
				continue;
		} else {
			if (iter.old_spte & shadow_dirty_mask)
				new_spte = iter.old_spte & ~shadow_dirty_mask;
			else
				continue;
		}

		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
	}

	rcu_read_unlock();
}

/*
 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
 * set in mask, starting at gfn. The given memslot is expected to contain all
 * the GFNs represented by set bits in the mask. If AD bits are enabled,
 * clearing the dirty status will involve clearing the dirty bit on each SPTE
 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
 */
void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
				       struct kvm_memory_slot *slot,
				       gfn_t gfn, unsigned long mask,
				       bool wrprot)
{
	struct kvm_mmu_page *root;

	lockdep_assert_held_write(&kvm->mmu_lock);
	for_each_tdp_mmu_root(kvm, root, slot->as_id)
		clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
}

/*
 * Clear leaf entries which could be replaced by large mappings, for
 * GFNs within the slot.
 */
static bool zap_collapsible_spte_range(struct kvm *kvm,
				       struct kvm_mmu_page *root,
				       const struct kvm_memory_slot *slot,
				       bool flush)
{
	gfn_t start = slot->base_gfn;
	gfn_t end = start + slot->npages;
	struct tdp_iter iter;
	kvm_pfn_t pfn;

	rcu_read_lock();

	tdp_root_for_each_pte(iter, root, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, true)) {
			flush = false;
			continue;
		}

		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		pfn = spte_to_pfn(iter.old_spte);
		if (kvm_is_reserved_pfn(pfn) ||
		    iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
							    pfn, PG_LEVEL_NUM))
			continue;

		if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
			/*
			 * The iter must explicitly re-read the SPTE because
			 * the atomic cmpxchg failed.
			 */
			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
			goto retry;
		}
		flush = true;
	}

	rcu_read_unlock();

	return flush;
}

/*
 * Clear non-leaf entries (and free associated page tables) which could
 * be replaced by large mappings, for GFNs within the slot.
 */
bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
				       const struct kvm_memory_slot *slot,
				       bool flush)
{
	struct kvm_mmu_page *root;

	lockdep_assert_held_read(&kvm->mmu_lock);

	for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
		flush = zap_collapsible_spte_range(kvm, root, slot, flush);

	return flush;
}

/*
 * Removes write access on the last level SPTE mapping this GFN and unsets the
 * MMU-writable bit to ensure future writes continue to be intercepted.
 * Returns true if an SPTE was set and a TLB flush is needed.
 */
static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
			      gfn_t gfn, int min_level)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);

	rcu_read_lock();

	for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
				   min_level, gfn, gfn + 1) {
		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		if (!is_writable_pte(iter.old_spte))
			break;

		new_spte = iter.old_spte &
			~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);

		tdp_mmu_set_spte(kvm, &iter, new_spte);
		spte_set = true;
	}

	rcu_read_unlock();

	return spte_set;
}

/*
 * Removes write access on the last level SPTE mapping this GFN and unsets the
 * MMU-writable bit to ensure future writes continue to be intercepted.
 * Returns true if an SPTE was set and a TLB flush is needed.
 */
bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
				   struct kvm_memory_slot *slot, gfn_t gfn,
				   int min_level)
{
	struct kvm_mmu_page *root;
	bool spte_set = false;

	lockdep_assert_held_write(&kvm->mmu_lock);
	for_each_tdp_mmu_root(kvm, root, slot->as_id)
		spte_set |= write_protect_gfn(kvm, root, gfn, min_level);

	return spte_set;
}

/*
 * Return the level of the lowest level SPTE added to sptes.
 * That SPTE may be non-present.
 */
int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
			 int *root_level)
{
	struct tdp_iter iter;
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	gfn_t gfn = addr >> PAGE_SHIFT;
	int leaf = -1;

	*root_level = vcpu->arch.mmu->shadow_root_level;

	rcu_read_lock();

	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
		leaf = iter.level;
		sptes[leaf] = iter.old_spte;
	}

	rcu_read_unlock();

	return leaf;
}