// SPDX-License-Identifier: GPL-2.0

#include "mmu.h"
#include "mmu_internal.h"
#include "mmutrace.h"
#include "tdp_iter.h"
#include "tdp_mmu.h"
#include "spte.h"

#include <trace/events/kvm.h>

#ifdef CONFIG_X86_64
static bool __read_mostly tdp_mmu_enabled = false;
module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
#endif

static bool is_tdp_mmu_enabled(void)
{
#ifdef CONFIG_X86_64
        return tdp_enabled && READ_ONCE(tdp_mmu_enabled);
#else
        return false;
#endif /* CONFIG_X86_64 */
}

/* Initializes the TDP MMU for the VM, if enabled. */
void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
{
        if (!is_tdp_mmu_enabled())
                return;

        /* This should not be changed for the lifetime of the VM. */
        kvm->arch.tdp_mmu_enabled = true;

        INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
        INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
}

void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
{
        if (!kvm->arch.tdp_mmu_enabled)
                return;

        WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
}

#define for_each_tdp_mmu_root(_kvm, _root)                              \
        list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)

bool is_tdp_mmu_root(struct kvm *kvm, hpa_t hpa)
{
        struct kvm_mmu_page *sp;

        if (!kvm->arch.tdp_mmu_enabled)
                return false;
        if (WARN_ON(!VALID_PAGE(hpa)))
                return false;

        sp = to_shadow_page(hpa);
        if (WARN_ON(!sp))
                return false;

        return sp->tdp_mmu_page && sp->root_count;
}

static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
                          gfn_t start, gfn_t end, bool can_yield);

void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root)
{
        gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);

        lockdep_assert_held(&kvm->mmu_lock);

        WARN_ON(root->root_count);
        WARN_ON(!root->tdp_mmu_page);

        list_del(&root->link);

        zap_gfn_range(kvm, root, 0, max_gfn, false);

        free_page((unsigned long)root->spt);
        kmem_cache_free(mmu_page_header_cache, root);
}

static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
                                                   int level)
{
        union kvm_mmu_page_role role;

        role = vcpu->arch.mmu->mmu_role.base;
        role.level = level;
        role.direct = true;
        role.gpte_is_8_bytes = true;
        role.access = ACC_ALL;

        return role;
}

static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
                                               int level)
{
        struct kvm_mmu_page *sp;

        sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
        sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
        set_page_private(virt_to_page(sp->spt), (unsigned long)sp);

        sp->role.word = page_role_for_level(vcpu, level).word;
        sp->gfn = gfn;
        sp->tdp_mmu_page = true;

        trace_kvm_mmu_get_page(sp, true);

        return sp;
}

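/*
 * Find an existing TDP MMU root compatible with the vCPU's current MMU role
 * and take a reference on it, or allocate a new root page table and add it to
 * the VM's list of roots. Takes and releases kvm->mmu_lock.
 */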
static struct kvm_mmu_page *get_tdp_mmu_vcpu_root(struct kvm_vcpu *vcpu)
{
        union kvm_mmu_page_role role;
        struct kvm *kvm = vcpu->kvm;
        struct kvm_mmu_page *root;

        role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);

        spin_lock(&kvm->mmu_lock);

        /* Check for an existing root before allocating a new one. */
        for_each_tdp_mmu_root(kvm, root) {
                if (root->role.word == role.word) {
                        kvm_mmu_get_root(kvm, root);
                        spin_unlock(&kvm->mmu_lock);
                        return root;
                }
        }

        root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
        root->root_count = 1;

        list_add(&root->link, &kvm->arch.tdp_mmu_roots);

        spin_unlock(&kvm->mmu_lock);

        return root;
}

hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
{
        struct kvm_mmu_page *root;

        root = get_tdp_mmu_vcpu_root(vcpu);
        if (!root)
                return INVALID_PAGE;

        return __pa(root->spt);
}

static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
                                u64 old_spte, u64 new_spte, int level);

static int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
{
        return sp->role.smm ? 1 : 0;
}

static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
{
        bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

        if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
                return;

        if (is_accessed_spte(old_spte) &&
            (!is_accessed_spte(new_spte) || pfn_changed))
                kvm_set_pfn_accessed(spte_to_pfn(old_spte));
}

static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
                                          u64 old_spte, u64 new_spte, int level)
{
        bool pfn_changed;
        struct kvm_memory_slot *slot;

        if (level > PG_LEVEL_4K)
                return;

        pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

        if ((!is_writable_pte(old_spte) || pfn_changed) &&
            is_writable_pte(new_spte)) {
                slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
                mark_page_dirty_in_slot(kvm, slot, gfn);
        }
}

/**
 * handle_changed_spte - handle bookkeeping associated with an SPTE change
 * @kvm: kvm instance
 * @as_id: the address space of the paging structure the SPTE was a part of
 * @gfn: the base GFN that was mapped by the SPTE
 * @old_spte: The value of the SPTE before the change
 * @new_spte: The value of the SPTE after the change
 * @level: the level of the PT the SPTE is part of in the paging structure
 *
 * Handle bookkeeping that might result from the modification of a SPTE.
 * This function must be called for all TDP SPTE modifications.
 */
static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
                                  u64 old_spte, u64 new_spte, int level)
{
        bool was_present = is_shadow_present_pte(old_spte);
        bool is_present = is_shadow_present_pte(new_spte);
        bool was_leaf = was_present && is_last_spte(old_spte, level);
        bool is_leaf = is_present && is_last_spte(new_spte, level);
        bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
        u64 *pt;
        struct kvm_mmu_page *sp;
        u64 old_child_spte;
        int i;

        WARN_ON(level > PT64_ROOT_MAX_LEVEL);
        WARN_ON(level < PG_LEVEL_4K);
        WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));

        /*
         * If this warning were to trigger it would indicate that there was a
         * missing MMU notifier or a race with some notifier handler.
         * A present, leaf SPTE should never be directly replaced with another
         * present leaf SPTE pointing to a different PFN. A notifier handler
         * should be zapping the SPTE before the main MM's page table is
         * changed, or the SPTE should be zeroed, and the TLBs flushed by the
         * thread before replacement.
         */
        if (was_leaf && is_leaf && pfn_changed) {
                pr_err("Invalid SPTE change: cannot replace a present leaf\n"
                       "SPTE with another present leaf SPTE mapping a\n"
                       "different PFN!\n"
                       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
                       as_id, gfn, old_spte, new_spte, level);

                /*
                 * Crash the host to prevent error propagation and guest data
                 * corruption.
                 */
                BUG();
        }

        if (old_spte == new_spte)
                return;

        trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);

        /*
         * The only times a SPTE should be changed from a non-present to
         * non-present state is when an MMIO entry is installed/modified/
         * removed. In that case, there is nothing to do here.
         */
        if (!was_present && !is_present) {
                /*
                 * If this change does not involve a MMIO SPTE, it is
                 * unexpected. Log the change, though it should not impact the
                 * guest since both the former and current SPTEs are nonpresent.
                 */
                if (WARN_ON(!is_mmio_spte(old_spte) && !is_mmio_spte(new_spte)))
                        pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
                               "should not be replaced with another,\n"
                               "different nonpresent SPTE, unless one or both\n"
                               "are MMIO SPTEs.\n"
                               "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
                               as_id, gfn, old_spte, new_spte, level);
                return;
        }

        if (was_leaf && is_dirty_spte(old_spte) &&
            (!is_dirty_spte(new_spte) || pfn_changed))
                kvm_set_pfn_dirty(spte_to_pfn(old_spte));

        /*
         * Recursively handle child PTs if the change removed a subtree from
         * the paging structure.
         */
        if (was_present && !was_leaf && (pfn_changed || !is_present)) {
                pt = spte_to_child_pt(old_spte, level);
                sp = sptep_to_sp(pt);

                trace_kvm_mmu_prepare_zap_page(sp);

                list_del(&sp->link);

                if (sp->lpage_disallowed)
                        unaccount_huge_nx_page(kvm, sp);

                for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
                        old_child_spte = READ_ONCE(*(pt + i));
                        WRITE_ONCE(*(pt + i), 0);
                        handle_changed_spte(kvm, as_id,
                                gfn + (i * KVM_PAGES_PER_HPAGE(level - 1)),
                                old_child_spte, 0, level - 1);
                }

                kvm_flush_remote_tlbs_with_address(kvm, gfn,
                                                   KVM_PAGES_PER_HPAGE(level));

                free_page((unsigned long)pt);
                kmem_cache_free(mmu_page_header_cache, sp);
        }
}

static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
                                u64 old_spte, u64 new_spte, int level)
{
        __handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level);
        handle_changed_spte_acc_track(old_spte, new_spte, level);
        handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
                                      new_spte, level);
}

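/*
 * __tdp_mmu_set_spte - install new_spte at the iterator's current position and
 * handle the resulting bookkeeping. The record_acc_track and record_dirty_log
 * flags let callers that record accessed or dirty state themselves skip the
 * corresponding handlers.
 */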
static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
                                      u64 new_spte, bool record_acc_track,
                                      bool record_dirty_log)
{
        u64 *root_pt = tdp_iter_root_pt(iter);
        struct kvm_mmu_page *root = sptep_to_sp(root_pt);
        int as_id = kvm_mmu_page_as_id(root);

        WRITE_ONCE(*iter->sptep, new_spte);

        __handle_changed_spte(kvm, as_id, iter->gfn, iter->old_spte, new_spte,
                              iter->level);
        if (record_acc_track)
                handle_changed_spte_acc_track(iter->old_spte, new_spte,
                                              iter->level);
        if (record_dirty_log)
                handle_changed_spte_dirty_log(kvm, as_id, iter->gfn,
                                              iter->old_spte, new_spte,
                                              iter->level);
}

static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
                                    u64 new_spte)
{
        __tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
}

static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
                                                 struct tdp_iter *iter,
                                                 u64 new_spte)
{
        __tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
}

static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
                                                 struct tdp_iter *iter,
                                                 u64 new_spte)
{
        __tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
}

#define tdp_root_for_each_pte(_iter, _root, _start, _end) \
        for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end)

#define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)         \
        tdp_root_for_each_pte(_iter, _root, _start, _end)               \
                if (!is_shadow_present_pte(_iter.old_spte) ||           \
                    !is_last_spte(_iter.old_spte, _iter.level))         \
                        continue;                                       \
                else

#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)         \
        for_each_tdp_pte(_iter, __va(_mmu->root_hpa),           \
                         _mmu->shadow_root_level, _start, _end)

/*
 * Flush the TLB if the process should drop kvm->mmu_lock.
 * Return whether the caller still needs to flush the tlb.
 */
static bool tdp_mmu_iter_flush_cond_resched(struct kvm *kvm, struct tdp_iter *iter)
{
        if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
                kvm_flush_remote_tlbs(kvm);
                cond_resched_lock(&kvm->mmu_lock);
                tdp_iter_refresh_walk(iter);
                return false;
        } else {
                return true;
        }
}

static void tdp_mmu_iter_cond_resched(struct kvm *kvm, struct tdp_iter *iter)
{
        if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
                cond_resched_lock(&kvm->mmu_lock);
                tdp_iter_refresh_walk(iter);
        }
}

/*
 * Tears down the mappings for the range of gfns, [start, end), and frees the
 * non-root pages mapping GFNs strictly within that range. Returns true if
 * SPTEs have been cleared and a TLB flush is needed before releasing the
 * MMU lock.
 * If can_yield is true, will release the MMU lock and reschedule if the
 * scheduler needs the CPU or there is contention on the MMU lock. If this
 * function cannot yield, it will not release the MMU lock or reschedule and
 * the caller must ensure it does not supply too large a GFN range, or the
 * operation can cause a soft lockup.
 */
static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
                          gfn_t start, gfn_t end, bool can_yield)
{
        struct tdp_iter iter;
        bool flush_needed = false;

        tdp_root_for_each_pte(iter, root, start, end) {
                if (!is_shadow_present_pte(iter.old_spte))
                        continue;

                /*
                 * If this is a non-last-level SPTE that covers a larger range
                 * than should be zapped, continue, and zap the mappings at a
                 * lower level.
                 */
                if ((iter.gfn < start ||
                     iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
                    !is_last_spte(iter.old_spte, iter.level))
                        continue;

                tdp_mmu_set_spte(kvm, &iter, 0);

                if (can_yield)
                        flush_needed = tdp_mmu_iter_flush_cond_resched(kvm, &iter);
                else
                        flush_needed = true;
        }
        return flush_needed;
}

/*
 * Tears down the mappings for the range of gfns, [start, end), and frees the
 * non-root pages mapping GFNs strictly within that range. Returns true if
 * SPTEs have been cleared and a TLB flush is needed before releasing the
 * MMU lock.
 */
bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end)
{
        struct kvm_mmu_page *root;
        bool flush = false;

        for_each_tdp_mmu_root(kvm, root) {
                /*
                 * Take a reference on the root so that it cannot be freed if
                 * this thread releases the MMU lock and yields in this loop.
                 */
                kvm_mmu_get_root(kvm, root);

                flush |= zap_gfn_range(kvm, root, start, end, true);

                kvm_mmu_put_root(kvm, root);
        }

        return flush;
}

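/*
 * Zap all SPTEs in every TDP MMU paging structure for the VM and flush the
 * TLBs if anything was cleared.
 */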
void kvm_tdp_mmu_zap_all(struct kvm *kvm)
{
        gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
        bool flush;

        flush = kvm_tdp_mmu_zap_gfn_range(kvm, 0, max_gfn);
        if (flush)
                kvm_flush_remote_tlbs(kvm);
}

/*
 * Installs a last-level SPTE to handle a TDP page fault.
 * (NPT/EPT violation/misconfiguration)
 */
static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
                                           int map_writable,
                                           struct tdp_iter *iter,
                                           kvm_pfn_t pfn, bool prefault)
{
        u64 new_spte;
        int ret = 0;
        int make_spte_ret = 0;

        if (unlikely(is_noslot_pfn(pfn))) {
                new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
                trace_mark_mmio_spte(iter->sptep, iter->gfn, new_spte);
        } else {
                make_spte_ret = make_spte(vcpu, ACC_ALL, iter->level, iter->gfn,
                                          pfn, iter->old_spte, prefault, true,
                                          map_writable, !shadow_accessed_mask,
                                          &new_spte);
                trace_kvm_mmu_set_spte(iter->level, iter->gfn, iter->sptep);
        }

        if (new_spte == iter->old_spte)
                ret = RET_PF_SPURIOUS;
        else
                tdp_mmu_set_spte(vcpu->kvm, iter, new_spte);

        /*
         * If the page fault was caused by a write but the page is write
         * protected, emulation is needed. If the emulation was skipped,
         * the vCPU would have the same fault again.
         */
        if (make_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
                if (write)
                        ret = RET_PF_EMULATE;
                kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
        }

        /* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
        if (unlikely(is_mmio_spte(new_spte)))
                ret = RET_PF_EMULATE;

        trace_kvm_mmu_set_spte(iter->level, iter->gfn, iter->sptep);
        if (!prefault)
                vcpu->stat.pf_fixed++;

        return ret;
}

/*
 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
 * page tables and SPTEs to translate the faulting guest physical address.
 */
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
                    int map_writable, int max_level, kvm_pfn_t pfn,
                    bool prefault)
{
        bool nx_huge_page_workaround_enabled = is_nx_huge_page_enabled();
        bool write = error_code & PFERR_WRITE_MASK;
        bool exec = error_code & PFERR_FETCH_MASK;
        bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
        struct kvm_mmu *mmu = vcpu->arch.mmu;
        struct tdp_iter iter;
        struct kvm_mmu_page *sp;
        u64 *child_pt;
        u64 new_spte;
        int ret;
        gfn_t gfn = gpa >> PAGE_SHIFT;
        int level;
        int req_level;

        if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
                return RET_PF_RETRY;
        if (WARN_ON(!is_tdp_mmu_root(vcpu->kvm, vcpu->arch.mmu->root_hpa)))
                return RET_PF_RETRY;

        level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn,
                                        huge_page_disallowed, &req_level);

        trace_kvm_mmu_spte_requested(gpa, level, pfn);
        tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
                if (nx_huge_page_workaround_enabled)
                        disallowed_hugepage_adjust(iter.old_spte, gfn,
                                                   iter.level, &pfn, &level);

                if (iter.level == level)
                        break;

                /*
                 * If there is an SPTE mapping a large page at a higher level
                 * than the target, that SPTE must be cleared and replaced
                 * with a non-leaf SPTE.
                 */
                if (is_shadow_present_pte(iter.old_spte) &&
                    is_large_pte(iter.old_spte)) {
                        tdp_mmu_set_spte(vcpu->kvm, &iter, 0);

                        kvm_flush_remote_tlbs_with_address(vcpu->kvm, iter.gfn,
                                        KVM_PAGES_PER_HPAGE(iter.level));

                        /*
                         * The iter must explicitly re-read the spte here
                         * because the new value informs the !present
                         * path below.
                         */
                        iter.old_spte = READ_ONCE(*iter.sptep);
                }

                if (!is_shadow_present_pte(iter.old_spte)) {
                        sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level);
                        list_add(&sp->link, &vcpu->kvm->arch.tdp_mmu_pages);
                        child_pt = sp->spt;
                        clear_page(child_pt);
                        new_spte = make_nonleaf_spte(child_pt,
                                                     !shadow_accessed_mask);

                        trace_kvm_mmu_get_page(sp, true);
                        if (huge_page_disallowed && req_level >= iter.level)
                                account_huge_nx_page(vcpu->kvm, sp);

                        tdp_mmu_set_spte(vcpu->kvm, &iter, new_spte);
                }
        }

        if (WARN_ON(iter.level != level))
                return RET_PF_RETRY;

        ret = tdp_mmu_map_handle_target_level(vcpu, write, map_writable, &iter,
                                              pfn, prefault);

        return ret;
}

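/*
 * Iterate over every TDP MMU root and, for each memslot in the root's address
 * space that overlaps the host virtual address range [start, end), invoke the
 * handler on the corresponding range of GFNs. This is the common iteration
 * used by the MMU notifier handlers below.
 */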
static int kvm_tdp_mmu_handle_hva_range(struct kvm *kvm, unsigned long start,
                unsigned long end, unsigned long data,
                int (*handler)(struct kvm *kvm, struct kvm_memory_slot *slot,
                               struct kvm_mmu_page *root, gfn_t start,
                               gfn_t end, unsigned long data))
{
        struct kvm_memslots *slots;
        struct kvm_memory_slot *memslot;
        struct kvm_mmu_page *root;
        int ret = 0;
        int as_id;

        for_each_tdp_mmu_root(kvm, root) {
                /*
                 * Take a reference on the root so that it cannot be freed if
                 * this thread releases the MMU lock and yields in this loop.
                 */
                kvm_mmu_get_root(kvm, root);

                as_id = kvm_mmu_page_as_id(root);
                slots = __kvm_memslots(kvm, as_id);
                kvm_for_each_memslot(memslot, slots) {
                        unsigned long hva_start, hva_end;
                        gfn_t gfn_start, gfn_end;

                        hva_start = max(start, memslot->userspace_addr);
                        hva_end = min(end, memslot->userspace_addr +
                                      (memslot->npages << PAGE_SHIFT));
                        if (hva_start >= hva_end)
                                continue;
                        /*
                         * {gfn(page) | page intersects with [hva_start, hva_end)} =
                         * {gfn_start, gfn_start+1, ..., gfn_end-1}.
                         */
                        gfn_start = hva_to_gfn_memslot(hva_start, memslot);
                        gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1,
                                                     memslot);

                        ret |= handler(kvm, memslot, root, gfn_start,
                                       gfn_end, data);
                }

                kvm_mmu_put_root(kvm, root);
        }

        return ret;
}

static int zap_gfn_range_hva_wrapper(struct kvm *kvm,
                                     struct kvm_memory_slot *slot,
                                     struct kvm_mmu_page *root, gfn_t start,
                                     gfn_t end, unsigned long unused)
{
        return zap_gfn_range(kvm, root, start, end, false);
}

int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start,
                              unsigned long end)
{
        return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
                                            zap_gfn_range_hva_wrapper);
}

/*
 * Mark the SPTEs in the range of GFNs [start, end) unaccessed and return
 * non-zero if any of the GFNs in the range have been accessed.
 */
static int age_gfn_range(struct kvm *kvm, struct kvm_memory_slot *slot,
                         struct kvm_mmu_page *root, gfn_t start, gfn_t end,
                         unsigned long unused)
{
        struct tdp_iter iter;
        int young = 0;
        u64 new_spte = 0;

        tdp_root_for_each_leaf_pte(iter, root, start, end) {
                /*
                 * If we have a non-accessed entry we don't need to change the
                 * pte.
                 */
                if (!is_accessed_spte(iter.old_spte))
                        continue;

                new_spte = iter.old_spte;

                if (spte_ad_enabled(new_spte)) {
                        clear_bit((ffs(shadow_accessed_mask) - 1),
                                  (unsigned long *)&new_spte);
                } else {
                        /*
                         * Capture the dirty status of the page, so that it
                         * doesn't get lost when the SPTE is marked for access
                         * tracking.
                         */
                        if (is_writable_pte(new_spte))
                                kvm_set_pfn_dirty(spte_to_pfn(new_spte));

                        new_spte = mark_spte_for_access_track(new_spte);
                }
                new_spte &= ~shadow_dirty_mask;

                tdp_mmu_set_spte_no_acc_track(kvm, &iter, new_spte);
                young = 1;

                trace_kvm_age_page(iter.gfn, iter.level, slot, young);
        }

        return young;
}

int kvm_tdp_mmu_age_hva_range(struct kvm *kvm, unsigned long start,
                              unsigned long end)
{
        return kvm_tdp_mmu_handle_hva_range(kvm, start, end, 0,
                                            age_gfn_range);
}

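/*
 * Report whether any leaf SPTE mapping the given GFN is marked accessed,
 * without clearing the accessed state.
 */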
static int test_age_gfn(struct kvm *kvm, struct kvm_memory_slot *slot,
                        struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused,
                        unsigned long unused2)
{
        struct tdp_iter iter;

        tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1)
                if (is_accessed_spte(iter.old_spte))
                        return 1;

        return 0;
}

int kvm_tdp_mmu_test_age_hva(struct kvm *kvm, unsigned long hva)
{
        return kvm_tdp_mmu_handle_hva_range(kvm, hva, hva + 1, 0,
                                            test_age_gfn);
}

/*
 * Handle the changed_pte MMU notifier for the TDP MMU.
 * data is a pointer to the new pte_t mapping the HVA specified by the MMU
 * notifier.
 * Returns non-zero if a flush is needed before releasing the MMU lock.
 */
static int set_tdp_spte(struct kvm *kvm, struct kvm_memory_slot *slot,
                        struct kvm_mmu_page *root, gfn_t gfn, gfn_t unused,
                        unsigned long data)
{
        struct tdp_iter iter;
        pte_t *ptep = (pte_t *)data;
        kvm_pfn_t new_pfn;
        u64 new_spte;
        int need_flush = 0;

        WARN_ON(pte_huge(*ptep));

        new_pfn = pte_pfn(*ptep);

        tdp_root_for_each_pte(iter, root, gfn, gfn + 1) {
                if (iter.level != PG_LEVEL_4K)
                        continue;

                if (!is_shadow_present_pte(iter.old_spte))
                        break;

                tdp_mmu_set_spte(kvm, &iter, 0);

                kvm_flush_remote_tlbs_with_address(kvm, iter.gfn, 1);

                if (!pte_write(*ptep)) {
                        new_spte = kvm_mmu_changed_pte_notifier_make_spte(
                                        iter.old_spte, new_pfn);

                        tdp_mmu_set_spte(kvm, &iter, new_spte);
                }

                need_flush = 1;
        }

        if (need_flush)
                kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);

        return 0;
}

int kvm_tdp_mmu_set_spte_hva(struct kvm *kvm, unsigned long address,
                             pte_t *host_ptep)
{
        return kvm_tdp_mmu_handle_hva_range(kvm, address, address + 1,
                                            (unsigned long)host_ptep,
                                            set_tdp_spte);
}

/*
 * Remove write access from all the SPTEs mapping GFNs [start, end). Will only
 * affect leaf SPTEs down to min_level.
 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
 */
static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
                             gfn_t start, gfn_t end, int min_level)
{
        struct tdp_iter iter;
        u64 new_spte;
        bool spte_set = false;

        BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);

        for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
                                   min_level, start, end) {
                if (!is_shadow_present_pte(iter.old_spte) ||
                    !is_last_spte(iter.old_spte, iter.level))
                        continue;

                new_spte = iter.old_spte & ~PT_WRITABLE_MASK;

                tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
                spte_set = true;

                tdp_mmu_iter_cond_resched(kvm, &iter);
        }
        return spte_set;
}

/*
 * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
 * only affect leaf SPTEs down to min_level.
 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
 */
bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
                             int min_level)
{
        struct kvm_mmu_page *root;
        int root_as_id;
        bool spte_set = false;

        for_each_tdp_mmu_root(kvm, root) {
                root_as_id = kvm_mmu_page_as_id(root);
                if (root_as_id != slot->as_id)
                        continue;

                /*
                 * Take a reference on the root so that it cannot be freed if
                 * this thread releases the MMU lock and yields in this loop.
                 */
                kvm_mmu_get_root(kvm, root);

                spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
                             slot->base_gfn + slot->npages, min_level);

                kvm_mmu_put_root(kvm, root);
        }

        return spte_set;
}

/*
 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
 * If AD bits are not enabled, this will require clearing the writable bit on
 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
                                  gfn_t start, gfn_t end)
{
        struct tdp_iter iter;
        u64 new_spte;
        bool spte_set = false;

        tdp_root_for_each_leaf_pte(iter, root, start, end) {
                if (spte_ad_need_write_protect(iter.old_spte)) {
                        if (is_writable_pte(iter.old_spte))
                                new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
                        else
                                continue;
                } else {
                        if (iter.old_spte & shadow_dirty_mask)
                                new_spte = iter.old_spte & ~shadow_dirty_mask;
                        else
                                continue;
                }

                tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
                spte_set = true;

                tdp_mmu_iter_cond_resched(kvm, &iter);
        }
        return spte_set;
}

/*
 * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
 * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
 * If AD bits are not enabled, this will require clearing the writable bit on
 * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
{
        struct kvm_mmu_page *root;
        int root_as_id;
        bool spte_set = false;

        for_each_tdp_mmu_root(kvm, root) {
                root_as_id = kvm_mmu_page_as_id(root);
                if (root_as_id != slot->as_id)
                        continue;

                /*
                 * Take a reference on the root so that it cannot be freed if
                 * this thread releases the MMU lock and yields in this loop.
                 */
                kvm_mmu_get_root(kvm, root);

                spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
                                slot->base_gfn + slot->npages);

                kvm_mmu_put_root(kvm, root);
        }

        return spte_set;
}

/*
 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
 * set in mask, starting at gfn. The given memslot is expected to contain all
 * the GFNs represented by set bits in the mask. If AD bits are enabled,
 * clearing the dirty status will involve clearing the dirty bit on each SPTE
 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
 */
static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
                                  gfn_t gfn, unsigned long mask, bool wrprot)
{
        struct tdp_iter iter;
        u64 new_spte;

        tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
                                   gfn + BITS_PER_LONG) {
                if (!mask)
                        break;

                if (iter.level > PG_LEVEL_4K ||
                    !(mask & (1UL << (iter.gfn - gfn))))
                        continue;

                if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
                        if (is_writable_pte(iter.old_spte))
                                new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
                        else
                                continue;
                } else {
                        if (iter.old_spte & shadow_dirty_mask)
                                new_spte = iter.old_spte & ~shadow_dirty_mask;
                        else
                                continue;
                }

                tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);

                mask &= ~(1UL << (iter.gfn - gfn));
        }
}

/*
 * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
 * set in mask, starting at gfn. The given memslot is expected to contain all
 * the GFNs represented by set bits in the mask. If AD bits are enabled,
 * clearing the dirty status will involve clearing the dirty bit on each SPTE
 * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
 */
void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
                                       struct kvm_memory_slot *slot,
                                       gfn_t gfn, unsigned long mask,
                                       bool wrprot)
{
        struct kvm_mmu_page *root;
        int root_as_id;

        lockdep_assert_held(&kvm->mmu_lock);
        for_each_tdp_mmu_root(kvm, root) {
                root_as_id = kvm_mmu_page_as_id(root);
                if (root_as_id != slot->as_id)
                        continue;

                clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
        }
}

/*
 * Set the dirty status of all the SPTEs mapping GFNs in the memslot. This is
 * only used for PML, and so will involve setting the dirty bit on each SPTE.
 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
 */
static bool set_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
                                gfn_t start, gfn_t end)
{
        struct tdp_iter iter;
        u64 new_spte;
        bool spte_set = false;

        tdp_root_for_each_pte(iter, root, start, end) {
                if (!is_shadow_present_pte(iter.old_spte))
                        continue;

                new_spte = iter.old_spte | shadow_dirty_mask;

                tdp_mmu_set_spte(kvm, &iter, new_spte);
                spte_set = true;

                tdp_mmu_iter_cond_resched(kvm, &iter);
        }

        return spte_set;
}

/*
 * Set the dirty status of all the SPTEs mapping GFNs in the memslot. This is
 * only used for PML, and so will involve setting the dirty bit on each SPTE.
 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
 */
bool kvm_tdp_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *slot)
{
        struct kvm_mmu_page *root;
        int root_as_id;
        bool spte_set = false;

        for_each_tdp_mmu_root(kvm, root) {
                root_as_id = kvm_mmu_page_as_id(root);
                if (root_as_id != slot->as_id)
                        continue;

                /*
                 * Take a reference on the root so that it cannot be freed if
                 * this thread releases the MMU lock and yields in this loop.
                 */
                kvm_mmu_get_root(kvm, root);

                spte_set |= set_dirty_gfn_range(kvm, root, slot->base_gfn,
                                slot->base_gfn + slot->npages);

                kvm_mmu_put_root(kvm, root);
        }
        return spte_set;
}

/*
 * Clear non-leaf entries (and free associated page tables) which could
 * be replaced by large mappings, for GFNs within the slot.
 */
static void zap_collapsible_spte_range(struct kvm *kvm,
                                       struct kvm_mmu_page *root,
                                       gfn_t start, gfn_t end)
{
        struct tdp_iter iter;
        kvm_pfn_t pfn;
        bool spte_set = false;

        tdp_root_for_each_pte(iter, root, start, end) {
                if (!is_shadow_present_pte(iter.old_spte) ||
                    is_last_spte(iter.old_spte, iter.level))
                        continue;

                pfn = spte_to_pfn(iter.old_spte);
                if (kvm_is_reserved_pfn(pfn) ||
                    !PageTransCompoundMap(pfn_to_page(pfn)))
                        continue;

                tdp_mmu_set_spte(kvm, &iter, 0);

                spte_set = tdp_mmu_iter_flush_cond_resched(kvm, &iter);
        }

        if (spte_set)
                kvm_flush_remote_tlbs(kvm);
}

/*
 * Clear non-leaf entries (and free associated page tables) which could
 * be replaced by large mappings, for GFNs within the slot.
 */
void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
                                       const struct kvm_memory_slot *slot)
{
        struct kvm_mmu_page *root;
        int root_as_id;

        for_each_tdp_mmu_root(kvm, root) {
                root_as_id = kvm_mmu_page_as_id(root);
                if (root_as_id != slot->as_id)
                        continue;

                /*
                 * Take a reference on the root so that it cannot be freed if
                 * this thread releases the MMU lock and yields in this loop.
                 */
                kvm_mmu_get_root(kvm, root);

                zap_collapsible_spte_range(kvm, root, slot->base_gfn,
                                           slot->base_gfn + slot->npages);

                kvm_mmu_put_root(kvm, root);
        }
}

/*
 * Removes write access on the last level SPTE mapping this GFN and unsets the
 * SPTE_MMU_WRITABLE bit to ensure future writes continue to be intercepted.
 * Returns true if an SPTE was set and a TLB flush is needed.
 */
static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
                              gfn_t gfn)
{
        struct tdp_iter iter;
        u64 new_spte;
        bool spte_set = false;

        tdp_root_for_each_leaf_pte(iter, root, gfn, gfn + 1) {
                if (!is_writable_pte(iter.old_spte))
                        break;

                new_spte = iter.old_spte &
                        ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);

                tdp_mmu_set_spte(kvm, &iter, new_spte);
                spte_set = true;
        }

        return spte_set;
}

/*
 * Removes write access on the last level SPTE mapping this GFN and unsets the
 * SPTE_MMU_WRITABLE bit to ensure future writes continue to be intercepted.
 * Returns true if an SPTE was set and a TLB flush is needed.
 */
bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
                                   struct kvm_memory_slot *slot, gfn_t gfn)
{
        struct kvm_mmu_page *root;
        int root_as_id;
        bool spte_set = false;

        lockdep_assert_held(&kvm->mmu_lock);
        for_each_tdp_mmu_root(kvm, root) {
                root_as_id = kvm_mmu_page_as_id(root);
                if (root_as_id != slot->as_id)
                        continue;

                spte_set |= write_protect_gfn(kvm, root, gfn);
        }
        return spte_set;
}

/*
 * Return the level of the lowest level SPTE added to sptes.
 * That SPTE may be non-present.
 */
int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes)
{
        struct tdp_iter iter;
        struct kvm_mmu *mmu = vcpu->arch.mmu;
        int leaf = vcpu->arch.mmu->shadow_root_level;
        gfn_t gfn = addr >> PAGE_SHIFT;

        tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
                leaf = iter.level;
                sptes[leaf - 1] = iter.old_spte;
        }

        return leaf;
}