// SPDX-License-Identifier: GPL-2.0
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include "mmu.h"
#include "mmu_internal.h"
#include "mmutrace.h"
#include "tdp_iter.h"
#include "tdp_mmu.h"
#include "spte.h"

#include <asm/cmpxchg.h>
#include <trace/events/kvm.h>

/* Initializes the TDP MMU for the VM, if enabled. */
void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
{
	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
}

/* Arbitrarily returns true so that this may be used in if statements. */
static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
							     bool shared)
{
	if (shared)
		lockdep_assert_held_read(&kvm->mmu_lock);
	else
		lockdep_assert_held_write(&kvm->mmu_lock);

	return true;
}
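
/*
 * Illustrative sketch (editorial note, not upstream code): returning true
 * lets the assertion be chained into the filtering "if" of an iterator
 * macro, e.g.:
 *
 *	if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) && skip(_root)) {
 *	} else
 *		// loop body runs here
 *
 * where skip() stands in for an arbitrary filter; __for_each_tdp_mmu_root()
 * below uses exactly this idiom.
 */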

void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
{
	/*
	 * Invalidate all roots, which besides the obvious, schedules all roots
	 * for zapping and thus puts the TDP MMU's reference to each root, i.e.
	 * ultimately frees all roots.
	 */
	kvm_tdp_mmu_invalidate_roots(kvm, KVM_VALID_ROOTS);
	kvm_tdp_mmu_zap_invalidated_roots(kvm, false);

#ifdef CONFIG_KVM_PROVE_MMU
	KVM_MMU_WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages));
#endif
	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));

	/*
	 * Ensure that all the outstanding RCU callbacks to free shadow pages
	 * can run before the VM is torn down.  Putting the last reference to
	 * zapped roots will create new callbacks.
	 */
	rcu_barrier();
}

static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
{
	free_page((unsigned long)sp->external_spt);
	free_page((unsigned long)sp->spt);
	kmem_cache_free(mmu_page_header_cache, sp);
}

/*
 * This is called through call_rcu in order to free TDP page table memory
 * safely with respect to other kernel threads that may be operating on
 * the memory.
 * By only accessing TDP MMU page table memory in an RCU read critical
 * section, and freeing it after a grace period, lockless access to that
 * memory won't use it after it is freed.
 */
static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
{
	struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
					       rcu_head);

	tdp_mmu_free_sp(sp);
}

void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
{
	if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
		return;

	/*
	 * The TDP MMU itself holds a reference to each root until the root is
	 * explicitly invalidated, i.e. the final reference should never be
	 * put for a valid root.
	 */
	KVM_BUG_ON(!is_tdp_mmu_page(root) || !root->role.invalid, kvm);

	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	list_del_rcu(&root->link);
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
	call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
}

static bool tdp_mmu_root_match(struct kvm_mmu_page *root,
			       enum kvm_tdp_mmu_root_types types)
{
	if (WARN_ON_ONCE(!(types & KVM_VALID_ROOTS)))
		return false;

	if (root->role.invalid && !(types & KVM_INVALID_ROOTS))
		return false;

	if (likely(!is_mirror_sp(root)))
		return types & KVM_DIRECT_ROOTS;
	return types & KVM_MIRROR_ROOTS;
}

/*
 * Returns the next root after @prev_root (or the first root if @prev_root is
 * NULL) that matches @types.  A reference to the returned root is acquired,
 * and the reference to @prev_root is released (the caller obviously must
 * hold a reference to @prev_root if it's non-NULL).
 *
 * Roots that don't match @types are skipped.
 *
 * Returns NULL if the end of tdp_mmu_roots was reached.
 */
static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
					      struct kvm_mmu_page *prev_root,
					      enum kvm_tdp_mmu_root_types types)
{
	struct kvm_mmu_page *next_root;

	/*
	 * While the roots themselves are RCU-protected, fields such as
	 * role.invalid are protected by mmu_lock.
	 */
	lockdep_assert_held(&kvm->mmu_lock);

	rcu_read_lock();

	if (prev_root)
		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						  &prev_root->link,
						  typeof(*prev_root), link);
	else
		next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						   typeof(*next_root), link);

	while (next_root) {
		if (tdp_mmu_root_match(next_root, types) &&
		    kvm_tdp_mmu_get_root(next_root))
			break;

		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
				&next_root->link, typeof(*next_root), link);
	}

	rcu_read_unlock();

	if (prev_root)
		kvm_tdp_mmu_put_root(kvm, prev_root);

	return next_root;
}

/*
 * Note: this iterator gets and puts references to the roots it iterates over.
 * This makes it safe to release the MMU lock and yield within the loop, but
 * if exiting the loop early, the caller must drop the reference to the most
 * recent root. (Unless keeping a live reference is desirable.)
 *
 * If shared is set, this function is operating under the MMU lock in read
 * mode.
 */
#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _types)	\
	for (_root = tdp_mmu_next_root(_kvm, NULL, _types);		\
	     ({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root;	\
	     _root = tdp_mmu_next_root(_kvm, _root, _types))		\
		if (_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) {	\
		} else

#define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id)	\
	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, KVM_VALID_ROOTS)

#define for_each_tdp_mmu_root_yield_safe(_kvm, _root)			\
	for (_root = tdp_mmu_next_root(_kvm, NULL, KVM_ALL_ROOTS);	\
	     ({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root;	\
	     _root = tdp_mmu_next_root(_kvm, _root, KVM_ALL_ROOTS))
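
/*
 * Illustrative sketch (editorial note): because the iterator holds a
 * reference to the current root, a caller that exits the loop early must
 * drop that reference itself, e.g.:
 *
 *	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, as_id) {
 *		if (ret) {
 *			kvm_tdp_mmu_put_root(kvm, root);
 *			break;
 *		}
 *	}
 *
 * kvm_tdp_mmu_try_split_huge_pages() below follows this pattern.
 */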

/*
 * Iterate over all TDP MMU roots.  Requires that mmu_lock be held for write,
 * the implication being that any flow that holds mmu_lock for read is
 * inherently yield-friendly and should use the yield-safe variant above.
 * Holding mmu_lock for write obviates the need for RCU protection as the list
 * is guaranteed to be stable.
 */
#define __for_each_tdp_mmu_root(_kvm, _root, _as_id, _types)			\
	list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)		\
		if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) &&		\
		    ((_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) ||	\
		     !tdp_mmu_root_match((_root), (_types)))) {			\
		} else

/*
 * Iterate over all TDP MMU roots in an RCU read-side critical section.
 * It is safe to iterate over the SPTEs under the root, but their values will
 * be unstable, so all writes must be atomic.  As this routine is meant to be
 * used without holding the mmu_lock at all, any bits that are flipped must
 * be reflected in kvm_tdp_mmu_spte_need_atomic_write().
 */
#define for_each_tdp_mmu_root_rcu(_kvm, _root, _as_id, _types)			\
	list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link)		\
		if ((_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) ||	\
		    !tdp_mmu_root_match((_root), (_types))) {			\
		} else
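
/*
 * Illustrative sketch (editorial note): the RCU variant is paired with an
 * RCU read-side critical section instead of mmu_lock, e.g.:
 *
 *	guard(rcu)();
 *	for_each_tdp_mmu_root_rcu(kvm, root, as_id, KVM_VALID_ROOTS) {
 *		// read SPTEs; any SPTE write here must be atomic
 *	}
 *
 * __kvm_tdp_mmu_age_gfn_range() below is an in-file user of this pattern.
 */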

#define for_each_valid_tdp_mmu_root(_kvm, _root, _as_id)		\
	__for_each_tdp_mmu_root(_kvm, _root, _as_id, KVM_VALID_ROOTS)

static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu_page *sp;

	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);

	return sp;
}

static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
			    gfn_t gfn, union kvm_mmu_page_role role)
{
	INIT_LIST_HEAD(&sp->possible_nx_huge_page_link);

	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);

	sp->role = role;
	sp->gfn = gfn;
	sp->ptep = sptep;
	sp->tdp_mmu_page = true;

	trace_kvm_mmu_get_page(sp, true);
}

static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
				  struct tdp_iter *iter)
{
	struct kvm_mmu_page *parent_sp;
	union kvm_mmu_page_role role;

	parent_sp = sptep_to_sp(rcu_dereference(iter->sptep));

	role = parent_sp->role;
	role.level--;

	tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
}

void kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu, bool mirror)
{
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	union kvm_mmu_page_role role = mmu->root_role;
	int as_id = kvm_mmu_role_as_id(role);
	struct kvm *kvm = vcpu->kvm;
	struct kvm_mmu_page *root;

	if (mirror)
		role.is_mirror = true;

	/*
	 * Check for an existing root before acquiring the pages lock to avoid
	 * unnecessary serialization if multiple vCPUs are loading a new root.
	 * E.g. when bringing up secondary vCPUs, KVM will already have created
	 * a valid root on behalf of the primary vCPU.
	 */
	read_lock(&kvm->mmu_lock);

	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, as_id) {
		if (root->role.word == role.word)
			goto out_read_unlock;
	}

	spin_lock(&kvm->arch.tdp_mmu_pages_lock);

	/*
	 * Recheck for an existing root after acquiring the pages lock, another
	 * vCPU may have raced ahead and created a new usable root.  Manually
	 * walk the list of roots as the standard macros assume that the pages
	 * lock is *not* held.  WARN if grabbing a reference to a usable root
	 * fails, as the last reference to a root can only be put *after* the
	 * root has been invalidated, which requires holding mmu_lock for write.
	 */
	list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
		if (root->role.word == role.word &&
		    !WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root)))
			goto out_spin_unlock;
	}

	root = tdp_mmu_alloc_sp(vcpu);
	tdp_mmu_init_sp(root, NULL, 0, role);

	/*
	 * TDP MMU roots are kept until they are explicitly invalidated, either
	 * by a memslot update or by the destruction of the VM.  Initialize the
	 * refcount to two; one reference for the vCPU, and one reference for
	 * the TDP MMU itself, which is held until the root is invalidated and
	 * is ultimately put by kvm_tdp_mmu_zap_invalidated_roots().
	 */
	refcount_set(&root->tdp_mmu_root_count, 2);
	list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);

out_spin_unlock:
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
out_read_unlock:
	read_unlock(&kvm->mmu_lock);
	/*
	 * Note, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS will prevent entering the guest
	 * and actually consuming the root if it's invalidated after dropping
	 * mmu_lock, and the root can't be freed as this vCPU holds a reference.
	 */
	if (mirror) {
		mmu->mirror_root_hpa = __pa(root->spt);
	} else {
		mmu->root.hpa = __pa(root->spt);
		mmu->root.pgd = 0;
	}
}

static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level,
				bool shared);

static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	kvm_account_pgtable_pages((void *)sp->spt, +1);
#ifdef CONFIG_KVM_PROVE_MMU
	atomic64_inc(&kvm->arch.tdp_mmu_pages);
#endif
}

static void tdp_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	kvm_account_pgtable_pages((void *)sp->spt, -1);
#ifdef CONFIG_KVM_PROVE_MMU
	atomic64_dec(&kvm->arch.tdp_mmu_pages);
#endif
}

/**
 * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages
 *
 * @kvm: kvm instance
 * @sp: the page to be removed
 */
static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	tdp_unaccount_mmu_page(kvm, sp);

	if (!sp->nx_huge_page_disallowed)
		return;

	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	sp->nx_huge_page_disallowed = false;
	untrack_possible_nx_huge_page(kvm, sp, KVM_TDP_MMU);
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}

static void remove_external_spte(struct kvm *kvm, gfn_t gfn, u64 old_spte,
				 int level)
{
	/*
	 * External (TDX) SPTEs are limited to PG_LEVEL_4K, and external
	 * PTs are removed in a special order, involving free_external_spt().
	 * But remove_external_spte() will be called on non-leaf PTEs via
	 * __tdp_mmu_zap_root(), so avoid the error the former would return
	 * in this case.
	 */
	if (!is_last_spte(old_spte, level))
		return;

	/* Zapping a leaf SPTE is allowed only while the write lock is held. */
	lockdep_assert_held_write(&kvm->mmu_lock);

	kvm_x86_call(remove_external_spte)(kvm, gfn, level, old_spte);
}

/**
 * handle_removed_pt() - handle a page table removed from the TDP structure
 *
 * @kvm: kvm instance
 * @pt: the page removed from the paging structure
 * @shared: This operation may not be running under the exclusive use
 *	    of the MMU lock and the operation must synchronize with other
 *	    threads that might be modifying SPTEs.
 *
 * Given a page table that has been removed from the TDP paging structure,
 * iterates through the page table to clear SPTEs and free child page tables.
 *
 * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
 * protection. Since this thread removed it from the paging structure,
 * this thread will be responsible for ensuring the page is freed. Hence the
 * early rcu_dereferences in the function.
 */
static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
{
	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
	int level = sp->role.level;
	gfn_t base_gfn = sp->gfn;
	int i;

	trace_kvm_mmu_prepare_zap_page(sp);

	tdp_mmu_unlink_sp(kvm, sp);

	for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
		tdp_ptep_t sptep = pt + i;
		gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
		u64 old_spte;

		if (shared) {
			/*
			 * Set the SPTE to a nonpresent value that other
			 * threads will not overwrite.  If the SPTE was
			 * already marked as frozen then another thread
			 * handling a page fault could overwrite it, so
			 * keep re-setting the SPTE until this thread's
			 * write is the one that moves it from some other
			 * value to the frozen SPTE value.
			 */
			for (;;) {
				old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, FROZEN_SPTE);
				if (!is_frozen_spte(old_spte))
					break;
				cpu_relax();
			}
		} else {
			/*
			 * If the SPTE is not MMU-present, there is no backing
			 * page associated with the SPTE and so no side effects
			 * that need to be recorded, and exclusive ownership of
			 * mmu_lock ensures the SPTE can't be made present.
			 * Note, zapping MMIO SPTEs is also unnecessary as they
			 * are guarded by the memslots generation, not by being
			 * unreachable.
			 */
			old_spte = kvm_tdp_mmu_read_spte(sptep);
			if (!is_shadow_present_pte(old_spte))
				continue;

			/*
			 * Use the common helper instead of a raw WRITE_ONCE as
			 * the SPTE needs to be updated atomically if it can be
			 * modified by a different vCPU outside of mmu_lock.
			 * Even though the parent SPTE is !PRESENT, the TLB
			 * hasn't yet been flushed, and both Intel and AMD
			 * document that A/D assists can use upper-level PxE
			 * entries that are cached in the TLB, i.e. the CPU can
			 * still access the page and mark it dirty.
			 *
			 * No retry is needed in the atomic update path as the
			 * sole concern is dropping a Dirty bit, i.e. no other
			 * task can zap/remove the SPTE as mmu_lock is held for
			 * write.  Marking the SPTE as a frozen SPTE is not
			 * strictly necessary for the same reason, but using
			 * the frozen SPTE value keeps the shared/exclusive
			 * paths consistent and allows the handle_changed_spte()
			 * call below to hardcode the new value to FROZEN_SPTE.
			 *
			 * Note, even though dropping a Dirty bit is the only
			 * scenario where a non-atomic update could result in a
			 * functional bug, simply checking the Dirty bit isn't
			 * sufficient as a fast page fault could read the upper
			 * level SPTE before it is zapped, and then make this
			 * target SPTE writable, resume the guest, and set the
			 * Dirty bit between reading the SPTE above and writing
			 * it here.
			 */
			old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte,
							  FROZEN_SPTE, level);
		}
		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
				    old_spte, FROZEN_SPTE, level, shared);

		if (is_mirror_sp(sp)) {
			KVM_BUG_ON(shared, kvm);
			remove_external_spte(kvm, gfn, old_spte, level);
		}
	}

	if (is_mirror_sp(sp) &&
	    WARN_ON(kvm_x86_call(free_external_spt)(kvm, base_gfn, sp->role.level,
						    sp->external_spt))) {
		/*
		 * Failed to free the page table page in the mirror page table,
		 * and there is nothing further to do.  Intentionally leak the
		 * page to prevent the kernel from accessing the encrypted
		 * page.
		 */
		sp->external_spt = NULL;
	}

	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
}

static void *get_external_spt(gfn_t gfn, u64 new_spte, int level)
{
	if (is_shadow_present_pte(new_spte) && !is_last_spte(new_spte, level)) {
		struct kvm_mmu_page *sp = spte_to_child_sp(new_spte);

		WARN_ON_ONCE(sp->role.level + 1 != level);
		WARN_ON_ONCE(sp->gfn != gfn);
		return sp->external_spt;
	}

	return NULL;
}

static int __must_check set_external_spte_present(struct kvm *kvm, tdp_ptep_t sptep,
						  gfn_t gfn, u64 old_spte,
						  u64 new_spte, int level)
{
	bool was_present = is_shadow_present_pte(old_spte);
	bool is_present = is_shadow_present_pte(new_spte);
	bool is_leaf = is_present && is_last_spte(new_spte, level);
	int ret = 0;

	KVM_BUG_ON(was_present, kvm);

	lockdep_assert_held(&kvm->mmu_lock);
	/*
	 * We need to lock out other updates to the SPTE until the external
	 * page table has been modified.  Use FROZEN_SPTE similar to
	 * the zapping case.
	 */
	if (!try_cmpxchg64(rcu_dereference(sptep), &old_spte, FROZEN_SPTE))
		return -EBUSY;

	/*
	 * Use a different call to either set up a middle-level external page
	 * table, or a leaf.
	 */
	if (is_leaf) {
		ret = kvm_x86_call(set_external_spte)(kvm, gfn, level, new_spte);
	} else {
		void *external_spt = get_external_spt(gfn, new_spte, level);

		KVM_BUG_ON(!external_spt, kvm);
		ret = kvm_x86_call(link_external_spt)(kvm, gfn, level, external_spt);
	}
	if (ret)
		__kvm_tdp_mmu_write_spte(sptep, old_spte);
	else
		__kvm_tdp_mmu_write_spte(sptep, new_spte);
	return ret;
}

/**
 * handle_changed_spte - handle bookkeeping associated with an SPTE change
 * @kvm: kvm instance
 * @as_id: the address space of the paging structure the SPTE was a part of
 * @gfn: the base GFN that was mapped by the SPTE
 * @old_spte: The value of the SPTE before the change
 * @new_spte: The value of the SPTE after the change
 * @level: the level of the PT the SPTE is part of in the paging structure
 * @shared: This operation may not be running under the exclusive use of
 *	    the MMU lock and the operation must synchronize with other
 *	    threads that might be modifying SPTEs.
 *
 * Handle bookkeeping that might result from the modification of a SPTE. Note,
 * dirty logging updates are handled in common code, not here (see make_spte()
 * and fast_pf_fix_direct_spte()).
 */
static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level,
				bool shared)
{
	bool was_present = is_shadow_present_pte(old_spte);
	bool is_present = is_shadow_present_pte(new_spte);
	bool was_leaf = was_present && is_last_spte(old_spte, level);
	bool is_leaf = is_present && is_last_spte(new_spte, level);
	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

	WARN_ON_ONCE(level > PT64_ROOT_MAX_LEVEL);
	WARN_ON_ONCE(level < PG_LEVEL_4K);
	WARN_ON_ONCE(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));

	/*
	 * If this warning were to trigger it would indicate that there was a
	 * missing MMU notifier or a race with some notifier handler.
	 * A present, leaf SPTE should never be directly replaced with another
	 * present leaf SPTE pointing to a different PFN. A notifier handler
	 * should be zapping the SPTE before the main MM's page table is
	 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
	 * thread before replacement.
	 */
	if (was_leaf && is_leaf && pfn_changed) {
		pr_err("Invalid SPTE change: cannot replace a present leaf\n"
		       "SPTE with another present leaf SPTE mapping a\n"
		       "different PFN!\n"
		       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
		       as_id, gfn, old_spte, new_spte, level);

		/*
		 * Crash the host to prevent error propagation and guest data
		 * corruption.
		 */
		BUG();
	}

	if (old_spte == new_spte)
		return;

	trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);

	if (is_leaf)
		check_spte_writable_invariants(new_spte);

	/*
	 * The only times a SPTE should be changed from a non-present to
	 * non-present state is when an MMIO entry is installed/modified/
	 * removed. In that case, there is nothing to do here.
	 */
	if (!was_present && !is_present) {
		/*
		 * If this change does not involve a MMIO SPTE or frozen SPTE,
		 * it is unexpected. Log the change, though it should not
		 * impact the guest since both the former and current SPTEs
		 * are nonpresent.
		 */
		if (WARN_ON_ONCE(!is_mmio_spte(kvm, old_spte) &&
				 !is_mmio_spte(kvm, new_spte) &&
				 !is_frozen_spte(new_spte)))
			pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
			       "should not be replaced with another,\n"
			       "different nonpresent SPTE, unless one or both\n"
			       "are MMIO SPTEs, or the new SPTE is\n"
			       "a temporary frozen SPTE.\n"
			       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
			       as_id, gfn, old_spte, new_spte, level);
		return;
	}

	if (is_leaf != was_leaf)
		kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);

	/*
	 * Recursively handle child PTs if the change removed a subtree from
	 * the paging structure.  Note the WARN on the PFN changing without the
	 * SPTE being converted to a hugepage (leaf) or being zapped.  Shadow
	 * pages are kernel allocations and should never be migrated.
	 */
	if (was_present && !was_leaf &&
	    (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
		handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
}

static inline int __must_check __tdp_mmu_set_spte_atomic(struct kvm *kvm,
							 struct tdp_iter *iter,
							 u64 new_spte)
{
	/*
	 * The caller is responsible for ensuring the old SPTE is not a FROZEN
	 * SPTE.  KVM should never attempt to zap or manipulate a FROZEN SPTE,
	 * and pre-checking before inserting a new SPTE is advantageous as it
	 * avoids unnecessary work.
	 */
	WARN_ON_ONCE(iter->yielded || is_frozen_spte(iter->old_spte));

	if (is_mirror_sptep(iter->sptep) && !is_frozen_spte(new_spte)) {
		int ret;

		/*
		 * Users of atomic zapping don't operate on mirror roots,
		 * so don't handle it and bug the VM if it's seen.
		 */
		if (KVM_BUG_ON(!is_shadow_present_pte(new_spte), kvm))
			return -EBUSY;

		ret = set_external_spte_present(kvm, iter->sptep, iter->gfn,
						iter->old_spte, new_spte, iter->level);
		if (ret)
			return ret;
	} else {
		u64 *sptep = rcu_dereference(iter->sptep);

		/*
		 * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs
		 * and does not hold the mmu_lock.  On failure, i.e. if a
		 * different logical CPU modified the SPTE, try_cmpxchg64()
		 * updates iter->old_spte with the current value, so the caller
		 * operates on fresh data, e.g. if it retries
		 * tdp_mmu_set_spte_atomic().
		 */
		if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte))
			return -EBUSY;
	}

	return 0;
}

/*
 * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
 * and handle the associated bookkeeping.  Do not mark the page dirty
 * in KVM's dirty bitmaps.
 *
 * If setting the SPTE fails because it has changed, iter->old_spte will be
 * refreshed to the current value of the spte.
 *
 * @kvm: kvm instance
 * @iter: a tdp_iter instance currently on the SPTE that should be set
 * @new_spte: The value the SPTE should be set to
 * Return:
 * * 0      - If the SPTE was set.
 * * -EBUSY - If the SPTE cannot be set. In this case this function will have
 *            no side-effects other than setting iter->old_spte to the last
 *            known value of the spte.
 */
static inline int __must_check tdp_mmu_set_spte_atomic(struct kvm *kvm,
						       struct tdp_iter *iter,
						       u64 new_spte)
{
	int ret;

	lockdep_assert_held_read(&kvm->mmu_lock);

	ret = __tdp_mmu_set_spte_atomic(kvm, iter, new_spte);
	if (ret)
		return ret;

	handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
			    new_spte, iter->level, true);

	return 0;
}
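
/*
 * Illustrative sketch (editorial note): because a failed update refreshes
 * iter->old_spte, callers typically just retry the current iteration on
 * -EBUSY, e.g.:
 *
 *	for_each_tdp_pte(iter, kvm, root, start, end) {
 * retry:
 *		...
 *		if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
 *			goto retry;
 *	}
 *
 * wrprot_gfn_range() and __tdp_mmu_zap_root() below use this pattern.
 */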

/*
 * tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
 * @kvm: KVM instance
 * @as_id: Address space ID, i.e. regular vs. SMM
 * @sptep: Pointer to the SPTE
 * @old_spte: The current value of the SPTE
 * @new_spte: The new value that will be set for the SPTE
 * @gfn: The base GFN that was (or will be) mapped by the SPTE
 * @level: The level _containing_ the SPTE (its parent PT's level)
 *
 * Returns the old SPTE value, which _may_ be different than @old_spte if the
 * SPTE had volatile bits.
 */
static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
			    u64 old_spte, u64 new_spte, gfn_t gfn, int level)
{
	lockdep_assert_held_write(&kvm->mmu_lock);

	/*
	 * No thread should be using this function to set SPTEs to or from the
	 * temporary frozen SPTE value.
	 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
	 * should be used.  If operating under the MMU lock in write mode, the
	 * use of the frozen SPTE should not be necessary.
	 */
	WARN_ON_ONCE(is_frozen_spte(old_spte) || is_frozen_spte(new_spte));

	old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);

	handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);

	/*
	 * Users that do non-atomic setting of PTEs don't operate on mirror
	 * roots, so don't handle it and bug the VM if it's seen.
	 */
	if (is_mirror_sptep(sptep)) {
		KVM_BUG_ON(is_shadow_present_pte(new_spte), kvm);
		remove_external_spte(kvm, gfn, old_spte, level);
	}

	return old_spte;
}

static inline void tdp_mmu_iter_set_spte(struct kvm *kvm, struct tdp_iter *iter,
					 u64 new_spte)
{
	WARN_ON_ONCE(iter->yielded);
	iter->old_spte = tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep,
					  iter->old_spte, new_spte,
					  iter->gfn, iter->level);
}

#define tdp_root_for_each_pte(_iter, _kvm, _root, _start, _end)	\
	for_each_tdp_pte(_iter, _kvm, _root, _start, _end)

#define tdp_root_for_each_leaf_pte(_iter, _kvm, _root, _start, _end)	\
	tdp_root_for_each_pte(_iter, _kvm, _root, _start, _end)		\
		if (!is_shadow_present_pte(_iter.old_spte) ||		\
		    !is_last_spte(_iter.old_spte, _iter.level))		\
			continue;					\
		else
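
/*
 * Illustrative sketch (editorial note): the leaf-only variant filters out
 * non-present and non-leaf entries, so the loop body only ever sees
 * last-level mappings, e.g.:
 *
 *	tdp_root_for_each_leaf_pte(iter, kvm, root, start, end) {
 *		// iter.old_spte is a present, last-level SPTE here
 *	}
 *
 * __kvm_tdp_mmu_age_gfn_range() below iterates this way.
 */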

static inline bool __must_check tdp_mmu_iter_need_resched(struct kvm *kvm,
							  struct tdp_iter *iter)
{
	if (!need_resched() && !rwlock_needbreak(&kvm->mmu_lock))
		return false;

	/* Ensure forward progress has been made before yielding. */
	return iter->next_last_level_gfn != iter->yielded_gfn;
}

/*
 * Yield if the MMU lock is contended or this thread needs to return control
 * to the scheduler.
 *
 * If this function should yield and flush is set, it will perform a remote
 * TLB flush before yielding.
 *
 * If this function yields, iter->yielded is set and the caller must skip to
 * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk
 * over the paging structures to allow the iterator to continue its traversal
 * from the paging structure root.
 *
 * Returns true if this function yielded.
 */
static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
							  struct tdp_iter *iter,
							  bool flush, bool shared)
{
	KVM_MMU_WARN_ON(iter->yielded);

	if (!tdp_mmu_iter_need_resched(kvm, iter))
		return false;

	if (flush)
		kvm_flush_remote_tlbs(kvm);

	rcu_read_unlock();

	if (shared)
		cond_resched_rwlock_read(&kvm->mmu_lock);
	else
		cond_resched_rwlock_write(&kvm->mmu_lock);

	rcu_read_lock();

	WARN_ON_ONCE(iter->gfn > iter->next_last_level_gfn);

	iter->yielded = true;
	return true;
}
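
/*
 * Illustrative sketch (editorial note): callers that track a pending TLB
 * flush clear it when this helper yields, as the helper has already flushed
 * on their behalf, e.g.:
 *
 *	for_each_tdp_pte_min_level(iter, kvm, root, PG_LEVEL_4K, start, end) {
 *		if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
 *			flush = false;
 *			continue;
 *		}
 *		...
 *	}
 *
 * tdp_mmu_zap_leafs() below uses exactly this idiom.
 */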

static inline gfn_t tdp_mmu_max_gfn_exclusive(void)
{
	/*
	 * Bound TDP MMU walks at host.MAXPHYADDR.  KVM disallows memslots with
	 * a gpa range that would exceed the max gfn, and KVM does not create
	 * MMIO SPTEs for "impossible" gfns, instead sending such accesses down
	 * the slow emulation path every time.
	 */
	return kvm_mmu_max_gfn() + 1;
}

static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
			       bool shared, int zap_level)
{
	struct tdp_iter iter;

	for_each_tdp_pte_min_level_all(iter, root, zap_level) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
			continue;

		if (!is_shadow_present_pte(iter.old_spte))
			continue;

		if (iter.level > zap_level)
			continue;

		if (!shared)
			tdp_mmu_iter_set_spte(kvm, &iter, SHADOW_NONPRESENT_VALUE);
		else if (tdp_mmu_set_spte_atomic(kvm, &iter, SHADOW_NONPRESENT_VALUE))
			goto retry;
	}
}

static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
			     bool shared)
{
	/*
	 * The root must have an elevated refcount so that it's reachable via
	 * mmu_notifier callbacks, which allows this path to yield and drop
	 * mmu_lock.  When handling an unmap/release mmu_notifier command, KVM
	 * must drop all references to relevant pages prior to completing the
	 * callback.  Dropping mmu_lock with an unreachable root would result
	 * in zapping SPTEs after a relevant mmu_notifier callback completes
	 * and lead to use-after-free as zapping a SPTE triggers "writeback" of
	 * dirty accessed bits to the SPTE's associated struct page.
	 */
	WARN_ON_ONCE(!refcount_read(&root->tdp_mmu_root_count));

	kvm_lockdep_assert_mmu_lock_held(kvm, shared);

	rcu_read_lock();

	/*
	 * Zap roots in multiple passes of decreasing granularity, i.e. zap at
	 * 4KiB=>2MiB=>1GiB=>root, in order to better honor need_resched() (all
	 * preempt models) or mmu_lock contention (full or real-time models).
	 * Zapping at finer granularity marginally increases the total time of
	 * the zap, but in most cases the zap itself isn't latency sensitive.
	 *
	 * If KVM is configured to prove the MMU, skip the 4KiB and 2MiB zaps
	 * in order to mimic the page fault path, which can replace a 1GiB page
	 * table with an equivalent 1GiB hugepage, i.e. can get saddled with
	 * zapping a 1GiB region that's fully populated with 4KiB SPTEs.  This
	 * allows verifying that KVM can safely zap 1GiB regions, e.g. without
	 * inducing RCU stalls, without relying on a relatively rare event
	 * (zapping roots is orders of magnitude more common).  Note, because
	 * zapping a SP recurses on its children, stepping down to PG_LEVEL_4K
	 * in the iterator itself is unnecessary.
	 */
	if (!IS_ENABLED(CONFIG_KVM_PROVE_MMU)) {
		__tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_4K);
		__tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_2M);
	}
	__tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G);
	__tdp_mmu_zap_root(kvm, root, shared, root->role.level);

	rcu_read_unlock();
}

bool kvm_tdp_mmu_zap_possible_nx_huge_page(struct kvm *kvm,
					   struct kvm_mmu_page *sp)
{
	struct tdp_iter iter = {
		.old_spte = sp->ptep ? kvm_tdp_mmu_read_spte(sp->ptep) : 0,
		.sptep = sp->ptep,
		.level = sp->role.level + 1,
		.gfn = sp->gfn,
		.as_id = kvm_mmu_page_as_id(sp),
	};

	lockdep_assert_held_read(&kvm->mmu_lock);

	if (WARN_ON_ONCE(!is_tdp_mmu_page(sp)))
		return false;

	/*
	 * Root shadow pages don't have a parent page table and thus no
	 * associated entry, but they can never be possible NX huge pages.
	 */
	if (WARN_ON_ONCE(!sp->ptep))
		return false;

	/*
	 * Since mmu_lock is held in read mode, it's possible another task has
	 * already modified the SPTE.  Zap the SPTE if and only if the SPTE
	 * points at the SP's page table, as checking shadow-present isn't
	 * sufficient, e.g. the SPTE could be replaced by a leaf SPTE, or even
	 * another SP.  Note, spte_to_child_pt() also checks that the SPTE is
	 * shadow-present, i.e. guards against zapping a frozen SPTE.
	 */
	if ((tdp_ptep_t)sp->spt != spte_to_child_pt(iter.old_spte, iter.level))
		return false;

	/*
	 * If a different task modified the SPTE, then it should be impossible
	 * for the SPTE to still be used for the to-be-zapped SP.  Non-leaf
	 * SPTEs don't have Dirty bits, KVM always sets the Accessed bit when
	 * creating non-leaf SPTEs, and all other bits are immutable for non-
	 * leaf SPTEs, i.e. the only legal operations for non-leaf SPTEs are
	 * zapping and replacement.
	 */
	if (tdp_mmu_set_spte_atomic(kvm, &iter, SHADOW_NONPRESENT_VALUE)) {
		WARN_ON_ONCE((tdp_ptep_t)sp->spt == spte_to_child_pt(iter.old_spte, iter.level));
		return false;
	}

	return true;
}

/*
 * If can_yield is true, will release the MMU lock and reschedule if the
 * scheduler needs the CPU or there is contention on the MMU lock. If this
 * function cannot yield, it will not release the MMU lock or reschedule and
 * the caller must ensure it does not supply too large a GFN range, or the
 * operation can cause a soft lockup.
 */
static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
			      gfn_t start, gfn_t end, bool can_yield, bool flush)
{
	struct tdp_iter iter;

	end = min(end, tdp_mmu_max_gfn_exclusive());

	lockdep_assert_held_write(&kvm->mmu_lock);

	rcu_read_lock();

	for_each_tdp_pte_min_level(iter, kvm, root, PG_LEVEL_4K, start, end) {
		if (can_yield &&
		    tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
			flush = false;
			continue;
		}

		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		tdp_mmu_iter_set_spte(kvm, &iter, SHADOW_NONPRESENT_VALUE);

		/*
		 * Zapping SPTEs in invalid roots doesn't require a TLB flush,
		 * see kvm_tdp_mmu_zap_invalidated_roots() for details.
		 */
		if (!root->role.invalid)
			flush = true;
	}

	rcu_read_unlock();

	/*
	 * Because this flow zaps _only_ leaf SPTEs, the caller doesn't need
	 * to provide RCU protection as no 'struct kvm_mmu_page' will be freed.
	 */
	return flush;
}

/*
 * Zap leaf SPTEs for the range of gfns, [start, end), for all *valid* roots.
 * Returns true if a TLB flush is needed before releasing the MMU lock, i.e. if
 * one or more SPTEs were zapped since the MMU lock was last acquired.
 */
bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush)
{
	struct kvm_mmu_page *root;

	lockdep_assert_held_write(&kvm->mmu_lock);
	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, -1)
		flush = tdp_mmu_zap_leafs(kvm, root, start, end, true, flush);

	return flush;
}

void kvm_tdp_mmu_zap_all(struct kvm *kvm)
{
	struct kvm_mmu_page *root;

	/*
	 * Zap all direct roots, including invalid direct roots, as all direct
	 * SPTEs must be dropped before returning to the caller.  For TDX, mirror
	 * roots don't need handling in response to the mmu notifier (the caller).
	 *
	 * Zap directly even if the root is also being zapped by a concurrent
	 * "fast zap".  Walking zapped top-level SPTEs isn't all that expensive
	 * and mmu_lock is already held, which means the other thread has yielded.
	 *
	 * A TLB flush is unnecessary, KVM zaps everything if and only if the VM
	 * is being destroyed or the userspace VMM has exited.  In both cases,
	 * KVM_RUN is unreachable, i.e. no vCPUs will ever service the request.
	 */
	lockdep_assert_held_write(&kvm->mmu_lock);
	__for_each_tdp_mmu_root_yield_safe(kvm, root, -1,
					   KVM_DIRECT_ROOTS | KVM_INVALID_ROOTS)
		tdp_mmu_zap_root(kvm, root, false);
}

/*
 * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast
 * zap" completes.
 */
void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm, bool shared)
{
	struct kvm_mmu_page *root;

	if (shared)
		read_lock(&kvm->mmu_lock);
	else
		write_lock(&kvm->mmu_lock);

	for_each_tdp_mmu_root_yield_safe(kvm, root) {
		if (!root->tdp_mmu_scheduled_root_to_zap)
			continue;

		root->tdp_mmu_scheduled_root_to_zap = false;
		KVM_BUG_ON(!root->role.invalid, kvm);

		/*
		 * A TLB flush is not necessary as KVM performs a local TLB
		 * flush when allocating a new root (see kvm_mmu_load()), and
		 * when migrating a vCPU to a different pCPU.  Note, the local
		 * TLB flush on reuse also invalidates paging-structure-cache
		 * entries, i.e. TLB entries for intermediate paging structures,
		 * that may be zapped, as such entries are associated with the
		 * ASID on both VMX and SVM.
		 */
		tdp_mmu_zap_root(kvm, root, shared);

		/*
		 * The reference needs to be put *after* zapping the root, as
		 * the root must be reachable by mmu_notifiers while it's being
		 * zapped.
		 */
		kvm_tdp_mmu_put_root(kvm, root);
	}

	if (shared)
		read_unlock(&kvm->mmu_lock);
	else
		write_unlock(&kvm->mmu_lock);
}

/*
 * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
 * is about to be zapped, e.g. in response to a memslots update.  The actual
 * zapping is done separately so that it happens with mmu_lock held for read,
 * whereas invalidating roots must be done with mmu_lock held for write (unless
 * the VM is being destroyed).
 *
 * Note, kvm_tdp_mmu_zap_invalidated_roots() is gifted the TDP MMU's reference.
 * See kvm_tdp_mmu_alloc_root().
 */
void kvm_tdp_mmu_invalidate_roots(struct kvm *kvm,
				  enum kvm_tdp_mmu_root_types root_types)
{
	struct kvm_mmu_page *root;

	/*
	 * Invalidating invalid roots doesn't make sense; prevent developers
	 * from having to think about it.
	 */
	if (WARN_ON_ONCE(root_types & KVM_INVALID_ROOTS))
		root_types &= ~KVM_INVALID_ROOTS;

	/*
	 * mmu_lock must be held for write to ensure that a root doesn't become
	 * invalid while there are active readers (invalidating a root while
	 * there are active readers may or may not be problematic in practice,
	 * but it's uncharted territory and not supported).
	 *
	 * Waive the assertion if there are no users of @kvm, i.e. the VM is
	 * being destroyed after all references have been put, or if no vCPUs
	 * have been created (which means there are no roots), i.e. the VM is
	 * being destroyed in an error path of KVM_CREATE_VM.
	 */
	if (IS_ENABLED(CONFIG_PROVE_LOCKING) &&
	    refcount_read(&kvm->users_count) && kvm->created_vcpus)
		lockdep_assert_held_write(&kvm->mmu_lock);

	/*
	 * As above, mmu_lock isn't held when destroying the VM!  There can't
	 * be other references to @kvm, i.e. nothing else can invalidate roots
	 * or get/put references to roots.
	 */
	list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
		if (!tdp_mmu_root_match(root, root_types))
			continue;

		/*
		 * Note, invalid roots can outlive a memslot update!  Invalid
		 * roots must be *zapped* before the memslot update completes,
		 * but a different task can acquire a reference and keep the
		 * root alive after it's been zapped.
		 */
		if (!root->role.invalid) {
			root->tdp_mmu_scheduled_root_to_zap = true;
			root->role.invalid = true;
		}
	}
}

/*
 * Installs a last-level SPTE to handle a TDP page fault.
 * (NPT/EPT violation/misconfiguration)
 */
static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
					   struct kvm_page_fault *fault,
					   struct tdp_iter *iter)
{
	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
	u64 new_spte;
	int ret = RET_PF_FIXED;
	bool wrprot = false;

	if (WARN_ON_ONCE(sp->role.level != fault->goal_level))
		return RET_PF_RETRY;

	if (is_shadow_present_pte(iter->old_spte) &&
	    (fault->prefetch || is_access_allowed(fault, iter->old_spte)) &&
	    is_last_spte(iter->old_spte, iter->level)) {
		WARN_ON_ONCE(fault->pfn != spte_to_pfn(iter->old_spte));
		return RET_PF_SPURIOUS;
	}

	if (unlikely(!fault->slot))
		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
	else
		wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
				   fault->pfn, iter->old_spte, fault->prefetch,
				   false, fault->map_writable, &new_spte);

	if (new_spte == iter->old_spte)
		ret = RET_PF_SPURIOUS;
	else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
		return RET_PF_RETRY;
	else if (is_shadow_present_pte(iter->old_spte) &&
		 (!is_last_spte(iter->old_spte, iter->level) ||
		  WARN_ON_ONCE(leaf_spte_change_needs_tlb_flush(iter->old_spte, new_spte))))
		kvm_flush_remote_tlbs_gfn(vcpu->kvm, iter->gfn, iter->level);

	/*
	 * If the page fault was caused by a write but the page is write
	 * protected, emulation is needed.  If the emulation was skipped,
	 * the vCPU would have the same fault again.
	 */
	if (wrprot && fault->write)
		ret = RET_PF_WRITE_PROTECTED;

	/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
	if (unlikely(is_mmio_spte(vcpu->kvm, new_spte))) {
		vcpu->stat.pf_mmio_spte_created++;
		trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
				     new_spte);
		ret = RET_PF_EMULATE;
	} else {
		trace_kvm_mmu_set_spte(iter->level, iter->gfn,
				       rcu_dereference(iter->sptep));
	}

	return ret;
}

/*
 * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the
 * provided page table.
 *
 * @kvm: kvm instance
 * @iter: a tdp_iter instance currently on the SPTE that should be set
 * @sp: The new TDP page table to install.
 * @shared: This operation is running under the MMU lock in read mode.
 *
 * Returns: 0 if the new page table was installed. Non-0 if the page table
 *          could not be installed (e.g. the atomic compare-exchange failed).
 */
static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
			   struct kvm_mmu_page *sp, bool shared)
{
	u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled);
	int ret = 0;

	if (shared) {
		ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
		if (ret)
			return ret;
	} else {
		tdp_mmu_iter_set_spte(kvm, iter, spte);
	}

	tdp_account_mmu_page(kvm, sp);

	return 0;
}

static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
				   struct kvm_mmu_page *sp, bool shared);

/*
 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
 * page tables and SPTEs to translate the faulting guest physical address.
 */
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
	struct kvm_mmu_page *root = tdp_mmu_get_root_for_fault(vcpu, fault);
	struct kvm *kvm = vcpu->kvm;
	struct tdp_iter iter;
	struct kvm_mmu_page *sp;
	int ret = RET_PF_RETRY;

	KVM_MMU_WARN_ON(!root || root->role.invalid);

	kvm_mmu_hugepage_adjust(vcpu, fault);

	trace_kvm_mmu_spte_requested(fault);

	rcu_read_lock();

	for_each_tdp_pte(iter, kvm, root, fault->gfn, fault->gfn + 1) {
		int r;

		if (fault->nx_huge_page_workaround_enabled)
			disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);

		/*
		 * If SPTE has been frozen by another thread, just give up and
		 * retry, avoiding unnecessary page table allocation and free.
		 */
		if (is_frozen_spte(iter.old_spte))
			goto retry;

		if (iter.level == fault->goal_level)
			goto map_target_level;

		/* Step down into the lower level page table if it exists. */
		if (is_shadow_present_pte(iter.old_spte) &&
		    !is_large_pte(iter.old_spte))
			continue;

		/*
		 * The SPTE is either non-present or points to a huge page that
		 * needs to be split.
		 */
		sp = tdp_mmu_alloc_sp(vcpu);
		tdp_mmu_init_child_sp(sp, &iter);
		if (is_mirror_sp(sp))
			kvm_mmu_alloc_external_spt(vcpu, sp);

		sp->nx_huge_page_disallowed = fault->huge_page_disallowed;

		if (is_shadow_present_pte(iter.old_spte)) {
			/* Don't support large page for mirrored roots (TDX) */
			KVM_BUG_ON(is_mirror_sptep(iter.sptep), vcpu->kvm);
			r = tdp_mmu_split_huge_page(kvm, &iter, sp, true);
		} else {
			r = tdp_mmu_link_sp(kvm, &iter, sp, true);
		}

		/*
		 * Force the guest to retry if installing an upper level SPTE
		 * failed, e.g. because a different task modified the SPTE.
		 */
		if (r) {
			tdp_mmu_free_sp(sp);
			goto retry;
		}

		if (fault->huge_page_disallowed &&
		    fault->req_level >= iter.level) {
			spin_lock(&kvm->arch.tdp_mmu_pages_lock);
			if (sp->nx_huge_page_disallowed)
				track_possible_nx_huge_page(kvm, sp, KVM_TDP_MMU);
			spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
		}
	}

	/*
	 * The walk aborted before reaching the target level, e.g. because the
	 * iterator detected an upper level SPTE was frozen during traversal.
	 */
	WARN_ON_ONCE(iter.level == fault->goal_level);
	goto retry;

map_target_level:
	ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);

retry:
	rcu_read_unlock();
	return ret;
}

/* Used by mmu notifier via kvm_unmap_gfn_range() */
bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
				 bool flush)
{
	enum kvm_tdp_mmu_root_types types;
	struct kvm_mmu_page *root;

	types = kvm_gfn_range_filter_to_root_types(kvm, range->attr_filter) | KVM_INVALID_ROOTS;

	__for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, types)
		flush = tdp_mmu_zap_leafs(kvm, root, range->start, range->end,
					  range->may_block, flush);

	return flush;
}

/*
 * Mark the SPTEs in the range of GFNs [start, end) unaccessed and return
 * non-zero if any of the GFNs in the range have been accessed.
 *
 * No need to mark the corresponding PFN as accessed as this call is coming
 * from the clear_young() or clear_flush_young() notifier, which uses the
 * return value to determine if the page has been accessed.
 */
static void kvm_tdp_mmu_age_spte(struct kvm *kvm, struct tdp_iter *iter)
{
	u64 new_spte;

	if (spte_ad_enabled(iter->old_spte)) {
		iter->old_spte = tdp_mmu_clear_spte_bits_atomic(iter->sptep,
								shadow_accessed_mask);
		new_spte = iter->old_spte & ~shadow_accessed_mask;
	} else {
		new_spte = mark_spte_for_access_track(iter->old_spte);
		/*
		 * It is safe for the following cmpxchg to fail.  Leave the
		 * Accessed bit set, as the spte is most likely young anyway.
		 */
		if (__tdp_mmu_set_spte_atomic(kvm, iter, new_spte))
			return;
	}

	trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level,
				       iter->old_spte, new_spte);
}

static bool __kvm_tdp_mmu_age_gfn_range(struct kvm *kvm,
					struct kvm_gfn_range *range,
					bool test_only)
{
	enum kvm_tdp_mmu_root_types types;
	struct kvm_mmu_page *root;
	struct tdp_iter iter;
	bool ret = false;

	types = kvm_gfn_range_filter_to_root_types(kvm, range->attr_filter);

	/*
	 * Don't support rescheduling, none of the MMU notifiers that funnel
	 * into this helper allow blocking; it'd be dead, wasteful code.  Note,
	 * this helper must NOT be used to unmap GFNs, as it processes only
	 * valid roots!
	 */
	WARN_ON(types & ~KVM_VALID_ROOTS);

	guard(rcu)();
	for_each_tdp_mmu_root_rcu(kvm, root, range->slot->as_id, types) {
		tdp_root_for_each_leaf_pte(iter, kvm, root, range->start, range->end) {
			if (!is_accessed_spte(iter.old_spte))
				continue;

			if (test_only)
				return true;

			ret = true;
			kvm_tdp_mmu_age_spte(kvm, &iter);
		}
	}

	return ret;
}

bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
	return __kvm_tdp_mmu_age_gfn_range(kvm, range, false);
}

bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	return __kvm_tdp_mmu_age_gfn_range(kvm, range, true);
}

/*
 * Remove write access from all SPTEs at or above min_level that map GFNs
 * [start, end). Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			     gfn_t start, gfn_t end, int min_level)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	rcu_read_lock();

	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);

	for_each_tdp_pte_min_level(iter, kvm, root, min_level, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
			continue;

		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level) ||
		    !(iter.old_spte & PT_WRITABLE_MASK))
			continue;

		new_spte = iter.old_spte & ~PT_WRITABLE_MASK;

		if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
			goto retry;

		spte_set = true;
	}

	rcu_read_unlock();
	return spte_set;
}

/*
 * Remove write access from all the SPTEs mapping GFNs in the memslot.  Will
 * only affect leaf SPTEs down to min_level.
 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
 */
bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
			     const struct kvm_memory_slot *slot, int min_level)
{
	struct kvm_mmu_page *root;
	bool spte_set = false;

	lockdep_assert_held_read(&kvm->mmu_lock);

	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
		spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
			     slot->base_gfn + slot->npages, min_level);

	return spte_set;
}

static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(void)
{
	struct kvm_mmu_page *sp;

	sp = kmem_cache_zalloc(mmu_page_header_cache, GFP_KERNEL_ACCOUNT);
	if (!sp)
		return NULL;

	sp->spt = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
	if (!sp->spt) {
		kmem_cache_free(mmu_page_header_cache, sp);
		return NULL;
	}

	return sp;
}

/* Note, the caller is responsible for initializing @sp. */
static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
				   struct kvm_mmu_page *sp, bool shared)
{
	const u64 huge_spte = iter->old_spte;
	const int level = iter->level;
	int ret, i;

	/*
	 * No need for atomics when writing to sp->spt since the page table has
	 * not been linked in yet and thus is not reachable from any other CPU.
	 */
	for (i = 0; i < SPTE_ENT_PER_PAGE; i++)
		sp->spt[i] = make_small_spte(kvm, huge_spte, sp->role, i);

	/*
	 * Replace the huge spte with a pointer to the populated lower level
	 * page table.  Since we are making this change without a TLB flush vCPUs
	 * will see a mix of the split mappings and the original huge mapping,
	 * depending on what's currently in their TLB.  This is fine from a
	 * correctness standpoint since the translation will be the same either
	 * way.
	 */
	ret = tdp_mmu_link_sp(kvm, iter, sp, shared);
	if (ret)
		goto out;

	/*
	 * tdp_mmu_link_sp() will handle subtracting the huge page we are
	 * overwriting from the page stats.  But we have to manually update
	 * the page stats with the new present child pages.
	 */
	kvm_update_page_stats(kvm, level - 1, SPTE_ENT_PER_PAGE);

out:
	trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret);
	return ret;
}
1557
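/*
 * Split all huge pages above @target_level in the [start, end) range of
 * @root. If no pre-allocated shadow page is on hand, mmu_lock and the RCU
 * read lock are dropped to allocate one and the current SPTE is revisited.
 * Returns 0 on success, or -ENOMEM if a shadow page cannot be allocated.
 */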
static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
					 struct kvm_mmu_page *root,
					 gfn_t start, gfn_t end,
					 int target_level, bool shared)
{
	struct kvm_mmu_page *sp = NULL;
	struct tdp_iter iter;

	rcu_read_lock();

	/*
	 * Traverse the page table splitting all huge pages above the target
	 * level into one lower level. For example, if we encounter a 1GB page
	 * we split it into 512 2MB pages.
	 *
	 * Since the TDP iterator uses a pre-order traversal, we are guaranteed
	 * to visit an SPTE before ever visiting its children, which means we
	 * will correctly recursively split huge pages that are more than one
	 * level above the target level (e.g. splitting a 1GB page into 512
	 * 2MB pages, and then splitting each of those into 512 4KB pages).
	 */
	for_each_tdp_pte_min_level(iter, kvm, root, target_level + 1, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
			continue;

		if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
			continue;

		if (!sp) {
			rcu_read_unlock();

			if (shared)
				read_unlock(&kvm->mmu_lock);
			else
				write_unlock(&kvm->mmu_lock);

			sp = tdp_mmu_alloc_sp_for_split();

			if (shared)
				read_lock(&kvm->mmu_lock);
			else
				write_lock(&kvm->mmu_lock);

			if (!sp) {
				trace_kvm_mmu_split_huge_page(iter.gfn,
							      iter.old_spte,
							      iter.level, -ENOMEM);
				return -ENOMEM;
			}

			rcu_read_lock();

			iter.yielded = true;
			continue;
		}

		tdp_mmu_init_child_sp(sp, &iter);

		if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
			goto retry;

		sp = NULL;
	}

	rcu_read_unlock();

	/*
	 * It's possible to exit the loop having never used the last sp if, for
	 * example, a vCPU doing HugePage NX splitting wins the race and
	 * installs its own sp in place of the last sp we tried to split.
	 */
	if (sp)
		tdp_mmu_free_sp(sp);

	return 0;
}

/*
 * Try to split all huge pages mapped by the TDP MMU down to the target level.
 */
void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
				      const struct kvm_memory_slot *slot,
				      gfn_t start, gfn_t end,
				      int target_level, bool shared)
{
	struct kvm_mmu_page *root;
	int r = 0;

	kvm_lockdep_assert_mmu_lock_held(kvm, shared);
	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id) {
		r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared);
		if (r) {
			kvm_tdp_mmu_put_root(kvm, root);
			break;
		}
	}
}

static bool tdp_mmu_need_write_protect(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	/*
	 * All TDP MMU shadow pages share the same role as their root, aside
	 * from level, so it is valid to key off any shadow page to determine if
	 * write protection is needed for an entire tree.
	 */
	return kvm_mmu_page_ad_need_write_protect(kvm, sp) || !kvm_ad_enabled;
}

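/*
 * Clear the dirty-tracking bit, i.e. the W-bit if write protection is
 * required or the D-bit otherwise, of all leaf SPTEs in the [start, end)
 * range of @root, yielding and retrying atomic updates as needed.
 */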
static void clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
				  gfn_t start, gfn_t end)
{
	const u64 dbit = tdp_mmu_need_write_protect(kvm, root) ?
			 PT_WRITABLE_MASK : shadow_dirty_mask;
	struct tdp_iter iter;

	rcu_read_lock();

	tdp_root_for_each_pte(iter, kvm, root, start, end) {
retry:
		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
			continue;

		KVM_MMU_WARN_ON(dbit == shadow_dirty_mask &&
				spte_ad_need_write_protect(iter.old_spte));

		if (!(iter.old_spte & dbit))
			continue;

		if (tdp_mmu_set_spte_atomic(kvm, &iter, iter.old_spte & ~dbit))
			goto retry;
	}

	rcu_read_unlock();
}

/*
 * Clear the dirty status (D-bit or W-bit) of all the SPTEs mapping GFNs in the
 * memslot.
 */
void kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
				  const struct kvm_memory_slot *slot)
{
	struct kvm_mmu_page *root;

	lockdep_assert_held_read(&kvm->mmu_lock);
	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
		clear_dirty_gfn_range(kvm, root, slot->base_gfn,
				      slot->base_gfn + slot->npages);
}

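/*
 * Clear the dirty-tracking bit, i.e. the W-bit if @wrprot is true or write
 * protection is required, or the D-bit otherwise, of the 4K SPTEs mapping
 * the GFNs selected by @mask, relative to @gfn. Bits are dropped from @mask
 * as the corresponding SPTEs are visited, and the walk stops once @mask is
 * empty.
 */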
static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
				  gfn_t gfn, unsigned long mask, bool wrprot)
{
	const u64 dbit = (wrprot || tdp_mmu_need_write_protect(kvm, root)) ?
			 PT_WRITABLE_MASK : shadow_dirty_mask;
	struct tdp_iter iter;

	lockdep_assert_held_write(&kvm->mmu_lock);

	rcu_read_lock();

	tdp_root_for_each_leaf_pte(iter, kvm, root, gfn + __ffs(mask),
				   gfn + BITS_PER_LONG) {
		if (!mask)
			break;

		KVM_MMU_WARN_ON(dbit == shadow_dirty_mask &&
				spte_ad_need_write_protect(iter.old_spte));

		if (iter.level > PG_LEVEL_4K ||
		    !(mask & (1UL << (iter.gfn - gfn))))
			continue;

		mask &= ~(1UL << (iter.gfn - gfn));

		if (!(iter.old_spte & dbit))
			continue;

		iter.old_spte = tdp_mmu_clear_spte_bits(iter.sptep,
							iter.old_spte, dbit,
							iter.level);

		trace_kvm_tdp_mmu_spte_changed(iter.as_id, iter.gfn, iter.level,
					       iter.old_spte,
					       iter.old_spte & ~dbit);
	}

	rcu_read_unlock();
}

/*
 * Clear the dirty status (D-bit or W-bit) of all the 4k SPTEs mapping GFNs for
 * which a bit is set in mask, starting at gfn. The given memslot is expected to
 * contain all the GFNs represented by set bits in the mask.
 */
void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
				       struct kvm_memory_slot *slot,
				       gfn_t gfn, unsigned long mask,
				       bool wrprot)
{
	struct kvm_mmu_page *root;

	for_each_valid_tdp_mmu_root(kvm, root, slot->as_id)
		clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
}

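/*
 * Construct a huge SPTE to replace @parent's non-leaf SPTE, based on the
 * first leaf SPTE found in the child page table. Returns 0 and fills in
 * @huge_spte on success, -EAGAIN if yielding is needed, or -ENOENT if the
 * child page table contains no leaf SPTEs.
 */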
static int tdp_mmu_make_huge_spte(struct kvm *kvm,
				  struct tdp_iter *parent,
				  u64 *huge_spte)
{
	struct kvm_mmu_page *root = spte_to_child_sp(parent->old_spte);
	gfn_t start = parent->gfn;
	gfn_t end = start + KVM_PAGES_PER_HPAGE(parent->level);
	struct tdp_iter iter;

	tdp_root_for_each_leaf_pte(iter, kvm, root, start, end) {
		/*
		 * Use the parent iterator when checking for forward progress so
		 * that KVM doesn't get stuck continuously trying to yield (i.e.
		 * returning -EAGAIN here and then failing the forward progress
		 * check in the caller ad nauseam).
		 */
		if (tdp_mmu_iter_need_resched(kvm, parent))
			return -EAGAIN;

		*huge_spte = make_huge_spte(kvm, iter.old_spte, parent->level);
		return 0;
	}

	return -ENOENT;
}

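/*
 * Replace non-leaf SPTEs covering GFNs in @slot with huge SPTEs where the
 * mapping level allows, flushing TLBs if any SPTE is changed. Bails (with a
 * WARN) if dirty tracking is still enabled for the slot, as huge mappings
 * are incompatible with fine-grained dirty logging.
 */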
static void recover_huge_pages_range(struct kvm *kvm,
				     struct kvm_mmu_page *root,
				     const struct kvm_memory_slot *slot)
{
	gfn_t start = slot->base_gfn;
	gfn_t end = start + slot->npages;
	struct tdp_iter iter;
	int max_mapping_level;
	bool flush = false;
	u64 huge_spte;
	int r;

	if (WARN_ON_ONCE(kvm_slot_dirty_track_enabled(slot)))
		return;

	rcu_read_lock();

	for_each_tdp_pte_min_level(iter, kvm, root, PG_LEVEL_2M, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, true)) {
			flush = false;
			continue;
		}

		if (iter.level > KVM_MAX_HUGEPAGE_LEVEL ||
		    !is_shadow_present_pte(iter.old_spte))
			continue;

		/*
		 * Don't zap leaf SPTEs; if a leaf SPTE could be replaced with
		 * a large page size, then its parent would have been zapped
		 * instead of stepping down.
		 */
		if (is_last_spte(iter.old_spte, iter.level))
			continue;

		/*
		 * If iter.gfn resides outside of the slot, i.e. the page for
		 * the current level overlaps but is not contained by the slot,
		 * then the SPTE can't be made huge. More importantly, trying
		 * to query that info from slot->arch.lpage_info will cause an
		 * out-of-bounds access.
		 */
		if (iter.gfn < start || iter.gfn >= end)
			continue;

		max_mapping_level = kvm_mmu_max_mapping_level(kvm, NULL, slot, iter.gfn);
		if (max_mapping_level < iter.level)
			continue;

		r = tdp_mmu_make_huge_spte(kvm, &iter, &huge_spte);
		if (r == -EAGAIN)
			goto retry;
		else if (r)
			continue;

		if (tdp_mmu_set_spte_atomic(kvm, &iter, huge_spte))
			goto retry;

		flush = true;
	}

	if (flush)
		kvm_flush_remote_tlbs_memslot(kvm, slot);

	rcu_read_unlock();
}

/*
 * Recover huge page mappings within the slot by replacing non-leaf SPTEs with
 * huge SPTEs where possible.
 */
void kvm_tdp_mmu_recover_huge_pages(struct kvm *kvm,
				    const struct kvm_memory_slot *slot)
{
	struct kvm_mmu_page *root;

	lockdep_assert_held_read(&kvm->mmu_lock);
	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
		recover_huge_pages_range(kvm, root, slot);
}

/*
 * Removes write access on the last level SPTE mapping this GFN and unsets the
 * MMU-writable bit to ensure future writes continue to be intercepted.
 * Returns true if an SPTE was set and a TLB flush is needed.
 */
static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
			      gfn_t gfn, int min_level)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);

	rcu_read_lock();

	for_each_tdp_pte_min_level(iter, kvm, root, min_level, gfn, gfn + 1) {
		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		new_spte = iter.old_spte &
			~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);

		if (new_spte == iter.old_spte)
			break;

		tdp_mmu_iter_set_spte(kvm, &iter, new_spte);
		spte_set = true;
	}

	rcu_read_unlock();

	return spte_set;
}

/*
 * Removes write access on the last level SPTE mapping this GFN and unsets the
 * MMU-writable bit to ensure future writes continue to be intercepted.
 * Returns true if an SPTE was set and a TLB flush is needed.
 */
bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
				   struct kvm_memory_slot *slot, gfn_t gfn,
				   int min_level)
{
	struct kvm_mmu_page *root;
	bool spte_set = false;

	lockdep_assert_held_write(&kvm->mmu_lock);
	for_each_valid_tdp_mmu_root(kvm, root, slot->as_id)
		spte_set |= write_protect_gfn(kvm, root, gfn, min_level);

	return spte_set;
}

/*
 * Return the level of the lowest level SPTE added to sptes.
 * That SPTE may be non-present.
 *
 * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
 */
int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
			 int *root_level)
{
	struct kvm_mmu_page *root = root_to_sp(vcpu->arch.mmu->root.hpa);
	struct tdp_iter iter;
	gfn_t gfn = addr >> PAGE_SHIFT;
	int leaf = -1;

	*root_level = vcpu->arch.mmu->root_role.level;

	for_each_tdp_pte(iter, vcpu->kvm, root, gfn, gfn + 1) {
		leaf = iter.level;
		sptes[leaf] = iter.old_spte;
	}

	return leaf;
}

/*
 * Returns the last level spte pointer of the shadow page walk for the given
 * gpa, and sets *spte to the spte value. This spte may be non-present. If no
 * walk could be performed, returns NULL and *spte does not contain valid data.
 *
 * Contract:
 *  - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
 *  - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
 *
 * WARNING: This function is only intended to be called during fast_page_fault.
 */
u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gfn_t gfn,
					u64 *spte)
{
	/* Fast pf is not supported for mirrored roots */
	struct kvm_mmu_page *root = tdp_mmu_get_root(vcpu, KVM_DIRECT_ROOTS);
	struct tdp_iter iter;
	tdp_ptep_t sptep = NULL;

	for_each_tdp_pte(iter, vcpu->kvm, root, gfn, gfn + 1) {
		*spte = iter.old_spte;
		sptep = iter.sptep;
	}

	/*
	 * Perform the rcu_dereference to get the raw spte pointer value since
	 * we are passing it up to fast_page_fault, which is shared with the
	 * legacy MMU and thus does not retain the TDP MMU-specific __rcu
	 * annotation.
	 *
	 * This is safe since fast_page_fault obeys the contracts of this
	 * function as well as all TDP MMU contracts around modifying SPTEs
	 * outside of mmu_lock.
	 */
	return rcu_dereference(sptep);
}
