// SPDX-License-Identifier: GPL-2.0
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include "mmu.h"
#include "mmu_internal.h"
#include "mmutrace.h"
#include "tdp_iter.h"
#include "tdp_mmu.h"
#include "spte.h"

#include <asm/cmpxchg.h>
#include <trace/events/kvm.h>

/* Initializes the TDP MMU for the VM, if enabled. */
void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
{
	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
}

/* Arbitrarily returns true so that this may be used in if statements. */
static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
							     bool shared)
{
	if (shared)
		lockdep_assert_held_read(&kvm->mmu_lock);
	else
		lockdep_assert_held_write(&kvm->mmu_lock);

	return true;
}

void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
{
	/*
	 * Invalidate all roots, which, besides the obvious, schedules all
	 * roots for zapping and thus puts the TDP MMU's reference to each
	 * root, i.e. ultimately frees all roots.
	 */
	kvm_tdp_mmu_invalidate_roots(kvm, KVM_VALID_ROOTS);
	kvm_tdp_mmu_zap_invalidated_roots(kvm, false);

	WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages));
	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));

	/*
	 * Ensure that all the outstanding RCU callbacks to free shadow pages
	 * can run before the VM is torn down. Putting the last reference to
	 * zapped roots will create new callbacks.
	 */
	rcu_barrier();
}

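/*
 * Free the memory backing a TDP MMU shadow page: the external page table
 * page (if one was allocated, e.g. for a mirror root), the page table
 * itself, and the page header.
 */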
static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
{
	free_page((unsigned long)sp->external_spt);
	free_page((unsigned long)sp->spt);
	kmem_cache_free(mmu_page_header_cache, sp);
}

/*
 * This is called through call_rcu in order to free TDP page table memory
 * safely with respect to other kernel threads that may be operating on
 * the memory.
 * By only accessing TDP MMU page table memory in an RCU read critical
 * section, and freeing it after a grace period, lockless access to that
 * memory won't use it after it is freed.
 */
static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
{
	struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
					       rcu_head);

	tdp_mmu_free_sp(sp);
}

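/*
 * Drop a reference to @root. If it was the last reference, remove the root
 * from the VM's list of roots and free it after an RCU grace period.
 */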
void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
{
	if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
		return;

	/*
	 * The TDP MMU itself holds a reference to each root until the root is
	 * explicitly invalidated, i.e. the final reference should never be
	 * put for a valid root.
	 */
	KVM_BUG_ON(!is_tdp_mmu_page(root) || !root->role.invalid, kvm);

	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	list_del_rcu(&root->link);
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
	call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
}

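/* Returns true if @root matches the requested @types (validity, direct vs. mirror). */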
static bool tdp_mmu_root_match(struct kvm_mmu_page *root,
			       enum kvm_tdp_mmu_root_types types)
{
	if (WARN_ON_ONCE(!(types & KVM_VALID_ROOTS)))
		return false;

	if (root->role.invalid && !(types & KVM_INVALID_ROOTS))
		return false;

	if (likely(!is_mirror_sp(root)))
		return types & KVM_DIRECT_ROOTS;
	return types & KVM_MIRROR_ROOTS;
}

/*
 * Returns the next root after @prev_root (or the first root if @prev_root is
 * NULL) that matches @types. A reference to the returned root is acquired,
 * and the reference to @prev_root is released (the caller obviously must
 * hold a reference to @prev_root if it's non-NULL).
 *
 * Roots that don't match @types are skipped.
 *
 * Returns NULL if the end of tdp_mmu_roots was reached.
 */
static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
					      struct kvm_mmu_page *prev_root,
					      enum kvm_tdp_mmu_root_types types)
{
	struct kvm_mmu_page *next_root;

	/*
	 * While the roots themselves are RCU-protected, fields such as
	 * role.invalid are protected by mmu_lock.
	 */
	lockdep_assert_held(&kvm->mmu_lock);

	rcu_read_lock();

	if (prev_root)
		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						  &prev_root->link,
						  typeof(*prev_root), link);
	else
		next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
						   typeof(*next_root), link);

	while (next_root) {
		if (tdp_mmu_root_match(next_root, types) &&
		    kvm_tdp_mmu_get_root(next_root))
			break;

		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
				&next_root->link, typeof(*next_root), link);
	}

	rcu_read_unlock();

	if (prev_root)
		kvm_tdp_mmu_put_root(kvm, prev_root);

	return next_root;
}

/*
 * Note: this iterator gets and puts references to the roots it iterates over.
 * This makes it safe to release the MMU lock and yield within the loop, but
 * if exiting the loop early, the caller must drop the reference to the most
 * recent root. (Unless keeping a live reference is desirable.)
 *
 * If shared is set, this function is operating under the MMU lock in read
 * mode.
 */
#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _types)	\
	for (_root = tdp_mmu_next_root(_kvm, NULL, _types);		\
	     ({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root;	\
	     _root = tdp_mmu_next_root(_kvm, _root, _types))		\
		if (_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) {	\
		} else

#define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id)	\
	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, KVM_VALID_ROOTS)

#define for_each_tdp_mmu_root_yield_safe(_kvm, _root)			\
	for (_root = tdp_mmu_next_root(_kvm, NULL, KVM_ALL_ROOTS);	\
	     ({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root;	\
	     _root = tdp_mmu_next_root(_kvm, _root, KVM_ALL_ROOTS))

/*
 * Iterate over all TDP MMU roots. Requires that mmu_lock be held for write,
 * the implication being that any flow that holds mmu_lock for read is
 * inherently yield-friendly and should use the yield-safe variant above.
 * Holding mmu_lock for write obviates the need for RCU protection as the list
 * is guaranteed to be stable.
 */
#define __for_each_tdp_mmu_root(_kvm, _root, _as_id, _types)			\
	list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)		\
		if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) &&		\
		    ((_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) ||	\
		     !tdp_mmu_root_match((_root), (_types)))) {			\
		} else

#define for_each_valid_tdp_mmu_root(_kvm, _root, _as_id)		\
	__for_each_tdp_mmu_root(_kvm, _root, _as_id, KVM_VALID_ROOTS)

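/* Allocate a shadow page and its page table from the vCPU's MMU caches. */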
static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu_page *sp;

	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);

	return sp;
}

static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
			    gfn_t gfn, union kvm_mmu_page_role role)
{
	INIT_LIST_HEAD(&sp->possible_nx_huge_page_link);

	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);

	sp->role = role;
	sp->gfn = gfn;
	sp->ptep = sptep;
	sp->tdp_mmu_page = true;

	trace_kvm_mmu_get_page(sp, true);
}

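/* Initialize @child_sp as a child of the page table that owns @iter's SPTE. */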
static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
				  struct tdp_iter *iter)
{
	struct kvm_mmu_page *parent_sp;
	union kvm_mmu_page_role role;

	parent_sp = sptep_to_sp(rcu_dereference(iter->sptep));

	role = parent_sp->role;
	role.level--;

	tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
}

void kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu, bool mirror)
{
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	union kvm_mmu_page_role role = mmu->root_role;
	int as_id = kvm_mmu_role_as_id(role);
	struct kvm *kvm = vcpu->kvm;
	struct kvm_mmu_page *root;

	if (mirror)
		role.is_mirror = true;

	/*
	 * Check for an existing root before acquiring the pages lock to avoid
	 * unnecessary serialization if multiple vCPUs are loading a new root.
	 * E.g. when bringing up secondary vCPUs, KVM will already have created
	 * a valid root on behalf of the primary vCPU.
	 */
	read_lock(&kvm->mmu_lock);

	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, as_id) {
		if (root->role.word == role.word)
			goto out_read_unlock;
	}

	spin_lock(&kvm->arch.tdp_mmu_pages_lock);

	/*
	 * Recheck for an existing root after acquiring the pages lock, another
	 * vCPU may have raced ahead and created a new usable root. Manually
	 * walk the list of roots as the standard macros assume that the pages
	 * lock is *not* held. WARN if grabbing a reference to a usable root
	 * fails, as the last reference to a root can only be put *after* the
	 * root has been invalidated, which requires holding mmu_lock for write.
	 */
	list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
		if (root->role.word == role.word &&
		    !WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root)))
			goto out_spin_unlock;
	}

	root = tdp_mmu_alloc_sp(vcpu);
	tdp_mmu_init_sp(root, NULL, 0, role);

	/*
	 * TDP MMU roots are kept until they are explicitly invalidated, either
	 * by a memslot update or by the destruction of the VM. Initialize the
	 * refcount to two; one reference for the vCPU, and one reference for
	 * the TDP MMU itself, which is held until the root is invalidated and
	 * is ultimately put by kvm_tdp_mmu_zap_invalidated_roots().
	 */
	refcount_set(&root->tdp_mmu_root_count, 2);
	list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);

out_spin_unlock:
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
out_read_unlock:
	read_unlock(&kvm->mmu_lock);
	/*
	 * Note, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS will prevent entering the guest
	 * and actually consuming the root if it's invalidated after dropping
	 * mmu_lock, and the root can't be freed as this vCPU holds a reference.
	 */
	if (mirror) {
		mmu->mirror_root_hpa = __pa(root->spt);
	} else {
		mmu->root.hpa = __pa(root->spt);
		mmu->root.pgd = 0;
	}
}

static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level,
				bool shared);

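/*
 * (Un)account a TDP MMU page table page in the VM's page table stats and in
 * the VM-wide count of TDP MMU pages.
 */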
static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	kvm_account_pgtable_pages((void *)sp->spt, +1);
	atomic64_inc(&kvm->arch.tdp_mmu_pages);
}

static void tdp_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	kvm_account_pgtable_pages((void *)sp->spt, -1);
	atomic64_dec(&kvm->arch.tdp_mmu_pages);
}

/**
 * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages
 *
 * @kvm: kvm instance
 * @sp: the page to be removed
 */
static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	tdp_unaccount_mmu_page(kvm, sp);

	if (!sp->nx_huge_page_disallowed)
		return;

	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
	sp->nx_huge_page_disallowed = false;
	untrack_possible_nx_huge_page(kvm, sp);
	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}

static void remove_external_spte(struct kvm *kvm, gfn_t gfn, u64 old_spte,
				 int level)
{
	kvm_pfn_t old_pfn = spte_to_pfn(old_spte);
	int ret;

	/*
	 * External (TDX) SPTEs are limited to PG_LEVEL_4K, and external
	 * PTs are removed in a special order, involving free_external_spt().
	 * But remove_external_spte() will be called on non-leaf PTEs via
	 * __tdp_mmu_zap_root(), so avoid the error the former would return
	 * in this case.
	 */
	if (!is_last_spte(old_spte, level))
		return;

	/* Zapping a leaf SPTE is allowed only when the write lock is held. */
	lockdep_assert_held_write(&kvm->mmu_lock);
	/* Because the write lock is held, the operation should succeed. */
	ret = static_call(kvm_x86_remove_external_spte)(kvm, gfn, level, old_pfn);
	KVM_BUG_ON(ret, kvm);
}

/**
 * handle_removed_pt() - handle a page table removed from the TDP structure
 *
 * @kvm: kvm instance
 * @pt: the page removed from the paging structure
 * @shared: This operation may not be running under the exclusive use
 *	    of the MMU lock and the operation must synchronize with other
 *	    threads that might be modifying SPTEs.
 *
 * Given a page table that has been removed from the TDP paging structure,
 * iterates through the page table to clear SPTEs and free child page tables.
 *
 * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
 * protection. Since this thread removed it from the paging structure,
 * this thread will be responsible for ensuring the page is freed. Hence the
 * early rcu_dereferences in the function.
 */
static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
{
	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
	int level = sp->role.level;
	gfn_t base_gfn = sp->gfn;
	int i;

	trace_kvm_mmu_prepare_zap_page(sp);

	tdp_mmu_unlink_sp(kvm, sp);

	for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
		tdp_ptep_t sptep = pt + i;
		gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
		u64 old_spte;

		if (shared) {
			/*
			 * Set the SPTE to a nonpresent value that other
			 * threads will not overwrite. If the SPTE was
			 * already marked as frozen then another thread
			 * handling a page fault could overwrite it, so
			 * keep setting the SPTE until the write transitions
			 * it from some other value to the frozen SPTE value.
			 */
			for (;;) {
				old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, FROZEN_SPTE);
				if (!is_frozen_spte(old_spte))
					break;
				cpu_relax();
			}
		} else {
			/*
			 * If the SPTE is not MMU-present, there is no backing
			 * page associated with the SPTE and so no side effects
			 * that need to be recorded, and exclusive ownership of
			 * mmu_lock ensures the SPTE can't be made present.
			 * Note, zapping MMIO SPTEs is also unnecessary as they
			 * are guarded by the memslots generation, not by being
			 * unreachable.
			 */
			old_spte = kvm_tdp_mmu_read_spte(sptep);
			if (!is_shadow_present_pte(old_spte))
				continue;

			/*
			 * Use the common helper instead of a raw WRITE_ONCE as
			 * the SPTE needs to be updated atomically if it can be
			 * modified by a different vCPU outside of mmu_lock.
			 * Even though the parent SPTE is !PRESENT, the TLB
			 * hasn't yet been flushed, and both Intel and AMD
			 * document that A/D assists can use upper-level PxE
			 * entries that are cached in the TLB, i.e. the CPU can
			 * still access the page and mark it dirty.
			 *
			 * No retry is needed in the atomic update path as the
			 * sole concern is dropping a Dirty bit, i.e. no other
			 * task can zap/remove the SPTE as mmu_lock is held for
			 * write. Marking the SPTE as a frozen SPTE is not
			 * strictly necessary for the same reason, but using
			 * the frozen SPTE value keeps the shared/exclusive
			 * paths consistent and allows the handle_changed_spte()
			 * call below to hardcode the new value to FROZEN_SPTE.
			 *
			 * Note, even though dropping a Dirty bit is the only
			 * scenario where a non-atomic update could result in a
			 * functional bug, simply checking the Dirty bit isn't
			 * sufficient as a fast page fault could read the upper
			 * level SPTE before it is zapped, and then make this
			 * target SPTE writable, resume the guest, and set the
			 * Dirty bit between reading the SPTE above and writing
			 * it here.
			 */
			old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte,
							  FROZEN_SPTE, level);
		}
		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
				    old_spte, FROZEN_SPTE, level, shared);

		if (is_mirror_sp(sp)) {
			KVM_BUG_ON(shared, kvm);
			remove_external_spte(kvm, gfn, old_spte, level);
		}
	}

	if (is_mirror_sp(sp) &&
	    WARN_ON(static_call(kvm_x86_free_external_spt)(kvm, base_gfn, sp->role.level,
							   sp->external_spt))) {
		/*
		 * Failed to free the page table page for the mirror page
		 * table; there is nothing further to do. Intentionally leak
		 * the page to prevent the kernel from accessing the encrypted
		 * page.
		 */
		sp->external_spt = NULL;
	}

	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
}

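/*
 * Return the external page table linked by a newly-installed non-leaf SPTE,
 * or NULL if the new SPTE is non-present or a leaf.
 */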
static void *get_external_spt(gfn_t gfn, u64 new_spte, int level)
{
	if (is_shadow_present_pte(new_spte) && !is_last_spte(new_spte, level)) {
		struct kvm_mmu_page *sp = spte_to_child_sp(new_spte);

		WARN_ON_ONCE(sp->role.level + 1 != level);
		WARN_ON_ONCE(sp->gfn != gfn);
		return sp->external_spt;
	}

	return NULL;
}

static int __must_check set_external_spte_present(struct kvm *kvm, tdp_ptep_t sptep,
						  gfn_t gfn, u64 old_spte,
						  u64 new_spte, int level)
{
	bool was_present = is_shadow_present_pte(old_spte);
	bool is_present = is_shadow_present_pte(new_spte);
	bool is_leaf = is_present && is_last_spte(new_spte, level);
	kvm_pfn_t new_pfn = spte_to_pfn(new_spte);
	int ret = 0;

	KVM_BUG_ON(was_present, kvm);

	lockdep_assert_held(&kvm->mmu_lock);
	/*
	 * We need to lock out other updates to the SPTE until the external
	 * page table has been modified. Use FROZEN_SPTE similarly to the
	 * zapping case.
	 */
	if (!try_cmpxchg64(rcu_dereference(sptep), &old_spte, FROZEN_SPTE))
		return -EBUSY;

	/*
	 * Use a different callback to either set up a mid-level external
	 * page table, or a leaf.
	 */
	if (is_leaf) {
		ret = static_call(kvm_x86_set_external_spte)(kvm, gfn, level, new_pfn);
	} else {
		void *external_spt = get_external_spt(gfn, new_spte, level);

		KVM_BUG_ON(!external_spt, kvm);
		ret = static_call(kvm_x86_link_external_spt)(kvm, gfn, level, external_spt);
	}
	if (ret)
		__kvm_tdp_mmu_write_spte(sptep, old_spte);
	else
		__kvm_tdp_mmu_write_spte(sptep, new_spte);
	return ret;
}

/**
 * handle_changed_spte - handle bookkeeping associated with an SPTE change
 * @kvm: kvm instance
 * @as_id: the address space of the paging structure the SPTE was a part of
 * @gfn: the base GFN that was mapped by the SPTE
 * @old_spte: The value of the SPTE before the change
 * @new_spte: The value of the SPTE after the change
 * @level: the level of the PT the SPTE is part of in the paging structure
 * @shared: This operation may not be running under the exclusive use of
 *	    the MMU lock and the operation must synchronize with other
 *	    threads that might be modifying SPTEs.
 *
 * Handle bookkeeping that might result from the modification of a SPTE. Note,
 * dirty logging updates are handled in common code, not here (see make_spte()
 * and fast_pf_fix_direct_spte()).
 */
static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level,
				bool shared)
{
	bool was_present = is_shadow_present_pte(old_spte);
	bool is_present = is_shadow_present_pte(new_spte);
	bool was_leaf = was_present && is_last_spte(old_spte, level);
	bool is_leaf = is_present && is_last_spte(new_spte, level);
	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

	WARN_ON_ONCE(level > PT64_ROOT_MAX_LEVEL);
	WARN_ON_ONCE(level < PG_LEVEL_4K);
	WARN_ON_ONCE(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));

	/*
	 * If this warning were to trigger it would indicate that there was a
	 * missing MMU notifier or a race with some notifier handler.
	 * A present, leaf SPTE should never be directly replaced with another
	 * present leaf SPTE pointing to a different PFN. A notifier handler
	 * should be zapping the SPTE before the main MM's page table is
	 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
	 * thread before replacement.
	 */
	if (was_leaf && is_leaf && pfn_changed) {
		pr_err("Invalid SPTE change: cannot replace a present leaf\n"
		       "SPTE with another present leaf SPTE mapping a\n"
		       "different PFN!\n"
		       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
		       as_id, gfn, old_spte, new_spte, level);

		/*
		 * Crash the host to prevent error propagation and guest data
		 * corruption.
		 */
		BUG();
	}

	if (old_spte == new_spte)
		return;

	trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);

	if (is_leaf)
		check_spte_writable_invariants(new_spte);

	/*
	 * The only time a SPTE should be changed from a non-present to
	 * non-present state is when an MMIO entry is installed/modified/
	 * removed. In that case, there is nothing to do here.
	 */
	if (!was_present && !is_present) {
		/*
		 * If this change does not involve a MMIO SPTE or frozen SPTE,
		 * it is unexpected. Log the change, though it should not
		 * impact the guest since both the former and current SPTEs
		 * are nonpresent.
		 */
		if (WARN_ON_ONCE(!is_mmio_spte(kvm, old_spte) &&
				 !is_mmio_spte(kvm, new_spte) &&
				 !is_frozen_spte(new_spte)))
			pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
			       "should not be replaced with another,\n"
			       "different nonpresent SPTE, unless one or both\n"
			       "are MMIO SPTEs, or the new SPTE is\n"
			       "a temporary frozen SPTE.\n"
			       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
			       as_id, gfn, old_spte, new_spte, level);
		return;
	}

	if (is_leaf != was_leaf)
		kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);

	/*
	 * Recursively handle child PTs if the change removed a subtree from
	 * the paging structure. Note the WARN on the PFN changing without the
	 * SPTE being converted to a hugepage (leaf) or being zapped. Shadow
	 * pages are kernel allocations and should never be migrated.
	 */
	if (was_present && !was_leaf &&
	    (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
		handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
}

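/*
 * Atomically replace the SPTE at @iter with @new_spte, propagating present
 * SPTEs in mirror page tables to the external page tables as well. Returns
 * 0 on success, or -EBUSY (or another error from the external page tables)
 * if the SPTE could not be set.
 */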
static inline int __must_check __tdp_mmu_set_spte_atomic(struct kvm *kvm,
							  struct tdp_iter *iter,
							  u64 new_spte)
{
	/*
	 * The caller is responsible for ensuring the old SPTE is not a FROZEN
	 * SPTE. KVM should never attempt to zap or manipulate a FROZEN SPTE,
	 * and pre-checking before inserting a new SPTE is advantageous as it
	 * avoids unnecessary work.
	 */
	WARN_ON_ONCE(iter->yielded || is_frozen_spte(iter->old_spte));

	if (is_mirror_sptep(iter->sptep) && !is_frozen_spte(new_spte)) {
		int ret;

		/*
		 * Users of atomic zapping don't operate on mirror roots,
		 * so don't handle it and bug the VM if it's seen.
		 */
		if (KVM_BUG_ON(!is_shadow_present_pte(new_spte), kvm))
			return -EBUSY;

		ret = set_external_spte_present(kvm, iter->sptep, iter->gfn,
						iter->old_spte, new_spte, iter->level);
		if (ret)
			return ret;
	} else {
		u64 *sptep = rcu_dereference(iter->sptep);

		/*
		 * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs
		 * and does not hold the mmu_lock. On failure, i.e. if a
		 * different logical CPU modified the SPTE, try_cmpxchg64()
		 * updates iter->old_spte with the current value, so the caller
		 * operates on fresh data, e.g. if it retries
		 * tdp_mmu_set_spte_atomic().
		 */
		if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte))
			return -EBUSY;
	}

	return 0;
}

/*
 * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
 * and handle the associated bookkeeping. Do not mark the page dirty
 * in KVM's dirty bitmaps.
 *
 * If setting the SPTE fails because it has changed, iter->old_spte will be
 * refreshed to the current value of the spte.
 *
 * @kvm: kvm instance
 * @iter: a tdp_iter instance currently on the SPTE that should be set
 * @new_spte: The value the SPTE should be set to
 * Return:
 * * 0 - If the SPTE was set.
 * * -EBUSY - If the SPTE cannot be set. In this case this function will have
 *            no side-effects other than setting iter->old_spte to the last
 *            known value of the spte.
 */
static inline int __must_check tdp_mmu_set_spte_atomic(struct kvm *kvm,
							struct tdp_iter *iter,
							u64 new_spte)
{
	int ret;

	lockdep_assert_held_read(&kvm->mmu_lock);

	ret = __tdp_mmu_set_spte_atomic(kvm, iter, new_spte);
	if (ret)
		return ret;

	handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
			    new_spte, iter->level, true);

	return 0;
}

/*
 * tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
 * @kvm: KVM instance
 * @as_id: Address space ID, i.e. regular vs. SMM
 * @sptep: Pointer to the SPTE
 * @old_spte: The current value of the SPTE
 * @new_spte: The new value that will be set for the SPTE
 * @gfn: The base GFN that was (or will be) mapped by the SPTE
 * @level: The level _containing_ the SPTE (its parent PT's level)
 *
 * Returns the old SPTE value, which _may_ be different than @old_spte if the
 * SPTE had volatile bits.
 */
static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
			    u64 old_spte, u64 new_spte, gfn_t gfn, int level)
{
	lockdep_assert_held_write(&kvm->mmu_lock);

	/*
	 * No thread should be using this function to set SPTEs to or from the
	 * temporary frozen SPTE value.
	 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
	 * should be used. If operating under the MMU lock in write mode, the
	 * use of the frozen SPTE should not be necessary.
	 */
	WARN_ON_ONCE(is_frozen_spte(old_spte) || is_frozen_spte(new_spte));

	old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);

	handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);

	/*
	 * Users that do non-atomic setting of PTEs don't operate on mirror
	 * roots, so don't handle it and bug the VM if it's seen.
	 */
	if (is_mirror_sptep(sptep)) {
		KVM_BUG_ON(is_shadow_present_pte(new_spte), kvm);
		remove_external_spte(kvm, gfn, old_spte, level);
	}

	return old_spte;
}

static inline void tdp_mmu_iter_set_spte(struct kvm *kvm, struct tdp_iter *iter,
					 u64 new_spte)
{
	WARN_ON_ONCE(iter->yielded);
	iter->old_spte = tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep,
					  iter->old_spte, new_spte,
					  iter->gfn, iter->level);
}

#define tdp_root_for_each_pte(_iter, _kvm, _root, _start, _end)	\
	for_each_tdp_pte(_iter, _kvm, _root, _start, _end)

#define tdp_root_for_each_leaf_pte(_iter, _kvm, _root, _start, _end)	\
	tdp_root_for_each_pte(_iter, _kvm, _root, _start, _end)		\
		if (!is_shadow_present_pte(_iter.old_spte) ||		\
		    !is_last_spte(_iter.old_spte, _iter.level))		\
			continue;					\
		else

#define tdp_mmu_for_each_pte(_iter, _kvm, _root, _start, _end)		\
	for_each_tdp_pte(_iter, _kvm, _root, _start, _end)

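/*
 * Returns true if the walk should yield, i.e. if the scheduler wants the CPU
 * or mmu_lock is contended, and forward progress has been made since the
 * last yield.
 */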
static inline bool __must_check tdp_mmu_iter_need_resched(struct kvm *kvm,
							   struct tdp_iter *iter)
{
	if (!need_resched() && !rwlock_needbreak(&kvm->mmu_lock))
		return false;

	/* Ensure forward progress has been made before yielding. */
	return iter->next_last_level_gfn != iter->yielded_gfn;
}

/*
 * Yield if the MMU lock is contended or this thread needs to return control
 * to the scheduler.
 *
 * If this function should yield and flush is set, it will perform a remote
 * TLB flush before yielding.
 *
 * If this function yields, iter->yielded is set and the caller must skip to
 * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk
 * over the paging structures to allow the iterator to continue its traversal
 * from the paging structure root.
 *
 * Returns true if this function yielded.
 */
static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
							  struct tdp_iter *iter,
							  bool flush, bool shared)
{
	KVM_MMU_WARN_ON(iter->yielded);

	if (!tdp_mmu_iter_need_resched(kvm, iter))
		return false;

	if (flush)
		kvm_flush_remote_tlbs(kvm);

	rcu_read_unlock();

	if (shared)
		cond_resched_rwlock_read(&kvm->mmu_lock);
	else
		cond_resched_rwlock_write(&kvm->mmu_lock);

	rcu_read_lock();

	WARN_ON_ONCE(iter->gfn > iter->next_last_level_gfn);

	iter->yielded = true;
	return true;
}

static inline gfn_t tdp_mmu_max_gfn_exclusive(void)
{
	/*
	 * Bound TDP MMU walks at host.MAXPHYADDR. KVM disallows memslots with
	 * a gpa range that would exceed the max gfn, and KVM does not create
	 * MMIO SPTEs for "impossible" gfns, instead sending such accesses down
	 * the slow emulation path every time.
	 */
	return kvm_mmu_max_gfn() + 1;
}

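/* Zap all present SPTEs at exactly @zap_level in @root, yielding as needed. */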
static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
			       bool shared, int zap_level)
{
	struct tdp_iter iter;

	for_each_tdp_pte_min_level_all(iter, root, zap_level) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
			continue;

		if (!is_shadow_present_pte(iter.old_spte))
			continue;

		if (iter.level > zap_level)
			continue;

		if (!shared)
			tdp_mmu_iter_set_spte(kvm, &iter, SHADOW_NONPRESENT_VALUE);
		else if (tdp_mmu_set_spte_atomic(kvm, &iter, SHADOW_NONPRESENT_VALUE))
			goto retry;
	}
}

static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
			     bool shared)
{

	/*
	 * The root must have an elevated refcount so that it's reachable via
	 * mmu_notifier callbacks, which allows this path to yield and drop
	 * mmu_lock. When handling an unmap/release mmu_notifier command, KVM
	 * must drop all references to relevant pages prior to completing the
	 * callback. Dropping mmu_lock with an unreachable root would result
	 * in zapping SPTEs after a relevant mmu_notifier callback completes
	 * and lead to use-after-free as zapping a SPTE triggers "writeback" of
	 * dirty accessed bits to the SPTE's associated struct page.
	 */
	WARN_ON_ONCE(!refcount_read(&root->tdp_mmu_root_count));

	kvm_lockdep_assert_mmu_lock_held(kvm, shared);

	rcu_read_lock();

	/*
	 * Zap roots in multiple passes of decreasing granularity, i.e. zap at
	 * 4KiB=>2MiB=>1GiB=>root, in order to better honor need_resched() (all
	 * preempt models) or mmu_lock contention (full or real-time models).
	 * Zapping at finer granularity marginally increases the total time of
	 * the zap, but in most cases the zap itself isn't latency sensitive.
	 *
	 * If KVM is configured to prove the MMU, skip the 4KiB and 2MiB zaps
	 * in order to mimic the page fault path, which can replace a 1GiB page
	 * table with an equivalent 1GiB hugepage, i.e. can get saddled with
	 * zapping a 1GiB region that's fully populated with 4KiB SPTEs. This
	 * allows verifying that KVM can safely zap 1GiB regions, e.g. without
	 * inducing RCU stalls, without relying on a relatively rare event
	 * (zapping roots is orders of magnitude more common). Note, because
	 * zapping a SP recurses on its children, stepping down to PG_LEVEL_4K
	 * in the iterator itself is unnecessary.
	 */
	if (!IS_ENABLED(CONFIG_KVM_PROVE_MMU)) {
		__tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_4K);
		__tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_2M);
	}
	__tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G);
	__tdp_mmu_zap_root(kvm, root, shared, root->role.level);

	rcu_read_unlock();
}

bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	u64 old_spte;

	/*
	 * This helper intentionally doesn't allow zapping a root shadow page,
	 * which doesn't have a parent page table and thus no associated entry.
	 */
	if (WARN_ON_ONCE(!sp->ptep))
		return false;

	old_spte = kvm_tdp_mmu_read_spte(sp->ptep);
	if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
		return false;

	tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte,
			 SHADOW_NONPRESENT_VALUE, sp->gfn, sp->role.level + 1);

	return true;
}

/*
 * If can_yield is true, will release the MMU lock and reschedule if the
 * scheduler needs the CPU or there is contention on the MMU lock. If this
 * function cannot yield, it will not release the MMU lock or reschedule and
 * the caller must ensure it does not supply too large a GFN range, or the
 * operation can cause a soft lockup.
 */
static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
			      gfn_t start, gfn_t end, bool can_yield, bool flush)
{
	struct tdp_iter iter;

	end = min(end, tdp_mmu_max_gfn_exclusive());

	lockdep_assert_held_write(&kvm->mmu_lock);

	rcu_read_lock();

	for_each_tdp_pte_min_level(iter, kvm, root, PG_LEVEL_4K, start, end) {
		if (can_yield &&
		    tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
			flush = false;
			continue;
		}

		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		tdp_mmu_iter_set_spte(kvm, &iter, SHADOW_NONPRESENT_VALUE);

		/*
		 * Zapping SPTEs in invalid roots doesn't require a TLB flush,
		 * see kvm_tdp_mmu_zap_invalidated_roots() for details.
		 */
		if (!root->role.invalid)
			flush = true;
	}

	rcu_read_unlock();

	/*
	 * Because this flow zaps _only_ leaf SPTEs, the caller doesn't need
	 * to provide RCU protection as no 'struct kvm_mmu_page' will be freed.
	 */
	return flush;
}

/*
 * Zap leaf SPTEs for the range of gfns, [start, end), for all *VALID* roots.
 * Returns true if a TLB flush is needed before releasing the MMU lock, i.e. if
 * one or more SPTEs were zapped since the MMU lock was last acquired.
 */
bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush)
{
	struct kvm_mmu_page *root;

	lockdep_assert_held_write(&kvm->mmu_lock);
	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, -1)
		flush = tdp_mmu_zap_leafs(kvm, root, start, end, true, flush);

	return flush;
}

void kvm_tdp_mmu_zap_all(struct kvm *kvm)
{
	struct kvm_mmu_page *root;

	/*
	 * Zap all direct roots, including invalid direct roots, as all direct
	 * SPTEs must be dropped before returning to the caller. For TDX, mirror
	 * roots don't need handling in response to the mmu notifier (the caller).
	 *
	 * Zap directly even if the root is also being zapped by a concurrent
	 * "fast zap". Walking zapped top-level SPTEs isn't all that expensive
	 * and mmu_lock is already held, which means the other thread has yielded.
	 *
	 * A TLB flush is unnecessary, KVM zaps everything if and only if the VM
	 * is being destroyed or the userspace VMM has exited. In both cases,
	 * KVM_RUN is unreachable, i.e. no vCPUs will ever service the request.
	 */
	lockdep_assert_held_write(&kvm->mmu_lock);
	__for_each_tdp_mmu_root_yield_safe(kvm, root, -1,
					   KVM_DIRECT_ROOTS | KVM_INVALID_ROOTS)
		tdp_mmu_zap_root(kvm, root, false);
}

/*
 * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast
 * zap" completes.
 */
void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm, bool shared)
{
	struct kvm_mmu_page *root;

	if (shared)
		read_lock(&kvm->mmu_lock);
	else
		write_lock(&kvm->mmu_lock);

	for_each_tdp_mmu_root_yield_safe(kvm, root) {
		if (!root->tdp_mmu_scheduled_root_to_zap)
			continue;

		root->tdp_mmu_scheduled_root_to_zap = false;
		KVM_BUG_ON(!root->role.invalid, kvm);

		/*
		 * A TLB flush is not necessary as KVM performs a local TLB
		 * flush when allocating a new root (see kvm_mmu_load()), and
		 * when migrating a vCPU to a different pCPU. Note, the local
		 * TLB flush on reuse also invalidates paging-structure-cache
		 * entries, i.e. TLB entries for intermediate paging structures,
		 * that may be zapped, as such entries are associated with the
		 * ASID on both VMX and SVM.
		 */
		tdp_mmu_zap_root(kvm, root, shared);

		/*
		 * The reference needs to be put *after* zapping the root, as
		 * the root must be reachable by mmu_notifiers while it's being
		 * zapped.
		 */
		kvm_tdp_mmu_put_root(kvm, root);
	}

	if (shared)
		read_unlock(&kvm->mmu_lock);
	else
		write_unlock(&kvm->mmu_lock);
}

/*
 * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
 * is about to be zapped, e.g. in response to a memslots update. The actual
 * zapping is done separately so that it happens with mmu_lock held for read,
 * whereas invalidating roots must be done with mmu_lock held for write (unless
 * the VM is being destroyed).
 *
 * Note, kvm_tdp_mmu_zap_invalidated_roots() is gifted the TDP MMU's reference.
 * See kvm_tdp_mmu_alloc_root().
 */
void kvm_tdp_mmu_invalidate_roots(struct kvm *kvm,
				  enum kvm_tdp_mmu_root_types root_types)
{
	struct kvm_mmu_page *root;

	/*
	 * Invalidating invalid roots doesn't make sense, prevent developers from
	 * having to think about it.
	 */
	if (WARN_ON_ONCE(root_types & KVM_INVALID_ROOTS))
		root_types &= ~KVM_INVALID_ROOTS;

	/*
	 * mmu_lock must be held for write to ensure that a root doesn't become
	 * invalid while there are active readers (invalidating a root while
	 * there are active readers may or may not be problematic in practice,
	 * but it's uncharted territory and not supported).
	 *
	 * Waive the assertion if there are no users of @kvm, i.e. the VM is
	 * being destroyed after all references have been put, or if no vCPUs
	 * have been created (which means there are no roots), i.e. the VM is
	 * being destroyed in an error path of KVM_CREATE_VM.
	 */
	if (IS_ENABLED(CONFIG_PROVE_LOCKING) &&
	    refcount_read(&kvm->users_count) && kvm->created_vcpus)
		lockdep_assert_held_write(&kvm->mmu_lock);

	/*
	 * As above, mmu_lock isn't held when destroying the VM! There can't
	 * be other references to @kvm, i.e. nothing else can invalidate roots
	 * or get/put references to roots.
	 */
	list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
		if (!tdp_mmu_root_match(root, root_types))
			continue;

		/*
		 * Note, invalid roots can outlive a memslot update! Invalid
		 * roots must be *zapped* before the memslot update completes,
		 * but a different task can acquire a reference and keep the
		 * root alive after it's been zapped.
		 */
		if (!root->role.invalid) {
			root->tdp_mmu_scheduled_root_to_zap = true;
			root->role.invalid = true;
		}
	}
}

/*
 * Installs a last-level SPTE to handle a TDP page fault.
 * (NPT/EPT violation/misconfiguration)
 */
static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
					   struct kvm_page_fault *fault,
					   struct tdp_iter *iter)
{
	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
	u64 new_spte;
	int ret = RET_PF_FIXED;
	bool wrprot = false;

	if (WARN_ON_ONCE(sp->role.level != fault->goal_level))
		return RET_PF_RETRY;

	if (fault->prefetch && is_shadow_present_pte(iter->old_spte))
		return RET_PF_SPURIOUS;

	if (is_shadow_present_pte(iter->old_spte) &&
	    is_access_allowed(fault, iter->old_spte) &&
	    is_last_spte(iter->old_spte, iter->level))
		return RET_PF_SPURIOUS;

	if (unlikely(!fault->slot))
		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
	else
		wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
				   fault->pfn, iter->old_spte, fault->prefetch,
				   false, fault->map_writable, &new_spte);

	if (new_spte == iter->old_spte)
		ret = RET_PF_SPURIOUS;
	else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
		return RET_PF_RETRY;
	else if (is_shadow_present_pte(iter->old_spte) &&
		 (!is_last_spte(iter->old_spte, iter->level) ||
		  WARN_ON_ONCE(leaf_spte_change_needs_tlb_flush(iter->old_spte, new_spte))))
		kvm_flush_remote_tlbs_gfn(vcpu->kvm, iter->gfn, iter->level);

	/*
	 * If the page fault was caused by a write but the page is write
	 * protected, emulation is needed. If the emulation was skipped,
	 * the vCPU would have the same fault again.
	 */
	if (wrprot && fault->write)
		ret = RET_PF_WRITE_PROTECTED;

	/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
	if (unlikely(is_mmio_spte(vcpu->kvm, new_spte))) {
		vcpu->stat.pf_mmio_spte_created++;
		trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
				     new_spte);
		ret = RET_PF_EMULATE;
	} else {
		trace_kvm_mmu_set_spte(iter->level, iter->gfn,
				       rcu_dereference(iter->sptep));
	}

	return ret;
}

/*
 * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the
 * provided page table.
 *
 * @kvm: kvm instance
 * @iter: a tdp_iter instance currently on the SPTE that should be set
 * @sp: The new TDP page table to install.
 * @shared: This operation is running under the MMU lock in read mode.
 *
 * Returns: 0 if the new page table was installed. Non-0 if the page table
 *          could not be installed (e.g. the atomic compare-exchange failed).
 */
static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
			   struct kvm_mmu_page *sp, bool shared)
{
	u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled);
	int ret = 0;

	if (shared) {
		ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
		if (ret)
			return ret;
	} else {
		tdp_mmu_iter_set_spte(kvm, iter, spte);
	}

	tdp_account_mmu_page(kvm, sp);

	return 0;
}

static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
				   struct kvm_mmu_page *sp, bool shared);

/*
 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
 * page tables and SPTEs to translate the faulting guest physical address.
 */
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
	struct kvm_mmu_page *root = tdp_mmu_get_root_for_fault(vcpu, fault);
	struct kvm *kvm = vcpu->kvm;
	struct tdp_iter iter;
	struct kvm_mmu_page *sp;
	int ret = RET_PF_RETRY;

	kvm_mmu_hugepage_adjust(vcpu, fault);

	trace_kvm_mmu_spte_requested(fault);

	rcu_read_lock();

	tdp_mmu_for_each_pte(iter, kvm, root, fault->gfn, fault->gfn + 1) {
		int r;

		if (fault->nx_huge_page_workaround_enabled)
			disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);

		/*
		 * If the SPTE has been frozen by another thread, just give up
		 * and retry, avoiding unnecessary page table allocation and
		 * free.
		 */
		if (is_frozen_spte(iter.old_spte))
			goto retry;

		if (iter.level == fault->goal_level)
			goto map_target_level;

		/* Step down into the lower level page table if it exists. */
		if (is_shadow_present_pte(iter.old_spte) &&
		    !is_large_pte(iter.old_spte))
			continue;

		/*
		 * The SPTE is either non-present or points to a huge page that
		 * needs to be split.
		 */
		sp = tdp_mmu_alloc_sp(vcpu);
		tdp_mmu_init_child_sp(sp, &iter);
		if (is_mirror_sp(sp))
			kvm_mmu_alloc_external_spt(vcpu, sp);

		sp->nx_huge_page_disallowed = fault->huge_page_disallowed;

		if (is_shadow_present_pte(iter.old_spte)) {
			/* Large pages aren't supported for mirror roots (TDX). */
			KVM_BUG_ON(is_mirror_sptep(iter.sptep), vcpu->kvm);
			r = tdp_mmu_split_huge_page(kvm, &iter, sp, true);
		} else {
			r = tdp_mmu_link_sp(kvm, &iter, sp, true);
		}

		/*
		 * Force the guest to retry if installing an upper level SPTE
		 * failed, e.g. because a different task modified the SPTE.
		 */
		if (r) {
			tdp_mmu_free_sp(sp);
			goto retry;
		}

		if (fault->huge_page_disallowed &&
		    fault->req_level >= iter.level) {
			spin_lock(&kvm->arch.tdp_mmu_pages_lock);
			if (sp->nx_huge_page_disallowed)
				track_possible_nx_huge_page(kvm, sp);
			spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
		}
	}

	/*
	 * The walk aborted before reaching the target level, e.g. because the
	 * iterator detected an upper level SPTE was frozen during traversal.
	 */
	WARN_ON_ONCE(iter.level == fault->goal_level);
	goto retry;

map_target_level:
	ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);

retry:
	rcu_read_unlock();
	return ret;
}

/* Used by mmu notifier via kvm_unmap_gfn_range() */
bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
				 bool flush)
{
	enum kvm_tdp_mmu_root_types types;
	struct kvm_mmu_page *root;

	types = kvm_gfn_range_filter_to_root_types(kvm, range->attr_filter) | KVM_INVALID_ROOTS;

	__for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, types)
		flush = tdp_mmu_zap_leafs(kvm, root, range->start, range->end,
					  range->may_block, flush);

	return flush;
}

/*
 * Mark the SPTEs mapping GFNs in the range [start, end) unaccessed and return
 * non-zero if any of the GFNs in the range have been accessed.
 *
 * No need to mark the corresponding PFN as accessed as this call is coming
 * from the clear_young() or clear_flush_young() notifier, which uses the
 * return value to determine if the page has been accessed.
 */
static void kvm_tdp_mmu_age_spte(struct tdp_iter *iter)
{
	u64 new_spte;

	if (spte_ad_enabled(iter->old_spte)) {
		iter->old_spte = tdp_mmu_clear_spte_bits(iter->sptep,
							 iter->old_spte,
							 shadow_accessed_mask,
							 iter->level);
		new_spte = iter->old_spte & ~shadow_accessed_mask;
	} else {
		new_spte = mark_spte_for_access_track(iter->old_spte);
		iter->old_spte = kvm_tdp_mmu_write_spte(iter->sptep,
							iter->old_spte, new_spte,
							iter->level);
	}

	trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level,
				       iter->old_spte, new_spte);
}

static bool __kvm_tdp_mmu_age_gfn_range(struct kvm *kvm,
					struct kvm_gfn_range *range,
					bool test_only)
{
	enum kvm_tdp_mmu_root_types types;
	struct kvm_mmu_page *root;
	struct tdp_iter iter;
	bool ret = false;

	types = kvm_gfn_range_filter_to_root_types(kvm, range->attr_filter);

	/*
	 * Don't support rescheduling, none of the MMU notifiers that funnel
	 * into this helper allow blocking; it'd be dead, wasteful code. Note,
	 * this helper must NOT be used to unmap GFNs, as it processes only
	 * valid roots!
	 */
	WARN_ON(types & ~KVM_VALID_ROOTS);
	__for_each_tdp_mmu_root(kvm, root, range->slot->as_id, types) {
		guard(rcu)();

		tdp_root_for_each_leaf_pte(iter, kvm, root, range->start, range->end) {
			if (!is_accessed_spte(iter.old_spte))
				continue;

			if (test_only)
				return true;

			ret = true;
			kvm_tdp_mmu_age_spte(&iter);
		}
	}

	return ret;
}

bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
	return __kvm_tdp_mmu_age_gfn_range(kvm, range, false);
}

bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	return __kvm_tdp_mmu_age_gfn_range(kvm, range, true);
}

/*
 * Remove write access from all SPTEs at or above min_level that map GFNs
 * [start, end). Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			     gfn_t start, gfn_t end, int min_level)
{
	struct tdp_iter iter;
	u64 new_spte;
	bool spte_set = false;

	rcu_read_lock();

	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);

	for_each_tdp_pte_min_level(iter, kvm, root, min_level, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
			continue;

		if (!is_shadow_present_pte(iter.old_spte) ||
		    !is_last_spte(iter.old_spte, iter.level) ||
		    !(iter.old_spte & PT_WRITABLE_MASK))
			continue;

		new_spte = iter.old_spte & ~PT_WRITABLE_MASK;

		if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
			goto retry;

		spte_set = true;
	}

	rcu_read_unlock();
	return spte_set;
}

/*
 * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
 * only affect leaf SPTEs down to min_level.
 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
 */
bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
			     const struct kvm_memory_slot *slot, int min_level)
{
	struct kvm_mmu_page *root;
	bool spte_set = false;

	lockdep_assert_held_read(&kvm->mmu_lock);

	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
		spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
					     slot->base_gfn + slot->npages, min_level);

	return spte_set;
}

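/*
 * Allocate a shadow page for huge page splitting, directly from the kernel
 * (GFP_KERNEL_ACCOUNT) rather than from the per-vCPU caches; splitting may
 * run outside of vCPU context, and the callers drop mmu_lock around the
 * allocation so it is allowed to sleep.
 */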
static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(void)
{
	struct kvm_mmu_page *sp;

	sp = kmem_cache_zalloc(mmu_page_header_cache, GFP_KERNEL_ACCOUNT);
	if (!sp)
		return NULL;

	sp->spt = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
	if (!sp->spt) {
		kmem_cache_free(mmu_page_header_cache, sp);
		return NULL;
	}

	return sp;
}

/* Note, the caller is responsible for initializing @sp. */
static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
				   struct kvm_mmu_page *sp, bool shared)
{
	const u64 huge_spte = iter->old_spte;
	const int level = iter->level;
	int ret, i;

	/*
	 * No need for atomics when writing to sp->spt since the page table has
	 * not been linked in yet and thus is not reachable from any other CPU.
	 */
	for (i = 0; i < SPTE_ENT_PER_PAGE; i++)
		sp->spt[i] = make_small_spte(kvm, huge_spte, sp->role, i);

	/*
	 * Replace the huge spte with a pointer to the populated lower level
	 * page table. Since we are making this change without a TLB flush,
	 * vCPUs will see a mix of the split mappings and the original huge
	 * mapping, depending on what's currently in their TLB. This is fine
	 * from a correctness standpoint since the translation will be the same
	 * either way.
	 */
	ret = tdp_mmu_link_sp(kvm, iter, sp, shared);
	if (ret)
		goto out;

	/*
	 * tdp_mmu_link_sp() will handle subtracting the huge page we are
	 * overwriting from the page stats. But we have to manually update
	 * the page stats with the new present child pages.
	 */
	kvm_update_page_stats(kvm, level - 1, SPTE_ENT_PER_PAGE);

out:
	trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret);
	return ret;
}

static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
					 struct kvm_mmu_page *root,
					 gfn_t start, gfn_t end,
					 int target_level, bool shared)
{
	struct kvm_mmu_page *sp = NULL;
	struct tdp_iter iter;

	rcu_read_lock();

	/*
	 * Traverse the page table splitting all huge pages above the target
	 * level into one lower level. For example, if we encounter a 1GB page
	 * we split it into 512 2MB pages.
	 *
	 * Since the TDP iterator uses a pre-order traversal, we are guaranteed
	 * to visit an SPTE before ever visiting its children, which means we
	 * will correctly recursively split huge pages that are more than one
	 * level above the target level (e.g. splitting a 1GB to 512 2MB pages,
	 * and then splitting each of those to 512 4KB pages).
	 */
	for_each_tdp_pte_min_level(iter, kvm, root, target_level + 1, start, end) {
retry:
		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
			continue;

		if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
			continue;

		if (!sp) {
			rcu_read_unlock();

			if (shared)
				read_unlock(&kvm->mmu_lock);
			else
				write_unlock(&kvm->mmu_lock);

			sp = tdp_mmu_alloc_sp_for_split();

			if (shared)
				read_lock(&kvm->mmu_lock);
			else
				write_lock(&kvm->mmu_lock);

			if (!sp) {
				trace_kvm_mmu_split_huge_page(iter.gfn,
							      iter.old_spte,
							      iter.level, -ENOMEM);
				return -ENOMEM;
			}

			rcu_read_lock();

			iter.yielded = true;
			continue;
		}

		tdp_mmu_init_child_sp(sp, &iter);

		if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
			goto retry;

		sp = NULL;
	}

	rcu_read_unlock();

	/*
	 * It's possible to exit the loop having never used the last sp if, for
	 * example, a vCPU doing HugePage NX splitting wins the race and
	 * installs its own sp in place of the last sp we tried to split.
	 */
	if (sp)
		tdp_mmu_free_sp(sp);

	return 0;
}
1593
1595 /*
1596 * Try to split all huge pages mapped by the TDP MMU down to the target level.
1597 */
1598 void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
1599 const struct kvm_memory_slot *slot,
1600 gfn_t start, gfn_t end,
1601 int target_level, bool shared)
1602 {
1603 struct kvm_mmu_page *root;
1604 int r = 0;
1605
1606 kvm_lockdep_assert_mmu_lock_held(kvm, shared);
1607 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id) {
1608 r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared);
1609 if (r) {
1610 kvm_tdp_mmu_put_root(kvm, root);
1611 break;
1612 }
1613 }
1614 }
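
/*
 * Illustrative usage (a sketch, not the in-tree caller): eagerly split every
 * huge page in a memslot down to 4KiB under the read lock, e.g. when dirty
 * logging is enabled for the slot.  The wrapper name below is hypothetical.
 *
 *	static void example_eager_split(struct kvm *kvm,
 *					const struct kvm_memory_slot *slot)
 *	{
 *		read_lock(&kvm->mmu_lock);
 *		kvm_tdp_mmu_try_split_huge_pages(kvm, slot, slot->base_gfn,
 *						 slot->base_gfn + slot->npages,
 *						 PG_LEVEL_4K, true);
 *		read_unlock(&kvm->mmu_lock);
 *	}
 */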
1615
1616 static bool tdp_mmu_need_write_protect(struct kvm_mmu_page *sp)
1617 {
1618 /*
1619 * All TDP MMU shadow pages share the same role as their root, aside
1620 * from level, so it is valid to key off any shadow page to determine if
1621 * write protection is needed for an entire tree.
1622 */
1623 return kvm_mmu_page_ad_need_write_protect(sp) || !kvm_ad_enabled;
1624 }
1625
1626 static void clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1627 gfn_t start, gfn_t end)
1628 {
1629 const u64 dbit = tdp_mmu_need_write_protect(root) ? PT_WRITABLE_MASK :
1630 shadow_dirty_mask;
1631 struct tdp_iter iter;
1632
1633 rcu_read_lock();
1634
1635 tdp_root_for_each_pte(iter, kvm, root, start, end) {
1636 retry:
1637 if (!is_shadow_present_pte(iter.old_spte) ||
1638 !is_last_spte(iter.old_spte, iter.level))
1639 continue;
1640
1641 if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1642 continue;
1643
1644 KVM_MMU_WARN_ON(dbit == shadow_dirty_mask &&
1645 spte_ad_need_write_protect(iter.old_spte));
1646
1647 if (!(iter.old_spte & dbit))
1648 continue;
1649
1650 if (tdp_mmu_set_spte_atomic(kvm, &iter, iter.old_spte & ~dbit))
1651 goto retry;
1652 }
1653
1654 rcu_read_unlock();
1655 }
1656
1657 /*
1658 * Clear the dirty status (D-bit or W-bit) of all the SPTEs mapping GFNs in the
1659 * memslot.
1660 */
1661 void kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
1662 const struct kvm_memory_slot *slot)
1663 {
1664 struct kvm_mmu_page *root;
1665
1666 lockdep_assert_held_read(&kvm->mmu_lock);
1667 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
1668 clear_dirty_gfn_range(kvm, root, slot->base_gfn,
1669 slot->base_gfn + slot->npages);
1670 }
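
/*
 * Illustrative usage (a sketch, not the in-tree caller): clear the dirty
 * state of an entire memslot under the read lock, e.g. right after dirty
 * logging has been enabled, assuming @kvm and @slot are in scope.
 *
 *	read_lock(&kvm->mmu_lock);
 *	if (tdp_mmu_enabled)
 *		kvm_tdp_mmu_clear_dirty_slot(kvm, slot);
 *	read_unlock(&kvm->mmu_lock);
 */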
1671
1672 static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
1673 gfn_t gfn, unsigned long mask, bool wrprot)
1674 {
1675 const u64 dbit = (wrprot || tdp_mmu_need_write_protect(root)) ? PT_WRITABLE_MASK :
1676 shadow_dirty_mask;
1677 struct tdp_iter iter;
1678
1679 lockdep_assert_held_write(&kvm->mmu_lock);
1680
1681 rcu_read_lock();
1682
1683 tdp_root_for_each_leaf_pte(iter, kvm, root, gfn + __ffs(mask),
1684 gfn + BITS_PER_LONG) {
1685 if (!mask)
1686 break;
1687
1688 KVM_MMU_WARN_ON(dbit == shadow_dirty_mask &&
1689 spte_ad_need_write_protect(iter.old_spte));
1690
1691 if (iter.level > PG_LEVEL_4K ||
1692 !(mask & (1UL << (iter.gfn - gfn))))
1693 continue;
1694
1695 mask &= ~(1UL << (iter.gfn - gfn));
1696
1697 if (!(iter.old_spte & dbit))
1698 continue;
1699
1700 iter.old_spte = tdp_mmu_clear_spte_bits(iter.sptep,
1701 iter.old_spte, dbit,
1702 iter.level);
1703
1704 trace_kvm_tdp_mmu_spte_changed(iter.as_id, iter.gfn, iter.level,
1705 iter.old_spte,
1706 iter.old_spte & ~dbit);
1707 }
1708
1709 rcu_read_unlock();
1710 }
1711
1712 /*
1713 * Clear the dirty status (D-bit or W-bit) of all the 4k SPTEs mapping GFNs for
1714 * which a bit is set in mask, starting at gfn. The given memslot is expected to
1715 * contain all the GFNs represented by set bits in the mask.
1716 */
1717 void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1718 struct kvm_memory_slot *slot,
1719 gfn_t gfn, unsigned long mask,
1720 bool wrprot)
1721 {
1722 struct kvm_mmu_page *root;
1723
1724 for_each_valid_tdp_mmu_root(kvm, root, slot->as_id)
1725 clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
1726 }
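
/*
 * Illustrative usage (a sketch, not the in-tree caller): bit i of @mask
 * corresponds to the 4KiB page at @gfn + i, so clearing the dirty state of
 * the pages at gfn and gfn + 3 looks like this, assuming @kvm, @slot and
 * @gfn are in scope:
 *
 *	write_lock(&kvm->mmu_lock);
 *	kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot, gfn, BIT(0) | BIT(3),
 *					  false);
 *	write_unlock(&kvm->mmu_lock);
 *
 * Passing wrprot=true clears the writable bit instead of the dirty bit, for
 * configurations where dirty logging is based on write protection.
 */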
1727
1728 static int tdp_mmu_make_huge_spte(struct kvm *kvm,
1729 struct tdp_iter *parent,
1730 u64 *huge_spte)
1731 {
1732 struct kvm_mmu_page *root = spte_to_child_sp(parent->old_spte);
1733 gfn_t start = parent->gfn;
1734 gfn_t end = start + KVM_PAGES_PER_HPAGE(parent->level);
1735 struct tdp_iter iter;
1736
1737 tdp_root_for_each_leaf_pte(iter, kvm, root, start, end) {
1738 /*
1739 * Use the parent iterator when checking for forward progress so
1740 * that KVM doesn't get stuck continuously trying to yield (i.e.
1741 * returning -EAGAIN here and then failing the forward progress
1742 * check in the caller ad nauseam).
1743 */
1744 if (tdp_mmu_iter_need_resched(kvm, parent))
1745 return -EAGAIN;
1746
1747 *huge_spte = make_huge_spte(kvm, iter.old_spte, parent->level);
1748 return 0;
1749 }
1750
1751 return -ENOENT;
1752 }
1753
1754 static void recover_huge_pages_range(struct kvm *kvm,
1755 struct kvm_mmu_page *root,
1756 const struct kvm_memory_slot *slot)
1757 {
1758 gfn_t start = slot->base_gfn;
1759 gfn_t end = start + slot->npages;
1760 struct tdp_iter iter;
1761 int max_mapping_level;
1762 bool flush = false;
1763 u64 huge_spte;
1764 int r;
1765
1766 if (WARN_ON_ONCE(kvm_slot_dirty_track_enabled(slot)))
1767 return;
1768
1769 rcu_read_lock();
1770
1771 for_each_tdp_pte_min_level(iter, kvm, root, PG_LEVEL_2M, start, end) {
1772 retry:
1773 if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, true)) {
1774 flush = false;
1775 continue;
1776 }
1777
1778 if (iter.level > KVM_MAX_HUGEPAGE_LEVEL ||
1779 !is_shadow_present_pte(iter.old_spte))
1780 continue;
1781
1782 /*
1783 * Don't zap leaf SPTEs; if a leaf SPTE could be replaced with
1784 * a large page size, then its parent would have been zapped
1785 * instead of stepping down.
1786 */
1787 if (is_last_spte(iter.old_spte, iter.level))
1788 continue;
1789
1790 /*
1791 * If iter.gfn resides outside of the slot, i.e. the page for
1792 * the current level overlaps but is not contained by the slot,
1793 * then the SPTE can't be made huge. More importantly, trying
1794 * to query that info from slot->arch.lpage_info will cause an
1795 * out-of-bounds access.
1796 */
1797 if (iter.gfn < start || iter.gfn >= end)
1798 continue;
1799
1800 max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot, iter.gfn);
1801 if (max_mapping_level < iter.level)
1802 continue;
1803
1804 r = tdp_mmu_make_huge_spte(kvm, &iter, &huge_spte);
1805 if (r == -EAGAIN)
1806 goto retry;
1807 else if (r)
1808 continue;
1809
1810 if (tdp_mmu_set_spte_atomic(kvm, &iter, huge_spte))
1811 goto retry;
1812
1813 flush = true;
1814 }
1815
1816 if (flush)
1817 kvm_flush_remote_tlbs_memslot(kvm, slot);
1818
1819 rcu_read_unlock();
1820 }
1821
1822 /*
1823 * Recover huge page mappings within the slot by replacing non-leaf SPTEs with
1824 * huge SPTEs where possible.
1825 */
1826 void kvm_tdp_mmu_recover_huge_pages(struct kvm *kvm,
1827 const struct kvm_memory_slot *slot)
1828 {
1829 struct kvm_mmu_page *root;
1830
1831 lockdep_assert_held_read(&kvm->mmu_lock);
1832 for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
1833 recover_huge_pages_range(kvm, root, slot);
1834 }
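
/*
 * Illustrative usage (a sketch, not the in-tree caller): once dirty logging
 * has been disabled for a memslot, mappings that were split for 4KiB dirty
 * tracking can be reassembled under the read lock, assuming @kvm and @slot
 * are in scope.
 *
 *	read_lock(&kvm->mmu_lock);
 *	if (tdp_mmu_enabled)
 *		kvm_tdp_mmu_recover_huge_pages(kvm, slot);
 *	read_unlock(&kvm->mmu_lock);
 */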
1835
1836 /*
1837 * Removes write access on the last level SPTE mapping this GFN and unsets the
1838 * MMU-writable bit to ensure future writes continue to be intercepted.
1839 * Returns true if an SPTE was set and a TLB flush is needed.
1840 */
1841 static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
1842 gfn_t gfn, int min_level)
1843 {
1844 struct tdp_iter iter;
1845 u64 new_spte;
1846 bool spte_set = false;
1847
1848 BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1849
1850 rcu_read_lock();
1851
1852 for_each_tdp_pte_min_level(iter, kvm, root, min_level, gfn, gfn + 1) {
1853 if (!is_shadow_present_pte(iter.old_spte) ||
1854 !is_last_spte(iter.old_spte, iter.level))
1855 continue;
1856
1857 new_spte = iter.old_spte &
1858 ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
1859
1860 if (new_spte == iter.old_spte)
1861 break;
1862
1863 tdp_mmu_iter_set_spte(kvm, &iter, new_spte);
1864 spte_set = true;
1865 }
1866
1867 rcu_read_unlock();
1868
1869 return spte_set;
1870 }
1871
1872 /*
1873 * Removes write access on the last level SPTE mapping this GFN and unsets the
1874 * MMU-writable bit to ensure future writes continue to be intercepted.
1875 * Returns true if an SPTE was set and a TLB flush is needed.
1876 */
1877 bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
1878 struct kvm_memory_slot *slot, gfn_t gfn,
1879 int min_level)
1880 {
1881 struct kvm_mmu_page *root;
1882 bool spte_set = false;
1883
1884 lockdep_assert_held_write(&kvm->mmu_lock);
1885 for_each_valid_tdp_mmu_root(kvm, root, slot->as_id)
1886 spte_set |= write_protect_gfn(kvm, root, gfn, min_level);
1887
1888 return spte_set;
1889 }
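
/*
 * Illustrative usage (a sketch, not the in-tree caller): write-protect a
 * single GFN so that subsequent guest writes are intercepted, flushing TLBs
 * only if an SPTE was actually changed.  @kvm, @slot and @gfn are assumed to
 * be in scope.
 *
 *	write_lock(&kvm->mmu_lock);
 *	if (kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn, PG_LEVEL_4K))
 *		kvm_flush_remote_tlbs(kvm);
 *	write_unlock(&kvm->mmu_lock);
 */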
1890
1891 /*
1892 * Return the level of the lowest level SPTE added to sptes.
1893 * That SPTE may be non-present.
1894 *
1895 * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
1896 */
1897 int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
1898 int *root_level)
1899 {
1900 struct kvm_mmu_page *root = root_to_sp(vcpu->arch.mmu->root.hpa);
1901 struct tdp_iter iter;
1902 gfn_t gfn = addr >> PAGE_SHIFT;
1903 int leaf = -1;
1904
1905 *root_level = vcpu->arch.mmu->root_role.level;
1906
1907 tdp_mmu_for_each_pte(iter, vcpu->kvm, root, gfn, gfn + 1) {
1908 leaf = iter.level;
1909 sptes[leaf] = iter.old_spte;
1910 }
1911
1912 return leaf;
1913 }
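
/*
 * Illustrative usage (a sketch, loosely modeled on the MMIO SPTE walk):
 * collect the SPTEs along the translation of @addr and dump them from the
 * root down to the lowest level visited.  @sptes is indexed by level, hence
 * the PT64_ROOT_MAX_LEVEL + 1 sizing.
 *
 *	u64 sptes[PT64_ROOT_MAX_LEVEL + 1];
 *	int leaf, root_level, level;
 *
 *	kvm_tdp_mmu_walk_lockless_begin();
 *	leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root_level);
 *	kvm_tdp_mmu_walk_lockless_end();
 *
 *	for (level = root_level; leaf > 0 && level >= leaf; level--)
 *		pr_debug("level %d: spte = 0x%llx\n", level, sptes[level]);
 */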
1914
1915 /*
1916 * Returns the last level spte pointer of the shadow page walk for the given
1917 * gpa, and sets *spte to the spte value. This spte may be non-present. If no
1918 * walk could be performed, returns NULL and *spte does not contain valid data.
1919 *
1920 * Contract:
1921 * - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
1922 * - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
1923 *
1924 * WARNING: This function is only intended to be called during fast_page_fault.
1925 */
1926 u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gfn_t gfn,
1927 u64 *spte)
1928 {
1929 /* Fast pf is not supported for mirrored roots */
1930 struct kvm_mmu_page *root = tdp_mmu_get_root(vcpu, KVM_DIRECT_ROOTS);
1931 struct tdp_iter iter;
1932 tdp_ptep_t sptep = NULL;
1933
1934 tdp_mmu_for_each_pte(iter, vcpu->kvm, root, gfn, gfn + 1) {
1935 *spte = iter.old_spte;
1936 sptep = iter.sptep;
1937 }
1938
1939 /*
1940 * Perform the rcu_dereference to get the raw spte pointer value since
1941 * we are passing it up to fast_page_fault, which is shared with the
1942 * legacy MMU and thus does not retain the TDP MMU-specific __rcu
1943 * annotation.
1944 *
1945 * This is safe since fast_page_fault obeys the contracts of this
1946 * function as well as all TDP MMU contracts around modifying SPTEs
1947 * outside of mmu_lock.
1948 */
1949 return rcu_dereference(sptep);
1950 }
1951
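/*
 * Illustrative usage (a sketch of the fast page fault pattern, not the exact
 * in-tree code): read the leaf SPTE locklessly and try to fix it up with a
 * CMPXCHG.  @new_spte is assumed to have been computed by the caller.
 *
 *	u64 old_spte, *sptep;
 *
 *	kvm_tdp_mmu_walk_lockless_begin();
 *	sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, gfn, &old_spte);
 *	if (sptep && is_shadow_present_pte(old_spte))
 *		try_cmpxchg64(sptep, &old_spte, new_spte);
 *	kvm_tdp_mmu_walk_lockless_end();
 */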