xref: /linux/arch/x86/kvm/mmu/tdp_mmu.c (revision c24a65b6a27c78d8540409800886b6622ea86ebf)
1 // SPDX-License-Identifier: GPL-2.0
2 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
3 
4 #include "mmu.h"
5 #include "mmu_internal.h"
6 #include "mmutrace.h"
7 #include "tdp_iter.h"
8 #include "tdp_mmu.h"
9 #include "spte.h"
10 
11 #include <asm/cmpxchg.h>
12 #include <trace/events/kvm.h>
13 
14 /* Initializes the TDP MMU for the VM, if enabled. */
15 void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
16 {
17 	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
18 	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
19 }
20 
21 /* Arbitrarily returns true so that this may be used in if statements. */
22 static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
23 							     bool shared)
24 {
25 	if (shared)
26 		lockdep_assert_held_read(&kvm->mmu_lock);
27 	else
28 		lockdep_assert_held_write(&kvm->mmu_lock);
29 
30 	return true;
31 }
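
/*
 * For example, __for_each_tdp_mmu_root() below folds the assertion into its
 * skip-this-root condition (abridged):
 *
 *	if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) &&
 *	    <this root should be skipped>) {
 *	} else
 *		<loop body>
 */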
32 
33 void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
34 {
35 	/*
36 	 * Invalidate all roots, which besides the obvious, schedules all roots
37 	 * for zapping and thus puts the TDP MMU's reference to each root, i.e.
38 	 * ultimately frees all roots.
39 	 */
40 	kvm_tdp_mmu_invalidate_roots(kvm, KVM_VALID_ROOTS);
41 	kvm_tdp_mmu_zap_invalidated_roots(kvm, false);
42 
43 	WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages));
44 	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
45 
46 	/*
47 	 * Ensure that all the outstanding RCU callbacks to free shadow pages
48 	 * can run before the VM is torn down.  Putting the last reference to
49 	 * zapped roots will create new callbacks.
50 	 */
51 	rcu_barrier();
52 }
53 
54 static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
55 {
56 	free_page((unsigned long)sp->external_spt);
57 	free_page((unsigned long)sp->spt);
58 	kmem_cache_free(mmu_page_header_cache, sp);
59 }
60 
61 /*
62  * This is called through call_rcu in order to free TDP page table memory
63  * safely with respect to other kernel threads that may be operating on
64  * the memory.
65  * By only accessing TDP MMU page table memory in an RCU read-side critical
66  * section, and freeing it only after a grace period, lockless walkers are
67  * guaranteed not to use the memory after it has been freed.
68  */
69 static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
70 {
71 	struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
72 					       rcu_head);
73 
74 	tdp_mmu_free_sp(sp);
75 }
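
/*
 * E.g. kvm_tdp_mmu_put_root() below queues the final free with:
 *
 *	call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
 *
 * which guarantees that any lockless walker still inside an RCU read-side
 * critical section finishes before the page table memory is returned.
 */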
76 
77 void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
78 {
79 	if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
80 		return;
81 
82 	/*
83 	 * The TDP MMU itself holds a reference to each root until the root is
84 	 * explicitly invalidated, i.e. the final reference should never be
85 	 * put for a valid root.
86 	 */
87 	KVM_BUG_ON(!is_tdp_mmu_page(root) || !root->role.invalid, kvm);
88 
89 	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
90 	list_del_rcu(&root->link);
91 	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
92 	call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
93 }
94 
95 static bool tdp_mmu_root_match(struct kvm_mmu_page *root,
96 			       enum kvm_tdp_mmu_root_types types)
97 {
98 	if (WARN_ON_ONCE(!(types & KVM_VALID_ROOTS)))
99 		return false;
100 
101 	if (root->role.invalid && !(types & KVM_INVALID_ROOTS))
102 		return false;
103 
104 	if (likely(!is_mirror_sp(root)))
105 		return types & KVM_DIRECT_ROOTS;
106 	return types & KVM_MIRROR_ROOTS;
107 }
108 
109 /*
110  * Returns the next root after @prev_root (or the first root if @prev_root is
111  * NULL) that matches with @types.  A reference to the returned root is
112  * acquired, and the reference to @prev_root is released (the caller obviously
113  * must hold a reference to @prev_root if it's non-NULL).
114  *
115  * Roots that don't match @types are skipped.
116  *
117  * Returns NULL if the end of tdp_mmu_roots was reached.
118  */
119 static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
120 					      struct kvm_mmu_page *prev_root,
121 					      enum kvm_tdp_mmu_root_types types)
122 {
123 	struct kvm_mmu_page *next_root;
124 
125 	/*
126 	 * While the roots themselves are RCU-protected, fields such as
127 	 * role.invalid are protected by mmu_lock.
128 	 */
129 	lockdep_assert_held(&kvm->mmu_lock);
130 
131 	rcu_read_lock();
132 
133 	if (prev_root)
134 		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
135 						  &prev_root->link,
136 						  typeof(*prev_root), link);
137 	else
138 		next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
139 						   typeof(*next_root), link);
140 
141 	while (next_root) {
142 		if (tdp_mmu_root_match(next_root, types) &&
143 		    kvm_tdp_mmu_get_root(next_root))
144 			break;
145 
146 		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
147 				&next_root->link, typeof(*next_root), link);
148 	}
149 
150 	rcu_read_unlock();
151 
152 	if (prev_root)
153 		kvm_tdp_mmu_put_root(kvm, prev_root);
154 
155 	return next_root;
156 }
157 
158 /*
159  * Note: this iterator gets and puts references to the roots it iterates over.
160  * This makes it safe to release the MMU lock and yield within the loop, but
161  * if exiting the loop early, the caller must drop the reference to the most
162  * recent root. (Unless keeping a live reference is desirable.)
163  *
164  * If shared is set, this function is operating under the MMU lock in read
165  * mode.
166  */
167 #define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _types)	\
168 	for (_root = tdp_mmu_next_root(_kvm, NULL, _types);		\
169 	     ({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root;		\
170 	     _root = tdp_mmu_next_root(_kvm, _root, _types))		\
171 		if (_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) {	\
172 		} else
173 
174 #define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id)	\
175 	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, KVM_VALID_ROOTS)
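
/*
 * Typical usage, as in kvm_tdp_mmu_zap_leafs() below:
 *
 *	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, -1)
 *		flush = tdp_mmu_zap_leafs(kvm, root, start, end, true, flush);
 *
 * The macro takes a reference to each root for the duration of its iteration,
 * so the loop body is free to yield mmu_lock without the root being freed.
 */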
176 
177 #define for_each_tdp_mmu_root_yield_safe(_kvm, _root)			\
178 	for (_root = tdp_mmu_next_root(_kvm, NULL, KVM_ALL_ROOTS);		\
179 	     ({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root;	\
180 	     _root = tdp_mmu_next_root(_kvm, _root, KVM_ALL_ROOTS))
181 
182 /*
183  * Iterate over all TDP MMU roots.  Requires that mmu_lock be held for write,
184  * the implication being that any flow that holds mmu_lock for read is
185  * inherently yield-friendly and should use the yield-safe variant above.
186  * Holding mmu_lock for write obviates the need for RCU protection as the list
187  * is guaranteed to be stable.
188  */
189 #define __for_each_tdp_mmu_root(_kvm, _root, _as_id, _types)			\
190 	list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)		\
191 		if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) &&		\
192 		    ((_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) ||	\
193 		     !tdp_mmu_root_match((_root), (_types)))) {			\
194 		} else
195 
196 /*
197  * Iterate over all TDP MMU roots in an RCU read-side critical section.
198  * It is safe to iterate over the SPTEs under the root, but their values will
199  * be unstable, so all writes must be atomic. As this routine is meant to be
200  * used without holding the mmu_lock at all, any bits that are flipped must
201  * be reflected in kvm_tdp_mmu_spte_need_atomic_write().
202  */
203 #define for_each_tdp_mmu_root_rcu(_kvm, _root, _as_id, _types)			\
204 	list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link)		\
205 		if ((_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) ||	\
206 		    !tdp_mmu_root_match((_root), (_types))) {			\
207 		} else
208 
209 #define for_each_valid_tdp_mmu_root(_kvm, _root, _as_id)		\
210 	__for_each_tdp_mmu_root(_kvm, _root, _as_id, KVM_VALID_ROOTS)
211 
212 static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
213 {
214 	struct kvm_mmu_page *sp;
215 
216 	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
217 	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
218 
219 	return sp;
220 }
221 
222 static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
223 			    gfn_t gfn, union kvm_mmu_page_role role)
224 {
225 	INIT_LIST_HEAD(&sp->possible_nx_huge_page_link);
226 
227 	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
228 
229 	sp->role = role;
230 	sp->gfn = gfn;
231 	sp->ptep = sptep;
232 	sp->tdp_mmu_page = true;
233 
234 	trace_kvm_mmu_get_page(sp, true);
235 }
236 
237 static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
238 				  struct tdp_iter *iter)
239 {
240 	struct kvm_mmu_page *parent_sp;
241 	union kvm_mmu_page_role role;
242 
243 	parent_sp = sptep_to_sp(rcu_dereference(iter->sptep));
244 
245 	role = parent_sp->role;
246 	role.level--;
247 
248 	tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
249 }
250 
251 void kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu, bool mirror)
252 {
253 	struct kvm_mmu *mmu = vcpu->arch.mmu;
254 	union kvm_mmu_page_role role = mmu->root_role;
255 	int as_id = kvm_mmu_role_as_id(role);
256 	struct kvm *kvm = vcpu->kvm;
257 	struct kvm_mmu_page *root;
258 
259 	if (mirror)
260 		role.is_mirror = true;
261 
262 	/*
263 	 * Check for an existing root before acquiring the pages lock to avoid
264 	 * unnecessary serialization if multiple vCPUs are loading a new root.
265 	 * E.g. when bringing up secondary vCPUs, KVM will already have created
266 	 * a valid root on behalf of the primary vCPU.
267 	 */
268 	read_lock(&kvm->mmu_lock);
269 
270 	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, as_id) {
271 		if (root->role.word == role.word)
272 			goto out_read_unlock;
273 	}
274 
275 	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
276 
277 	/*
278 	 * Recheck for an existing root after acquiring the pages lock, another
279 	 * vCPU may have raced ahead and created a new usable root.  Manually
280 	 * walk the list of roots as the standard macros assume that the pages
281 	 * lock is *not* held.  WARN if grabbing a reference to a usable root
282 	 * fails, as the last reference to a root can only be put *after* the
283 	 * root has been invalidated, which requires holding mmu_lock for write.
284 	 */
285 	list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
286 		if (root->role.word == role.word &&
287 		    !WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root)))
288 			goto out_spin_unlock;
289 	}
290 
291 	root = tdp_mmu_alloc_sp(vcpu);
292 	tdp_mmu_init_sp(root, NULL, 0, role);
293 
294 	/*
295 	 * TDP MMU roots are kept until they are explicitly invalidated, either
296 	 * by a memslot update or by the destruction of the VM.  Initialize the
297 	 * refcount to two; one reference for the vCPU, and one reference for
298 	 * the TDP MMU itself, which is held until the root is invalidated and
299 	 * is ultimately put by kvm_tdp_mmu_zap_invalidated_roots().
300 	 */
301 	refcount_set(&root->tdp_mmu_root_count, 2);
302 	list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
303 
304 out_spin_unlock:
305 	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
306 out_read_unlock:
307 	read_unlock(&kvm->mmu_lock);
308 	/*
309 	 * Note, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS will prevent entering the guest
310 	 * and actually consuming the root if it's invalidated after dropping
311 	 * mmu_lock, and the root can't be freed as this vCPU holds a reference.
312 	 */
313 	if (mirror) {
314 		mmu->mirror_root_hpa = __pa(root->spt);
315 	} else {
316 		mmu->root.hpa = __pa(root->spt);
317 		mmu->root.pgd = 0;
318 	}
319 }
320 
321 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
322 				u64 old_spte, u64 new_spte, int level,
323 				bool shared);
324 
325 static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
326 {
327 	kvm_account_pgtable_pages((void *)sp->spt, +1);
328 	atomic64_inc(&kvm->arch.tdp_mmu_pages);
329 }
330 
331 static void tdp_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
332 {
333 	kvm_account_pgtable_pages((void *)sp->spt, -1);
334 	atomic64_dec(&kvm->arch.tdp_mmu_pages);
335 }
336 
337 /**
338  * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages
339  *
340  * @kvm: kvm instance
341  * @sp: the page to be removed
342  */
343 static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
344 {
345 	tdp_unaccount_mmu_page(kvm, sp);
346 
347 	if (!sp->nx_huge_page_disallowed)
348 		return;
349 
350 	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
351 	sp->nx_huge_page_disallowed = false;
352 	untrack_possible_nx_huge_page(kvm, sp);
353 	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
354 }
355 
356 static void remove_external_spte(struct kvm *kvm, gfn_t gfn, u64 old_spte,
357 				 int level)
358 {
359 	kvm_pfn_t old_pfn = spte_to_pfn(old_spte);
360 	int ret;
361 
362 	/*
363 	 * External (TDX) SPTEs are limited to PG_LEVEL_4K, and external
364 	 * PTs are removed in a special order, involving free_external_spt().
365 	 * But remove_external_spte() will be called on non-leaf PTEs via
366 	 * __tdp_mmu_zap_root(), so avoid the error the former would return
367 	 * in this case.
368 	 */
369 	if (!is_last_spte(old_spte, level))
370 		return;
371 
372 	/* Zapping a leaf SPTE is allowed only when the write lock is held. */
373 	lockdep_assert_held_write(&kvm->mmu_lock);
374 	/* Because the write lock is held, the operation should succeed. */
375 	ret = static_call(kvm_x86_remove_external_spte)(kvm, gfn, level, old_pfn);
376 	KVM_BUG_ON(ret, kvm);
377 }
378 
379 /**
380  * handle_removed_pt() - handle a page table removed from the TDP structure
381  *
382  * @kvm: kvm instance
383  * @pt: the page removed from the paging structure
384  * @shared: This operation may not be running under the exclusive use
385  *	    of the MMU lock and the operation must synchronize with other
386  *	    threads that might be modifying SPTEs.
387  *
388  * Given a page table that has been removed from the TDP paging structure,
389  * iterates through the page table to clear SPTEs and free child page tables.
390  *
391  * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
392  * protection. Since this thread removed it from the paging structure,
393  * this thread will be responsible for ensuring the page is freed. Hence the
394  * early rcu_dereferences in the function.
395  */
396 static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
397 {
398 	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
399 	int level = sp->role.level;
400 	gfn_t base_gfn = sp->gfn;
401 	int i;
402 
403 	trace_kvm_mmu_prepare_zap_page(sp);
404 
405 	tdp_mmu_unlink_sp(kvm, sp);
406 
407 	for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
408 		tdp_ptep_t sptep = pt + i;
409 		gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
410 		u64 old_spte;
411 
412 		if (shared) {
413 			/*
414 			 * Set the SPTE to a nonpresent value that other
415 			 * threads will not overwrite. If the SPTE was
416 			 * already marked as frozen then another thread
417 			 * handling a page fault could overwrite it, so
418 			 * keep writing the SPTE until this thread transitions
419 			 * it from some other value to the frozen SPTE value.
420 			 */
421 			for (;;) {
422 				old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, FROZEN_SPTE);
423 				if (!is_frozen_spte(old_spte))
424 					break;
425 				cpu_relax();
426 			}
427 		} else {
428 			/*
429 			 * If the SPTE is not MMU-present, there is no backing
430 			 * page associated with the SPTE and so no side effects
431 			 * that need to be recorded, and exclusive ownership of
432 			 * mmu_lock ensures the SPTE can't be made present.
433 			 * Note, zapping MMIO SPTEs is also unnecessary as they
434 			 * are guarded by the memslots generation, not by being
435 			 * unreachable.
436 			 */
437 			old_spte = kvm_tdp_mmu_read_spte(sptep);
438 			if (!is_shadow_present_pte(old_spte))
439 				continue;
440 
441 			/*
442 			 * Use the common helper instead of a raw WRITE_ONCE as
443 			 * the SPTE needs to be updated atomically if it can be
444 			 * modified by a different vCPU outside of mmu_lock.
445 			 * Even though the parent SPTE is !PRESENT, the TLB
446 			 * hasn't yet been flushed, and both Intel and AMD
447 			 * document that A/D assists can use upper-level PxE
448 			 * entries that are cached in the TLB, i.e. the CPU can
449 			 * still access the page and mark it dirty.
450 			 *
451 			 * No retry is needed in the atomic update path as the
452 			 * sole concern is dropping a Dirty bit, i.e. no other
453 			 * task can zap/remove the SPTE as mmu_lock is held for
454 			 * write.  Marking the SPTE as a frozen SPTE is not
455 			 * strictly necessary for the same reason, but using
456 			 * the frozen SPTE value keeps the shared/exclusive
457 			 * paths consistent and allows the handle_changed_spte()
458 			 * call below to hardcode the new value to FROZEN_SPTE.
459 			 *
460 			 * Note, even though dropping a Dirty bit is the only
461 			 * scenario where a non-atomic update could result in a
462 			 * functional bug, simply checking the Dirty bit isn't
463 			 * sufficient as a fast page fault could read the upper
464 			 * level SPTE before it is zapped, and then make this
465 			 * target SPTE writable, resume the guest, and set the
466 			 * Dirty bit between reading the SPTE above and writing
467 			 * it here.
468 			 */
469 			old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte,
470 							  FROZEN_SPTE, level);
471 		}
472 		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
473 				    old_spte, FROZEN_SPTE, level, shared);
474 
475 		if (is_mirror_sp(sp)) {
476 			KVM_BUG_ON(shared, kvm);
477 			remove_external_spte(kvm, gfn, old_spte, level);
478 		}
479 	}
480 
481 	if (is_mirror_sp(sp) &&
482 	    WARN_ON(static_call(kvm_x86_free_external_spt)(kvm, base_gfn, sp->role.level,
483 							  sp->external_spt))) {
484 		/*
485 		 * Failed to free the page table page in the mirror page table
486 		 * and there is nothing further to do.
487 		 * Intentionally leak the page to prevent the kernel from
488 		 * accessing the encrypted page.
489 		 */
490 		sp->external_spt = NULL;
491 	}
492 
493 	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
494 }
495 
496 static void *get_external_spt(gfn_t gfn, u64 new_spte, int level)
497 {
498 	if (is_shadow_present_pte(new_spte) && !is_last_spte(new_spte, level)) {
499 		struct kvm_mmu_page *sp = spte_to_child_sp(new_spte);
500 
501 		WARN_ON_ONCE(sp->role.level + 1 != level);
502 		WARN_ON_ONCE(sp->gfn != gfn);
503 		return sp->external_spt;
504 	}
505 
506 	return NULL;
507 }
508 
509 static int __must_check set_external_spte_present(struct kvm *kvm, tdp_ptep_t sptep,
510 						 gfn_t gfn, u64 old_spte,
511 						 u64 new_spte, int level)
512 {
513 	bool was_present = is_shadow_present_pte(old_spte);
514 	bool is_present = is_shadow_present_pte(new_spte);
515 	bool is_leaf = is_present && is_last_spte(new_spte, level);
516 	kvm_pfn_t new_pfn = spte_to_pfn(new_spte);
517 	int ret = 0;
518 
519 	KVM_BUG_ON(was_present, kvm);
520 
521 	lockdep_assert_held(&kvm->mmu_lock);
522 	/*
523 	 * We need to lock out other updates to the SPTE until the external
524 	 * page table has been modified. Use FROZEN_SPTE similar to
525 	 * the zapping case.
526 	 */
527 	if (!try_cmpxchg64(rcu_dereference(sptep), &old_spte, FROZEN_SPTE))
528 		return -EBUSY;
529 
530 	/*
531 	 * Use a different callback to either link a middle-level
532 	 * external page table or set a leaf SPTE.
533 	 */
534 	if (is_leaf) {
535 		ret = static_call(kvm_x86_set_external_spte)(kvm, gfn, level, new_pfn);
536 	} else {
537 		void *external_spt = get_external_spt(gfn, new_spte, level);
538 
539 		KVM_BUG_ON(!external_spt, kvm);
540 		ret = static_call(kvm_x86_link_external_spt)(kvm, gfn, level, external_spt);
541 	}
542 	if (ret)
543 		__kvm_tdp_mmu_write_spte(sptep, old_spte);
544 	else
545 		__kvm_tdp_mmu_write_spte(sptep, new_spte);
546 	return ret;
547 }
548 
549 /**
550  * handle_changed_spte - handle bookkeeping associated with an SPTE change
551  * @kvm: kvm instance
552  * @as_id: the address space of the paging structure the SPTE was a part of
553  * @gfn: the base GFN that was mapped by the SPTE
554  * @old_spte: The value of the SPTE before the change
555  * @new_spte: The value of the SPTE after the change
556  * @level: the level of the PT the SPTE is part of in the paging structure
557  * @shared: This operation may not be running under the exclusive use of
558  *	    the MMU lock and the operation must synchronize with other
559  *	    threads that might be modifying SPTEs.
560  *
561  * Handle bookkeeping that might result from the modification of a SPTE.  Note,
562  * dirty logging updates are handled in common code, not here (see make_spte()
563  * and fast_pf_fix_direct_spte()).
564  */
565 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
566 				u64 old_spte, u64 new_spte, int level,
567 				bool shared)
568 {
569 	bool was_present = is_shadow_present_pte(old_spte);
570 	bool is_present = is_shadow_present_pte(new_spte);
571 	bool was_leaf = was_present && is_last_spte(old_spte, level);
572 	bool is_leaf = is_present && is_last_spte(new_spte, level);
573 	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
574 
575 	WARN_ON_ONCE(level > PT64_ROOT_MAX_LEVEL);
576 	WARN_ON_ONCE(level < PG_LEVEL_4K);
577 	WARN_ON_ONCE(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
578 
579 	/*
580 	 * If this warning were to trigger it would indicate that there was a
581 	 * missing MMU notifier or a race with some notifier handler.
582 	 * A present, leaf SPTE should never be directly replaced with another
583 	 * present leaf SPTE pointing to a different PFN. A notifier handler
584 	 * should be zapping the SPTE before the main MM's page table is
585 	 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
586 	 * thread before replacement.
587 	 */
588 	if (was_leaf && is_leaf && pfn_changed) {
589 		pr_err("Invalid SPTE change: cannot replace a present leaf\n"
590 		       "SPTE with another present leaf SPTE mapping a\n"
591 		       "different PFN!\n"
592 		       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
593 		       as_id, gfn, old_spte, new_spte, level);
594 
595 		/*
596 		 * Crash the host to prevent error propagation and guest data
597 		 * corruption.
598 		 */
599 		BUG();
600 	}
601 
602 	if (old_spte == new_spte)
603 		return;
604 
605 	trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
606 
607 	if (is_leaf)
608 		check_spte_writable_invariants(new_spte);
609 
610 	/*
611 	 * The only time a SPTE should be changed from a non-present to
612 	 * non-present state is when an MMIO entry is installed/modified/
613 	 * removed. In that case, there is nothing to do here.
614 	 */
615 	if (!was_present && !is_present) {
616 		/*
617 		 * If this change does not involve a MMIO SPTE or frozen SPTE,
618 		 * it is unexpected. Log the change, though it should not
619 		 * impact the guest since both the former and current SPTEs
620 		 * are nonpresent.
621 		 */
622 		if (WARN_ON_ONCE(!is_mmio_spte(kvm, old_spte) &&
623 				 !is_mmio_spte(kvm, new_spte) &&
624 				 !is_frozen_spte(new_spte)))
625 			pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
626 			       "should not be replaced with another,\n"
627 			       "different nonpresent SPTE, unless one or both\n"
628 			       "are MMIO SPTEs, or the new SPTE is\n"
629 			       "a temporary frozen SPTE.\n"
630 			       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
631 			       as_id, gfn, old_spte, new_spte, level);
632 		return;
633 	}
634 
635 	if (is_leaf != was_leaf)
636 		kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);
637 
638 	/*
639 	 * Recursively handle child PTs if the change removed a subtree from
640 	 * the paging structure.  Note the WARN on the PFN changing without the
641 	 * SPTE being converted to a hugepage (leaf) or being zapped.  Shadow
642 	 * pages are kernel allocations and should never be migrated.
643 	 */
644 	if (was_present && !was_leaf &&
645 	    (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
646 		handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
647 }
648 
649 static inline int __must_check __tdp_mmu_set_spte_atomic(struct kvm *kvm,
650 							 struct tdp_iter *iter,
651 							 u64 new_spte)
652 {
653 	/*
654 	 * The caller is responsible for ensuring the old SPTE is not a FROZEN
655 	 * SPTE.  KVM should never attempt to zap or manipulate a FROZEN SPTE,
656 	 * and pre-checking before inserting a new SPTE is advantageous as it
657 	 * avoids unnecessary work.
658 	 */
659 	WARN_ON_ONCE(iter->yielded || is_frozen_spte(iter->old_spte));
660 
661 	if (is_mirror_sptep(iter->sptep) && !is_frozen_spte(new_spte)) {
662 		int ret;
663 
664 		/*
665 		 * Users of atomic zapping don't operate on mirror roots,
666 		 * so don't handle it and bug the VM if it's seen.
667 		 */
668 		if (KVM_BUG_ON(!is_shadow_present_pte(new_spte), kvm))
669 			return -EBUSY;
670 
671 		ret = set_external_spte_present(kvm, iter->sptep, iter->gfn,
672 						iter->old_spte, new_spte, iter->level);
673 		if (ret)
674 			return ret;
675 	} else {
676 		u64 *sptep = rcu_dereference(iter->sptep);
677 
678 		/*
679 		 * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs
680 		 * and does not hold the mmu_lock.  On failure, i.e. if a
681 		 * different logical CPU modified the SPTE, try_cmpxchg64()
682 		 * updates iter->old_spte with the current value, so the caller
683 		 * operates on fresh data, e.g. if it retries
684 		 * tdp_mmu_set_spte_atomic().
685 		 */
686 		if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte))
687 			return -EBUSY;
688 	}
689 
690 	return 0;
691 }
692 
693 /*
694  * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
695  * and handle the associated bookkeeping.  Do not mark the page dirty
696  * in KVM's dirty bitmaps.
697  *
698  * If setting the SPTE fails because it has changed, iter->old_spte will be
699  * refreshed to the current value of the spte.
700  *
701  * @kvm: kvm instance
702  * @iter: a tdp_iter instance currently on the SPTE that should be set
703  * @new_spte: The value the SPTE should be set to
704  * Return:
705  * * 0      - If the SPTE was set.
706  * * -EBUSY - If the SPTE cannot be set. In this case this function will have
707  *            no side-effects other than setting iter->old_spte to the last
708  *            known value of the spte.
709  */
710 static inline int __must_check tdp_mmu_set_spte_atomic(struct kvm *kvm,
711 						       struct tdp_iter *iter,
712 						       u64 new_spte)
713 {
714 	int ret;
715 
716 	lockdep_assert_held_read(&kvm->mmu_lock);
717 
718 	ret = __tdp_mmu_set_spte_atomic(kvm, iter, new_spte);
719 	if (ret)
720 		return ret;
721 
722 	handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
723 			    new_spte, iter->level, true);
724 
725 	return 0;
726 }
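
/*
 * Callers running under mmu_lock for read typically retry on -EBUSY, e.g.
 * wrprot_gfn_range() below does:
 *
 *	retry:
 *		...
 *		if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
 *			goto retry;
 *
 * relying on iter->old_spte having been refreshed to the current value.
 */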
727 
728 /*
729  * tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
730  * @kvm:	      KVM instance
731  * @as_id:	      Address space ID, i.e. regular vs. SMM
732  * @sptep:	      Pointer to the SPTE
733  * @old_spte:	      The current value of the SPTE
734  * @new_spte:	      The new value that will be set for the SPTE
735  * @gfn:	      The base GFN that was (or will be) mapped by the SPTE
736  * @level:	      The level _containing_ the SPTE (its parent PT's level)
737  *
738  * Returns the old SPTE value, which _may_ be different from @old_spte if the
739  * SPTE had volatile bits.
740  */
741 static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
742 			    u64 old_spte, u64 new_spte, gfn_t gfn, int level)
743 {
744 	lockdep_assert_held_write(&kvm->mmu_lock);
745 
746 	/*
747 	 * No thread should be using this function to set SPTEs to or from the
748 	 * temporary frozen SPTE value.
749 	 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
750 	 * should be used. If operating under the MMU lock in write mode, the
751 	 * use of the frozen SPTE should not be necessary.
752 	 */
753 	WARN_ON_ONCE(is_frozen_spte(old_spte) || is_frozen_spte(new_spte));
754 
755 	old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);
756 
757 	handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);
758 
759 	/*
760 	 * Users that do non-atomic setting of PTEs don't operate on mirror
761 	 * roots, so don't handle it and bug the VM if it's seen.
762 	 */
763 	if (is_mirror_sptep(sptep)) {
764 		KVM_BUG_ON(is_shadow_present_pte(new_spte), kvm);
765 		remove_external_spte(kvm, gfn, old_spte, level);
766 	}
767 
768 	return old_spte;
769 }
770 
771 static inline void tdp_mmu_iter_set_spte(struct kvm *kvm, struct tdp_iter *iter,
772 					 u64 new_spte)
773 {
774 	WARN_ON_ONCE(iter->yielded);
775 	iter->old_spte = tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep,
776 					  iter->old_spte, new_spte,
777 					  iter->gfn, iter->level);
778 }
779 
780 #define tdp_root_for_each_pte(_iter, _kvm, _root, _start, _end)	\
781 	for_each_tdp_pte(_iter, _kvm, _root, _start, _end)
782 
783 #define tdp_root_for_each_leaf_pte(_iter, _kvm, _root, _start, _end)	\
784 	tdp_root_for_each_pte(_iter, _kvm, _root, _start, _end)		\
785 		if (!is_shadow_present_pte(_iter.old_spte) ||		\
786 		    !is_last_spte(_iter.old_spte, _iter.level))		\
787 			continue;					\
788 		else
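
/*
 * E.g. __kvm_tdp_mmu_age_gfn_range() below walks only the present leaf SPTEs
 * of a memslot range:
 *
 *	tdp_root_for_each_leaf_pte(iter, kvm, root, range->start, range->end) {
 *		...
 *	}
 */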
789 
790 static inline bool __must_check tdp_mmu_iter_need_resched(struct kvm *kvm,
791 							  struct tdp_iter *iter)
792 {
793 	if (!need_resched() && !rwlock_needbreak(&kvm->mmu_lock))
794 		return false;
795 
796 	/* Ensure forward progress has been made before yielding. */
797 	return iter->next_last_level_gfn != iter->yielded_gfn;
798 }
799 
800 /*
801  * Yield if the MMU lock is contended or this thread needs to return control
802  * to the scheduler.
803  *
804  * If this function should yield and flush is set, it will perform a remote
805  * TLB flush before yielding.
806  *
807  * If this function yields, iter->yielded is set and the caller must skip to
808  * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk
809  * over the paging structures to allow the iterator to continue its traversal
810  * from the paging structure root.
811  *
812  * Returns true if this function yielded.
813  */
814 static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
815 							  struct tdp_iter *iter,
816 							  bool flush, bool shared)
817 {
818 	KVM_MMU_WARN_ON(iter->yielded);
819 
820 	if (!tdp_mmu_iter_need_resched(kvm, iter))
821 		return false;
822 
823 	if (flush)
824 		kvm_flush_remote_tlbs(kvm);
825 
826 	rcu_read_unlock();
827 
828 	if (shared)
829 		cond_resched_rwlock_read(&kvm->mmu_lock);
830 	else
831 		cond_resched_rwlock_write(&kvm->mmu_lock);
832 
833 	rcu_read_lock();
834 
835 	WARN_ON_ONCE(iter->gfn > iter->next_last_level_gfn);
836 
837 	iter->yielded = true;
838 	return true;
839 }
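
/*
 * The canonical caller pattern, as in tdp_mmu_zap_leafs() below, is:
 *
 *	if (can_yield &&
 *	    tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
 *		flush = false;
 *		continue;
 *	}
 *
 * i.e. skip the (restarted) iteration and clear any pending TLB flush, which
 * tdp_mmu_iter_cond_resched() performed on the caller's behalf.
 */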
840 
841 static inline gfn_t tdp_mmu_max_gfn_exclusive(void)
842 {
843 	/*
844 	 * Bound TDP MMU walks at host.MAXPHYADDR.  KVM disallows memslots with
845 	 * a gpa range that would exceed the max gfn, and KVM does not create
846 	 * MMIO SPTEs for "impossible" gfns, instead sending such accesses down
847 	 * the slow emulation path every time.
848 	 */
849 	return kvm_mmu_max_gfn() + 1;
850 }
851 
852 static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
853 			       bool shared, int zap_level)
854 {
855 	struct tdp_iter iter;
856 
857 	for_each_tdp_pte_min_level_all(iter, root, zap_level) {
858 retry:
859 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
860 			continue;
861 
862 		if (!is_shadow_present_pte(iter.old_spte))
863 			continue;
864 
865 		if (iter.level > zap_level)
866 			continue;
867 
868 		if (!shared)
869 			tdp_mmu_iter_set_spte(kvm, &iter, SHADOW_NONPRESENT_VALUE);
870 		else if (tdp_mmu_set_spte_atomic(kvm, &iter, SHADOW_NONPRESENT_VALUE))
871 			goto retry;
872 	}
873 }
874 
875 static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
876 			     bool shared)
877 {
878 
879 	/*
880 	 * The root must have an elevated refcount so that it's reachable via
881 	 * mmu_notifier callbacks, which allows this path to yield and drop
882 	 * mmu_lock.  When handling an unmap/release mmu_notifier command, KVM
883 	 * must drop all references to relevant pages prior to completing the
884 	 * callback.  Dropping mmu_lock with an unreachable root would result
885 	 * in zapping SPTEs after a relevant mmu_notifier callback completes
886 	 * and lead to use-after-free as zapping a SPTE triggers "writeback" of
887 	 * dirty accessed bits to the SPTE's associated struct page.
888 	 */
889 	WARN_ON_ONCE(!refcount_read(&root->tdp_mmu_root_count));
890 
891 	kvm_lockdep_assert_mmu_lock_held(kvm, shared);
892 
893 	rcu_read_lock();
894 
895 	/*
896 	 * Zap roots in multiple passes of decreasing granularity, i.e. zap at
897 	 * 4KiB=>2MiB=>1GiB=>root, in order to better honor need_resched() (all
898 	 * preempt models) or mmu_lock contention (full or real-time models).
899 	 * Zapping at finer granularity marginally increases the total time of
900 	 * the zap, but in most cases the zap itself isn't latency sensitive.
901 	 *
902 	 * If KVM is configured to prove the MMU, skip the 4KiB and 2MiB zaps
903 	 * in order to mimic the page fault path, which can replace a 1GiB page
904 	 * table with an equivalent 1GiB hugepage, i.e. can get saddled with
905 	 * zapping a 1GiB region that's fully populated with 4KiB SPTEs.  This
906 	 * allows verifying that KVM can safely zap 1GiB regions, e.g. without
907 	 * inducing RCU stalls, without relying on a relatively rare event
908 	 * (zapping roots is orders of magnitude more common).  Note, because
909 	 * zapping a SP recurses on its children, stepping down to PG_LEVEL_4K
910 	 * in the iterator itself is unnecessary.
911 	 */
912 	if (!IS_ENABLED(CONFIG_KVM_PROVE_MMU)) {
913 		__tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_4K);
914 		__tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_2M);
915 	}
916 	__tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G);
917 	__tdp_mmu_zap_root(kvm, root, shared, root->role.level);
918 
919 	rcu_read_unlock();
920 }
921 
922 bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
923 {
924 	u64 old_spte;
925 
926 	/*
927 	 * This helper intentionally doesn't allow zapping a root shadow page,
928 	 * which doesn't have a parent page table and thus no associated entry.
929 	 */
930 	if (WARN_ON_ONCE(!sp->ptep))
931 		return false;
932 
933 	old_spte = kvm_tdp_mmu_read_spte(sp->ptep);
934 	if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
935 		return false;
936 
937 	tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte,
938 			 SHADOW_NONPRESENT_VALUE, sp->gfn, sp->role.level + 1);
939 
940 	return true;
941 }
942 
943 /*
944  * If can_yield is true, will release the MMU lock and reschedule if the
945  * scheduler needs the CPU or there is contention on the MMU lock. If this
946  * function cannot yield, it will not release the MMU lock or reschedule and
947  * the caller must ensure it does not supply too large a GFN range, or the
948  * operation can cause a soft lockup.
949  */
950 static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
951 			      gfn_t start, gfn_t end, bool can_yield, bool flush)
952 {
953 	struct tdp_iter iter;
954 
955 	end = min(end, tdp_mmu_max_gfn_exclusive());
956 
957 	lockdep_assert_held_write(&kvm->mmu_lock);
958 
959 	rcu_read_lock();
960 
961 	for_each_tdp_pte_min_level(iter, kvm, root, PG_LEVEL_4K, start, end) {
962 		if (can_yield &&
963 		    tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
964 			flush = false;
965 			continue;
966 		}
967 
968 		if (!is_shadow_present_pte(iter.old_spte) ||
969 		    !is_last_spte(iter.old_spte, iter.level))
970 			continue;
971 
972 		tdp_mmu_iter_set_spte(kvm, &iter, SHADOW_NONPRESENT_VALUE);
973 
974 		/*
975 		 * Zapping SPTEs in invalid roots doesn't require a TLB flush,
976 		 * see kvm_tdp_mmu_zap_invalidated_roots() for details.
977 		 */
978 		if (!root->role.invalid)
979 			flush = true;
980 	}
981 
982 	rcu_read_unlock();
983 
984 	/*
985 	 * Because this flow zaps _only_ leaf SPTEs, the caller doesn't need
986 	 * to provide RCU protection as no 'struct kvm_mmu_page' will be freed.
987 	 */
988 	return flush;
989 }
990 
991 /*
992  * Zap leaf SPTEs for the range of gfns, [start, end), for all *valid* roots.
993  * Returns true if a TLB flush is needed before releasing the MMU lock, i.e. if
994  * one or more SPTEs were zapped since the MMU lock was last acquired.
995  */
996 bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush)
997 {
998 	struct kvm_mmu_page *root;
999 
1000 	lockdep_assert_held_write(&kvm->mmu_lock);
1001 	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, -1)
1002 		flush = tdp_mmu_zap_leafs(kvm, root, start, end, true, flush);
1003 
1004 	return flush;
1005 }
1006 
1007 void kvm_tdp_mmu_zap_all(struct kvm *kvm)
1008 {
1009 	struct kvm_mmu_page *root;
1010 
1011 	/*
1012 	 * Zap all direct roots, including invalid direct roots, as all direct
1013 	 * SPTEs must be dropped before returning to the caller. For TDX, mirror
1014 	 * roots don't need handling in response to the mmu notifier (the caller).
1015 	 *
1016 	 * Zap directly even if the root is also being zapped by a concurrent
1017 	 * "fast zap".  Walking zapped top-level SPTEs isn't all that expensive
1018 	 * and mmu_lock is already held, which means the other thread has yielded.
1019 	 *
1020 	 * A TLB flush is unnecessary, KVM zaps everything if and only if the VM
1021 	 * is being destroyed or the userspace VMM has exited.  In both cases,
1022 	 * KVM_RUN is unreachable, i.e. no vCPUs will ever service the request.
1023 	 */
1024 	lockdep_assert_held_write(&kvm->mmu_lock);
1025 	__for_each_tdp_mmu_root_yield_safe(kvm, root, -1,
1026 					   KVM_DIRECT_ROOTS | KVM_INVALID_ROOTS)
1027 		tdp_mmu_zap_root(kvm, root, false);
1028 }
1029 
1030 /*
1031  * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast
1032  * zap" completes.
1033  */
1034 void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm, bool shared)
1035 {
1036 	struct kvm_mmu_page *root;
1037 
1038 	if (shared)
1039 		read_lock(&kvm->mmu_lock);
1040 	else
1041 		write_lock(&kvm->mmu_lock);
1042 
1043 	for_each_tdp_mmu_root_yield_safe(kvm, root) {
1044 		if (!root->tdp_mmu_scheduled_root_to_zap)
1045 			continue;
1046 
1047 		root->tdp_mmu_scheduled_root_to_zap = false;
1048 		KVM_BUG_ON(!root->role.invalid, kvm);
1049 
1050 		/*
1051 		 * A TLB flush is not necessary as KVM performs a local TLB
1052 		 * flush when allocating a new root (see kvm_mmu_load()), and
1053 		 * when migrating a vCPU to a different pCPU.  Note, the local
1054 		 * TLB flush on reuse also invalidates paging-structure-cache
1055 		 * entries, i.e. TLB entries for intermediate paging structures,
1056 		 * that may be zapped, as such entries are associated with the
1057 		 * ASID on both VMX and SVM.
1058 		 */
1059 		tdp_mmu_zap_root(kvm, root, shared);
1060 
1061 		/*
1062 		 * The reference needs to be put *after* zapping the root, as
1063 		 * the root must be reachable by mmu_notifiers while it's being
1064 		 * zapped.
1065 		 */
1066 		kvm_tdp_mmu_put_root(kvm, root);
1067 	}
1068 
1069 	if (shared)
1070 		read_unlock(&kvm->mmu_lock);
1071 	else
1072 		write_unlock(&kvm->mmu_lock);
1073 }
1074 
1075 /*
1076  * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
1077  * is about to be zapped, e.g. in response to a memslots update.  The actual
1078  * zapping is done separately so that it happens with mmu_lock held for read,
1079  * whereas invalidating roots must be done with mmu_lock held for write (unless
1080  * the VM is being destroyed).
1081  *
1082  * Note, kvm_tdp_mmu_zap_invalidated_roots() is gifted the TDP MMU's reference.
1083  * See kvm_tdp_mmu_alloc_root().
1084  */
1085 void kvm_tdp_mmu_invalidate_roots(struct kvm *kvm,
1086 				  enum kvm_tdp_mmu_root_types root_types)
1087 {
1088 	struct kvm_mmu_page *root;
1089 
1090 	/*
1091 	 * Invalidating invalid roots doesn't make sense; prevent developers from
1092 	 * having to think about it.
1093 	 */
1094 	if (WARN_ON_ONCE(root_types & KVM_INVALID_ROOTS))
1095 		root_types &= ~KVM_INVALID_ROOTS;
1096 
1097 	/*
1098 	 * mmu_lock must be held for write to ensure that a root doesn't become
1099 	 * invalid while there are active readers (invalidating a root while
1100 	 * there are active readers may or may not be problematic in practice,
1101 	 * but it's uncharted territory and not supported).
1102 	 *
1103 	 * Waive the assertion if there are no users of @kvm, i.e. the VM is
1104 	 * being destroyed after all references have been put, or if no vCPUs
1105 	 * have been created (which means there are no roots), i.e. the VM is
1106 	 * being destroyed in an error path of KVM_CREATE_VM.
1107 	 */
1108 	if (IS_ENABLED(CONFIG_PROVE_LOCKING) &&
1109 	    refcount_read(&kvm->users_count) && kvm->created_vcpus)
1110 		lockdep_assert_held_write(&kvm->mmu_lock);
1111 
1112 	/*
1113 	 * As above, mmu_lock isn't held when destroying the VM!  There can't
1114 	 * be other references to @kvm, i.e. nothing else can invalidate roots
1115 	 * or get/put references to roots.
1116 	 */
1117 	list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
1118 		if (!tdp_mmu_root_match(root, root_types))
1119 			continue;
1120 
1121 		/*
1122 		 * Note, invalid roots can outlive a memslot update!  Invalid
1123 		 * roots must be *zapped* before the memslot update completes,
1124 		 * but a different task can acquire a reference and keep the
1125 		 * root alive after it's been zapped.
1126 		 */
1127 		if (!root->role.invalid) {
1128 			root->tdp_mmu_scheduled_root_to_zap = true;
1129 			root->role.invalid = true;
1130 		}
1131 	}
1132 }
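
/*
 * E.g. kvm_mmu_uninit_tdp_mmu() above pairs the two halves as:
 *
 *	kvm_tdp_mmu_invalidate_roots(kvm, KVM_VALID_ROOTS);
 *	kvm_tdp_mmu_zap_invalidated_roots(kvm, false);
 *
 * while flows that hold a reference to @kvm invalidate under mmu_lock held
 * for write and then zap with mmu_lock held for read (shared == true).
 */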
1133 
1134 /*
1135  * Installs a last-level SPTE to handle a TDP page fault.
1136  * (NPT/EPT violation/misconfiguration)
1137  */
1138 static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
1139 					  struct kvm_page_fault *fault,
1140 					  struct tdp_iter *iter)
1141 {
1142 	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
1143 	u64 new_spte;
1144 	int ret = RET_PF_FIXED;
1145 	bool wrprot = false;
1146 
1147 	if (WARN_ON_ONCE(sp->role.level != fault->goal_level))
1148 		return RET_PF_RETRY;
1149 
1150 	if (fault->prefetch && is_shadow_present_pte(iter->old_spte))
1151 		return RET_PF_SPURIOUS;
1152 
1153 	if (is_shadow_present_pte(iter->old_spte) &&
1154 	    is_access_allowed(fault, iter->old_spte) &&
1155 	    is_last_spte(iter->old_spte, iter->level))
1156 		return RET_PF_SPURIOUS;
1157 
1158 	if (unlikely(!fault->slot))
1159 		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
1160 	else
1161 		wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
1162 				   fault->pfn, iter->old_spte, fault->prefetch,
1163 				   false, fault->map_writable, &new_spte);
1164 
1165 	if (new_spte == iter->old_spte)
1166 		ret = RET_PF_SPURIOUS;
1167 	else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
1168 		return RET_PF_RETRY;
1169 	else if (is_shadow_present_pte(iter->old_spte) &&
1170 		 (!is_last_spte(iter->old_spte, iter->level) ||
1171 		  WARN_ON_ONCE(leaf_spte_change_needs_tlb_flush(iter->old_spte, new_spte))))
1172 		kvm_flush_remote_tlbs_gfn(vcpu->kvm, iter->gfn, iter->level);
1173 
1174 	/*
1175 	 * If the page fault was caused by a write but the page is write
1176 	 * protected, emulation is needed. If the emulation was skipped,
1177 	 * the vCPU would have the same fault again.
1178 	 */
1179 	if (wrprot && fault->write)
1180 		ret = RET_PF_WRITE_PROTECTED;
1181 
1182 	/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
1183 	if (unlikely(is_mmio_spte(vcpu->kvm, new_spte))) {
1184 		vcpu->stat.pf_mmio_spte_created++;
1185 		trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
1186 				     new_spte);
1187 		ret = RET_PF_EMULATE;
1188 	} else {
1189 		trace_kvm_mmu_set_spte(iter->level, iter->gfn,
1190 				       rcu_dereference(iter->sptep));
1191 	}
1192 
1193 	return ret;
1194 }
1195 
1196 /*
1197  * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the
1198  * provided page table.
1199  *
1200  * @kvm: kvm instance
1201  * @iter: a tdp_iter instance currently on the SPTE that should be set
1202  * @sp: The new TDP page table to install.
1203  * @shared: This operation is running under the MMU lock in read mode.
1204  *
1205  * Returns: 0 if the new page table was installed. Non-0 if the page table
1206  *          could not be installed (e.g. the atomic compare-exchange failed).
1207  */
1208 static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
1209 			   struct kvm_mmu_page *sp, bool shared)
1210 {
1211 	u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled);
1212 	int ret = 0;
1213 
1214 	if (shared) {
1215 		ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
1216 		if (ret)
1217 			return ret;
1218 	} else {
1219 		tdp_mmu_iter_set_spte(kvm, iter, spte);
1220 	}
1221 
1222 	tdp_account_mmu_page(kvm, sp);
1223 
1224 	return 0;
1225 }
1226 
1227 static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
1228 				   struct kvm_mmu_page *sp, bool shared);
1229 
1230 /*
1231  * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
1232  * page tables and SPTEs to translate the faulting guest physical address.
1233  */
1234 int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
1235 {
1236 	struct kvm_mmu_page *root = tdp_mmu_get_root_for_fault(vcpu, fault);
1237 	struct kvm *kvm = vcpu->kvm;
1238 	struct tdp_iter iter;
1239 	struct kvm_mmu_page *sp;
1240 	int ret = RET_PF_RETRY;
1241 
1242 	kvm_mmu_hugepage_adjust(vcpu, fault);
1243 
1244 	trace_kvm_mmu_spte_requested(fault);
1245 
1246 	rcu_read_lock();
1247 
1248 	for_each_tdp_pte(iter, kvm, root, fault->gfn, fault->gfn + 1) {
1249 		int r;
1250 
1251 		if (fault->nx_huge_page_workaround_enabled)
1252 			disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);
1253 
1254 		/*
1255 		 * If SPTE has been frozen by another thread, just give up and
1256 		 * retry, avoiding unnecessary page table allocation and free.
1257 		 */
1258 		if (is_frozen_spte(iter.old_spte))
1259 			goto retry;
1260 
1261 		if (iter.level == fault->goal_level)
1262 			goto map_target_level;
1263 
1264 		/* Step down into the lower level page table if it exists. */
1265 		if (is_shadow_present_pte(iter.old_spte) &&
1266 		    !is_large_pte(iter.old_spte))
1267 			continue;
1268 
1269 		/*
1270 		 * The SPTE is either non-present or points to a huge page that
1271 		 * needs to be split.
1272 		 */
1273 		sp = tdp_mmu_alloc_sp(vcpu);
1274 		tdp_mmu_init_child_sp(sp, &iter);
1275 		if (is_mirror_sp(sp))
1276 			kvm_mmu_alloc_external_spt(vcpu, sp);
1277 
1278 		sp->nx_huge_page_disallowed = fault->huge_page_disallowed;
1279 
1280 		if (is_shadow_present_pte(iter.old_spte)) {
1281 			/* Don't support large pages for mirrored roots (TDX). */
1282 			KVM_BUG_ON(is_mirror_sptep(iter.sptep), vcpu->kvm);
1283 			r = tdp_mmu_split_huge_page(kvm, &iter, sp, true);
1284 		} else {
1285 			r = tdp_mmu_link_sp(kvm, &iter, sp, true);
1286 		}
1287 
1288 		/*
1289 		 * Force the guest to retry if installing an upper level SPTE
1290 		 * failed, e.g. because a different task modified the SPTE.
1291 		 */
1292 		if (r) {
1293 			tdp_mmu_free_sp(sp);
1294 			goto retry;
1295 		}
1296 
1297 		if (fault->huge_page_disallowed &&
1298 		    fault->req_level >= iter.level) {
1299 			spin_lock(&kvm->arch.tdp_mmu_pages_lock);
1300 			if (sp->nx_huge_page_disallowed)
1301 				track_possible_nx_huge_page(kvm, sp);
1302 			spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
1303 		}
1304 	}
1305 
1306 	/*
1307 	 * The walk aborted before reaching the target level, e.g. because the
1308 	 * iterator detected an upper level SPTE was frozen during traversal.
1309 	 */
1310 	WARN_ON_ONCE(iter.level == fault->goal_level);
1311 	goto retry;
1312 
1313 map_target_level:
1314 	ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
1315 
1316 retry:
1317 	rcu_read_unlock();
1318 	return ret;
1319 }
1320 
1321 /* Used by mmu notifier via kvm_unmap_gfn_range() */
1322 bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
1323 				 bool flush)
1324 {
1325 	enum kvm_tdp_mmu_root_types types;
1326 	struct kvm_mmu_page *root;
1327 
1328 	types = kvm_gfn_range_filter_to_root_types(kvm, range->attr_filter) | KVM_INVALID_ROOTS;
1329 
1330 	__for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, types)
1331 		flush = tdp_mmu_zap_leafs(kvm, root, range->start, range->end,
1332 					  range->may_block, flush);
1333 
1334 	return flush;
1335 }
1336 
1337 /*
1338  * Mark the SPTEs mapping GFNs in the range [start, end) as unaccessed and
1339  * return non-zero if any of the GFNs in the range have been accessed.
1340  *
1341  * No need to mark the corresponding PFN as accessed as this call is coming
1342  * from the clear_young() or clear_flush_young() notifier, which uses the
1343  * return value to determine if the page has been accessed.
1344  */
1345 static void kvm_tdp_mmu_age_spte(struct kvm *kvm, struct tdp_iter *iter)
1346 {
1347 	u64 new_spte;
1348 
1349 	if (spte_ad_enabled(iter->old_spte)) {
1350 		iter->old_spte = tdp_mmu_clear_spte_bits_atomic(iter->sptep,
1351 								shadow_accessed_mask);
1352 		new_spte = iter->old_spte & ~shadow_accessed_mask;
1353 	} else {
1354 		new_spte = mark_spte_for_access_track(iter->old_spte);
1355 		/*
1356 		 * It is safe for the following cmpxchg to fail. Leave the
1357 		 * Accessed bit set, as the spte is most likely young anyway.
1358 		 */
1359 		if (__tdp_mmu_set_spte_atomic(kvm, iter, new_spte))
1360 			return;
1361 	}
1362 
1363 	trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level,
1364 				       iter->old_spte, new_spte);
1365 }
1366 
1367 static bool __kvm_tdp_mmu_age_gfn_range(struct kvm *kvm,
1368 					struct kvm_gfn_range *range,
1369 					bool test_only)
1370 {
1371 	enum kvm_tdp_mmu_root_types types;
1372 	struct kvm_mmu_page *root;
1373 	struct tdp_iter iter;
1374 	bool ret = false;
1375 
1376 	types = kvm_gfn_range_filter_to_root_types(kvm, range->attr_filter);
1377 
1378 	/*
1379 	 * Don't support rescheduling, none of the MMU notifiers that funnel
1380 	 * into this helper allow blocking; it'd be dead, wasteful code.  Note,
1381 	 * this helper must NOT be used to unmap GFNs, as it processes only
1382 	 * valid roots!
1383 	 */
1384 	WARN_ON(types & ~KVM_VALID_ROOTS);
1385 
1386 	guard(rcu)();
1387 	for_each_tdp_mmu_root_rcu(kvm, root, range->slot->as_id, types) {
1388 		tdp_root_for_each_leaf_pte(iter, kvm, root, range->start, range->end) {
1389 			if (!is_accessed_spte(iter.old_spte))
1390 				continue;
1391 
1392 			if (test_only)
1393 				return true;
1394 
1395 			ret = true;
1396 			kvm_tdp_mmu_age_spte(kvm, &iter);
1397 		}
1398 	}
1399 
1400 	return ret;
1401 }
1402 
1403 bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
1404 {
1405 	return __kvm_tdp_mmu_age_gfn_range(kvm, range, false);
1406 }
1407 
1408 bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1409 {
1410 	return __kvm_tdp_mmu_age_gfn_range(kvm, range, true);
1411 }
1412 
1413 /*
1414  * Remove write access from all SPTEs at or above min_level that map GFNs
1415  * [start, end). Returns true if an SPTE has been changed and the TLBs need to
1416  * be flushed.
1417  */
1418 static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1419 			     gfn_t start, gfn_t end, int min_level)
1420 {
1421 	struct tdp_iter iter;
1422 	u64 new_spte;
1423 	bool spte_set = false;
1424 
1425 	rcu_read_lock();
1426 
1427 	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1428 
1429 	for_each_tdp_pte_min_level(iter, kvm, root, min_level, start, end) {
1430 retry:
1431 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1432 			continue;
1433 
1434 		if (!is_shadow_present_pte(iter.old_spte) ||
1435 		    !is_last_spte(iter.old_spte, iter.level) ||
1436 		    !(iter.old_spte & PT_WRITABLE_MASK))
1437 			continue;
1438 
1439 		new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1440 
1441 		if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
1442 			goto retry;
1443 
1444 		spte_set = true;
1445 	}
1446 
1447 	rcu_read_unlock();
1448 	return spte_set;
1449 }
1450 
1451 /*
1452  * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
1453  * only affect leaf SPTEs down to min_level.
1454  * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1455  */
1456 bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
1457 			     const struct kvm_memory_slot *slot, int min_level)
1458 {
1459 	struct kvm_mmu_page *root;
1460 	bool spte_set = false;
1461 
1462 	lockdep_assert_held_read(&kvm->mmu_lock);
1463 
1464 	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
1465 		spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
1466 			     slot->base_gfn + slot->npages, min_level);
1467 
1468 	return spte_set;
1469 }
1470 
1471 static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(void)
1472 {
1473 	struct kvm_mmu_page *sp;
1474 
1475 	sp = kmem_cache_zalloc(mmu_page_header_cache, GFP_KERNEL_ACCOUNT);
1476 	if (!sp)
1477 		return NULL;
1478 
1479 	sp->spt = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
1480 	if (!sp->spt) {
1481 		kmem_cache_free(mmu_page_header_cache, sp);
1482 		return NULL;
1483 	}
1484 
1485 	return sp;
1486 }
1487 
1488 /* Note, the caller is responsible for initializing @sp. */
1489 static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
1490 				   struct kvm_mmu_page *sp, bool shared)
1491 {
1492 	const u64 huge_spte = iter->old_spte;
1493 	const int level = iter->level;
1494 	int ret, i;
1495 
1496 	/*
1497 	 * No need for atomics when writing to sp->spt since the page table has
1498 	 * not been linked in yet and thus is not reachable from any other CPU.
1499 	 */
1500 	for (i = 0; i < SPTE_ENT_PER_PAGE; i++)
1501 		sp->spt[i] = make_small_spte(kvm, huge_spte, sp->role, i);
1502 
1503 	/*
1504 	 * Replace the huge spte with a pointer to the populated lower level
1505 	 * page table. Since we are making this change without a TLB flush, vCPUs
1506 	 * will see a mix of the split mappings and the original huge mapping,
1507 	 * depending on what's currently in their TLB. This is fine from a
1508 	 * correctness standpoint since the translation will be the same either
1509 	 * way.
1510 	 */
1511 	ret = tdp_mmu_link_sp(kvm, iter, sp, shared);
1512 	if (ret)
1513 		goto out;
1514 
1515 	/*
1516 	 * tdp_mmu_link_sp() will handle subtracting the huge page we
1517 	 * are overwriting from the page stats. But we have to manually update
1518 	 * the page stats with the new present child pages.
1519 	 */
1520 	kvm_update_page_stats(kvm, level - 1, SPTE_ENT_PER_PAGE);
1521 
1522 out:
1523 	trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret);
1524 	return ret;
1525 }
1526 
1527 static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
1528 					 struct kvm_mmu_page *root,
1529 					 gfn_t start, gfn_t end,
1530 					 int target_level, bool shared)
1531 {
1532 	struct kvm_mmu_page *sp = NULL;
1533 	struct tdp_iter iter;
1534 
1535 	rcu_read_lock();
1536 
1537 	/*
1538 	 * Traverse the page table splitting all huge pages above the target
1539 	 * level into one lower level. For example, if we encounter a 1GB page
1540 	 * we split it into 512 2MB pages.
1541 	 *
1542 	 * Since the TDP iterator uses a pre-order traversal, we are guaranteed
1543 	 * to visit an SPTE before ever visiting its children, which means we
1544 	 * will correctly recursively split huge pages that are more than one
1545 	 * level above the target level (e.g. splitting a 1GB to 512 2MB pages,
1546 	 * and then splitting each of those to 512 4KB pages).
1547 	 */
1548 	for_each_tdp_pte_min_level(iter, kvm, root, target_level + 1, start, end) {
1549 retry:
1550 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
1551 			continue;
1552 
1553 		if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
1554 			continue;
1555 
1556 		if (!sp) {
1557 			rcu_read_unlock();
1558 
1559 			if (shared)
1560 				read_unlock(&kvm->mmu_lock);
1561 			else
1562 				write_unlock(&kvm->mmu_lock);
1563 
1564 			sp = tdp_mmu_alloc_sp_for_split();
1565 
1566 			if (shared)
1567 				read_lock(&kvm->mmu_lock);
1568 			else
1569 				write_lock(&kvm->mmu_lock);
1570 
1571 			if (!sp) {
1572 				trace_kvm_mmu_split_huge_page(iter.gfn,
1573 							      iter.old_spte,
1574 							      iter.level, -ENOMEM);
1575 				return -ENOMEM;
1576 			}
1577 
1578 			rcu_read_lock();
1579 
1580 			iter.yielded = true;
1581 			continue;
1582 		}
1583 
1584 		tdp_mmu_init_child_sp(sp, &iter);
1585 
1586 		if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
1587 			goto retry;
1588 
1589 		sp = NULL;
1590 	}
1591 
1592 	rcu_read_unlock();
1593 
1594 	/*
1595 	 * It's possible to exit the loop having never used the last sp if, for
1596 	 * example, a vCPU doing HugePage NX splitting wins the race and
1597 	 * installs its own sp in place of the last sp we tried to split.
1598 	 */
1599 	if (sp)
1600 		tdp_mmu_free_sp(sp);
1601 
1602 	return 0;
1603 }
1604 
1606 /*
1607  * Try to split all huge pages mapped by the TDP MMU down to the target level.
1608  */
1609 void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
1610 				      const struct kvm_memory_slot *slot,
1611 				      gfn_t start, gfn_t end,
1612 				      int target_level, bool shared)
1613 {
1614 	struct kvm_mmu_page *root;
1615 	int r = 0;
1616 
1617 	kvm_lockdep_assert_mmu_lock_held(kvm, shared);
1618 	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id) {
1619 		r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared);
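		/*
		 * On failure, manually drop the reference on the current root
		 * that was acquired by the yield-safe iterator; breaking out
		 * of the loop bypasses the iterator's own put of the root.
		 */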
1620 		if (r) {
1621 			kvm_tdp_mmu_put_root(kvm, root);
1622 			break;
1623 		}
1624 	}
1625 }
1626 
1627 static bool tdp_mmu_need_write_protect(struct kvm_mmu_page *sp)
1628 {
1629 	/*
1630 	 * All TDP MMU shadow pages share the same role as their root, aside
1631 	 * from level, so it is valid to key off any shadow page to determine if
1632 	 * write protection is needed for an entire tree.
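	 *
	 * The result selects how dirty state is cleared: if write protection
	 * is needed, the writable bit is cleared so that the next write
	 * faults; otherwise clearing the dirty bit is sufficient.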
1633 	 */
1634 	return kvm_mmu_page_ad_need_write_protect(sp) || !kvm_ad_enabled;
1635 }
1636 
1637 static void clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1638 				  gfn_t start, gfn_t end)
1639 {
1640 	const u64 dbit = tdp_mmu_need_write_protect(root) ? PT_WRITABLE_MASK :
1641 							    shadow_dirty_mask;
1642 	struct tdp_iter iter;
1643 
1644 	rcu_read_lock();
1645 
1646 	tdp_root_for_each_pte(iter, kvm, root, start, end) {
1647 retry:
1648 		if (!is_shadow_present_pte(iter.old_spte) ||
1649 		    !is_last_spte(iter.old_spte, iter.level))
1650 			continue;
1651 
1652 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1653 			continue;
1654 
1655 		KVM_MMU_WARN_ON(dbit == shadow_dirty_mask &&
1656 				spte_ad_need_write_protect(iter.old_spte));
1657 
1658 		if (!(iter.old_spte & dbit))
1659 			continue;
1660 
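		/*
		 * Only mmu_lock for read is held, so the SPTE must be updated
		 * with an atomic cmpxchg; retry from the (re-read) SPTE if the
		 * update loses a race with a concurrent modification.
		 */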
1661 		if (tdp_mmu_set_spte_atomic(kvm, &iter, iter.old_spte & ~dbit))
1662 			goto retry;
1663 	}
1664 
1665 	rcu_read_unlock();
1666 }
1667 
1668 /*
1669  * Clear the dirty status (D-bit or W-bit) of all the SPTEs mapping GFNs in the
1670  * memslot.
1671  */
1672 void kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
1673 				  const struct kvm_memory_slot *slot)
1674 {
1675 	struct kvm_mmu_page *root;
1676 
1677 	lockdep_assert_held_read(&kvm->mmu_lock);
1678 	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
1679 		clear_dirty_gfn_range(kvm, root, slot->base_gfn,
1680 				      slot->base_gfn + slot->npages);
1681 }
1682 
1683 static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
1684 				  gfn_t gfn, unsigned long mask, bool wrprot)
1685 {
1686 	const u64 dbit = (wrprot || tdp_mmu_need_write_protect(root)) ? PT_WRITABLE_MASK :
1687 									shadow_dirty_mask;
1688 	struct tdp_iter iter;
1689 
1690 	lockdep_assert_held_write(&kvm->mmu_lock);
1691 
1692 	rcu_read_lock();
1693 
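	/*
	 * @mask covers at most BITS_PER_LONG contiguous 4K pages starting at
	 * @gfn, one bit per page, and the walk stops once every set bit has
	 * been handled.  mmu_lock is held for write, so SPTEs can be modified
	 * without the cmpxchg-and-retry dance required under the read lock.
	 */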
1694 	tdp_root_for_each_leaf_pte(iter, kvm, root, gfn + __ffs(mask),
1695 				    gfn + BITS_PER_LONG) {
1696 		if (!mask)
1697 			break;
1698 
1699 		KVM_MMU_WARN_ON(dbit == shadow_dirty_mask &&
1700 				spte_ad_need_write_protect(iter.old_spte));
1701 
1702 		if (iter.level > PG_LEVEL_4K ||
1703 		    !(mask & (1UL << (iter.gfn - gfn))))
1704 			continue;
1705 
1706 		mask &= ~(1UL << (iter.gfn - gfn));
1707 
1708 		if (!(iter.old_spte & dbit))
1709 			continue;
1710 
1711 		iter.old_spte = tdp_mmu_clear_spte_bits(iter.sptep,
1712 							iter.old_spte, dbit,
1713 							iter.level);
1714 
1715 		trace_kvm_tdp_mmu_spte_changed(iter.as_id, iter.gfn, iter.level,
1716 					       iter.old_spte,
1717 					       iter.old_spte & ~dbit);
1718 	}
1719 
1720 	rcu_read_unlock();
1721 }
1722 
1723 /*
1724  * Clear the dirty status (D-bit or W-bit) of all the 4k SPTEs mapping GFNs for
1725  * which a bit is set in mask, starting at gfn. The given memslot is expected to
1726  * contain all the GFNs represented by set bits in the mask.
1727  */
1728 void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1729 				       struct kvm_memory_slot *slot,
1730 				       gfn_t gfn, unsigned long mask,
1731 				       bool wrprot)
1732 {
1733 	struct kvm_mmu_page *root;
1734 
1735 	for_each_valid_tdp_mmu_root(kvm, root, slot->as_id)
1736 		clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
1737 }
1738 
1739 static int tdp_mmu_make_huge_spte(struct kvm *kvm,
1740 				  struct tdp_iter *parent,
1741 				  u64 *huge_spte)
1742 {
1743 	struct kvm_mmu_page *root = spte_to_child_sp(parent->old_spte);
1744 	gfn_t start = parent->gfn;
1745 	gfn_t end = start + KVM_PAGES_PER_HPAGE(parent->level);
1746 	struct tdp_iter iter;
1747 
1748 	tdp_root_for_each_leaf_pte(iter, kvm, root, start, end) {
1749 		/*
1750 		 * Use the parent iterator when checking for forward progress so
1751 		 * that KVM doesn't get stuck continuously trying to yield (i.e.
1752 		 * returning -EAGAIN here and then failing the forward progress
1753 		 * check in the caller ad nauseam).
1754 		 */
1755 		if (tdp_mmu_iter_need_resched(kvm, parent))
1756 			return -EAGAIN;
1757 
1758 		*huge_spte = make_huge_spte(kvm, iter.old_spte, parent->level);
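		/*
		 * Use the first present leaf SPTE found as the template for
		 * the new huge SPTE.  If no leaf SPTE is found, there is
		 * nothing to coalesce and -ENOENT is returned below.
		 */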
1759 		return 0;
1760 	}
1761 
1762 	return -ENOENT;
1763 }
1764 
1765 static void recover_huge_pages_range(struct kvm *kvm,
1766 				     struct kvm_mmu_page *root,
1767 				     const struct kvm_memory_slot *slot)
1768 {
1769 	gfn_t start = slot->base_gfn;
1770 	gfn_t end = start + slot->npages;
1771 	struct tdp_iter iter;
1772 	int max_mapping_level;
1773 	bool flush = false;
1774 	u64 huge_spte;
1775 	int r;
1776 
1777 	if (WARN_ON_ONCE(kvm_slot_dirty_track_enabled(slot)))
1778 		return;
1779 
1780 	rcu_read_lock();
1781 
1782 	for_each_tdp_pte_min_level(iter, kvm, root, PG_LEVEL_2M, start, end) {
1783 retry:
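		/*
		 * If the iterator yields, it performs the pending TLB flush
		 * (@flush is passed in) before dropping mmu_lock, so the local
		 * flush tracking can be reset.
		 */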
1784 		if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, true)) {
1785 			flush = false;
1786 			continue;
1787 		}
1788 
1789 		if (iter.level > KVM_MAX_HUGEPAGE_LEVEL ||
1790 		    !is_shadow_present_pte(iter.old_spte))
1791 			continue;
1792 
1793 		/*
1794 		 * Don't zap leaf SPTEs; if a leaf SPTE could be replaced with
1795 		 * a larger page size, then its parent would have been zapped
1796 		 * instead of stepping down to the leaf.
1797 		 */
1798 		if (is_last_spte(iter.old_spte, iter.level))
1799 			continue;
1800 
1801 		/*
1802 		 * If iter.gfn resides outside of the slot, i.e. the page for
1803 		 * the current level overlaps but is not contained by the slot,
1804 		 * then the SPTE can't be made huge.  More importantly, trying
1805 		 * to query that info from slot->arch.lpage_info will cause an
1806 		 * out-of-bounds access.
1807 		 */
1808 		if (iter.gfn < start || iter.gfn >= end)
1809 			continue;
1810 
1811 		max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot, iter.gfn);
1812 		if (max_mapping_level < iter.level)
1813 			continue;
1814 
1815 		r = tdp_mmu_make_huge_spte(kvm, &iter, &huge_spte);
1816 		if (r == -EAGAIN)
1817 			goto retry;
1818 		else if (r)
1819 			continue;
1820 
1821 		if (tdp_mmu_set_spte_atomic(kvm, &iter, huge_spte))
1822 			goto retry;
1823 
1824 		flush = true;
1825 	}
1826 
1827 	if (flush)
1828 		kvm_flush_remote_tlbs_memslot(kvm, slot);
1829 
1830 	rcu_read_unlock();
1831 }
1832 
1833 /*
1834  * Recover huge page mappings within the slot by replacing non-leaf SPTEs with
1835  * huge SPTEs where possible.
1836  */
1837 void kvm_tdp_mmu_recover_huge_pages(struct kvm *kvm,
1838 				    const struct kvm_memory_slot *slot)
1839 {
1840 	struct kvm_mmu_page *root;
1841 
1842 	lockdep_assert_held_read(&kvm->mmu_lock);
1843 	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
1844 		recover_huge_pages_range(kvm, root, slot);
1845 }
1846 
1847 /*
1848  * Removes write access on the last level SPTE mapping this GFN and unsets the
1849  * MMU-writable bit to ensure future writes continue to be intercepted.
1850  * Returns true if an SPTE was set and a TLB flush is needed.
1851  */
1852 static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
1853 			      gfn_t gfn, int min_level)
1854 {
1855 	struct tdp_iter iter;
1856 	u64 new_spte;
1857 	bool spte_set = false;
1858 
1859 	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1860 
1861 	rcu_read_lock();
1862 
1863 	for_each_tdp_pte_min_level(iter, kvm, root, min_level, gfn, gfn + 1) {
1864 		if (!is_shadow_present_pte(iter.old_spte) ||
1865 		    !is_last_spte(iter.old_spte, iter.level))
1866 			continue;
1867 
1868 		new_spte = iter.old_spte &
1869 			~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
1870 
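		/* The SPTE is already write-protected, nothing more to do. */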
1871 		if (new_spte == iter.old_spte)
1872 			break;
1873 
1874 		tdp_mmu_iter_set_spte(kvm, &iter, new_spte);
1875 		spte_set = true;
1876 	}
1877 
1878 	rcu_read_unlock();
1879 
1880 	return spte_set;
1881 }
1882 
1883 /*
1884  * Removes write access on the last level SPTE mapping this GFN and unsets the
1885  * MMU-writable bit to ensure future writes continue to be intercepted.
1886  * Returns true if an SPTE was set and a TLB flush is needed.
1887  */
1888 bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
1889 				   struct kvm_memory_slot *slot, gfn_t gfn,
1890 				   int min_level)
1891 {
1892 	struct kvm_mmu_page *root;
1893 	bool spte_set = false;
1894 
1895 	lockdep_assert_held_write(&kvm->mmu_lock);
1896 	for_each_valid_tdp_mmu_root(kvm, root, slot->as_id)
1897 		spte_set |= write_protect_gfn(kvm, root, gfn, min_level);
1898 
1899 	return spte_set;
1900 }
1901 
1902 /*
1903  * Return the level of the lowest level SPTE added to sptes.
1904  * That SPTE may be non-present.
1905  *
1906  * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
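 *
 * Illustrative sketch of a caller (not taken verbatim from any in-tree user;
 * the local variables are for illustration only):
 *
 *	u64 sptes[PT64_ROOT_MAX_LEVEL + 1];
 *	int root_level, leaf;
 *
 *	kvm_tdp_mmu_walk_lockless_begin();
 *	leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root_level);
 *	kvm_tdp_mmu_walk_lockless_end();
 *
 *	if (leaf >= 0)
 *		examine(sptes[leaf]);
 *
 * where examine() stands in for whatever the caller does with the lowest
 * level SPTE reached.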
1907  */
1908 int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
1909 			 int *root_level)
1910 {
1911 	struct kvm_mmu_page *root = root_to_sp(vcpu->arch.mmu->root.hpa);
1912 	struct tdp_iter iter;
1913 	gfn_t gfn = addr >> PAGE_SHIFT;
1914 	int leaf = -1;
1915 
1916 	*root_level = vcpu->arch.mmu->root_role.level;
1917 
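	/*
	 * sptes[] is indexed by level; record the SPTE at every level visited
	 * so that @leaf ends up as the lowest level reached, or stays -1 if
	 * nothing was walked.
	 */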
1918 	for_each_tdp_pte(iter, vcpu->kvm, root, gfn, gfn + 1) {
1919 		leaf = iter.level;
1920 		sptes[leaf] = iter.old_spte;
1921 	}
1922 
1923 	return leaf;
1924 }
1925 
1926 /*
1927  * Returns the last level spte pointer of the shadow page walk for the given
1928  * gpa, and sets *spte to the spte value. This spte may be non-present. If no
1929  * walk could be performed, returns NULL and *spte does not contain valid data.
1930  *
1931  * Contract:
1932  *  - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
1933  *  - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
1934  *
1935  * WARNING: This function is only intended to be called during fast_page_fault.
1936  */
1937 u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gfn_t gfn,
1938 					u64 *spte)
1939 {
1940 	/* Fast page faults are not supported for mirror roots. */
1941 	struct kvm_mmu_page *root = tdp_mmu_get_root(vcpu, KVM_DIRECT_ROOTS);
1942 	struct tdp_iter iter;
1943 	tdp_ptep_t sptep = NULL;
1944 
1945 	for_each_tdp_pte(iter, vcpu->kvm, root, gfn, gfn + 1) {
1946 		*spte = iter.old_spte;
1947 		sptep = iter.sptep;
1948 	}
1949 
1950 	/*
1951 	 * Perform the rcu_dereference to get the raw spte pointer value since
1952 	 * we are passing it up to fast_page_fault, which is shared with the
1953 	 * legacy MMU and thus does not retain the TDP MMU-specific __rcu
1954 	 * annotation.
1955 	 *
1956 	 * This is safe since fast_page_fault obeys the contracts of this
1957 	 * function as well as all TDP MMU contracts around modifying SPTEs
1958 	 * outside of mmu_lock.
1959 	 */
1960 	return rcu_dereference(sptep);
1961 }
1962