xref: /linux/arch/x86/kvm/mmu/tdp_mmu.c (revision 2c1ed907520c50326b8f604907a8478b27881a2e)
1 // SPDX-License-Identifier: GPL-2.0
2 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
3 
4 #include "mmu.h"
5 #include "mmu_internal.h"
6 #include "mmutrace.h"
7 #include "tdp_iter.h"
8 #include "tdp_mmu.h"
9 #include "spte.h"
10 
11 #include <asm/cmpxchg.h>
12 #include <trace/events/kvm.h>
13 
14 /* Initializes the TDP MMU for the VM, if enabled. */
15 void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
16 {
17 	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
18 	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
19 }
20 
21 /* Arbitrarily returns true so that this may be used in if statements. */
22 static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
23 							     bool shared)
24 {
25 	if (shared)
26 		lockdep_assert_held_read(&kvm->mmu_lock);
27 	else
28 		lockdep_assert_held_write(&kvm->mmu_lock);
29 
30 	return true;
31 }
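
/*
 * Returning true (rather than void) allows the assertion to be chained into
 * an if-condition, e.g. __for_each_tdp_mmu_root() below folds this assertion
 * into the body of list_for_each_entry().
 */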
32 
33 void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
34 {
35 	/*
36 	 * Invalidate all roots, which besides the obvious, schedules all roots
37 	 * for zapping and thus puts the TDP MMU's reference to each root, i.e.
38 	 * ultimately frees all roots.
39 	 */
40 	kvm_tdp_mmu_invalidate_roots(kvm, KVM_VALID_ROOTS);
41 	kvm_tdp_mmu_zap_invalidated_roots(kvm, false);
42 
43 	WARN_ON(atomic64_read(&kvm->arch.tdp_mmu_pages));
44 	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
45 
46 	/*
47 	 * Ensure that all the outstanding RCU callbacks to free shadow pages
48 	 * can run before the VM is torn down.  Putting the last reference to
49 	 * zapped roots will create new callbacks.
50 	 */
51 	rcu_barrier();
52 }
53 
54 static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
55 {
56 	free_page((unsigned long)sp->external_spt);
57 	free_page((unsigned long)sp->spt);
58 	kmem_cache_free(mmu_page_header_cache, sp);
59 }
60 
61 /*
62  * This is called through call_rcu in order to free TDP page table memory
63  * safely with respect to other kernel threads that may be operating on
64  * the memory.
65  * By only accessing TDP MMU page table memory in an RCU read critical
66  * section, and freeing it after a grace period, lockless access to that
67  * memory won't use it after it is freed.
68  */
69 static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
70 {
71 	struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
72 					       rcu_head);
73 
74 	tdp_mmu_free_sp(sp);
75 }
76 
77 void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
78 {
79 	if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
80 		return;
81 
82 	/*
83 	 * The TDP MMU itself holds a reference to each root until the root is
84 	 * explicitly invalidated, i.e. the final reference should never be
85 	 * put for a valid root.
86 	 */
87 	KVM_BUG_ON(!is_tdp_mmu_page(root) || !root->role.invalid, kvm);
88 
89 	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
90 	list_del_rcu(&root->link);
91 	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
92 	call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
93 }
94 
95 static bool tdp_mmu_root_match(struct kvm_mmu_page *root,
96 			       enum kvm_tdp_mmu_root_types types)
97 {
98 	if (WARN_ON_ONCE(!(types & KVM_VALID_ROOTS)))
99 		return false;
100 
101 	if (root->role.invalid && !(types & KVM_INVALID_ROOTS))
102 		return false;
103 
104 	if (likely(!is_mirror_sp(root)))
105 		return types & KVM_DIRECT_ROOTS;
106 	return types & KVM_MIRROR_ROOTS;
107 }
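
/*
 * Note that @types must always include at least one valid root type (direct
 * and/or mirror); KVM_INVALID_ROOTS acts as a modifier that additionally lets
 * invalidated roots match, e.g. KVM_DIRECT_ROOTS | KVM_INVALID_ROOTS as used
 * by kvm_tdp_mmu_zap_all() below (the exact enum layout lives in tdp_mmu.h,
 * not shown here).
 */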
108 
109 /*
110  * Returns the next root after @prev_root (or the first root if @prev_root is
111  * NULL) that matches with @types.  A reference to the returned root is
112  * acquired, and the reference to @prev_root is released (the caller obviously
113  * must hold a reference to @prev_root if it's non-NULL).
114  *
115  * Roots that don't match @types are skipped.
116  *
117  * Returns NULL if the end of tdp_mmu_roots was reached.
118  */
119 static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
120 					      struct kvm_mmu_page *prev_root,
121 					      enum kvm_tdp_mmu_root_types types)
122 {
123 	struct kvm_mmu_page *next_root;
124 
125 	/*
126 	 * While the roots themselves are RCU-protected, fields such as
127 	 * role.invalid are protected by mmu_lock.
128 	 */
129 	lockdep_assert_held(&kvm->mmu_lock);
130 
131 	rcu_read_lock();
132 
133 	if (prev_root)
134 		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
135 						  &prev_root->link,
136 						  typeof(*prev_root), link);
137 	else
138 		next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
139 						   typeof(*next_root), link);
140 
141 	while (next_root) {
142 		if (tdp_mmu_root_match(next_root, types) &&
143 		    kvm_tdp_mmu_get_root(next_root))
144 			break;
145 
146 		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
147 				&next_root->link, typeof(*next_root), link);
148 	}
149 
150 	rcu_read_unlock();
151 
152 	if (prev_root)
153 		kvm_tdp_mmu_put_root(kvm, prev_root);
154 
155 	return next_root;
156 }
157 
158 /*
159  * Note: this iterator gets and puts references to the roots it iterates over.
160  * This makes it safe to release the MMU lock and yield within the loop, but
161  * if exiting the loop early, the caller must drop the reference to the most
162  * recent root. (Unless keeping a live reference is desirable.)
163  *
164  * If shared is set, this function is operating under the MMU lock in read
165  * mode.
166  */
167 #define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _types)	\
168 	for (_root = tdp_mmu_next_root(_kvm, NULL, _types);		\
169 	     ({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root;		\
170 	     _root = tdp_mmu_next_root(_kvm, _root, _types))		\
171 		if (_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) {	\
172 		} else
173 
174 #define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id)	\
175 	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, KVM_VALID_ROOTS)
176 
177 #define for_each_tdp_mmu_root_yield_safe(_kvm, _root)			\
178 	for (_root = tdp_mmu_next_root(_kvm, NULL, KVM_ALL_ROOTS);		\
179 	     ({ lockdep_assert_held(&(_kvm)->mmu_lock); }), _root;	\
180 	     _root = tdp_mmu_next_root(_kvm, _root, KVM_ALL_ROOTS))
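
/*
 * Typical usage, as in kvm_tdp_mmu_zap_leafs() further down: the yield-safe
 * iterators get/put root references internally, so the loop body is allowed
 * to temporarily drop mmu_lock, e.g. via tdp_mmu_iter_cond_resched():
 *
 *	lockdep_assert_held_write(&kvm->mmu_lock);
 *	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, -1)
 *		flush = tdp_mmu_zap_leafs(kvm, root, start, end, true, flush);
 */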
181 
182 /*
183  * Iterate over all TDP MMU roots.  Requires that mmu_lock be held for write,
184  * the implication being that any flow that holds mmu_lock for read is
185  * inherently yield-friendly and should use the yield-safe variant above.
186  * Holding mmu_lock for write obviates the need for RCU protection as the list
187  * is guaranteed to be stable.
188  */
189 #define __for_each_tdp_mmu_root(_kvm, _root, _as_id, _types)			\
190 	list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)		\
191 		if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) &&		\
192 		    ((_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) ||	\
193 		     !tdp_mmu_root_match((_root), (_types)))) {			\
194 		} else
195 
196 #define for_each_valid_tdp_mmu_root(_kvm, _root, _as_id)		\
197 	__for_each_tdp_mmu_root(_kvm, _root, _as_id, KVM_VALID_ROOTS)
198 
199 static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
200 {
201 	struct kvm_mmu_page *sp;
202 
203 	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
204 	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
205 
206 	return sp;
207 }
208 
209 static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
210 			    gfn_t gfn, union kvm_mmu_page_role role)
211 {
212 	INIT_LIST_HEAD(&sp->possible_nx_huge_page_link);
213 
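	/*
	 * Stash the shadow page in the underlying struct page so that
	 * sptep_to_sp(), used throughout this file, can map a raw SPTE
	 * pointer back to its owning struct kvm_mmu_page.
	 */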
214 	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
215 
216 	sp->role = role;
217 	sp->gfn = gfn;
218 	sp->ptep = sptep;
219 	sp->tdp_mmu_page = true;
220 
221 	trace_kvm_mmu_get_page(sp, true);
222 }
223 
224 static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
225 				  struct tdp_iter *iter)
226 {
227 	struct kvm_mmu_page *parent_sp;
228 	union kvm_mmu_page_role role;
229 
230 	parent_sp = sptep_to_sp(rcu_dereference(iter->sptep));
231 
232 	role = parent_sp->role;
233 	role.level--;
234 
235 	tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
236 }
237 
238 void kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu, bool mirror)
239 {
240 	struct kvm_mmu *mmu = vcpu->arch.mmu;
241 	union kvm_mmu_page_role role = mmu->root_role;
242 	int as_id = kvm_mmu_role_as_id(role);
243 	struct kvm *kvm = vcpu->kvm;
244 	struct kvm_mmu_page *root;
245 
246 	if (mirror)
247 		role.is_mirror = true;
248 
249 	/*
250 	 * Check for an existing root before acquiring the pages lock to avoid
251 	 * unnecessary serialization if multiple vCPUs are loading a new root.
252 	 * E.g. when bringing up secondary vCPUs, KVM will already have created
253 	 * a valid root on behalf of the primary vCPU.
254 	 */
255 	read_lock(&kvm->mmu_lock);
256 
257 	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, as_id) {
258 		if (root->role.word == role.word)
259 			goto out_read_unlock;
260 	}
261 
262 	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
263 
264 	/*
265 	 * Recheck for an existing root after acquiring the pages lock, another
266 	 * vCPU may have raced ahead and created a new usable root.  Manually
267 	 * walk the list of roots as the standard macros assume that the pages
268 	 * lock is *not* held.  WARN if grabbing a reference to a usable root
269 	 * fails, as the last reference to a root can only be put *after* the
270 	 * root has been invalidated, which requires holding mmu_lock for write.
271 	 */
272 	list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
273 		if (root->role.word == role.word &&
274 		    !WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root)))
275 			goto out_spin_unlock;
276 	}
277 
278 	root = tdp_mmu_alloc_sp(vcpu);
279 	tdp_mmu_init_sp(root, NULL, 0, role);
280 
281 	/*
282 	 * TDP MMU roots are kept until they are explicitly invalidated, either
283 	 * by a memslot update or by the destruction of the VM.  Initialize the
284 	 * refcount to two; one reference for the vCPU, and one reference for
285 	 * the TDP MMU itself, which is held until the root is invalidated and
286 	 * is ultimately put by kvm_tdp_mmu_zap_invalidated_roots().
287 	 */
288 	refcount_set(&root->tdp_mmu_root_count, 2);
289 	list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
290 
291 out_spin_unlock:
292 	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
293 out_read_unlock:
294 	read_unlock(&kvm->mmu_lock);
295 	/*
296 	 * Note, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS will prevent entering the guest
297 	 * and actually consuming the root if it's invalidated after dropping
298 	 * mmu_lock, and the root can't be freed as this vCPU holds a reference.
299 	 */
300 	if (mirror) {
301 		mmu->mirror_root_hpa = __pa(root->spt);
302 	} else {
303 		mmu->root.hpa = __pa(root->spt);
304 		mmu->root.pgd = 0;
305 	}
306 }
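
/*
 * Of the two references taken above, the vCPU's reference is dropped when the
 * vCPU switches away from the root (ultimately via kvm_tdp_mmu_put_root(),
 * e.g. from kvm_mmu_free_roots() in mmu.c, not shown here); the TDP MMU's own
 * reference is dropped only after the root has been invalidated and zapped.
 */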
307 
308 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
309 				u64 old_spte, u64 new_spte, int level,
310 				bool shared);
311 
312 static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
313 {
314 	kvm_account_pgtable_pages((void *)sp->spt, +1);
315 	atomic64_inc(&kvm->arch.tdp_mmu_pages);
316 }
317 
318 static void tdp_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
319 {
320 	kvm_account_pgtable_pages((void *)sp->spt, -1);
321 	atomic64_dec(&kvm->arch.tdp_mmu_pages);
322 }
323 
324 /**
325  * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages
326  *
327  * @kvm: kvm instance
328  * @sp: the page to be removed
329  */
330 static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
331 {
332 	tdp_unaccount_mmu_page(kvm, sp);
333 
334 	if (!sp->nx_huge_page_disallowed)
335 		return;
336 
337 	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
338 	sp->nx_huge_page_disallowed = false;
339 	untrack_possible_nx_huge_page(kvm, sp);
340 	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
341 }
342 
343 static void remove_external_spte(struct kvm *kvm, gfn_t gfn, u64 old_spte,
344 				 int level)
345 {
346 	kvm_pfn_t old_pfn = spte_to_pfn(old_spte);
347 	int ret;
348 
349 	/*
350 	 * External (TDX) SPTEs are limited to PG_LEVEL_4K, and external
351 	 * PTs are removed in a special order, involving free_external_spt().
352 	 * But remove_external_spte() will be called on non-leaf PTEs via
353 	 * __tdp_mmu_zap_root(), so avoid the error the former would return
354 	 * in this case.
355 	 */
356 	if (!is_last_spte(old_spte, level))
357 		return;
358 
359 	/* Zapping a leaf SPTE is allowed only when the write lock is held. */
360 	lockdep_assert_held_write(&kvm->mmu_lock);
361 	/* Because the write lock is held, the operation should succeed. */
362 	ret = static_call(kvm_x86_remove_external_spte)(kvm, gfn, level, old_pfn);
363 	KVM_BUG_ON(ret, kvm);
364 }
365 
366 /**
367  * handle_removed_pt() - handle a page table removed from the TDP structure
368  *
369  * @kvm: kvm instance
370  * @pt: the page removed from the paging structure
371  * @shared: This operation may not be running under the exclusive use
372  *	    of the MMU lock and the operation must synchronize with other
373  *	    threads that might be modifying SPTEs.
374  *
375  * Given a page table that has been removed from the TDP paging structure,
376  * iterates through the page table to clear SPTEs and free child page tables.
377  *
378  * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
379  * protection. Since this thread removed it from the paging structure,
380  * this thread will be responsible for ensuring the page is freed. Hence the
381  * early rcu_dereferences in the function.
382  */
383 static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
384 {
385 	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
386 	int level = sp->role.level;
387 	gfn_t base_gfn = sp->gfn;
388 	int i;
389 
390 	trace_kvm_mmu_prepare_zap_page(sp);
391 
392 	tdp_mmu_unlink_sp(kvm, sp);
393 
394 	for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
395 		tdp_ptep_t sptep = pt + i;
396 		gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
397 		u64 old_spte;
398 
399 		if (shared) {
400 			/*
401 			 * Set the SPTE to a nonpresent value that other
402 			 * threads will not overwrite. If the SPTE was
403 			 * already marked as frozen then another thread
404 			 * handling a page fault could overwrite it, so
405 			 * keep writing the SPTE until it transitions from
406 			 * some other (non-frozen) value to the frozen SPTE value.
407 			 */
408 			for (;;) {
409 				old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, FROZEN_SPTE);
410 				if (!is_frozen_spte(old_spte))
411 					break;
412 				cpu_relax();
413 			}
414 		} else {
415 			/*
416 			 * If the SPTE is not MMU-present, there is no backing
417 			 * page associated with the SPTE and so no side effects
418 			 * that need to be recorded, and exclusive ownership of
419 			 * mmu_lock ensures the SPTE can't be made present.
420 			 * Note, zapping MMIO SPTEs is also unnecessary as they
421 			 * are guarded by the memslots generation, not by being
422 			 * unreachable.
423 			 */
424 			old_spte = kvm_tdp_mmu_read_spte(sptep);
425 			if (!is_shadow_present_pte(old_spte))
426 				continue;
427 
428 			/*
429 			 * Use the common helper instead of a raw WRITE_ONCE as
430 			 * the SPTE needs to be updated atomically if it can be
431 			 * modified by a different vCPU outside of mmu_lock.
432 			 * Even though the parent SPTE is !PRESENT, the TLB
433 			 * hasn't yet been flushed, and both Intel and AMD
434 			 * document that A/D assists can use upper-level PxE
435 			 * entries that are cached in the TLB, i.e. the CPU can
436 			 * still access the page and mark it dirty.
437 			 *
438 			 * No retry is needed in the atomic update path as the
439 			 * sole concern is dropping a Dirty bit, i.e. no other
440 			 * task can zap/remove the SPTE as mmu_lock is held for
441 			 * write.  Marking the SPTE as a frozen SPTE is not
442 			 * strictly necessary for the same reason, but using
443 			 * the frozen SPTE value keeps the shared/exclusive
444 			 * paths consistent and allows the handle_changed_spte()
445 			 * call below to hardcode the new value to FROZEN_SPTE.
446 			 *
447 			 * Note, even though dropping a Dirty bit is the only
448 			 * scenario where a non-atomic update could result in a
449 			 * functional bug, simply checking the Dirty bit isn't
450 			 * sufficient as a fast page fault could read the upper
451 			 * level SPTE before it is zapped, and then make this
452 			 * target SPTE writable, resume the guest, and set the
453 			 * Dirty bit between reading the SPTE above and writing
454 			 * it here.
455 			 */
456 			old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte,
457 							  FROZEN_SPTE, level);
458 		}
459 		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
460 				    old_spte, FROZEN_SPTE, level, shared);
461 
462 		if (is_mirror_sp(sp)) {
463 			KVM_BUG_ON(shared, kvm);
464 			remove_external_spte(kvm, gfn, old_spte, level);
465 		}
466 	}
467 
468 	if (is_mirror_sp(sp) &&
469 	    WARN_ON(static_call(kvm_x86_free_external_spt)(kvm, base_gfn, sp->role.level,
470 							  sp->external_spt))) {
471 		/*
472 		 * Failed to free the page table page in the mirror page
473 		 * table, and there is nothing further to do.
474 		 * Intentionally leak the page to prevent the kernel from
475 		 * accessing the encrypted page.
476 		 */
477 		sp->external_spt = NULL;
478 	}
479 
480 	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
481 }
482 
483 static void *get_external_spt(gfn_t gfn, u64 new_spte, int level)
484 {
485 	if (is_shadow_present_pte(new_spte) && !is_last_spte(new_spte, level)) {
486 		struct kvm_mmu_page *sp = spte_to_child_sp(new_spte);
487 
488 		WARN_ON_ONCE(sp->role.level + 1 != level);
489 		WARN_ON_ONCE(sp->gfn != gfn);
490 		return sp->external_spt;
491 	}
492 
493 	return NULL;
494 }
495 
496 static int __must_check set_external_spte_present(struct kvm *kvm, tdp_ptep_t sptep,
497 						 gfn_t gfn, u64 old_spte,
498 						 u64 new_spte, int level)
499 {
500 	bool was_present = is_shadow_present_pte(old_spte);
501 	bool is_present = is_shadow_present_pte(new_spte);
502 	bool is_leaf = is_present && is_last_spte(new_spte, level);
503 	kvm_pfn_t new_pfn = spte_to_pfn(new_spte);
504 	int ret = 0;
505 
506 	KVM_BUG_ON(was_present, kvm);
507 
508 	lockdep_assert_held(&kvm->mmu_lock);
509 	/*
510 	 * We need to lock out other updates to the SPTE until the external
511 	 * page table has been modified. Use FROZEN_SPTE similar to
512 	 * the zapping case.
513 	 */
514 	if (!try_cmpxchg64(rcu_dereference(sptep), &old_spte, FROZEN_SPTE))
515 		return -EBUSY;
516 
517 	/*
518 	 * Use a different call depending on whether a middle-level
519 	 * external page table or a leaf is being set up.
520 	 */
521 	if (is_leaf) {
522 		ret = static_call(kvm_x86_set_external_spte)(kvm, gfn, level, new_pfn);
523 	} else {
524 		void *external_spt = get_external_spt(gfn, new_spte, level);
525 
526 		KVM_BUG_ON(!external_spt, kvm);
527 		ret = static_call(kvm_x86_link_external_spt)(kvm, gfn, level, external_spt);
528 	}
529 	if (ret)
530 		__kvm_tdp_mmu_write_spte(sptep, old_spte);
531 	else
532 		__kvm_tdp_mmu_write_spte(sptep, new_spte);
533 	return ret;
534 }
535 
536 /**
537  * handle_changed_spte - handle bookkeeping associated with an SPTE change
538  * @kvm: kvm instance
539  * @as_id: the address space of the paging structure the SPTE was a part of
540  * @gfn: the base GFN that was mapped by the SPTE
541  * @old_spte: The value of the SPTE before the change
542  * @new_spte: The value of the SPTE after the change
543  * @level: the level of the PT the SPTE is part of in the paging structure
544  * @shared: This operation may not be running under the exclusive use of
545  *	    the MMU lock and the operation must synchronize with other
546  *	    threads that might be modifying SPTEs.
547  *
548  * Handle bookkeeping that might result from the modification of a SPTE.  Note,
549  * dirty logging updates are handled in common code, not here (see make_spte()
550  * and fast_pf_fix_direct_spte()).
551  */
552 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
553 				u64 old_spte, u64 new_spte, int level,
554 				bool shared)
555 {
556 	bool was_present = is_shadow_present_pte(old_spte);
557 	bool is_present = is_shadow_present_pte(new_spte);
558 	bool was_leaf = was_present && is_last_spte(old_spte, level);
559 	bool is_leaf = is_present && is_last_spte(new_spte, level);
560 	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
561 
562 	WARN_ON_ONCE(level > PT64_ROOT_MAX_LEVEL);
563 	WARN_ON_ONCE(level < PG_LEVEL_4K);
564 	WARN_ON_ONCE(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
565 
566 	/*
567 	 * If this warning were to trigger it would indicate that there was a
568 	 * missing MMU notifier or a race with some notifier handler.
569 	 * A present, leaf SPTE should never be directly replaced with another
570 	 * present leaf SPTE pointing to a different PFN. A notifier handler
571 	 * should be zapping the SPTE before the main MM's page table is
572 	 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
573 	 * thread before replacement.
574 	 */
575 	if (was_leaf && is_leaf && pfn_changed) {
576 		pr_err("Invalid SPTE change: cannot replace a present leaf\n"
577 		       "SPTE with another present leaf SPTE mapping a\n"
578 		       "different PFN!\n"
579 		       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
580 		       as_id, gfn, old_spte, new_spte, level);
581 
582 		/*
583 		 * Crash the host to prevent error propagation and guest data
584 		 * corruption.
585 		 */
586 		BUG();
587 	}
588 
589 	if (old_spte == new_spte)
590 		return;
591 
592 	trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
593 
594 	if (is_leaf)
595 		check_spte_writable_invariants(new_spte);
596 
597 	/*
598 	 * The only time an SPTE should be changed from one non-present
599 	 * state to another is when an MMIO entry is installed/modified/
600 	 * removed. In that case, there is nothing to do here.
601 	 */
602 	if (!was_present && !is_present) {
603 		/*
604 		 * If this change does not involve a MMIO SPTE or frozen SPTE,
605 		 * it is unexpected. Log the change, though it should not
606 		 * impact the guest since both the former and current SPTEs
607 		 * are nonpresent.
608 		 */
609 		if (WARN_ON_ONCE(!is_mmio_spte(kvm, old_spte) &&
610 				 !is_mmio_spte(kvm, new_spte) &&
611 				 !is_frozen_spte(new_spte)))
612 			pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
613 			       "should not be replaced with another,\n"
614 			       "different nonpresent SPTE, unless one or both\n"
615 			       "are MMIO SPTEs, or the new SPTE is\n"
616 			       "a temporary frozen SPTE.\n"
617 			       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
618 			       as_id, gfn, old_spte, new_spte, level);
619 		return;
620 	}
621 
622 	if (is_leaf != was_leaf)
623 		kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);
624 
625 	/*
626 	 * Recursively handle child PTs if the change removed a subtree from
627 	 * the paging structure.  Note the WARN on the PFN changing without the
628 	 * SPTE being converted to a hugepage (leaf) or being zapped.  Shadow
629 	 * pages are kernel allocations and should never be migrated.
630 	 */
631 	if (was_present && !was_leaf &&
632 	    (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
633 		handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
634 }
635 
636 static inline int __must_check __tdp_mmu_set_spte_atomic(struct kvm *kvm,
637 							 struct tdp_iter *iter,
638 							 u64 new_spte)
639 {
640 	/*
641 	 * The caller is responsible for ensuring the old SPTE is not a FROZEN
642 	 * SPTE.  KVM should never attempt to zap or manipulate a FROZEN SPTE,
643 	 * and pre-checking before inserting a new SPTE is advantageous as it
644 	 * avoids unnecessary work.
645 	 */
646 	WARN_ON_ONCE(iter->yielded || is_frozen_spte(iter->old_spte));
647 
648 	if (is_mirror_sptep(iter->sptep) && !is_frozen_spte(new_spte)) {
649 		int ret;
650 
651 		/*
652 		 * Users of atomic zapping don't operate on mirror roots,
653 		 * so don't handle it and bug the VM if it's seen.
654 		 */
655 		if (KVM_BUG_ON(!is_shadow_present_pte(new_spte), kvm))
656 			return -EBUSY;
657 
658 		ret = set_external_spte_present(kvm, iter->sptep, iter->gfn,
659 						iter->old_spte, new_spte, iter->level);
660 		if (ret)
661 			return ret;
662 	} else {
663 		u64 *sptep = rcu_dereference(iter->sptep);
664 
665 		/*
666 		 * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs
667 		 * and does not hold the mmu_lock.  On failure, i.e. if a
668 		 * different logical CPU modified the SPTE, try_cmpxchg64()
669 		 * updates iter->old_spte with the current value, so the caller
670 		 * operates on fresh data, e.g. if it retries
671 		 * tdp_mmu_set_spte_atomic().
672 		 */
673 		if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte))
674 			return -EBUSY;
675 	}
676 
677 	return 0;
678 }
679 
680 /*
681  * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
682  * and handle the associated bookkeeping.  Do not mark the page dirty
683  * in KVM's dirty bitmaps.
684  *
685  * If setting the SPTE fails because it has changed, iter->old_spte will be
686  * refreshed to the current value of the spte.
687  *
688  * @kvm: kvm instance
689  * @iter: a tdp_iter instance currently on the SPTE that should be set
690  * @new_spte: The value the SPTE should be set to
691  * Return:
692  * * 0      - If the SPTE was set.
693  * * -EBUSY - If the SPTE cannot be set. In this case this function will have
694  *            no side-effects other than setting iter->old_spte to the last
695  *            known value of the spte.
696  */
697 static inline int __must_check tdp_mmu_set_spte_atomic(struct kvm *kvm,
698 						       struct tdp_iter *iter,
699 						       u64 new_spte)
700 {
701 	int ret;
702 
703 	lockdep_assert_held_read(&kvm->mmu_lock);
704 
705 	ret = __tdp_mmu_set_spte_atomic(kvm, iter, new_spte);
706 	if (ret)
707 		return ret;
708 
709 	handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
710 			    new_spte, iter->level, true);
711 
712 	return 0;
713 }
714 
715 /*
716  * tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
717  * @kvm:	      KVM instance
718  * @as_id:	      Address space ID, i.e. regular vs. SMM
719  * @sptep:	      Pointer to the SPTE
720  * @old_spte:	      The current value of the SPTE
721  * @new_spte:	      The new value that will be set for the SPTE
722  * @gfn:	      The base GFN that was (or will be) mapped by the SPTE
723  * @level:	      The level _containing_ the SPTE (its parent PT's level)
724  *
725  * Returns the old SPTE value, which _may_ be different than @old_spte if the
726  * SPTE had volatile bits.
727  */
728 static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
729 			    u64 old_spte, u64 new_spte, gfn_t gfn, int level)
730 {
731 	lockdep_assert_held_write(&kvm->mmu_lock);
732 
733 	/*
734 	 * No thread should be using this function to set SPTEs to or from the
735 	 * temporary frozen SPTE value.
736 	 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
737 	 * should be used. If operating under the MMU lock in write mode, the
738 	 * use of the frozen SPTE should not be necessary.
739 	 */
740 	WARN_ON_ONCE(is_frozen_spte(old_spte) || is_frozen_spte(new_spte));
741 
742 	old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);
743 
744 	handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);
745 
746 	/*
747 	 * Users that do non-atomic setting of PTEs don't operate on mirror
748 	 * roots, so don't handle it and bug the VM if it's seen.
749 	 */
750 	if (is_mirror_sptep(sptep)) {
751 		KVM_BUG_ON(is_shadow_present_pte(new_spte), kvm);
752 		remove_external_spte(kvm, gfn, old_spte, level);
753 	}
754 
755 	return old_spte;
756 }
757 
758 static inline void tdp_mmu_iter_set_spte(struct kvm *kvm, struct tdp_iter *iter,
759 					 u64 new_spte)
760 {
761 	WARN_ON_ONCE(iter->yielded);
762 	iter->old_spte = tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep,
763 					  iter->old_spte, new_spte,
764 					  iter->gfn, iter->level);
765 }
766 
767 #define tdp_root_for_each_pte(_iter, _kvm, _root, _start, _end)	\
768 	for_each_tdp_pte(_iter, _kvm, _root, _start, _end)
769 
770 #define tdp_root_for_each_leaf_pte(_iter, _kvm, _root, _start, _end)	\
771 	tdp_root_for_each_pte(_iter, _kvm, _root, _start, _end)		\
772 		if (!is_shadow_present_pte(_iter.old_spte) ||		\
773 		    !is_last_spte(_iter.old_spte, _iter.level))		\
774 			continue;					\
775 		else
776 
777 #define tdp_mmu_for_each_pte(_iter, _kvm, _root, _start, _end)	\
778 	for_each_tdp_pte(_iter, _kvm, _root, _start, _end)
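
/*
 * All of the SPTE walks built on these iterators run under RCU protection,
 * e.g. tdp_mmu_zap_leafs() and kvm_tdp_mmu_map() below bracket their loops
 * with rcu_read_lock()/rcu_read_unlock() so that page tables freed via
 * call_rcu() cannot disappear mid-walk.
 */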
779 
780 static inline bool __must_check tdp_mmu_iter_need_resched(struct kvm *kvm,
781 							  struct tdp_iter *iter)
782 {
783 	if (!need_resched() && !rwlock_needbreak(&kvm->mmu_lock))
784 		return false;
785 
786 	/* Ensure forward progress has been made before yielding. */
787 	return iter->next_last_level_gfn != iter->yielded_gfn;
788 }
789 
790 /*
791  * Yield if the MMU lock is contended or this thread needs to return control
792  * to the scheduler.
793  *
794  * If this function should yield and flush is set, it will perform a remote
795  * TLB flush before yielding.
796  *
797  * If this function yields, iter->yielded is set and the caller must skip to
798  * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk
799  * over the paging structures to allow the iterator to continue its traversal
800  * from the paging structure root.
801  *
802  * Returns true if this function yielded.
803  */
804 static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
805 							  struct tdp_iter *iter,
806 							  bool flush, bool shared)
807 {
808 	KVM_MMU_WARN_ON(iter->yielded);
809 
810 	if (!tdp_mmu_iter_need_resched(kvm, iter))
811 		return false;
812 
813 	if (flush)
814 		kvm_flush_remote_tlbs(kvm);
815 
816 	rcu_read_unlock();
817 
818 	if (shared)
819 		cond_resched_rwlock_read(&kvm->mmu_lock);
820 	else
821 		cond_resched_rwlock_write(&kvm->mmu_lock);
822 
823 	rcu_read_lock();
824 
825 	WARN_ON_ONCE(iter->gfn > iter->next_last_level_gfn);
826 
827 	iter->yielded = true;
828 	return true;
829 }
830 
831 static inline gfn_t tdp_mmu_max_gfn_exclusive(void)
832 {
833 	/*
834 	 * Bound TDP MMU walks at host.MAXPHYADDR.  KVM disallows memslots with
835 	 * a gpa range that would exceed the max gfn, and KVM does not create
836 	 * MMIO SPTEs for "impossible" gfns, instead sending such accesses down
837 	 * the slow emulation path every time.
838 	 */
839 	return kvm_mmu_max_gfn() + 1;
840 }
841 
842 static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
843 			       bool shared, int zap_level)
844 {
845 	struct tdp_iter iter;
846 
847 	for_each_tdp_pte_min_level_all(iter, root, zap_level) {
848 retry:
849 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
850 			continue;
851 
852 		if (!is_shadow_present_pte(iter.old_spte))
853 			continue;
854 
855 		if (iter.level > zap_level)
856 			continue;
857 
858 		if (!shared)
859 			tdp_mmu_iter_set_spte(kvm, &iter, SHADOW_NONPRESENT_VALUE);
860 		else if (tdp_mmu_set_spte_atomic(kvm, &iter, SHADOW_NONPRESENT_VALUE))
861 			goto retry;
862 	}
863 }
864 
865 static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
866 			     bool shared)
867 {
868 
869 	/*
870 	 * The root must have an elevated refcount so that it's reachable via
871 	 * mmu_notifier callbacks, which allows this path to yield and drop
872 	 * mmu_lock.  When handling an unmap/release mmu_notifier command, KVM
873 	 * must drop all references to relevant pages prior to completing the
874 	 * callback.  Dropping mmu_lock with an unreachable root would result
875 	 * in zapping SPTEs after a relevant mmu_notifier callback completes
876 	 * and lead to use-after-free as zapping a SPTE triggers "writeback" of
877 	 * dirty accessed bits to the SPTE's associated struct page.
878 	 */
879 	WARN_ON_ONCE(!refcount_read(&root->tdp_mmu_root_count));
880 
881 	kvm_lockdep_assert_mmu_lock_held(kvm, shared);
882 
883 	rcu_read_lock();
884 
885 	/*
886 	 * Zap roots in multiple passes of decreasing granularity, i.e. zap at
887 	 * 4KiB=>2MiB=>1GiB=>root, in order to better honor need_resched() (all
888 	 * preempt models) or mmu_lock contention (full or real-time models).
889 	 * Zapping at finer granularity marginally increases the total time of
890 	 * the zap, but in most cases the zap itself isn't latency sensitive.
891 	 *
892 	 * If KVM is configured to prove the MMU, skip the 4KiB and 2MiB zaps
893 	 * in order to mimic the page fault path, which can replace a 1GiB page
894 	 * table with an equivalent 1GiB hugepage, i.e. can get saddled with
895 	 * zapping a 1GiB region that's fully populated with 4KiB SPTEs.  This
896 	 * allows verifying that KVM can safely zap 1GiB regions, e.g. without
897 	 * inducing RCU stalls, without relying on a relatively rare event
898 	 * (zapping roots is orders of magnitude more common).  Note, because
899 	 * zapping a SP recurses on its children, stepping down to PG_LEVEL_4K
900 	 * in the iterator itself is unnecessary.
901 	 */
902 	if (!IS_ENABLED(CONFIG_KVM_PROVE_MMU)) {
903 		__tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_4K);
904 		__tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_2M);
905 	}
906 	__tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G);
907 	__tdp_mmu_zap_root(kvm, root, shared, root->role.level);
908 
909 	rcu_read_unlock();
910 }
911 
912 bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
913 {
914 	u64 old_spte;
915 
916 	/*
917 	 * This helper intentionally doesn't allow zapping a root shadow page,
918 	 * which doesn't have a parent page table and thus no associated entry.
919 	 */
920 	if (WARN_ON_ONCE(!sp->ptep))
921 		return false;
922 
923 	old_spte = kvm_tdp_mmu_read_spte(sp->ptep);
924 	if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
925 		return false;
926 
927 	tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte,
928 			 SHADOW_NONPRESENT_VALUE, sp->gfn, sp->role.level + 1);
929 
930 	return true;
931 }
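
/*
 * The "sp->role.level + 1" above is deliberate: per tdp_mmu_set_spte(), @level
 * is the level of the parent page table that contains sp->ptep, which is one
 * level above @sp itself.
 */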
932 
933 /*
934  * If can_yield is true, will release the MMU lock and reschedule if the
935  * scheduler needs the CPU or there is contention on the MMU lock. If this
936  * function cannot yield, it will not release the MMU lock or reschedule and
937  * the caller must ensure it does not supply too large a GFN range, or the
938  * operation can cause a soft lockup.
939  */
940 static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
941 			      gfn_t start, gfn_t end, bool can_yield, bool flush)
942 {
943 	struct tdp_iter iter;
944 
945 	end = min(end, tdp_mmu_max_gfn_exclusive());
946 
947 	lockdep_assert_held_write(&kvm->mmu_lock);
948 
949 	rcu_read_lock();
950 
951 	for_each_tdp_pte_min_level(iter, kvm, root, PG_LEVEL_4K, start, end) {
952 		if (can_yield &&
953 		    tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
954 			flush = false;
955 			continue;
956 		}
957 
958 		if (!is_shadow_present_pte(iter.old_spte) ||
959 		    !is_last_spte(iter.old_spte, iter.level))
960 			continue;
961 
962 		tdp_mmu_iter_set_spte(kvm, &iter, SHADOW_NONPRESENT_VALUE);
963 
964 		/*
965 		 * Zapping SPTEs in invalid roots doesn't require a TLB flush,
966 		 * see kvm_tdp_mmu_zap_invalidated_roots() for details.
967 		 */
968 		if (!root->role.invalid)
969 			flush = true;
970 	}
971 
972 	rcu_read_unlock();
973 
974 	/*
975 	 * Because this flow zaps _only_ leaf SPTEs, the caller doesn't need
976 	 * to provide RCU protection as no 'struct kvm_mmu_page' will be freed.
977 	 */
978 	return flush;
979 }
980 
981 /*
982  * Zap leaf SPTEs for the range of gfns, [start, end), for all *VALID* roots.
983  * Returns true if a TLB flush is needed before releasing the MMU lock, i.e. if
984  * one or more SPTEs were zapped since the MMU lock was last acquired.
985  */
986 bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush)
987 {
988 	struct kvm_mmu_page *root;
989 
990 	lockdep_assert_held_write(&kvm->mmu_lock);
991 	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, -1)
992 		flush = tdp_mmu_zap_leafs(kvm, root, start, end, true, flush);
993 
994 	return flush;
995 }
996 
997 void kvm_tdp_mmu_zap_all(struct kvm *kvm)
998 {
999 	struct kvm_mmu_page *root;
1000 
1001 	/*
1002 	 * Zap all direct roots, including invalid direct roots, as all direct
1003 	 * SPTEs must be dropped before returning to the caller. For TDX, mirror
1004 	 * roots don't need handling in response to the mmu notifier (the caller).
1005 	 *
1006 	 * Zap directly even if the root is also being zapped by a concurrent
1007 	 * "fast zap".  Walking zapped top-level SPTEs isn't all that expensive
1008 	 * and mmu_lock is already held, which means the other thread has yielded.
1009 	 *
1010 	 * A TLB flush is unnecessary, KVM zaps everything if and only if the VM
1011 	 * is being destroyed or the userspace VMM has exited.  In both cases,
1012 	 * KVM_RUN is unreachable, i.e. no vCPUs will ever service the request.
1013 	 */
1014 	lockdep_assert_held_write(&kvm->mmu_lock);
1015 	__for_each_tdp_mmu_root_yield_safe(kvm, root, -1,
1016 					   KVM_DIRECT_ROOTS | KVM_INVALID_ROOTS)
1017 		tdp_mmu_zap_root(kvm, root, false);
1018 }
1019 
1020 /*
1021  * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast
1022  * zap" completes.
1023  */
1024 void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm, bool shared)
1025 {
1026 	struct kvm_mmu_page *root;
1027 
1028 	if (shared)
1029 		read_lock(&kvm->mmu_lock);
1030 	else
1031 		write_lock(&kvm->mmu_lock);
1032 
1033 	for_each_tdp_mmu_root_yield_safe(kvm, root) {
1034 		if (!root->tdp_mmu_scheduled_root_to_zap)
1035 			continue;
1036 
1037 		root->tdp_mmu_scheduled_root_to_zap = false;
1038 		KVM_BUG_ON(!root->role.invalid, kvm);
1039 
1040 		/*
1041 		 * A TLB flush is not necessary as KVM performs a local TLB
1042 		 * flush when allocating a new root (see kvm_mmu_load()), and
1043 		 * when migrating a vCPU to a different pCPU.  Note, the local
1044 		 * TLB flush on reuse also invalidates paging-structure-cache
1045 		 * entries, i.e. TLB entries for intermediate paging structures,
1046 		 * that may be zapped, as such entries are associated with the
1047 		 * ASID on both VMX and SVM.
1048 		 */
1049 		tdp_mmu_zap_root(kvm, root, shared);
1050 
1051 		/*
1052 		 * The reference needs to be put *after* zapping the root, as
1053 		 * the root must be reachable by mmu_notifiers while it's being
1054 		 * zapped.
1055 		 */
1056 		kvm_tdp_mmu_put_root(kvm, root);
1057 	}
1058 
1059 	if (shared)
1060 		read_unlock(&kvm->mmu_lock);
1061 	else
1062 		write_unlock(&kvm->mmu_lock);
1063 }
1064 
1065 /*
1066  * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
1067  * is about to be zapped, e.g. in response to a memslots update.  The actual
1068  * zapping is done separately so that it happens with mmu_lock held for read,
1069  * whereas invalidating roots must be done with mmu_lock held for write (unless
1070  * the VM is being destroyed).
1071  *
1072  * Note, kvm_tdp_mmu_zap_invalidated_roots() is gifted the TDP MMU's reference.
1073  * See kvm_tdp_mmu_alloc_root().
1074  */
1075 void kvm_tdp_mmu_invalidate_roots(struct kvm *kvm,
1076 				  enum kvm_tdp_mmu_root_types root_types)
1077 {
1078 	struct kvm_mmu_page *root;
1079 
1080 	/*
1081 	 * Invalidating invalid roots doesn't make sense, prevent developers from
1082 	 * having to think about it.
1083 	 */
1084 	if (WARN_ON_ONCE(root_types & KVM_INVALID_ROOTS))
1085 		root_types &= ~KVM_INVALID_ROOTS;
1086 
1087 	/*
1088 	 * mmu_lock must be held for write to ensure that a root doesn't become
1089 	 * invalid while there are active readers (invalidating a root while
1090 	 * there are active readers may or may not be problematic in practice,
1091 	 * but it's uncharted territory and not supported).
1092 	 *
1093 	 * Waive the assertion if there are no users of @kvm, i.e. the VM is
1094 	 * being destroyed after all references have been put, or if no vCPUs
1095 	 * have been created (which means there are no roots), i.e. the VM is
1096 	 * being destroyed in an error path of KVM_CREATE_VM.
1097 	 */
1098 	if (IS_ENABLED(CONFIG_PROVE_LOCKING) &&
1099 	    refcount_read(&kvm->users_count) && kvm->created_vcpus)
1100 		lockdep_assert_held_write(&kvm->mmu_lock);
1101 
1102 	/*
1103 	 * As above, mmu_lock isn't held when destroying the VM!  There can't
1104 	 * be other references to @kvm, i.e. nothing else can invalidate roots
1105 	 * or get/put references to roots.
1106 	 */
1107 	list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
1108 		if (!tdp_mmu_root_match(root, root_types))
1109 			continue;
1110 
1111 		/*
1112 		 * Note, invalid roots can outlive a memslot update!  Invalid
1113 		 * roots must be *zapped* before the memslot update completes,
1114 		 * but a different task can acquire a reference and keep the
1115 		 * root alive after its been zapped.
1116 		 */
1117 		if (!root->role.invalid) {
1118 			root->tdp_mmu_scheduled_root_to_zap = true;
1119 			root->role.invalid = true;
1120 		}
1121 	}
1122 }
1123 
1124 /*
1125  * Installs a last-level SPTE to handle a TDP page fault.
1126  * (NPT/EPT violation/misconfiguration)
1127  */
1128 static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
1129 					  struct kvm_page_fault *fault,
1130 					  struct tdp_iter *iter)
1131 {
1132 	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
1133 	u64 new_spte;
1134 	int ret = RET_PF_FIXED;
1135 	bool wrprot = false;
1136 
1137 	if (WARN_ON_ONCE(sp->role.level != fault->goal_level))
1138 		return RET_PF_RETRY;
1139 
1140 	if (fault->prefetch && is_shadow_present_pte(iter->old_spte))
1141 		return RET_PF_SPURIOUS;
1142 
1143 	if (is_shadow_present_pte(iter->old_spte) &&
1144 	    is_access_allowed(fault, iter->old_spte) &&
1145 	    is_last_spte(iter->old_spte, iter->level))
1146 		return RET_PF_SPURIOUS;
1147 
1148 	if (unlikely(!fault->slot))
1149 		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
1150 	else
1151 		wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
1152 				   fault->pfn, iter->old_spte, fault->prefetch,
1153 				   false, fault->map_writable, &new_spte);
1154 
1155 	if (new_spte == iter->old_spte)
1156 		ret = RET_PF_SPURIOUS;
1157 	else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
1158 		return RET_PF_RETRY;
1159 	else if (is_shadow_present_pte(iter->old_spte) &&
1160 		 (!is_last_spte(iter->old_spte, iter->level) ||
1161 		  WARN_ON_ONCE(leaf_spte_change_needs_tlb_flush(iter->old_spte, new_spte))))
1162 		kvm_flush_remote_tlbs_gfn(vcpu->kvm, iter->gfn, iter->level);
1163 
1164 	/*
1165 	 * If the page fault was caused by a write but the page is write
1166 	 * protected, emulation is needed. If the emulation was skipped,
1167 	 * the vCPU would have the same fault again.
1168 	 */
1169 	if (wrprot && fault->write)
1170 		ret = RET_PF_WRITE_PROTECTED;
1171 
1172 	/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
1173 	if (unlikely(is_mmio_spte(vcpu->kvm, new_spte))) {
1174 		vcpu->stat.pf_mmio_spte_created++;
1175 		trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
1176 				     new_spte);
1177 		ret = RET_PF_EMULATE;
1178 	} else {
1179 		trace_kvm_mmu_set_spte(iter->level, iter->gfn,
1180 				       rcu_dereference(iter->sptep));
1181 	}
1182 
1183 	return ret;
1184 }
1185 
1186 /*
1187  * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the
1188  * provided page table.
1189  *
1190  * @kvm: kvm instance
1191  * @iter: a tdp_iter instance currently on the SPTE that should be set
1192  * @sp: The new TDP page table to install.
1193  * @shared: This operation is running under the MMU lock in read mode.
1194  *
1195  * Returns: 0 if the new page table was installed. Non-0 if the page table
1196  *          could not be installed (e.g. the atomic compare-exchange failed).
1197  */
1198 static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
1199 			   struct kvm_mmu_page *sp, bool shared)
1200 {
1201 	u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled);
1202 	int ret = 0;
1203 
1204 	if (shared) {
1205 		ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
1206 		if (ret)
1207 			return ret;
1208 	} else {
1209 		tdp_mmu_iter_set_spte(kvm, iter, spte);
1210 	}
1211 
1212 	tdp_account_mmu_page(kvm, sp);
1213 
1214 	return 0;
1215 }
1216 
1217 static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
1218 				   struct kvm_mmu_page *sp, bool shared);
1219 
1220 /*
1221  * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
1222  * page tables and SPTEs to translate the faulting guest physical address.
1223  */
1224 int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
1225 {
1226 	struct kvm_mmu_page *root = tdp_mmu_get_root_for_fault(vcpu, fault);
1227 	struct kvm *kvm = vcpu->kvm;
1228 	struct tdp_iter iter;
1229 	struct kvm_mmu_page *sp;
1230 	int ret = RET_PF_RETRY;
1231 
1232 	kvm_mmu_hugepage_adjust(vcpu, fault);
1233 
1234 	trace_kvm_mmu_spte_requested(fault);
1235 
1236 	rcu_read_lock();
1237 
1238 	tdp_mmu_for_each_pte(iter, kvm, root, fault->gfn, fault->gfn + 1) {
1239 		int r;
1240 
1241 		if (fault->nx_huge_page_workaround_enabled)
1242 			disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);
1243 
1244 		/*
1245 		 * If SPTE has been frozen by another thread, just give up and
1246 		 * retry, avoiding unnecessary page table allocation and free.
1247 		 */
1248 		if (is_frozen_spte(iter.old_spte))
1249 			goto retry;
1250 
1251 		if (iter.level == fault->goal_level)
1252 			goto map_target_level;
1253 
1254 		/* Step down into the lower level page table if it exists. */
1255 		if (is_shadow_present_pte(iter.old_spte) &&
1256 		    !is_large_pte(iter.old_spte))
1257 			continue;
1258 
1259 		/*
1260 		 * The SPTE is either non-present or points to a huge page that
1261 		 * needs to be split.
1262 		 */
1263 		sp = tdp_mmu_alloc_sp(vcpu);
1264 		tdp_mmu_init_child_sp(sp, &iter);
1265 		if (is_mirror_sp(sp))
1266 			kvm_mmu_alloc_external_spt(vcpu, sp);
1267 
1268 		sp->nx_huge_page_disallowed = fault->huge_page_disallowed;
1269 
1270 		if (is_shadow_present_pte(iter.old_spte)) {
1271 			/* Don't support large page for mirrored roots (TDX) */
1272 			KVM_BUG_ON(is_mirror_sptep(iter.sptep), vcpu->kvm);
1273 			r = tdp_mmu_split_huge_page(kvm, &iter, sp, true);
1274 		} else {
1275 			r = tdp_mmu_link_sp(kvm, &iter, sp, true);
1276 		}
1277 
1278 		/*
1279 		 * Force the guest to retry if installing an upper level SPTE
1280 		 * failed, e.g. because a different task modified the SPTE.
1281 		 */
1282 		if (r) {
1283 			tdp_mmu_free_sp(sp);
1284 			goto retry;
1285 		}
1286 
1287 		if (fault->huge_page_disallowed &&
1288 		    fault->req_level >= iter.level) {
1289 			spin_lock(&kvm->arch.tdp_mmu_pages_lock);
1290 			if (sp->nx_huge_page_disallowed)
1291 				track_possible_nx_huge_page(kvm, sp);
1292 			spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
1293 		}
1294 	}
1295 
1296 	/*
1297 	 * The walk aborted before reaching the target level, e.g. because the
1298 	 * iterator detected an upper level SPTE was frozen during traversal.
1299 	 */
1300 	WARN_ON_ONCE(iter.level == fault->goal_level);
1301 	goto retry;
1302 
1303 map_target_level:
1304 	ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
1305 
1306 retry:
1307 	rcu_read_unlock();
1308 	return ret;
1309 }
1310 
1311 /* Used by mmu notifier via kvm_unmap_gfn_range() */
1312 bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
1313 				 bool flush)
1314 {
1315 	enum kvm_tdp_mmu_root_types types;
1316 	struct kvm_mmu_page *root;
1317 
1318 	types = kvm_gfn_range_filter_to_root_types(kvm, range->attr_filter) | KVM_INVALID_ROOTS;
1319 
1320 	__for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, types)
1321 		flush = tdp_mmu_zap_leafs(kvm, root, range->start, range->end,
1322 					  range->may_block, flush);
1323 
1324 	return flush;
1325 }
1326 
1327 /*
1328  * Mark the SPTEs in the range of GFNs [start, end) unaccessed and return non-zero
1329  * if any of the GFNs in the range have been accessed.
1330  *
1331  * No need to mark the corresponding PFN as accessed as this call is coming
1332  * from the clear_young() or clear_flush_young() notifier, which uses the
1333  * return value to determine if the page has been accessed.
1334  */
1335 static void kvm_tdp_mmu_age_spte(struct tdp_iter *iter)
1336 {
1337 	u64 new_spte;
1338 
1339 	if (spte_ad_enabled(iter->old_spte)) {
1340 		iter->old_spte = tdp_mmu_clear_spte_bits(iter->sptep,
1341 							 iter->old_spte,
1342 							 shadow_accessed_mask,
1343 							 iter->level);
1344 		new_spte = iter->old_spte & ~shadow_accessed_mask;
1345 	} else {
1346 		new_spte = mark_spte_for_access_track(iter->old_spte);
1347 		iter->old_spte = kvm_tdp_mmu_write_spte(iter->sptep,
1348 							iter->old_spte, new_spte,
1349 							iter->level);
1350 	}
1351 
1352 	trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level,
1353 				       iter->old_spte, new_spte);
1354 }
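
/*
 * Two aging strategies are used above: with A/D bits enabled, the hardware
 * Accessed bit is simply cleared; without A/D bits, the SPTE is converted to
 * its access-tracked form via mark_spte_for_access_track(), so the next guest
 * access faults and the fast page-fault path (in mmu.c, not shown here)
 * restores the SPTE, effectively re-marking it accessed.
 */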
1355 
1356 static bool __kvm_tdp_mmu_age_gfn_range(struct kvm *kvm,
1357 					struct kvm_gfn_range *range,
1358 					bool test_only)
1359 {
1360 	enum kvm_tdp_mmu_root_types types;
1361 	struct kvm_mmu_page *root;
1362 	struct tdp_iter iter;
1363 	bool ret = false;
1364 
1365 	types = kvm_gfn_range_filter_to_root_types(kvm, range->attr_filter);
1366 
1367 	/*
1368 	 * Don't support rescheduling, none of the MMU notifiers that funnel
1369 	 * into this helper allow blocking; it'd be dead, wasteful code.  Note,
1370 	 * this helper must NOT be used to unmap GFNs, as it processes only
1371 	 * valid roots!
1372 	 */
1373 	WARN_ON(types & ~KVM_VALID_ROOTS);
1374 	__for_each_tdp_mmu_root(kvm, root, range->slot->as_id, types) {
1375 		guard(rcu)();
1376 
1377 		tdp_root_for_each_leaf_pte(iter, kvm, root, range->start, range->end) {
1378 			if (!is_accessed_spte(iter.old_spte))
1379 				continue;
1380 
1381 			if (test_only)
1382 				return true;
1383 
1384 			ret = true;
1385 			kvm_tdp_mmu_age_spte(&iter);
1386 		}
1387 	}
1388 
1389 	return ret;
1390 }
1391 
1392 bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
1393 {
1394 	return __kvm_tdp_mmu_age_gfn_range(kvm, range, false);
1395 }
1396 
1397 bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1398 {
1399 	return __kvm_tdp_mmu_age_gfn_range(kvm, range, true);
1400 }
1401 
1402 /*
1403  * Remove write access from all SPTEs at or above min_level that map GFNs
1404  * [start, end). Returns true if an SPTE has been changed and the TLBs need to
1405  * be flushed.
1406  */
1407 static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1408 			     gfn_t start, gfn_t end, int min_level)
1409 {
1410 	struct tdp_iter iter;
1411 	u64 new_spte;
1412 	bool spte_set = false;
1413 
1414 	rcu_read_lock();
1415 
1416 	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1417 
1418 	for_each_tdp_pte_min_level(iter, kvm, root, min_level, start, end) {
1419 retry:
1420 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1421 			continue;
1422 
1423 		if (!is_shadow_present_pte(iter.old_spte) ||
1424 		    !is_last_spte(iter.old_spte, iter.level) ||
1425 		    !(iter.old_spte & PT_WRITABLE_MASK))
1426 			continue;
1427 
1428 		new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1429 
1430 		if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
1431 			goto retry;
1432 
1433 		spte_set = true;
1434 	}
1435 
1436 	rcu_read_unlock();
1437 	return spte_set;
1438 }
1439 
1440 /*
1441  * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
1442  * only affect leaf SPTEs down to min_level.
1443  * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1444  */
1445 bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
1446 			     const struct kvm_memory_slot *slot, int min_level)
1447 {
1448 	struct kvm_mmu_page *root;
1449 	bool spte_set = false;
1450 
1451 	lockdep_assert_held_read(&kvm->mmu_lock);
1452 
1453 	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
1454 		spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
1455 			     slot->base_gfn + slot->npages, min_level);
1456 
1457 	return spte_set;
1458 }
1459 
1460 static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(void)
1461 {
1462 	struct kvm_mmu_page *sp;
1463 
1464 	sp = kmem_cache_zalloc(mmu_page_header_cache, GFP_KERNEL_ACCOUNT);
1465 	if (!sp)
1466 		return NULL;
1467 
1468 	sp->spt = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
1469 	if (!sp->spt) {
1470 		kmem_cache_free(mmu_page_header_cache, sp);
1471 		return NULL;
1472 	}
1473 
1474 	return sp;
1475 }
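
/*
 * Unlike tdp_mmu_alloc_sp(), this allocates directly with GFP_KERNEL_ACCOUNT
 * rather than from a vCPU's memory caches: eager splitting may run without a
 * vCPU context, and tdp_mmu_split_huge_pages_root() below drops mmu_lock
 * around the allocation so that it is free to sleep.
 */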
1476 
1477 /* Note, the caller is responsible for initializing @sp. */
1478 static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
1479 				   struct kvm_mmu_page *sp, bool shared)
1480 {
1481 	const u64 huge_spte = iter->old_spte;
1482 	const int level = iter->level;
1483 	int ret, i;
1484 
1485 	/*
1486 	 * No need for atomics when writing to sp->spt since the page table has
1487 	 * not been linked in yet and thus is not reachable from any other CPU.
1488 	 */
1489 	for (i = 0; i < SPTE_ENT_PER_PAGE; i++)
1490 		sp->spt[i] = make_small_spte(kvm, huge_spte, sp->role, i);
1491 
1492 	/*
1493 	 * Replace the huge spte with a pointer to the populated lower level
1494 	 * page table. Since we are making this change without a TLB flush vCPUs
1495 	 * will see a mix of the split mappings and the original huge mapping,
1496 	 * depending on what's currently in their TLB. This is fine from a
1497 	 * correctness standpoint since the translation will be the same either
1498 	 * way.
1499 	 */
1500 	ret = tdp_mmu_link_sp(kvm, iter, sp, shared);
1501 	if (ret)
1502 		goto out;
1503 
1504 	/*
1505 	 * tdp_mmu_link_sp() will handle subtracting the huge page we are
1506 	 * overwriting from the page stats. But we have to manually update
1507 	 * the page stats with the new present child pages.
1508 	 */
1509 	kvm_update_page_stats(kvm, level - 1, SPTE_ENT_PER_PAGE);
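	/*
	 * E.g. when splitting a 1GiB SPTE, linking the new page table removes
	 * the one 1GiB mapping from the stats and the update above adds the
	 * 512 (SPTE_ENT_PER_PAGE) 2MiB mappings that now cover the same range.
	 */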
1510 
1511 out:
1512 	trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret);
1513 	return ret;
1514 }
1515 
tdp_mmu_split_huge_pages_root(struct kvm * kvm,struct kvm_mmu_page * root,gfn_t start,gfn_t end,int target_level,bool shared)1516 static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
1517 					 struct kvm_mmu_page *root,
1518 					 gfn_t start, gfn_t end,
1519 					 int target_level, bool shared)
1520 {
1521 	struct kvm_mmu_page *sp = NULL;
1522 	struct tdp_iter iter;
1523 
1524 	rcu_read_lock();
1525 
1526 	/*
1527 	 * Traverse the page table splitting all huge pages above the target
1528 	 * level into one lower level. For example, if we encounter a 1GB page
1529 	 * we split it into 512 2MB pages.
1530 	 *
1531 	 * Since the TDP iterator uses a pre-order traversal, we are guaranteed
1532 	 * to visit an SPTE before ever visiting its children, which means we
1533 	 * will correctly recursively split huge pages that are more than one
1534 	 * level above the target level (e.g. splitting a 1GB to 512 2MB pages,
1535 	 * and then splitting each of those to 512 4KB pages).
1536 	 */
1537 	for_each_tdp_pte_min_level(iter, kvm, root, target_level + 1, start, end) {
1538 retry:
1539 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
1540 			continue;
1541 
1542 		if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
1543 			continue;
1544 
1545 		if (!sp) {
1546 			rcu_read_unlock();
1547 
1548 			if (shared)
1549 				read_unlock(&kvm->mmu_lock);
1550 			else
1551 				write_unlock(&kvm->mmu_lock);
1552 
1553 			sp = tdp_mmu_alloc_sp_for_split();
1554 
1555 			if (shared)
1556 				read_lock(&kvm->mmu_lock);
1557 			else
1558 				write_lock(&kvm->mmu_lock);
1559 
1560 			if (!sp) {
1561 				trace_kvm_mmu_split_huge_page(iter.gfn,
1562 							      iter.old_spte,
1563 							      iter.level, -ENOMEM);
1564 				return -ENOMEM;
1565 			}
1566 
1567 			rcu_read_lock();
1568 
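			/*
			 * mmu_lock was dropped for the allocation, so anything
			 * could have changed.  Flag the iteration as yielded so
			 * that the walk resumes at, and re-reads, the current
			 * SPTE rather than stepping past a potentially stale
			 * value.
			 */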
1569 			iter.yielded = true;
1570 			continue;
1571 		}
1572 
1573 		tdp_mmu_init_child_sp(sp, &iter);
1574 
1575 		if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
1576 			goto retry;
1577 
1578 		sp = NULL;
1579 	}
1580 
1581 	rcu_read_unlock();
1582 
1583 	/*
1584 	 * It's possible to exit the loop having never used the last sp if, for
1585 	 * example, a vCPU doing HugePage NX splitting wins the race and
1586 	 * installs its own sp in place of the last sp we tried to split.
1587 	 */
1588 	if (sp)
1589 		tdp_mmu_free_sp(sp);
1590 
1591 	return 0;
1592 }
1593 
1594 
1595 /*
1596  * Try to split all huge pages mapped by the TDP MMU down to the target level.
1597  */
kvm_tdp_mmu_try_split_huge_pages(struct kvm * kvm,const struct kvm_memory_slot * slot,gfn_t start,gfn_t end,int target_level,bool shared)1598 void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
1599 				      const struct kvm_memory_slot *slot,
1600 				      gfn_t start, gfn_t end,
1601 				      int target_level, bool shared)
1602 {
1603 	struct kvm_mmu_page *root;
1604 	int r = 0;
1605 
1606 	kvm_lockdep_assert_mmu_lock_held(kvm, shared);
1607 	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id) {
1608 		r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared);
1609 		if (r) {
1610 			kvm_tdp_mmu_put_root(kvm, root);
1611 			break;
1612 		}
1613 	}
1614 }
1615 
tdp_mmu_need_write_protect(struct kvm_mmu_page * sp)1616 static bool tdp_mmu_need_write_protect(struct kvm_mmu_page *sp)
1617 {
1618 	/*
1619 	 * All TDP MMU shadow pages share the same role as their root, aside
1620 	 * from level, so it is valid to key off any shadow page to determine if
1621 	 * write protection is needed for an entire tree.
1622 	 */
1623 	return kvm_mmu_page_ad_need_write_protect(sp) || !kvm_ad_enabled;
1624 }
1625 
clear_dirty_gfn_range(struct kvm * kvm,struct kvm_mmu_page * root,gfn_t start,gfn_t end)1626 static void clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1627 				  gfn_t start, gfn_t end)
1628 {
1629 	const u64 dbit = tdp_mmu_need_write_protect(root) ? PT_WRITABLE_MASK :
1630 							    shadow_dirty_mask;
1631 	struct tdp_iter iter;
1632 
1633 	rcu_read_lock();
1634 
1635 	tdp_root_for_each_pte(iter, kvm, root, start, end) {
1636 retry:
1637 		if (!is_shadow_present_pte(iter.old_spte) ||
1638 		    !is_last_spte(iter.old_spte, iter.level))
1639 			continue;
1640 
1641 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1642 			continue;
1643 
1644 		KVM_MMU_WARN_ON(dbit == shadow_dirty_mask &&
1645 				spte_ad_need_write_protect(iter.old_spte));
1646 
1647 		if (!(iter.old_spte & dbit))
1648 			continue;
1649 
1650 		if (tdp_mmu_set_spte_atomic(kvm, &iter, iter.old_spte & ~dbit))
1651 			goto retry;
1652 	}
1653 
1654 	rcu_read_unlock();
1655 }
1656 
1657 /*
1658  * Clear the dirty status (D-bit or W-bit) of all the SPTEs mapping GFNs in the
1659  * memslot.
1660  */
kvm_tdp_mmu_clear_dirty_slot(struct kvm * kvm,const struct kvm_memory_slot * slot)1661 void kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
1662 				  const struct kvm_memory_slot *slot)
1663 {
1664 	struct kvm_mmu_page *root;
1665 
1666 	lockdep_assert_held_read(&kvm->mmu_lock);
1667 	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
1668 		clear_dirty_gfn_range(kvm, root, slot->base_gfn,
1669 				      slot->base_gfn + slot->npages);
1670 }
1671 
clear_dirty_pt_masked(struct kvm * kvm,struct kvm_mmu_page * root,gfn_t gfn,unsigned long mask,bool wrprot)1672 static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
1673 				  gfn_t gfn, unsigned long mask, bool wrprot)
1674 {
1675 	const u64 dbit = (wrprot || tdp_mmu_need_write_protect(root)) ? PT_WRITABLE_MASK :
1676 									shadow_dirty_mask;
1677 	struct tdp_iter iter;
1678 
1679 	lockdep_assert_held_write(&kvm->mmu_lock);
1680 
1681 	rcu_read_lock();
1682 
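	/*
	 * E.g. with gfn == 0x1000 and mask == 0xa0, the walk starts at gfn
	 * 0x1005 (__ffs(0xa0) == 5), bits 5 and 7 are cleared from the mask as
	 * gfns 0x1005 and 0x1007 are visited (assuming both are mapped by 4KiB
	 * SPTEs), and the loop below terminates as soon as the mask is empty.
	 */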
1683 	tdp_root_for_each_leaf_pte(iter, kvm, root, gfn + __ffs(mask),
1684 				    gfn + BITS_PER_LONG) {
1685 		if (!mask)
1686 			break;
1687 
1688 		KVM_MMU_WARN_ON(dbit == shadow_dirty_mask &&
1689 				spte_ad_need_write_protect(iter.old_spte));
1690 
1691 		if (iter.level > PG_LEVEL_4K ||
1692 		    !(mask & (1UL << (iter.gfn - gfn))))
1693 			continue;
1694 
1695 		mask &= ~(1UL << (iter.gfn - gfn));
1696 
1697 		if (!(iter.old_spte & dbit))
1698 			continue;
1699 
1700 		iter.old_spte = tdp_mmu_clear_spte_bits(iter.sptep,
1701 							iter.old_spte, dbit,
1702 							iter.level);
1703 
1704 		trace_kvm_tdp_mmu_spte_changed(iter.as_id, iter.gfn, iter.level,
1705 					       iter.old_spte,
1706 					       iter.old_spte & ~dbit);
1707 	}
1708 
1709 	rcu_read_unlock();
1710 }
1711 
1712 /*
1713  * Clear the dirty status (D-bit or W-bit) of all the 4k SPTEs mapping GFNs for
1714  * which a bit is set in mask, starting at gfn. The given memslot is expected to
1715  * contain all the GFNs represented by set bits in the mask.
1716  */
kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm * kvm,struct kvm_memory_slot * slot,gfn_t gfn,unsigned long mask,bool wrprot)1717 void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1718 				       struct kvm_memory_slot *slot,
1719 				       gfn_t gfn, unsigned long mask,
1720 				       bool wrprot)
1721 {
1722 	struct kvm_mmu_page *root;
1723 
1724 	for_each_valid_tdp_mmu_root(kvm, root, slot->as_id)
1725 		clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
1726 }
1727 
tdp_mmu_make_huge_spte(struct kvm * kvm,struct tdp_iter * parent,u64 * huge_spte)1728 static int tdp_mmu_make_huge_spte(struct kvm *kvm,
1729 				  struct tdp_iter *parent,
1730 				  u64 *huge_spte)
1731 {
1732 	struct kvm_mmu_page *root = spte_to_child_sp(parent->old_spte);
1733 	gfn_t start = parent->gfn;
1734 	gfn_t end = start + KVM_PAGES_PER_HPAGE(parent->level);
1735 	struct tdp_iter iter;
1736 
1737 	tdp_root_for_each_leaf_pte(iter, kvm, root, start, end) {
1738 		/*
1739 		 * Use the parent iterator when checking for forward progress so
1740 		 * that KVM doesn't get stuck continuously trying to yield (i.e.
1741 		 * returning -EAGAIN here and then failing the forward progress
1742 		 * check in the caller ad nauseam).
1743 		 */
1744 		if (tdp_mmu_iter_need_resched(kvm, parent))
1745 			return -EAGAIN;
1746 
1747 		*huge_spte = make_huge_spte(kvm, iter.old_spte, parent->level);
1748 		return 0;
1749 	}
1750 
1751 	return -ENOENT;
1752 }
1753 
recover_huge_pages_range(struct kvm * kvm,struct kvm_mmu_page * root,const struct kvm_memory_slot * slot)1754 static void recover_huge_pages_range(struct kvm *kvm,
1755 				     struct kvm_mmu_page *root,
1756 				     const struct kvm_memory_slot *slot)
1757 {
1758 	gfn_t start = slot->base_gfn;
1759 	gfn_t end = start + slot->npages;
1760 	struct tdp_iter iter;
1761 	int max_mapping_level;
1762 	bool flush = false;
1763 	u64 huge_spte;
1764 	int r;
1765 
1766 	if (WARN_ON_ONCE(kvm_slot_dirty_track_enabled(slot)))
1767 		return;
1768 
1769 	rcu_read_lock();
1770 
1771 	for_each_tdp_pte_min_level(iter, kvm, root, PG_LEVEL_2M, start, end) {
1772 retry:
1773 		if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, true)) {
1774 			flush = false;
1775 			continue;
1776 		}
1777 
1778 		if (iter.level > KVM_MAX_HUGEPAGE_LEVEL ||
1779 		    !is_shadow_present_pte(iter.old_spte))
1780 			continue;
1781 
1782 		/*
1783 		 * Don't zap leaf SPTEs; if a leaf SPTE could be replaced with
1784 		 * a large page size, then its parent would have been zapped
1785 		 * instead of stepping down.
1786 		 */
1787 		if (is_last_spte(iter.old_spte, iter.level))
1788 			continue;
1789 
1790 		/*
1791 		 * If iter.gfn resides outside of the slot, i.e. the page for
1792 		 * the current level overlaps but is not contained by the slot,
1793 		 * then the SPTE can't be made huge.  More importantly, trying
1794 		 * to query that info from slot->arch.lpage_info will cause an
1795 		 * out-of-bounds access.
1796 		 */
1797 		if (iter.gfn < start || iter.gfn >= end)
1798 			continue;
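		/*
		 * E.g. for a slot spanning gfns [0x1080, 0x1100), a 2MiB-level
		 * SPTE covering gfns [0x1000, 0x1200) overlaps the slot but has
		 * iter.gfn == 0x1000 < start, so it is skipped above.
		 */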
1799 
1800 		max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot, iter.gfn);
1801 		if (max_mapping_level < iter.level)
1802 			continue;
1803 
1804 		r = tdp_mmu_make_huge_spte(kvm, &iter, &huge_spte);
1805 		if (r == -EAGAIN)
1806 			goto retry;
1807 		else if (r)
1808 			continue;
1809 
1810 		if (tdp_mmu_set_spte_atomic(kvm, &iter, huge_spte))
1811 			goto retry;
1812 
1813 		flush = true;
1814 	}
1815 
1816 	if (flush)
1817 		kvm_flush_remote_tlbs_memslot(kvm, slot);
1818 
1819 	rcu_read_unlock();
1820 }
1821 
1822 /*
1823  * Recover huge page mappings within the slot by replacing non-leaf SPTEs with
1824  * huge SPTEs where possible.
1825  */
kvm_tdp_mmu_recover_huge_pages(struct kvm * kvm,const struct kvm_memory_slot * slot)1826 void kvm_tdp_mmu_recover_huge_pages(struct kvm *kvm,
1827 				    const struct kvm_memory_slot *slot)
1828 {
1829 	struct kvm_mmu_page *root;
1830 
1831 	lockdep_assert_held_read(&kvm->mmu_lock);
1832 	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id)
1833 		recover_huge_pages_range(kvm, root, slot);
1834 }
1835 
1836 /*
1837  * Removes write access on the last level SPTE mapping this GFN and unsets the
1838  * MMU-writable bit to ensure future writes continue to be intercepted.
1839  * Returns true if an SPTE was set and a TLB flush is needed.
1840  */
write_protect_gfn(struct kvm * kvm,struct kvm_mmu_page * root,gfn_t gfn,int min_level)1841 static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
1842 			      gfn_t gfn, int min_level)
1843 {
1844 	struct tdp_iter iter;
1845 	u64 new_spte;
1846 	bool spte_set = false;
1847 
1848 	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1849 
1850 	rcu_read_lock();
1851 
1852 	for_each_tdp_pte_min_level(iter, kvm, root, min_level, gfn, gfn + 1) {
1853 		if (!is_shadow_present_pte(iter.old_spte) ||
1854 		    !is_last_spte(iter.old_spte, iter.level))
1855 			continue;
1856 
1857 		new_spte = iter.old_spte &
1858 			~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
1859 
1860 		if (new_spte == iter.old_spte)
1861 			break;
1862 
1863 		tdp_mmu_iter_set_spte(kvm, &iter, new_spte);
1864 		spte_set = true;
1865 	}
1866 
1867 	rcu_read_unlock();
1868 
1869 	return spte_set;
1870 }
1871 
1872 /*
1873  * Removes write access on the last level SPTE mapping this GFN and unsets the
1874  * MMU-writable bit to ensure future writes continue to be intercepted.
1875  * Returns true if an SPTE was set and a TLB flush is needed.
1876  */
kvm_tdp_mmu_write_protect_gfn(struct kvm * kvm,struct kvm_memory_slot * slot,gfn_t gfn,int min_level)1877 bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
1878 				   struct kvm_memory_slot *slot, gfn_t gfn,
1879 				   int min_level)
1880 {
1881 	struct kvm_mmu_page *root;
1882 	bool spte_set = false;
1883 
1884 	lockdep_assert_held_write(&kvm->mmu_lock);
1885 	for_each_valid_tdp_mmu_root(kvm, root, slot->as_id)
1886 		spte_set |= write_protect_gfn(kvm, root, gfn, min_level);
1887 
1888 	return spte_set;
1889 }
1890 
1891 /*
1892  * Return the level of the lowest level SPTE added to sptes.
1893  * That SPTE may be non-present.
1894  *
1895  * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
1896  */
kvm_tdp_mmu_get_walk(struct kvm_vcpu * vcpu,u64 addr,u64 * sptes,int * root_level)1897 int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
1898 			 int *root_level)
1899 {
1900 	struct kvm_mmu_page *root = root_to_sp(vcpu->arch.mmu->root.hpa);
1901 	struct tdp_iter iter;
1902 	gfn_t gfn = addr >> PAGE_SHIFT;
1903 	int leaf = -1;
1904 
1905 	*root_level = vcpu->arch.mmu->root_role.level;
1906 
1907 	tdp_mmu_for_each_pte(iter, vcpu->kvm, root, gfn, gfn + 1) {
1908 		leaf = iter.level;
1909 		sptes[leaf] = iter.old_spte;
1910 	}
1911 
1912 	return leaf;
1913 }
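/*
 * A caller is expected to consume the walk from the root level down to the
 * returned level, e.g. (minimal sketch; inspect_spte() is a placeholder and
 * @sptes is assumed to hold PT64_ROOT_MAX_LEVEL + 1 entries indexed by level):
 *
 *	leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root_level);
 *	for (level = root_level; leaf > 0 && level >= leaf; level--)
 *		inspect_spte(sptes[level]);
 */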
1914 
1915 /*
1916  * Returns the last level spte pointer of the shadow page walk for the given
1917  * gpa, and sets *spte to the spte value. This spte may be non-present. If no
1918  * walk could be performed, returns NULL and *spte does not contain valid data.
1919  *
1920  * Contract:
1921  *  - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
1922  *  - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
1923  *
1924  * WARNING: This function is only intended to be called during fast_page_fault.
1925  */
kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu * vcpu,gfn_t gfn,u64 * spte)1926 u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gfn_t gfn,
1927 					u64 *spte)
1928 {
1929 	/* Fast pf is not supported for mirrored roots */
1930 	struct kvm_mmu_page *root = tdp_mmu_get_root(vcpu, KVM_DIRECT_ROOTS);
1931 	struct tdp_iter iter;
1932 	tdp_ptep_t sptep = NULL;
1933 
1934 	tdp_mmu_for_each_pte(iter, vcpu->kvm, root, gfn, gfn + 1) {
1935 		*spte = iter.old_spte;
1936 		sptep = iter.sptep;
1937 	}
1938 
1939 	/*
1940 	 * Perform the rcu_dereference to get the raw spte pointer value since
1941 	 * we are passing it up to fast_page_fault, which is shared with the
1942 	 * legacy MMU and thus does not retain the TDP MMU-specific __rcu
1943 	 * annotation.
1944 	 *
1945 	 * This is safe since fast_page_fault obeys the contracts of this
1946 	 * function as well as all TDP MMU contracts around modifying SPTEs
1947 	 * outside of mmu_lock.
1948 	 */
1949 	return rcu_dereference(sptep);
1950 }
1951