xref: /linux/arch/x86/kvm/mmu/tdp_mmu.c (revision 6c8c1406a6d6a3f2e61ac590f5c0994231bc6be7)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include "mmu.h"
4 #include "mmu_internal.h"
5 #include "mmutrace.h"
6 #include "tdp_iter.h"
7 #include "tdp_mmu.h"
8 #include "spte.h"
9 
10 #include <asm/cmpxchg.h>
11 #include <trace/events/kvm.h>
12 
/*
 * Global on/off switch for the TDP MMU.  It defaults to true and is exposed
 * as the "tdp_mmu" module parameter with mode 0644.  The value is sampled
 * with READ_ONCE() when a VM's TDP MMU state is initialized.
 */
13 static bool __read_mostly tdp_mmu_enabled = true;
14 module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
15 
16 /* Initializes the TDP MMU for the VM, if enabled. */
17 int kvm_mmu_init_tdp_mmu(struct kvm *kvm)
18 {
19 	struct workqueue_struct *wq;
20 
21 	if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
22 		return 0;
23 
24 	wq = alloc_workqueue("kvm", WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 0);
25 	if (!wq)
26 		return -ENOMEM;
27 
28 	/* This should not be changed for the lifetime of the VM. */
29 	kvm->arch.tdp_mmu_enabled = true;
30 	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
31 	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
32 	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
33 	kvm->arch.tdp_mmu_zap_wq = wq;
34 	return 1;
35 }
36 
37 /* Arbitrarily returns true so that this may be used in if statements. */
38 static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
39 							     bool shared)
40 {
41 	if (shared)
42 		lockdep_assert_held_read(&kvm->mmu_lock);
43 	else
44 		lockdep_assert_held_write(&kvm->mmu_lock);
45 
46 	return true;
47 }
48 
/*
 * Tear down the per-VM TDP MMU state; a no-op when the TDP MMU was never
 * enabled for @kvm.  Destroys the zap workqueue, warns if any TDP MMU pages
 * or roots are still on their lists, and finally waits for outstanding RCU
 * callbacks.
 */
49 void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
50 {
51 	if (!kvm->arch.tdp_mmu_enabled)
52 		return;
53 
54 	/* Also waits for any queued work items.  */
55 	destroy_workqueue(kvm->arch.tdp_mmu_zap_wq);
56 
	/* Leftover pages or roots at this point are reported, not cleaned up. */
57 	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages));
58 	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
59 
60 	/*
61 	 * Ensure that all the outstanding RCU callbacks to free shadow pages
62 	 * can run before the VM is torn down.  Work items on tdp_mmu_zap_wq
63 	 * can call kvm_tdp_mmu_put_root and create new callbacks.
64 	 */
65 	rcu_barrier();
66 }
67 
68 static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
69 {
70 	free_page((unsigned long)sp->spt);
71 	kmem_cache_free(mmu_page_header_cache, sp);
72 }
73 
74 /*
75  * This is called through call_rcu in order to free TDP page table memory
76  * safely with respect to other kernel threads that may be operating on
77  * the memory.
78  * By only accessing TDP MMU page table memory in an RCU read critical
79  * section, and freeing it after a grace period, lockless access to that
80  * memory won't use it after it is freed.
81  */
82 static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
83 {
84 	struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
85 					       rcu_head);
86 
87 	tdp_mmu_free_sp(sp);
88 }
89 
/*
 * Forward declaration: tdp_mmu_zap_root() is called by the zap worker below
 * but is defined later in the file.
 */
90 static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
91 			     bool shared);
92 
/*
 * Workqueue callback that zaps a root queued by tdp_mmu_schedule_zap_root().
 * The root is recovered from the embedded work item and the owning VM from
 * root->tdp_mmu_async_data.  Runs with mmu_lock held for read: the root is
 * zapped in shared mode and then one reference to it is dropped through
 * kvm_tdp_mmu_put_root().
 */
93 static void tdp_mmu_zap_root_work(struct work_struct *work)
94 {
95 	struct kvm_mmu_page *root = container_of(work, struct kvm_mmu_page,
96 						 tdp_mmu_async_work);
97 	struct kvm *kvm = root->tdp_mmu_async_data;
98 
99 	read_lock(&kvm->mmu_lock);
100 
101 	/*
102 	 * A TLB flush is not necessary as KVM performs a local TLB flush when
103 	 * allocating a new root (see kvm_mmu_load()), and when migrating vCPU
104 	 * to a different pCPU.  Note, the local TLB flush on reuse also
105 	 * invalidates any paging-structure-cache entries, i.e. TLB entries for
106 	 * intermediate paging structures, that may be zapped, as such entries
107 	 * are associated with the ASID on both VMX and SVM.
108 	 */
109 	tdp_mmu_zap_root(kvm, root, true);
110 
111 	/*
112 	 * Drop the refcount using kvm_tdp_mmu_put_root() to test its logic for
113 	 * avoiding an infinite loop.  By design, the root is reachable while
114 	 * it's being asynchronously zapped, thus a different task can put its
115 	 * last reference, i.e. flowing through kvm_tdp_mmu_put_root() for an
116 	 * asynchronously zapped root is unavoidable.
117 	 */
118 	kvm_tdp_mmu_put_root(kvm, root, true);
119 
120 	read_unlock(&kvm->mmu_lock);
121 }
122 
/*
 * Queue @root on the VM's zap workqueue so that tdp_mmu_zap_root_work() zaps
 * it asynchronously.  @kvm is stashed in the root beforehand because the
 * worker only receives the work item and reads the VM back from
 * root->tdp_mmu_async_data.
 */
123 static void tdp_mmu_schedule_zap_root(struct kvm *kvm, struct kvm_mmu_page *root)
124 {
125 	root->tdp_mmu_async_data = kvm;
126 	INIT_WORK(&root->tdp_mmu_async_work, tdp_mmu_zap_root_work);
127 	queue_work(kvm->arch.tdp_mmu_zap_wq, &root->tdp_mmu_async_work);
128 }
129 
/*
 * Atomically set the invalid bit in @page's role.
 *
 * Returns the previous value of the bit, i.e. true if the page had already
 * been marked invalid and false if this call performed the transition.
 */
130 static inline bool kvm_tdp_root_mark_invalid(struct kvm_mmu_page *page)
131 {
132 	union kvm_mmu_page_role role = page->role;
133 	role.invalid = true;
134 
135 	/* No need to use cmpxchg, only the invalid bit can change.  */
136 	role.word = xchg(&page->role.word, role.word);
	/* role now holds the word that was in place before the exchange. */
137 	return role.invalid;
138 }
139 
/*
 * Drop a reference to @root.  @shared states whether mmu_lock is held for
 * read (true) or for write (false); the matching mode is asserted via lockdep.
 *
 * Nothing else happens unless this was the last reference.  When the last
 * reference is dropped on a still-valid root, the root is marked invalid,
 * given one new reference and queued for asynchronous zapping.  When it is
 * dropped on an already-invalid root, the root is unlinked from the list of
 * roots and freed after an RCU grace period.
 */
140 void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
141 			  bool shared)
142 {
143 	kvm_lockdep_assert_mmu_lock_held(kvm, shared);
144 
145 	if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
146 		return;
147 
148 	WARN_ON(!root->tdp_mmu_page);
149 
150 	/*
151 	 * The root now has refcount=0.  It is valid, but readers already
152 	 * cannot acquire a reference to it because kvm_tdp_mmu_get_root()
153 	 * rejects it.  This remains true for the rest of the execution
154 	 * of this function, because readers visit valid roots only
155 	 * (except for tdp_mmu_zap_root_work(), which however
156 	 * does not acquire any reference itself).
157 	 *
158 	 * Even though there are flows that need to visit all roots for
159 	 * correctness, they all take mmu_lock for write, so they cannot yet
160 	 * run concurrently. The same is true after kvm_tdp_root_mark_invalid,
161 	 * since the root still has refcount=0.
162 	 *
163 	 * However, tdp_mmu_zap_root can yield, and writers do not expect to
164 	 * see refcount=0 (see for example kvm_tdp_mmu_invalidate_all_roots()).
165 	 * So the root temporarily gets an extra reference, going to refcount=1
166 	 * while staying invalid.  Readers still cannot acquire any reference;
167 	 * but writers are now allowed to run if tdp_mmu_zap_root yields and
168 	 * they might take an extra reference if they themselves yield.
169 	 * Therefore, when the reference is given back by the worker,
170 	 * there is no guarantee that the refcount is still 1.  If not, whoever
171 	 * puts the last reference will free the page, but they will not have to
172 	 * zap the root because a root cannot go from invalid to valid.
173 	 */
174 	if (!kvm_tdp_root_mark_invalid(root)) {
175 		refcount_set(&root->tdp_mmu_root_count, 1);
176 
177 		/*
178 		 * Zapping the root in a worker is not just "nice to have";
179 		 * it is required because kvm_tdp_mmu_invalidate_all_roots()
180 		 * skips already-invalid roots.  If kvm_tdp_mmu_put_root() did
181 		 * not add the root to the workqueue, kvm_tdp_mmu_zap_all_fast()
182 		 * might return with some roots not zapped yet.
183 		 */
184 		tdp_mmu_schedule_zap_root(kvm, root);
185 		return;
186 	}
187 
	/*
	 * The root was already invalid: unlink it under tdp_mmu_pages_lock and
	 * defer the actual free to an RCU callback.
	 */
188 	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
189 	list_del_rcu(&root->link);
190 	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
191 	call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
192 }
193 
194 /*
195  * Returns the next root after @prev_root (or the first root if @prev_root is
196  * NULL).  A reference to the returned root is acquired, and the reference to
197  * @prev_root is released (the caller obviously must hold a reference to
198  * @prev_root if it's non-NULL).
199  *
200  * If @only_valid is true, invalid roots are skipped.
201  *
 * @shared is passed through to kvm_tdp_mmu_put_root() when the reference to
 * @prev_root is released.
 *
202  * Returns NULL if the end of tdp_mmu_roots was reached.
203  */
204 static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
205 					      struct kvm_mmu_page *prev_root,
206 					      bool shared, bool only_valid)
207 {
208 	struct kvm_mmu_page *next_root;
209 
	/* The list of roots is walked under RCU protection. */
210 	rcu_read_lock();
211 
212 	if (prev_root)
213 		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
214 						  &prev_root->link,
215 						  typeof(*prev_root), link);
216 	else
217 		next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
218 						   typeof(*next_root), link);
219 
	/*
	 * Advance until a root passes the validity filter and a reference to
	 * it is successfully taken, or until the list is exhausted.
	 */
220 	while (next_root) {
221 		if ((!only_valid || !next_root->role.invalid) &&
222 		    kvm_tdp_mmu_get_root(next_root))
223 			break;
224 
225 		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
226 				&next_root->link, typeof(*next_root), link);
227 	}
228 
229 	rcu_read_unlock();
230 
	/* @prev_root is only released after the next root has been looked up. */
231 	if (prev_root)
232 		kvm_tdp_mmu_put_root(kvm, prev_root, shared);
233 
234 	return next_root;
235 }
236 
237 /*
238  * Note: this iterator gets and puts references to the roots it iterates over.
239  * This makes it safe to release the MMU lock and yield within the loop, but
240  * if exiting the loop early, the caller must drop the reference to the most
241  * recent root. (Unless keeping a live reference is desirable.)
242  *
243  * If shared is set, this function is operating under the MMU lock in read
244  * mode. In the unlikely event that this thread must free a root, the lock
245  * will be temporarily dropped and reacquired in write mode.
 *
 * NOTE(review): kvm_tdp_mmu_put_root() as it appears in this file neither
 * drops nor upgrades mmu_lock when the last reference goes away, so the
 * sentence about reacquiring the lock in write mode looks stale -- confirm.
 *
 * Roots whose address space ID differs from _as_id are skipped; the loop
 * body is attached to the trailing "else".  The lock mode implied by _shared
 * is asserted via lockdep on every iteration.
246  */
247 #define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, _only_valid)\
248 	for (_root = tdp_mmu_next_root(_kvm, NULL, _shared, _only_valid);	\
249 	     _root;								\
250 	     _root = tdp_mmu_next_root(_kvm, _root, _shared, _only_valid))	\
251 		if (kvm_lockdep_assert_mmu_lock_held(_kvm, _shared) &&		\
252 		    kvm_mmu_page_as_id(_root) != _as_id) {			\
253 		} else
254 
/* Yield-safe walk over valid roots only; the caller chooses the lock mode. */
255 #define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared)	\
256 	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared, true)
257 
/* Yield-safe walk over all roots, valid or not, with mmu_lock held for write. */
258 #define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id)			\
259 	__for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, false, false)
260 
261 /*
262  * Iterate over all TDP MMU roots.  Requires that mmu_lock be held for write,
263  * the implication being that any flow that holds mmu_lock for read is
264  * inherently yield-friendly and should use the yield-safe variant above.
265  * Holding mmu_lock for write obviates the need for RCU protection as the list
266  * is guaranteed to be stable.
 *
 * Roots whose address space ID differs from _as_id are skipped; the loop
 * body is attached to the trailing "else".  No references to the roots are
 * taken or dropped by this iterator.
267  */
268 #define for_each_tdp_mmu_root(_kvm, _root, _as_id)			\
269 	list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link)	\
270 		if (kvm_lockdep_assert_mmu_lock_held(_kvm, false) &&	\
271 		    kvm_mmu_page_as_id(_root) != _as_id) {		\
272 		} else
273 
274 static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
275 {
276 	struct kvm_mmu_page *sp;
277 
278 	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
279 	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
280 
281 	return sp;
282 }
283 
284 static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
285 			    gfn_t gfn, union kvm_mmu_page_role role)
286 {
287 	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
288 
289 	sp->role = role;
290 	sp->gfn = gfn;
291 	sp->ptep = sptep;
292 	sp->tdp_mmu_page = true;
293 
294 	trace_kvm_mmu_get_page(sp, true);
295 }
296 
297 static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
298 				  struct tdp_iter *iter)
299 {
300 	struct kvm_mmu_page *parent_sp;
301 	union kvm_mmu_page_role role;
302 
303 	parent_sp = sptep_to_sp(rcu_dereference(iter->sptep));
304 
305 	role = parent_sp->role;
306 	role.level--;
307 
308 	tdp_mmu_init_sp(child_sp, iter->sptep, iter->gfn, role);
309 }
310 
/*
 * Return the physical address of the TDP MMU root page table matching
 * @vcpu's current root role.  An existing root with an identical role is
 * reused if a reference to it can be taken; otherwise a new root is
 * allocated, given an initial reference and added to the VM's list of roots.
 * mmu_lock must be held for write.
 */
311 hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
312 {
313 	union kvm_mmu_page_role role = vcpu->arch.mmu->root_role;
314 	struct kvm *kvm = vcpu->kvm;
315 	struct kvm_mmu_page *root;
316 
317 	lockdep_assert_held_write(&kvm->mmu_lock);
318 
319 	/*
320 	 * Check for an existing root before allocating a new one.  Note, the
321 	 * role check prevents consuming an invalid root.
322 	 */
323 	for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
324 		if (root->role.word == role.word &&
325 		    kvm_tdp_mmu_get_root(root))
326 			goto out;
327 	}
328 
	/* No reusable root: build a new one with no parent SPTE and GFN 0. */
329 	root = tdp_mmu_alloc_sp(vcpu);
330 	tdp_mmu_init_sp(root, NULL, 0, role);
331 
332 	refcount_set(&root->tdp_mmu_root_count, 1);
333 
	/* Publish the root on the RCU-protected list of roots. */
334 	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
335 	list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
336 	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
337 
338 out:
339 	return __pa(root->spt);
340 }
341 
/*
 * Forward declaration: handle_removed_pt() calls handle_changed_spte(), which
 * is defined further down in the file.
 */
342 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
343 				u64 old_spte, u64 new_spte, int level,
344 				bool shared);
345 
346 static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
347 {
348 	if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
349 		return;
350 
351 	if (is_accessed_spte(old_spte) &&
352 	    (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) ||
353 	     spte_to_pfn(old_spte) != spte_to_pfn(new_spte)))
354 		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
355 }
356 
357 static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
358 					  u64 old_spte, u64 new_spte, int level)
359 {
360 	bool pfn_changed;
361 	struct kvm_memory_slot *slot;
362 
363 	if (level > PG_LEVEL_4K)
364 		return;
365 
366 	pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
367 
368 	if ((!is_writable_pte(old_spte) || pfn_changed) &&
369 	    is_writable_pte(new_spte)) {
370 		slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
371 		mark_page_dirty_in_slot(kvm, slot, gfn);
372 	}
373 }
374 
375 static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
376 {
377 	kvm_account_pgtable_pages((void *)sp->spt, +1);
378 }
379 
380 static void tdp_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
381 {
382 	kvm_account_pgtable_pages((void *)sp->spt, -1);
383 }
384 
385 /**
386  * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages
387  *
388  * @kvm: kvm instance
389  * @sp: the page to be removed
390  * @shared: This operation may not be running under the exclusive use of
391  *	    the MMU lock and the operation must synchronize with other
392  *	    threads that might be adding or removing pages.
 *
 * Also drops the page from the page-table accounting and, if the page had
 * lpage_disallowed set, undoes its huge NX page accounting.
393  */
394 static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
395 			      bool shared)
396 {
397 	tdp_unaccount_mmu_page(kvm, sp);
	/*
	 * In shared mode the list is protected by tdp_mmu_pages_lock; in
	 * exclusive mode holding mmu_lock for write is asserted instead.
	 */
398 	if (shared)
399 		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
400 	else
401 		lockdep_assert_held_write(&kvm->mmu_lock);
402 
403 	list_del(&sp->link);
404 	if (sp->lpage_disallowed)
405 		unaccount_huge_nx_page(kvm, sp);
406 
407 	if (shared)
408 		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
409 }
410 
411 /**
412  * handle_removed_pt() - handle a page table removed from the TDP structure
413  *
414  * @kvm: kvm instance
415  * @pt: the page removed from the paging structure
416  * @shared: This operation may not be running under the exclusive use
417  *	    of the MMU lock and the operation must synchronize with other
418  *	    threads that might be modifying SPTEs.
419  *
420  * Given a page table that has been removed from the TDP paging structure,
421  * iterates through the page table to clear SPTEs and free child page tables.
422  *
423  * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
424  * protection. Since this thread removed it from the paging structure,
425  * this thread will be responsible for ensuring the page is freed. Hence the
426  * early rcu_dereferences in the function.
427  */
428 static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
429 {
430 	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
431 	int level = sp->role.level;
432 	gfn_t base_gfn = sp->gfn;
433 	int i;
434 
435 	trace_kvm_mmu_prepare_zap_page(sp);
436 
437 	tdp_mmu_unlink_sp(kvm, sp, shared);
438 
439 	for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
440 		tdp_ptep_t sptep = pt + i;
		/* Consecutive entries are KVM_PAGES_PER_HPAGE(level) GFNs apart. */
441 		gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
442 		u64 old_spte;
443 
444 		if (shared) {
445 			/*
446 			 * Set the SPTE to a nonpresent value that other
447 			 * threads will not overwrite. If the SPTE was
448 			 * already marked as removed then another thread
449 			 * handling a page fault could overwrite it, so
450 			 * set the SPTE until it is set from some other
451 			 * value to the removed SPTE value.
452 			 */
453 			for (;;) {
454 				old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, REMOVED_SPTE);
455 				if (!is_removed_spte(old_spte))
456 					break;
457 				cpu_relax();
458 			}
459 		} else {
460 			/*
461 			 * If the SPTE is not MMU-present, there is no backing
462 			 * page associated with the SPTE and so no side effects
463 			 * that need to be recorded, and exclusive ownership of
464 			 * mmu_lock ensures the SPTE can't be made present.
465 			 * Note, zapping MMIO SPTEs is also unnecessary as they
466 			 * are guarded by the memslots generation, not by being
467 			 * unreachable.
468 			 */
469 			old_spte = kvm_tdp_mmu_read_spte(sptep);
470 			if (!is_shadow_present_pte(old_spte))
471 				continue;
472 
473 			/*
474 			 * Use the common helper instead of a raw WRITE_ONCE as
475 			 * the SPTE needs to be updated atomically if it can be
476 			 * modified by a different vCPU outside of mmu_lock.
477 			 * Even though the parent SPTE is !PRESENT, the TLB
478 			 * hasn't yet been flushed, and both Intel and AMD
479 			 * document that A/D assists can use upper-level PxE
480 			 * entries that are cached in the TLB, i.e. the CPU can
481 			 * still access the page and mark it dirty.
482 			 *
483 			 * No retry is needed in the atomic update path as the
484 			 * sole concern is dropping a Dirty bit, i.e. no other
485 			 * task can zap/remove the SPTE as mmu_lock is held for
486 			 * write.  Marking the SPTE as a removed SPTE is not
487 			 * strictly necessary for the same reason, but using
488 			 * the remove SPTE value keeps the shared/exclusive
489 			 * paths consistent and allows the handle_changed_spte()
490 			 * call below to hardcode the new value to REMOVED_SPTE.
491 			 *
492 			 * Note, even though dropping a Dirty bit is the only
493 			 * scenario where a non-atomic update could result in a
494 			 * functional bug, simply checking the Dirty bit isn't
495 			 * sufficient as a fast page fault could read the upper
496 			 * level SPTE before it is zapped, and then make this
497 			 * target SPTE writable, resume the guest, and set the
498 			 * Dirty bit between reading the SPTE above and writing
499 			 * it here.
500 			 */
501 			old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte,
502 							  REMOVED_SPTE, level);
503 		}
		/* Run the bookkeeping for the old value -> REMOVED_SPTE change. */
504 		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
505 				    old_spte, REMOVED_SPTE, level, shared);
506 	}
507 
	/* The page itself is freed only after an RCU grace period. */
508 	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
509 }
510 
511 /**
512  * __handle_changed_spte - handle bookkeeping associated with an SPTE change
513  * @kvm: kvm instance
514  * @as_id: the address space of the paging structure the SPTE was a part of
515  * @gfn: the base GFN that was mapped by the SPTE
516  * @old_spte: The value of the SPTE before the change
517  * @new_spte: The value of the SPTE after the change
518  * @level: the level of the PT the SPTE is part of in the paging structure
519  * @shared: This operation may not be running under the exclusive use of
520  *	    the MMU lock and the operation must synchronize with other
521  *	    threads that might be modifying SPTEs.
522  *
523  * Handle bookkeeping that might result from the modification of a SPTE.
524  * This function must be called for all TDP SPTE modifications.
525  */
526 static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
527 				  u64 old_spte, u64 new_spte, int level,
528 				  bool shared)
529 {
530 	bool was_present = is_shadow_present_pte(old_spte);
531 	bool is_present = is_shadow_present_pte(new_spte);
532 	bool was_leaf = was_present && is_last_spte(old_spte, level);
533 	bool is_leaf = is_present && is_last_spte(new_spte, level);
534 	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
535 
	/* Sanity checks: the level is in range and @gfn is aligned to it. */
536 	WARN_ON(level > PT64_ROOT_MAX_LEVEL);
537 	WARN_ON(level < PG_LEVEL_4K);
538 	WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
539 
540 	/*
541 	 * If this warning were to trigger it would indicate that there was a
542 	 * missing MMU notifier or a race with some notifier handler.
543 	 * A present, leaf SPTE should never be directly replaced with another
544 	 * present leaf SPTE pointing to a different PFN. A notifier handler
545 	 * should be zapping the SPTE before the main MM's page table is
546 	 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
547 	 * thread before replacement.
548 	 */
549 	if (was_leaf && is_leaf && pfn_changed) {
550 		pr_err("Invalid SPTE change: cannot replace a present leaf\n"
551 		       "SPTE with another present leaf SPTE mapping a\n"
552 		       "different PFN!\n"
553 		       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
554 		       as_id, gfn, old_spte, new_spte, level);
555 
556 		/*
557 		 * Crash the host to prevent error propagation and guest data
558 		 * corruption.
559 		 */
560 		BUG();
561 	}
562 
	/* Nothing changed, so there is nothing to account for. */
563 	if (old_spte == new_spte)
564 		return;
565 
566 	trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
567 
568 	if (is_leaf)
569 		check_spte_writable_invariants(new_spte);
570 
571 	/*
572 	 * The only times a SPTE should be changed from a non-present to
573 	 * non-present state is when an MMIO entry is installed/modified/
574 	 * removed. In that case, there is nothing to do here.
575 	 */
576 	if (!was_present && !is_present) {
577 		/*
578 		 * If this change does not involve a MMIO SPTE or removed SPTE,
579 		 * it is unexpected. Log the change, though it should not
580 		 * impact the guest since both the former and current SPTEs
581 		 * are nonpresent.
582 		 */
583 		if (WARN_ON(!is_mmio_spte(old_spte) &&
584 			    !is_mmio_spte(new_spte) &&
585 			    !is_removed_spte(new_spte)))
586 			pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
587 			       "should not be replaced with another,\n"
588 			       "different nonpresent SPTE, unless one or both\n"
589 			       "are MMIO SPTEs, or the new SPTE is\n"
590 			       "a temporary removed SPTE.\n"
591 			       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
592 			       as_id, gfn, old_spte, new_spte, level);
593 		return;
594 	}
595 
	/* Adjust the per-level page stats when a leaf appears or disappears. */
596 	if (is_leaf != was_leaf)
597 		kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);
598 
	/*
	 * Report the old PFN as dirty if a dirty leaf is zapped, loses its
	 * dirty state, or is pointed at a different PFN.
	 */
599 	if (was_leaf && is_dirty_spte(old_spte) &&
600 	    (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
601 		kvm_set_pfn_dirty(spte_to_pfn(old_spte));
602 
603 	/*
604 	 * Recursively handle child PTs if the change removed a subtree from
605 	 * the paging structure.  Note the WARN on the PFN changing without the
606 	 * SPTE being converted to a hugepage (leaf) or being zapped.  Shadow
607 	 * pages are kernel allocations and should never be migrated.
608 	 */
609 	if (was_present && !was_leaf &&
610 	    (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed)))
611 		handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared);
612 }
613 
/*
 * Full bookkeeping for an SPTE change: the common handling in
 * __handle_changed_spte(), followed by accessed-state tracking and dirty
 * logging.  Parameters have the same meaning as for __handle_changed_spte().
 */
614 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
615 				u64 old_spte, u64 new_spte, int level,
616 				bool shared)
617 {
618 	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
619 			      shared);
620 	handle_changed_spte_acc_track(old_spte, new_spte, level);
621 	handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
622 				      new_spte, level);
623 }
624 
625 /*
626  * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
627  * and handle the associated bookkeeping.  Do not mark the page dirty
628  * in KVM's dirty bitmaps.
629  *
630  * If setting the SPTE fails because it has changed, iter->old_spte will be
631  * refreshed to the current value of the spte.
632  *
 * Must be called with mmu_lock held for read.
 *
633  * @kvm: kvm instance
634  * @iter: a tdp_iter instance currently on the SPTE that should be set
635  * @new_spte: The value the SPTE should be set to
636  * Return:
637  * * 0      - If the SPTE was set.
638  * * -EBUSY - If the SPTE cannot be set. In this case this function will have
639  *            no side-effects other than setting iter->old_spte to the last
640  *            known value of the spte.
641  */
642 static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm,
643 					  struct tdp_iter *iter,
644 					  u64 new_spte)
645 {
646 	u64 *sptep = rcu_dereference(iter->sptep);
647 
648 	/*
649 	 * The caller is responsible for ensuring the old SPTE is not a REMOVED
650 	 * SPTE.  KVM should never attempt to zap or manipulate a REMOVED SPTE,
651 	 * and pre-checking before inserting a new SPTE is advantageous as it
652 	 * avoids unnecessary work.
653 	 */
654 	WARN_ON_ONCE(iter->yielded || is_removed_spte(iter->old_spte));
655 
656 	lockdep_assert_held_read(&kvm->mmu_lock);
657 
658 	/*
659 	 * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
660 	 * does not hold the mmu_lock.
661 	 */
662 	if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte))
663 		return -EBUSY;
664 
	/*
	 * The exchange succeeded: run the common bookkeeping in shared mode
	 * plus accessed-state tracking.  Dirty logging is deliberately left
	 * out here.
	 */
665 	__handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
666 			      new_spte, iter->level, true);
667 	handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);
668 
669 	return 0;
670 }
671 
/*
 * Zap the SPTE that @iter points at while mmu_lock is held for read.  The
 * SPTE is first frozen with REMOVED_SPTE, remote TLBs are flushed for the
 * range it mapped, and only then is it cleared to 0.
 *
 * Returns 0 on success, or the error from tdp_mmu_set_spte_atomic() if the
 * SPTE could not be frozen.
 */
672 static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
673 					  struct tdp_iter *iter)
674 {
675 	int ret;
676 
677 	/*
678 	 * Freeze the SPTE by setting it to a special,
679 	 * non-present value. This will stop other threads from
680 	 * immediately installing a present entry in its place
681 	 * before the TLBs are flushed.
682 	 */
683 	ret = tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE);
684 	if (ret)
685 		return ret;
686 
687 	kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
688 					   KVM_PAGES_PER_HPAGE(iter->level));
689 
690 	/*
691 	 * No other thread can overwrite the removed SPTE as they must either
692 	 * wait on the MMU lock or use tdp_mmu_set_spte_atomic() which will not
693 	 * overwrite the special removed SPTE value. No bookkeeping is needed
694 	 * here since the SPTE is going from non-present to non-present.  Use
695 	 * the raw write helper to avoid an unnecessary check on volatile bits.
696 	 */
697 	__kvm_tdp_mmu_write_spte(iter->sptep, 0);
698 
699 	return 0;
700 }
701 
702 
703 /*
704  * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
705  * @kvm:	      KVM instance
706  * @as_id:	      Address space ID, i.e. regular vs. SMM
707  * @sptep:	      Pointer to the SPTE
708  * @old_spte:	      The current value of the SPTE
709  * @new_spte:	      The new value that will be set for the SPTE
710  * @gfn:	      The base GFN that was (or will be) mapped by the SPTE
711  * @level:	      The level _containing_ the SPTE (its parent PT's level)
712  * @record_acc_track: Notify the MM subsystem of changes to the accessed state
713  *		      of the page. Should be set unless handling an MMU
714  *		      notifier for access tracking. Leaving record_acc_track
715  *		      unset in that case prevents page accesses from being
716  *		      double counted.
717  * @record_dirty_log: Record the page as dirty in the dirty bitmap if
718  *		      appropriate for the change being made. Should be set
719  *		      unless performing certain dirty logging operations.
720  *		      Leaving record_dirty_log unset in that case prevents page
721  *		      writes from being double counted.
722  *
 * Must be called with mmu_lock held for write.
 *
723  * Returns the old SPTE value, which _may_ be different than @old_spte if the
724  * SPTE had volatile bits.
725  */
726 static u64 __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
727 			      u64 old_spte, u64 new_spte, gfn_t gfn, int level,
728 			      bool record_acc_track, bool record_dirty_log)
729 {
730 	lockdep_assert_held_write(&kvm->mmu_lock);
731 
732 	/*
733 	 * No thread should be using this function to set SPTEs to or from the
734 	 * temporary removed SPTE value.
735 	 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
736 	 * should be used. If operating under the MMU lock in write mode, the
737 	 * use of the removed SPTE should not be necessary.
738 	 */
739 	WARN_ON(is_removed_spte(old_spte) || is_removed_spte(new_spte));
740 
	/* The write helper hands back the value the SPTE actually held. */
741 	old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);
742 
743 	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);
744 
745 	if (record_acc_track)
746 		handle_changed_spte_acc_track(old_spte, new_spte, level);
747 	if (record_dirty_log)
748 		handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
749 					      new_spte, level);
750 	return old_spte;
751 }
752 
/*
 * Iterator-based wrapper around __tdp_mmu_set_spte(): takes the address
 * space, SPTE pointer, old value, GFN and level from @iter, and stores the
 * returned old SPTE value back into iter->old_spte.  Warns once if the
 * iterator has yielded.
 */
753 static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
754 				     u64 new_spte, bool record_acc_track,
755 				     bool record_dirty_log)
756 {
757 	WARN_ON_ONCE(iter->yielded);
758 
759 	iter->old_spte = __tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep,
760 					    iter->old_spte, new_spte,
761 					    iter->gfn, iter->level,
762 					    record_acc_track, record_dirty_log);
763 }
764 
765 static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
766 				    u64 new_spte)
767 {
768 	_tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
769 }
770 
771 static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
772 						 struct tdp_iter *iter,
773 						 u64 new_spte)
774 {
775 	_tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
776 }
777 
778 static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
779 						 struct tdp_iter *iter,
780 						 u64 new_spte)
781 {
782 	_tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
783 }
784 
/* Walk the SPTEs under _root for the GFN range _start.._end. */
785 #define tdp_root_for_each_pte(_iter, _root, _start, _end) \
786 	for_each_tdp_pte(_iter, _root, _start, _end)
787 
/*
 * Same walk, but the loop body only runs for present leaf SPTEs; every other
 * entry is skipped with "continue".
 */
788 #define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)	\
789 	tdp_root_for_each_pte(_iter, _root, _start, _end)		\
790 		if (!is_shadow_present_pte(_iter.old_spte) ||		\
791 		    !is_last_spte(_iter.old_spte, _iter.level))		\
792 			continue;					\
793 		else
794 
/* Walk the SPTEs under the shadow page that backs _mmu's current root. */
795 #define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)		\
796 	for_each_tdp_pte(_iter, to_shadow_page(_mmu->root.hpa), _start, _end)
797 
798 /*
799  * Yield if the MMU lock is contended or this thread needs to return control
800  * to the scheduler.
801  *
802  * If this function should yield and flush is set, it will perform a remote
803  * TLB flush before yielding.
804  *
805  * If this function yields, iter->yielded is set and the caller must skip to
806  * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk
807  * over the paging structures to allow the iterator to continue its traversal
808  * from the paging structure root.
809  *
 * @shared selects whether mmu_lock is rescheduled in read or write mode.
 *
810  * Returns true if this function yielded.
811  */
812 static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
813 							  struct tdp_iter *iter,
814 							  bool flush, bool shared)
815 {
816 	WARN_ON(iter->yielded);
817 
818 	/* Ensure forward progress has been made before yielding. */
819 	if (iter->next_last_level_gfn == iter->yielded_gfn)
820 		return false;
821 
822 	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
823 		if (flush)
824 			kvm_flush_remote_tlbs(kvm);
825 
		/* Leave the RCU read-side section around the reschedule. */
826 		rcu_read_unlock();
827 
828 		if (shared)
829 			cond_resched_rwlock_read(&kvm->mmu_lock);
830 		else
831 			cond_resched_rwlock_write(&kvm->mmu_lock);
832 
833 		rcu_read_lock();
834 
835 		WARN_ON(iter->gfn > iter->next_last_level_gfn);
836 
837 		iter->yielded = true;
838 	}
839 
840 	return iter->yielded;
841 }
842 
843 static inline gfn_t tdp_mmu_max_gfn_exclusive(void)
844 {
845 	/*
846 	 * Bound TDP MMU walks at host.MAXPHYADDR.  KVM disallows memslots with
847 	 * a gpa range that would exceed the max gfn, and KVM does not create
848 	 * MMIO SPTEs for "impossible" gfns, instead sending such accesses down
849 	 * the slow emulation path every time.
850 	 */
851 	return kvm_mmu_max_gfn() + 1;
852 }
853 
/*
 * Zap every present SPTE at @zap_level under @root, covering GFNs from 0 up
 * to tdp_mmu_max_gfn_exclusive().  The walk does not descend below
 * @zap_level and skips entries above it.  @shared selects the locking mode:
 * a plain write when mmu_lock is held for write, an atomic update that is
 * retried on failure when it is held for read.
 */
854 static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
855 			       bool shared, int zap_level)
856 {
857 	struct tdp_iter iter;
858 
859 	gfn_t end = tdp_mmu_max_gfn_exclusive();
860 	gfn_t start = 0;
861 
862 	for_each_tdp_pte_min_level(iter, root, zap_level, start, end) {
863 retry:
		/* After a yield, move on so the iterator can restart its walk. */
864 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
865 			continue;
866 
867 		if (!is_shadow_present_pte(iter.old_spte))
868 			continue;
869 
870 		if (iter.level > zap_level)
871 			continue;
872 
		/*
		 * A failed atomic update has refreshed iter.old_spte, so retry
		 * the same entry instead of advancing.
		 */
873 		if (!shared)
874 			tdp_mmu_set_spte(kvm, &iter, 0);
875 		else if (tdp_mmu_set_spte_atomic(kvm, &iter, 0))
876 			goto retry;
877 	}
878 }
879 
880 static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
881 			     bool shared)
882 {
883 
884 	/*
885 	 * The root must have an elevated refcount so that it's reachable via
886 	 * mmu_notifier callbacks, which allows this path to yield and drop
887 	 * mmu_lock.  When handling an unmap/release mmu_notifier command, KVM
888 	 * must drop all references to relevant pages prior to completing the
889 	 * callback.  Dropping mmu_lock with an unreachable root would result
890 	 * in zapping SPTEs after a relevant mmu_notifier callback completes
891 	 * and lead to use-after-free as zapping a SPTE triggers "writeback" of
892 	 * dirty accessed bits to the SPTE's associated struct page.
893 	 */
894 	WARN_ON_ONCE(!refcount_read(&root->tdp_mmu_root_count));
895 
896 	kvm_lockdep_assert_mmu_lock_held(kvm, shared);
897 
898 	rcu_read_lock();
899 
900 	/*
901 	 * To avoid RCU stalls due to recursively removing huge swaths of SPs,
902 	 * split the zap into two passes.  On the first pass, zap at the 1gb
903 	 * level, and then zap top-level SPs on the second pass.  "1gb" is not
904 	 * arbitrary, as KVM must be able to zap a 1gb shadow page without
905 	 * inducing a stall to allow in-place replacement with a 1gb hugepage.
906 	 *
907 	 * Because zapping a SP recurses on its children, stepping down to
908 	 * PG_LEVEL_4K in the iterator itself is unnecessary.
909 	 */
910 	__tdp_mmu_zap_root(kvm, root, shared, PG_LEVEL_1G);
911 	__tdp_mmu_zap_root(kvm, root, shared, root->role.level);
912 
913 	rcu_read_unlock();
914 }
915 
916 bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
917 {
918 	u64 old_spte;
919 
920 	/*
921 	 * This helper intentionally doesn't allow zapping a root shadow page,
922 	 * which doesn't have a parent page table and thus no associated entry.
923 	 */
924 	if (WARN_ON_ONCE(!sp->ptep))
925 		return false;
926 
927 	old_spte = kvm_tdp_mmu_read_spte(sp->ptep);
928 	if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
929 		return false;
930 
931 	__tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte, 0,
932 			   sp->gfn, sp->role.level + 1, true, true);
933 
934 	return true;
935 }
936 
937 /*
938  * If can_yield is true, will release the MMU lock and reschedule if the
939  * scheduler needs the CPU or there is contention on the MMU lock. If this
940  * function cannot yield, it will not release the MMU lock or reschedule and
941  * the caller must ensure it does not supply too large a GFN range, or the
942  * operation can cause a soft lockup.
943  */
944 static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
945 			      gfn_t start, gfn_t end, bool can_yield, bool flush)
946 {
947 	struct tdp_iter iter;
948 
949 	end = min(end, tdp_mmu_max_gfn_exclusive());
950 
951 	lockdep_assert_held_write(&kvm->mmu_lock);
952 
953 	rcu_read_lock();
954 
955 	for_each_tdp_pte_min_level(iter, root, PG_LEVEL_4K, start, end) {
956 		if (can_yield &&
957 		    tdp_mmu_iter_cond_resched(kvm, &iter, flush, false)) {
958 			flush = false;
959 			continue;
960 		}
961 
962 		if (!is_shadow_present_pte(iter.old_spte) ||
963 		    !is_last_spte(iter.old_spte, iter.level))
964 			continue;
965 
966 		tdp_mmu_set_spte(kvm, &iter, 0);
967 		flush = true;
968 	}
969 
970 	rcu_read_unlock();
971 
972 	/*
973 	 * Because this flow zaps _only_ leaf SPTEs, the caller doesn't need
974 	 * to provide RCU protection as no 'struct kvm_mmu_page' will be freed.
975 	 */
976 	return flush;
977 }
978 
979 /*
980  * Zap leaf SPTEs for the range of gfns, [start, end), for all roots. Returns
981  * true if a TLB flush is needed before releasing the MMU lock, i.e. if one or
982  * more SPTEs were zapped since the MMU lock was last acquired.
983  */
984 bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, int as_id, gfn_t start, gfn_t end,
985 			   bool can_yield, bool flush)
986 {
987 	struct kvm_mmu_page *root;
988 
989 	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id)
990 		flush = tdp_mmu_zap_leafs(kvm, root, start, end, can_yield, flush);
991 
992 	return flush;
993 }
994 
995 void kvm_tdp_mmu_zap_all(struct kvm *kvm)
996 {
997 	struct kvm_mmu_page *root;
998 	int i;
999 
1000 	/*
1001 	 * Zap all roots, including invalid roots, as all SPTEs must be dropped
1002 	 * before returning to the caller.  Zap directly even if the root is
1003 	 * also being zapped by a worker.  Walking zapped top-level SPTEs isn't
1004 	 * all that expensive and mmu_lock is already held, which means the
1005 	 * worker has yielded, i.e. flushing the work instead of zapping here
1006 	 * isn't guaranteed to be any faster.
1007 	 *
1008 	 * A TLB flush is unnecessary, KVM zaps everything if and only the VM
1009 	 * is being destroyed or the userspace VMM has exited.  In both cases,
1010 	 * KVM_RUN is unreachable, i.e. no vCPUs will ever service the request.
1011 	 */
1012 	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
1013 		for_each_tdp_mmu_root_yield_safe(kvm, root, i)
1014 			tdp_mmu_zap_root(kvm, root, false);
1015 	}
1016 }
1017 
1018 /*
1019  * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast
1020  * zap" completes.
1021  */
1022 void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
1023 {
1024 	flush_workqueue(kvm->arch.tdp_mmu_zap_wq);
1025 }
1026 
1027 /*
1028  * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
1029  * is about to be zapped, e.g. in response to a memslots update.  The actual
1030  * zapping is performed asynchronously, so a reference is taken on all roots.
1031  * Using a separate workqueue makes it easy to ensure that the destruction is
1032  * performed before the "fast zap" completes, without keeping a separate list
1033  * of invalidated roots; the list is effectively the list of work items in
1034  * the workqueue.
1035  *
1036  * Get a reference even if the root is already invalid, the asynchronous worker
1037  * assumes it was gifted a reference to the root it processes.  Because mmu_lock
1038  * is held for write, it should be impossible to observe a root with zero refcount,
1039  * i.e. the list of roots cannot be stale.
1040  *
1041  * This has essentially the same effect for the TDP MMU
1042  * as updating mmu_valid_gen does for the shadow MMU.
1043  */
1044 void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
1045 {
1046 	struct kvm_mmu_page *root;
1047 
1048 	lockdep_assert_held_write(&kvm->mmu_lock);
1049 	list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link) {
1050 		if (!root->role.invalid &&
1051 		    !WARN_ON_ONCE(!kvm_tdp_mmu_get_root(root))) {
1052 			root->role.invalid = true;
1053 			tdp_mmu_schedule_zap_root(kvm, root);
1054 		}
1055 	}
1056 }
1057 
1058 /*
1059  * Installs a last-level SPTE to handle a TDP page fault.
1060  * (NPT/EPT violation/misconfiguration)
1061  */
1062 static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
1063 					  struct kvm_page_fault *fault,
1064 					  struct tdp_iter *iter)
1065 {
1066 	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
1067 	u64 new_spte;
1068 	int ret = RET_PF_FIXED;
1069 	bool wrprot = false;
1070 
1071 	WARN_ON(sp->role.level != fault->goal_level);
1072 	if (unlikely(!fault->slot))
1073 		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
1074 	else
1075 		wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
1076 					 fault->pfn, iter->old_spte, fault->prefetch, true,
1077 					 fault->map_writable, &new_spte);
1078 
1079 	if (new_spte == iter->old_spte)
1080 		ret = RET_PF_SPURIOUS;
1081 	else if (tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
1082 		return RET_PF_RETRY;
1083 	else if (is_shadow_present_pte(iter->old_spte) &&
1084 		 !is_last_spte(iter->old_spte, iter->level))
1085 		kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
1086 						   KVM_PAGES_PER_HPAGE(iter->level + 1));
1087 
1088 	/*
1089 	 * If the page fault was caused by a write but the page is write
1090 	 * protected, emulation is needed. If the emulation was skipped,
1091 	 * the vCPU would have the same fault again.
1092 	 */
1093 	if (wrprot) {
1094 		if (fault->write)
1095 			ret = RET_PF_EMULATE;
1096 	}
1097 
1098 	/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
1099 	if (unlikely(is_mmio_spte(new_spte))) {
1100 		vcpu->stat.pf_mmio_spte_created++;
1101 		trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
1102 				     new_spte);
1103 		ret = RET_PF_EMULATE;
1104 	} else {
1105 		trace_kvm_mmu_set_spte(iter->level, iter->gfn,
1106 				       rcu_dereference(iter->sptep));
1107 	}
1108 
1109 	return ret;
1110 }
1111 
1112 /*
1113  * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the
1114  * provided page table.
1115  *
1116  * @kvm: kvm instance
1117  * @iter: a tdp_iter instance currently on the SPTE that should be set
1118  * @sp: The new TDP page table to install.
1119  * @account_nx: True if this page table is being installed to split a
1120  *              non-executable huge page.
1121  * @shared: This operation is running under the MMU lock in read mode.
1122  *
1123  * Returns: 0 if the new page table was installed. Non-0 if the page table
1124  *          could not be installed (e.g. the atomic compare-exchange failed).
1125  */
1126 static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
1127 			   struct kvm_mmu_page *sp, bool account_nx,
1128 			   bool shared)
1129 {
1130 	u64 spte = make_nonleaf_spte(sp->spt, !kvm_ad_enabled());
1131 	int ret = 0;
1132 
1133 	if (shared) {
1134 		ret = tdp_mmu_set_spte_atomic(kvm, iter, spte);
1135 		if (ret)
1136 			return ret;
1137 	} else {
1138 		tdp_mmu_set_spte(kvm, iter, spte);
1139 	}
1140 
1141 	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
1142 	list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
1143 	if (account_nx)
1144 		account_huge_nx_page(kvm, sp);
1145 	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
1146 	tdp_account_mmu_page(kvm, sp);
1147 
1148 	return 0;
1149 }
1150 
1151 /*
1152  * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
1153  * page tables and SPTEs to translate the faulting guest physical address.
1154  */
1155 int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
1156 {
1157 	struct kvm_mmu *mmu = vcpu->arch.mmu;
1158 	struct tdp_iter iter;
1159 	struct kvm_mmu_page *sp;
1160 	int ret;
1161 
1162 	kvm_mmu_hugepage_adjust(vcpu, fault);
1163 
1164 	trace_kvm_mmu_spte_requested(fault);
1165 
1166 	rcu_read_lock();
1167 
1168 	tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
1169 		if (fault->nx_huge_page_workaround_enabled)
1170 			disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);
1171 
1172 		if (iter.level == fault->goal_level)
1173 			break;
1174 
1175 		/*
1176 		 * If there is an SPTE mapping a large page at a higher level
1177 		 * than the target, that SPTE must be cleared and replaced
1178 		 * with a non-leaf SPTE.
1179 		 */
1180 		if (is_shadow_present_pte(iter.old_spte) &&
1181 		    is_large_pte(iter.old_spte)) {
1182 			if (tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
1183 				break;
1184 
1185 			/*
1186 			 * The iter must explicitly re-read the spte here
1187 			 * because the new value informs the !present
1188 			 * path below.
1189 			 */
1190 			iter.old_spte = kvm_tdp_mmu_read_spte(iter.sptep);
1191 		}
1192 
1193 		if (!is_shadow_present_pte(iter.old_spte)) {
1194 			bool account_nx = fault->huge_page_disallowed &&
1195 					  fault->req_level >= iter.level;
1196 
1197 			/*
1198 			 * If SPTE has been frozen by another thread, just
1199 			 * give up and retry, avoiding unnecessary page table
1200 			 * allocation and free.
1201 			 */
1202 			if (is_removed_spte(iter.old_spte))
1203 				break;
1204 
1205 			sp = tdp_mmu_alloc_sp(vcpu);
1206 			tdp_mmu_init_child_sp(sp, &iter);
1207 
1208 			if (tdp_mmu_link_sp(vcpu->kvm, &iter, sp, account_nx, true)) {
1209 				tdp_mmu_free_sp(sp);
1210 				break;
1211 			}
1212 		}
1213 	}
1214 
1215 	/*
1216 	 * Force the guest to retry the access if the upper level SPTEs aren't
1217 	 * in place, or if the target leaf SPTE is frozen by another CPU.
1218 	 */
1219 	if (iter.level != fault->goal_level || is_removed_spte(iter.old_spte)) {
1220 		rcu_read_unlock();
1221 		return RET_PF_RETRY;
1222 	}
1223 
1224 	ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
1225 	rcu_read_unlock();
1226 
1227 	return ret;
1228 }
1229 
1230 bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
1231 				 bool flush)
1232 {
1233 	return kvm_tdp_mmu_zap_leafs(kvm, range->slot->as_id, range->start,
1234 				     range->end, range->may_block, flush);
1235 }
1236 
1237 typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
1238 			      struct kvm_gfn_range *range);
1239 
1240 static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
1241 						   struct kvm_gfn_range *range,
1242 						   tdp_handler_t handler)
1243 {
1244 	struct kvm_mmu_page *root;
1245 	struct tdp_iter iter;
1246 	bool ret = false;
1247 
1248 	/*
1249 	 * Don't support rescheduling, none of the MMU notifiers that funnel
1250 	 * into this helper allow blocking; it'd be dead, wasteful code.
1251 	 */
1252 	for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
1253 		rcu_read_lock();
1254 
1255 		tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
1256 			ret |= handler(kvm, &iter, range);
1257 
1258 		rcu_read_unlock();
1259 	}
1260 
1261 	return ret;
1262 }
1263 
1264 /*
1265  * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero
1266  * if any of the GFNs in the range have been accessed.
1267  */
1268 static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
1269 			  struct kvm_gfn_range *range)
1270 {
1271 	u64 new_spte = 0;
1272 
1273 	/* If we have a non-accessed entry we don't need to change the pte. */
1274 	if (!is_accessed_spte(iter->old_spte))
1275 		return false;
1276 
1277 	new_spte = iter->old_spte;
1278 
1279 	if (spte_ad_enabled(new_spte)) {
1280 		new_spte &= ~shadow_accessed_mask;
1281 	} else {
1282 		/*
1283 		 * Capture the dirty status of the page, so that it doesn't get
1284 		 * lost when the SPTE is marked for access tracking.
1285 		 */
1286 		if (is_writable_pte(new_spte))
1287 			kvm_set_pfn_dirty(spte_to_pfn(new_spte));
1288 
1289 		new_spte = mark_spte_for_access_track(new_spte);
1290 	}
1291 
1292 	tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte);
1293 
1294 	return true;
1295 }
1296 
1297 bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
1298 {
1299 	return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
1300 }
1301 
1302 static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
1303 			 struct kvm_gfn_range *range)
1304 {
1305 	return is_accessed_spte(iter->old_spte);
1306 }
1307 
1308 bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1309 {
1310 	return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
1311 }
1312 
1313 static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
1314 			 struct kvm_gfn_range *range)
1315 {
1316 	u64 new_spte;
1317 
1318 	/* Huge pages aren't expected to be modified without first being zapped. */
1319 	WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end);
1320 
1321 	if (iter->level != PG_LEVEL_4K ||
1322 	    !is_shadow_present_pte(iter->old_spte))
1323 		return false;
1324 
1325 	/*
1326 	 * Note, when changing a read-only SPTE, it's not strictly necessary to
1327 	 * zero the SPTE before setting the new PFN, but doing so preserves the
1328 	 * invariant that the PFN of a present * leaf SPTE can never change.
1329 	 * See __handle_changed_spte().
1330 	 */
1331 	tdp_mmu_set_spte(kvm, iter, 0);
1332 
1333 	if (!pte_write(range->pte)) {
1334 		new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
1335 								  pte_pfn(range->pte));
1336 
1337 		tdp_mmu_set_spte(kvm, iter, new_spte);
1338 	}
1339 
1340 	return true;
1341 }
1342 
1343 /*
1344  * Handle the changed_pte MMU notifier for the TDP MMU.
1345  * data is a pointer to the new pte_t mapping the HVA specified by the MMU
1346  * notifier.
1347  * Returns non-zero if a flush is needed before releasing the MMU lock.
1348  */
1349 bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1350 {
1351 	/*
1352 	 * No need to handle the remote TLB flush under RCU protection, the
1353 	 * target SPTE _must_ be a leaf SPTE, i.e. cannot result in freeing a
1354 	 * shadow page.  See the WARN on pfn_changed in __handle_changed_spte().
1355 	 */
1356 	return kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
1357 }
1358 
1359 /*
1360  * Remove write access from all SPTEs at or above min_level that map GFNs
1361  * [start, end). Returns true if an SPTE has been changed and the TLBs need to
1362  * be flushed.
1363  */
1364 static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1365 			     gfn_t start, gfn_t end, int min_level)
1366 {
1367 	struct tdp_iter iter;
1368 	u64 new_spte;
1369 	bool spte_set = false;
1370 
1371 	rcu_read_lock();
1372 
1373 	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1374 
1375 	for_each_tdp_pte_min_level(iter, root, min_level, start, end) {
1376 retry:
1377 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1378 			continue;
1379 
1380 		if (!is_shadow_present_pte(iter.old_spte) ||
1381 		    !is_last_spte(iter.old_spte, iter.level) ||
1382 		    !(iter.old_spte & PT_WRITABLE_MASK))
1383 			continue;
1384 
1385 		new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1386 
1387 		if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
1388 			goto retry;
1389 
1390 		spte_set = true;
1391 	}
1392 
1393 	rcu_read_unlock();
1394 	return spte_set;
1395 }
1396 
1397 /*
1398  * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
1399  * only affect leaf SPTEs down to min_level.
1400  * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1401  */
1402 bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
1403 			     const struct kvm_memory_slot *slot, int min_level)
1404 {
1405 	struct kvm_mmu_page *root;
1406 	bool spte_set = false;
1407 
1408 	lockdep_assert_held_read(&kvm->mmu_lock);
1409 
1410 	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1411 		spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
1412 			     slot->base_gfn + slot->npages, min_level);
1413 
1414 	return spte_set;
1415 }
1416 
1417 static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
1418 {
1419 	struct kvm_mmu_page *sp;
1420 
1421 	gfp |= __GFP_ZERO;
1422 
1423 	sp = kmem_cache_alloc(mmu_page_header_cache, gfp);
1424 	if (!sp)
1425 		return NULL;
1426 
1427 	sp->spt = (void *)__get_free_page(gfp);
1428 	if (!sp->spt) {
1429 		kmem_cache_free(mmu_page_header_cache, sp);
1430 		return NULL;
1431 	}
1432 
1433 	return sp;
1434 }
1435 
1436 static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
1437 						       struct tdp_iter *iter,
1438 						       bool shared)
1439 {
1440 	struct kvm_mmu_page *sp;
1441 
1442 	/*
1443 	 * Since we are allocating while under the MMU lock we have to be
1444 	 * careful about GFP flags. Use GFP_NOWAIT to avoid blocking on direct
1445 	 * reclaim and to avoid making any filesystem callbacks (which can end
1446 	 * up invoking KVM MMU notifiers, resulting in a deadlock).
1447 	 *
1448 	 * If this allocation fails we drop the lock and retry with reclaim
1449 	 * allowed.
1450 	 */
1451 	sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT);
1452 	if (sp)
1453 		return sp;
1454 
1455 	rcu_read_unlock();
1456 
1457 	if (shared)
1458 		read_unlock(&kvm->mmu_lock);
1459 	else
1460 		write_unlock(&kvm->mmu_lock);
1461 
1462 	iter->yielded = true;
1463 	sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT);
1464 
1465 	if (shared)
1466 		read_lock(&kvm->mmu_lock);
1467 	else
1468 		write_lock(&kvm->mmu_lock);
1469 
1470 	rcu_read_lock();
1471 
1472 	return sp;
1473 }
1474 
1475 static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
1476 				   struct kvm_mmu_page *sp, bool shared)
1477 {
1478 	const u64 huge_spte = iter->old_spte;
1479 	const int level = iter->level;
1480 	int ret, i;
1481 
1482 	tdp_mmu_init_child_sp(sp, iter);
1483 
1484 	/*
1485 	 * No need for atomics when writing to sp->spt since the page table has
1486 	 * not been linked in yet and thus is not reachable from any other CPU.
1487 	 */
1488 	for (i = 0; i < SPTE_ENT_PER_PAGE; i++)
1489 		sp->spt[i] = make_huge_page_split_spte(kvm, huge_spte, sp->role, i);
1490 
1491 	/*
1492 	 * Replace the huge spte with a pointer to the populated lower level
1493 	 * page table. Since we are making this change without a TLB flush vCPUs
1494 	 * will see a mix of the split mappings and the original huge mapping,
1495 	 * depending on what's currently in their TLB. This is fine from a
1496 	 * correctness standpoint since the translation will be the same either
1497 	 * way.
1498 	 */
1499 	ret = tdp_mmu_link_sp(kvm, iter, sp, false, shared);
1500 	if (ret)
1501 		goto out;
1502 
1503 	/*
1504 	 * tdp_mmu_link_sp_atomic() will handle subtracting the huge page we
1505 	 * are overwriting from the page stats. But we have to manually update
1506 	 * the page stats with the new present child pages.
1507 	 */
1508 	kvm_update_page_stats(kvm, level - 1, SPTE_ENT_PER_PAGE);
1509 
1510 out:
1511 	trace_kvm_mmu_split_huge_page(iter->gfn, huge_spte, level, ret);
1512 	return ret;
1513 }
1514 
1515 static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
1516 					 struct kvm_mmu_page *root,
1517 					 gfn_t start, gfn_t end,
1518 					 int target_level, bool shared)
1519 {
1520 	struct kvm_mmu_page *sp = NULL;
1521 	struct tdp_iter iter;
1522 	int ret = 0;
1523 
1524 	rcu_read_lock();
1525 
1526 	/*
1527 	 * Traverse the page table splitting all huge pages above the target
1528 	 * level into one lower level. For example, if we encounter a 1GB page
1529 	 * we split it into 512 2MB pages.
1530 	 *
1531 	 * Since the TDP iterator uses a pre-order traversal, we are guaranteed
1532 	 * to visit an SPTE before ever visiting its children, which means we
1533 	 * will correctly recursively split huge pages that are more than one
1534 	 * level above the target level (e.g. splitting a 1GB to 512 2MB pages,
1535 	 * and then splitting each of those to 512 4KB pages).
1536 	 */
1537 	for_each_tdp_pte_min_level(iter, root, target_level + 1, start, end) {
1538 retry:
1539 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, shared))
1540 			continue;
1541 
1542 		if (!is_shadow_present_pte(iter.old_spte) || !is_large_pte(iter.old_spte))
1543 			continue;
1544 
1545 		if (!sp) {
1546 			sp = tdp_mmu_alloc_sp_for_split(kvm, &iter, shared);
1547 			if (!sp) {
1548 				ret = -ENOMEM;
1549 				trace_kvm_mmu_split_huge_page(iter.gfn,
1550 							      iter.old_spte,
1551 							      iter.level, ret);
1552 				break;
1553 			}
1554 
1555 			if (iter.yielded)
1556 				continue;
1557 		}
1558 
1559 		if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
1560 			goto retry;
1561 
1562 		sp = NULL;
1563 	}
1564 
1565 	rcu_read_unlock();
1566 
1567 	/*
1568 	 * It's possible to exit the loop having never used the last sp if, for
1569 	 * example, a vCPU doing HugePage NX splitting wins the race and
1570 	 * installs its own sp in place of the last sp we tried to split.
1571 	 */
1572 	if (sp)
1573 		tdp_mmu_free_sp(sp);
1574 
1575 	return ret;
1576 }
1577 
1578 
1579 /*
1580  * Try to split all huge pages mapped by the TDP MMU down to the target level.
1581  */
1582 void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
1583 				      const struct kvm_memory_slot *slot,
1584 				      gfn_t start, gfn_t end,
1585 				      int target_level, bool shared)
1586 {
1587 	struct kvm_mmu_page *root;
1588 	int r = 0;
1589 
1590 	kvm_lockdep_assert_mmu_lock_held(kvm, shared);
1591 
1592 	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, shared) {
1593 		r = tdp_mmu_split_huge_pages_root(kvm, root, start, end, target_level, shared);
1594 		if (r) {
1595 			kvm_tdp_mmu_put_root(kvm, root, shared);
1596 			break;
1597 		}
1598 	}
1599 }
1600 
1601 /*
1602  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1603  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1604  * If AD bits are not enabled, this will require clearing the writable bit on
1605  * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1606  * be flushed.
1607  */
1608 static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1609 			   gfn_t start, gfn_t end)
1610 {
1611 	struct tdp_iter iter;
1612 	u64 new_spte;
1613 	bool spte_set = false;
1614 
1615 	rcu_read_lock();
1616 
1617 	tdp_root_for_each_leaf_pte(iter, root, start, end) {
1618 retry:
1619 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1620 			continue;
1621 
1622 		if (!is_shadow_present_pte(iter.old_spte))
1623 			continue;
1624 
1625 		if (spte_ad_need_write_protect(iter.old_spte)) {
1626 			if (is_writable_pte(iter.old_spte))
1627 				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1628 			else
1629 				continue;
1630 		} else {
1631 			if (iter.old_spte & shadow_dirty_mask)
1632 				new_spte = iter.old_spte & ~shadow_dirty_mask;
1633 			else
1634 				continue;
1635 		}
1636 
1637 		if (tdp_mmu_set_spte_atomic(kvm, &iter, new_spte))
1638 			goto retry;
1639 
1640 		spte_set = true;
1641 	}
1642 
1643 	rcu_read_unlock();
1644 	return spte_set;
1645 }
1646 
1647 /*
1648  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1649  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1650  * If AD bits are not enabled, this will require clearing the writable bit on
1651  * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1652  * be flushed.
1653  */
1654 bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
1655 				  const struct kvm_memory_slot *slot)
1656 {
1657 	struct kvm_mmu_page *root;
1658 	bool spte_set = false;
1659 
1660 	lockdep_assert_held_read(&kvm->mmu_lock);
1661 
1662 	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1663 		spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
1664 				slot->base_gfn + slot->npages);
1665 
1666 	return spte_set;
1667 }
1668 
1669 /*
1670  * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1671  * set in mask, starting at gfn. The given memslot is expected to contain all
1672  * the GFNs represented by set bits in the mask. If AD bits are enabled,
1673  * clearing the dirty status will involve clearing the dirty bit on each SPTE
1674  * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1675  */
1676 static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
1677 				  gfn_t gfn, unsigned long mask, bool wrprot)
1678 {
1679 	struct tdp_iter iter;
1680 	u64 new_spte;
1681 
1682 	rcu_read_lock();
1683 
1684 	tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
1685 				    gfn + BITS_PER_LONG) {
1686 		if (!mask)
1687 			break;
1688 
1689 		if (iter.level > PG_LEVEL_4K ||
1690 		    !(mask & (1UL << (iter.gfn - gfn))))
1691 			continue;
1692 
1693 		mask &= ~(1UL << (iter.gfn - gfn));
1694 
1695 		if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
1696 			if (is_writable_pte(iter.old_spte))
1697 				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1698 			else
1699 				continue;
1700 		} else {
1701 			if (iter.old_spte & shadow_dirty_mask)
1702 				new_spte = iter.old_spte & ~shadow_dirty_mask;
1703 			else
1704 				continue;
1705 		}
1706 
1707 		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
1708 	}
1709 
1710 	rcu_read_unlock();
1711 }
1712 
1713 /*
1714  * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1715  * set in mask, starting at gfn. The given memslot is expected to contain all
1716  * the GFNs represented by set bits in the mask. If AD bits are enabled,
1717  * clearing the dirty status will involve clearing the dirty bit on each SPTE
1718  * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1719  */
1720 void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1721 				       struct kvm_memory_slot *slot,
1722 				       gfn_t gfn, unsigned long mask,
1723 				       bool wrprot)
1724 {
1725 	struct kvm_mmu_page *root;
1726 
1727 	lockdep_assert_held_write(&kvm->mmu_lock);
1728 	for_each_tdp_mmu_root(kvm, root, slot->as_id)
1729 		clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
1730 }
1731 
1732 static void zap_collapsible_spte_range(struct kvm *kvm,
1733 				       struct kvm_mmu_page *root,
1734 				       const struct kvm_memory_slot *slot)
1735 {
1736 	gfn_t start = slot->base_gfn;
1737 	gfn_t end = start + slot->npages;
1738 	struct tdp_iter iter;
1739 	int max_mapping_level;
1740 
1741 	rcu_read_lock();
1742 
1743 	for_each_tdp_pte_min_level(iter, root, PG_LEVEL_2M, start, end) {
1744 retry:
1745 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1746 			continue;
1747 
1748 		if (iter.level > KVM_MAX_HUGEPAGE_LEVEL ||
1749 		    !is_shadow_present_pte(iter.old_spte))
1750 			continue;
1751 
1752 		/*
1753 		 * Don't zap leaf SPTEs, if a leaf SPTE could be replaced with
1754 		 * a large page size, then its parent would have been zapped
1755 		 * instead of stepping down.
1756 		 */
1757 		if (is_last_spte(iter.old_spte, iter.level))
1758 			continue;
1759 
1760 		/*
1761 		 * If iter.gfn resides outside of the slot, i.e. the page for
1762 		 * the current level overlaps but is not contained by the slot,
1763 		 * then the SPTE can't be made huge.  More importantly, trying
1764 		 * to query that info from slot->arch.lpage_info will cause an
1765 		 * out-of-bounds access.
1766 		 */
1767 		if (iter.gfn < start || iter.gfn >= end)
1768 			continue;
1769 
1770 		max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot,
1771 							      iter.gfn, PG_LEVEL_NUM);
1772 		if (max_mapping_level < iter.level)
1773 			continue;
1774 
1775 		/* Note, a successful atomic zap also does a remote TLB flush. */
1776 		if (tdp_mmu_zap_spte_atomic(kvm, &iter))
1777 			goto retry;
1778 	}
1779 
1780 	rcu_read_unlock();
1781 }
1782 
1783 /*
1784  * Zap non-leaf SPTEs (and free their associated page tables) which could
1785  * be replaced by huge pages, for GFNs within the slot.
1786  */
1787 void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
1788 				       const struct kvm_memory_slot *slot)
1789 {
1790 	struct kvm_mmu_page *root;
1791 
1792 	lockdep_assert_held_read(&kvm->mmu_lock);
1793 
1794 	for_each_valid_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1795 		zap_collapsible_spte_range(kvm, root, slot);
1796 }
1797 
1798 /*
1799  * Removes write access on the last level SPTE mapping this GFN and unsets the
1800  * MMU-writable bit to ensure future writes continue to be intercepted.
1801  * Returns true if an SPTE was set and a TLB flush is needed.
1802  */
1803 static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
1804 			      gfn_t gfn, int min_level)
1805 {
1806 	struct tdp_iter iter;
1807 	u64 new_spte;
1808 	bool spte_set = false;
1809 
1810 	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1811 
1812 	rcu_read_lock();
1813 
1814 	for_each_tdp_pte_min_level(iter, root, min_level, gfn, gfn + 1) {
1815 		if (!is_shadow_present_pte(iter.old_spte) ||
1816 		    !is_last_spte(iter.old_spte, iter.level))
1817 			continue;
1818 
1819 		new_spte = iter.old_spte &
1820 			~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
1821 
1822 		if (new_spte == iter.old_spte)
1823 			break;
1824 
1825 		tdp_mmu_set_spte(kvm, &iter, new_spte);
1826 		spte_set = true;
1827 	}
1828 
1829 	rcu_read_unlock();
1830 
1831 	return spte_set;
1832 }
1833 
1834 /*
1835  * Removes write access on the last level SPTE mapping this GFN and unsets the
1836  * MMU-writable bit to ensure future writes continue to be intercepted.
1837  * Returns true if an SPTE was set and a TLB flush is needed.
1838  */
1839 bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
1840 				   struct kvm_memory_slot *slot, gfn_t gfn,
1841 				   int min_level)
1842 {
1843 	struct kvm_mmu_page *root;
1844 	bool spte_set = false;
1845 
1846 	lockdep_assert_held_write(&kvm->mmu_lock);
1847 	for_each_tdp_mmu_root(kvm, root, slot->as_id)
1848 		spte_set |= write_protect_gfn(kvm, root, gfn, min_level);
1849 
1850 	return spte_set;
1851 }
1852 
1853 /*
1854  * Return the level of the lowest level SPTE added to sptes.
1855  * That SPTE may be non-present.
1856  *
1857  * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
1858  */
1859 int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
1860 			 int *root_level)
1861 {
1862 	struct tdp_iter iter;
1863 	struct kvm_mmu *mmu = vcpu->arch.mmu;
1864 	gfn_t gfn = addr >> PAGE_SHIFT;
1865 	int leaf = -1;
1866 
1867 	*root_level = vcpu->arch.mmu->root_role.level;
1868 
1869 	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1870 		leaf = iter.level;
1871 		sptes[leaf] = iter.old_spte;
1872 	}
1873 
1874 	return leaf;
1875 }
1876 
1877 /*
1878  * Returns the last level spte pointer of the shadow page walk for the given
1879  * gpa, and sets *spte to the spte value. This spte may be non-preset. If no
1880  * walk could be performed, returns NULL and *spte does not contain valid data.
1881  *
1882  * Contract:
1883  *  - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
1884  *  - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
1885  *
1886  * WARNING: This function is only intended to be called during fast_page_fault.
1887  */
1888 u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
1889 					u64 *spte)
1890 {
1891 	struct tdp_iter iter;
1892 	struct kvm_mmu *mmu = vcpu->arch.mmu;
1893 	gfn_t gfn = addr >> PAGE_SHIFT;
1894 	tdp_ptep_t sptep = NULL;
1895 
1896 	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1897 		*spte = iter.old_spte;
1898 		sptep = iter.sptep;
1899 	}
1900 
1901 	/*
1902 	 * Perform the rcu_dereference to get the raw spte pointer value since
1903 	 * we are passing it up to fast_page_fault, which is shared with the
1904 	 * legacy MMU and thus does not retain the TDP MMU-specific __rcu
1905 	 * annotation.
1906 	 *
1907 	 * This is safe since fast_page_fault obeys the contracts of this
1908 	 * function as well as all TDP MMU contracts around modifying SPTEs
1909 	 * outside of mmu_lock.
1910 	 */
1911 	return rcu_dereference(sptep);
1912 }
1913