xref: /linux/arch/x86/kvm/mmu/tdp_mmu.c (revision a544684b790f3e9f75173b3b42d7dad1c89dd237)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include "mmu.h"
4 #include "mmu_internal.h"
5 #include "mmutrace.h"
6 #include "tdp_iter.h"
7 #include "tdp_mmu.h"
8 #include "spte.h"
9 
10 #include <asm/cmpxchg.h>
11 #include <trace/events/kvm.h>
12 
/*
 * Backing store for the "tdp_mmu" module parameter (mode 0644).  The value is
 * sampled with READ_ONCE() in kvm_mmu_init_tdp_mmu() and latched into
 * kvm->arch.tdp_mmu_enabled, so a later change to the parameter does not
 * affect a VM that has already been initialized.
 */
static bool __read_mostly tdp_mmu_enabled = true;
module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
15 
16 /* Initializes the TDP MMU for the VM, if enabled. */
17 bool kvm_mmu_init_tdp_mmu(struct kvm *kvm)
18 {
19 	if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
20 		return false;
21 
22 	/* This should not be changed for the lifetime of the VM. */
23 	kvm->arch.tdp_mmu_enabled = true;
24 
25 	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
26 	spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
27 	INIT_LIST_HEAD(&kvm->arch.tdp_mmu_pages);
28 
29 	return true;
30 }
31 
32 static __always_inline void kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
33 							     bool shared)
34 {
35 	if (shared)
36 		lockdep_assert_held_read(&kvm->mmu_lock);
37 	else
38 		lockdep_assert_held_write(&kvm->mmu_lock);
39 }
40 
41 void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
42 {
43 	if (!kvm->arch.tdp_mmu_enabled)
44 		return;
45 
46 	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages));
47 	WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots));
48 
49 	/*
50 	 * Ensure that all the outstanding RCU callbacks to free shadow pages
51 	 * can run before the VM is torn down.
52 	 */
53 	rcu_barrier();
54 }
55 
56 static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
57 			  gfn_t start, gfn_t end, bool can_yield, bool flush,
58 			  bool shared);
59 
60 static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
61 {
62 	free_page((unsigned long)sp->spt);
63 	kmem_cache_free(mmu_page_header_cache, sp);
64 }
65 
66 /*
67  * This is called through call_rcu in order to free TDP page table memory
68  * safely with respect to other kernel threads that may be operating on
69  * the memory.
70  * By only accessing TDP MMU page table memory in an RCU read critical
71  * section, and freeing it after a grace period, lockless access to that
72  * memory won't use it after it is freed.
73  */
74 static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
75 {
76 	struct kvm_mmu_page *sp = container_of(head, struct kvm_mmu_page,
77 					       rcu_head);
78 
79 	tdp_mmu_free_sp(sp);
80 }
81 
82 void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
83 			  bool shared)
84 {
85 	kvm_lockdep_assert_mmu_lock_held(kvm, shared);
86 
87 	if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
88 		return;
89 
90 	WARN_ON(!root->tdp_mmu_page);
91 
92 	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
93 	list_del_rcu(&root->link);
94 	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
95 
96 	zap_gfn_range(kvm, root, 0, -1ull, false, false, shared);
97 
98 	call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback);
99 }
100 
101 /*
102  * Finds the next valid root after root (or the first valid root if root
103  * is NULL), takes a reference on it, and returns that next root. If root
104  * is not NULL, this thread should have already taken a reference on it, and
105  * that reference will be dropped. If no valid root is found, this
106  * function will return NULL.
107  */
108 static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
109 					      struct kvm_mmu_page *prev_root,
110 					      bool shared)
111 {
112 	struct kvm_mmu_page *next_root;
113 
114 	rcu_read_lock();
115 
116 	if (prev_root)
117 		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
118 						  &prev_root->link,
119 						  typeof(*prev_root), link);
120 	else
121 		next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
122 						   typeof(*next_root), link);
123 
124 	while (next_root && !kvm_tdp_mmu_get_root(kvm, next_root))
125 		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
126 				&next_root->link, typeof(*next_root), link);
127 
128 	rcu_read_unlock();
129 
130 	if (prev_root)
131 		kvm_tdp_mmu_put_root(kvm, prev_root, shared);
132 
133 	return next_root;
134 }
135 
/*
 * Iterate over the TDP MMU roots that belong to address space _as_id; roots
 * of other address spaces are skipped.
 *
 * Note: this iterator gets and puts references to the roots it iterates over:
 * tdp_mmu_next_root() takes a reference on the root it returns and drops the
 * reference on the previous one.  This makes it safe to release the MMU lock
 * and yield within the loop, but if exiting the loop early, the caller must
 * drop the reference to the most recent root. (Unless keeping a live
 * reference is desirable.)
 *
 * _shared states whether the MMU lock is held in read mode (true) or write
 * mode (false).  It is forwarded to kvm_tdp_mmu_put_root(), which asserts
 * that the lock is held in that mode and, in the unlikely event that this
 * thread drops the last reference to a root, zaps and frees the root while
 * still holding the lock in that same mode.
 */
#define for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _shared)	\
	for (_root = tdp_mmu_next_root(_kvm, NULL, _shared);		\
	     _root;							\
	     _root = tdp_mmu_next_root(_kvm, _root, _shared))		\
		if (kvm_mmu_page_as_id(_root) != _as_id) {		\
		} else
152 
/*
 * Iterate over the TDP MMU roots that belong to address space _as_id; roots
 * of other address spaces are skipped.  Unlike the yield-safe variant, no
 * references are taken on the roots.
 *
 * The last argument to list_for_each_entry_rcu() is the lockdep condition
 * under which the traversal is legitimate outside an RCU read-side critical
 * section: either mmu_lock (as checked by lockdep_is_held_type() with type 0)
 * or tdp_mmu_pages_lock is held.  The condition must be built from the
 * macro's own _kvm argument, not from whatever variable named "kvm" happens
 * to be in scope at the call site; otherwise the macro breaks, or silently
 * checks the wrong VM, as soon as a caller passes anything but "kvm".
 */
#define for_each_tdp_mmu_root(_kvm, _root, _as_id)				\
	list_for_each_entry_rcu(_root, &(_kvm)->arch.tdp_mmu_roots, link,	\
				lockdep_is_held_type(&(_kvm)->mmu_lock, 0) ||	\
				lockdep_is_held(&(_kvm)->arch.tdp_mmu_pages_lock)) \
		if (kvm_mmu_page_as_id(_root) != (_as_id)) {		\
		} else
159 
160 static union kvm_mmu_page_role page_role_for_level(struct kvm_vcpu *vcpu,
161 						   int level)
162 {
163 	union kvm_mmu_page_role role;
164 
165 	role = vcpu->arch.mmu->mmu_role.base;
166 	role.level = level;
167 	role.direct = true;
168 	role.has_4_byte_gpte = false;
169 	role.access = ACC_ALL;
170 	role.ad_disabled = !shadow_accessed_mask;
171 
172 	return role;
173 }
174 
175 static struct kvm_mmu_page *alloc_tdp_mmu_page(struct kvm_vcpu *vcpu, gfn_t gfn,
176 					       int level)
177 {
178 	struct kvm_mmu_page *sp;
179 
180 	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
181 	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
182 	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
183 
184 	sp->role.word = page_role_for_level(vcpu, level).word;
185 	sp->gfn = gfn;
186 	sp->tdp_mmu_page = true;
187 
188 	trace_kvm_mmu_get_page(sp, true);
189 
190 	return sp;
191 }
192 
193 hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
194 {
195 	union kvm_mmu_page_role role;
196 	struct kvm *kvm = vcpu->kvm;
197 	struct kvm_mmu_page *root;
198 
199 	lockdep_assert_held_write(&kvm->mmu_lock);
200 
201 	role = page_role_for_level(vcpu, vcpu->arch.mmu->shadow_root_level);
202 
203 	/* Check for an existing root before allocating a new one. */
204 	for_each_tdp_mmu_root(kvm, root, kvm_mmu_role_as_id(role)) {
205 		if (root->role.word == role.word &&
206 		    kvm_tdp_mmu_get_root(kvm, root))
207 			goto out;
208 	}
209 
210 	root = alloc_tdp_mmu_page(vcpu, 0, vcpu->arch.mmu->shadow_root_level);
211 	refcount_set(&root->tdp_mmu_root_count, 1);
212 
213 	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
214 	list_add_rcu(&root->link, &kvm->arch.tdp_mmu_roots);
215 	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
216 
217 out:
218 	return __pa(root->spt);
219 }
220 
221 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
222 				u64 old_spte, u64 new_spte, int level,
223 				bool shared);
224 
225 static void handle_changed_spte_acc_track(u64 old_spte, u64 new_spte, int level)
226 {
227 	if (!is_shadow_present_pte(old_spte) || !is_last_spte(old_spte, level))
228 		return;
229 
230 	if (is_accessed_spte(old_spte) &&
231 	    (!is_shadow_present_pte(new_spte) || !is_accessed_spte(new_spte) ||
232 	     spte_to_pfn(old_spte) != spte_to_pfn(new_spte)))
233 		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
234 }
235 
236 static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
237 					  u64 old_spte, u64 new_spte, int level)
238 {
239 	bool pfn_changed;
240 	struct kvm_memory_slot *slot;
241 
242 	if (level > PG_LEVEL_4K)
243 		return;
244 
245 	pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);
246 
247 	if ((!is_writable_pte(old_spte) || pfn_changed) &&
248 	    is_writable_pte(new_spte)) {
249 		slot = __gfn_to_memslot(__kvm_memslots(kvm, as_id), gfn);
250 		mark_page_dirty_in_slot(kvm, slot, gfn);
251 	}
252 }
253 
254 /**
255  * tdp_mmu_link_page - Add a new page to the list of pages used by the TDP MMU
256  *
257  * @kvm: kvm instance
258  * @sp: the new page
259  * @account_nx: This page replaces a NX large page and should be marked for
260  *		eventual reclaim.
261  */
262 static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp,
263 			      bool account_nx)
264 {
265 	spin_lock(&kvm->arch.tdp_mmu_pages_lock);
266 	list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
267 	if (account_nx)
268 		account_huge_nx_page(kvm, sp);
269 	spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
270 }
271 
272 /**
273  * tdp_mmu_unlink_page - Remove page from the list of pages used by the TDP MMU
274  *
275  * @kvm: kvm instance
276  * @sp: the page to be removed
277  * @shared: This operation may not be running under the exclusive use of
278  *	    the MMU lock and the operation must synchronize with other
279  *	    threads that might be adding or removing pages.
280  */
281 static void tdp_mmu_unlink_page(struct kvm *kvm, struct kvm_mmu_page *sp,
282 				bool shared)
283 {
284 	if (shared)
285 		spin_lock(&kvm->arch.tdp_mmu_pages_lock);
286 	else
287 		lockdep_assert_held_write(&kvm->mmu_lock);
288 
289 	list_del(&sp->link);
290 	if (sp->lpage_disallowed)
291 		unaccount_huge_nx_page(kvm, sp);
292 
293 	if (shared)
294 		spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
295 }
296 
/**
 * handle_removed_tdp_mmu_page - handle a pt removed from the TDP structure
 *
 * @kvm: kvm instance
 * @pt: the page removed from the paging structure
 * @shared: This operation may not be running under the exclusive use
 *	    of the MMU lock and the operation must synchronize with other
 *	    threads that might be modifying SPTEs.
 *
 * Given a page table that has been removed from the TDP paging structure,
 * iterates through the page table to clear SPTEs and free child page tables.
 * Child page tables are reached through handle_changed_spte(), which handles
 * the removal of each entry and may recurse back into this function.
 *
 * Once every entry has been processed, remote TLBs are flushed for the GFN
 * range covered by the page table and the page is queued for freeing via
 * call_rcu().
 *
 * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
 * protection. Since this thread removed it from the paging structure,
 * this thread will be responsible for ensuring the page is freed. Hence the
 * early rcu_dereferences in the function.
 */
static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
					bool shared)
{
	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
	int level = sp->role.level;
	gfn_t base_gfn = sp->gfn;
	int i;

	trace_kvm_mmu_prepare_zap_page(sp);

	/* Drop the page from the list of TDP MMU pages before emptying it. */
	tdp_mmu_unlink_page(kvm, sp, shared);

	for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
		u64 *sptep = rcu_dereference(pt) + i;
		/* Each entry at this level maps KVM_PAGES_PER_HPAGE(level) GFNs. */
		gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
		u64 old_child_spte;

		if (shared) {
			/*
			 * Set the SPTE to a nonpresent value that other
			 * threads will not overwrite. If the value read back
			 * is already REMOVED_SPTE, some other thread has
			 * frozen the SPTE and may still overwrite it (e.g.
			 * while handling a page fault), so keep exchanging
			 * until this thread is the one that replaces some
			 * other value with the removed SPTE value.
			 */
			for (;;) {
				old_child_spte = xchg(sptep, REMOVED_SPTE);
				if (!is_removed_spte(old_child_spte))
					break;
				cpu_relax();
			}
		} else {
			/*
			 * If the SPTE is not MMU-present, there is no backing
			 * page associated with the SPTE and so no side effects
			 * that need to be recorded, and exclusive ownership of
			 * mmu_lock ensures the SPTE can't be made present.
			 * Note, zapping MMIO SPTEs is also unnecessary as they
			 * are guarded by the memslots generation, not by being
			 * unreachable.
			 */
			old_child_spte = READ_ONCE(*sptep);
			if (!is_shadow_present_pte(old_child_spte))
				continue;

			/*
			 * Marking the SPTE as a removed SPTE is not
			 * strictly necessary here as the MMU lock will
			 * stop other threads from concurrently modifying
			 * this SPTE. Using the removed SPTE value keeps
			 * the two branches consistent and simplifies
			 * the function.
			 */
			WRITE_ONCE(*sptep, REMOVED_SPTE);
		}
		/* Do the bookkeeping for the old value -> REMOVED_SPTE change. */
		handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
				    old_child_spte, REMOVED_SPTE, level,
				    shared);
	}

	/*
	 * Flush the GFN range starting at the page's base GFN; the length is
	 * expressed as the span of a single entry one level up.
	 */
	kvm_flush_remote_tlbs_with_address(kvm, base_gfn,
					   KVM_PAGES_PER_HPAGE(level + 1));

	/* Free the page only after an RCU grace period has elapsed. */
	call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
}
380 
/**
 * __handle_changed_spte - handle bookkeeping associated with an SPTE change
 * @kvm: kvm instance
 * @as_id: the address space of the paging structure the SPTE was a part of
 * @gfn: the base GFN that was mapped by the SPTE
 * @old_spte: The value of the SPTE before the change
 * @new_spte: The value of the SPTE after the change
 * @level: the level of the PT the SPTE is part of in the paging structure
 * @shared: This operation may not be running under the exclusive use of
 *	    the MMU lock and the operation must synchronize with other
 *	    threads that might be modifying SPTEs.
 *
 * Handle bookkeeping that might result from the modification of a SPTE.
 * This function must be called for all TDP SPTE modifications.
 *
 * In order, this: sanity checks @level and the alignment of @gfn, crashes the
 * host if a present leaf is replaced by a present leaf with a different PFN,
 * returns early if the value did not change, emits a tracepoint, updates the
 * page stats when the SPTE gains or loses leaf status, propagates the dirty
 * state of a leaf that is going away, and tears down the child page table if
 * a non-leaf SPTE was removed or repointed.
 *
 * Accessed tracking and dirty logging are not handled here; see
 * handle_changed_spte_acc_track() and handle_changed_spte_dirty_log().
 */
static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				  u64 old_spte, u64 new_spte, int level,
				  bool shared)
{
	bool was_present = is_shadow_present_pte(old_spte);
	bool is_present = is_shadow_present_pte(new_spte);
	bool was_leaf = was_present && is_last_spte(old_spte, level);
	bool is_leaf = is_present && is_last_spte(new_spte, level);
	bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte);

	WARN_ON(level > PT64_ROOT_MAX_LEVEL);
	WARN_ON(level < PG_LEVEL_4K);
	/* The GFN must be aligned to the number of pages mapped at this level. */
	WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));

	/*
	 * If this warning were to trigger it would indicate that there was a
	 * missing MMU notifier or a race with some notifier handler.
	 * A present, leaf SPTE should never be directly replaced with another
	 * present leaf SPTE pointing to a different PFN. A notifier handler
	 * should be zapping the SPTE before the main MM's page table is
	 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
	 * thread before replacement.
	 */
	if (was_leaf && is_leaf && pfn_changed) {
		pr_err("Invalid SPTE change: cannot replace a present leaf\n"
		       "SPTE with another present leaf SPTE mapping a\n"
		       "different PFN!\n"
		       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
		       as_id, gfn, old_spte, new_spte, level);

		/*
		 * Crash the host to prevent error propagation and guest data
		 * corruption.
		 */
		BUG();
	}

	/* No change in value means no bookkeeping to do. */
	if (old_spte == new_spte)
		return;

	trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);

	/*
	 * The only times a SPTE should be changed from a non-present to
	 * non-present state is when an MMIO entry is installed/modified/
	 * removed. In that case, there is nothing to do here.
	 */
	if (!was_present && !is_present) {
		/*
		 * If this change does not involve a MMIO SPTE or removed SPTE,
		 * it is unexpected. Log the change, though it should not
		 * impact the guest since both the former and current SPTEs
		 * are nonpresent.
		 */
		if (WARN_ON(!is_mmio_spte(old_spte) &&
			    !is_mmio_spte(new_spte) &&
			    !is_removed_spte(new_spte)))
			pr_err("Unexpected SPTE change! Nonpresent SPTEs\n"
			       "should not be replaced with another,\n"
			       "different nonpresent SPTE, unless one or both\n"
			       "are MMIO SPTEs, or the new SPTE is\n"
			       "a temporary removed SPTE.\n"
			       "as_id: %d gfn: %llx old_spte: %llx new_spte: %llx level: %d",
			       as_id, gfn, old_spte, new_spte, level);
		return;
	}

	/* Adjust the page stats for this level when leaf status flips. */
	if (is_leaf != was_leaf)
		kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);

	/*
	 * The old leaf was dirty and the new SPTE no longer records that for
	 * the same PFN (not present, not dirty, or a different PFN): mark the
	 * old PFN dirty so the information is not lost.
	 */
	if (was_leaf && is_dirty_spte(old_spte) &&
	    (!is_present || !is_dirty_spte(new_spte) || pfn_changed))
		kvm_set_pfn_dirty(spte_to_pfn(old_spte));

	/*
	 * Recursively handle child PTs if the change removed a subtree from
	 * the paging structure.
	 */
	if (was_present && !was_leaf && (pfn_changed || !is_present))
		handle_removed_tdp_mmu_page(kvm,
				spte_to_child_pt(old_spte, level), shared);
}
478 
479 static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
480 				u64 old_spte, u64 new_spte, int level,
481 				bool shared)
482 {
483 	__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level,
484 			      shared);
485 	handle_changed_spte_acc_track(old_spte, new_spte, level);
486 	handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
487 				      new_spte, level);
488 }
489 
490 /*
491  * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
492  * and handle the associated bookkeeping.  Do not mark the page dirty
493  * in KVM's dirty bitmaps.
494  *
495  * @kvm: kvm instance
496  * @iter: a tdp_iter instance currently on the SPTE that should be set
497  * @new_spte: The value the SPTE should be set to
498  * Returns: true if the SPTE was set, false if it was not. If false is returned,
499  *	    this function will have no side-effects.
500  */
501 static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
502 					   struct tdp_iter *iter,
503 					   u64 new_spte)
504 {
505 	WARN_ON_ONCE(iter->yielded);
506 
507 	lockdep_assert_held_read(&kvm->mmu_lock);
508 
509 	/*
510 	 * Do not change removed SPTEs. Only the thread that froze the SPTE
511 	 * may modify it.
512 	 */
513 	if (is_removed_spte(iter->old_spte))
514 		return false;
515 
516 	/*
517 	 * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
518 	 * does not hold the mmu_lock.
519 	 */
520 	if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte,
521 		      new_spte) != iter->old_spte)
522 		return false;
523 
524 	__handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
525 			      new_spte, iter->level, true);
526 	handle_changed_spte_acc_track(iter->old_spte, new_spte, iter->level);
527 
528 	return true;
529 }
530 
531 static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
532 					   struct tdp_iter *iter)
533 {
534 	/*
535 	 * Freeze the SPTE by setting it to a special,
536 	 * non-present value. This will stop other threads from
537 	 * immediately installing a present entry in its place
538 	 * before the TLBs are flushed.
539 	 */
540 	if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE))
541 		return false;
542 
543 	kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
544 					   KVM_PAGES_PER_HPAGE(iter->level));
545 
546 	/*
547 	 * No other thread can overwrite the removed SPTE as they
548 	 * must either wait on the MMU lock or use
549 	 * tdp_mmu_set_spte_atomic which will not overwrite the
550 	 * special removed SPTE value. No bookkeeping is needed
551 	 * here since the SPTE is going from non-present
552 	 * to non-present.
553 	 */
554 	WRITE_ONCE(*rcu_dereference(iter->sptep), 0);
555 
556 	return true;
557 }
558 
559 
560 /*
561  * __tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
562  * @kvm: kvm instance
563  * @iter: a tdp_iter instance currently on the SPTE that should be set
564  * @new_spte: The value the SPTE should be set to
565  * @record_acc_track: Notify the MM subsystem of changes to the accessed state
566  *		      of the page. Should be set unless handling an MMU
567  *		      notifier for access tracking. Leaving record_acc_track
568  *		      unset in that case prevents page accesses from being
569  *		      double counted.
570  * @record_dirty_log: Record the page as dirty in the dirty bitmap if
571  *		      appropriate for the change being made. Should be set
572  *		      unless performing certain dirty logging operations.
573  *		      Leaving record_dirty_log unset in that case prevents page
574  *		      writes from being double counted.
575  */
576 static inline void __tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
577 				      u64 new_spte, bool record_acc_track,
578 				      bool record_dirty_log)
579 {
580 	WARN_ON_ONCE(iter->yielded);
581 
582 	lockdep_assert_held_write(&kvm->mmu_lock);
583 
584 	/*
585 	 * No thread should be using this function to set SPTEs to the
586 	 * temporary removed SPTE value.
587 	 * If operating under the MMU lock in read mode, tdp_mmu_set_spte_atomic
588 	 * should be used. If operating under the MMU lock in write mode, the
589 	 * use of the removed SPTE should not be necessary.
590 	 */
591 	WARN_ON(is_removed_spte(iter->old_spte));
592 
593 	WRITE_ONCE(*rcu_dereference(iter->sptep), new_spte);
594 
595 	__handle_changed_spte(kvm, iter->as_id, iter->gfn, iter->old_spte,
596 			      new_spte, iter->level, false);
597 	if (record_acc_track)
598 		handle_changed_spte_acc_track(iter->old_spte, new_spte,
599 					      iter->level);
600 	if (record_dirty_log)
601 		handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn,
602 					      iter->old_spte, new_spte,
603 					      iter->level);
604 }
605 
606 static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
607 				    u64 new_spte)
608 {
609 	__tdp_mmu_set_spte(kvm, iter, new_spte, true, true);
610 }
611 
612 static inline void tdp_mmu_set_spte_no_acc_track(struct kvm *kvm,
613 						 struct tdp_iter *iter,
614 						 u64 new_spte)
615 {
616 	__tdp_mmu_set_spte(kvm, iter, new_spte, false, true);
617 }
618 
619 static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
620 						 struct tdp_iter *iter,
621 						 u64 new_spte)
622 {
623 	__tdp_mmu_set_spte(kvm, iter, new_spte, true, false);
624 }
625 
/*
 * Walk the SPTEs under the root shadow page _root for the GFN range given by
 * _start and _end, starting the walk at the root's own page table and level.
 */
#define tdp_root_for_each_pte(_iter, _root, _start, _end) \
	for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end)
628 
/*
 * Same walk as tdp_root_for_each_pte(), but the loop body only runs for
 * present leaf SPTEs: entries that are not shadow-present, or that are not
 * the last-level SPTE at their level, are skipped.
 */
#define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end)	\
	tdp_root_for_each_pte(_iter, _root, _start, _end)		\
		if (!is_shadow_present_pte(_iter.old_spte) ||		\
		    !is_last_spte(_iter.old_spte, _iter.level))		\
			continue;					\
		else
635 
/*
 * Walk the SPTEs for the GFN range given by _start and _end, starting from
 * the root that _mmu currently points at: the page table at root_hpa
 * (converted with __va()) and the MMU's shadow_root_level.
 */
#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end)		\
	for_each_tdp_pte(_iter, __va(_mmu->root_hpa),		\
			 _mmu->shadow_root_level, _start, _end)
639 
640 /*
641  * Yield if the MMU lock is contended or this thread needs to return control
642  * to the scheduler.
643  *
644  * If this function should yield and flush is set, it will perform a remote
645  * TLB flush before yielding.
646  *
647  * If this function yields, iter->yielded is set and the caller must skip to
648  * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk
649  * over the paging structures to allow the iterator to continue its traversal
650  * from the paging structure root.
651  *
652  * Returns true if this function yielded.
653  */
654 static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
655 							  struct tdp_iter *iter,
656 							  bool flush, bool shared)
657 {
658 	WARN_ON(iter->yielded);
659 
660 	/* Ensure forward progress has been made before yielding. */
661 	if (iter->next_last_level_gfn == iter->yielded_gfn)
662 		return false;
663 
664 	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
665 		rcu_read_unlock();
666 
667 		if (flush)
668 			kvm_flush_remote_tlbs(kvm);
669 
670 		if (shared)
671 			cond_resched_rwlock_read(&kvm->mmu_lock);
672 		else
673 			cond_resched_rwlock_write(&kvm->mmu_lock);
674 
675 		rcu_read_lock();
676 
677 		WARN_ON(iter->gfn > iter->next_last_level_gfn);
678 
679 		iter->yielded = true;
680 	}
681 
682 	return iter->yielded;
683 }
684 
/*
 * Tears down the mappings for the range of gfns, [start, end), and frees the
 * non-root pages mapping GFNs strictly within that range. Returns true if
 * SPTEs have been cleared and a TLB flush is needed before releasing the
 * MMU lock.
 *
 * The incoming value of flush is carried through: it is returned unchanged
 * if nothing in this call requires or performs a flush.
 *
 * If can_yield is true, will release the MMU lock and reschedule if the
 * scheduler needs the CPU or there is contention on the MMU lock. If this
 * function cannot yield, it will not release the MMU lock or reschedule and
 * the caller must ensure it does not supply too large a GFN range, or the
 * operation can cause a soft lockup.
 *
 * If shared is true, this thread holds the MMU lock in read mode and must
 * account for the possibility that other threads are modifying the paging
 * structures concurrently. If shared is false, this thread should hold the
 * MMU lock in write mode.
 */
static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			  gfn_t start, gfn_t end, bool can_yield, bool flush,
			  bool shared)
{
	gfn_t max_gfn_host = 1ULL << (shadow_phys_bits - PAGE_SHIFT);
	/* Computed from the caller's end, before it is clamped below. */
	bool zap_all = (start == 0 && end >= max_gfn_host);
	struct tdp_iter iter;

	/*
	 * No need to try to step down in the iterator when zapping all SPTEs,
	 * zapping the top-level non-leaf SPTEs will recurse on their children.
	 */
	int min_level = zap_all ? root->role.level : PG_LEVEL_4K;

	/*
	 * Bound the walk at host.MAXPHYADDR, guest accesses beyond that will
	 * hit a #PF(RSVD) and never get to an EPT Violation/Misconfig / #NPF,
	 * and so KVM will never install a SPTE for such addresses.
	 */
	end = min(end, max_gfn_host);

	kvm_lockdep_assert_mmu_lock_held(kvm, shared);

	rcu_read_lock();

	for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
				   min_level, start, end) {
retry:
		/*
		 * A yield with flush set flushes remote TLBs before releasing
		 * the lock, so no flush is pending afterwards.
		 */
		if (can_yield &&
		    tdp_mmu_iter_cond_resched(kvm, &iter, flush, shared)) {
			flush = false;
			continue;
		}

		if (!is_shadow_present_pte(iter.old_spte))
			continue;

		/*
		 * If this is a non-last-level SPTE that covers a larger range
		 * than should be zapped, continue, and zap the mappings at a
		 * lower level, except when zapping all SPTEs.
		 */
		if (!zap_all &&
		    (iter.gfn < start ||
		     iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) &&
		    !is_last_spte(iter.old_spte, iter.level))
			continue;

		/*
		 * With the lock held for write, clear the SPTE directly and
		 * leave the flush to the caller. With the lock held for read,
		 * tdp_mmu_zap_spte_atomic() flushes the zapped range itself,
		 * so flush is left untouched on success.
		 */
		if (!shared) {
			tdp_mmu_set_spte(kvm, &iter, 0);
			flush = true;
		} else if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
			/*
			 * The iter must explicitly re-read the SPTE because
			 * the atomic cmpxchg failed.
			 */
			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
			goto retry;
		}
	}

	rcu_read_unlock();
	return flush;
}
766 
767 /*
768  * Tears down the mappings for the range of gfns, [start, end), and frees the
769  * non-root pages mapping GFNs strictly within that range. Returns true if
770  * SPTEs have been cleared and a TLB flush is needed before releasing the
771  * MMU lock.
772  */
773 bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
774 				 gfn_t end, bool can_yield, bool flush)
775 {
776 	struct kvm_mmu_page *root;
777 
778 	for_each_tdp_mmu_root_yield_safe(kvm, root, as_id, false)
779 		flush = zap_gfn_range(kvm, root, start, end, can_yield, flush,
780 				      false);
781 
782 	return flush;
783 }
784 
785 void kvm_tdp_mmu_zap_all(struct kvm *kvm)
786 {
787 	bool flush = false;
788 	int i;
789 
790 	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
791 		flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, -1ull, flush);
792 
793 	if (flush)
794 		kvm_flush_remote_tlbs(kvm);
795 }
796 
797 static struct kvm_mmu_page *next_invalidated_root(struct kvm *kvm,
798 						  struct kvm_mmu_page *prev_root)
799 {
800 	struct kvm_mmu_page *next_root;
801 
802 	if (prev_root)
803 		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
804 						  &prev_root->link,
805 						  typeof(*prev_root), link);
806 	else
807 		next_root = list_first_or_null_rcu(&kvm->arch.tdp_mmu_roots,
808 						   typeof(*next_root), link);
809 
810 	while (next_root && !(next_root->role.invalid &&
811 			      refcount_read(&next_root->tdp_mmu_root_count)))
812 		next_root = list_next_or_null_rcu(&kvm->arch.tdp_mmu_roots,
813 						  &next_root->link,
814 						  typeof(*next_root), link);
815 
816 	return next_root;
817 }
818 
819 /*
820  * Since kvm_tdp_mmu_zap_all_fast has acquired a reference to each
821  * invalidated root, they will not be freed until this function drops the
822  * reference. Before dropping that reference, tear down the paging
823  * structure so that whichever thread does drop the last reference
824  * only has to do a trivial amount of work. Since the roots are invalid,
825  * no new SPTEs should be created under them.
826  */
827 void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
828 {
829 	struct kvm_mmu_page *next_root;
830 	struct kvm_mmu_page *root;
831 	bool flush = false;
832 
833 	lockdep_assert_held_read(&kvm->mmu_lock);
834 
835 	rcu_read_lock();
836 
837 	root = next_invalidated_root(kvm, NULL);
838 
839 	while (root) {
840 		next_root = next_invalidated_root(kvm, root);
841 
842 		rcu_read_unlock();
843 
844 		flush = zap_gfn_range(kvm, root, 0, -1ull, true, flush, true);
845 
846 		/*
847 		 * Put the reference acquired in
848 		 * kvm_tdp_mmu_invalidate_roots
849 		 */
850 		kvm_tdp_mmu_put_root(kvm, root, true);
851 
852 		root = next_root;
853 
854 		rcu_read_lock();
855 	}
856 
857 	rcu_read_unlock();
858 
859 	if (flush)
860 		kvm_flush_remote_tlbs(kvm);
861 }
862 
863 /*
864  * Mark each TDP MMU root as invalid so that other threads
865  * will drop their references and allow the root count to
866  * go to 0.
867  *
868  * Also take a reference on all roots so that this thread
869  * can do the bulk of the work required to free the roots
870  * once they are invalidated. Without this reference, a
871  * vCPU thread might drop the last reference to a root and
872  * get stuck with tearing down the entire paging structure.
873  *
874  * Roots which have a zero refcount should be skipped as
875  * they're already being torn down.
876  * Already invalid roots should be referenced again so that
877  * they aren't freed before kvm_tdp_mmu_zap_all_fast is
878  * done with them.
879  *
880  * This has essentially the same effect for the TDP MMU
881  * as updating mmu_valid_gen does for the shadow MMU.
882  */
883 void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
884 {
885 	struct kvm_mmu_page *root;
886 
887 	lockdep_assert_held_write(&kvm->mmu_lock);
888 	list_for_each_entry(root, &kvm->arch.tdp_mmu_roots, link)
889 		if (refcount_inc_not_zero(&root->tdp_mmu_root_count))
890 			root->role.invalid = true;
891 }
892 
893 /*
894  * Installs a last-level SPTE to handle a TDP page fault.
895  * (NPT/EPT violation/misconfiguration)
896  */
897 static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
898 					  struct kvm_page_fault *fault,
899 					  struct tdp_iter *iter)
900 {
901 	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
902 	u64 new_spte;
903 	int ret = RET_PF_FIXED;
904 	bool wrprot = false;
905 
906 	WARN_ON(sp->role.level != fault->goal_level);
907 	if (unlikely(!fault->slot))
908 		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL);
909 	else
910 		wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn,
911 					 fault->pfn, iter->old_spte, fault->prefetch, true,
912 					 fault->map_writable, &new_spte);
913 
914 	if (new_spte == iter->old_spte)
915 		ret = RET_PF_SPURIOUS;
916 	else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
917 		return RET_PF_RETRY;
918 
919 	/*
920 	 * If the page fault was caused by a write but the page is write
921 	 * protected, emulation is needed. If the emulation was skipped,
922 	 * the vCPU would have the same fault again.
923 	 */
924 	if (wrprot) {
925 		if (fault->write)
926 			ret = RET_PF_EMULATE;
927 	}
928 
929 	/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
930 	if (unlikely(is_mmio_spte(new_spte))) {
931 		trace_mark_mmio_spte(rcu_dereference(iter->sptep), iter->gfn,
932 				     new_spte);
933 		ret = RET_PF_EMULATE;
934 	} else {
935 		trace_kvm_mmu_set_spte(iter->level, iter->gfn,
936 				       rcu_dereference(iter->sptep));
937 	}
938 
939 	/*
940 	 * Increase pf_fixed in both RET_PF_EMULATE and RET_PF_FIXED to be
941 	 * consistent with legacy MMU behavior.
942 	 */
943 	if (ret != RET_PF_SPURIOUS)
944 		vcpu->stat.pf_fixed++;
945 
946 	return ret;
947 }
948 
949 /*
950  * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
951  * page tables and SPTEs to translate the faulting guest physical address.
952  */
953 int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
954 {
955 	struct kvm_mmu *mmu = vcpu->arch.mmu;
956 	struct tdp_iter iter;
957 	struct kvm_mmu_page *sp;
958 	u64 *child_pt;
959 	u64 new_spte;
960 	int ret;
961 
962 	kvm_mmu_hugepage_adjust(vcpu, fault);
963 
964 	trace_kvm_mmu_spte_requested(fault);
965 
966 	rcu_read_lock();
967 
968 	tdp_mmu_for_each_pte(iter, mmu, fault->gfn, fault->gfn + 1) {
969 		if (fault->nx_huge_page_workaround_enabled)
970 			disallowed_hugepage_adjust(fault, iter.old_spte, iter.level);
971 
972 		if (iter.level == fault->goal_level)
973 			break;
974 
975 		/*
976 		 * If there is an SPTE mapping a large page at a higher level
977 		 * than the target, that SPTE must be cleared and replaced
978 		 * with a non-leaf SPTE.
979 		 */
980 		if (is_shadow_present_pte(iter.old_spte) &&
981 		    is_large_pte(iter.old_spte)) {
982 			if (!tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
983 				break;
984 
985 			/*
986 			 * The iter must explicitly re-read the spte here
987 			 * because the new value informs the !present
988 			 * path below.
989 			 */
990 			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
991 		}
992 
993 		if (!is_shadow_present_pte(iter.old_spte)) {
994 			/*
995 			 * If SPTE has been frozen by another thread, just
996 			 * give up and retry, avoiding unnecessary page table
997 			 * allocation and free.
998 			 */
999 			if (is_removed_spte(iter.old_spte))
1000 				break;
1001 
1002 			sp = alloc_tdp_mmu_page(vcpu, iter.gfn, iter.level - 1);
1003 			child_pt = sp->spt;
1004 
1005 			new_spte = make_nonleaf_spte(child_pt,
1006 						     !shadow_accessed_mask);
1007 
1008 			if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter, new_spte)) {
1009 				tdp_mmu_link_page(vcpu->kvm, sp,
1010 						  fault->huge_page_disallowed &&
1011 						  fault->req_level >= iter.level);
1012 
1013 				trace_kvm_mmu_get_page(sp, true);
1014 			} else {
1015 				tdp_mmu_free_sp(sp);
1016 				break;
1017 			}
1018 		}
1019 	}
1020 
1021 	if (iter.level != fault->goal_level) {
1022 		rcu_read_unlock();
1023 		return RET_PF_RETRY;
1024 	}
1025 
1026 	ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter);
1027 	rcu_read_unlock();
1028 
1029 	return ret;
1030 }
1031 
1032 bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
1033 				 bool flush)
1034 {
1035 	struct kvm_mmu_page *root;
1036 
1037 	for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, false)
1038 		flush = zap_gfn_range(kvm, root, range->start, range->end,
1039 				      range->may_block, flush, false);
1040 
1041 	return flush;
1042 }
1043 
1044 typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
1045 			      struct kvm_gfn_range *range);
1046 
1047 static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
1048 						   struct kvm_gfn_range *range,
1049 						   tdp_handler_t handler)
1050 {
1051 	struct kvm_mmu_page *root;
1052 	struct tdp_iter iter;
1053 	bool ret = false;
1054 
1055 	rcu_read_lock();
1056 
1057 	/*
1058 	 * Don't support rescheduling, none of the MMU notifiers that funnel
1059 	 * into this helper allow blocking; it'd be dead, wasteful code.
1060 	 */
1061 	for_each_tdp_mmu_root(kvm, root, range->slot->as_id) {
1062 		tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
1063 			ret |= handler(kvm, &iter, range);
1064 	}
1065 
1066 	rcu_read_unlock();
1067 
1068 	return ret;
1069 }
1070 
1071 /*
1072  * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero
1073  * if any of the GFNs in the range have been accessed.
1074  */
1075 static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
1076 			  struct kvm_gfn_range *range)
1077 {
1078 	u64 new_spte = 0;
1079 
1080 	/* If we have a non-accessed entry we don't need to change the pte. */
1081 	if (!is_accessed_spte(iter->old_spte))
1082 		return false;
1083 
1084 	new_spte = iter->old_spte;
1085 
1086 	if (spte_ad_enabled(new_spte)) {
1087 		new_spte &= ~shadow_accessed_mask;
1088 	} else {
1089 		/*
1090 		 * Capture the dirty status of the page, so that it doesn't get
1091 		 * lost when the SPTE is marked for access tracking.
1092 		 */
1093 		if (is_writable_pte(new_spte))
1094 			kvm_set_pfn_dirty(spte_to_pfn(new_spte));
1095 
1096 		new_spte = mark_spte_for_access_track(new_spte);
1097 	}
1098 
1099 	tdp_mmu_set_spte_no_acc_track(kvm, iter, new_spte);
1100 
1101 	return true;
1102 }
1103 
1104 bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
1105 {
1106 	return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
1107 }
1108 
1109 static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
1110 			 struct kvm_gfn_range *range)
1111 {
1112 	return is_accessed_spte(iter->old_spte);
1113 }
1114 
1115 bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1116 {
1117 	return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
1118 }
1119 
1120 static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
1121 			 struct kvm_gfn_range *range)
1122 {
1123 	u64 new_spte;
1124 
1125 	/* Huge pages aren't expected to be modified without first being zapped. */
1126 	WARN_ON(pte_huge(range->pte) || range->start + 1 != range->end);
1127 
1128 	if (iter->level != PG_LEVEL_4K ||
1129 	    !is_shadow_present_pte(iter->old_spte))
1130 		return false;
1131 
1132 	/*
1133 	 * Note, when changing a read-only SPTE, it's not strictly necessary to
1134 	 * zero the SPTE before setting the new PFN, but doing so preserves the
1135 	 * invariant that the PFN of a present * leaf SPTE can never change.
1136 	 * See __handle_changed_spte().
1137 	 */
1138 	tdp_mmu_set_spte(kvm, iter, 0);
1139 
1140 	if (!pte_write(range->pte)) {
1141 		new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
1142 								  pte_pfn(range->pte));
1143 
1144 		tdp_mmu_set_spte(kvm, iter, new_spte);
1145 	}
1146 
1147 	return true;
1148 }
1149 
1150 /*
1151  * Handle the changed_pte MMU notifier for the TDP MMU.
1152  * data is a pointer to the new pte_t mapping the HVA specified by the MMU
1153  * notifier.
1154  * Returns non-zero if a flush is needed before releasing the MMU lock.
1155  */
1156 bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1157 {
1158 	bool flush = kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
1159 
1160 	/* FIXME: return 'flush' instead of flushing here. */
1161 	if (flush)
1162 		kvm_flush_remote_tlbs_with_address(kvm, range->start, 1);
1163 
1164 	return false;
1165 }
1166 
1167 /*
1168  * Remove write access from all SPTEs at or above min_level that map GFNs
1169  * [start, end). Returns true if an SPTE has been changed and the TLBs need to
1170  * be flushed.
1171  */
1172 static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1173 			     gfn_t start, gfn_t end, int min_level)
1174 {
1175 	struct tdp_iter iter;
1176 	u64 new_spte;
1177 	bool spte_set = false;
1178 
1179 	rcu_read_lock();
1180 
1181 	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1182 
1183 	for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
1184 				   min_level, start, end) {
1185 retry:
1186 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1187 			continue;
1188 
1189 		if (!is_shadow_present_pte(iter.old_spte) ||
1190 		    !is_last_spte(iter.old_spte, iter.level) ||
1191 		    !(iter.old_spte & PT_WRITABLE_MASK))
1192 			continue;
1193 
1194 		new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1195 
1196 		if (!tdp_mmu_set_spte_atomic(kvm, &iter, new_spte)) {
1197 			/*
1198 			 * The iter must explicitly re-read the SPTE because
1199 			 * the atomic cmpxchg failed.
1200 			 */
1201 			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
1202 			goto retry;
1203 		}
1204 		spte_set = true;
1205 	}
1206 
1207 	rcu_read_unlock();
1208 	return spte_set;
1209 }
1210 
1211 /*
1212  * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
1213  * only affect leaf SPTEs down to min_level.
1214  * Returns true if an SPTE has been changed and the TLBs need to be flushed.
1215  */
1216 bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
1217 			     const struct kvm_memory_slot *slot, int min_level)
1218 {
1219 	struct kvm_mmu_page *root;
1220 	bool spte_set = false;
1221 
1222 	lockdep_assert_held_read(&kvm->mmu_lock);
1223 
1224 	for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1225 		spte_set |= wrprot_gfn_range(kvm, root, slot->base_gfn,
1226 			     slot->base_gfn + slot->npages, min_level);
1227 
1228 	return spte_set;
1229 }
1230 
1231 /*
1232  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1233  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1234  * If AD bits are not enabled, this will require clearing the writable bit on
1235  * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1236  * be flushed.
1237  */
1238 static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
1239 			   gfn_t start, gfn_t end)
1240 {
1241 	struct tdp_iter iter;
1242 	u64 new_spte;
1243 	bool spte_set = false;
1244 
1245 	rcu_read_lock();
1246 
1247 	tdp_root_for_each_leaf_pte(iter, root, start, end) {
1248 retry:
1249 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1250 			continue;
1251 
1252 		if (spte_ad_need_write_protect(iter.old_spte)) {
1253 			if (is_writable_pte(iter.old_spte))
1254 				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1255 			else
1256 				continue;
1257 		} else {
1258 			if (iter.old_spte & shadow_dirty_mask)
1259 				new_spte = iter.old_spte & ~shadow_dirty_mask;
1260 			else
1261 				continue;
1262 		}
1263 
1264 		if (!tdp_mmu_set_spte_atomic(kvm, &iter, new_spte)) {
1265 			/*
1266 			 * The iter must explicitly re-read the SPTE because
1267 			 * the atomic cmpxchg failed.
1268 			 */
1269 			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
1270 			goto retry;
1271 		}
1272 		spte_set = true;
1273 	}
1274 
1275 	rcu_read_unlock();
1276 	return spte_set;
1277 }
1278 
1279 /*
1280  * Clear the dirty status of all the SPTEs mapping GFNs in the memslot. If
1281  * AD bits are enabled, this will involve clearing the dirty bit on each SPTE.
1282  * If AD bits are not enabled, this will require clearing the writable bit on
1283  * each SPTE. Returns true if an SPTE has been changed and the TLBs need to
1284  * be flushed.
1285  */
1286 bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
1287 				  const struct kvm_memory_slot *slot)
1288 {
1289 	struct kvm_mmu_page *root;
1290 	bool spte_set = false;
1291 
1292 	lockdep_assert_held_read(&kvm->mmu_lock);
1293 
1294 	for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1295 		spte_set |= clear_dirty_gfn_range(kvm, root, slot->base_gfn,
1296 				slot->base_gfn + slot->npages);
1297 
1298 	return spte_set;
1299 }
1300 
1301 /*
1302  * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1303  * set in mask, starting at gfn. The given memslot is expected to contain all
1304  * the GFNs represented by set bits in the mask. If AD bits are enabled,
1305  * clearing the dirty status will involve clearing the dirty bit on each SPTE
1306  * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1307  */
1308 static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
1309 				  gfn_t gfn, unsigned long mask, bool wrprot)
1310 {
1311 	struct tdp_iter iter;
1312 	u64 new_spte;
1313 
1314 	rcu_read_lock();
1315 
1316 	tdp_root_for_each_leaf_pte(iter, root, gfn + __ffs(mask),
1317 				    gfn + BITS_PER_LONG) {
1318 		if (!mask)
1319 			break;
1320 
1321 		if (iter.level > PG_LEVEL_4K ||
1322 		    !(mask & (1UL << (iter.gfn - gfn))))
1323 			continue;
1324 
1325 		mask &= ~(1UL << (iter.gfn - gfn));
1326 
1327 		if (wrprot || spte_ad_need_write_protect(iter.old_spte)) {
1328 			if (is_writable_pte(iter.old_spte))
1329 				new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
1330 			else
1331 				continue;
1332 		} else {
1333 			if (iter.old_spte & shadow_dirty_mask)
1334 				new_spte = iter.old_spte & ~shadow_dirty_mask;
1335 			else
1336 				continue;
1337 		}
1338 
1339 		tdp_mmu_set_spte_no_dirty_log(kvm, &iter, new_spte);
1340 	}
1341 
1342 	rcu_read_unlock();
1343 }
1344 
1345 /*
1346  * Clears the dirty status of all the 4k SPTEs mapping GFNs for which a bit is
1347  * set in mask, starting at gfn. The given memslot is expected to contain all
1348  * the GFNs represented by set bits in the mask. If AD bits are enabled,
1349  * clearing the dirty status will involve clearing the dirty bit on each SPTE
1350  * or, if AD bits are not enabled, clearing the writable bit on each SPTE.
1351  */
1352 void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1353 				       struct kvm_memory_slot *slot,
1354 				       gfn_t gfn, unsigned long mask,
1355 				       bool wrprot)
1356 {
1357 	struct kvm_mmu_page *root;
1358 
1359 	lockdep_assert_held_write(&kvm->mmu_lock);
1360 	for_each_tdp_mmu_root(kvm, root, slot->as_id)
1361 		clear_dirty_pt_masked(kvm, root, gfn, mask, wrprot);
1362 }
1363 
1364 /*
1365  * Clear leaf entries which could be replaced by large mappings, for
1366  * GFNs within the slot.
1367  */
1368 static void zap_collapsible_spte_range(struct kvm *kvm,
1369 				       struct kvm_mmu_page *root,
1370 				       const struct kvm_memory_slot *slot)
1371 {
1372 	gfn_t start = slot->base_gfn;
1373 	gfn_t end = start + slot->npages;
1374 	struct tdp_iter iter;
1375 	kvm_pfn_t pfn;
1376 
1377 	rcu_read_lock();
1378 
1379 	tdp_root_for_each_pte(iter, root, start, end) {
1380 retry:
1381 		if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
1382 			continue;
1383 
1384 		if (!is_shadow_present_pte(iter.old_spte) ||
1385 		    !is_last_spte(iter.old_spte, iter.level))
1386 			continue;
1387 
1388 		pfn = spte_to_pfn(iter.old_spte);
1389 		if (kvm_is_reserved_pfn(pfn) ||
1390 		    iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
1391 							    pfn, PG_LEVEL_NUM))
1392 			continue;
1393 
1394 		/* Note, a successful atomic zap also does a remote TLB flush. */
1395 		if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
1396 			/*
1397 			 * The iter must explicitly re-read the SPTE because
1398 			 * the atomic cmpxchg failed.
1399 			 */
1400 			iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
1401 			goto retry;
1402 		}
1403 	}
1404 
1405 	rcu_read_unlock();
1406 }
1407 
1408 /*
1409  * Clear non-leaf entries (and free associated page tables) which could
1410  * be replaced by large mappings, for GFNs within the slot.
1411  */
1412 void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
1413 				       const struct kvm_memory_slot *slot)
1414 {
1415 	struct kvm_mmu_page *root;
1416 
1417 	lockdep_assert_held_read(&kvm->mmu_lock);
1418 
1419 	for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
1420 		zap_collapsible_spte_range(kvm, root, slot);
1421 }
1422 
1423 /*
1424  * Removes write access on the last level SPTE mapping this GFN and unsets the
1425  * MMU-writable bit to ensure future writes continue to be intercepted.
1426  * Returns true if an SPTE was set and a TLB flush is needed.
1427  */
1428 static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
1429 			      gfn_t gfn, int min_level)
1430 {
1431 	struct tdp_iter iter;
1432 	u64 new_spte;
1433 	bool spte_set = false;
1434 
1435 	BUG_ON(min_level > KVM_MAX_HUGEPAGE_LEVEL);
1436 
1437 	rcu_read_lock();
1438 
1439 	for_each_tdp_pte_min_level(iter, root->spt, root->role.level,
1440 				   min_level, gfn, gfn + 1) {
1441 		if (!is_shadow_present_pte(iter.old_spte) ||
1442 		    !is_last_spte(iter.old_spte, iter.level))
1443 			continue;
1444 
1445 		new_spte = iter.old_spte &
1446 			~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
1447 
1448 		if (new_spte == iter.old_spte)
1449 			break;
1450 
1451 		tdp_mmu_set_spte(kvm, &iter, new_spte);
1452 		spte_set = true;
1453 	}
1454 
1455 	rcu_read_unlock();
1456 
1457 	return spte_set;
1458 }
1459 
1460 /*
1461  * Removes write access on the last level SPTE mapping this GFN and unsets the
1462  * MMU-writable bit to ensure future writes continue to be intercepted.
1463  * Returns true if an SPTE was set and a TLB flush is needed.
1464  */
1465 bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
1466 				   struct kvm_memory_slot *slot, gfn_t gfn,
1467 				   int min_level)
1468 {
1469 	struct kvm_mmu_page *root;
1470 	bool spte_set = false;
1471 
1472 	lockdep_assert_held_write(&kvm->mmu_lock);
1473 	for_each_tdp_mmu_root(kvm, root, slot->as_id)
1474 		spte_set |= write_protect_gfn(kvm, root, gfn, min_level);
1475 
1476 	return spte_set;
1477 }
1478 
1479 /*
1480  * Return the level of the lowest level SPTE added to sptes.
1481  * That SPTE may be non-present.
1482  *
1483  * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
1484  */
1485 int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
1486 			 int *root_level)
1487 {
1488 	struct tdp_iter iter;
1489 	struct kvm_mmu *mmu = vcpu->arch.mmu;
1490 	gfn_t gfn = addr >> PAGE_SHIFT;
1491 	int leaf = -1;
1492 
1493 	*root_level = vcpu->arch.mmu->shadow_root_level;
1494 
1495 	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1496 		leaf = iter.level;
1497 		sptes[leaf] = iter.old_spte;
1498 	}
1499 
1500 	return leaf;
1501 }
1502 
1503 /*
1504  * Returns the last level spte pointer of the shadow page walk for the given
1505  * gpa, and sets *spte to the spte value. This spte may be non-preset. If no
1506  * walk could be performed, returns NULL and *spte does not contain valid data.
1507  *
1508  * Contract:
1509  *  - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
1510  *  - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
1511  *
1512  * WARNING: This function is only intended to be called during fast_page_fault.
1513  */
1514 u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
1515 					u64 *spte)
1516 {
1517 	struct tdp_iter iter;
1518 	struct kvm_mmu *mmu = vcpu->arch.mmu;
1519 	gfn_t gfn = addr >> PAGE_SHIFT;
1520 	tdp_ptep_t sptep = NULL;
1521 
1522 	tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
1523 		*spte = iter.old_spte;
1524 		sptep = iter.sptep;
1525 	}
1526 
1527 	/*
1528 	 * Perform the rcu_dereference to get the raw spte pointer value since
1529 	 * we are passing it up to fast_page_fault, which is shared with the
1530 	 * legacy MMU and thus does not retain the TDP MMU-specific __rcu
1531 	 * annotation.
1532 	 *
1533 	 * This is safe since fast_page_fault obeys the contracts of this
1534 	 * function as well as all TDP MMU contracts around modifying SPTEs
1535 	 * outside of mmu_lock.
1536 	 */
1537 	return rcu_dereference(sptep);
1538 }
1539