kvm/mmu/mmu_internal.h

/* SPDX-License-Identifier: GPL-2.0 */
6ca9a6f3SSean Christopherson#ifndef __KVM_X86_MMU_INTERNAL_H
6ca9a6f3SSean Christopherson#define __KVM_X86_MMU_INTERNAL_H
6ca9a6f3SSean Christopherson
985ab278SSean Christopherson#include <linux/types.h>
5a9624afSPaolo Bonzini#include <linux/kvm_host.h>
985ab278SSean Christopherson#include <asm/kvm_host.h>
985ab278SSean Christopherson
/*
 * MMU debug helpers.  MMU_DEBUG is unconditionally undefined right here, so
 * unless this line is edited the no-op variants in the #else branch are the
 * ones in effect, and their arguments are never evaluated.
 *
 * With MMU_DEBUG defined, pgprintk() and rmap_printk() print only when the
 * externally defined 'dbg' flag is set (rmap_printk() prefixes the calling
 * function's name), and MMU_WARN_ON() forwards to WARN_ON().
 */
#undef MMU_DEBUG

#ifdef MMU_DEBUG
extern bool dbg;

#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
#define rmap_printk(fmt, args...) do { if (dbg) printk("%s: " fmt, __func__, ## args); } while (0)
#define MMU_WARN_ON(x) WARN_ON(x)
#else
#define pgprintk(x...) do { } while (0)
#define rmap_printk(x...) do { } while (0)
#define MMU_WARN_ON(x) do { } while (0)
#endif
5a9624afSPaolo Bonzini
/*
 * Page table builder macros common to shadow (host) PTEs and guest PTEs.
 *
 * 'level' is 1-based: level 1 starts at PAGE_SHIFT and each further level
 * adds 'bits_per_level' bits of shift.
 */

/* Bit position at which the index for @level starts within an address. */
#define __PT_LEVEL_SHIFT(level, bits_per_level)	\
	(PAGE_SHIFT + ((level) - 1) * (bits_per_level))

/* Index into the @level table selected by @address. */
#define __PT_INDEX(address, level, bits_per_level) \
	(((address) >> __PT_LEVEL_SHIFT(level, bits_per_level)) & ((1 << (bits_per_level)) - 1))

/* @base_addr_mask with every bit below the @level shift cleared. */
#define __PT_LVL_ADDR_MASK(base_addr_mask, level, bits_per_level) \
	((base_addr_mask) & ~((1ULL << __PT_LEVEL_SHIFT(level, bits_per_level)) - 1))

/* @base_addr_mask restricted to the bits below the @level shift. */
#define __PT_LVL_OFFSET_MASK(base_addr_mask, level, bits_per_level) \
	((base_addr_mask) & ((1ULL << __PT_LEVEL_SHIFT(level, bits_per_level)) - 1))

/* Number of entries in one table that is indexed by @bits_per_level bits. */
#define __PT_ENT_PER_PAGE(bits_per_level)  (1 << (bits_per_level))
42c88ff8SSean Christopherson
/*
 * Unlike regular MMU roots, PAE "roots", a.k.a. PDPTEs/PDPTRs, have a PRESENT
 * bit, and thus are guaranteed to be non-zero when valid.  And, when a guest
 * PDPTR is !PRESENT, its corresponding PAE root cannot be set to INVALID_PAGE,
 * as the CPU would treat that as PRESENT PDPTR with reserved bits set.  Use
 * '0' instead of INVALID_PAGE to indicate an invalid PAE root.
 *
 * IS_VALID_PAE_ROOT() normalizes its argument to 0 or 1, i.e. any non-zero
 * value counts as valid.
 */
#define INVALID_PAE_ROOT	0
#define IS_VALID_PAE_ROOT(x)	(!!(x))
c834e5e4SSean Christopherson
c10743a1SSean Christophersontypedef u64 __rcu *tdp_ptep_t;
c10743a1SSean Christopherson
985ab278SSean Christophersonstruct kvm_mmu_page {
1148bfc4SSean Christopherson	/*
1148bfc4SSean Christopherson	 * Note, "link" through "spt" fit in a single 64 byte cache line on
1148bfc4SSean Christopherson	 * 64-bit kernels, keep it that way unless there's a reason not to.
1148bfc4SSean Christopherson	 */
985ab278SSean Christopherson	struct list_head link;
985ab278SSean Christopherson	struct hlist_node hash_link;
985ab278SSean Christopherson
ca41c34cSSean Christopherson	bool tdp_mmu_page;
985ab278SSean Christopherson	bool unsync;
985ab278SSean Christopherson	u8 mmu_valid_gen;
55c510e2SSean Christopherson
55c510e2SSean Christopherson	 /*
55c510e2SSean Christopherson	  * The shadow page can't be replaced by an equivalent huge page
55c510e2SSean Christopherson	  * because it is being used to map an executable page in the guest
55c510e2SSean Christopherson	  * and the NX huge page mitigation is enabled.
55c510e2SSean Christopherson	  */
55c510e2SSean Christopherson	bool nx_huge_page_disallowed;
985ab278SSean Christopherson
985ab278SSean Christopherson	/*
985ab278SSean Christopherson	 * The following two entries are used to key the shadow page in the
985ab278SSean Christopherson	 * hash table.
985ab278SSean Christopherson	 */
985ab278SSean Christopherson	union kvm_mmu_page_role role;
985ab278SSean Christopherson	gfn_t gfn;
985ab278SSean Christopherson
985ab278SSean Christopherson	u64 *spt;
6a97575dSDavid Matlack
6a97575dSDavid Matlack	/*
6a97575dSDavid Matlack	 * Stores the result of the guest translation being shadowed by each
6a97575dSDavid Matlack	 * SPTE.  KVM shadows two types of guest translations: nGPA -> GPA
6a97575dSDavid Matlack	 * (shadow EPT/NPT) and GVA -> GPA (traditional shadow paging). In both
6a97575dSDavid Matlack	 * cases the result of the translation is a GPA and a set of access
6a97575dSDavid Matlack	 * constraints.
6a97575dSDavid Matlack	 *
6a97575dSDavid Matlack	 * The GFN is stored in the upper bits (PAGE_SHIFT) and the shadowed
6a97575dSDavid Matlack	 * access permissions are stored in the lower bits. Note, for
6a97575dSDavid Matlack	 * convenience and uniformity across guests, the access permissions are
6a97575dSDavid Matlack	 * stored in KVM format (e.g.  ACC_EXEC_MASK) not the raw guest format.
6a97575dSDavid Matlack	 */
6a97575dSDavid Matlack	u64 *shadowed_translation;
6a97575dSDavid Matlack
11cccf5cSBen Gardon	/* Currently serving as active root */
11cccf5cSBen Gardon	union {
11cccf5cSBen Gardon		int root_count;
11cccf5cSBen Gardon		refcount_t tdp_mmu_root_count;
11cccf5cSBen Gardon	};
985ab278SSean Christopherson	unsigned int unsync_children;
c10743a1SSean Christopherson	union {
985ab278SSean Christopherson		struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */
c10743a1SSean Christopherson		tdp_ptep_t ptep;
c10743a1SSean Christopherson	};
22b94c4bSPaolo Bonzini	union {
985ab278SSean Christopherson		DECLARE_BITMAP(unsync_child_bitmap, 512);
22b94c4bSPaolo Bonzini		struct {
22b94c4bSPaolo Bonzini			struct work_struct tdp_mmu_async_work;
22b94c4bSPaolo Bonzini			void *tdp_mmu_async_data;
22b94c4bSPaolo Bonzini		};
22b94c4bSPaolo Bonzini	};
985ab278SSean Christopherson
428e9216SSean Christopherson	/*
428e9216SSean Christopherson	 * Tracks shadow pages that, if zapped, would allow KVM to create an NX
55c510e2SSean Christopherson	 * huge page.  A shadow page will have nx_huge_page_disallowed set but
55c510e2SSean Christopherson	 * not be on the list if a huge page is disallowed for other reasons,
55c510e2SSean Christopherson	 * e.g. because KVM is shadowing a PTE at the same gfn, the memslot
55c510e2SSean Christopherson	 * isn't properly aligned, etc...
428e9216SSean Christopherson	 */
55c510e2SSean Christopherson	struct list_head possible_nx_huge_page_link;
985ab278SSean Christopherson#ifdef CONFIG_X86_32
985ab278SSean Christopherson	/*
985ab278SSean Christopherson	 * Used out of the mmu-lock to avoid reading spte values while an
985ab278SSean Christopherson	 * update is in progress; see the comments in __get_spte_lockless().
985ab278SSean Christopherson	 */
985ab278SSean Christopherson	int clear_spte_count;
985ab278SSean Christopherson#endif
985ab278SSean Christopherson
985ab278SSean Christopherson	/* Number of writes since the last time traversal visited this page.  */
985ab278SSean Christopherson	atomic_t write_flooding_count;
02c00b3aSBen Gardon
897218ffSPaolo Bonzini#ifdef CONFIG_X86_64
d9f6e12fSIngo Molnar	/* Used for freeing the page asynchronously if it is a TDP MMU page. */
7cca2d0bSBen Gardon	struct rcu_head rcu_head;
897218ffSPaolo Bonzini#endif
985ab278SSean Christopherson};
985ab278SSean Christopherson
/*
 * Slab cache defined elsewhere.
 * NOTE(review): judging by the name it backs struct kvm_mmu_page allocations
 * -- confirm at the definition.
 */
extern struct kmem_cache *mmu_page_header_cache;
02c00b3aSBen Gardon
/*
 * Map a page role to an address space id: 1 when the role's smm bit is set,
 * 0 otherwise.
 */
static inline int kvm_mmu_role_as_id(union kvm_mmu_page_role role)
{
	return role.smm ? 1 : 0;
}
a3f15bdaSSean Christopherson
08889894SSean Christophersonstatic inline int kvm_mmu_page_as_id(struct kvm_mmu_page *sp)
08889894SSean Christopherson{
a3f15bdaSSean Christopherson	return kvm_mmu_role_as_id(sp->role);
08889894SSean Christopherson}
08889894SSean Christopherson
ce92ef76SSean Christophersonstatic inline bool kvm_mmu_page_ad_need_write_protect(struct kvm_mmu_page *sp)
5a9624afSPaolo Bonzini{
5a9624afSPaolo Bonzini	/*
44ac5958SSean Christopherson	 * When using the EPT page-modification log, the GPAs in the CPU dirty
44ac5958SSean Christopherson	 * log would come from L2 rather than L1.  Therefore, we need to rely
44ac5958SSean Christopherson	 * on write protection to record dirty pages, which bypasses PML, since
44ac5958SSean Christopherson	 * writes now result in a vmexit.  Note, the check on CPU dirty logging
44ac5958SSean Christopherson	 * being enabled is mandatory as the bits used to denote WP-only SPTEs
ce92ef76SSean Christopherson	 * are reserved for PAE paging (32-bit KVM).
5a9624afSPaolo Bonzini	 */
ce92ef76SSean Christopherson	return kvm_x86_ops.cpu_dirty_log_size && sp->role.guest_mode;
5a9624afSPaolo Bonzini}
5a9624afSPaolo Bonzini
8283e36aSBen Gardonint mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot,
2839180cSPaolo Bonzini			    gfn_t gfn, bool can_unsync, bool prefetch);
5a9624afSPaolo Bonzini
269e9552SHamza Mahfoozvoid kvm_mmu_gfn_disallow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn);
269e9552SHamza Mahfoozvoid kvm_mmu_gfn_allow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn);
6ca9a6f3SSean Christophersonbool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
3ad93562SKeqian Zhu				    struct kvm_memory_slot *slot, u64 gfn,
3ad93562SKeqian Zhu				    int min_level);
2f2fad08SBen Gardonvoid kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
2f2fad08SBen Gardon					u64 start_gfn, u64 pages);
3bcd0662SPeter Xuunsigned int pte_list_count(struct kvm_rmap_head *rmap_head);
6ca9a6f3SSean Christopherson
8a009d5bSSean Christophersonextern int nx_huge_pages;
084cc29fSBen Gardonstatic inline bool is_nx_huge_page_enabled(struct kvm *kvm)
8a009d5bSSean Christopherson{
084cc29fSBen Gardon	return READ_ONCE(nx_huge_pages) && !kvm->arch.disable_nx_huge_pages;
8a009d5bSSean Christopherson}
8a009d5bSSean Christopherson
8a009d5bSSean Christophersonstruct kvm_page_fault {
8a009d5bSSean Christopherson	/* arguments to kvm_mmu_do_page_fault.  */
8a009d5bSSean Christopherson	const gpa_t addr;
8a009d5bSSean Christopherson	const u32 error_code;
8a009d5bSSean Christopherson	const bool prefetch;
8a009d5bSSean Christopherson
8a009d5bSSean Christopherson	/* Derived from error_code.  */
8a009d5bSSean Christopherson	const bool exec;
8a009d5bSSean Christopherson	const bool write;
8a009d5bSSean Christopherson	const bool present;
8a009d5bSSean Christopherson	const bool rsvd;
8a009d5bSSean Christopherson	const bool user;
8a009d5bSSean Christopherson
8a009d5bSSean Christopherson	/* Derived from mmu and global state.  */
8a009d5bSSean Christopherson	const bool is_tdp;
8a009d5bSSean Christopherson	const bool nx_huge_page_workaround_enabled;
8a009d5bSSean Christopherson
bb18842eSBen Gardon	/*
8a009d5bSSean Christopherson	 * Whether a >4KB mapping can be created or is forbidden due to NX
8a009d5bSSean Christopherson	 * hugepages.
8a009d5bSSean Christopherson	 */
8a009d5bSSean Christopherson	bool huge_page_disallowed;
8a009d5bSSean Christopherson
8a009d5bSSean Christopherson	/*
8a009d5bSSean Christopherson	 * Maximum page size that can be created for this fault; input to
8a009d5bSSean Christopherson	 * FNAME(fetch), __direct_map and kvm_tdp_mmu_map.
8a009d5bSSean Christopherson	 */
8a009d5bSSean Christopherson	u8 max_level;
8a009d5bSSean Christopherson
8a009d5bSSean Christopherson	/*
8a009d5bSSean Christopherson	 * Page size that can be created based on the max_level and the
8a009d5bSSean Christopherson	 * page size used by the host mapping.
8a009d5bSSean Christopherson	 */
8a009d5bSSean Christopherson	u8 req_level;
8a009d5bSSean Christopherson
8a009d5bSSean Christopherson	/*
8a009d5bSSean Christopherson	 * Page size that will be created based on the req_level and
8a009d5bSSean Christopherson	 * huge_page_disallowed.
8a009d5bSSean Christopherson	 */
8a009d5bSSean Christopherson	u8 goal_level;
8a009d5bSSean Christopherson
8a009d5bSSean Christopherson	/* Shifted addr, or result of guest page table walk if addr is a gva.  */
8a009d5bSSean Christopherson	gfn_t gfn;
8a009d5bSSean Christopherson
8a009d5bSSean Christopherson	/* The memslot containing gfn. May be NULL. */
8a009d5bSSean Christopherson	struct kvm_memory_slot *slot;
8a009d5bSSean Christopherson
8a009d5bSSean Christopherson	/* Outputs of kvm_faultin_pfn.  */
*ba6e3fe2SDavid Matlack	unsigned long mmu_seq;
8a009d5bSSean Christopherson	kvm_pfn_t pfn;
8a009d5bSSean Christopherson	hva_t hva;
8a009d5bSSean Christopherson	bool map_writable;
8a009d5bSSean Christopherson};
8a009d5bSSean Christopherson
/*
 * Page fault handler implemented elsewhere; kvm_mmu_do_page_fault() compares
 * mmu->page_fault against it to decide whether the fault is a TDP fault.
 */
int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
8a009d5bSSean Christopherson
/*
 * Return values of handle_mmio_page_fault(), mmu.page_fault(), fast_page_fault(),
 * and of course kvm_mmu_do_page_fault().
 *
 * RET_PF_CONTINUE: So far, so good, keep handling the page fault.
 * RET_PF_RETRY: let CPU fault again on the address.
 * RET_PF_EMULATE: mmio page fault, emulate the instruction directly.
 * RET_PF_INVALID: the spte is invalid, let the real page fault path update it.
 * RET_PF_FIXED: The faulting entry has been fixed.
 * RET_PF_SPURIOUS: The faulting entry was already fixed, e.g. by another vCPU.
 *
 * Any names added to this enum should be exported to userspace for use in
 * tracepoints via TRACE_DEFINE_ENUM() in mmutrace.h
 *
 * Note, all values must be greater than or equal to zero so as not to encroach
 * on -errno return values.  Somewhat arbitrarily use '0' for CONTINUE, which
 * will allow for efficient machine code when checking for CONTINUE, e.g.
 * "TEST %rax, %rax, JNZ", as all "stop!" values are non-zero.
 */
enum {
	RET_PF_CONTINUE = 0,
	RET_PF_RETRY,
	RET_PF_EMULATE,
	RET_PF_INVALID,
	RET_PF_FIXED,
	RET_PF_SPURIOUS,
};
bb18842eSBen Gardon
8a009d5bSSean Christophersonstatic inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
8a009d5bSSean Christopherson					u32 err, bool prefetch)
8a009d5bSSean Christopherson{
8a009d5bSSean Christopherson	struct kvm_page_fault fault = {
8a009d5bSSean Christopherson		.addr = cr2_or_gpa,
8a009d5bSSean Christopherson		.error_code = err,
8a009d5bSSean Christopherson		.exec = err & PFERR_FETCH_MASK,
8a009d5bSSean Christopherson		.write = err & PFERR_WRITE_MASK,
8a009d5bSSean Christopherson		.present = err & PFERR_PRESENT_MASK,
8a009d5bSSean Christopherson		.rsvd = err & PFERR_RSVD_MASK,
8a009d5bSSean Christopherson		.user = err & PFERR_USER_MASK,
8a009d5bSSean Christopherson		.prefetch = prefetch,
8a009d5bSSean Christopherson		.is_tdp = likely(vcpu->arch.mmu->page_fault == kvm_tdp_page_fault),
084cc29fSBen Gardon		.nx_huge_page_workaround_enabled =
084cc29fSBen Gardon			is_nx_huge_page_enabled(vcpu->kvm),
8a009d5bSSean Christopherson
8a009d5bSSean Christopherson		.max_level = KVM_MAX_HUGEPAGE_LEVEL,
8a009d5bSSean Christopherson		.req_level = PG_LEVEL_4K,
8a009d5bSSean Christopherson		.goal_level = PG_LEVEL_4K,
8a009d5bSSean Christopherson	};
1075d41eSSean Christopherson	int r;
1075d41eSSean Christopherson
1075d41eSSean Christopherson	/*
1075d41eSSean Christopherson	 * Async #PF "faults", a.k.a. prefetch faults, are not faults from the
1075d41eSSean Christopherson	 * guest perspective and have already been counted at the time of the
1075d41eSSean Christopherson	 * original fault.
1075d41eSSean Christopherson	 */
1075d41eSSean Christopherson	if (!prefetch)
1075d41eSSean Christopherson		vcpu->stat.pf_taken++;
8d5265b1SSean Christopherson
8d5265b1SSean Christopherson	if (IS_ENABLED(CONFIG_RETPOLINE) && fault.is_tdp)
1075d41eSSean Christopherson		r = kvm_tdp_page_fault(vcpu, &fault);
1075d41eSSean Christopherson	else
1075d41eSSean Christopherson		r = vcpu->arch.mmu->page_fault(vcpu, &fault);
8d5265b1SSean Christopherson
1075d41eSSean Christopherson	/*
1075d41eSSean Christopherson	 * Similar to above, prefetch faults aren't truly spurious, and the
1075d41eSSean Christopherson	 * async #PF path doesn't do emulation.  Do count faults that are fixed
1075d41eSSean Christopherson	 * by the async #PF handler though, otherwise they'll never be counted.
1075d41eSSean Christopherson	 */
1075d41eSSean Christopherson	if (r == RET_PF_FIXED)
1075d41eSSean Christopherson		vcpu->stat.pf_fixed++;
1075d41eSSean Christopherson	else if (prefetch)
1075d41eSSean Christopherson		;
1075d41eSSean Christopherson	else if (r == RET_PF_EMULATE)
1075d41eSSean Christopherson		vcpu->stat.pf_emulate++;
1075d41eSSean Christopherson	else if (r == RET_PF_SPURIOUS)
1075d41eSSean Christopherson		vcpu->stat.pf_spurious++;
1075d41eSSean Christopherson	return r;
8a009d5bSSean Christopherson}
8a009d5bSSean Christopherson
8ca6f063SBen Gardonint kvm_mmu_max_mapping_level(struct kvm *kvm,
8ca6f063SBen Gardon			      const struct kvm_memory_slot *slot, gfn_t gfn,
a8ac499bSSean Christopherson			      int max_level);
73a3c659SPaolo Bonzinivoid kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
536f0e6aSPaolo Bonzinivoid disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level);
bb18842eSBen Gardon
bb18842eSBen Gardonvoid *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
bb18842eSBen Gardon
61f94478SSean Christophersonvoid track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp);
61f94478SSean Christophersonvoid untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp);
29cf0f50SBen Gardon
6ca9a6f3SSean Christopherson#endif /* __KVM_X86_MMU_INTERNAL_H */