xref: /linux/arch/x86/include/asm/kvm_host.h (revision a382b06d297e78ed7ac67afd0d8e8690406ac4ca)
1  /* SPDX-License-Identifier: GPL-2.0-only */
2  /*
3   * Kernel-based Virtual Machine driver for Linux
4   *
5   * This header defines architecture specific interfaces, x86 version
6   */
7  
8  #ifndef _ASM_X86_KVM_HOST_H
9  #define _ASM_X86_KVM_HOST_H
10  
11  #include <linux/types.h>
12  #include <linux/mm.h>
13  #include <linux/mmu_notifier.h>
14  #include <linux/tracepoint.h>
15  #include <linux/cpumask.h>
16  #include <linux/irq_work.h>
17  #include <linux/irq.h>
18  #include <linux/workqueue.h>
19  
20  #include <linux/kvm.h>
21  #include <linux/kvm_para.h>
22  #include <linux/kvm_types.h>
23  #include <linux/perf_event.h>
24  #include <linux/pvclock_gtod.h>
25  #include <linux/clocksource.h>
26  #include <linux/irqbypass.h>
27  #include <linux/kfifo.h>
28  #include <linux/sched/vhost_task.h>
29  #include <linux/call_once.h>
30  
31  #include <asm/apic.h>
32  #include <asm/pvclock-abi.h>
33  #include <asm/desc.h>
34  #include <asm/mtrr.h>
35  #include <asm/msr-index.h>
36  #include <asm/asm.h>
37  #include <asm/kvm_page_track.h>
38  #include <asm/kvm_vcpu_regs.h>
39  #include <asm/reboot.h>
40  #include <hyperv/hvhdk.h>
41  
42  #define __KVM_HAVE_ARCH_VCPU_DEBUGFS
43  
44  /*
45   * CONFIG_KVM_MAX_NR_VCPUS is defined iff CONFIG_KVM!=n; provide a dummy max if
46   * KVM is disabled (arbitrarily use the default from CONFIG_KVM_MAX_NR_VCPUS).
47   */
48  #ifdef CONFIG_KVM_MAX_NR_VCPUS
49  #define KVM_MAX_VCPUS CONFIG_KVM_MAX_NR_VCPUS
50  #else
51  #define KVM_MAX_VCPUS 1024
52  #endif
53  
54  /*
55   * In x86, the VCPU ID corresponds to the APIC ID, and APIC IDs
56   * might be larger than the actual number of VCPUs because the
57   * APIC ID encodes CPU topology information.
58   *
59   * In the worst case, we'll need less than one extra bit for the
60   * Core ID, and less than one extra bit for the Package (Die) ID,
61   * so a ratio of 4 should be enough.
62   */
63  #define KVM_VCPU_ID_RATIO 4
64  #define KVM_MAX_VCPU_IDS (KVM_MAX_VCPUS * KVM_VCPU_ID_RATIO)
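
/*
 * Illustrative sketch, not part of the original header: one spare bit for
 * the Core ID and one for the Package (Die) ID each double the ID space,
 * hence the ratio of 4 (e.g. the default 1024 vCPUs allow 4096 vCPU IDs).
 * Assumes static_assert (linux/build_bug.h) is visible at this point.
 */
static_assert(KVM_MAX_VCPU_IDS == 4 * KVM_MAX_VCPUS);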
65  
66  /* memory slots that are not exposed to userspace */
67  #define KVM_INTERNAL_MEM_SLOTS 3
68  
69  #define KVM_HALT_POLL_NS_DEFAULT 200000
70  
71  #define KVM_IRQCHIP_NUM_PINS  KVM_IOAPIC_NUM_PINS
72  
73  #define KVM_DIRTY_LOG_MANUAL_CAPS   (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \
74  					KVM_DIRTY_LOG_INITIALLY_SET)
75  
76  #define KVM_BUS_LOCK_DETECTION_VALID_MODE	(KVM_BUS_LOCK_DETECTION_OFF | \
77  						 KVM_BUS_LOCK_DETECTION_EXIT)
78  
79  #define KVM_X86_NOTIFY_VMEXIT_VALID_BITS	(KVM_X86_NOTIFY_VMEXIT_ENABLED | \
80  						 KVM_X86_NOTIFY_VMEXIT_USER)
81  
82  /* x86-specific vcpu->requests bit members */
83  #define KVM_REQ_MIGRATE_TIMER		KVM_ARCH_REQ(0)
84  #define KVM_REQ_REPORT_TPR_ACCESS	KVM_ARCH_REQ(1)
85  #define KVM_REQ_TRIPLE_FAULT		KVM_ARCH_REQ(2)
86  #define KVM_REQ_MMU_SYNC		KVM_ARCH_REQ(3)
87  #define KVM_REQ_CLOCK_UPDATE		KVM_ARCH_REQ(4)
88  #define KVM_REQ_LOAD_MMU_PGD		KVM_ARCH_REQ(5)
89  #define KVM_REQ_EVENT			KVM_ARCH_REQ(6)
90  #define KVM_REQ_APF_HALT		KVM_ARCH_REQ(7)
91  #define KVM_REQ_STEAL_UPDATE		KVM_ARCH_REQ(8)
92  #define KVM_REQ_NMI			KVM_ARCH_REQ(9)
93  #define KVM_REQ_PMU			KVM_ARCH_REQ(10)
94  #define KVM_REQ_PMI			KVM_ARCH_REQ(11)
95  #ifdef CONFIG_KVM_SMM
96  #define KVM_REQ_SMI			KVM_ARCH_REQ(12)
97  #endif
98  #define KVM_REQ_MASTERCLOCK_UPDATE	KVM_ARCH_REQ(13)
99  #define KVM_REQ_MCLOCK_INPROGRESS \
100  	KVM_ARCH_REQ_FLAGS(14, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
101  #define KVM_REQ_SCAN_IOAPIC \
102  	KVM_ARCH_REQ_FLAGS(15, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
103  #define KVM_REQ_GLOBAL_CLOCK_UPDATE	KVM_ARCH_REQ(16)
104  #define KVM_REQ_APIC_PAGE_RELOAD \
105  	KVM_ARCH_REQ_FLAGS(17, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
106  #define KVM_REQ_HV_CRASH		KVM_ARCH_REQ(18)
107  #define KVM_REQ_IOAPIC_EOI_EXIT		KVM_ARCH_REQ(19)
108  #define KVM_REQ_HV_RESET		KVM_ARCH_REQ(20)
109  #define KVM_REQ_HV_EXIT			KVM_ARCH_REQ(21)
110  #define KVM_REQ_HV_STIMER		KVM_ARCH_REQ(22)
111  #define KVM_REQ_LOAD_EOI_EXITMAP	KVM_ARCH_REQ(23)
112  #define KVM_REQ_GET_NESTED_STATE_PAGES	KVM_ARCH_REQ(24)
113  #define KVM_REQ_APICV_UPDATE \
114  	KVM_ARCH_REQ_FLAGS(25, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
115  #define KVM_REQ_TLB_FLUSH_CURRENT	KVM_ARCH_REQ(26)
116  #define KVM_REQ_TLB_FLUSH_GUEST \
117  	KVM_ARCH_REQ_FLAGS(27, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
118  #define KVM_REQ_APF_READY		KVM_ARCH_REQ(28)
119  #define KVM_REQ_MSR_FILTER_CHANGED	KVM_ARCH_REQ(29)
120  #define KVM_REQ_UPDATE_CPU_DIRTY_LOGGING \
121  	KVM_ARCH_REQ_FLAGS(30, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
122  #define KVM_REQ_MMU_FREE_OBSOLETE_ROOTS \
123  	KVM_ARCH_REQ_FLAGS(31, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
124  #define KVM_REQ_HV_TLB_FLUSH \
125  	KVM_ARCH_REQ_FLAGS(32, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
126  #define KVM_REQ_UPDATE_PROTECTED_GUEST_STATE	KVM_ARCH_REQ(34)
127  
128  #define CR0_RESERVED_BITS                                               \
129  	(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
130  			  | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
131  			  | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
132  
133  #define CR4_RESERVED_BITS                                               \
134  	(~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
135  			  | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE     \
136  			  | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR | X86_CR4_PCIDE \
137  			  | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_FSGSBASE \
138  			  | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_VMXE \
139  			  | X86_CR4_SMAP | X86_CR4_PKE | X86_CR4_UMIP \
140  			  | X86_CR4_LAM_SUP))
141  
142  #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
143  
144  
145  
146  #define INVALID_PAGE (~(hpa_t)0)
147  #define VALID_PAGE(x) ((x) != INVALID_PAGE)
148  
149  /* KVM Hugepage definitions for x86 */
150  #define KVM_MAX_HUGEPAGE_LEVEL	PG_LEVEL_1G
151  #define KVM_NR_PAGE_SIZES	(KVM_MAX_HUGEPAGE_LEVEL - PG_LEVEL_4K + 1)
152  #define KVM_HPAGE_GFN_SHIFT(x)	(((x) - 1) * 9)
153  #define KVM_HPAGE_SHIFT(x)	(PAGE_SHIFT + KVM_HPAGE_GFN_SHIFT(x))
154  #define KVM_HPAGE_SIZE(x)	(1UL << KVM_HPAGE_SHIFT(x))
155  #define KVM_HPAGE_MASK(x)	(~(KVM_HPAGE_SIZE(x) - 1))
156  #define KVM_PAGES_PER_HPAGE(x)	(KVM_HPAGE_SIZE(x) / PAGE_SIZE)
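
/*
 * Illustrative sketch, not part of the original header: the macros above
 * evaluated at the 2M level (PG_LEVEL_2M == 2, PAGE_SHIFT == 12), assuming
 * the pg_level enum and static_assert are visible at this point.
 */
static_assert(KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M) == 9);
static_assert(KVM_HPAGE_SIZE(PG_LEVEL_2M) == (1UL << 21));	/* 2 MiB */
static_assert(KVM_PAGES_PER_HPAGE(PG_LEVEL_2M) == 512);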
157  
158  #define KVM_MEMSLOT_PAGES_TO_MMU_PAGES_RATIO 50
159  #define KVM_MIN_ALLOC_MMU_PAGES 64UL
160  #define KVM_MMU_HASH_SHIFT 12
161  #define KVM_NUM_MMU_PAGES (1 << KVM_MMU_HASH_SHIFT)
162  #define KVM_MIN_FREE_MMU_PAGES 5
163  #define KVM_REFILL_PAGES 25
164  #define KVM_MAX_CPUID_ENTRIES 256
165  #define KVM_NR_VAR_MTRR 8
166  
167  #define ASYNC_PF_PER_VCPU 64
168  
169  enum kvm_reg {
170  	VCPU_REGS_RAX = __VCPU_REGS_RAX,
171  	VCPU_REGS_RCX = __VCPU_REGS_RCX,
172  	VCPU_REGS_RDX = __VCPU_REGS_RDX,
173  	VCPU_REGS_RBX = __VCPU_REGS_RBX,
174  	VCPU_REGS_RSP = __VCPU_REGS_RSP,
175  	VCPU_REGS_RBP = __VCPU_REGS_RBP,
176  	VCPU_REGS_RSI = __VCPU_REGS_RSI,
177  	VCPU_REGS_RDI = __VCPU_REGS_RDI,
178  #ifdef CONFIG_X86_64
179  	VCPU_REGS_R8  = __VCPU_REGS_R8,
180  	VCPU_REGS_R9  = __VCPU_REGS_R9,
181  	VCPU_REGS_R10 = __VCPU_REGS_R10,
182  	VCPU_REGS_R11 = __VCPU_REGS_R11,
183  	VCPU_REGS_R12 = __VCPU_REGS_R12,
184  	VCPU_REGS_R13 = __VCPU_REGS_R13,
185  	VCPU_REGS_R14 = __VCPU_REGS_R14,
186  	VCPU_REGS_R15 = __VCPU_REGS_R15,
187  #endif
188  	VCPU_REGS_RIP,
189  	NR_VCPU_REGS,
190  
191  	VCPU_EXREG_PDPTR = NR_VCPU_REGS,
192  	VCPU_EXREG_CR0,
193  	VCPU_EXREG_CR3,
194  	VCPU_EXREG_CR4,
195  	VCPU_EXREG_RFLAGS,
196  	VCPU_EXREG_SEGMENTS,
197  	VCPU_EXREG_EXIT_INFO_1,
198  	VCPU_EXREG_EXIT_INFO_2,
199  };
200  
201  enum {
202  	VCPU_SREG_ES,
203  	VCPU_SREG_CS,
204  	VCPU_SREG_SS,
205  	VCPU_SREG_DS,
206  	VCPU_SREG_FS,
207  	VCPU_SREG_GS,
208  	VCPU_SREG_TR,
209  	VCPU_SREG_LDTR,
210  };
211  
212  enum exit_fastpath_completion {
213  	EXIT_FASTPATH_NONE,
214  	EXIT_FASTPATH_REENTER_GUEST,
215  	EXIT_FASTPATH_EXIT_HANDLED,
216  	EXIT_FASTPATH_EXIT_USERSPACE,
217  };
218  typedef enum exit_fastpath_completion fastpath_t;
219  
220  struct x86_emulate_ctxt;
221  struct x86_exception;
222  union kvm_smram;
223  enum x86_intercept;
224  enum x86_intercept_stage;
225  
226  #define KVM_NR_DB_REGS	4
227  
228  #define DR6_BUS_LOCK   (1 << 11)
229  #define DR6_BD		(1 << 13)
230  #define DR6_BS		(1 << 14)
231  #define DR6_BT		(1 << 15)
232  #define DR6_RTM		(1 << 16)
233  /*
234   * DR6_ACTIVE_LOW combines fixed-1 and active-low bits.
235   * We can regard all the bits in DR6_FIXED_1 as active_low bits;
236   * they will never be 0 for now, but when they are defined
237   * in the future it will require no code change.
238   *
239   * DR6_ACTIVE_LOW is also used as the init/reset value for DR6.
240   */
241  #define DR6_ACTIVE_LOW	0xffff0ff0
242  #define DR6_VOLATILE	0x0001e80f
243  #define DR6_FIXED_1	(DR6_ACTIVE_LOW & ~DR6_VOLATILE)
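
/*
 * Illustrative sketch, not part of the original header: folding an
 * "active-high" #DB payload (bits that should read as 1) into the
 * architectural DR6 format described above, where DR6_ACTIVE_LOW bits read
 * as 1 only when the corresponding condition is *not* active.  The helper
 * name is made up for the example.
 */
static inline unsigned long dr6_fold_payload_sketch(unsigned long dr6,
						    unsigned long payload)
{
	dr6 |= payload;
	/* Active-low bits that are set in the payload must read as 0. */
	dr6 ^= payload & DR6_ACTIVE_LOW;
	return dr6;
}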
244  
245  #define DR7_BP_EN_MASK	0x000000ff
246  #define DR7_GE		(1 << 9)
247  #define DR7_GD		(1 << 13)
248  #define DR7_FIXED_1	0x00000400
249  #define DR7_VOLATILE	0xffff2bff
250  
251  #define KVM_GUESTDBG_VALID_MASK \
252  	(KVM_GUESTDBG_ENABLE | \
253  	KVM_GUESTDBG_SINGLESTEP | \
254  	KVM_GUESTDBG_USE_HW_BP | \
255  	KVM_GUESTDBG_USE_SW_BP | \
256  	KVM_GUESTDBG_INJECT_BP | \
257  	KVM_GUESTDBG_INJECT_DB | \
258  	KVM_GUESTDBG_BLOCKIRQ)
259  
260  #define PFERR_PRESENT_MASK	BIT(0)
261  #define PFERR_WRITE_MASK	BIT(1)
262  #define PFERR_USER_MASK		BIT(2)
263  #define PFERR_RSVD_MASK		BIT(3)
264  #define PFERR_FETCH_MASK	BIT(4)
265  #define PFERR_PK_MASK		BIT(5)
266  #define PFERR_SGX_MASK		BIT(15)
267  #define PFERR_GUEST_RMP_MASK	BIT_ULL(31)
268  #define PFERR_GUEST_FINAL_MASK	BIT_ULL(32)
269  #define PFERR_GUEST_PAGE_MASK	BIT_ULL(33)
270  #define PFERR_GUEST_ENC_MASK	BIT_ULL(34)
271  #define PFERR_GUEST_SIZEM_MASK	BIT_ULL(35)
272  #define PFERR_GUEST_VMPL_MASK	BIT_ULL(36)
273  
274  /*
275   * IMPLICIT_ACCESS is a KVM-defined flag used to correctly perform SMAP checks
276   * when emulating instructions that trigger implicit access.
277   */
278  #define PFERR_IMPLICIT_ACCESS	BIT_ULL(48)
279  /*
280   * PRIVATE_ACCESS is a KVM-defined flag used to indicate that a fault occurred
281   * when the guest was accessing private memory.
282   */
283  #define PFERR_PRIVATE_ACCESS   BIT_ULL(49)
284  #define PFERR_SYNTHETIC_MASK   (PFERR_IMPLICIT_ACCESS | PFERR_PRIVATE_ACCESS)
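
/*
 * Illustrative sketch, not part of the original header: the synthetic bits
 * are KVM-internal and must never appear in an error code taken from
 * hardware, so a consumer might sanity-check and decode as below.  The
 * helper name is made up for the example.
 */
static inline bool pferr_is_write_sketch(u64 error_code)
{
	WARN_ON_ONCE(error_code & PFERR_SYNTHETIC_MASK);
	return error_code & PFERR_WRITE_MASK;
}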
285  
286  /* apic attention bits */
287  #define KVM_APIC_CHECK_VAPIC	0
288  /*
289   * The following bit is set with PV-EOI, unset on EOI.
290   * We detect PV-EOI changes made by the guest by comparing
291   * this bit with PV-EOI in guest memory.
292   * See the implementation in apic_update_pv_eoi.
293   */
294  #define KVM_APIC_PV_EOI_PENDING	1
295  
296  struct kvm_kernel_irq_routing_entry;
297  
298  /*
299   * kvm_mmu_page_role tracks the properties of a shadow page (where shadow page
300   * also includes TDP pages) to determine whether or not a page can be used in
301   * the given MMU context.  This is a subset of the overall kvm_cpu_role to
302   * minimize the size of kvm_memory_slot.arch.gfn_write_track, i.e. allows
303   * allocating 2 bytes per gfn instead of 4 bytes per gfn.
304   *
305   * Upper-level shadow pages having gptes are tracked for write-protection via
306   * gfn_write_track.  As above, gfn_write_track is a 16-bit counter, so KVM must
307   * not create more than 2^16-1 upper-level shadow pages at a single gfn,
308   * otherwise gfn_write_track will overflow and explosions will ensue.
309   *
310   * A unique shadow page (SP) for a gfn is created if and only if an existing SP
311   * cannot be reused.  The ability to reuse a SP is tracked by its role, which
312   * incorporates various mode bits and properties of the SP.  Roughly speaking,
313   * the number of unique SPs that can theoretically be created is 2^n, where n
314   * is the number of bits that are used to compute the role.
315   *
316   * But, even though there are 20 bits in the mask below, not all combinations
317   * of modes and flags are possible:
318   *
319   *   - invalid shadow pages are not accounted, mirror pages are not shadowed,
320   *     so the bits are effectively 18.
321   *
322   *   - quadrant will only be used if has_4_byte_gpte=1 (non-PAE paging);
323   *     execonly and ad_disabled are only used for nested EPT which has
324   *     has_4_byte_gpte=0.  Therefore, 2 bits are always unused.
325   *
326   *   - the 4 bits of level are effectively limited to the values 2/3/4/5,
327   *     as 4k SPs are not tracked (allowed to go unsync).  In addition non-PAE
328   *     paging has exactly one upper level, making level completely redundant
329   *     when has_4_byte_gpte=1.
330   *
331   *   - on top of this, smep_andnot_wp and smap_andnot_wp are only set if
332   *     cr0_wp=0, therefore these three bits only give rise to 5 possibilities.
333   *
334   * Therefore, the maximum number of possible upper-level shadow pages for a
335   * single gfn is a bit less than 2^13.
336   */
337  union kvm_mmu_page_role {
338  	u32 word;
339  	struct {
340  		unsigned level:4;
341  		unsigned has_4_byte_gpte:1;
342  		unsigned quadrant:2;
343  		unsigned direct:1;
344  		unsigned access:3;
345  		unsigned invalid:1;
346  		unsigned efer_nx:1;
347  		unsigned cr0_wp:1;
348  		unsigned smep_andnot_wp:1;
349  		unsigned smap_andnot_wp:1;
350  		unsigned ad_disabled:1;
351  		unsigned guest_mode:1;
352  		unsigned passthrough:1;
353  		unsigned is_mirror:1;
354  		unsigned :4;
355  
356  		/*
357  		 * This is left at the top of the word so that
358  		 * kvm_memslots_for_spte_role can extract it with a
359  		 * simple shift.  While there is room, give it a whole
360  		 * byte so it is also faster to load it from memory.
361  		 */
362  		unsigned smm:8;
363  	};
364  };
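
/*
 * Illustrative sketch, not part of the original header: because the role is
 * a packed 32-bit word, "can this shadow page be reused?" reduces to a
 * single integer compare.  The helper name is made up for the example.
 */
static inline bool kvm_mmu_role_matches_sketch(union kvm_mmu_page_role a,
					       union kvm_mmu_page_role b)
{
	return a.word == b.word;
}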
365  
366  /*
367   * kvm_mmu_extended_role complements kvm_mmu_page_role, tracking properties
368   * relevant to the current MMU configuration.   When loading CR0, CR4, or EFER,
369   * including on nested transitions, if nothing in the full role changes then
370   * MMU re-configuration can be skipped. The @valid bit is set on first use so
371   * that an all-zero structure is not treated as valid data.
372   *
373   * The properties that are tracked in the extended role but not the page role
374   * are for things that either (a) do not affect the validity of the shadow page
375   * or (b) are indirectly reflected in the shadow page's role.  For example,
376   * CR4.PKE only affects permission checks for software walks of the guest page
377   * tables (because KVM doesn't support Protection Keys with shadow paging), and
378   * CR0.PG, CR4.PAE, and CR4.PSE are indirectly reflected in role.level.
379   *
380   * Note, SMEP and SMAP are not redundant with sm*p_andnot_wp in the page role.
381   * If CR0.WP=1, KVM can reuse shadow pages for the guest regardless of SMEP and
382   * SMAP, but the MMU's permission checks for software walks need to be SMEP and
383   * SMAP aware regardless of CR0.WP.
384   */
385  union kvm_mmu_extended_role {
386  	u32 word;
387  	struct {
388  		unsigned int valid:1;
389  		unsigned int execonly:1;
390  		unsigned int cr4_pse:1;
391  		unsigned int cr4_pke:1;
392  		unsigned int cr4_smap:1;
393  		unsigned int cr4_smep:1;
394  		unsigned int cr4_la57:1;
395  		unsigned int efer_lma:1;
396  	};
397  };
398  
399  union kvm_cpu_role {
400  	u64 as_u64;
401  	struct {
402  		union kvm_mmu_page_role base;
403  		union kvm_mmu_extended_role ext;
404  	};
405  };
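
/*
 * Illustrative sketch, not part of the original header: per the comment on
 * kvm_mmu_extended_role, an MMU reconfiguration on CR0/CR4/EFER changes can
 * be skipped when the freshly computed role equals the current one.  The
 * helper name is made up for the example.
 */
static inline bool kvm_cpu_role_changed_sketch(union kvm_cpu_role old_role,
					       union kvm_cpu_role new_role)
{
	/* ext.valid is set on first use, so an all-zero role never matches. */
	return old_role.as_u64 != new_role.as_u64;
}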
406  
407  struct kvm_rmap_head {
408  	unsigned long val;
409  };
410  
411  struct kvm_pio_request {
412  	unsigned long linear_rip;
413  	unsigned long count;
414  	int in;
415  	int port;
416  	int size;
417  };
418  
419  #define PT64_ROOT_MAX_LEVEL 5
420  
421  struct rsvd_bits_validate {
422  	u64 rsvd_bits_mask[2][PT64_ROOT_MAX_LEVEL];
423  	u64 bad_mt_xwr;
424  };
425  
426  struct kvm_mmu_root_info {
427  	gpa_t pgd;
428  	hpa_t hpa;
429  };
430  
431  #define KVM_MMU_ROOT_INFO_INVALID \
432  	((struct kvm_mmu_root_info) { .pgd = INVALID_PAGE, .hpa = INVALID_PAGE })
433  
434  #define KVM_MMU_NUM_PREV_ROOTS 3
435  
436  #define KVM_MMU_ROOT_CURRENT		BIT(0)
437  #define KVM_MMU_ROOT_PREVIOUS(i)	BIT(1+i)
438  #define KVM_MMU_ROOTS_ALL		(BIT(1 + KVM_MMU_NUM_PREV_ROOTS) - 1)
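
/*
 * Illustrative sketch, not part of the original header: with three cached
 * previous roots, KVM_MMU_ROOTS_ALL covers the current root (bit 0) plus
 * bits 1..3, i.e. 0xf.  A caller selecting every root could build the same
 * mask as below; the helper name is made up for the example.
 */
static inline unsigned long kvm_mmu_all_roots_sketch(void)
{
	unsigned long mask = KVM_MMU_ROOT_CURRENT;
	int i;

	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
		mask |= KVM_MMU_ROOT_PREVIOUS(i);

	return mask;	/* == KVM_MMU_ROOTS_ALL */
}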
439  
440  #define KVM_HAVE_MMU_RWLOCK
441  
442  struct kvm_mmu_page;
443  struct kvm_page_fault;
444  
445  /*
446   * x86 supports 4 paging modes (5-level 64-bit, 4-level 64-bit, 3-level 32-bit,
447   * and 2-level 32-bit).  The kvm_mmu structure abstracts the details of the
448   * current mmu mode.
449   */
450  struct kvm_mmu {
451  	unsigned long (*get_guest_pgd)(struct kvm_vcpu *vcpu);
452  	u64 (*get_pdptr)(struct kvm_vcpu *vcpu, int index);
453  	int (*page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
454  	void (*inject_page_fault)(struct kvm_vcpu *vcpu,
455  				  struct x86_exception *fault);
456  	gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
457  			    gpa_t gva_or_gpa, u64 access,
458  			    struct x86_exception *exception);
459  	int (*sync_spte)(struct kvm_vcpu *vcpu,
460  			 struct kvm_mmu_page *sp, int i);
461  	struct kvm_mmu_root_info root;
462  	hpa_t mirror_root_hpa;
463  	union kvm_cpu_role cpu_role;
464  	union kvm_mmu_page_role root_role;
465  
466  	/*
467  	 * The pkru_mask indicates if protection key checks are needed.  It
468  	 * consists of 16 domains indexed by page fault error code bits [4:1],
469  	 * with PFEC.RSVD replaced by ACC_USER_MASK from the page tables.
470  	 * Each domain has 2 bits which are ANDed with AD and WD from PKRU.
471  	 */
472  	u32 pkru_mask;
473  
474  	struct kvm_mmu_root_info prev_roots[KVM_MMU_NUM_PREV_ROOTS];
475  
476  	/*
477  	 * Bitmap; bit set = permission fault
478  	 * Byte index: page fault error code [4:1]
479  	 * Bit index: pte permissions in ACC_* format
480  	 */
481  	u8 permissions[16];
482  
483  	u64 *pae_root;
484  	u64 *pml4_root;
485  	u64 *pml5_root;
486  
487  	/*
488  	 * Check for zero bits in shadow page table entries; these
489  	 * bits include not only hardware-reserved bits but also
490  	 * bits that SPTEs never use.
491  	 */
492  	struct rsvd_bits_validate shadow_zero_check;
493  
494  	struct rsvd_bits_validate guest_rsvd_check;
495  
496  	u64 pdptrs[4]; /* pae */
497  };
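
/*
 * Illustrative sketch, not part of the original header: a stripped-down
 * version of the permission check described by the permissions[] comment in
 * struct kvm_mmu above.  The real check (permission_fault() in
 * arch/x86/kvm/mmu.h) additionally folds in protection keys via pkru_mask;
 * the helper name here is made up for the example.
 */
static inline bool kvm_mmu_perm_fault_sketch(struct kvm_mmu *mmu,
					     unsigned int pfec,
					     unsigned int pte_access)
{
	/* Byte index: PFEC bits [4:1]; bit index: ACC_* permissions. */
	return (mmu->permissions[pfec >> 1] >> pte_access) & 1;
}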
498  
499  enum pmc_type {
500  	KVM_PMC_GP = 0,
501  	KVM_PMC_FIXED,
502  };
503  
504  struct kvm_pmc {
505  	enum pmc_type type;
506  	u8 idx;
507  	bool is_paused;
508  	bool intr;
509  	/*
510  	 * Base value of the PMC counter, relative to the *consumed* count in
511  	 * the associated perf_event.  This value includes counter updates from
512  	 * the perf_event and emulated_count since the last time the counter
513  	 * was reprogrammed, but it is *not* the current value as seen by the
514  	 * guest or userspace.
515  	 *
516  	 * The count is relative to the associated perf_event so that KVM
517  	 * doesn't need to reprogram the perf_event every time the guest writes
518  	 * to the counter.
519  	 */
520  	u64 counter;
521  	/*
522  	 * PMC events triggered by KVM emulation that haven't been fully
523  	 * processed, i.e. haven't undergone overflow detection.
524  	 */
525  	u64 emulated_counter;
526  	u64 eventsel;
527  	struct perf_event *perf_event;
528  	struct kvm_vcpu *vcpu;
529  	/*
530  	 * Only used when creating or reusing a perf_event: the eventsel
531  	 * value for general purpose counters, the fixed counter control
532  	 * value for fixed counters.
533  	 */
534  	u64 current_config;
535  };
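
/*
 * Illustrative sketch, not part of the original header: per the comments in
 * struct kvm_pmc, the value the guest observes is the stored base plus any
 * not-yet-folded emulated events plus whatever the perf_event has counted
 * since the last reprogram.  The real code also masks the result to the
 * counter's width; the helper name is made up for the example.
 */
static inline u64 kvm_pmc_read_sketch(struct kvm_pmc *pmc)
{
	u64 counter = pmc->counter + pmc->emulated_counter;
	u64 enabled, running;

	if (pmc->perf_event && !pmc->is_paused)
		counter += perf_event_read_value(pmc->perf_event,
						 &enabled, &running);
	return counter;
}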
536  
537  /* More counters may conflict with other existing Architectural MSRs */
538  #define KVM_MAX(a, b)	((a) >= (b) ? (a) : (b))
539  #define KVM_MAX_NR_INTEL_GP_COUNTERS	8
540  #define KVM_MAX_NR_AMD_GP_COUNTERS	6
541  #define KVM_MAX_NR_GP_COUNTERS		KVM_MAX(KVM_MAX_NR_INTEL_GP_COUNTERS, \
542  						KVM_MAX_NR_AMD_GP_COUNTERS)
543  
544  #define KVM_MAX_NR_INTEL_FIXED_COUNTERS	3
545  #define KVM_MAX_NR_AMD_FIXED_COUNTERS	0
546  #define KVM_MAX_NR_FIXED_COUNTERS	KVM_MAX(KVM_MAX_NR_INTEL_FIXED_COUNTERS, \
547  						KVM_MAX_NR_AMD_FIXED_COUNTERS)
548  
549  struct kvm_pmu {
550  	u8 version;
551  	unsigned nr_arch_gp_counters;
552  	unsigned nr_arch_fixed_counters;
553  	unsigned available_event_types;
554  	u64 fixed_ctr_ctrl;
555  	u64 fixed_ctr_ctrl_rsvd;
556  	u64 global_ctrl;
557  	u64 global_status;
558  	u64 counter_bitmask[2];
559  	u64 global_ctrl_rsvd;
560  	u64 global_status_rsvd;
561  	u64 reserved_bits;
562  	u64 raw_event_mask;
563  	struct kvm_pmc gp_counters[KVM_MAX_NR_GP_COUNTERS];
564  	struct kvm_pmc fixed_counters[KVM_MAX_NR_FIXED_COUNTERS];
565  
566  	/*
567  	 * Overlay the bitmap with a 64-bit atomic so that all bits can be
568  	 * set in a single access, e.g. to reprogram all counters when the PMU
569  	 * filter changes.
570  	 */
571  	union {
572  		DECLARE_BITMAP(reprogram_pmi, X86_PMC_IDX_MAX);
573  		atomic64_t __reprogram_pmi;
574  	};
575  	DECLARE_BITMAP(all_valid_pmc_idx, X86_PMC_IDX_MAX);
576  	DECLARE_BITMAP(pmc_in_use, X86_PMC_IDX_MAX);
577  
578  	u64 ds_area;
579  	u64 pebs_enable;
580  	u64 pebs_enable_rsvd;
581  	u64 pebs_data_cfg;
582  	u64 pebs_data_cfg_rsvd;
583  
584  	/*
585  	 * If a guest counter is cross-mapped to a host counter with a
586  	 * different index, its PEBS capability will be temporarily disabled.
587  	 *
588  	 * The user should make sure that this mask is updated
589  	 * after disabling interrupts and before perf_guest_get_msrs().
590  	 */
591  	u64 host_cross_mapped_mask;
592  
593  	/*
594  	 * Gate to ensure that perf_events not marked in pmc_in_use
595  	 * are released only once per vCPU time slice.
596  	 */
597  	bool need_cleanup;
598  
599  	/*
600  	 * The total number of programmed perf_events; it helps to avoid a
601  	 * redundant check before cleanup if the guest doesn't use the vPMU at all.
602  	 */
603  	u8 event_count;
604  };
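
/*
 * Illustrative sketch, not part of the original header: the bitmap/atomic
 * union in struct kvm_pmu lets a single atomic store mark every counter for
 * reprogramming (e.g. when the PMU event filter changes), while per-counter
 * updates keep using the regular bitmap helpers.  Helper names are made up
 * for the example.
 */
static inline void kvm_pmu_mark_all_sketch(struct kvm_pmu *pmu)
{
	atomic64_set(&pmu->__reprogram_pmi, -1);
}

static inline void kvm_pmu_mark_one_sketch(struct kvm_pmu *pmu, unsigned int idx)
{
	set_bit(idx, pmu->reprogram_pmi);
}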
605  
606  struct kvm_pmu_ops;
607  
608  enum {
609  	KVM_DEBUGREG_BP_ENABLED = 1,
610  	KVM_DEBUGREG_WONT_EXIT = 2,
611  };
612  
613  struct kvm_mtrr {
614  	u64 var[KVM_NR_VAR_MTRR * 2];
615  	u64 fixed_64k;
616  	u64 fixed_16k[2];
617  	u64 fixed_4k[8];
618  	u64 deftype;
619  };
620  
621  /* Hyper-V SynIC timer */
622  struct kvm_vcpu_hv_stimer {
623  	struct hrtimer timer;
624  	int index;
625  	union hv_stimer_config config;
626  	u64 count;
627  	u64 exp_time;
628  	struct hv_message msg;
629  	bool msg_pending;
630  };
631  
632  /* Hyper-V synthetic interrupt controller (SynIC)*/
633  struct kvm_vcpu_hv_synic {
634  	u64 version;
635  	u64 control;
636  	u64 msg_page;
637  	u64 evt_page;
638  	atomic64_t sint[HV_SYNIC_SINT_COUNT];
639  	atomic_t sint_to_gsi[HV_SYNIC_SINT_COUNT];
640  	DECLARE_BITMAP(auto_eoi_bitmap, 256);
641  	DECLARE_BITMAP(vec_bitmap, 256);
642  	bool active;
643  	bool dont_zero_synic_pages;
644  };
645  
646  /* The maximum number of entries on the TLB flush fifo. */
647  #define KVM_HV_TLB_FLUSH_FIFO_SIZE (16)
648  /*
649   * Note: the following 'magic' entry is made up by KVM to avoid putting
650   * anything besides GVA on the TLB flush fifo. It is theoretically possible
651   * to observe a request to flush 4095 PFNs starting from 0xfffffffffffff000
652   * which will look identical. KVM's action to 'flush everything' instead of
653   * flushing these particular addresses is, however, fully legitimate as
654   * flushing more than requested is always OK.
655   */
656  #define KVM_HV_TLB_FLUSHALL_ENTRY  ((u64)-1)
657  
658  enum hv_tlb_flush_fifos {
659  	HV_L1_TLB_FLUSH_FIFO,
660  	HV_L2_TLB_FLUSH_FIFO,
661  	HV_NR_TLB_FLUSH_FIFOS,
662  };
663  
664  struct kvm_vcpu_hv_tlb_flush_fifo {
665  	spinlock_t write_lock;
666  	DECLARE_KFIFO(entries, u64, KVM_HV_TLB_FLUSH_FIFO_SIZE);
667  };
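
/*
 * Illustrative sketch, not part of the original header: queueing a GVA on
 * the flush fifo and degrading to the 'flush everything' magic entry when
 * the fifo is full, as described above.  Locking is simplified and the
 * helper name is made up for the example.
 */
static inline void hv_tlb_flush_enqueue_sketch(struct kvm_vcpu_hv_tlb_flush_fifo *fifo,
					       u64 gva)
{
	spin_lock(&fifo->write_lock);
	if (!kfifo_put(&fifo->entries, gva)) {
		/* Full: drop the queued GVAs and record a single flush-all. */
		kfifo_reset_out(&fifo->entries);
		kfifo_put(&fifo->entries, KVM_HV_TLB_FLUSHALL_ENTRY);
	}
	spin_unlock(&fifo->write_lock);
}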
668  
669  /* Hyper-V per vcpu emulation context */
670  struct kvm_vcpu_hv {
671  	struct kvm_vcpu *vcpu;
672  	u32 vp_index;
673  	u64 hv_vapic;
674  	s64 runtime_offset;
675  	struct kvm_vcpu_hv_synic synic;
676  	struct kvm_hyperv_exit exit;
677  	struct kvm_vcpu_hv_stimer stimer[HV_SYNIC_STIMER_COUNT];
678  	DECLARE_BITMAP(stimer_pending_bitmap, HV_SYNIC_STIMER_COUNT);
679  	bool enforce_cpuid;
680  	struct {
681  		u32 features_eax; /* HYPERV_CPUID_FEATURES.EAX */
682  		u32 features_ebx; /* HYPERV_CPUID_FEATURES.EBX */
683  		u32 features_edx; /* HYPERV_CPUID_FEATURES.EDX */
684  		u32 enlightenments_eax; /* HYPERV_CPUID_ENLIGHTMENT_INFO.EAX */
685  		u32 enlightenments_ebx; /* HYPERV_CPUID_ENLIGHTMENT_INFO.EBX */
686  		u32 syndbg_cap_eax; /* HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES.EAX */
687  		u32 nested_eax; /* HYPERV_CPUID_NESTED_FEATURES.EAX */
688  		u32 nested_ebx; /* HYPERV_CPUID_NESTED_FEATURES.EBX */
689  	} cpuid_cache;
690  
691  	struct kvm_vcpu_hv_tlb_flush_fifo tlb_flush_fifo[HV_NR_TLB_FLUSH_FIFOS];
692  
693  	/* Preallocated buffer for handling hypercalls passing sparse vCPU set */
694  	u64 sparse_banks[HV_MAX_SPARSE_VCPU_BANKS];
695  
696  	struct hv_vp_assist_page vp_assist_page;
697  
698  	struct {
699  		u64 pa_page_gpa;
700  		u64 vm_id;
701  		u32 vp_id;
702  	} nested;
703  };
704  
705  struct kvm_hypervisor_cpuid {
706  	u32 base;
707  	u32 limit;
708  };
709  
710  #ifdef CONFIG_KVM_XEN
711  /* Xen HVM per vcpu emulation context */
712  struct kvm_vcpu_xen {
713  	u64 hypercall_rip;
714  	u32 current_runstate;
715  	u8 upcall_vector;
716  	struct gfn_to_pfn_cache vcpu_info_cache;
717  	struct gfn_to_pfn_cache vcpu_time_info_cache;
718  	struct gfn_to_pfn_cache runstate_cache;
719  	struct gfn_to_pfn_cache runstate2_cache;
720  	u64 last_steal;
721  	u64 runstate_entry_time;
722  	u64 runstate_times[4];
723  	unsigned long evtchn_pending_sel;
724  	u32 vcpu_id; /* The Xen / ACPI vCPU ID */
725  	u32 timer_virq;
726  	u64 timer_expires; /* In guest epoch */
727  	atomic_t timer_pending;
728  	struct hrtimer timer;
729  	int poll_evtchn;
730  	struct timer_list poll_timer;
731  	struct kvm_hypervisor_cpuid cpuid;
732  };
733  #endif
734  
735  struct kvm_queued_exception {
736  	bool pending;
737  	bool injected;
738  	bool has_error_code;
739  	u8 vector;
740  	u32 error_code;
741  	unsigned long payload;
742  	bool has_payload;
743  };
744  
745  /*
746   * Hardware-defined CPUID leafs that are either scattered by the kernel or are
747   * unknown to the kernel, but need to be directly used by KVM.  Note, these
748   * word values conflict with the kernel's "bug" caps, but KVM doesn't use those.
749   */
750  enum kvm_only_cpuid_leafs {
751  	CPUID_12_EAX	 = NCAPINTS,
752  	CPUID_7_1_EDX,
753  	CPUID_8000_0007_EDX,
754  	CPUID_8000_0022_EAX,
755  	CPUID_7_2_EDX,
756  	CPUID_24_0_EBX,
757  	NR_KVM_CPU_CAPS,
758  
759  	NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS,
760  };
761  
762  struct kvm_vcpu_arch {
763  	/*
764  	 * rip and regs accesses must go through
765  	 * kvm_{register,rip}_{read,write} functions.
766  	 */
767  	unsigned long regs[NR_VCPU_REGS];
768  	u32 regs_avail;
769  	u32 regs_dirty;
770  
771  	unsigned long cr0;
772  	unsigned long cr0_guest_owned_bits;
773  	unsigned long cr2;
774  	unsigned long cr3;
775  	unsigned long cr4;
776  	unsigned long cr4_guest_owned_bits;
777  	unsigned long cr4_guest_rsvd_bits;
778  	unsigned long cr8;
779  	u32 host_pkru;
780  	u32 pkru;
781  	u32 hflags;
782  	u64 efer;
783  	u64 host_debugctl;
784  	u64 apic_base;
785  	struct kvm_lapic *apic;    /* kernel irqchip context */
786  	bool load_eoi_exitmap_pending;
787  	DECLARE_BITMAP(ioapic_handled_vectors, 256);
788  	unsigned long apic_attention;
789  	int32_t apic_arb_prio;
790  	int mp_state;
791  	u64 ia32_misc_enable_msr;
792  	u64 smbase;
793  	u64 smi_count;
794  	bool at_instruction_boundary;
795  	bool tpr_access_reporting;
796  	bool xfd_no_write_intercept;
797  	u64 ia32_xss;
798  	u64 microcode_version;
799  	u64 arch_capabilities;
800  	u64 perf_capabilities;
801  
802  	/*
803  	 * Paging state of the vcpu
804  	 *
805  	 * If the vcpu runs in guest mode with two-level paging, this still saves
806  	 * the paging mode of the l1 guest. This context is always used to
807  	 * handle faults.
808  	 */
809  	struct kvm_mmu *mmu;
810  
811  	/* Non-nested MMU for L1 */
812  	struct kvm_mmu root_mmu;
813  
814  	/* L1 MMU when running nested */
815  	struct kvm_mmu guest_mmu;
816  
817  	/*
818  	 * Paging state of an L2 guest (used for nested npt)
819  	 *
820  	 * This context will save all necessary information to walk page tables
821  	 * of an L2 guest. This context is only initialized for page table
822  	 * walking and not for faulting since we never handle l2 page faults on
823  	 * the host.
824  	 */
825  	struct kvm_mmu nested_mmu;
826  
827  	/*
828  	 * Pointer to the mmu context currently used for
829  	 * gva_to_gpa translations.
830  	 */
831  	struct kvm_mmu *walk_mmu;
832  
833  	struct kvm_mmu_memory_cache mmu_pte_list_desc_cache;
834  	struct kvm_mmu_memory_cache mmu_shadow_page_cache;
835  	struct kvm_mmu_memory_cache mmu_shadowed_info_cache;
836  	struct kvm_mmu_memory_cache mmu_page_header_cache;
837  	/*
838  	 * This cache is used to allocate external page tables, e.g. the
839  	 * private EPT used by the TDX module.
840  	 */
841  	struct kvm_mmu_memory_cache mmu_external_spt_cache;
842  
843  	/*
844  	 * QEMU userspace and the guest each have their own FPU state.
845  	 * In vcpu_run, we switch between the user and guest FPU contexts.
846  	 * While running a VCPU, the VCPU thread will have the guest FPU
847  	 * context.
848  	 *
849  	 * Note that while the PKRU state lives inside the fpu registers,
850  	 * it is switched out separately at VMENTER and VMEXIT time. The
851  	 * "guest_fpstate" state here contains the guest FPU context, with the
852  	 * host PKRU bits.
853  	 */
854  	struct fpu_guest guest_fpu;
855  
856  	u64 xcr0;
857  	u64 guest_supported_xcr0;
858  
859  	struct kvm_pio_request pio;
860  	void *pio_data;
861  	void *sev_pio_data;
862  	unsigned sev_pio_count;
863  
864  	u8 event_exit_inst_len;
865  
866  	bool exception_from_userspace;
867  
868  	/* Exceptions to be injected to the guest. */
869  	struct kvm_queued_exception exception;
870  	/* Exception VM-Exits to be synthesized to L1. */
871  	struct kvm_queued_exception exception_vmexit;
872  
873  	struct kvm_queued_interrupt {
874  		bool injected;
875  		bool soft;
876  		u8 nr;
877  	} interrupt;
878  
879  	int halt_request; /* real mode on Intel only */
880  
881  	int cpuid_nent;
882  	struct kvm_cpuid_entry2 *cpuid_entries;
883  	bool is_amd_compatible;
884  
885  	/*
886  	 * cpu_caps holds the effective guest capabilities, i.e. the features
887  	 * the vCPU is allowed to use.  Typically, but not always, features can
888  	 * be used by the guest if and only if both KVM and userspace want to
889  	 * expose the feature to the guest.
890  	 *
891  	 * A common exception is for virtualization holes, i.e. when KVM can't
892  	 * prevent the guest from using a feature, in which case the vCPU "has"
893  	 * the feature regardless of what KVM or userspace desires.
894  	 *
895  	 * Note, features that don't require KVM involvement in any way are
896  	 * NOT enforced/sanitized by KVM, i.e. are taken verbatim from the
897  	 * guest CPUID provided by userspace.
898  	 */
899  	u32 cpu_caps[NR_KVM_CPU_CAPS];
900  
901  	u64 reserved_gpa_bits;
902  	int maxphyaddr;
903  
904  	/* emulate context */
905  
906  	struct x86_emulate_ctxt *emulate_ctxt;
907  	bool emulate_regs_need_sync_to_vcpu;
908  	bool emulate_regs_need_sync_from_vcpu;
909  	int (*complete_userspace_io)(struct kvm_vcpu *vcpu);
910  
911  	gpa_t time;
912  	struct pvclock_vcpu_time_info hv_clock;
913  	unsigned int hw_tsc_khz;
914  	struct gfn_to_pfn_cache pv_time;
915  	/* set guest stopped flag in pvclock flags field */
916  	bool pvclock_set_guest_stopped_request;
917  
918  	struct {
919  		u8 preempted;
920  		u64 msr_val;
921  		u64 last_steal;
922  		struct gfn_to_hva_cache cache;
923  	} st;
924  
925  	u64 l1_tsc_offset;
926  	u64 tsc_offset; /* current tsc offset */
927  	u64 last_guest_tsc;
928  	u64 last_host_tsc;
929  	u64 tsc_offset_adjustment;
930  	u64 this_tsc_nsec;
931  	u64 this_tsc_write;
932  	u64 this_tsc_generation;
933  	bool tsc_catchup;
934  	bool tsc_always_catchup;
935  	s8 virtual_tsc_shift;
936  	u32 virtual_tsc_mult;
937  	u32 virtual_tsc_khz;
938  	s64 ia32_tsc_adjust_msr;
939  	u64 msr_ia32_power_ctl;
940  	u64 l1_tsc_scaling_ratio;
941  	u64 tsc_scaling_ratio; /* current scaling ratio */
942  
943  	atomic_t nmi_queued;  /* unprocessed asynchronous NMIs */
944  	/* Number of NMIs pending injection, not including hardware vNMIs. */
945  	unsigned int nmi_pending;
946  	bool nmi_injected;    /* Trying to inject an NMI this entry */
947  	bool smi_pending;    /* SMI queued after currently running handler */
948  	u8 handling_intr_from_guest;
949  
950  	struct kvm_mtrr mtrr_state;
951  	u64 pat;
952  
953  	unsigned switch_db_regs;
954  	unsigned long db[KVM_NR_DB_REGS];
955  	unsigned long dr6;
956  	unsigned long dr7;
957  	unsigned long eff_db[KVM_NR_DB_REGS];
958  	unsigned long guest_debug_dr7;
959  	u64 msr_platform_info;
960  	u64 msr_misc_features_enables;
961  
962  	u64 mcg_cap;
963  	u64 mcg_status;
964  	u64 mcg_ctl;
965  	u64 mcg_ext_ctl;
966  	u64 *mce_banks;
967  	u64 *mci_ctl2_banks;
968  
969  	/* Cache MMIO info */
970  	u64 mmio_gva;
971  	unsigned mmio_access;
972  	gfn_t mmio_gfn;
973  	u64 mmio_gen;
974  
975  	struct kvm_pmu pmu;
976  
977  	/* used for guest single stepping over the given code position */
978  	unsigned long singlestep_rip;
979  
980  #ifdef CONFIG_KVM_HYPERV
981  	bool hyperv_enabled;
982  	struct kvm_vcpu_hv *hyperv;
983  #endif
984  #ifdef CONFIG_KVM_XEN
985  	struct kvm_vcpu_xen xen;
986  #endif
987  	cpumask_var_t wbinvd_dirty_mask;
988  
989  	unsigned long last_retry_eip;
990  	unsigned long last_retry_addr;
991  
992  	struct {
993  		bool halted;
994  		gfn_t gfns[ASYNC_PF_PER_VCPU];
995  		struct gfn_to_hva_cache data;
996  		u64 msr_en_val; /* MSR_KVM_ASYNC_PF_EN */
997  		u64 msr_int_val; /* MSR_KVM_ASYNC_PF_INT */
998  		u16 vec;
999  		u32 id;
1000  		bool send_user_only;
1001  		u32 host_apf_flags;
1002  		bool delivery_as_pf_vmexit;
1003  		bool pageready_pending;
1004  	} apf;
1005  
1006  	/* OSVW MSRs (AMD only) */
1007  	struct {
1008  		u64 length;
1009  		u64 status;
1010  	} osvw;
1011  
1012  	struct {
1013  		u64 msr_val;
1014  		struct gfn_to_hva_cache data;
1015  	} pv_eoi;
1016  
1017  	u64 msr_kvm_poll_control;
1018  
1019  	/* pv related host specific info */
1020  	struct {
1021  		bool pv_unhalted;
1022  	} pv;
1023  
1024  	int pending_ioapic_eoi;
1025  	int pending_external_vector;
1026  
1027  	/* whether the vCPU was preempted while in kernel mode (CPL = 0) */
1028  	bool preempted_in_kernel;
1029  
1030  	/* Flush the L1 Data cache for L1TF mitigation on VMENTER */
1031  	bool l1tf_flush_l1d;
1032  
1033  	/* Host CPU on which VM-entry was most recently attempted */
1034  	int last_vmentry_cpu;
1035  
1036  	/* AMD MSRC001_0015 Hardware Configuration */
1037  	u64 msr_hwcr;
1038  
1039  	/* pv related cpuid info */
1040  	struct {
1041  		/*
1042  		 * value of the eax register in the KVM_CPUID_FEATURES CPUID
1043  		 * leaf.
1044  		 */
1045  		u32 features;
1046  
1047  		/*
1048  		 * indicates whether pv emulation should be disabled if features
1049  		 * are not present in the guest's cpuid
1050  		 */
1051  		bool enforce;
1052  	} pv_cpuid;
1053  
1054  	/* Protected Guests */
1055  	bool guest_state_protected;
1056  
1057  	/*
1058  	 * Set when the PDPTRs were loaded directly by userspace without
1059  	 * reading guest memory.
1060  	 */
1061  	bool pdptrs_from_userspace;
1062  
1063  #if IS_ENABLED(CONFIG_HYPERV)
1064  	hpa_t hv_root_tdp;
1065  #endif
1066  };
1067  
1068  struct kvm_lpage_info {
1069  	int disallow_lpage;
1070  };
1071  
1072  struct kvm_arch_memory_slot {
1073  	struct kvm_rmap_head *rmap[KVM_NR_PAGE_SIZES];
1074  	struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1];
1075  	unsigned short *gfn_write_track;
1076  };
1077  
1078  /*
1079   * Track the mode of the optimized logical map, as the rules for decoding the
1080   * destination vary per mode.  Enabling the optimized logical map requires all
1081   * software-enabled local APIs to be in the same mode, each addressable APIC to
1082   * software-enabled local APICs to be in the same mode, each addressable APIC to
1083   */
1084  enum kvm_apic_logical_mode {
1085  	/* All local APICs are software disabled. */
1086  	KVM_APIC_MODE_SW_DISABLED,
1087  	/* All software enabled local APICs in xAPIC cluster addressing mode. */
1088  	KVM_APIC_MODE_XAPIC_CLUSTER,
1089  	/* All software enabled local APICs in xAPIC flat addressing mode. */
1090  	KVM_APIC_MODE_XAPIC_FLAT,
1091  	/* All software enabled local APICs in x2APIC mode. */
1092  	KVM_APIC_MODE_X2APIC,
1093  	/*
1094  	 * Optimized map disabled, e.g. not all local APICs in the same logical
1095  	 * mode, same logical ID assigned to multiple APICs, etc.
1096  	 */
1097  	KVM_APIC_MODE_MAP_DISABLED,
1098  };
1099  
1100  struct kvm_apic_map {
1101  	struct rcu_head rcu;
1102  	enum kvm_apic_logical_mode logical_mode;
1103  	u32 max_apic_id;
1104  	union {
1105  		struct kvm_lapic *xapic_flat_map[8];
1106  		struct kvm_lapic *xapic_cluster_map[16][4];
1107  	};
1108  	struct kvm_lapic *phys_map[];
1109  };
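
/*
 * Illustrative sketch, not part of the original header: with the optimized
 * map in xAPIC flat mode, a logical destination is an 8-bit mask and each
 * set bit selects one entry of xapic_flat_map[]; cluster mode instead splits
 * the MDA into a 4-bit cluster ID and a 4-bit mask into xapic_cluster_map[][].
 * The helper below only handles flat mode and its name is made up.
 */
static inline unsigned int kvm_apic_map_flat_sketch(struct kvm_apic_map *map,
						    u8 mda, struct kvm_lapic **dst)
{
	unsigned long mask = mda;
	unsigned int n = 0;
	int bit;

	for_each_set_bit(bit, &mask, 8)
		dst[n++] = map->xapic_flat_map[bit];

	return n;
}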
1110  
1111  /* Hyper-V synthetic debugger (SynDbg)*/
1112  struct kvm_hv_syndbg {
1113  	struct {
1114  		u64 control;
1115  		u64 status;
1116  		u64 send_page;
1117  		u64 recv_page;
1118  		u64 pending_page;
1119  	} control;
1120  	u64 options;
1121  };
1122  
1123  /* Current state of Hyper-V TSC page clocksource */
1124  enum hv_tsc_page_status {
1125  	/* TSC page was not set up or disabled */
1126  	HV_TSC_PAGE_UNSET = 0,
1127  	/* TSC page MSR was written by the guest, update pending */
1128  	HV_TSC_PAGE_GUEST_CHANGED,
1129  	/* TSC page update was triggered from the host side */
1130  	HV_TSC_PAGE_HOST_CHANGED,
1131  	/* TSC page was properly set up and is currently active  */
1132  	HV_TSC_PAGE_SET,
1133  	/* TSC page was set up with an inaccessible GPA */
1134  	HV_TSC_PAGE_BROKEN,
1135  };
1136  
1137  #ifdef CONFIG_KVM_HYPERV
1138  /* Hyper-V emulation context */
1139  struct kvm_hv {
1140  	struct mutex hv_lock;
1141  	u64 hv_guest_os_id;
1142  	u64 hv_hypercall;
1143  	u64 hv_tsc_page;
1144  	enum hv_tsc_page_status hv_tsc_page_status;
1145  
1146  	/* Hyper-V based guest crash (NT kernel bugcheck) parameters */
1147  	u64 hv_crash_param[HV_X64_MSR_CRASH_PARAMS];
1148  	u64 hv_crash_ctl;
1149  
1150  	struct ms_hyperv_tsc_page tsc_ref;
1151  
1152  	struct idr conn_to_evt;
1153  
1154  	u64 hv_reenlightenment_control;
1155  	u64 hv_tsc_emulation_control;
1156  	u64 hv_tsc_emulation_status;
1157  	u64 hv_invtsc_control;
1158  
1159  	/* How many vCPUs have VP index != vCPU index */
1160  	atomic_t num_mismatched_vp_indexes;
1161  
1162  	/*
1163  	 * How many SynICs use the 'AutoEOI' feature
1164  	 * (protected by arch.apicv_update_lock)
1165  	 */
1166  	unsigned int synic_auto_eoi_used;
1167  
1168  	struct kvm_hv_syndbg hv_syndbg;
1169  
1170  	bool xsaves_xsavec_checked;
1171  };
1172  #endif
1173  
1174  struct msr_bitmap_range {
1175  	u32 flags;
1176  	u32 nmsrs;
1177  	u32 base;
1178  	unsigned long *bitmap;
1179  };
1180  
1181  #ifdef CONFIG_KVM_XEN
1182  /* Xen emulation context */
1183  struct kvm_xen {
1184  	struct mutex xen_lock;
1185  	u32 xen_version;
1186  	bool long_mode;
1187  	bool runstate_update_flag;
1188  	u8 upcall_vector;
1189  	struct gfn_to_pfn_cache shinfo_cache;
1190  	struct idr evtchn_ports;
1191  	unsigned long poll_mask[BITS_TO_LONGS(KVM_MAX_VCPUS)];
1192  };
1193  #endif
1194  
1195  enum kvm_irqchip_mode {
1196  	KVM_IRQCHIP_NONE,
1197  	KVM_IRQCHIP_KERNEL,       /* created with KVM_CREATE_IRQCHIP */
1198  	KVM_IRQCHIP_SPLIT,        /* created with KVM_CAP_SPLIT_IRQCHIP */
1199  };
1200  
1201  struct kvm_x86_msr_filter {
1202  	u8 count;
1203  	bool default_allow:1;
1204  	struct msr_bitmap_range ranges[16];
1205  };
1206  
1207  struct kvm_x86_pmu_event_filter {
1208  	__u32 action;
1209  	__u32 nevents;
1210  	__u32 fixed_counter_bitmap;
1211  	__u32 flags;
1212  	__u32 nr_includes;
1213  	__u32 nr_excludes;
1214  	__u64 *includes;
1215  	__u64 *excludes;
1216  	__u64 events[];
1217  };
1218  
1219  enum kvm_apicv_inhibit {
1220  
1221  	/********************************************************************/
1222  	/* INHIBITs that are relevant to both Intel's APICv and AMD's AVIC. */
1223  	/********************************************************************/
1224  
1225  	/*
1226  	 * APIC acceleration is disabled by a module parameter
1227  	 * and/or not supported in hardware.
1228  	 */
1229  	APICV_INHIBIT_REASON_DISABLED,
1230  
1231  	/*
1232  	 * APIC acceleration is inhibited because the AutoEOI feature is
1233  	 * being used by a Hyper-V guest.
1234  	 */
1235  	APICV_INHIBIT_REASON_HYPERV,
1236  
1237  	/*
1238  	 * APIC acceleration is inhibited because userspace hasn't yet
1239  	 * enabled the kernel/split irqchip.
1240  	 */
1241  	APICV_INHIBIT_REASON_ABSENT,
1242  
1243  	/* APIC acceleration is inhibited because KVM_GUESTDBG_BLOCKIRQ
1244  	 * (an out-of-band debug measure that blocks all interrupts on this vCPU)
1245  	 * was enabled, to avoid AVIC/APICv bypassing it.
1246  	 */
1247  	APICV_INHIBIT_REASON_BLOCKIRQ,
1248  
1249  	/*
1250  	 * APICv is disabled because not all vCPUs have a 1:1 mapping between
1251  	 * APIC ID and vCPU, _and_ KVM is not applying its x2APIC hotplug hack.
1252  	 */
1253  	APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED,
1254  
1255  	/*
1256  	 * For simplicity, APIC acceleration is inhibited the
1257  	 * first time either the APIC ID or the APIC base is changed by the guest
1258  	 * from its reset value.
1259  	 */
1260  	APICV_INHIBIT_REASON_APIC_ID_MODIFIED,
1261  	APICV_INHIBIT_REASON_APIC_BASE_MODIFIED,
1262  
1263  	/******************************************************/
1264  	/* INHIBITs that are relevant only to the AMD's AVIC. */
1265  	/******************************************************/
1266  
1267  	/*
1268  	 * AVIC is inhibited on a vCPU because it runs a nested guest.
1269  	 *
1270  	 * This is needed because unlike APICv, the peers of this vCPU
1271  	 * cannot use the doorbell mechanism to signal interrupts via AVIC when
1272  	 * a vCPU runs nested.
1273  	 */
1274  	APICV_INHIBIT_REASON_NESTED,
1275  
1276  	/*
1277  	 * On SVM, the wait for the IRQ window is implemented with pending vIRQ,
1278  	 * which cannot be injected when the AVIC is enabled, thus AVIC
1279  	 * is inhibited while KVM waits for IRQ window.
1280  	 */
1281  	APICV_INHIBIT_REASON_IRQWIN,
1282  
1283  	/*
1284  	 * PIT (i8254) 're-inject' mode relies on EOI intercept,
1285  	 * which AVIC doesn't support for edge-triggered interrupts.
1286  	 */
1287  	APICV_INHIBIT_REASON_PIT_REINJ,
1288  
1289  	/*
1290  	 * AVIC is disabled because SEV doesn't support it.
1291  	 */
1292  	APICV_INHIBIT_REASON_SEV,
1293  
1294  	/*
1295  	 * AVIC is disabled because not all vCPUs with a valid LDR have a 1:1
1296  	 * mapping between logical ID and vCPU.
1297  	 */
1298  	APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED,
1299  
1300  	NR_APICV_INHIBIT_REASONS,
1301  };
1302  
1303  #define __APICV_INHIBIT_REASON(reason)			\
1304  	{ BIT(APICV_INHIBIT_REASON_##reason), #reason }
1305  
1306  #define APICV_INHIBIT_REASONS				\
1307  	__APICV_INHIBIT_REASON(DISABLED),		\
1308  	__APICV_INHIBIT_REASON(HYPERV),			\
1309  	__APICV_INHIBIT_REASON(ABSENT),			\
1310  	__APICV_INHIBIT_REASON(BLOCKIRQ),		\
1311  	__APICV_INHIBIT_REASON(PHYSICAL_ID_ALIASED),	\
1312  	__APICV_INHIBIT_REASON(APIC_ID_MODIFIED),	\
1313  	__APICV_INHIBIT_REASON(APIC_BASE_MODIFIED),	\
1314  	__APICV_INHIBIT_REASON(NESTED),			\
1315  	__APICV_INHIBIT_REASON(IRQWIN),			\
1316  	__APICV_INHIBIT_REASON(PIT_REINJ),		\
1317  	__APICV_INHIBIT_REASON(SEV),			\
1318  	__APICV_INHIBIT_REASON(LOGICAL_ID_ALIASED)
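
/*
 * Illustrative sketch, not part of the original header: the inhibit reasons
 * form a bitmap (kvm->arch.apicv_inhibit_reasons) and APICv/AVIC is active
 * only while that bitmap is empty.  The real update path also takes
 * arch.apicv_update_lock for write and reacts to 0<->nonzero transitions;
 * the helper names below are made up for the example.
 */
static inline void kvm_apicv_update_sketch(unsigned long *inhibit_reasons,
					   enum kvm_apicv_inhibit reason, bool set)
{
	if (set)
		__set_bit(reason, inhibit_reasons);
	else
		__clear_bit(reason, inhibit_reasons);
}

static inline bool kvm_apicv_active_sketch(unsigned long inhibit_reasons)
{
	return !inhibit_reasons;
}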
1319  
1320  struct kvm_arch {
1321  	unsigned long n_used_mmu_pages;
1322  	unsigned long n_requested_mmu_pages;
1323  	unsigned long n_max_mmu_pages;
1324  	unsigned int indirect_shadow_pages;
1325  	u8 mmu_valid_gen;
1326  	u8 vm_type;
1327  	bool has_private_mem;
1328  	bool has_protected_state;
1329  	bool pre_fault_allowed;
1330  	struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
1331  	struct list_head active_mmu_pages;
1332  	/*
1333  	 * A list of kvm_mmu_page structs that, if zapped, could possibly be
1334  	 * replaced by an NX huge page.  A shadow page is on this list if its
1335  	 * existence disallows an NX huge page (nx_huge_page_disallowed is set)
1336  	 * and there are no other conditions that prevent a huge page, e.g.
1337  	 * the backing host page is huge, dirty logging is not enabled for its
1338  	 * memslot, etc...  Note, zapping shadow pages on this list doesn't
1339  	 * guarantee an NX huge page will be created in its stead, e.g. if the
1340  	 * guest attempts to execute from the region then KVM obviously can't
1341  	 * create an NX huge page (without hanging the guest).
1342  	 */
1343  	struct list_head possible_nx_huge_pages;
1344  #ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING
1345  	struct kvm_page_track_notifier_head track_notifier_head;
1346  #endif
1347  	/*
1348  	 * Protects marking pages unsync during page faults, as TDP MMU page
1349  	 * faults only take mmu_lock for read.  For simplicity, the unsync
1350  	 * pages lock is always taken when marking pages unsync regardless of
1351  	 * whether mmu_lock is held for read or write.
1352  	 */
1353  	spinlock_t mmu_unsync_pages_lock;
1354  
1355  	u64 shadow_mmio_value;
1356  
1357  	struct iommu_domain *iommu_domain;
1358  	bool iommu_noncoherent;
1359  #define __KVM_HAVE_ARCH_NONCOHERENT_DMA
1360  	atomic_t noncoherent_dma_count;
1361  #define __KVM_HAVE_ARCH_ASSIGNED_DEVICE
1362  	atomic_t assigned_device_count;
1363  	struct kvm_pic *vpic;
1364  	struct kvm_ioapic *vioapic;
1365  	struct kvm_pit *vpit;
1366  	atomic_t vapics_in_nmi_mode;
1367  	struct mutex apic_map_lock;
1368  	struct kvm_apic_map __rcu *apic_map;
1369  	atomic_t apic_map_dirty;
1370  
1371  	bool apic_access_memslot_enabled;
1372  	bool apic_access_memslot_inhibited;
1373  
1374  	/* Protects apicv_inhibit_reasons */
1375  	struct rw_semaphore apicv_update_lock;
1376  	unsigned long apicv_inhibit_reasons;
1377  
1378  	gpa_t wall_clock;
1379  
1380  	bool mwait_in_guest;
1381  	bool hlt_in_guest;
1382  	bool pause_in_guest;
1383  	bool cstate_in_guest;
1384  
1385  	unsigned long irq_sources_bitmap;
1386  	s64 kvmclock_offset;
1387  
1388  	/*
1389  	 * This also protects nr_vcpus_matched_tsc which is read from a
1390  	 * preemption-disabled region, so it must be a raw spinlock.
1391  	 */
1392  	raw_spinlock_t tsc_write_lock;
1393  	u64 last_tsc_nsec;
1394  	u64 last_tsc_write;
1395  	u32 last_tsc_khz;
1396  	u64 last_tsc_offset;
1397  	u64 cur_tsc_nsec;
1398  	u64 cur_tsc_write;
1399  	u64 cur_tsc_offset;
1400  	u64 cur_tsc_generation;
1401  	int nr_vcpus_matched_tsc;
1402  
1403  	u32 default_tsc_khz;
1404  	bool user_set_tsc;
1405  	u64 apic_bus_cycle_ns;
1406  
1407  	seqcount_raw_spinlock_t pvclock_sc;
1408  	bool use_master_clock;
1409  	u64 master_kernel_ns;
1410  	u64 master_cycle_now;
1411  	struct delayed_work kvmclock_update_work;
1412  	struct delayed_work kvmclock_sync_work;
1413  
1414  	struct kvm_xen_hvm_config xen_hvm_config;
1415  
1416  	/* reads protected by irq_srcu, writes by irq_lock */
1417  	struct hlist_head mask_notifier_list;
1418  
1419  #ifdef CONFIG_KVM_HYPERV
1420  	struct kvm_hv hyperv;
1421  #endif
1422  
1423  #ifdef CONFIG_KVM_XEN
1424  	struct kvm_xen xen;
1425  #endif
1426  
1427  	bool backwards_tsc_observed;
1428  	bool boot_vcpu_runs_old_kvmclock;
1429  	u32 bsp_vcpu_id;
1430  
1431  	u64 disabled_quirks;
1432  
1433  	enum kvm_irqchip_mode irqchip_mode;
1434  	u8 nr_reserved_ioapic_pins;
1435  
1436  	bool disabled_lapic_found;
1437  
1438  	bool x2apic_format;
1439  	bool x2apic_broadcast_quirk_disabled;
1440  
1441  	bool guest_can_read_msr_platform_info;
1442  	bool exception_payload_enabled;
1443  
1444  	bool triple_fault_event;
1445  
1446  	bool bus_lock_detection_enabled;
1447  	bool enable_pmu;
1448  
1449  	u32 notify_window;
1450  	u32 notify_vmexit_flags;
1451  	/*
1452  	 * If exit_on_emulation_error is set, and the in-kernel instruction
1453  	 * emulator fails to emulate an instruction, allow userspace
1454  	 * the opportunity to look at it.
1455  	 */
1456  	bool exit_on_emulation_error;
1457  
1458  	/* Deflect RDMSR and WRMSR to user space when they trigger a #GP */
1459  	u32 user_space_msr_mask;
1460  	struct kvm_x86_msr_filter __rcu *msr_filter;
1461  
1462  	u32 hypercall_exit_enabled;
1463  
1464  	/* Guest can access the SGX PROVISIONKEY. */
1465  	bool sgx_provisioning_allowed;
1466  
1467  	struct kvm_x86_pmu_event_filter __rcu *pmu_event_filter;
1468  	struct vhost_task *nx_huge_page_recovery_thread;
1469  	u64 nx_huge_page_last;
1470  	struct once nx_once;
1471  
1472  #ifdef CONFIG_X86_64
1473  	/* The number of TDP MMU pages across all roots. */
1474  	atomic64_t tdp_mmu_pages;
1475  
1476  	/*
1477  	 * List of struct kvm_mmu_pages being used as roots.
1478  	 * All struct kvm_mmu_pages in the list should have
1479  	 * tdp_mmu_page set.
1480  	 *
1481  	 * For reads, this list is protected by:
1482  	 *	the MMU lock in read mode + RCU or
1483  	 *	the MMU lock in write mode
1484  	 *
1485  	 * For writes, this list is protected by tdp_mmu_pages_lock; see
1486  	 * below for the details.
1487  	 *
1488  	 * Roots will remain in the list until their tdp_mmu_root_count
1489  	 * drops to zero, at which point the thread that decremented the
1490  	 * count to zero should remove the root from the list and clean
1491  	 * it up, freeing the root after an RCU grace period.
1492  	 */
1493  	struct list_head tdp_mmu_roots;
1494  
1495  	/*
1496  	 * Protects accesses to the following fields when the MMU lock
1497  	 * is held in read mode:
1498  	 *  - tdp_mmu_roots (above)
1499  	 *  - the link field of kvm_mmu_page structs used by the TDP MMU
1500  	 *  - possible_nx_huge_pages;
1501  	 *  - the possible_nx_huge_page_link field of kvm_mmu_page structs used
1502  	 *    by the TDP MMU
1503  	 * Because the lock is only taken within the MMU lock, strictly
1504  	 * speaking it is redundant to acquire this lock when the thread
1505  	 * holds the MMU lock in write mode.  However, it often simplifies
1506  	 * the code to do so.
1507  	 */
1508  	spinlock_t tdp_mmu_pages_lock;
1509  #endif /* CONFIG_X86_64 */
1510  
1511  	/*
1512  	 * If set, at least one shadow root has been allocated. This flag
1513  	 * is used as one input when determining whether certain memslot
1514  	 * related allocations are necessary.
1515  	 */
1516  	bool shadow_root_allocated;
1517  
1518  #ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING
1519  	/*
1520  	 * If set, the VM has (or had) an external write tracking user, and
1521  	 * thus all write tracking metadata has been allocated, even if KVM
1522  	 * itself isn't using write tracking.
1523  	 */
1524  	bool external_write_tracking_enabled;
1525  #endif
1526  
1527  #if IS_ENABLED(CONFIG_HYPERV)
1528  	hpa_t	hv_root_tdp;
1529  	spinlock_t hv_root_tdp_lock;
1530  	struct hv_partition_assist_pg *hv_pa_pg;
1531  #endif
1532  	/*
1533  	 * VM-scope maximum vCPU ID. Used to determine the size of structures
1534  	 * that increase along with the maximum vCPU ID, in which case, using
1535  	 * the global KVM_MAX_VCPU_IDS may lead to significant memory waste.
1536  	 */
1537  	u32 max_vcpu_ids;
1538  
1539  	bool disable_nx_huge_pages;
1540  
1541  	/*
1542  	 * Memory caches used to allocate shadow pages when performing eager
1543  	 * page splitting. No need for a shadowed_info_cache since eager page
1544  	 * splitting only allocates direct shadow pages.
1545  	 *
1546  	 * Protected by kvm->slots_lock.
1547  	 */
1548  	struct kvm_mmu_memory_cache split_shadow_page_cache;
1549  	struct kvm_mmu_memory_cache split_page_header_cache;
1550  
1551  	/*
1552  	 * Memory cache used to allocate pte_list_desc structs while splitting
1553  	 * huge pages. In the worst case, to split one huge page, 512
1554  	 * pte_list_desc structs are needed to add each lower level leaf sptep
1555  	 * to the rmap plus 1 to extend the parent_ptes rmap of the lower level
1556  	 * page table.
1557  	 *
1558  	 * Protected by kvm->slots_lock.
1559  	 */
1560  #define SPLIT_DESC_CACHE_MIN_NR_OBJECTS (SPTE_ENT_PER_PAGE + 1)
1561  	struct kvm_mmu_memory_cache split_desc_cache;
1562  
1563  	gfn_t gfn_direct_bits;
1564  };
1565  
1566  struct kvm_vm_stat {
1567  	struct kvm_vm_stat_generic generic;
1568  	u64 mmu_shadow_zapped;
1569  	u64 mmu_pte_write;
1570  	u64 mmu_pde_zapped;
1571  	u64 mmu_flooded;
1572  	u64 mmu_recycled;
1573  	u64 mmu_cache_miss;
1574  	u64 mmu_unsync;
1575  	union {
1576  		struct {
1577  			atomic64_t pages_4k;
1578  			atomic64_t pages_2m;
1579  			atomic64_t pages_1g;
1580  		};
1581  		atomic64_t pages[KVM_NR_PAGE_SIZES];
1582  	};
1583  	u64 nx_lpage_splits;
1584  	u64 max_mmu_page_hash_collisions;
1585  	u64 max_mmu_rmap_size;
1586  };
1587  
1588  struct kvm_vcpu_stat {
1589  	struct kvm_vcpu_stat_generic generic;
1590  	u64 pf_taken;
1591  	u64 pf_fixed;
1592  	u64 pf_emulate;
1593  	u64 pf_spurious;
1594  	u64 pf_fast;
1595  	u64 pf_mmio_spte_created;
1596  	u64 pf_guest;
1597  	u64 tlb_flush;
1598  	u64 invlpg;
1599  
1600  	u64 exits;
1601  	u64 io_exits;
1602  	u64 mmio_exits;
1603  	u64 signal_exits;
1604  	u64 irq_window_exits;
1605  	u64 nmi_window_exits;
1606  	u64 l1d_flush;
1607  	u64 halt_exits;
1608  	u64 request_irq_exits;
1609  	u64 irq_exits;
1610  	u64 host_state_reload;
1611  	u64 fpu_reload;
1612  	u64 insn_emulation;
1613  	u64 insn_emulation_fail;
1614  	u64 hypercalls;
1615  	u64 irq_injections;
1616  	u64 nmi_injections;
1617  	u64 req_event;
1618  	u64 nested_run;
1619  	u64 directed_yield_attempted;
1620  	u64 directed_yield_successful;
1621  	u64 preemption_reported;
1622  	u64 preemption_other;
1623  	u64 guest_mode;
1624  	u64 notify_window_exits;
1625  };
1626  
1627  struct x86_instruction_info;
1628  
1629  struct msr_data {
1630  	bool host_initiated;
1631  	u32 index;
1632  	u64 data;
1633  };
1634  
1635  struct kvm_lapic_irq {
1636  	u32 vector;
1637  	u16 delivery_mode;
1638  	u16 dest_mode;
1639  	bool level;
1640  	u16 trig_mode;
1641  	u32 shorthand;
1642  	u32 dest_id;
1643  	bool msi_redir_hint;
1644  };
1645  
1646  static inline u16 kvm_lapic_irq_dest_mode(bool dest_mode_logical)
1647  {
1648  	return dest_mode_logical ? APIC_DEST_LOGICAL : APIC_DEST_PHYSICAL;
1649  }
1650  
1651  struct kvm_x86_ops {
1652  	const char *name;
1653  
1654  	int (*check_processor_compatibility)(void);
1655  
1656  	int (*enable_virtualization_cpu)(void);
1657  	void (*disable_virtualization_cpu)(void);
1658  	cpu_emergency_virt_cb *emergency_disable_virtualization_cpu;
1659  
1660  	void (*hardware_unsetup)(void);
1661  	bool (*has_emulated_msr)(struct kvm *kvm, u32 index);
1662  	void (*vcpu_after_set_cpuid)(struct kvm_vcpu *vcpu);
1663  
1664  	unsigned int vm_size;
1665  	int (*vm_init)(struct kvm *kvm);
1666  	void (*vm_destroy)(struct kvm *kvm);
1667  
1668  	/* Create, but do not attach this VCPU */
1669  	int (*vcpu_precreate)(struct kvm *kvm);
1670  	int (*vcpu_create)(struct kvm_vcpu *vcpu);
1671  	void (*vcpu_free)(struct kvm_vcpu *vcpu);
1672  	void (*vcpu_reset)(struct kvm_vcpu *vcpu, bool init_event);
1673  
1674  	void (*prepare_switch_to_guest)(struct kvm_vcpu *vcpu);
1675  	void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
1676  	void (*vcpu_put)(struct kvm_vcpu *vcpu);
1677  
1678  	void (*update_exception_bitmap)(struct kvm_vcpu *vcpu);
1679  	int (*get_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr);
1680  	int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr);
1681  	u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg);
1682  	void (*get_segment)(struct kvm_vcpu *vcpu,
1683  			    struct kvm_segment *var, int seg);
1684  	int (*get_cpl)(struct kvm_vcpu *vcpu);
1685  	int (*get_cpl_no_cache)(struct kvm_vcpu *vcpu);
1686  	void (*set_segment)(struct kvm_vcpu *vcpu,
1687  			    struct kvm_segment *var, int seg);
1688  	void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l);
1689  	bool (*is_valid_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
1690  	void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
1691  	void (*post_set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
1692  	bool (*is_valid_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4);
1693  	void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4);
1694  	int (*set_efer)(struct kvm_vcpu *vcpu, u64 efer);
1695  	void (*get_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
1696  	void (*set_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
1697  	void (*get_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
1698  	void (*set_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
1699  	void (*sync_dirty_debug_regs)(struct kvm_vcpu *vcpu);
1700  	void (*set_dr6)(struct kvm_vcpu *vcpu, unsigned long value);
1701  	void (*set_dr7)(struct kvm_vcpu *vcpu, unsigned long value);
1702  	void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
1703  	unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
1704  	void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
1705  	bool (*get_if_flag)(struct kvm_vcpu *vcpu);
1706  
1707  	void (*flush_tlb_all)(struct kvm_vcpu *vcpu);
1708  	void (*flush_tlb_current)(struct kvm_vcpu *vcpu);
1709  #if IS_ENABLED(CONFIG_HYPERV)
1710  	int  (*flush_remote_tlbs)(struct kvm *kvm);
1711  	int  (*flush_remote_tlbs_range)(struct kvm *kvm, gfn_t gfn,
1712  					gfn_t nr_pages);
1713  #endif
1714  
1715  	/*
1716  	 * Flush any TLB entries associated with the given GVA.
1717  	 * Does not need to flush GPA->HPA mappings.
1718  	 * Can potentially get non-canonical addresses through INVLPGs, which
1719  	 * the implementation may choose to ignore if appropriate.
1720  	 */
1721  	void (*flush_tlb_gva)(struct kvm_vcpu *vcpu, gva_t addr);
1722  
1723  	/*
1724  	 * Flush any TLB entries created by the guest.  Like tlb_flush_gva(),
1725  	 * does not need to flush GPA->HPA mappings.
1726  	 */
1727  	void (*flush_tlb_guest)(struct kvm_vcpu *vcpu);
1728  
1729  	int (*vcpu_pre_run)(struct kvm_vcpu *vcpu);
1730  	enum exit_fastpath_completion (*vcpu_run)(struct kvm_vcpu *vcpu,
1731  						  bool force_immediate_exit);
1732  	int (*handle_exit)(struct kvm_vcpu *vcpu,
1733  		enum exit_fastpath_completion exit_fastpath);
1734  	int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
1735  	void (*update_emulated_instruction)(struct kvm_vcpu *vcpu);
1736  	void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
1737  	u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu);
1738  	void (*patch_hypercall)(struct kvm_vcpu *vcpu,
1739  				unsigned char *hypercall_addr);
1740  	void (*inject_irq)(struct kvm_vcpu *vcpu, bool reinjected);
1741  	void (*inject_nmi)(struct kvm_vcpu *vcpu);
1742  	void (*inject_exception)(struct kvm_vcpu *vcpu);
1743  	void (*cancel_injection)(struct kvm_vcpu *vcpu);
1744  	int (*interrupt_allowed)(struct kvm_vcpu *vcpu, bool for_injection);
1745  	int (*nmi_allowed)(struct kvm_vcpu *vcpu, bool for_injection);
1746  	bool (*get_nmi_mask)(struct kvm_vcpu *vcpu);
1747  	void (*set_nmi_mask)(struct kvm_vcpu *vcpu, bool masked);
1748  	/* Whether or not a virtual NMI is pending in hardware. */
1749  	bool (*is_vnmi_pending)(struct kvm_vcpu *vcpu);
1750  	/*
1751  	 * Attempt to pend a virtual NMI in hardware.  Returns %true on success
1752  	 * to allow using static_call_ret0 as the fallback.
1753  	 */
1754  	bool (*set_vnmi_pending)(struct kvm_vcpu *vcpu);
1755  	void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
1756  	void (*enable_irq_window)(struct kvm_vcpu *vcpu);
1757  	void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
1758  
1759  	const bool x2apic_icr_is_split;
1760  	const unsigned long required_apicv_inhibits;
1761  	bool allow_apicv_in_x2apic_without_x2apic_virtualization;
1762  	void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu);
1763  	void (*hwapic_isr_update)(struct kvm_vcpu *vcpu, int isr);
1764  	void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
1765  	void (*set_virtual_apic_mode)(struct kvm_vcpu *vcpu);
1766  	void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu);
1767  	void (*deliver_interrupt)(struct kvm_lapic *apic, int delivery_mode,
1768  				  int trig_mode, int vector);
1769  	int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu);
1770  	int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
1771  	int (*set_identity_map_addr)(struct kvm *kvm, u64 ident_addr);
1772  	u8 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
1773  
1774  	void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, hpa_t root_hpa,
1775  			     int root_level);
1776  
1777  	/* Update external mapping with page table link. */
1778  	int (*link_external_spt)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
1779  				void *external_spt);
1780  	/* Update the external page table from spte getting set. */
1781  	int (*set_external_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
1782  				 kvm_pfn_t pfn_for_gfn);
1783  
1784  	/* Update external page tables for page table about to be freed. */
1785  	int (*free_external_spt)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
1786  				 void *external_spt);
1787  
1788  	/* Update external page table from spte getting removed, and flush TLB. */
1789  	int (*remove_external_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
1790  				    kvm_pfn_t pfn_for_gfn);
1791  
1792  	bool (*has_wbinvd_exit)(void);
1793  
1794  	u64 (*get_l2_tsc_offset)(struct kvm_vcpu *vcpu);
1795  	u64 (*get_l2_tsc_multiplier)(struct kvm_vcpu *vcpu);
1796  	void (*write_tsc_offset)(struct kvm_vcpu *vcpu);
1797  	void (*write_tsc_multiplier)(struct kvm_vcpu *vcpu);
1798  
1799  	/*
1800  	 * Retrieve somewhat arbitrary exit/entry information.  Intended to
1801  	 * be used only from within tracepoints or error paths.
1802  	 */
1803  	void (*get_exit_info)(struct kvm_vcpu *vcpu, u32 *reason,
1804  			      u64 *info1, u64 *info2,
1805  			      u32 *intr_info, u32 *error_code);
1806  
1807  	void (*get_entry_info)(struct kvm_vcpu *vcpu,
1808  			       u32 *intr_info, u32 *error_code);
1809  
1810  	int (*check_intercept)(struct kvm_vcpu *vcpu,
1811  			       struct x86_instruction_info *info,
1812  			       enum x86_intercept_stage stage,
1813  			       struct x86_exception *exception);
1814  	void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu);
1815  
1816  	/*
1817  	 * Size of the CPU's dirty log buffer, i.e. VMX's PML buffer.  A zero
1818  	 * value indicates CPU dirty logging is unsupported or disabled.
1819  	 */
1820  	int cpu_dirty_log_size;
1821  	void (*update_cpu_dirty_logging)(struct kvm_vcpu *vcpu);
1822  
1823  	const struct kvm_x86_nested_ops *nested_ops;
1824  
1825  	void (*vcpu_blocking)(struct kvm_vcpu *vcpu);
1826  	void (*vcpu_unblocking)(struct kvm_vcpu *vcpu);
1827  
1828  	int (*pi_update_irte)(struct kvm *kvm, unsigned int host_irq,
1829  			      uint32_t guest_irq, bool set);
1830  	void (*pi_start_assignment)(struct kvm *kvm);
1831  	void (*apicv_pre_state_restore)(struct kvm_vcpu *vcpu);
1832  	void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu);
1833  	bool (*dy_apicv_has_pending_interrupt)(struct kvm_vcpu *vcpu);
1834  
1835  	int (*set_hv_timer)(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
1836  			    bool *expired);
1837  	void (*cancel_hv_timer)(struct kvm_vcpu *vcpu);
1838  
1839  	void (*setup_mce)(struct kvm_vcpu *vcpu);
1840  
1841  #ifdef CONFIG_KVM_SMM
1842  	int (*smi_allowed)(struct kvm_vcpu *vcpu, bool for_injection);
1843  	int (*enter_smm)(struct kvm_vcpu *vcpu, union kvm_smram *smram);
1844  	int (*leave_smm)(struct kvm_vcpu *vcpu, const union kvm_smram *smram);
1845  	void (*enable_smi_window)(struct kvm_vcpu *vcpu);
1846  #endif
1847  
1848  	int (*dev_get_attr)(u32 group, u64 attr, u64 *val);
1849  	int (*mem_enc_ioctl)(struct kvm *kvm, void __user *argp);
1850  	int (*mem_enc_register_region)(struct kvm *kvm, struct kvm_enc_region *argp);
1851  	int (*mem_enc_unregister_region)(struct kvm *kvm, struct kvm_enc_region *argp);
1852  	int (*vm_copy_enc_context_from)(struct kvm *kvm, unsigned int source_fd);
1853  	int (*vm_move_enc_context_from)(struct kvm *kvm, unsigned int source_fd);
1854  	void (*guest_memory_reclaimed)(struct kvm *kvm);
1855  
1856  	int (*get_feature_msr)(u32 msr, u64 *data);
1857  
1858  	int (*check_emulate_instruction)(struct kvm_vcpu *vcpu, int emul_type,
1859  					 void *insn, int insn_len);
1860  
1861  	bool (*apic_init_signal_blocked)(struct kvm_vcpu *vcpu);
1862  	int (*enable_l2_tlb_flush)(struct kvm_vcpu *vcpu);
1863  
1864  	void (*migrate_timers)(struct kvm_vcpu *vcpu);
1865  	void (*msr_filter_changed)(struct kvm_vcpu *vcpu);
1866  	int (*complete_emulated_msr)(struct kvm_vcpu *vcpu, int err);
1867  
1868  	void (*vcpu_deliver_sipi_vector)(struct kvm_vcpu *vcpu, u8 vector);
1869  
1870  	/*
1871  	 * Returns vCPU specific APICv inhibit reasons
1872  	 */
1873  	unsigned long (*vcpu_get_apicv_inhibit_reasons)(struct kvm_vcpu *vcpu);
1874  
1875  	gva_t (*get_untagged_addr)(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags);
1876  	void *(*alloc_apic_backing_page)(struct kvm_vcpu *vcpu);
1877  	int (*gmem_prepare)(struct kvm *kvm, kvm_pfn_t pfn, gfn_t gfn, int max_order);
1878  	void (*gmem_invalidate)(kvm_pfn_t start, kvm_pfn_t end);
1879  	int (*private_max_mapping_level)(struct kvm *kvm, kvm_pfn_t pfn);
1880  };
1881  
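/*
 * Illustrative sketch, not taken from the vmx/svm code: a vendor module
 * typically provides its implementation as a statically initialized
 * instance of the struct above (all example_* names below are
 * hypothetical) and hands it to KVM through struct kvm_x86_init_ops
 * further down.  Optional callbacks may be left NULL; see the
 * KVM_X86_OP_OPTIONAL* handling below.
 *
 *	static struct kvm_x86_ops example_x86_ops __initdata = {
 *		.name				= KBUILD_MODNAME,
 *		.check_processor_compatibility	= example_check_processor_compat,
 *		.vm_size			= sizeof(struct example_kvm),
 *		.vcpu_create			= example_vcpu_create,
 *		.vcpu_run			= example_vcpu_run,
 *		.handle_exit			= example_handle_exit,
 *	};
 */
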
1882  struct kvm_x86_nested_ops {
1883  	void (*leave_nested)(struct kvm_vcpu *vcpu);
1884  	bool (*is_exception_vmexit)(struct kvm_vcpu *vcpu, u8 vector,
1885  				    u32 error_code);
1886  	int (*check_events)(struct kvm_vcpu *vcpu);
1887  	bool (*has_events)(struct kvm_vcpu *vcpu, bool for_injection);
1888  	void (*triple_fault)(struct kvm_vcpu *vcpu);
1889  	int (*get_state)(struct kvm_vcpu *vcpu,
1890  			 struct kvm_nested_state __user *user_kvm_nested_state,
1891  			 unsigned user_data_size);
1892  	int (*set_state)(struct kvm_vcpu *vcpu,
1893  			 struct kvm_nested_state __user *user_kvm_nested_state,
1894  			 struct kvm_nested_state *kvm_state);
1895  	bool (*get_nested_state_pages)(struct kvm_vcpu *vcpu);
1896  	int (*write_log_dirty)(struct kvm_vcpu *vcpu, gpa_t l2_gpa);
1897  
1898  	int (*enable_evmcs)(struct kvm_vcpu *vcpu,
1899  			    uint16_t *vmcs_version);
1900  	uint16_t (*get_evmcs_version)(struct kvm_vcpu *vcpu);
1901  	void (*hv_inject_synthetic_vmexit_post_tlb_flush)(struct kvm_vcpu *vcpu);
1902  };
1903  
1904  struct kvm_x86_init_ops {
1905  	int (*hardware_setup)(void);
1906  	unsigned int (*handle_intel_pt_intr)(void);
1907  
1908  	struct kvm_x86_ops *runtime_ops;
1909  	struct kvm_pmu_ops *pmu_ops;
1910  };
1911  
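/*
 * A minimal sketch of the registration flow, reusing the hypothetical
 * example_* names from the comment above: the vendor module bundles its
 * one-time setup hook and ops tables in kvm_x86_init_ops and passes the
 * whole thing to kvm_x86_vendor_init() (declared below) from its module
 * init path.
 *
 *	static struct kvm_x86_init_ops example_init_ops __initdata = {
 *		.hardware_setup	= example_hardware_setup,
 *		.runtime_ops	= &example_x86_ops,
 *		.pmu_ops	= &example_pmu_ops,
 *	};
 *
 *	r = kvm_x86_vendor_init(&example_init_ops);
 */
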
1912  struct kvm_arch_async_pf {
1913  	u32 token;
1914  	gfn_t gfn;
1915  	unsigned long cr3;
1916  	bool direct_map;
1917  	u64 error_code;
1918  };
1919  
1920  extern u32 __read_mostly kvm_nr_uret_msrs;
1921  extern bool __read_mostly allow_smaller_maxphyaddr;
1922  extern bool __read_mostly enable_apicv;
1923  extern struct kvm_x86_ops kvm_x86_ops;
1924  
1925  #define kvm_x86_call(func) static_call(kvm_x86_##func)
1926  #define kvm_pmu_call(func) static_call(kvm_x86_pmu_##func)
1927  
1928  #define KVM_X86_OP(func) \
1929  	DECLARE_STATIC_CALL(kvm_x86_##func, *(((struct kvm_x86_ops *)0)->func));
1930  #define KVM_X86_OP_OPTIONAL KVM_X86_OP
1931  #define KVM_X86_OP_OPTIONAL_RET0 KVM_X86_OP
1932  #include <asm/kvm-x86-ops.h>
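/*
 * The wrappers above turn every kvm_x86_ops member into a static call,
 * so callers dispatch with, e.g.,
 *
 *	int cpl = kvm_x86_call(get_cpl)(vcpu);
 *
 * which the static-call machinery can patch into a direct call to the
 * vendor implementation instead of an indirect branch through
 * kvm_x86_ops.get_cpl.  kvm_pmu_call() is the same idea for the vendor
 * PMU ops.  (Sketch only; see the kvm_x86_call() users later in this
 * header, e.g. kvm_arch_vcpu_blocking().)
 */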
1933  
1934  int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops);
1935  void kvm_x86_vendor_exit(void);
1936  
1937  #define __KVM_HAVE_ARCH_VM_ALLOC
1938  static inline struct kvm *kvm_arch_alloc_vm(void)
1939  {
1940  	return __vmalloc(kvm_x86_ops.vm_size, GFP_KERNEL_ACCOUNT | __GFP_ZERO);
1941  }
1942  
1943  #define __KVM_HAVE_ARCH_VM_FREE
1944  void kvm_arch_free_vm(struct kvm *kvm);
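/*
 * A sketch of why vm_size exists (hypothetical type name): vendor
 * modules embed struct kvm as the first member of their per-VM
 * structure and report the full size via kvm_x86_ops.vm_size, so the
 * __vmalloc() in kvm_arch_alloc_vm() above covers the vendor-private
 * tail as well.
 *
 *	struct example_kvm {
 *		struct kvm kvm;		(must be the first member)
 *		... vendor-private per-VM state ...
 *	};
 *
 *	and in the vendor ops: .vm_size = sizeof(struct example_kvm)
 */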
1945  
1946  #if IS_ENABLED(CONFIG_HYPERV)
1947  #define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS
1948  static inline int kvm_arch_flush_remote_tlbs(struct kvm *kvm)
1949  {
1950  	if (kvm_x86_ops.flush_remote_tlbs &&
1951  	    !kvm_x86_call(flush_remote_tlbs)(kvm))
1952  		return 0;
1953  	else
1954  		return -ENOTSUPP;
1955  }
1956  
1957  #define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS_RANGE
1958  static inline int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn,
1959  						   u64 nr_pages)
1960  {
1961  	if (!kvm_x86_ops.flush_remote_tlbs_range)
1962  		return -EOPNOTSUPP;
1963  
1964  	return kvm_x86_call(flush_remote_tlbs_range)(kvm, gfn, nr_pages);
1965  }
1966  #endif /* CONFIG_HYPERV */
1967  
1968  enum kvm_intr_type {
1969  	/* Values are arbitrary, but must be non-zero. */
1970  	KVM_HANDLING_IRQ = 1,
1971  	KVM_HANDLING_NMI,
1972  };
1973  
1974  /* Enable perf NMI and timer modes to work, and minimise false positives. */
1975  #define kvm_arch_pmi_in_guest(vcpu) \
1976  	((vcpu) && (vcpu)->arch.handling_intr_from_guest && \
1977  	 (!!in_nmi() == ((vcpu)->arch.handling_intr_from_guest == KVM_HANDLING_NMI)))
1978  
1979  void __init kvm_mmu_x86_module_init(void);
1980  int kvm_mmu_vendor_module_init(void);
1981  void kvm_mmu_vendor_module_exit(void);
1982  
1983  void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
1984  int kvm_mmu_create(struct kvm_vcpu *vcpu);
1985  void kvm_mmu_init_vm(struct kvm *kvm);
1986  void kvm_mmu_uninit_vm(struct kvm *kvm);
1987  
1988  void kvm_mmu_init_memslot_memory_attributes(struct kvm *kvm,
1989  					    struct kvm_memory_slot *slot);
1990  
1991  void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu);
1992  void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
1993  void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
1994  				      const struct kvm_memory_slot *memslot,
1995  				      int start_level);
1996  void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm,
1997  				       const struct kvm_memory_slot *memslot,
1998  				       int target_level);
1999  void kvm_mmu_try_split_huge_pages(struct kvm *kvm,
2000  				  const struct kvm_memory_slot *memslot,
2001  				  u64 start, u64 end,
2002  				  int target_level);
2003  void kvm_mmu_recover_huge_pages(struct kvm *kvm,
2004  				const struct kvm_memory_slot *memslot);
2005  void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
2006  				   const struct kvm_memory_slot *memslot);
2007  void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen);
2008  void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long kvm_nr_mmu_pages);
2009  void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end);
2010  
2011  int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
2012  
2013  int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
2014  			  const void *val, int bytes);
2015  
2016  struct kvm_irq_mask_notifier {
2017  	void (*func)(struct kvm_irq_mask_notifier *kimn, bool masked);
2018  	int irq;
2019  	struct hlist_node link;
2020  };
2021  
2022  void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq,
2023  				    struct kvm_irq_mask_notifier *kimn);
2024  void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
2025  				      struct kvm_irq_mask_notifier *kimn);
2026  void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
2027  			     bool mask);
2028  
2029  extern bool tdp_enabled;
2030  
2031  u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu);
2032  
2033  /*
2034   * EMULTYPE_NO_DECODE - Set when re-emulating an instruction (after completing
2035   *			userspace I/O) to indicate that the emulation context
2036   *			should be reused as is, i.e. skip initialization of
2037   *			emulation context, instruction fetch and decode.
2038   *
2039   * EMULTYPE_TRAP_UD - Set when emulating an intercepted #UD from hardware.
2040   *		      Indicates that only select instructions (tagged with
2041   *		      EmulateOnUD) should be emulated (to minimize the emulator
2042   *		      attack surface).  See also EMULTYPE_TRAP_UD_FORCED.
2043   *
2044   * EMULTYPE_SKIP - Set when emulating solely to skip an instruction, i.e. to
2045   *		   decode the instruction length.  For use *only* by
2046   *		   kvm_x86_ops.skip_emulated_instruction() implementations if
2047   *		   EMULTYPE_COMPLETE_USER_EXIT is not set.
2048   *
2049   * EMULTYPE_ALLOW_RETRY_PF - Set when the emulator should resume the guest to
2050   *			     retry native execution under certain conditions.
2051   *			     Can only be set in conjunction with EMULTYPE_PF.
2052   *
2053   * EMULTYPE_TRAP_UD_FORCED - Set when emulating an intercepted #UD that was
2054   *			     triggered by KVM's magic "force emulation" prefix,
2055   *			     which is opt in via module param (off by default).
2056   *			     Bypasses EmulateOnUD restriction despite emulating
2057   *			     due to an intercepted #UD (see EMULTYPE_TRAP_UD).
2058   *			     Used to test the full emulator from userspace.
2059   *
2060   * EMULTYPE_VMWARE_GP - Set when emulating an intercepted #GP for VMware
2061   *			backdoor emulation, which is opt in via module param.
2062   *			VMware backdoor emulation handles select instructions
2063   *			and reinjects the #GP for all other cases.
2064   *
2065   * EMULTYPE_PF - Set when an intercepted #PF triggers the emulation, in which case
2066   *		 the CR2/GPA value passed on the stack is valid.
2067   *
2068   * EMULTYPE_COMPLETE_USER_EXIT - Set when the emulator should update interruptibility
2069   *				 state and inject single-step #DBs after skipping
2070   *				 an instruction (after completing userspace I/O).
2071   *
2072   * EMULTYPE_WRITE_PF_TO_SP - Set when emulating an intercepted page fault that
2073   *			     is attempting to write a gfn that contains one or
2074   *			     more of the PTEs used to translate the write itself,
2075   *			     and the owning page table is being shadowed by KVM.
2076   *			     If emulation of the faulting instruction fails and
2077   *			     this flag is set, KVM will exit to userspace instead
2078   *			     of retrying emulation as KVM cannot make forward
2079   *			     progress.
2080   *
2081   *			     If emulation fails for a write to guest page tables,
2082   *			     KVM unprotects (zaps) the shadow page for the target
2083   *			     gfn and resumes the guest to retry the non-emulatable
2084   *			     instruction (on hardware).  Unprotecting the gfn
2085   *			     doesn't allow forward progress for a self-changing
2086   *			     access because doing so also zaps the translation for
2087   *			     the gfn, i.e. retrying the instruction will hit a
2088   *			     !PRESENT fault, which results in a new shadow page
2089   *			     and sends KVM back to square one.
2090   */
2091  #define EMULTYPE_NO_DECODE	    (1 << 0)
2092  #define EMULTYPE_TRAP_UD	    (1 << 1)
2093  #define EMULTYPE_SKIP		    (1 << 2)
2094  #define EMULTYPE_ALLOW_RETRY_PF	    (1 << 3)
2095  #define EMULTYPE_TRAP_UD_FORCED	    (1 << 4)
2096  #define EMULTYPE_VMWARE_GP	    (1 << 5)
2097  #define EMULTYPE_PF		    (1 << 6)
2098  #define EMULTYPE_COMPLETE_USER_EXIT (1 << 7)
2099  #define EMULTYPE_WRITE_PF_TO_SP	    (1 << 8)
2100  
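/*
 * Illustrative combination, not lifted from a specific call site: a
 * #PF-triggered emulation that is allowed to simply resume the guest
 * and retry native execution could be requested as
 *
 *	kvm_emulate_instruction(vcpu, EMULTYPE_PF | EMULTYPE_ALLOW_RETRY_PF);
 *
 * per the EMULTYPE_ALLOW_RETRY_PF rules above, whereas EMULTYPE_SKIP is
 * reserved for kvm_x86_ops.skip_emulated_instruction() implementations.
 */
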
2101  static inline bool kvm_can_emulate_event_vectoring(int emul_type)
2102  {
2103  	return !(emul_type & EMULTYPE_PF);
2104  }
2105  
2106  int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type);
2107  int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu,
2108  					void *insn, int insn_len);
2109  void __kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu,
2110  					  u64 *data, u8 ndata);
2111  void kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu);
2112  
2113  void kvm_prepare_event_vectoring_exit(struct kvm_vcpu *vcpu, gpa_t gpa);
2114  
2115  void kvm_enable_efer_bits(u64);
2116  bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer);
2117  int kvm_get_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 *data);
2118  int kvm_set_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 data);
2119  int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, bool host_initiated);
2120  int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data);
2121  int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data);
2122  int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu);
2123  int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu);
2124  int kvm_emulate_as_nop(struct kvm_vcpu *vcpu);
2125  int kvm_emulate_invd(struct kvm_vcpu *vcpu);
2126  int kvm_emulate_mwait(struct kvm_vcpu *vcpu);
2127  int kvm_handle_invalid_op(struct kvm_vcpu *vcpu);
2128  int kvm_emulate_monitor(struct kvm_vcpu *vcpu);
2129  
2130  int kvm_fast_pio(struct kvm_vcpu *vcpu, int size, unsigned short port, int in);
2131  int kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
2132  int kvm_emulate_halt(struct kvm_vcpu *vcpu);
2133  int kvm_emulate_halt_noskip(struct kvm_vcpu *vcpu);
2134  int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu);
2135  int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu);
2136  
2137  void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
2138  void kvm_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
2139  int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
2140  void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector);
2141  
2142  int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
2143  		    int reason, bool has_error_code, u32 error_code);
2144  
2145  void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0);
2146  void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4);
2147  int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
2148  int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
2149  int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
2150  int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8);
2151  int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val);
2152  unsigned long kvm_get_dr(struct kvm_vcpu *vcpu, int dr);
2153  unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu);
2154  void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
2155  int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu);
2156  
2157  int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr);
2158  int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr);
2159  
2160  unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu);
2161  void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
2162  int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu);
2163  
2164  void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
2165  void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
2166  void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr, unsigned long payload);
2167  void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr);
2168  void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
2169  void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault);
2170  void kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
2171  				    struct x86_exception *fault);
2172  bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl);
2173  bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr);
2174  
2175  static inline int __kvm_irq_line_state(unsigned long *irq_state,
2176  				       int irq_source_id, int level)
2177  {
2178  	/* Logical OR for level-triggered interrupts */
2179  	if (level)
2180  		__set_bit(irq_source_id, irq_state);
2181  	else
2182  		__clear_bit(irq_source_id, irq_state);
2183  
2184  	return !!(*irq_state);
2185  }
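/*
 * Illustrative use (field names assumed, see the in-kernel irqchip
 * models): an irqchip keeps one bitmap of per-source-id state per pin
 * and feeds the OR'd result to its line logic, roughly
 *
 *	level = __kvm_irq_line_state(&ioapic->irq_states[pin],
 *				     irq_source_id, level);
 */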
2186  
2187  int kvm_pic_set_irq(struct kvm_pic *pic, int irq, int irq_source_id, int level);
2188  void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id);
2189  
2190  void kvm_inject_nmi(struct kvm_vcpu *vcpu);
2191  int kvm_get_nr_pending_nmis(struct kvm_vcpu *vcpu);
2192  
2193  void kvm_update_dr7(struct kvm_vcpu *vcpu);
2194  
2195  bool __kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
2196  				       bool always_retry);
2197  
2198  static inline bool kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu,
2199  						   gpa_t cr2_or_gpa)
2200  {
2201  	return __kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa, false);
2202  }
2203  
2204  void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
2205  			ulong roots_to_free);
2206  void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu);
2207  gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
2208  			      struct x86_exception *exception);
2209  gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
2210  			       struct x86_exception *exception);
2211  gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
2212  				struct x86_exception *exception);
2213  
2214  bool kvm_apicv_activated(struct kvm *kvm);
2215  bool kvm_vcpu_apicv_activated(struct kvm_vcpu *vcpu);
2216  void __kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu);
2217  void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm,
2218  				      enum kvm_apicv_inhibit reason, bool set);
2219  void kvm_set_or_clear_apicv_inhibit(struct kvm *kvm,
2220  				    enum kvm_apicv_inhibit reason, bool set);
2221  
2222  static inline void kvm_set_apicv_inhibit(struct kvm *kvm,
2223  					 enum kvm_apicv_inhibit reason)
2224  {
2225  	kvm_set_or_clear_apicv_inhibit(kvm, reason, true);
2226  }
2227  
2228  static inline void kvm_clear_apicv_inhibit(struct kvm *kvm,
2229  					   enum kvm_apicv_inhibit reason)
2230  {
2231  	kvm_set_or_clear_apicv_inhibit(kvm, reason, false);
2232  }
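/*
 * Typical usage is a symmetric set/clear pair around a condition that is
 * incompatible with APIC virtualization, with the reason taken from the
 * enum kvm_apicv_inhibit defined earlier in this header, e.g.
 *
 *	kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_PIT_REINJ);
 *	...
 *	kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_PIT_REINJ);
 */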
2233  
2234  int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
2235  		       void *insn, int insn_len);
2236  void kvm_mmu_print_sptes(struct kvm_vcpu *vcpu, gpa_t gpa, const char *msg);
2237  void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
2238  void kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
2239  			     u64 addr, unsigned long roots);
2240  void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid);
2241  void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd);
2242  
2243  void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
2244  		       int tdp_max_root_level, int tdp_huge_page_level);
2245  
2246  
2247  #ifdef CONFIG_KVM_PRIVATE_MEM
2248  #define kvm_arch_has_private_mem(kvm) ((kvm)->arch.has_private_mem)
2249  #else
2250  #define kvm_arch_has_private_mem(kvm) false
2251  #endif
2252  
2253  #define kvm_arch_has_readonly_mem(kvm) (!(kvm)->arch.has_protected_state)
2254  
2255  static inline u16 kvm_read_ldt(void)
2256  {
2257  	u16 ldt;
2258  	asm("sldt %0" : "=g"(ldt));
2259  	return ldt;
2260  }
2261  
2262  static inline void kvm_load_ldt(u16 sel)
2263  {
2264  	asm("lldt %0" : : "rm"(sel));
2265  }
2266  
2267  #ifdef CONFIG_X86_64
2268  static inline unsigned long read_msr(unsigned long msr)
2269  {
2270  	u64 value;
2271  
2272  	rdmsrl(msr, value);
2273  	return value;
2274  }
2275  #endif
2276  
2277  static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code)
2278  {
2279  	kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
2280  }
2281  
2282  #define TSS_IOPB_BASE_OFFSET 0x66
2283  #define TSS_BASE_SIZE 0x68
2284  #define TSS_IOPB_SIZE (65536 / 8)
2285  #define TSS_REDIRECTION_SIZE (256 / 8)
2286  #define RMODE_TSS_SIZE							\
2287  	(TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1)
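/*
 * For reference, with the values above this works out to
 * 0x68 + 256/8 + 65536/8 + 1 = 104 + 32 + 8192 + 1 = 8329 bytes.
 */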
2288  
2289  enum {
2290  	TASK_SWITCH_CALL = 0,
2291  	TASK_SWITCH_IRET = 1,
2292  	TASK_SWITCH_JMP = 2,
2293  	TASK_SWITCH_GATE = 3,
2294  };
2295  
2296  #define HF_GUEST_MASK		(1 << 0) /* VCPU is in guest-mode */
2297  
2298  #ifdef CONFIG_KVM_SMM
2299  #define HF_SMM_MASK		(1 << 1)
2300  #define HF_SMM_INSIDE_NMI_MASK	(1 << 2)
2301  
2302  # define KVM_MAX_NR_ADDRESS_SPACES	2
2303  /* SMM is currently unsupported for guests with private memory. */
2304  # define kvm_arch_nr_memslot_as_ids(kvm) (kvm_arch_has_private_mem(kvm) ? 1 : 2)
2305  # define kvm_arch_vcpu_memslots_id(vcpu) ((vcpu)->arch.hflags & HF_SMM_MASK ? 1 : 0)
2306  # define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, (role).smm)
2307  #else
2308  # define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, 0)
2309  #endif
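/*
 * Illustrative sketch: with CONFIG_KVM_SMM the memslot address space is
 * selected by the vCPU's SMM state, so a per-vCPU lookup is conceptually
 *
 *	struct kvm_memslots *slots =
 *		__kvm_memslots(vcpu->kvm, kvm_arch_vcpu_memslots_id(vcpu));
 *
 * i.e. a vCPU with HF_SMM_MASK set sees address space 1 and everything
 * else sees address space 0.
 */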
2310  
2311  int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v);
2312  int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
2313  int kvm_cpu_has_extint(struct kvm_vcpu *v);
2314  int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
2315  int kvm_cpu_get_extint(struct kvm_vcpu *v);
2316  int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
2317  void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
2318  
2319  int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low,
2320  		    unsigned long ipi_bitmap_high, u32 min,
2321  		    unsigned long icr, int op_64_bit);
2322  
2323  int kvm_add_user_return_msr(u32 msr);
2324  int kvm_find_user_return_msr(u32 msr);
2325  int kvm_set_user_return_msr(unsigned index, u64 val, u64 mask);
2326  
2327  static inline bool kvm_is_supported_user_return_msr(u32 msr)
2328  {
2329  	return kvm_find_user_return_msr(msr) >= 0;
2330  }
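/*
 * A minimal usage sketch (the MSR choice is only an example): a vendor
 * module registers a user-return MSR once at setup time and later
 * updates it through the slot index it got back:
 *
 *	int slot = kvm_add_user_return_msr(MSR_TSC_AUX);
 *	...
 *	kvm_set_user_return_msr(slot, guest_val, -1ull);
 */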
2331  
2332  u64 kvm_scale_tsc(u64 tsc, u64 ratio);
2333  u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc);
2334  u64 kvm_calc_nested_tsc_offset(u64 l1_offset, u64 l2_offset, u64 l2_multiplier);
2335  u64 kvm_calc_nested_tsc_multiplier(u64 l1_multiplier, u64 l2_multiplier);
2336  
2337  unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu);
2338  bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip);
2339  
2340  void kvm_make_scan_ioapic_request(struct kvm *kvm);
2341  void kvm_make_scan_ioapic_request_mask(struct kvm *kvm,
2342  				       unsigned long *vcpu_bitmap);
2343  
2344  bool kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
2345  				     struct kvm_async_pf *work);
2346  void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
2347  				 struct kvm_async_pf *work);
2348  void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu,
2349  			       struct kvm_async_pf *work);
2350  void kvm_arch_async_page_present_queued(struct kvm_vcpu *vcpu);
2351  bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu);
2352  extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn);
2353  
2354  int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu);
2355  int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err);
2356  
2357  void __user *__x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa,
2358  				     u32 size);
2359  bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu);
2360  bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu);
2361  
2362  bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
2363  			     struct kvm_vcpu **dest_vcpu);
2364  
2365  void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
2366  		     struct kvm_lapic_irq *irq);
2367  
2368  static inline bool kvm_irq_is_postable(struct kvm_lapic_irq *irq)
2369  {
2370  	/* We can only post Fixed and LowPrio IRQs */
2371  	return (irq->delivery_mode == APIC_DM_FIXED ||
2372  		irq->delivery_mode == APIC_DM_LOWEST);
2373  }
2374  
2375  static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
2376  {
2377  	kvm_x86_call(vcpu_blocking)(vcpu);
2378  }
2379  
2380  static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu)
2381  {
2382  	kvm_x86_call(vcpu_unblocking)(vcpu);
2383  }
2384  
2385  static inline int kvm_cpu_get_apicid(int mps_cpu)
2386  {
2387  #ifdef CONFIG_X86_LOCAL_APIC
2388  	return default_cpu_present_to_apicid(mps_cpu);
2389  #else
2390  	WARN_ON_ONCE(1);
2391  	return BAD_APICID;
2392  #endif
2393  }
2394  
2395  int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages);
2396  
2397  #define KVM_CLOCK_VALID_FLAGS						\
2398  	(KVM_CLOCK_TSC_STABLE | KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC)
2399  
2400  #define KVM_X86_VALID_QUIRKS			\
2401  	(KVM_X86_QUIRK_LINT0_REENABLED |	\
2402  	 KVM_X86_QUIRK_CD_NW_CLEARED |		\
2403  	 KVM_X86_QUIRK_LAPIC_MMIO_HOLE |	\
2404  	 KVM_X86_QUIRK_OUT_7E_INC_RIP |		\
2405  	 KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT |	\
2406  	 KVM_X86_QUIRK_FIX_HYPERCALL_INSN |	\
2407  	 KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS |	\
2408  	 KVM_X86_QUIRK_SLOT_ZAP_ALL |		\
2409  	 KVM_X86_QUIRK_STUFF_FEATURE_MSRS)
2410  
2411  /*
2412   * KVM previously used a u32 field in kvm_run to indicate the hypercall was
2413   * initiated from long mode. KVM now sets bit 0 to indicate long mode, but the
2414   * remaining 31 lower bits must be 0 to preserve ABI.
2415   */
2416  #define KVM_EXIT_HYPERCALL_MBZ		GENMASK_ULL(31, 1)
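/*
 * E.g. userspace handling KVM_EXIT_HYPERCALL is expected to look only at
 * bit 0; the uapi name for that bit is assumed here to be
 * KVM_EXIT_HYPERCALL_LONG_MODE:
 *
 *	bool long_mode = run->hypercall.flags & KVM_EXIT_HYPERCALL_LONG_MODE;
 */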
2417  
2418  #endif /* _ASM_X86_KVM_HOST_H */
2419