xref: /illumos-gate/usr/src/uts/intel/sys/vmm.h (revision 4702be99a7621a425e53a63af8ae0ff7fe973b31)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2011 NetApp, Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 /*
29  * This file and its contents are supplied under the terms of the
30  * Common Development and Distribution License ("CDDL"), version 1.0.
31  * You may only use this file in accordance with the terms of version
32  * 1.0 of the CDDL.
33  *
34  * A full copy of the text of the CDDL should have accompanied this
35  * source.  A copy of the CDDL is also available via the Internet at
36  * http://www.illumos.org/license/CDDL.
37  *
38  * Copyright 2015 Pluribus Networks Inc.
39  * Copyright 2019 Joyent, Inc.
40  * Copyright 2022 Oxide Computer Company
41  */
42 
43 #ifndef _VMM_H_
44 #define	_VMM_H_
45 
/*
 * Reasons for which a VM may be placed in the suspended state.
 * VM_SUSPEND_NONE means no suspend is pending; VM_SUSPEND_LAST is a
 * bounds marker, not a valid suspend reason.
 */
enum vm_suspend_how {
	VM_SUSPEND_NONE,
	VM_SUSPEND_RESET,
	VM_SUSPEND_POWEROFF,
	VM_SUSPEND_HALT,
	VM_SUSPEND_TRIPLEFAULT,
	VM_SUSPEND_LAST
};
54 
/*
 * Identifiers for architecturally defined registers.
 * Used by the register get/set interfaces; VM_REG_LAST bounds the set.
 */
enum vm_reg_name {
	/* General-purpose registers */
	VM_REG_GUEST_RAX,
	VM_REG_GUEST_RBX,
	VM_REG_GUEST_RCX,
	VM_REG_GUEST_RDX,
	VM_REG_GUEST_RSI,
	VM_REG_GUEST_RDI,
	VM_REG_GUEST_RBP,
	VM_REG_GUEST_R8,
	VM_REG_GUEST_R9,
	VM_REG_GUEST_R10,
	VM_REG_GUEST_R11,
	VM_REG_GUEST_R12,
	VM_REG_GUEST_R13,
	VM_REG_GUEST_R14,
	VM_REG_GUEST_R15,
	/* Control and debug registers */
	VM_REG_GUEST_CR0,
	VM_REG_GUEST_CR3,
	VM_REG_GUEST_CR4,
	VM_REG_GUEST_DR7,
	VM_REG_GUEST_RSP,
	VM_REG_GUEST_RIP,
	VM_REG_GUEST_RFLAGS,
	/* Segment and descriptor-table registers */
	VM_REG_GUEST_ES,
	VM_REG_GUEST_CS,
	VM_REG_GUEST_SS,
	VM_REG_GUEST_DS,
	VM_REG_GUEST_FS,
	VM_REG_GUEST_GS,
	VM_REG_GUEST_LDTR,
	VM_REG_GUEST_TR,
	VM_REG_GUEST_IDTR,
	VM_REG_GUEST_GDTR,
	VM_REG_GUEST_EFER,
	VM_REG_GUEST_CR2,
	/* PDPTEs used when the guest is in PAE paging mode */
	VM_REG_GUEST_PDPTE0,
	VM_REG_GUEST_PDPTE1,
	VM_REG_GUEST_PDPTE2,
	VM_REG_GUEST_PDPTE3,
	VM_REG_GUEST_INTR_SHADOW,
	VM_REG_GUEST_DR0,
	VM_REG_GUEST_DR1,
	VM_REG_GUEST_DR2,
	VM_REG_GUEST_DR3,
	VM_REG_GUEST_DR6,
	VM_REG_GUEST_ENTRY_INST_LENGTH,
	VM_REG_GUEST_XCR0,
	VM_REG_LAST
};
107 
/*
 * Per-vCPU x2APIC mode state.  X2APIC_STATE_LAST is a bounds marker.
 */
enum x2apic_state {
	X2APIC_DISABLED,
	X2APIC_ENABLED,
	X2APIC_STATE_LAST
};
113 
/*
 * Layout of a 64-bit "interrupt info" value:
 *	bits 0-7:	vector
 *	bits 8-10:	event type (see VM_INTINFO_HWINTR et al. below)
 *	bit 11:		deliver error code
 *	bits 12-30:	reserved
 *	bit 31:		valid
 *	bits 32-63:	error code
 */
#define	VM_INTINFO_MASK_VECTOR	0xffUL
#define	VM_INTINFO_MASK_TYPE	0x700UL
#define	VM_INTINFO_MASK_RSVD	0x7ffff000UL
#define	VM_INTINFO_SHIFT_ERRCODE 32

/* Accessors for the fields described above */
#define	VM_INTINFO_VECTOR(val)	((val) & VM_INTINFO_MASK_VECTOR)
#define	VM_INTINFO_TYPE(val)	((val) & VM_INTINFO_MASK_TYPE)
#define	VM_INTINFO_ERRCODE(val)	((val) >> VM_INTINFO_SHIFT_ERRCODE)
#define	VM_INTINFO_PENDING(val)	(((val) & VM_INTINFO_VALID) != 0)
#define	VM_INTINFO_HAS_ERRCODE(val) (((val) & VM_INTINFO_DEL_ERRCODE) != 0)

#define	VM_INTINFO_VALID	(1UL << 31)
#define	VM_INTINFO_DEL_ERRCODE	(1UL << 11)

/* Event types carried in bits 8-10 */
#define	VM_INTINFO_HWINTR	(0 << 8)
#define	VM_INTINFO_NMI		(2 << 8)
#define	VM_INTINFO_HWEXCP	(3 << 8)
#define	VM_INTINFO_SWINTR	(4 << 8)
/* Reserved for CPU (read: Intel) specific types */
#define	VM_INTINFO_RESV1	(1 << 8)
#define	VM_INTINFO_RESV5	(5 << 8)
#define	VM_INTINFO_RESV6	(6 << 8)
#define	VM_INTINFO_RESV7	(7 << 8)
137 
/*
 * illumos doesn't have a limitation based on SPECNAMELEN like FreeBSD does.
 * To simplify structure definitions, an arbitrary limit has been chosen.
 * This same limit is used for memory segment names.
 */

#define	VM_MAX_NAMELEN		128
#define	VM_MAX_SEG_NAMELEN	128

#ifdef _KERNEL
#define	VM_MAXCPU	32			/* maximum virtual cpus */
#endif
150 
/*
 * Identifiers for optional vmm capabilities.
 * These select per-vCPU behaviors (such as exiting on HLT/PAUSE/MTRAP or
 * breakpoint exceptions); VM_CAP_MAX bounds the set.
 */
enum vm_cap_type {
	VM_CAP_HALT_EXIT,
	VM_CAP_MTRAP_EXIT,
	VM_CAP_PAUSE_EXIT,
	VM_CAP_ENABLE_INVPCID,
	VM_CAP_BPT_EXIT,
	VM_CAP_MAX
};
162 
/*
 * Bit flags describing Intel VMX hardware capabilities in use
 * (TPR shadowing and APIC virtualization features).
 */
enum vmx_caps {
	VMX_CAP_NONE		= 0,
	VMX_CAP_TPR_SHADOW	= (1UL << 0),
	VMX_CAP_APICV		= (1UL << 1),
	VMX_CAP_APICV_X2APIC	= (1UL << 2),
	VMX_CAP_APICV_PIR	= (1UL << 3),
};
170 
/* Interrupt trigger mode: edge- or level-triggered. */
enum vm_intr_trigger {
	EDGE_TRIGGER,
	LEVEL_TRIGGER
};
175 
/*
 * Cached (hidden) state of a segment register: base, limit and access
 * rights.
 *
 * The 'access' field has the format specified in Table 21-2 of the Intel
 * Architecture Manual vol 3b.
 *
 * XXX The contents of the 'access' field are architecturally defined except
 * bit 16 - Segment Unusable.
 */
struct seg_desc {
	uint64_t	base;
	uint32_t	limit;
	uint32_t	access;
};
/* Accessors for the individual fields of seg_desc`access */
#define	SEG_DESC_TYPE(access)		((access) & 0x001f)
#define	SEG_DESC_DPL_MASK		0x3
#define	SEG_DESC_DPL_SHIFT		5
#define	SEG_DESC_DPL(access)		\
	(((access) >> SEG_DESC_DPL_SHIFT) & SEG_DESC_DPL_MASK)
#define	SEG_DESC_PRESENT(access)	(((access) & 0x0080) ? 1 : 0)
#define	SEG_DESC_DEF32(access)		(((access) & 0x4000) ? 1 : 0)
#define	SEG_DESC_GRANULARITY(access)	(((access) & 0x8000) ? 1 : 0)
#define	SEG_DESC_UNUSABLE(access)	(((access) & 0x10000) ? 1 : 0)
197 
/* Operating mode of a vCPU. */
enum vm_cpu_mode {
	CPU_MODE_REAL,
	CPU_MODE_PROTECTED,
	CPU_MODE_COMPATIBILITY,		/* IA-32E mode (CS.L = 0) */
	CPU_MODE_64BIT,			/* IA-32E mode (CS.L = 1) */
};
204 
/* Paging mode in effect on a vCPU (flat = paging disabled). */
enum vm_paging_mode {
	PAGING_MODE_FLAT,
	PAGING_MODE_32,
	PAGING_MODE_PAE,
	PAGING_MODE_64,
};
211 
/*
 * Snapshot of guest paging context used when translating guest linear
 * addresses (e.g. during instruction emulation).
 */
struct vm_guest_paging {
	uint64_t	cr3;		/* guest page-table root */
	int		cpl;		/* current privilege level */
	enum vm_cpu_mode cpu_mode;
	enum vm_paging_mode paging_mode;
};
218 
/*
 * Reason a vCPU stopped running, reported in vm_exit`exitcode.  The
 * associated payload (if any) lives in the corresponding member of the
 * vm_exit`u union below.
 */
enum vm_exitcode {
	VM_EXITCODE_INOUT,
	VM_EXITCODE_VMX,
	VM_EXITCODE_BOGUS,
	VM_EXITCODE_RDMSR,
	VM_EXITCODE_WRMSR,
	VM_EXITCODE_HLT,
	VM_EXITCODE_MTRAP,
	VM_EXITCODE_PAUSE,
	VM_EXITCODE_PAGING,
	VM_EXITCODE_INST_EMUL,
	VM_EXITCODE_RUN_STATE,
	VM_EXITCODE_MMIO_EMUL,
	VM_EXITCODE_DEPRECATED,	/* formerly RUNBLOCK */
	VM_EXITCODE_IOAPIC_EOI,
	VM_EXITCODE_SUSPENDED,
	VM_EXITCODE_MMIO,
	VM_EXITCODE_TASK_SWITCH,
	VM_EXITCODE_MONITOR,
	VM_EXITCODE_MWAIT,
	VM_EXITCODE_SVM,
	VM_EXITCODE_DEPRECATED2, /* formerly REQIDLE */
	VM_EXITCODE_DEBUG,
	VM_EXITCODE_VMINSN,
	VM_EXITCODE_BPT,
	VM_EXITCODE_HT,
	VM_EXITCODE_MAX
};
247 
/* Flags carried in vm_inout`flags for IN/OUT port-I/O exits. */
enum inout_flags {
	INOUT_IN	= (1U << 0), /* direction: 'in' when set, else 'out' */

	/*
	 * The following flags are used only for in-kernel emulation logic and
	 * are not exposed to userspace.
	 */
	INOUT_STR	= (1U << 1), /* ins/outs operation */
	INOUT_REP	= (1U << 2), /* 'rep' prefix present on instruction */
};
258 
/*
 * Description of an IN/OUT port access, both as exit payload and as the
 * fulfillment data supplied back on entry (see VEC_FULFILL_INOUT).
 */
struct vm_inout {
	uint32_t	eax;		/* data in (OUT) or out (IN) */
	uint16_t	port;
	uint8_t		bytes;		/* 1 or 2 or 4 */
	uint8_t		flags;		/* see: inout_flags */

	/*
	 * The address size and segment are relevant to INS/OUTS operations.
	 * Userspace is not concerned with them since the in-kernel emulation
	 * handles those specific aspects.
	 */
	uint8_t		addrsize;
	uint8_t		segment;
};
273 
/*
 * Description of an MMIO access, both as exit payload and as the
 * fulfillment data supplied back on entry (see VEC_FULFILL_MMIO).
 */
struct vm_mmio {
	uint8_t		bytes;		/* 1/2/4/8 bytes */
	uint8_t		read;		/* read: 1, write: 0 */
	uint16_t	_pad[3];	/* explicit padding before gpa */
	uint64_t	gpa;		/* guest-physical address accessed */
	uint64_t	data;		/* value written, or read result */
};
281 
/* What triggered a hardware task switch. */
enum task_switch_reason {
	TSR_CALL,
	TSR_IRET,
	TSR_JMP,
	TSR_IDT_GATE,	/* task gate in IDT */
};
288 
/* Exit payload for VM_EXITCODE_TASK_SWITCH. */
struct vm_task_switch {
	uint16_t	tsssel;		/* new TSS selector */
	int		ext;		/* task switch due to external event */
	uint32_t	errcode;
	int		errcode_valid;	/* push 'errcode' on the new stack */
	enum task_switch_reason reason;
	struct vm_guest_paging paging;	/* paging context at switch time */
};
297 
/*
 * Run-state of a vCPU with respect to the INIT/SIPI start-up sequence.
 * VRS_HALT (0) is the waiting-for-start-up state; VRS_INIT and VRS_RUN
 * indicate progress through start-up, while VRS_PEND_INIT and
 * VRS_PEND_SIPI latch events that have been received but not yet acted
 * upon.
 */
enum vcpu_run_state {
	VRS_HALT		= 0,
	VRS_INIT		= (1 << 0),
	VRS_RUN			= (1 << 1),

	VRS_PEND_INIT		= (1 << 14),
	VRS_PEND_SIPI		= (1 << 15),
};
/*
 * Mask a value down to only the defined run-state flags.
 *
 * Fix: the mask previously listed VRS_PEND_SIPI twice and omitted
 * VRS_PEND_INIT, causing VRS_IS_VALID() to reject any otherwise-valid
 * value carrying the pending-INIT flag.
 */
#define	VRS_MASK_VALID(v)	\
	((v) & (VRS_INIT | VRS_RUN | VRS_PEND_INIT | VRS_PEND_SIPI))
/* A value is valid iff it contains no bits outside the defined flags. */
#define	VRS_IS_VALID(v)		((v) == VRS_MASK_VALID(v))
309 
/*
 * Information describing why a vCPU stopped running, communicated to
 * userspace on exit from VM_RUN.  Which member of 'u' is valid depends
 * on 'exitcode'.
 */
struct vm_exit {
	enum vm_exitcode	exitcode;
	int			inst_length;	/* 0 means unknown */
	uint64_t		rip;		/* guest %rip at time of exit */
	union {
		struct vm_inout	inout;		/* VM_EXITCODE_INOUT */
		struct vm_mmio	mmio;		/* VM_EXITCODE_MMIO */
		/* VM_EXITCODE_PAGING */
		struct {
			uint64_t	gpa;
			int		fault_type;
		} paging;
		/*
		 * Kernel-internal MMIO decoding and emulation.
		 * Userspace should not expect to see this, but rather a
		 * VM_EXITCODE_MMIO with the above 'mmio' context.
		 */
		struct {
			uint64_t	gpa;
			uint64_t	gla;
			uint64_t	cs_base;
			int		cs_d;		/* CS.D */
		} mmio_emul;
		/* Raw instruction bytes for kernel-internal emulation */
		struct {
			uint8_t		inst[15];
			uint8_t		num_valid;
		} inst_emul;
		/*
		 * VMX specific payload. Used when there is no "better"
		 * exitcode to represent the VM-exit.
		 */
		struct {
			int		status;		/* vmx inst status */
			/*
			 * 'exit_reason' and 'exit_qualification' are valid
			 * only if 'status' is zero.
			 */
			uint32_t	exit_reason;
			uint64_t	exit_qualification;
			/*
			 * 'inst_error' and 'inst_type' are valid
			 * only if 'status' is non-zero.
			 */
			int		inst_type;
			int		inst_error;
		} vmx;
		/*
		 * SVM specific payload.
		 */
		struct {
			uint64_t	exitcode;
			uint64_t	exitinfo1;
			uint64_t	exitinfo2;
		} svm;
		/* VM_EXITCODE_BPT */
		struct {
			int		inst_length;
		} bpt;
		/* VM_EXITCODE_RDMSR / VM_EXITCODE_WRMSR */
		struct {
			uint32_t	code;		/* ecx value */
			uint64_t	wval;		/* value being written */
		} msr;
		/* VM_EXITCODE_HLT */
		struct {
			uint64_t	rflags;
		} hlt;
		/* VM_EXITCODE_IOAPIC_EOI */
		struct {
			int		vector;
		} ioapic_eoi;
		/* VM_EXITCODE_SUSPENDED */
		struct {
			enum vm_suspend_how how;
			/*
			 * Source vcpuid for suspend status.  Typically -1,
			 * except for triple-fault events which occur on a
			 * specific faulting vCPU.
			 */
			int source;
			/*
			 * When suspend status was set on VM, measured in
			 * nanoseconds since VM boot.
			 */
			uint64_t when;
		} suspended;
		struct vm_task_switch task_switch;
	} u;
};
393 
/*
 * Commands (and flags) passed in vm_entry`cmd when (re-)entering VM_RUN,
 * primarily to deliver results for emulation the kernel requested via a
 * prior exit.
 */
enum vm_entry_cmds {
	VEC_DEFAULT = 0,
	VEC_DISCARD_INSTR,	/* discard inst emul state */
	VEC_FULFILL_MMIO,	/* entry includes result for mmio emul */
	VEC_FULFILL_INOUT,	/* entry includes result for inout emul */

	/* Below are flags which can be combined with the above commands: */

	/*
	 * Exit to userspace when vCPU is in consistent state: when any pending
	 * instruction emulation tasks have been completed and committed to the
	 * architecturally defined state.
	 */
	/*
	 * NOTE(review): 1 << 31 shifts into the sign bit of int, which is
	 * undefined behavior in strict C; consider (int)(1U << 31) — confirm
	 * against consumers of the uint_t vm_entry`cmd field.
	 */
	VEC_FLAG_EXIT_CONSISTENT	= 1 << 31,
};
409 
/*
 * Userspace-supplied payload for entering VM_RUN.  The union carries
 * fulfillment data when 'cmd' is VEC_FULFILL_MMIO or VEC_FULFILL_INOUT.
 */
struct vm_entry {
	int cpuid;		/* vCPU to run */
	uint_t cmd;		/* see: vm_entry_cmds */
	void *exit_data;	/* where exit info is written on return */
	union {
		struct vm_inout inout;
		struct vm_mmio mmio;
	} u;
};
419 
/*
 * Arrange for the vCPU to re-execute the current instruction on next
 * entry (presumably by resetting any in-progress emulation state —
 * confirm against the vmm implementation).
 */
int vm_restart_instruction(void *vm, int vcpuid);
421 
/* Flags accepted when creating a VM instance. */
enum vm_create_flags {
	/*
	 * Allocate guest memory segments from existing reservoir capacity,
	 * rather than attempting to create transient allocations.
	 */
	VCF_RESERVOIR_MEM = (1 << 0),

	/*
	 * Enable dirty page tracking for the guest.
	 */
	VCF_TRACK_DIRTY = (1 << 1),
};
434 
/*
 * Describes an entry for `cpuid` emulation.
 * Used internally by bhyve (kernel) in addition to exposed ioctl(2) interface.
 */
struct vcpu_cpuid_entry {
	uint32_t	vce_function;	/* cpuid function (leaf, %eax input) */
	uint32_t	vce_index;	/* cpuid index (sub-leaf, %ecx input) */
	uint32_t	vce_flags;	/* see VCE_FLAG_* below */
	uint32_t	vce_eax;	/* result registers for a match */
	uint32_t	vce_ebx;
	uint32_t	vce_ecx;
	uint32_t	vce_edx;
	uint32_t	_pad;		/* explicit padding to 8-byte multiple */
};
449 
/*
 * Defined flags for vcpu_cpuid_entry`vce_flags are below.
 */

/* Use index (ecx) input value when matching entry */
#define	VCE_FLAG_MATCH_INDEX		(1 << 0)

/* All valid flags for vcpu_cpuid_entry`vce_flags */
#define	VCE_FLAGS_VALID		VCE_FLAG_MATCH_INDEX

/*
 * Defined flags for vcpu_cpuid configuration are below.
 * These are used by both the ioctl(2) interface via vm_vcpu_cpuid_config and
 * internally in the kernel vmm.
 */

/* Use legacy hard-coded cpuid masking tables applied to the host CPU */
#define	VCC_FLAG_LEGACY_HANDLING	(1 << 0)
/*
 * Emulate Intel-style fallback behavior (emit highest "standard" entry) if the
 * queried function/index do not match.  If not set, emulate AMD-style, where
 * all zeroes are returned in such cases.
 */
#define	VCC_FLAG_INTEL_FALLBACK		(1 << 1)

/* All valid flags for vm_vcpu_cpuid_config`vvcc_flags */
#define	VCC_FLAGS_VALID		\
	(VCC_FLAG_LEGACY_HANDLING | VCC_FLAG_INTEL_FALLBACK)

/* Maximum vcpu_cpuid_entry records per vCPU */
#define	VMM_MAX_CPUID_ENTRIES		256
481 
482 #endif	/* _VMM_H_ */
483