xref: /linux/arch/x86/kvm/vmx/nested.c (revision 8262fe85b4edc5fb3dd7b9520bf5c6b4f027fa55)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include <linux/frame.h>
4 #include <linux/percpu.h>
5 
6 #include <asm/debugreg.h>
7 #include <asm/mmu_context.h>
8 
9 #include "cpuid.h"
10 #include "hyperv.h"
11 #include "mmu.h"
12 #include "nested.h"
13 #include "pmu.h"
14 #include "trace.h"
15 #include "x86.h"
16 
/*
 * Use a shadow VMCS to accelerate emulation of L1's VMREAD/VMWRITE;
 * consumed below when allocating/freeing vmcs01.shadow_vmcs.
 */
static bool __read_mostly enable_shadow_vmcs = 1;
module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);

/* NOTE(review): presumably gates pre-VM-entry consistency checking of the
 * nested state; the consumer is outside this chunk — confirm there. */
static bool __read_mostly nested_early_check = 0;
module_param(nested_early_check, bool, S_IRUGO);
22 
/*
 * Consistency Check: evaluate @consistency_check and, on failure, emit a
 * tracepoint carrying the stringified expression so the failing check can
 * be identified from a trace.  Expands to the boolean result, so it can be
 * used directly inside conditionals.
 */
#define CC(consistency_check)						\
({									\
	bool failed = (consistency_check);				\
	if (failed)							\
		trace_kvm_nested_vmenter_failed(#consistency_check, 0);	\
	failed;								\
})
30 
/*
 * Write an MSR via kvm_set_msr() and warn (ratelimited) if the write is
 * rejected.  Expands to true on failure so callers can bail out.
 */
#define SET_MSR_OR_WARN(vcpu, idx, data)				\
({									\
	bool failed = kvm_set_msr(vcpu, idx, data);			\
	if (failed)							\
		pr_warn_ratelimited(					\
				"%s cannot write MSR (0x%x, 0x%llx)\n",	\
				__func__, idx, data);			\
	failed;								\
})
40 
/*
 * Hyper-V requires all of these, so mark them as supported even though
 * they are just treated the same as all-context.
 */
#define VMX_VPID_EXTENT_SUPPORTED_MASK		\
	(VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT |	\
	VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT |	\
	VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT |	\
	VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT)

/* NOTE(review): rate exponent advertised for the emulated VMX preemption
 * timer; the scaling itself is applied outside this chunk — confirm there. */
#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5
52 
/* Indices into vmx_bitmap[] for the VMREAD and VMWRITE shadowing bitmaps. */
enum {
	VMX_VMREAD_BITMAP,
	VMX_VMWRITE_BITMAP,
	VMX_BITMAP_NR
};
/* One PAGE_SIZE bitmap each; allocation is not visible in this chunk. */
static unsigned long *vmx_bitmap[VMX_BITMAP_NR];

#define vmx_vmread_bitmap                    (vmx_bitmap[VMX_VMREAD_BITMAP])
#define vmx_vmwrite_bitmap                   (vmx_bitmap[VMX_VMWRITE_BITMAP])
62 
/* Maps a VMCS field encoding to its byte offset within struct vmcs12. */
struct shadow_vmcs_field {
	u16	encoding;
	u16	offset;
};
/*
 * Tables of shadowed fields, generated by expanding the SHADOW_FIELD_RO/RW
 * entries in vmcs_shadow_fields.h.  Both tables (and their counts) are
 * compacted at runtime by init_vmcs_shadow_fields().
 */
static struct shadow_vmcs_field shadow_read_only_fields[] = {
#define SHADOW_FIELD_RO(x, y) { x, offsetof(struct vmcs12, y) },
#include "vmcs_shadow_fields.h"
};
static int max_shadow_read_only_fields =
	ARRAY_SIZE(shadow_read_only_fields);

static struct shadow_vmcs_field shadow_read_write_fields[] = {
#define SHADOW_FIELD_RW(x, y) { x, offsetof(struct vmcs12, y) },
#include "vmcs_shadow_fields.h"
};
static int max_shadow_read_write_fields =
	ARRAY_SIZE(shadow_read_write_fields);
80 
/*
 * Build the VMREAD/VMWRITE shadowing bitmaps and compact the shadow field
 * tables in place.  A clear bit in a bitmap means the corresponding field
 * access is shadowed (no VM exit).  On 32-bit kernels a 64-bit field is
 * handled as two 32-bit halves, so the odd (high-half) encoding gets its
 * own table entry with the offset bumped by sizeof(u32); on 64-bit kernels
 * the odd encoding is dropped from the table entirely.
 */
static void init_vmcs_shadow_fields(void)
{
	int i, j;

	/* Start fully intercepted: every bit set => no shadowing. */
	memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
	memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);

	for (i = j = 0; i < max_shadow_read_only_fields; i++) {
		struct shadow_vmcs_field entry = shadow_read_only_fields[i];
		u16 field = entry.encoding;

		/*
		 * Every 64-bit field must be immediately followed by its
		 * high-half encoding (field + 1) for 32-bit kernels.
		 */
		if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
		    (i + 1 == max_shadow_read_only_fields ||
		     shadow_read_only_fields[i + 1].encoding != field + 1))
			pr_err("Missing field from shadow_read_only_field %x\n",
			       field + 1);

		clear_bit(field, vmx_vmread_bitmap);
		if (field & 1)
#ifdef CONFIG_X86_64
			continue;
#else
			entry.offset += sizeof(u32);
#endif
		shadow_read_only_fields[j++] = entry;
	}
	max_shadow_read_only_fields = j;

	for (i = j = 0; i < max_shadow_read_write_fields; i++) {
		struct shadow_vmcs_field entry = shadow_read_write_fields[i];
		u16 field = entry.encoding;

		if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
		    (i + 1 == max_shadow_read_write_fields ||
		     shadow_read_write_fields[i + 1].encoding != field + 1))
			pr_err("Missing field from shadow_read_write_field %x\n",
			       field + 1);

		WARN_ONCE(field >= GUEST_ES_AR_BYTES &&
			  field <= GUEST_TR_AR_BYTES,
			  "Update vmcs12_write_any() to drop reserved bits from AR_BYTES");

		/*
		 * PML and the preemption timer can be emulated, but the
		 * processor cannot vmwrite to fields that don't exist
		 * on bare metal.
		 */
		switch (field) {
		case GUEST_PML_INDEX:
			if (!cpu_has_vmx_pml())
				continue;
			break;
		case VMX_PREEMPTION_TIMER_VALUE:
			if (!cpu_has_vmx_preemption_timer())
				continue;
			break;
		case GUEST_INTR_STATUS:
			if (!cpu_has_vmx_apicv())
				continue;
			break;
		default:
			break;
		}

		/* Read-write fields are shadowed for both access kinds. */
		clear_bit(field, vmx_vmwrite_bitmap);
		clear_bit(field, vmx_vmread_bitmap);
		if (field & 1)
#ifdef CONFIG_X86_64
			continue;
#else
			entry.offset += sizeof(u32);
#endif
		shadow_read_write_fields[j++] = entry;
	}
	max_shadow_read_write_fields = j;
}
157 
158 /*
159  * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
160  * set the success or error code of an emulated VMX instruction (as specified
161  * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated
162  * instruction.
163  */
164 static int nested_vmx_succeed(struct kvm_vcpu *vcpu)
165 {
166 	vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
167 			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
168 			    X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
169 	return kvm_skip_emulated_instruction(vcpu);
170 }
171 
172 static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
173 {
174 	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
175 			& ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
176 			    X86_EFLAGS_SF | X86_EFLAGS_OF))
177 			| X86_EFLAGS_CF);
178 	return kvm_skip_emulated_instruction(vcpu);
179 }
180 
181 static int nested_vmx_failValid(struct kvm_vcpu *vcpu,
182 				u32 vm_instruction_error)
183 {
184 	struct vcpu_vmx *vmx = to_vmx(vcpu);
185 
186 	/*
187 	 * failValid writes the error number to the current VMCS, which
188 	 * can't be done if there isn't a current VMCS.
189 	 */
190 	if (vmx->nested.current_vmptr == -1ull && !vmx->nested.hv_evmcs)
191 		return nested_vmx_failInvalid(vcpu);
192 
193 	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
194 			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
195 			    X86_EFLAGS_SF | X86_EFLAGS_OF))
196 			| X86_EFLAGS_ZF);
197 	get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
198 	/*
199 	 * We don't need to force a shadow sync because
200 	 * VM_INSTRUCTION_ERROR is not shadowed
201 	 */
202 	return kvm_skip_emulated_instruction(vcpu);
203 }
204 
/*
 * Emulate a VMX abort: take down the guest with a triple fault and log the
 * abort indicator.  Real hardware would shut down the logical processor.
 */
static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
{
	/* TODO: not to reset guest simply here. */
	kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
	pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator);
}
211 
/*
 * Check a 32-bit VMX control value against its allowed-0 (@low) and
 * allowed-1 (@high) settings.
 */
static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
{
	return fixed_bits_valid(control, low, high);
}
216 
217 static inline u64 vmx_control_msr(u32 low, u32 high)
218 {
219 	return low | ((u64)high << 32);
220 }
221 
/*
 * Stop using the shadow VMCS: clear the execution control, invalidate the
 * VMCS link pointer, and drop any pending vmcs12 sync request.
 */
static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
{
	secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
	vmcs_write64(VMCS_LINK_POINTER, -1ull);
	vmx->nested.need_vmcs12_to_shadow_sync = false;
}
228 
/*
 * Release the mapped enlightened VMCS (Hyper-V), if any, and reset the
 * cached pointer/vmptr so a stale mapping can't be reused.
 */
static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (!vmx->nested.hv_evmcs)
		return;

	/* "true" marks the page dirty before unmapping. */
	kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true);
	vmx->nested.hv_evmcs_vmptr = -1ull;
	vmx->nested.hv_evmcs = NULL;
}
240 
241 /*
242  * Free whatever needs to be freed from vmx->nested when L1 goes down, or
243  * just stops using VMX.
244  */
/*
 * Free whatever needs to be freed from vmx->nested when L1 goes down, or
 * just stops using VMX.
 */
static void free_nested(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/* Nothing to do unless VMX was on (possibly latched across SMM). */
	if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
		return;

	kvm_clear_request(KVM_REQ_GET_VMCS12_PAGES, vcpu);

	vmx->nested.vmxon = false;
	vmx->nested.smm.vmxon = false;
	free_vpid(vmx->nested.vpid02);
	vmx->nested.posted_intr_nv = -1;
	vmx->nested.current_vmptr = -1ull;
	if (enable_shadow_vmcs) {
		vmx_disable_shadow_vmcs(vmx);
		vmcs_clear(vmx->vmcs01.shadow_vmcs);
		free_vmcs(vmx->vmcs01.shadow_vmcs);
		vmx->vmcs01.shadow_vmcs = NULL;
	}
	kfree(vmx->nested.cached_vmcs12);
	vmx->nested.cached_vmcs12 = NULL;
	kfree(vmx->nested.cached_shadow_vmcs12);
	vmx->nested.cached_shadow_vmcs12 = NULL;
	/* Unpin physical memory we referred to in the vmcs02 */
	if (vmx->nested.apic_access_page) {
		kvm_release_page_clean(vmx->nested.apic_access_page);
		vmx->nested.apic_access_page = NULL;
	}
	kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
	kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
	vmx->nested.pi_desc = NULL;

	/* Drop the shadow EPT roots built for L2. */
	kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);

	nested_release_evmcs(vcpu);

	free_loaded_vmcs(&vmx->nested.vmcs02);
}
284 
/*
 * Propagate the host segment state cached in @prev into the newly loaded
 * VMCS's host-state tracking, so the lazy host-state machinery stays
 * consistent across a vmcs01 <-> vmcs02 switch.  Only needed while guest
 * state is loaded.
 */
static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
				     struct loaded_vmcs *prev)
{
	struct vmcs_host_state *dest, *src;

	if (unlikely(!vmx->guest_state_loaded))
		return;

	src = &prev->host_state;
	dest = &vmx->loaded_vmcs->host_state;

	vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base);
	dest->ldt_sel = src->ldt_sel;
#ifdef CONFIG_X86_64
	dest->ds_sel = src->ds_sel;
	dest->es_sel = src->es_sel;
#endif
}
303 
/*
 * Make @vmcs the vCPU's current loaded VMCS.  Done with preemption
 * disabled (get_cpu/put_cpu) so the per-CPU VMCS load and the host-state
 * sync happen on one CPU.  The segment cache is stale afterwards and must
 * be cleared.
 */
static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct loaded_vmcs *prev;
	int cpu;

	if (vmx->loaded_vmcs == vmcs)
		return;

	cpu = get_cpu();
	prev = vmx->loaded_vmcs;
	vmx->loaded_vmcs = vmcs;
	vmx_vcpu_load_vmcs(vcpu, cpu);
	vmx_sync_vmcs_host_state(vmx, prev);
	put_cpu();

	vmx_segment_cache_clear(vmx);
}
322 
323 /*
324  * Ensure that the current vmcs of the logical processor is the
325  * vmcs01 of the vcpu before calling free_nested().
326  */
void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu)
{
	vcpu_load(vcpu);
	/* Force an L2->L1 exit first so vmcs01 state is consistent. */
	vmx_leave_nested(vcpu);
	vmx_switch_vmcs(vcpu, &to_vmx(vcpu)->vmcs01);
	free_nested(vcpu);
	vcpu_put(vcpu);
}
335 
/*
 * Reflect an EPT-induced fault in L2 back to L1 as the appropriate nested
 * VM exit (PML_FULL, EPT_MISCONFIG or EPT_VIOLATION) and record the
 * faulting guest-physical address in vmcs12.
 */
static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
		struct x86_exception *fault)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u32 exit_reason;
	unsigned long exit_qualification = vcpu->arch.exit_qualification;

	if (vmx->nested.pml_full) {
		exit_reason = EXIT_REASON_PML_FULL;
		vmx->nested.pml_full = false;
		/* For PML-full, only the NMI-unblocking bit is reported. */
		exit_qualification &= INTR_INFO_UNBLOCK_NMI;
	} else if (fault->error_code & PFERR_RSVD_MASK)
		exit_reason = EXIT_REASON_EPT_MISCONFIG;
	else
		exit_reason = EXIT_REASON_EPT_VIOLATION;

	nested_vmx_vmexit(vcpu, exit_reason, 0, exit_qualification);
	vmcs12->guest_physical_address = fault->address;
}
356 
/*
 * Switch the vCPU to the shadow-EPT MMU for running L2 with nested EPT:
 * guest_mmu handles L2's GPA->HPA translation using L1's EPT tables, while
 * nested_mmu is used to walk L2's own page tables.
 */
static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
{
	WARN_ON(mmu_is_nested(vcpu));

	vcpu->arch.mmu = &vcpu->arch.guest_mmu;
	kvm_init_shadow_ept_mmu(vcpu,
			to_vmx(vcpu)->nested.msrs.ept_caps &
			VMX_EPT_EXECUTE_ONLY_BIT,
			nested_ept_ad_enabled(vcpu),
			nested_ept_get_cr3(vcpu));
	/* EPT faults in L2 must be reflected to L1, not injected as #PF. */
	vcpu->arch.mmu->set_cr3           = vmx_set_cr3;
	vcpu->arch.mmu->get_cr3           = nested_ept_get_cr3;
	vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault;
	vcpu->arch.mmu->get_pdptr         = kvm_pdptr_read;

	vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
}
374 
375 static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
376 {
377 	vcpu->arch.mmu = &vcpu->arch.root_mmu;
378 	vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
379 }
380 
381 static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
382 					    u16 error_code)
383 {
384 	bool inequality, bit;
385 
386 	bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0;
387 	inequality =
388 		(error_code & vmcs12->page_fault_error_code_mask) !=
389 		 vmcs12->page_fault_error_code_match;
390 	return inequality ^ bit;
391 }
392 
393 
394 /*
395  * KVM wants to inject page-faults which it got to the guest. This function
396  * checks whether in a nested guest, we need to inject them to L1 or L2.
397  */
static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit_qual)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	unsigned int nr = vcpu->arch.exception.nr;
	bool has_payload = vcpu->arch.exception.has_payload;
	unsigned long payload = vcpu->arch.exception.payload;

	if (nr == PF_VECTOR) {
		/* Async page faults for L2 always go to L1. */
		if (vcpu->arch.exception.nested_apf) {
			*exit_qual = vcpu->arch.apf.nested_apf_token;
			return 1;
		}
		if (nested_vmx_is_page_fault_vmexit(vmcs12,
						    vcpu->arch.exception.error_code)) {
			/* Exit qualification carries the faulting address. */
			*exit_qual = has_payload ? payload : vcpu->arch.cr2;
			return 1;
		}
	} else if (vmcs12->exception_bitmap & (1u << nr)) {
		if (nr == DB_VECTOR) {
			/*
			 * Without a payload, reconstruct the #DB exit
			 * qualification from DR6 (drop the fixed/BT bits,
			 * flip RTM which is active-low in DR6).
			 */
			if (!has_payload) {
				payload = vcpu->arch.dr6;
				payload &= ~(DR6_FIXED_1 | DR6_BT);
				payload ^= DR6_RTM;
			}
			*exit_qual = payload;
		} else
			*exit_qual = 0;
		return 1;
	}

	/* Deliver to L2; no VM exit to L1. */
	return 0;
}
430 
431 
/*
 * Deliver a #PF that occurred while running L2: reflect it to L1 as an
 * exception VM exit when L1 intercepts it (and no nested VM entry is in
 * flight), otherwise inject it directly into L2.
 */
static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
		struct x86_exception *fault)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	WARN_ON(!is_guest_mode(vcpu));

	if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) &&
		!to_vmx(vcpu)->nested.nested_run_pending) {
		vmcs12->vm_exit_intr_error_code = fault->error_code;
		nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
				  PF_VECTOR | INTR_TYPE_HARD_EXCEPTION |
				  INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK,
				  fault->address);
	} else {
		kvm_inject_page_fault(vcpu, fault);
	}
}
450 
451 static bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa)
452 {
453 	return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu));
454 }
455 
456 static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu,
457 					       struct vmcs12 *vmcs12)
458 {
459 	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
460 		return 0;
461 
462 	if (CC(!page_address_valid(vcpu, vmcs12->io_bitmap_a)) ||
463 	    CC(!page_address_valid(vcpu, vmcs12->io_bitmap_b)))
464 		return -EINVAL;
465 
466 	return 0;
467 }
468 
469 static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
470 						struct vmcs12 *vmcs12)
471 {
472 	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
473 		return 0;
474 
475 	if (CC(!page_address_valid(vcpu, vmcs12->msr_bitmap)))
476 		return -EINVAL;
477 
478 	return 0;
479 }
480 
481 static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu,
482 						struct vmcs12 *vmcs12)
483 {
484 	if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
485 		return 0;
486 
487 	if (CC(!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr)))
488 		return -EINVAL;
489 
490 	return 0;
491 }
492 
493 /*
494  * Check if MSR is intercepted for L01 MSR bitmap.
495  */
static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr)
{
	unsigned long *msr_bitmap;
	int f = sizeof(unsigned long); /* byte offsets -> word pointers */

	/* Without an MSR bitmap, every MSR access is intercepted. */
	if (!cpu_has_vmx_msr_bitmap())
		return true;

	msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap;

	/* Write-intercept bits: low MSRs at 0x800, high MSRs at 0xc00. */
	if (msr <= 0x1fff) {
		return !!test_bit(msr, msr_bitmap + 0x800 / f);
	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
		msr &= 0x1fff;
		return !!test_bit(msr, msr_bitmap + 0xc00 / f);
	}

	/* MSRs outside the two bitmap-covered ranges always intercept. */
	return true;
}
515 
516 /*
517  * If a msr is allowed by L0, we should check whether it is allowed by L1.
518  * The corresponding bit will be cleared unless both of L0 and L1 allow it.
519  */
static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1,
					       unsigned long *msr_bitmap_nested,
					       u32 msr, int type)
{
	int f = sizeof(unsigned long); /* byte offsets -> word pointers */

	/*
	 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
	 * have the write-low and read-high bitmap offsets the wrong way round.
	 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
	 */
	if (msr <= 0x1fff) {
		if (type & MSR_TYPE_R &&
		   !test_bit(msr, msr_bitmap_l1 + 0x000 / f))
			/* read-low */
			__clear_bit(msr, msr_bitmap_nested + 0x000 / f);

		if (type & MSR_TYPE_W &&
		   !test_bit(msr, msr_bitmap_l1 + 0x800 / f))
			/* write-low */
			__clear_bit(msr, msr_bitmap_nested + 0x800 / f);

	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
		msr &= 0x1fff;
		if (type & MSR_TYPE_R &&
		   !test_bit(msr, msr_bitmap_l1 + 0x400 / f))
			/* read-high */
			__clear_bit(msr, msr_bitmap_nested + 0x400 / f);

		if (type & MSR_TYPE_W &&
		   !test_bit(msr, msr_bitmap_l1 + 0xc00 / f))
			/* write-high */
			__clear_bit(msr, msr_bitmap_nested + 0xc00 / f);

	}
}
556 
557 static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap) {
558 	int msr;
559 
560 	for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
561 		unsigned word = msr / BITS_PER_LONG;
562 
563 		msr_bitmap[word] = ~0;
564 		msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
565 	}
566 }
567 
568 /*
569  * Merge L0's and L1's MSR bitmap, return false to indicate that
570  * we do not use the hardware.
571  */
static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
						 struct vmcs12 *vmcs12)
{
	int msr;
	unsigned long *msr_bitmap_l1;
	unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap;
	struct kvm_host_map *map = &to_vmx(vcpu)->nested.msr_bitmap_map;

	/* Nothing to do if the MSR bitmap is not in use.  */
	if (!cpu_has_vmx_msr_bitmap() ||
	    !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
		return false;

	/* Map L1's bitmap; on failure fall back to intercepting everything. */
	if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map))
		return false;

	msr_bitmap_l1 = (unsigned long *)map->hva;

	/*
	 * To keep the control flow simple, pay eight 8-byte writes (sixteen
	 * 4-byte writes on 32-bit systems) up front to enable intercepts for
	 * the x2APIC MSR range and selectively disable them below.
	 */
	enable_x2apic_msr_intercepts(msr_bitmap_l0);

	if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
		if (nested_cpu_has_apic_reg_virt(vmcs12)) {
			/*
			 * L0 need not intercept reads for MSRs between 0x800
			 * and 0x8ff, it just lets the processor take the value
			 * from the virtual-APIC page; take those 256 bits
			 * directly from the L1 bitmap.
			 */
			for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
				unsigned word = msr / BITS_PER_LONG;

				msr_bitmap_l0[word] = msr_bitmap_l1[word];
			}
		}

		/* TPR accesses go through the virtual-APIC page. */
		nested_vmx_disable_intercept_for_msr(
			msr_bitmap_l1, msr_bitmap_l0,
			X2APIC_MSR(APIC_TASKPRI),
			MSR_TYPE_R | MSR_TYPE_W);

		if (nested_cpu_has_vid(vmcs12)) {
			nested_vmx_disable_intercept_for_msr(
				msr_bitmap_l1, msr_bitmap_l0,
				X2APIC_MSR(APIC_EOI),
				MSR_TYPE_W);
			nested_vmx_disable_intercept_for_msr(
				msr_bitmap_l1, msr_bitmap_l0,
				X2APIC_MSR(APIC_SELF_IPI),
				MSR_TYPE_W);
		}
	}

	/* KVM unconditionally exposes the FS/GS base MSRs to L1. */
	nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
					     MSR_FS_BASE, MSR_TYPE_RW);

	nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
					     MSR_GS_BASE, MSR_TYPE_RW);

	nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
					     MSR_KERNEL_GS_BASE, MSR_TYPE_RW);

	/*
	 * Checking the L0->L1 bitmap is trying to verify two things:
	 *
	 * 1. L0 gave a permission to L1 to actually passthrough the MSR. This
	 *    ensures that we do not accidentally generate an L02 MSR bitmap
	 *    from the L12 MSR bitmap that is too permissive.
	 * 2. That L1 or L2s have actually used the MSR. This avoids
	 *    unnecessarily merging of the bitmap if the MSR is unused. This
	 *    works properly because we only update the L01 MSR bitmap lazily.
	 *    So even if L0 should pass L1 these MSRs, the L01 bitmap is only
	 *    updated to reflect this when L1 (or its L2s) actually write to
	 *    the MSR.
	 */
	if (!msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL))
		nested_vmx_disable_intercept_for_msr(
					msr_bitmap_l1, msr_bitmap_l0,
					MSR_IA32_SPEC_CTRL,
					MSR_TYPE_R | MSR_TYPE_W);

	if (!msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD))
		nested_vmx_disable_intercept_for_msr(
					msr_bitmap_l1, msr_bitmap_l0,
					MSR_IA32_PRED_CMD,
					MSR_TYPE_W);

	/* L1's bitmap was only read; "false" avoids marking it dirty. */
	kvm_vcpu_unmap(vcpu, &to_vmx(vcpu)->nested.msr_bitmap_map, false);

	return true;
}
668 
/*
 * Copy the guest's shadow VMCS12 (pointed to by vmcs12's VMCS link
 * pointer) into KVM's cached copy.  Silently does nothing when shadowing
 * is off, the link pointer is invalid, or the page cannot be mapped.
 */
static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu,
				       struct vmcs12 *vmcs12)
{
	struct kvm_host_map map;
	struct vmcs12 *shadow;

	if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
	    vmcs12->vmcs_link_pointer == -1ull)
		return;

	shadow = get_shadow_vmcs12(vcpu);

	if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map))
		return;

	memcpy(shadow, map.hva, VMCS12_SIZE);
	kvm_vcpu_unmap(vcpu, &map, false);
}
687 
/*
 * Write KVM's cached shadow VMCS12 back to the guest page referenced by
 * vmcs12's VMCS link pointer (the inverse of nested_cache_shadow_vmcs12()).
 */
static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu,
					      struct vmcs12 *vmcs12)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
	    vmcs12->vmcs_link_pointer == -1ull)
		return;

	kvm_write_guest(vmx->vcpu.kvm, vmcs12->vmcs_link_pointer,
			get_shadow_vmcs12(vcpu), VMCS12_SIZE);
}
700 
701 /*
702  * In nested virtualization, check if L1 has set
703  * VM_EXIT_ACK_INTR_ON_EXIT
704  */
705 static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu)
706 {
707 	return get_vmcs12(vcpu)->vm_exit_controls &
708 		VM_EXIT_ACK_INTR_ON_EXIT;
709 }
710 
/* Does L1 want NMIs occurring in L2 to cause a VM exit? */
static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu)
{
	return nested_cpu_has_nmi_exiting(get_vmcs12(vcpu));
}
715 
716 static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu,
717 					  struct vmcs12 *vmcs12)
718 {
719 	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
720 	    CC(!page_address_valid(vcpu, vmcs12->apic_access_addr)))
721 		return -EINVAL;
722 	else
723 		return 0;
724 }
725 
/*
 * Validate the mutual consistency of vmcs12's APIC-virtualization controls
 * (virtualize-x2APIC, APIC-register virtualization, virtual-interrupt
 * delivery, posted interrupts).  Returns 0 if consistent, -EINVAL if not.
 */
static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
					   struct vmcs12 *vmcs12)
{
	/* No APICv features in use: nothing to check. */
	if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
	    !nested_cpu_has_apic_reg_virt(vmcs12) &&
	    !nested_cpu_has_vid(vmcs12) &&
	    !nested_cpu_has_posted_intr(vmcs12))
		return 0;

	/*
	 * If virtualize x2apic mode is enabled,
	 * virtualize apic access must be disabled.
	 */
	if (CC(nested_cpu_has_virt_x2apic_mode(vmcs12) &&
	       nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)))
		return -EINVAL;

	/*
	 * If virtual interrupt delivery is enabled,
	 * we must exit on external interrupts.
	 */
	if (CC(nested_cpu_has_vid(vmcs12) && !nested_exit_on_intr(vcpu)))
		return -EINVAL;

	/*
	 * bits 15:8 should be zero in posted_intr_nv,
	 * the descriptor address has been already checked
	 * in nested_get_vmcs12_pages.
	 *
	 * bits 5:0 of posted_intr_desc_addr should be zero.
	 */
	if (nested_cpu_has_posted_intr(vmcs12) &&
	   (CC(!nested_cpu_has_vid(vmcs12)) ||
	    CC(!nested_exit_intr_ack_set(vcpu)) ||
	    CC((vmcs12->posted_intr_nv & 0xff00)) ||
	    CC((vmcs12->posted_intr_desc_addr & 0x3f)) ||
	    CC((vmcs12->posted_intr_desc_addr >> cpuid_maxphyaddr(vcpu)))))
		return -EINVAL;

	/* tpr shadow is needed by all apicv features. */
	if (CC(!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)))
		return -EINVAL;

	return 0;
}
771 
772 static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
773 				       u32 count, u64 addr)
774 {
775 	int maxphyaddr;
776 
777 	if (count == 0)
778 		return 0;
779 	maxphyaddr = cpuid_maxphyaddr(vcpu);
780 	if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr ||
781 	    (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr)
782 		return -EINVAL;
783 
784 	return 0;
785 }
786 
787 static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu,
788 						     struct vmcs12 *vmcs12)
789 {
790 	if (CC(nested_vmx_check_msr_switch(vcpu,
791 					   vmcs12->vm_exit_msr_load_count,
792 					   vmcs12->vm_exit_msr_load_addr)) ||
793 	    CC(nested_vmx_check_msr_switch(vcpu,
794 					   vmcs12->vm_exit_msr_store_count,
795 					   vmcs12->vm_exit_msr_store_addr)))
796 		return -EINVAL;
797 
798 	return 0;
799 }
800 
801 static int nested_vmx_check_entry_msr_switch_controls(struct kvm_vcpu *vcpu,
802                                                       struct vmcs12 *vmcs12)
803 {
804 	if (CC(nested_vmx_check_msr_switch(vcpu,
805 					   vmcs12->vm_entry_msr_load_count,
806 					   vmcs12->vm_entry_msr_load_addr)))
807                 return -EINVAL;
808 
809 	return 0;
810 }
811 
812 static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu,
813 					 struct vmcs12 *vmcs12)
814 {
815 	if (!nested_cpu_has_pml(vmcs12))
816 		return 0;
817 
818 	if (CC(!nested_cpu_has_ept(vmcs12)) ||
819 	    CC(!page_address_valid(vcpu, vmcs12->pml_address)))
820 		return -EINVAL;
821 
822 	return 0;
823 }
824 
825 static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu,
826 							struct vmcs12 *vmcs12)
827 {
828 	if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) &&
829 	       !nested_cpu_has_ept(vmcs12)))
830 		return -EINVAL;
831 	return 0;
832 }
833 
834 static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu,
835 							 struct vmcs12 *vmcs12)
836 {
837 	if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) &&
838 	       !nested_cpu_has_ept(vmcs12)))
839 		return -EINVAL;
840 	return 0;
841 }
842 
843 static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu,
844 						 struct vmcs12 *vmcs12)
845 {
846 	if (!nested_cpu_has_shadow_vmcs(vmcs12))
847 		return 0;
848 
849 	if (CC(!page_address_valid(vcpu, vmcs12->vmread_bitmap)) ||
850 	    CC(!page_address_valid(vcpu, vmcs12->vmwrite_bitmap)))
851 		return -EINVAL;
852 
853 	return 0;
854 }
855 
856 static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
857 				       struct vmx_msr_entry *e)
858 {
859 	/* x2APIC MSR accesses are not allowed */
860 	if (CC(vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8))
861 		return -EINVAL;
862 	if (CC(e->index == MSR_IA32_UCODE_WRITE) || /* SDM Table 35-2 */
863 	    CC(e->index == MSR_IA32_UCODE_REV))
864 		return -EINVAL;
865 	if (CC(e->reserved != 0))
866 		return -EINVAL;
867 	return 0;
868 }
869 
870 static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu,
871 				     struct vmx_msr_entry *e)
872 {
873 	if (CC(e->index == MSR_FS_BASE) ||
874 	    CC(e->index == MSR_GS_BASE) ||
875 	    CC(e->index == MSR_IA32_SMM_MONITOR_CTL) || /* SMM is not supported */
876 	    nested_vmx_msr_check_common(vcpu, e))
877 		return -EINVAL;
878 	return 0;
879 }
880 
881 static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu,
882 				      struct vmx_msr_entry *e)
883 {
884 	if (CC(e->index == MSR_IA32_SMBASE) || /* SMM is not supported */
885 	    nested_vmx_msr_check_common(vcpu, e))
886 		return -EINVAL;
887 	return 0;
888 }
889 
890 static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu)
891 {
892 	struct vcpu_vmx *vmx = to_vmx(vcpu);
893 	u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
894 				       vmx->nested.msrs.misc_high);
895 
896 	return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER;
897 }
898 
899 /*
900  * Load guest's/host's msr at nested entry/exit.
901  * return 0 for success, entry index for failure.
902  *
903  * One of the failure modes for MSR load/store is when a list exceeds the
904  * virtual hardware's capacity. To maintain compatibility with hardware inasmuch
905  * as possible, process all valid entries before failing rather than precheck
906  * for a capacity violation.
907  */
static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
{
	u32 i;
	struct vmx_msr_entry e;
	u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);

	for (i = 0; i < count; i++) {
		/* Capacity overflow is detected mid-list, like hardware. */
		if (unlikely(i >= max_msr_list_size))
			goto fail;

		if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
					&e, sizeof(e))) {
			pr_debug_ratelimited(
				"%s cannot read MSR entry (%u, 0x%08llx)\n",
				__func__, i, gpa + i * sizeof(e));
			goto fail;
		}
		if (nested_vmx_load_msr_check(vcpu, &e)) {
			pr_debug_ratelimited(
				"%s check failed (%u, 0x%x, 0x%x)\n",
				__func__, i, e.index, e.reserved);
			goto fail;
		}
		if (kvm_set_msr(vcpu, e.index, e.value)) {
			pr_debug_ratelimited(
				"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
				__func__, i, e.index, e.value);
			goto fail;
		}
	}
	return 0;
fail:
	/* Failure is reported as the 1-based index of the bad entry. */
	return i + 1;
}
942 
/*
 * Fetch the value of @msr_index to be stored in the VM-exit MSR-store
 * area.  Returns false if the MSR cannot be read.
 */
static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu,
					    u32 msr_index,
					    u64 *data)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * If the L0 hypervisor stored a more accurate value for the TSC that
	 * does not include the time taken for emulation of the L2->L1
	 * VM-exit in L0, use the more accurate value.
	 */
	if (msr_index == MSR_IA32_TSC) {
		int index = vmx_find_msr_index(&vmx->msr_autostore.guest,
					       MSR_IA32_TSC);

		if (index >= 0) {
			u64 val = vmx->msr_autostore.guest.val[index].value;

			/* Hardware stored an L0 TSC; scale it to L1's view. */
			*data = kvm_read_l1_tsc(vcpu, val);
			return true;
		}
	}

	if (kvm_get_msr(vcpu, msr_index, data)) {
		pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__,
			msr_index);
		return false;
	}
	return true;
}
973 
/*
 * Read entry @i of the guest's VM-exit MSR-store list at @gpa into *e and
 * validate it.  Only the index and reserved words (2 * sizeof(u32)) are
 * read; the value field is left uninitialized.
 */
static bool read_and_check_msr_entry(struct kvm_vcpu *vcpu, u64 gpa, int i,
				     struct vmx_msr_entry *e)
{
	if (kvm_vcpu_read_guest(vcpu,
				gpa + i * sizeof(*e),
				e, 2 * sizeof(u32))) {
		pr_debug_ratelimited(
			"%s cannot read MSR entry (%u, 0x%08llx)\n",
			__func__, i, gpa + i * sizeof(*e));
		return false;
	}
	if (nested_vmx_store_msr_check(vcpu, e)) {
		pr_debug_ratelimited(
			"%s check failed (%u, 0x%x, 0x%x)\n",
			__func__, i, e->index, e->reserved);
		return false;
	}
	return true;
}
993 
/*
 * Emulate the VM-exit MSR-store: for each list entry at @gpa, read the
 * current MSR value and write it into the entry's value field in guest
 * memory.  Returns 0 on success, -EINVAL on any failure.
 */
static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
{
	u64 data;
	u32 i;
	struct vmx_msr_entry e;
	u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);

	for (i = 0; i < count; i++) {
		if (unlikely(i >= max_msr_list_size))
			return -EINVAL;

		if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
			return -EINVAL;

		if (!nested_vmx_get_vmexit_msr_value(vcpu, e.index, &data))
			return -EINVAL;

		/* Only the 64-bit value field of the entry is written back. */
		if (kvm_vcpu_write_guest(vcpu,
					 gpa + i * sizeof(e) +
					     offsetof(struct vmx_msr_entry, value),
					 &data, sizeof(data))) {
			pr_debug_ratelimited(
				"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
				__func__, i, e.index, data);
			return -EINVAL;
		}
	}
	return 0;
}
1023 
1024 static bool nested_msr_store_list_has_msr(struct kvm_vcpu *vcpu, u32 msr_index)
1025 {
1026 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1027 	u32 count = vmcs12->vm_exit_msr_store_count;
1028 	u64 gpa = vmcs12->vm_exit_msr_store_addr;
1029 	struct vmx_msr_entry e;
1030 	u32 i;
1031 
1032 	for (i = 0; i < count; i++) {
1033 		if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
1034 			return false;
1035 
1036 		if (e.index == msr_index)
1037 			return true;
1038 	}
1039 	return false;
1040 }
1041 
/*
 * Keep vmx->msr_autostore.guest (the vmcs02 VM-exit MSR-store list) in
 * sync with the vmcs12 VM-exit MSR-store list for @msr_index: add the MSR
 * if L1 wants it stored on VM-exit, drop it if L1 no longer does.
 */
static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu,
					   u32 msr_index)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmx_msrs *autostore = &vmx->msr_autostore.guest;
	bool in_vmcs12_store_list;
	int msr_autostore_index;
	bool in_autostore_list;
	int last;

	msr_autostore_index = vmx_find_msr_index(autostore, msr_index);
	in_autostore_list = msr_autostore_index >= 0;
	in_vmcs12_store_list = nested_msr_store_list_has_msr(vcpu, msr_index);

	if (in_vmcs12_store_list && !in_autostore_list) {
		if (autostore->nr == NR_LOADSTORE_MSRS) {
			/*
			 * Emulated VMEntry does not fail here.  Instead a less
			 * accurate value will be returned by
			 * nested_vmx_get_vmexit_msr_value() using kvm_get_msr()
			 * instead of reading the value from the vmcs02 VMExit
			 * MSR-store area.
			 */
			pr_warn_ratelimited(
				"Not enough msr entries in msr_autostore.  Can't add msr %x\n",
				msr_index);
			return;
		}
		/* Append the MSR at the tail of the autostore list. */
		last = autostore->nr++;
		autostore->val[last].index = msr_index;
	} else if (!in_vmcs12_store_list && in_autostore_list) {
		/* Swap-remove: overwrite the entry with the last element. */
		last = --autostore->nr;
		autostore->val[msr_autostore_index] = autostore->val[last];
	}
}
1077 
1078 static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val)
1079 {
1080 	unsigned long invalid_mask;
1081 
1082 	invalid_mask = (~0ULL) << cpuid_maxphyaddr(vcpu);
1083 	return (val & invalid_mask) == 0;
1084 }
1085 
1086 /*
1087  * Load guest's/host's cr3 at nested entry/exit. nested_ept is true if we are
1088  * emulating VM entry into a guest with EPT enabled.
1089  * Returns 0 on success, 1 on failure. Invalid state exit qualification code
1090  * is assigned to entry_failure_code on failure.
1091  */
1092 static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept,
1093 			       u32 *entry_failure_code)
1094 {
1095 	if (cr3 != kvm_read_cr3(vcpu) || (!nested_ept && pdptrs_changed(vcpu))) {
1096 		if (CC(!nested_cr3_valid(vcpu, cr3))) {
1097 			*entry_failure_code = ENTRY_FAIL_DEFAULT;
1098 			return -EINVAL;
1099 		}
1100 
1101 		/*
1102 		 * If PAE paging and EPT are both on, CR3 is not used by the CPU and
1103 		 * must not be dereferenced.
1104 		 */
1105 		if (is_pae_paging(vcpu) && !nested_ept) {
1106 			if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))) {
1107 				*entry_failure_code = ENTRY_FAIL_PDPTE;
1108 				return -EINVAL;
1109 			}
1110 		}
1111 	}
1112 
1113 	if (!nested_ept)
1114 		kvm_mmu_new_cr3(vcpu, cr3, false);
1115 
1116 	vcpu->arch.cr3 = cr3;
1117 	kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
1118 
1119 	kvm_init_mmu(vcpu, false);
1120 
1121 	return 0;
1122 }
1123 
1124 /*
1125  * Returns if KVM is able to config CPU to tag TLB entries
1126  * populated by L2 differently than TLB entries populated
1127  * by L1.
1128  *
1129  * If L0 uses EPT, L1 and L2 run with different EPTP because
1130  * guest_mode is part of kvm_mmu_page_role. Thus, TLB entries
1131  * are tagged with different EPTP.
1132  *
1133  * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged
1134  * with different VPID (L1 entries are tagged with vmx->vpid
1135  * while L2 entries are tagged with vmx->nested.vpid02).
1136  */
1137 static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu)
1138 {
1139 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1140 
1141 	return enable_ept ||
1142 	       (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02);
1143 }
1144 
1145 static u16 nested_get_vpid02(struct kvm_vcpu *vcpu)
1146 {
1147 	struct vcpu_vmx *vmx = to_vmx(vcpu);
1148 
1149 	return vmx->nested.vpid02 ? vmx->nested.vpid02 : vmx->vpid;
1150 }
1151 
/*
 * Within the bits selected by @mask, check that every bit set in @subset
 * is also set in @superset.
 */
static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
{
	return !(subset & mask & ~superset);
}
1159 
/*
 * Userspace restore of IA32_VMX_BASIC.  Feature bits may only be cleared
 * relative to KVM's value, reserved bits must stay unchanged, the VMCS
 * revision ID must match exactly, and the advertised VMCS region size may
 * not shrink below what KVM reports.
 */
static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
{
	const u64 feature_and_reserved =
		/* feature (except bit 48; see below) */
		BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) |
		/* reserved */
		BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56);
	u64 vmx_basic = vmx->nested.msrs.basic;

	if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved))
		return -EINVAL;

	/*
	 * KVM does not emulate a version of VMX that constrains physical
	 * addresses of VMX structures (e.g. VMCS) to 32-bits.
	 */
	if (data & BIT_ULL(48))
		return -EINVAL;

	if (vmx_basic_vmcs_revision_id(vmx_basic) !=
	    vmx_basic_vmcs_revision_id(data))
		return -EINVAL;

	/* The restored VMCS size may not be smaller than KVM's. */
	if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data))
		return -EINVAL;

	vmx->nested.msrs.basic = data;
	return 0;
}
1189 
1190 static int
1191 vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
1192 {
1193 	u64 supported;
1194 	u32 *lowp, *highp;
1195 
1196 	switch (msr_index) {
1197 	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
1198 		lowp = &vmx->nested.msrs.pinbased_ctls_low;
1199 		highp = &vmx->nested.msrs.pinbased_ctls_high;
1200 		break;
1201 	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
1202 		lowp = &vmx->nested.msrs.procbased_ctls_low;
1203 		highp = &vmx->nested.msrs.procbased_ctls_high;
1204 		break;
1205 	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
1206 		lowp = &vmx->nested.msrs.exit_ctls_low;
1207 		highp = &vmx->nested.msrs.exit_ctls_high;
1208 		break;
1209 	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
1210 		lowp = &vmx->nested.msrs.entry_ctls_low;
1211 		highp = &vmx->nested.msrs.entry_ctls_high;
1212 		break;
1213 	case MSR_IA32_VMX_PROCBASED_CTLS2:
1214 		lowp = &vmx->nested.msrs.secondary_ctls_low;
1215 		highp = &vmx->nested.msrs.secondary_ctls_high;
1216 		break;
1217 	default:
1218 		BUG();
1219 	}
1220 
1221 	supported = vmx_control_msr(*lowp, *highp);
1222 
1223 	/* Check must-be-1 bits are still 1. */
1224 	if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0)))
1225 		return -EINVAL;
1226 
1227 	/* Check must-be-0 bits are still 0. */
1228 	if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32)))
1229 		return -EINVAL;
1230 
1231 	*lowp = data;
1232 	*highp = data >> 32;
1233 	return 0;
1234 }
1235 
/*
 * Userspace restore of IA32_VMX_MISC.  Feature bits may only be cleared
 * relative to KVM's value and reserved bits must not change; the
 * preemption timer rate (when the preemption timer is exposed) and the
 * MSEG revision ID must match exactly, and the CR3-target and MSR-list
 * limits may not grow.
 */
static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
{
	const u64 feature_and_reserved_bits =
		/* feature */
		BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) |
		BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) |
		/* reserved */
		GENMASK_ULL(13, 9) | BIT_ULL(31);
	u64 vmx_misc;

	vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
				   vmx->nested.msrs.misc_high);

	if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits))
		return -EINVAL;

	/* The timer rate is fixed if the preemption timer is advertised. */
	if ((vmx->nested.msrs.pinbased_ctls_high &
	     PIN_BASED_VMX_PREEMPTION_TIMER) &&
	    vmx_misc_preemption_timer_rate(data) !=
	    vmx_misc_preemption_timer_rate(vmx_misc))
		return -EINVAL;

	if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc))
		return -EINVAL;

	if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc))
		return -EINVAL;

	if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc))
		return -EINVAL;

	vmx->nested.msrs.misc_low = data;
	vmx->nested.msrs.misc_high = data >> 32;

	return 0;
}
1272 
1273 static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
1274 {
1275 	u64 vmx_ept_vpid_cap;
1276 
1277 	vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.msrs.ept_caps,
1278 					   vmx->nested.msrs.vpid_caps);
1279 
1280 	/* Every bit is either reserved or a feature bit. */
1281 	if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL))
1282 		return -EINVAL;
1283 
1284 	vmx->nested.msrs.ept_caps = data;
1285 	vmx->nested.msrs.vpid_caps = data >> 32;
1286 	return 0;
1287 }
1288 
1289 static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
1290 {
1291 	u64 *msr;
1292 
1293 	switch (msr_index) {
1294 	case MSR_IA32_VMX_CR0_FIXED0:
1295 		msr = &vmx->nested.msrs.cr0_fixed0;
1296 		break;
1297 	case MSR_IA32_VMX_CR4_FIXED0:
1298 		msr = &vmx->nested.msrs.cr4_fixed0;
1299 		break;
1300 	default:
1301 		BUG();
1302 	}
1303 
1304 	/*
1305 	 * 1 bits (which indicates bits which "must-be-1" during VMX operation)
1306 	 * must be 1 in the restored value.
1307 	 */
1308 	if (!is_bitwise_subset(data, *msr, -1ULL))
1309 		return -EINVAL;
1310 
1311 	*msr = data;
1312 	return 0;
1313 }
1314 
1315 /*
1316  * Called when userspace is restoring VMX MSRs.
1317  *
1318  * Returns 0 on success, non-0 otherwise.
1319  */
1320 int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1321 {
1322 	struct vcpu_vmx *vmx = to_vmx(vcpu);
1323 
1324 	/*
1325 	 * Don't allow changes to the VMX capability MSRs while the vCPU
1326 	 * is in VMX operation.
1327 	 */
1328 	if (vmx->nested.vmxon)
1329 		return -EBUSY;
1330 
1331 	switch (msr_index) {
1332 	case MSR_IA32_VMX_BASIC:
1333 		return vmx_restore_vmx_basic(vmx, data);
1334 	case MSR_IA32_VMX_PINBASED_CTLS:
1335 	case MSR_IA32_VMX_PROCBASED_CTLS:
1336 	case MSR_IA32_VMX_EXIT_CTLS:
1337 	case MSR_IA32_VMX_ENTRY_CTLS:
1338 		/*
1339 		 * The "non-true" VMX capability MSRs are generated from the
1340 		 * "true" MSRs, so we do not support restoring them directly.
1341 		 *
1342 		 * If userspace wants to emulate VMX_BASIC[55]=0, userspace
1343 		 * should restore the "true" MSRs with the must-be-1 bits
1344 		 * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND
1345 		 * DEFAULT SETTINGS".
1346 		 */
1347 		return -EINVAL;
1348 	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
1349 	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
1350 	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
1351 	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
1352 	case MSR_IA32_VMX_PROCBASED_CTLS2:
1353 		return vmx_restore_control_msr(vmx, msr_index, data);
1354 	case MSR_IA32_VMX_MISC:
1355 		return vmx_restore_vmx_misc(vmx, data);
1356 	case MSR_IA32_VMX_CR0_FIXED0:
1357 	case MSR_IA32_VMX_CR4_FIXED0:
1358 		return vmx_restore_fixed0_msr(vmx, msr_index, data);
1359 	case MSR_IA32_VMX_CR0_FIXED1:
1360 	case MSR_IA32_VMX_CR4_FIXED1:
1361 		/*
1362 		 * These MSRs are generated based on the vCPU's CPUID, so we
1363 		 * do not support restoring them directly.
1364 		 */
1365 		return -EINVAL;
1366 	case MSR_IA32_VMX_EPT_VPID_CAP:
1367 		return vmx_restore_vmx_ept_vpid_cap(vmx, data);
1368 	case MSR_IA32_VMX_VMCS_ENUM:
1369 		vmx->nested.msrs.vmcs_enum = data;
1370 		return 0;
1371 	case MSR_IA32_VMX_VMFUNC:
1372 		if (data & ~vmx->nested.msrs.vmfunc_controls)
1373 			return -EINVAL;
1374 		vmx->nested.msrs.vmfunc_controls = data;
1375 		return 0;
1376 	default:
1377 		/*
1378 		 * The rest of the VMX capability MSRs do not support restore.
1379 		 */
1380 		return -EINVAL;
1381 	}
1382 }
1383 
/*
 * Read a virtual VMX capability MSR from @msrs.
 * Returns 0 on success, non-0 if @msr_index is not a VMX capability MSR.
 */
int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata)
{
	switch (msr_index) {
	case MSR_IA32_VMX_BASIC:
		*pdata = msrs->basic;
		break;
	/*
	 * The "non-true" control MSRs are derived from the "true" variants
	 * by additionally reporting the "always-on" default1 bits as set.
	 */
	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
	case MSR_IA32_VMX_PINBASED_CTLS:
		*pdata = vmx_control_msr(
			msrs->pinbased_ctls_low,
			msrs->pinbased_ctls_high);
		if (msr_index == MSR_IA32_VMX_PINBASED_CTLS)
			*pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
	case MSR_IA32_VMX_PROCBASED_CTLS:
		*pdata = vmx_control_msr(
			msrs->procbased_ctls_low,
			msrs->procbased_ctls_high);
		if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS)
			*pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
	case MSR_IA32_VMX_EXIT_CTLS:
		*pdata = vmx_control_msr(
			msrs->exit_ctls_low,
			msrs->exit_ctls_high);
		if (msr_index == MSR_IA32_VMX_EXIT_CTLS)
			*pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
	case MSR_IA32_VMX_ENTRY_CTLS:
		*pdata = vmx_control_msr(
			msrs->entry_ctls_low,
			msrs->entry_ctls_high);
		if (msr_index == MSR_IA32_VMX_ENTRY_CTLS)
			*pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_MISC:
		*pdata = vmx_control_msr(
			msrs->misc_low,
			msrs->misc_high);
		break;
	case MSR_IA32_VMX_CR0_FIXED0:
		*pdata = msrs->cr0_fixed0;
		break;
	case MSR_IA32_VMX_CR0_FIXED1:
		*pdata = msrs->cr0_fixed1;
		break;
	case MSR_IA32_VMX_CR4_FIXED0:
		*pdata = msrs->cr4_fixed0;
		break;
	case MSR_IA32_VMX_CR4_FIXED1:
		*pdata = msrs->cr4_fixed1;
		break;
	case MSR_IA32_VMX_VMCS_ENUM:
		*pdata = msrs->vmcs_enum;
		break;
	case MSR_IA32_VMX_PROCBASED_CTLS2:
		*pdata = vmx_control_msr(
			msrs->secondary_ctls_low,
			msrs->secondary_ctls_high);
		break;
	case MSR_IA32_VMX_EPT_VPID_CAP:
		/* EPT caps in the low word, VPID caps in the high word. */
		*pdata = msrs->ept_caps |
			((u64)msrs->vpid_caps << 32);
		break;
	case MSR_IA32_VMX_VMFUNC:
		*pdata = msrs->vmfunc_controls;
		break;
	default:
		return 1;
	}

	return 0;
}
1461 
1462 /*
1463  * Copy the writable VMCS shadow fields back to the VMCS12, in case they have
1464  * been modified by the L1 guest.  Note, "writable" in this context means
1465  * "writable by the guest", i.e. tagged SHADOW_FIELD_RW; the set of
1466  * fields tagged SHADOW_FIELD_RO may or may not align with the "read-only"
1467  * VM-exit information fields (which are actually writable if the vCPU is
1468  * configured to support "VMWRITE to any supported field in the VMCS").
1469  */
1470 static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
1471 {
1472 	struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
1473 	struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
1474 	struct shadow_vmcs_field field;
1475 	unsigned long val;
1476 	int i;
1477 
1478 	if (WARN_ON(!shadow_vmcs))
1479 		return;
1480 
1481 	preempt_disable();
1482 
1483 	vmcs_load(shadow_vmcs);
1484 
1485 	for (i = 0; i < max_shadow_read_write_fields; i++) {
1486 		field = shadow_read_write_fields[i];
1487 		val = __vmcs_readl(field.encoding);
1488 		vmcs12_write_any(vmcs12, field.encoding, field.offset, val);
1489 	}
1490 
1491 	vmcs_clear(shadow_vmcs);
1492 	vmcs_load(vmx->loaded_vmcs->vmcs);
1493 
1494 	preempt_enable();
1495 }
1496 
/*
 * Copy the vmcs12 values of all shadowed fields (both the read/write and
 * the read-only tables) into the shadow VMCS.
 */
static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
{
	const struct shadow_vmcs_field *fields[] = {
		shadow_read_write_fields,
		shadow_read_only_fields
	};
	const int max_fields[] = {
		max_shadow_read_write_fields,
		max_shadow_read_only_fields
	};
	struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
	struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
	struct shadow_vmcs_field field;
	unsigned long val;
	int i, q;

	if (WARN_ON(!shadow_vmcs))
		return;

	vmcs_load(shadow_vmcs);

	for (q = 0; q < ARRAY_SIZE(fields); q++) {
		for (i = 0; i < max_fields[q]; i++) {
			field = fields[q][i];
			val = vmcs12_read_any(vmcs12, field.encoding,
					      field.offset);
			__vmcs_writel(field.encoding, val);
		}
	}

	/* Clear the shadow VMCS and restore the previously loaded VMCS. */
	vmcs_clear(shadow_vmcs);
	vmcs_load(vmx->loaded_vmcs->vmcs);
}
1530 
/*
 * Copy fields from the Hyper-V enlightened VMCS into the cached vmcs12.
 * Field groups whose clean bit is set in evmcs->hv_clean_fields are
 * skipped, i.e. assumed unchanged since the last sync.  Always returns 0.
 */
static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx)
{
	struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
	struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;

	/* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */
	vmcs12->tpr_threshold = evmcs->tpr_threshold;
	vmcs12->guest_rip = evmcs->guest_rip;

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) {
		vmcs12->guest_rsp = evmcs->guest_rsp;
		vmcs12->guest_rflags = evmcs->guest_rflags;
		vmcs12->guest_interruptibility_info =
			evmcs->guest_interruptibility_info;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) {
		vmcs12->cpu_based_vm_exec_control =
			evmcs->cpu_based_vm_exec_control;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN))) {
		vmcs12->exception_bitmap = evmcs->exception_bitmap;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) {
		vmcs12->vm_entry_controls = evmcs->vm_entry_controls;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) {
		vmcs12->vm_entry_intr_info_field =
			evmcs->vm_entry_intr_info_field;
		vmcs12->vm_entry_exception_error_code =
			evmcs->vm_entry_exception_error_code;
		vmcs12->vm_entry_instruction_len =
			evmcs->vm_entry_instruction_len;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) {
		vmcs12->host_ia32_pat = evmcs->host_ia32_pat;
		vmcs12->host_ia32_efer = evmcs->host_ia32_efer;
		vmcs12->host_cr0 = evmcs->host_cr0;
		vmcs12->host_cr3 = evmcs->host_cr3;
		vmcs12->host_cr4 = evmcs->host_cr4;
		vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp;
		vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip;
		vmcs12->host_rip = evmcs->host_rip;
		vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs;
		vmcs12->host_es_selector = evmcs->host_es_selector;
		vmcs12->host_cs_selector = evmcs->host_cs_selector;
		vmcs12->host_ss_selector = evmcs->host_ss_selector;
		vmcs12->host_ds_selector = evmcs->host_ds_selector;
		vmcs12->host_fs_selector = evmcs->host_fs_selector;
		vmcs12->host_gs_selector = evmcs->host_gs_selector;
		vmcs12->host_tr_selector = evmcs->host_tr_selector;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1))) {
		vmcs12->pin_based_vm_exec_control =
			evmcs->pin_based_vm_exec_control;
		vmcs12->vm_exit_controls = evmcs->vm_exit_controls;
		vmcs12->secondary_vm_exec_control =
			evmcs->secondary_vm_exec_control;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) {
		vmcs12->io_bitmap_a = evmcs->io_bitmap_a;
		vmcs12->io_bitmap_b = evmcs->io_bitmap_b;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) {
		vmcs12->msr_bitmap = evmcs->msr_bitmap;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) {
		vmcs12->guest_es_base = evmcs->guest_es_base;
		vmcs12->guest_cs_base = evmcs->guest_cs_base;
		vmcs12->guest_ss_base = evmcs->guest_ss_base;
		vmcs12->guest_ds_base = evmcs->guest_ds_base;
		vmcs12->guest_fs_base = evmcs->guest_fs_base;
		vmcs12->guest_gs_base = evmcs->guest_gs_base;
		vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base;
		vmcs12->guest_tr_base = evmcs->guest_tr_base;
		vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base;
		vmcs12->guest_idtr_base = evmcs->guest_idtr_base;
		vmcs12->guest_es_limit = evmcs->guest_es_limit;
		vmcs12->guest_cs_limit = evmcs->guest_cs_limit;
		vmcs12->guest_ss_limit = evmcs->guest_ss_limit;
		vmcs12->guest_ds_limit = evmcs->guest_ds_limit;
		vmcs12->guest_fs_limit = evmcs->guest_fs_limit;
		vmcs12->guest_gs_limit = evmcs->guest_gs_limit;
		vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit;
		vmcs12->guest_tr_limit = evmcs->guest_tr_limit;
		vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit;
		vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit;
		vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes;
		vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes;
		vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes;
		vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes;
		vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes;
		vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes;
		vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes;
		vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes;
		vmcs12->guest_es_selector = evmcs->guest_es_selector;
		vmcs12->guest_cs_selector = evmcs->guest_cs_selector;
		vmcs12->guest_ss_selector = evmcs->guest_ss_selector;
		vmcs12->guest_ds_selector = evmcs->guest_ds_selector;
		vmcs12->guest_fs_selector = evmcs->guest_fs_selector;
		vmcs12->guest_gs_selector = evmcs->guest_gs_selector;
		vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector;
		vmcs12->guest_tr_selector = evmcs->guest_tr_selector;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) {
		vmcs12->tsc_offset = evmcs->tsc_offset;
		vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr;
		vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) {
		vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask;
		vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask;
		vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow;
		vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow;
		vmcs12->guest_cr0 = evmcs->guest_cr0;
		vmcs12->guest_cr3 = evmcs->guest_cr3;
		vmcs12->guest_cr4 = evmcs->guest_cr4;
		vmcs12->guest_dr7 = evmcs->guest_dr7;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) {
		vmcs12->host_fs_base = evmcs->host_fs_base;
		vmcs12->host_gs_base = evmcs->host_gs_base;
		vmcs12->host_tr_base = evmcs->host_tr_base;
		vmcs12->host_gdtr_base = evmcs->host_gdtr_base;
		vmcs12->host_idtr_base = evmcs->host_idtr_base;
		vmcs12->host_rsp = evmcs->host_rsp;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) {
		vmcs12->ept_pointer = evmcs->ept_pointer;
		vmcs12->virtual_processor_id = evmcs->virtual_processor_id;
	}

	if (unlikely(!(evmcs->hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) {
		vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer;
		vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl;
		vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat;
		vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer;
		vmcs12->guest_pdptr0 = evmcs->guest_pdptr0;
		vmcs12->guest_pdptr1 = evmcs->guest_pdptr1;
		vmcs12->guest_pdptr2 = evmcs->guest_pdptr2;
		vmcs12->guest_pdptr3 = evmcs->guest_pdptr3;
		vmcs12->guest_pending_dbg_exceptions =
			evmcs->guest_pending_dbg_exceptions;
		vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp;
		vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip;
		vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs;
		vmcs12->guest_activity_state = evmcs->guest_activity_state;
		vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs;
	}

	/*
	 * Not used?
	 * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr;
	 * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr;
	 * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr;
	 * vmcs12->cr3_target_value0 = evmcs->cr3_target_value0;
	 * vmcs12->cr3_target_value1 = evmcs->cr3_target_value1;
	 * vmcs12->cr3_target_value2 = evmcs->cr3_target_value2;
	 * vmcs12->cr3_target_value3 = evmcs->cr3_target_value3;
	 * vmcs12->page_fault_error_code_mask =
	 *		evmcs->page_fault_error_code_mask;
	 * vmcs12->page_fault_error_code_match =
	 *		evmcs->page_fault_error_code_match;
	 * vmcs12->cr3_target_count = evmcs->cr3_target_count;
	 * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count;
	 * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count;
	 * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count;
	 */

	/*
	 * Read only fields:
	 * vmcs12->guest_physical_address = evmcs->guest_physical_address;
	 * vmcs12->vm_instruction_error = evmcs->vm_instruction_error;
	 * vmcs12->vm_exit_reason = evmcs->vm_exit_reason;
	 * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info;
	 * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code;
	 * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field;
	 * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code;
	 * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len;
	 * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info;
	 * vmcs12->exit_qualification = evmcs->exit_qualification;
	 * vmcs12->guest_linear_address = evmcs->guest_linear_address;
	 *
	 * Not present in struct vmcs12:
	 * vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx;
	 * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi;
	 * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi;
	 * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip;
	 */

	return 0;
}
1750 
1751 static int copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx)
1752 {
1753 	struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
1754 	struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
1755 
1756 	/*
1757 	 * Should not be changed by KVM:
1758 	 *
1759 	 * evmcs->host_es_selector = vmcs12->host_es_selector;
1760 	 * evmcs->host_cs_selector = vmcs12->host_cs_selector;
1761 	 * evmcs->host_ss_selector = vmcs12->host_ss_selector;
1762 	 * evmcs->host_ds_selector = vmcs12->host_ds_selector;
1763 	 * evmcs->host_fs_selector = vmcs12->host_fs_selector;
1764 	 * evmcs->host_gs_selector = vmcs12->host_gs_selector;
1765 	 * evmcs->host_tr_selector = vmcs12->host_tr_selector;
1766 	 * evmcs->host_ia32_pat = vmcs12->host_ia32_pat;
1767 	 * evmcs->host_ia32_efer = vmcs12->host_ia32_efer;
1768 	 * evmcs->host_cr0 = vmcs12->host_cr0;
1769 	 * evmcs->host_cr3 = vmcs12->host_cr3;
1770 	 * evmcs->host_cr4 = vmcs12->host_cr4;
1771 	 * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp;
1772 	 * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip;
1773 	 * evmcs->host_rip = vmcs12->host_rip;
1774 	 * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs;
1775 	 * evmcs->host_fs_base = vmcs12->host_fs_base;
1776 	 * evmcs->host_gs_base = vmcs12->host_gs_base;
1777 	 * evmcs->host_tr_base = vmcs12->host_tr_base;
1778 	 * evmcs->host_gdtr_base = vmcs12->host_gdtr_base;
1779 	 * evmcs->host_idtr_base = vmcs12->host_idtr_base;
1780 	 * evmcs->host_rsp = vmcs12->host_rsp;
1781 	 * sync_vmcs02_to_vmcs12() doesn't read these:
1782 	 * evmcs->io_bitmap_a = vmcs12->io_bitmap_a;
1783 	 * evmcs->io_bitmap_b = vmcs12->io_bitmap_b;
1784 	 * evmcs->msr_bitmap = vmcs12->msr_bitmap;
1785 	 * evmcs->ept_pointer = vmcs12->ept_pointer;
1786 	 * evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap;
1787 	 * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr;
1788 	 * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr;
1789 	 * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr;
1790 	 * evmcs->cr3_target_value0 = vmcs12->cr3_target_value0;
1791 	 * evmcs->cr3_target_value1 = vmcs12->cr3_target_value1;
1792 	 * evmcs->cr3_target_value2 = vmcs12->cr3_target_value2;
1793 	 * evmcs->cr3_target_value3 = vmcs12->cr3_target_value3;
1794 	 * evmcs->tpr_threshold = vmcs12->tpr_threshold;
1795 	 * evmcs->virtual_processor_id = vmcs12->virtual_processor_id;
1796 	 * evmcs->exception_bitmap = vmcs12->exception_bitmap;
1797 	 * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer;
1798 	 * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control;
1799 	 * evmcs->vm_exit_controls = vmcs12->vm_exit_controls;
1800 	 * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control;
1801 	 * evmcs->page_fault_error_code_mask =
1802 	 *		vmcs12->page_fault_error_code_mask;
1803 	 * evmcs->page_fault_error_code_match =
1804 	 *		vmcs12->page_fault_error_code_match;
1805 	 * evmcs->cr3_target_count = vmcs12->cr3_target_count;
1806 	 * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr;
1807 	 * evmcs->tsc_offset = vmcs12->tsc_offset;
1808 	 * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl;
1809 	 * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask;
1810 	 * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask;
1811 	 * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow;
1812 	 * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow;
1813 	 * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count;
1814 	 * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count;
1815 	 * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count;
1816 	 *
1817 	 * Not present in struct vmcs12:
1818 	 * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx;
1819 	 * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi;
1820 	 * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi;
1821 	 * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip;
1822 	 */
1823 
1824 	evmcs->guest_es_selector = vmcs12->guest_es_selector;
1825 	evmcs->guest_cs_selector = vmcs12->guest_cs_selector;
1826 	evmcs->guest_ss_selector = vmcs12->guest_ss_selector;
1827 	evmcs->guest_ds_selector = vmcs12->guest_ds_selector;
1828 	evmcs->guest_fs_selector = vmcs12->guest_fs_selector;
1829 	evmcs->guest_gs_selector = vmcs12->guest_gs_selector;
1830 	evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector;
1831 	evmcs->guest_tr_selector = vmcs12->guest_tr_selector;
1832 
1833 	evmcs->guest_es_limit = vmcs12->guest_es_limit;
1834 	evmcs->guest_cs_limit = vmcs12->guest_cs_limit;
1835 	evmcs->guest_ss_limit = vmcs12->guest_ss_limit;
1836 	evmcs->guest_ds_limit = vmcs12->guest_ds_limit;
1837 	evmcs->guest_fs_limit = vmcs12->guest_fs_limit;
1838 	evmcs->guest_gs_limit = vmcs12->guest_gs_limit;
1839 	evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit;
1840 	evmcs->guest_tr_limit = vmcs12->guest_tr_limit;
1841 	evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit;
1842 	evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit;
1843 
1844 	evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes;
1845 	evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes;
1846 	evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes;
1847 	evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes;
1848 	evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes;
1849 	evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes;
1850 	evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes;
1851 	evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes;
1852 
1853 	evmcs->guest_es_base = vmcs12->guest_es_base;
1854 	evmcs->guest_cs_base = vmcs12->guest_cs_base;
1855 	evmcs->guest_ss_base = vmcs12->guest_ss_base;
1856 	evmcs->guest_ds_base = vmcs12->guest_ds_base;
1857 	evmcs->guest_fs_base = vmcs12->guest_fs_base;
1858 	evmcs->guest_gs_base = vmcs12->guest_gs_base;
1859 	evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base;
1860 	evmcs->guest_tr_base = vmcs12->guest_tr_base;
1861 	evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base;
1862 	evmcs->guest_idtr_base = vmcs12->guest_idtr_base;
1863 
1864 	evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat;
1865 	evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer;
1866 
1867 	evmcs->guest_pdptr0 = vmcs12->guest_pdptr0;
1868 	evmcs->guest_pdptr1 = vmcs12->guest_pdptr1;
1869 	evmcs->guest_pdptr2 = vmcs12->guest_pdptr2;
1870 	evmcs->guest_pdptr3 = vmcs12->guest_pdptr3;
1871 
1872 	evmcs->guest_pending_dbg_exceptions =
1873 		vmcs12->guest_pending_dbg_exceptions;
1874 	evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp;
1875 	evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip;
1876 
1877 	evmcs->guest_activity_state = vmcs12->guest_activity_state;
1878 	evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs;
1879 
1880 	evmcs->guest_cr0 = vmcs12->guest_cr0;
1881 	evmcs->guest_cr3 = vmcs12->guest_cr3;
1882 	evmcs->guest_cr4 = vmcs12->guest_cr4;
1883 	evmcs->guest_dr7 = vmcs12->guest_dr7;
1884 
1885 	evmcs->guest_physical_address = vmcs12->guest_physical_address;
1886 
1887 	evmcs->vm_instruction_error = vmcs12->vm_instruction_error;
1888 	evmcs->vm_exit_reason = vmcs12->vm_exit_reason;
1889 	evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info;
1890 	evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code;
1891 	evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field;
1892 	evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code;
1893 	evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len;
1894 	evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info;
1895 
1896 	evmcs->exit_qualification = vmcs12->exit_qualification;
1897 
1898 	evmcs->guest_linear_address = vmcs12->guest_linear_address;
1899 	evmcs->guest_rsp = vmcs12->guest_rsp;
1900 	evmcs->guest_rflags = vmcs12->guest_rflags;
1901 
1902 	evmcs->guest_interruptibility_info =
1903 		vmcs12->guest_interruptibility_info;
1904 	evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control;
1905 	evmcs->vm_entry_controls = vmcs12->vm_entry_controls;
1906 	evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field;
1907 	evmcs->vm_entry_exception_error_code =
1908 		vmcs12->vm_entry_exception_error_code;
1909 	evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len;
1910 
1911 	evmcs->guest_rip = vmcs12->guest_rip;
1912 
1913 	evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs;
1914 
1915 	return 0;
1916 }
1917 
1918 /*
1919  * This is an equivalent of the nested hypervisor executing the vmptrld
1920  * instruction.
1921  */
1922 static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu,
1923 						 bool from_launch)
1924 {
1925 	struct vcpu_vmx *vmx = to_vmx(vcpu);
1926 	bool evmcs_gpa_changed = false;
1927 	u64 evmcs_gpa;
1928 
1929 	if (likely(!vmx->nested.enlightened_vmcs_enabled))
1930 		return 1;
1931 
1932 	if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa))
1933 		return 1;
1934 
1935 	if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) {
1936 		if (!vmx->nested.hv_evmcs)
1937 			vmx->nested.current_vmptr = -1ull;
1938 
1939 		nested_release_evmcs(vcpu);
1940 
1941 		if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa),
1942 				 &vmx->nested.hv_evmcs_map))
1943 			return 0;
1944 
1945 		vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva;
1946 
1947 		/*
1948 		 * Currently, KVM only supports eVMCS version 1
1949 		 * (== KVM_EVMCS_VERSION) and thus we expect guest to set this
1950 		 * value to first u32 field of eVMCS which should specify eVMCS
1951 		 * VersionNumber.
1952 		 *
1953 		 * Guest should be aware of supported eVMCS versions by host by
1954 		 * examining CPUID.0x4000000A.EAX[0:15]. Host userspace VMM is
1955 		 * expected to set this CPUID leaf according to the value
1956 		 * returned in vmcs_version from nested_enable_evmcs().
1957 		 *
1958 		 * However, it turns out that Microsoft Hyper-V fails to comply
1959 		 * to their own invented interface: When Hyper-V use eVMCS, it
1960 		 * just sets first u32 field of eVMCS to revision_id specified
1961 		 * in MSR_IA32_VMX_BASIC. Instead of used eVMCS version number
1962 		 * which is one of the supported versions specified in
1963 		 * CPUID.0x4000000A.EAX[0:15].
1964 		 *
1965 		 * To overcome Hyper-V bug, we accept here either a supported
1966 		 * eVMCS version or VMCS12 revision_id as valid values for first
1967 		 * u32 field of eVMCS.
1968 		 */
1969 		if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) &&
1970 		    (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) {
1971 			nested_release_evmcs(vcpu);
1972 			return 0;
1973 		}
1974 
1975 		vmx->nested.dirty_vmcs12 = true;
1976 		vmx->nested.hv_evmcs_vmptr = evmcs_gpa;
1977 
1978 		evmcs_gpa_changed = true;
1979 		/*
1980 		 * Unlike normal vmcs12, enlightened vmcs12 is not fully
1981 		 * reloaded from guest's memory (read only fields, fields not
1982 		 * present in struct hv_enlightened_vmcs, ...). Make sure there
1983 		 * are no leftovers.
1984 		 */
1985 		if (from_launch) {
1986 			struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1987 			memset(vmcs12, 0, sizeof(*vmcs12));
1988 			vmcs12->hdr.revision_id = VMCS12_REVISION;
1989 		}
1990 
1991 	}
1992 
1993 	/*
1994 	 * Clean fields data can't de used on VMLAUNCH and when we switch
1995 	 * between different L2 guests as KVM keeps a single VMCS12 per L1.
1996 	 */
1997 	if (from_launch || evmcs_gpa_changed)
1998 		vmx->nested.hv_evmcs->hv_clean_fields &=
1999 			~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
2000 
2001 	return 1;
2002 }
2003 
/*
 * Propagate the cached vmcs12 contents back to the structure L1 observes:
 * the enlightened VMCS when Hyper-V eVMCS is in use, otherwise the shadow
 * VMCS.  Clears the pending need_vmcs12_to_shadow_sync flag.
 */
void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * hv_evmcs may end up being not mapped after migration (when
	 * L2 was running), map it here to make sure vmcs12 changes are
	 * properly reflected.
	 */
	if (vmx->nested.enlightened_vmcs_enabled && !vmx->nested.hv_evmcs)
		nested_vmx_handle_enlightened_vmptrld(vcpu, false);

	if (vmx->nested.hv_evmcs) {
		copy_vmcs12_to_enlightened(vmx);
		/* All fields are clean */
		vmx->nested.hv_evmcs->hv_clean_fields |=
			HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
	} else {
		copy_vmcs12_to_shadow(vmx);
	}

	vmx->nested.need_vmcs12_to_shadow_sync = false;
}
2027 
/*
 * hrtimer callback backing the emulated VMX-preemption timer: record the
 * expiry, then request event processing and kick the vCPU so the pending
 * preemption-timer VM-Exit gets delivered to L1.  One-shot timer.
 */
static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
{
	struct vcpu_vmx *vmx =
		container_of(timer, struct vcpu_vmx, nested.preemption_timer);

	vmx->nested.preemption_timer_expired = true;
	kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu);
	kvm_vcpu_kick(&vmx->vcpu);

	return HRTIMER_NORESTART;
}
2039 
/*
 * Arm the hrtimer emulating the VMX-preemption timer for the nested guest,
 * converting vmcs12's timer value (in preemption-timer units) to
 * nanoseconds based on the vCPU's virtual TSC frequency.
 */
static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu)
{
	u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value;
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * A timer value of zero is architecturally guaranteed to cause
	 * a VMExit prior to executing any instructions in the guest.
	 */
	if (preemption_timeout == 0) {
		vmx_preemption_timer_fn(&vmx->nested.preemption_timer);
		return;
	}

	/* No TSC frequency to convert with; also avoids do_div() by zero. */
	if (vcpu->arch.virtual_tsc_khz == 0)
		return;

	/* Timer units -> TSC ticks (timer ticks at TSC >> rate). */
	preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
	/* TSC ticks -> nanoseconds: ns = ticks * 10^6 / tsc_khz. */
	preemption_timeout *= 1000000;
	do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz);
	hrtimer_start(&vmx->nested.preemption_timer,
		      ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL);
}
2063 
2064 static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
2065 {
2066 	if (vmx->nested.nested_run_pending &&
2067 	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER))
2068 		return vmcs12->guest_ia32_efer;
2069 	else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
2070 		return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME);
2071 	else
2072 		return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME);
2073 }
2074 
/*
 * One-time initialization of vmcs02 fields that never change for the life
 * of the VMCS (from L0's perspective); subsequent calls are no-ops.
 */
static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
{
	/*
	 * If vmcs02 hasn't been initialized, set the constant vmcs02 state
	 * according to L0's settings (vmcs12 is irrelevant here).  Host
	 * fields that come from L0 and are not constant, e.g. HOST_CR3,
	 * will be set as needed prior to VMLAUNCH/VMRESUME.
	 */
	if (vmx->nested.vmcs02_initialized)
		return;
	vmx->nested.vmcs02_initialized = true;

	/*
	 * We don't care what the EPTP value is, we just need to guarantee
	 * it's valid so we don't get a false positive when doing early
	 * consistency checks.
	 */
	if (enable_ept && nested_early_check)
		vmcs_write64(EPT_POINTER, construct_eptp(&vmx->vcpu, 0));

	/* All VMFUNCs are currently emulated through L0 vmexits.  */
	if (cpu_has_vmx_vmfunc())
		vmcs_write64(VM_FUNCTION_CONTROL, 0);

	if (cpu_has_vmx_posted_intr())
		vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR);

	if (cpu_has_vmx_msr_bitmap())
		vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));

	/*
	 * The PML address never changes, so it is constant in vmcs02.
	 * Conceptually we want to copy the PML index from vmcs01 here,
	 * and then back to vmcs01 on nested vmexit.  But since we flush
	 * the log and reset GUEST_PML_INDEX on each vmexit, the PML
	 * index is also effectively constant in vmcs02.
	 */
	if (enable_pml) {
		vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
		vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
	}

	/* Intercept every ENCLS leaf; ENCLS exiting for L2 is emulated. */
	if (cpu_has_vmx_encls_vmexit())
		vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);

	/*
	 * Set the MSR load/store lists to match L0's settings.  Only the
	 * addresses are constant (for vmcs02), the counts can change based
	 * on L2's behavior, e.g. switching to/from long mode.
	 */
	vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.guest.val));
	vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
	vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));

	vmx_set_constant_host_state(vmx);
}
2131 
/*
 * Early vmcs02 setup for fields that rarely change, i.e. only when vmcs12
 * is dirty or an eVMCS is in use (see callers).  Also triggers the
 * one-time constant-state initialization.
 */
static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx,
				      struct vmcs12 *vmcs12)
{
	prepare_vmcs02_constant_state(vmx);

	/* VMCS shadowing is emulated; never link a real shadow VMCS. */
	vmcs_write64(VMCS_LINK_POINTER, -1ull);

	/*
	 * Run L2 with vpid02 when L1 uses VPID, falling back to L0's vpid
	 * otherwise (or when no vpid02 was allocated).
	 */
	if (enable_vpid) {
		if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02)
			vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02);
		else
			vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
	}
}
2146 
/*
 * Merge L0's and vmcs12's VMX control fields into vmcs02 ahead of the
 * actual VM-Enter: pin-based, primary/secondary processor-based, entry and
 * exit controls, plus event-injection fields when a nested run is pending.
 */
static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
{
	u32 exec_control, vmcs12_exec_ctrl;
	u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12);

	if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs)
		prepare_vmcs02_early_rare(vmx, vmcs12);

	/*
	 * PIN CONTROLS
	 *
	 * L0's desires ORed with vmcs12's, except the preemption timer,
	 * which KVM emulates via hrtimer (see vmx_start_preemption_timer).
	 */
	exec_control = vmx_pin_based_exec_ctrl(vmx);
	exec_control |= (vmcs12->pin_based_vm_exec_control &
			 ~PIN_BASED_VMX_PREEMPTION_TIMER);

	/* Posted interrupts setting is only taken from vmcs12.  */
	if (nested_cpu_has_posted_intr(vmcs12)) {
		vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
		vmx->nested.pi_pending = false;
	} else {
		exec_control &= ~PIN_BASED_POSTED_INTR;
	}
	pin_controls_set(vmx, exec_control);

	/*
	 * EXEC CONTROLS
	 */
	exec_control = vmx_exec_control(vmx); /* L0's desires */
	exec_control &= ~CPU_BASED_INTR_WINDOW_EXITING;
	exec_control &= ~CPU_BASED_NMI_WINDOW_EXITING;
	exec_control &= ~CPU_BASED_TPR_SHADOW;
	exec_control |= vmcs12->cpu_based_vm_exec_control;

	/* -1 = no L1 TPR_THRESHOLD to restore on nested VM-Exit. */
	vmx->nested.l1_tpr_threshold = -1;
	if (exec_control & CPU_BASED_TPR_SHADOW)
		vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
#ifdef CONFIG_X86_64
	else
		exec_control |= CPU_BASED_CR8_LOAD_EXITING |
				CPU_BASED_CR8_STORE_EXITING;
#endif

	/*
	 * A vmexit (to either L1 hypervisor or L0 userspace) is always needed
	 * for I/O port accesses.
	 */
	exec_control |= CPU_BASED_UNCOND_IO_EXITING;
	exec_control &= ~CPU_BASED_USE_IO_BITMAPS;

	/*
	 * This bit will be computed in nested_get_vmcs12_pages, because
	 * we do not have access to L1's MSR bitmap yet.  For now, keep
	 * the same bit as before, hoping to avoid multiple VMWRITEs that
	 * only set/clear this bit.
	 */
	exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
	exec_control |= exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS;

	exec_controls_set(vmx, exec_control);

	/*
	 * SECONDARY EXEC CONTROLS
	 */
	if (cpu_has_secondary_exec_ctrls()) {
		exec_control = vmx->secondary_exec_control;

		/* Take the following fields only from vmcs12 */
		exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
				  SECONDARY_EXEC_ENABLE_INVPCID |
				  SECONDARY_EXEC_RDTSCP |
				  SECONDARY_EXEC_XSAVES |
				  SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE |
				  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
				  SECONDARY_EXEC_APIC_REGISTER_VIRT |
				  SECONDARY_EXEC_ENABLE_VMFUNC);
		if (nested_cpu_has(vmcs12,
				   CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) {
			/* PML for L2 is emulated, don't enable it in hardware. */
			vmcs12_exec_ctrl = vmcs12->secondary_vm_exec_control &
				~SECONDARY_EXEC_ENABLE_PML;
			exec_control |= vmcs12_exec_ctrl;
		}

		/* VMCS shadowing for L2 is emulated for now */
		exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;

		/*
		 * Preset *DT exiting when emulating UMIP, so that vmx_set_cr4()
		 * will not have to rewrite the controls just for this bit.
		 */
		if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated() &&
		    (vmcs12->guest_cr4 & X86_CR4_UMIP))
			exec_control |= SECONDARY_EXEC_DESC;

		if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
			vmcs_write16(GUEST_INTR_STATUS,
				vmcs12->guest_intr_status);

		secondary_exec_controls_set(vmx, exec_control);
	}

	/*
	 * ENTRY CONTROLS
	 *
	 * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE
	 * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate
	 * on the related bits (if supported by the CPU) in the hope that
	 * we can avoid VMWrites during vmx_set_efer().
	 */
	exec_control = (vmcs12->vm_entry_controls | vmx_vmentry_ctrl()) &
			~VM_ENTRY_IA32E_MODE & ~VM_ENTRY_LOAD_IA32_EFER;
	if (cpu_has_load_ia32_efer()) {
		if (guest_efer & EFER_LMA)
			exec_control |= VM_ENTRY_IA32E_MODE;
		if (guest_efer != host_efer)
			exec_control |= VM_ENTRY_LOAD_IA32_EFER;
	}
	vm_entry_controls_set(vmx, exec_control);

	/*
	 * EXIT CONTROLS
	 *
	 * L2->L1 exit controls are emulated - the hardware exit is to L0 so
	 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
	 * bits may be modified by vmx_set_efer() in prepare_vmcs02().
	 */
	exec_control = vmx_vmexit_ctrl();
	if (cpu_has_load_ia32_efer() && guest_efer != host_efer)
		exec_control |= VM_EXIT_LOAD_IA32_EFER;
	vm_exit_controls_set(vmx, exec_control);

	/*
	 * Interrupt/Exception Fields
	 *
	 * Injection info is taken from vmcs12 only on an actual VM-Enter
	 * from L1; otherwise make sure nothing stale gets injected.
	 */
	if (vmx->nested.nested_run_pending) {
		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
			     vmcs12->vm_entry_intr_info_field);
		vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
			     vmcs12->vm_entry_exception_error_code);
		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
			     vmcs12->vm_entry_instruction_len);
		vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
			     vmcs12->guest_interruptibility_info);
		vmx->loaded_vmcs->nmi_known_unmasked =
			!(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI);
	} else {
		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
	}
}
2295 
/*
 * Write the rarely-changing vmcs12-derived guest state into vmcs02.
 * When an enlightened VMCS is in use, field groups marked clean by L1
 * (hv_clean_fields) are skipped since they cannot have changed.
 */
static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
{
	struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs;

	/* Segment/descriptor-table state (eVMCS guest group 2). */
	if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
			   HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) {
		vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
		vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
		vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
		vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector);
		vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector);
		vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector);
		vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector);
		vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector);
		vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit);
		vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit);
		vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit);
		vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit);
		vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit);
		vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit);
		vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit);
		vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit);
		vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit);
		vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit);
		vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
		vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
		vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes);
		vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes);
		vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes);
		vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes);
		vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes);
		vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes);
		vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
		vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
		vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base);
		vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base);
		vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base);
		vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base);
		vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base);
		vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base);
		vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
		vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
	}

	/* SYSENTER/debug/PDPTR/BNDCFGS state (eVMCS guest group 1). */
	if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
			   HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) {
		vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
		vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
			    vmcs12->guest_pending_dbg_exceptions);
		vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
		vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);

		/*
		 * L1 may access the L2's PDPTR, so save them to construct
		 * vmcs12
		 */
		if (enable_ept) {
			vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
			vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
			vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
			vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
		}

		if (kvm_mpx_supported() && vmx->nested.nested_run_pending &&
		    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
			vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
	}

	if (nested_cpu_has_xsaves(vmcs12))
		vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap);

	/*
	 * Whether page-faults are trapped is determined by a combination of
	 * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.
	 * If enable_ept, L0 doesn't care about page faults and we should
	 * set all of these to L1's desires. However, if !enable_ept, L0 does
	 * care about (at least some) page faults, and because it is not easy
	 * (if at all possible?) to merge L0 and L1's desires, we simply ask
	 * to exit on each and every L2 page fault. This is done by setting
	 * MASK=MATCH=0 and (see below) EB.PF=1.
	 * Note that below we don't need special code to set EB.PF beyond the
	 * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
	 * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
	 * !enable_ept, EB.PF is 1, so the "or" will always be 1.
	 */
	vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
		enable_ept ? vmcs12->page_fault_error_code_mask : 0);
	vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
		enable_ept ? vmcs12->page_fault_error_code_match : 0);

	if (cpu_has_vmx_apicv()) {
		vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0);
		vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1);
		vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2);
		vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3);
	}

	/*
	 * Make sure the msr_autostore list is up to date before we set the
	 * count in the vmcs02.
	 */
	prepare_vmx_msr_autostore_list(&vmx->vcpu, MSR_IA32_TSC);

	vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.guest.nr);
	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);

	set_cr4_guest_host_mask(vmx);
}
2405 
2406 /*
2407  * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
2408  * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
2409  * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2
2410  * guest in a way that will both be appropriate to L1's requests, and our
2411  * needs. In addition to modifying the active vmcs (which is vmcs02), this
2412  * function also has additional necessary side-effects, like setting various
2413  * vcpu->arch fields.
2414  * Returns 0 on success, 1 on failure. Invalid state exit qualification code
2415  * is assigned to entry_failure_code on failure.
2416  */
2417 static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
2418 			  u32 *entry_failure_code)
2419 {
2420 	struct vcpu_vmx *vmx = to_vmx(vcpu);
2421 	struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs;
2422 	bool load_guest_pdptrs_vmcs12 = false;
2423 
2424 	if (vmx->nested.dirty_vmcs12 || hv_evmcs) {
2425 		prepare_vmcs02_rare(vmx, vmcs12);
2426 		vmx->nested.dirty_vmcs12 = false;
2427 
2428 		load_guest_pdptrs_vmcs12 = !hv_evmcs ||
2429 			!(hv_evmcs->hv_clean_fields &
2430 			  HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
2431 	}
2432 
2433 	if (vmx->nested.nested_run_pending &&
2434 	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
2435 		kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
2436 		vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
2437 	} else {
2438 		kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
2439 		vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl);
2440 	}
2441 	if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
2442 	    !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
2443 		vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs);
2444 	vmx_set_rflags(vcpu, vmcs12->guest_rflags);
2445 
2446 	/* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
2447 	 * bitwise-or of what L1 wants to trap for L2, and what we want to
2448 	 * trap. Note that CR0.TS also needs updating - we do this later.
2449 	 */
2450 	update_exception_bitmap(vcpu);
2451 	vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
2452 	vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
2453 
2454 	if (vmx->nested.nested_run_pending &&
2455 	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) {
2456 		vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
2457 		vcpu->arch.pat = vmcs12->guest_ia32_pat;
2458 	} else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
2459 		vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
2460 	}
2461 
2462 	vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
2463 
2464 	if (kvm_has_tsc_control)
2465 		decache_tsc_multiplier(vmx);
2466 
2467 	if (enable_vpid) {
2468 		/*
2469 		 * There is no direct mapping between vpid02 and vpid12, the
2470 		 * vpid02 is per-vCPU for L0 and reused while the value of
2471 		 * vpid12 is changed w/ one invvpid during nested vmentry.
2472 		 * The vpid12 is allocated by L1 for L2, so it will not
2473 		 * influence global bitmap(for vpid01 and vpid02 allocation)
2474 		 * even if spawn a lot of nested vCPUs.
2475 		 */
2476 		if (nested_cpu_has_vpid(vmcs12) && nested_has_guest_tlb_tag(vcpu)) {
2477 			if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
2478 				vmx->nested.last_vpid = vmcs12->virtual_processor_id;
2479 				__vmx_flush_tlb(vcpu, nested_get_vpid02(vcpu), false);
2480 			}
2481 		} else {
2482 			/*
2483 			 * If L1 use EPT, then L0 needs to execute INVEPT on
2484 			 * EPTP02 instead of EPTP01. Therefore, delay TLB
2485 			 * flush until vmcs02->eptp is fully updated by
2486 			 * KVM_REQ_LOAD_CR3. Note that this assumes
2487 			 * KVM_REQ_TLB_FLUSH is evaluated after
2488 			 * KVM_REQ_LOAD_CR3 in vcpu_enter_guest().
2489 			 */
2490 			kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
2491 		}
2492 	}
2493 
2494 	if (nested_cpu_has_ept(vmcs12))
2495 		nested_ept_init_mmu_context(vcpu);
2496 
2497 	/*
2498 	 * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those
2499 	 * bits which we consider mandatory enabled.
2500 	 * The CR0_READ_SHADOW is what L2 should have expected to read given
2501 	 * the specifications by L1; It's not enough to take
2502 	 * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we we
2503 	 * have more bits than L1 expected.
2504 	 */
2505 	vmx_set_cr0(vcpu, vmcs12->guest_cr0);
2506 	vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
2507 
2508 	vmx_set_cr4(vcpu, vmcs12->guest_cr4);
2509 	vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));
2510 
2511 	vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12);
2512 	/* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
2513 	vmx_set_efer(vcpu, vcpu->arch.efer);
2514 
2515 	/*
2516 	 * Guest state is invalid and unrestricted guest is disabled,
2517 	 * which means L1 attempted VMEntry to L2 with invalid state.
2518 	 * Fail the VMEntry.
2519 	 */
2520 	if (vmx->emulation_required) {
2521 		*entry_failure_code = ENTRY_FAIL_DEFAULT;
2522 		return -EINVAL;
2523 	}
2524 
2525 	/* Shadow page tables on either EPT or shadow page tables. */
2526 	if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12),
2527 				entry_failure_code))
2528 		return -EINVAL;
2529 
2530 	/*
2531 	 * Immediately write vmcs02.GUEST_CR3.  It will be propagated to vmcs12
2532 	 * on nested VM-Exit, which can occur without actually running L2 and
2533 	 * thus without hitting vmx_set_cr3(), e.g. if L1 is entering L2 with
2534 	 * vmcs12.GUEST_ACTIVITYSTATE=HLT, in which case KVM will intercept the
2535 	 * transition to HLT instead of running L2.
2536 	 */
2537 	if (enable_ept)
2538 		vmcs_writel(GUEST_CR3, vmcs12->guest_cr3);
2539 
2540 	/* Late preparation of GUEST_PDPTRs now that EFER and CRs are set. */
2541 	if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) &&
2542 	    is_pae_paging(vcpu)) {
2543 		vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
2544 		vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
2545 		vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
2546 		vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
2547 	}
2548 
2549 	if (!enable_ept)
2550 		vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested;
2551 
2552 	if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
2553 	    SET_MSR_OR_WARN(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
2554 			    vmcs12->guest_ia32_perf_global_ctrl))
2555 		return -EINVAL;
2556 
2557 	kvm_rsp_write(vcpu, vmcs12->guest_rsp);
2558 	kvm_rip_write(vcpu, vmcs12->guest_rip);
2559 	return 0;
2560 }
2561 
/*
 * SDM consistency checks on the NMI-related controls: virtual NMIs
 * require NMI exiting, and NMI-window exiting requires virtual NMIs.
 */
static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12)
{
	if (CC(!nested_cpu_has_nmi_exiting(vmcs12) &&
	       nested_cpu_has_virtual_nmis(vmcs12)))
		return -EINVAL;

	if (CC(!nested_cpu_has_virtual_nmis(vmcs12) &&
	       nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING)))
		return -EINVAL;

	return 0;
}
2574 
/*
 * Validate a candidate EPT pointer (EPTP) for L1 against the EPT
 * capabilities exposed to it: memory type, page-walk length, reserved
 * bits, and accessed/dirty flag support.
 */
static bool valid_ept_address(struct kvm_vcpu *vcpu, u64 address)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	int maxphyaddr = cpuid_maxphyaddr(vcpu);

	/* Check for memory type validity */
	switch (address & VMX_EPTP_MT_MASK) {
	case VMX_EPTP_MT_UC:
		if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT)))
			return false;
		break;
	case VMX_EPTP_MT_WB:
		if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT)))
			return false;
		break;
	default:
		return false;
	}

	/* only 4 levels page-walk length are valid */
	if (CC((address & VMX_EPTP_PWL_MASK) != VMX_EPTP_PWL_4))
		return false;

	/* Reserved bits should not be set */
	if (CC(address >> maxphyaddr || ((address >> 7) & 0x1f)))
		return false;

	/* AD, if set, should be supported */
	if (address & VMX_EPTP_AD_ENABLE_BIT) {
		if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT)))
			return false;
	}

	return true;
}
2610 
2611 /*
2612  * Checks related to VM-Execution Control Fields
2613  */
2614 static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu,
2615                                               struct vmcs12 *vmcs12)
2616 {
2617 	struct vcpu_vmx *vmx = to_vmx(vcpu);
2618 
2619 	if (CC(!vmx_control_verify(vmcs12->pin_based_vm_exec_control,
2620 				   vmx->nested.msrs.pinbased_ctls_low,
2621 				   vmx->nested.msrs.pinbased_ctls_high)) ||
2622 	    CC(!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
2623 				   vmx->nested.msrs.procbased_ctls_low,
2624 				   vmx->nested.msrs.procbased_ctls_high)))
2625 		return -EINVAL;
2626 
2627 	if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
2628 	    CC(!vmx_control_verify(vmcs12->secondary_vm_exec_control,
2629 				   vmx->nested.msrs.secondary_ctls_low,
2630 				   vmx->nested.msrs.secondary_ctls_high)))
2631 		return -EINVAL;
2632 
2633 	if (CC(vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) ||
2634 	    nested_vmx_check_io_bitmap_controls(vcpu, vmcs12) ||
2635 	    nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12) ||
2636 	    nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12) ||
2637 	    nested_vmx_check_apic_access_controls(vcpu, vmcs12) ||
2638 	    nested_vmx_check_apicv_controls(vcpu, vmcs12) ||
2639 	    nested_vmx_check_nmi_controls(vmcs12) ||
2640 	    nested_vmx_check_pml_controls(vcpu, vmcs12) ||
2641 	    nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12) ||
2642 	    nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12) ||
2643 	    nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12) ||
2644 	    CC(nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id))
2645 		return -EINVAL;
2646 
2647 	if (!nested_cpu_has_preemption_timer(vmcs12) &&
2648 	    nested_cpu_has_save_preemption_timer(vmcs12))
2649 		return -EINVAL;
2650 
2651 	if (nested_cpu_has_ept(vmcs12) &&
2652 	    CC(!valid_ept_address(vcpu, vmcs12->ept_pointer)))
2653 		return -EINVAL;
2654 
2655 	if (nested_cpu_has_vmfunc(vmcs12)) {
2656 		if (CC(vmcs12->vm_function_control &
2657 		       ~vmx->nested.msrs.vmfunc_controls))
2658 			return -EINVAL;
2659 
2660 		if (nested_cpu_has_eptp_switching(vmcs12)) {
2661 			if (CC(!nested_cpu_has_ept(vmcs12)) ||
2662 			    CC(!page_address_valid(vcpu, vmcs12->eptp_list_address)))
2663 				return -EINVAL;
2664 		}
2665 	}
2666 
2667 	return 0;
2668 }
2669 
2670 /*
2671  * Checks related to VM-Exit Control Fields
2672  */
2673 static int nested_check_vm_exit_controls(struct kvm_vcpu *vcpu,
2674                                          struct vmcs12 *vmcs12)
2675 {
2676 	struct vcpu_vmx *vmx = to_vmx(vcpu);
2677 
2678 	if (CC(!vmx_control_verify(vmcs12->vm_exit_controls,
2679 				    vmx->nested.msrs.exit_ctls_low,
2680 				    vmx->nested.msrs.exit_ctls_high)) ||
2681 	    CC(nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12)))
2682 		return -EINVAL;
2683 
2684 	return 0;
2685 }
2686 
2687 /*
2688  * Checks related to VM-Entry Control Fields
2689  */
2690 static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu,
2691 					  struct vmcs12 *vmcs12)
2692 {
2693 	struct vcpu_vmx *vmx = to_vmx(vcpu);
2694 
2695 	if (CC(!vmx_control_verify(vmcs12->vm_entry_controls,
2696 				    vmx->nested.msrs.entry_ctls_low,
2697 				    vmx->nested.msrs.entry_ctls_high)))
2698 		return -EINVAL;
2699 
2700 	/*
2701 	 * From the Intel SDM, volume 3:
2702 	 * Fields relevant to VM-entry event injection must be set properly.
2703 	 * These fields are the VM-entry interruption-information field, the
2704 	 * VM-entry exception error code, and the VM-entry instruction length.
2705 	 */
2706 	if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) {
2707 		u32 intr_info = vmcs12->vm_entry_intr_info_field;
2708 		u8 vector = intr_info & INTR_INFO_VECTOR_MASK;
2709 		u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK;
2710 		bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK;
2711 		bool should_have_error_code;
2712 		bool urg = nested_cpu_has2(vmcs12,
2713 					   SECONDARY_EXEC_UNRESTRICTED_GUEST);
2714 		bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE;
2715 
2716 		/* VM-entry interruption-info field: interruption type */
2717 		if (CC(intr_type == INTR_TYPE_RESERVED) ||
2718 		    CC(intr_type == INTR_TYPE_OTHER_EVENT &&
2719 		       !nested_cpu_supports_monitor_trap_flag(vcpu)))
2720 			return -EINVAL;
2721 
2722 		/* VM-entry interruption-info field: vector */
2723 		if (CC(intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) ||
2724 		    CC(intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) ||
2725 		    CC(intr_type == INTR_TYPE_OTHER_EVENT && vector != 0))
2726 			return -EINVAL;
2727 
2728 		/* VM-entry interruption-info field: deliver error code */
2729 		should_have_error_code =
2730 			intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode &&
2731 			x86_exception_has_error_code(vector);
2732 		if (CC(has_error_code != should_have_error_code))
2733 			return -EINVAL;
2734 
2735 		/* VM-entry exception error code */
2736 		if (CC(has_error_code &&
2737 		       vmcs12->vm_entry_exception_error_code & GENMASK(31, 16)))
2738 			return -EINVAL;
2739 
2740 		/* VM-entry interruption-info field: reserved bits */
2741 		if (CC(intr_info & INTR_INFO_RESVD_BITS_MASK))
2742 			return -EINVAL;
2743 
2744 		/* VM-entry instruction length */
2745 		switch (intr_type) {
2746 		case INTR_TYPE_SOFT_EXCEPTION:
2747 		case INTR_TYPE_SOFT_INTR:
2748 		case INTR_TYPE_PRIV_SW_EXCEPTION:
2749 			if (CC(vmcs12->vm_entry_instruction_len > 15) ||
2750 			    CC(vmcs12->vm_entry_instruction_len == 0 &&
2751 			    CC(!nested_cpu_has_zero_length_injection(vcpu))))
2752 				return -EINVAL;
2753 		}
2754 	}
2755 
2756 	if (nested_vmx_check_entry_msr_switch_controls(vcpu, vmcs12))
2757 		return -EINVAL;
2758 
2759 	return 0;
2760 }
2761 
2762 static int nested_vmx_check_controls(struct kvm_vcpu *vcpu,
2763 				     struct vmcs12 *vmcs12)
2764 {
2765 	if (nested_check_vm_execution_controls(vcpu, vmcs12) ||
2766 	    nested_check_vm_exit_controls(vcpu, vmcs12) ||
2767 	    nested_check_vm_entry_controls(vcpu, vmcs12))
2768 		return -EINVAL;
2769 
2770 	return 0;
2771 }
2772 
/*
 * Validate the host-state area of vmcs12 prior to nested VM-entry.  A
 * non-zero return maps to a VMFail with
 * VMXERR_ENTRY_INVALID_HOST_STATE_FIELD in the caller.
 */
static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
				       struct vmcs12 *vmcs12)
{
	bool ia32e;

	/* Host CR0/CR4/CR3 must be values KVM can load on VM-exit to L1. */
	if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) ||
	    CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) ||
	    CC(!nested_cr3_valid(vcpu, vmcs12->host_cr3)))
		return -EINVAL;

	if (CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu)) ||
	    CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu)))
		return -EINVAL;

	/* Host PAT is only checked if it is loaded on VM-exit. */
	if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) &&
	    CC(!kvm_pat_valid(vmcs12->host_ia32_pat)))
		return -EINVAL;

	/* Likewise for IA32_PERF_GLOBAL_CTRL. */
	if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) &&
	    CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu),
					   vmcs12->host_ia32_perf_global_ctrl)))
		return -EINVAL;

	/*
	 * NOTE(review): ia32e is derived from L1's current EFER.LMA, whereas
	 * the SDM specifies these checks against the "host address-space
	 * size" VM-exit control; the two should agree for a well-formed
	 * vmcs12 — confirm.
	 */
#ifdef CONFIG_X86_64
	ia32e = !!(vcpu->arch.efer & EFER_LMA);
#else
	ia32e = false;
#endif

	if (ia32e) {
		/* 64-bit host: the exit control and CR4.PAE must be set. */
		if (CC(!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)) ||
		    CC(!(vmcs12->host_cr4 & X86_CR4_PAE)))
			return -EINVAL;
	} else {
		/* 32-bit host: no 64-bit-only controls, CR4 bits or RIP. */
		if (CC(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) ||
		    CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) ||
		    CC(vmcs12->host_cr4 & X86_CR4_PCIDE) ||
		    CC((vmcs12->host_rip) >> 32))
			return -EINVAL;
	}

	/*
	 * Host selector fields: RPL and TI must be zero in all of them;
	 * CS and TR cannot be NULL, nor can SS in 64-bit mode.
	 */
	if (CC(vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
	    CC(vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
	    CC(vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
	    CC(vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
	    CC(vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
	    CC(vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
	    CC(vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
	    CC(vmcs12->host_cs_selector == 0) ||
	    CC(vmcs12->host_tr_selector == 0) ||
	    CC(vmcs12->host_ss_selector == 0 && !ia32e))
		return -EINVAL;

	/* Host base addresses and RIP must be canonical on 64-bit hosts. */
#ifdef CONFIG_X86_64
	if (CC(is_noncanonical_address(vmcs12->host_fs_base, vcpu)) ||
	    CC(is_noncanonical_address(vmcs12->host_gs_base, vcpu)) ||
	    CC(is_noncanonical_address(vmcs12->host_gdtr_base, vcpu)) ||
	    CC(is_noncanonical_address(vmcs12->host_idtr_base, vcpu)) ||
	    CC(is_noncanonical_address(vmcs12->host_tr_base, vcpu)) ||
	    CC(is_noncanonical_address(vmcs12->host_rip, vcpu)))
		return -EINVAL;
#endif

	/*
	 * If the load IA32_EFER VM-exit control is 1, bits reserved in the
	 * IA32_EFER MSR must be 0 in the field for that register. In addition,
	 * the values of the LMA and LME bits in the field must each be that of
	 * the host address-space size VM-exit control.
	 */
	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) {
		if (CC(!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer)) ||
		    CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA)) ||
		    CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)))
			return -EINVAL;
	}

	return 0;
}
2851 
/*
 * Validate the vmcs12 VMCS link pointer, i.e. the shadow VMCS used when
 * "VMCS shadowing" is enabled for L2.  Returns 0 on success, -EINVAL if
 * the pointer or the referenced VMCS header is invalid.
 */
static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu,
					  struct vmcs12 *vmcs12)
{
	int r = 0;
	struct vmcs12 *shadow;
	struct kvm_host_map map;

	/* -1 is the architectural "no VMCS link pointer"; nothing to check. */
	if (vmcs12->vmcs_link_pointer == -1ull)
		return 0;

	/* The pointer must be a page-aligned, in-range guest address. */
	if (CC(!page_address_valid(vcpu, vmcs12->vmcs_link_pointer)))
		return -EINVAL;

	/* Map the referenced guest page to inspect its VMCS header. */
	if (CC(kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map)))
		return -EINVAL;

	shadow = map.hva;

	/*
	 * The linked VMCS must carry the vmcs12 revision ID, and its
	 * shadow-VMCS indicator must match the "VMCS shadowing" control.
	 */
	if (CC(shadow->hdr.revision_id != VMCS12_REVISION) ||
	    CC(shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12)))
		r = -EINVAL;

	/* Read-only access, so the page is not marked dirty. */
	kvm_vcpu_unmap(vcpu, &map, false);
	return r;
}
2877 
2878 /*
2879  * Checks related to Guest Non-register State
2880  */
2881 static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12)
2882 {
2883 	if (CC(vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE &&
2884 	       vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT))
2885 		return -EINVAL;
2886 
2887 	return 0;
2888 }
2889 
/*
 * Validate the guest-state area of vmcs12.  Unlike control/host-state
 * failures, a bad guest state causes a VM-entry failure VM-exit to L1;
 * *exit_qual is set to the appropriate exit qualification for that exit.
 */
static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
					struct vmcs12 *vmcs12,
					u32 *exit_qual)
{
	bool ia32e;

	/* Default exit qualification unless a more specific one applies. */
	*exit_qual = ENTRY_FAIL_DEFAULT;

	if (CC(!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0)) ||
	    CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)))
		return -EINVAL;

	/* Guest PAT is only checked if it is loaded on VM-entry. */
	if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) &&
	    CC(!kvm_pat_valid(vmcs12->guest_ia32_pat)))
		return -EINVAL;

	/* A bad VMCS link pointer reports its own exit qualification. */
	if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) {
		*exit_qual = ENTRY_FAIL_VMCS_LINK_PTR;
		return -EINVAL;
	}

	/* Likewise, guest IA32_PERF_GLOBAL_CTRL is checked only if loaded. */
	if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
	    CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu),
					   vmcs12->guest_ia32_perf_global_ctrl)))
		return -EINVAL;

	/*
	 * If the load IA32_EFER VM-entry control is 1, the following checks
	 * are performed on the field for the IA32_EFER MSR:
	 * - Bits reserved in the IA32_EFER MSR must be 0.
	 * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of
	 *   the IA-32e mode guest VM-exit control. It must also be identical
	 *   to bit 8 (LME) if bit 31 in the CR0 field (corresponding to
	 *   CR0.PG) is 1.
	 *
	 * Only performed for an actual pending VM-entry, i.e. skipped when
	 * this is reached via nested state restore.
	 */
	if (to_vmx(vcpu)->nested.nested_run_pending &&
	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) {
		ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0;
		if (CC(!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer)) ||
		    CC(ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA)) ||
		    CC(((vmcs12->guest_cr0 & X86_CR0_PG) &&
		     ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))))
			return -EINVAL;
	}

	/* BNDCFGS base must be canonical and its reserved bits clear. */
	if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) &&
	    (CC(is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) ||
	     CC((vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD))))
		return -EINVAL;

	if (nested_check_guest_non_reg_state(vmcs12))
		return -EINVAL;

	return 0;
}
2945 
/*
 * Optionally perform an "early" hardware consistency check of vmcs02 by
 * executing a throwaway VMLAUNCH/VMRESUME that is guaranteed to fail
 * (GUEST_RFLAGS is made illegal).  Gated by the nested_early_check module
 * param.  Returns 0 if the hardware checks pass (or are disabled), 1 on a
 * hardware-detected VMFail.
 */
static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned long cr3, cr4;
	bool vm_fail;

	if (!nested_early_check)
		return 0;

	/*
	 * Temporarily disable the MSR autoload/autostore lists so the fake
	 * VMEnter doesn't actually switch MSRs; restored below.
	 */
	if (vmx->msr_autoload.host.nr)
		vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
	if (vmx->msr_autoload.guest.nr)
		vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);

	preempt_disable();

	vmx_prepare_switch_to_guest(vcpu);

	/*
	 * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS,
	 * which is reserved to '1' by hardware.  GUEST_RFLAGS is guaranteed to
	 * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e.
	 * there is no need to preserve other bits or save/restore the field.
	 */
	vmcs_writel(GUEST_RFLAGS, 0);

	/* Refresh HOST_CR3/HOST_CR4 in vmcs02 if they've changed. */
	cr3 = __get_current_cr3_fast();
	if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
		vmcs_writel(HOST_CR3, cr3);
		vmx->loaded_vmcs->host_state.cr3 = cr3;
	}

	cr4 = cr4_read_shadow();
	if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
		vmcs_writel(HOST_CR4, cr4);
		vmx->loaded_vmcs->host_state.cr4 = cr4;
	}

	asm(
		"sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */
		"cmp %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t"
		"je 1f \n\t"
		__ex("vmwrite %%" _ASM_SP ", %[HOST_RSP]") "\n\t"
		"mov %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t"
		"1: \n\t"
		"add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */

		/* Check if vmlaunch or vmresume is needed */
		"cmpb $0, %c[launched](%[loaded_vmcs])\n\t"

		/*
		 * VMLAUNCH and VMRESUME clear RFLAGS.{CF,ZF} on VM-Exit, set
		 * RFLAGS.CF on VM-Fail Invalid and set RFLAGS.ZF on VM-Fail
		 * Valid.  vmx_vmenter() directly "returns" RFLAGS, and so the
		 * results of VM-Enter is captured via CC_{SET,OUT} to vm_fail.
		 */
		"call vmx_vmenter\n\t"

		CC_SET(be)
	      : ASM_CALL_CONSTRAINT, CC_OUT(be) (vm_fail)
	      :	[HOST_RSP]"r"((unsigned long)HOST_RSP),
		[loaded_vmcs]"r"(vmx->loaded_vmcs),
		[launched]"i"(offsetof(struct loaded_vmcs, launched)),
		[host_state_rsp]"i"(offsetof(struct loaded_vmcs, host_state.rsp)),
		[wordsize]"i"(sizeof(ulong))
	      : "memory"
	);

	/* Restore the MSR autoload counts disabled above. */
	if (vmx->msr_autoload.host.nr)
		vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
	if (vmx->msr_autoload.guest.nr)
		vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);

	if (vm_fail) {
		u32 error = vmcs_read32(VM_INSTRUCTION_ERROR);

		preempt_enable();

		trace_kvm_nested_vmenter_failed(
			"early hardware check VM-instruction error: ", error);
		WARN_ON_ONCE(error != VMXERR_ENTRY_INVALID_CONTROL_FIELD);
		return 1;
	}

	/*
	 * VMExit clears RFLAGS.IF and DR7, even on a consistency check.
	 */
	local_irq_enable();
	if (hw_breakpoint_active())
		set_debugreg(__this_cpu_read(cpu_dr7), 7);
	preempt_enable();

	/*
	 * A non-failing VMEntry means we somehow entered guest mode with
	 * an illegal RIP, and that's just the tip of the iceberg.  There
	 * is no telling what memory has been modified or what state has
	 * been exposed to unknown code.  Hitting this all but guarantees
	 * a (very critical) hardware issue.
	 */
	WARN_ON(!(vmcs_read32(VM_EXIT_REASON) &
		VMX_EXIT_REASONS_FAILED_VMENTRY));

	return 0;
}
3050 
3051 static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
3052 						 struct vmcs12 *vmcs12);
3053 
/*
 * Map/pin the guest pages referenced by vmcs12 (APIC-access page, virtual
 * APIC page, posted-interrupt descriptor) and point vmcs02 at them, then
 * enable or disable MSR-bitmap usage accordingly.  Returns false only on
 * a KVM internal error (unmappable APIC-access address).
 */
static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct kvm_host_map *map;
	struct page *page;
	u64 hpa;

	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
		/*
		 * Translate L1 physical address to host physical
		 * address for vmcs02. Keep the page pinned, so this
		 * physical address remains valid. We keep a reference
		 * to it so we can release it later.
		 */
		if (vmx->nested.apic_access_page) { /* shouldn't happen */
			kvm_release_page_clean(vmx->nested.apic_access_page);
			vmx->nested.apic_access_page = NULL;
		}
		page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr);
		if (!is_error_page(page)) {
			vmx->nested.apic_access_page = page;
			hpa = page_to_phys(vmx->nested.apic_access_page);
			vmcs_write64(APIC_ACCESS_ADDR, hpa);
		} else {
			/*
			 * No backing page: surface an emulation error to
			 * userspace rather than entering L2 with a bogus
			 * APIC-access address.
			 */
			pr_debug_ratelimited("%s: no backing 'struct page' for APIC-access address in vmcs12\n",
					     __func__);
			vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
			vcpu->run->internal.suberror =
				KVM_INTERNAL_ERROR_EMULATION;
			vcpu->run->internal.ndata = 0;
			return false;
		}
	}

	if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
		map = &vmx->nested.virtual_apic_map;

		if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) {
			vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn));
		} else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) &&
		           nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) &&
			   !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
			/*
			 * The processor will never use the TPR shadow, simply
			 * clear the bit from the execution control.  Such a
			 * configuration is useless, but it happens in tests.
			 * For any other configuration, failing the vm entry is
			 * _not_ what the processor does but it's basically the
			 * only possibility we have.
			 */
			exec_controls_clearbit(vmx, CPU_BASED_TPR_SHADOW);
		} else {
			/*
			 * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to
			 * force VM-Entry to fail.
			 */
			vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull);
		}
	}

	if (nested_cpu_has_posted_intr(vmcs12)) {
		map = &vmx->nested.pi_desc_map;

		/* The descriptor may not be page-aligned; keep its offset. */
		if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) {
			vmx->nested.pi_desc =
				(struct pi_desc *)(((void *)map->hva) +
				offset_in_page(vmcs12->posted_intr_desc_addr));
			vmcs_write64(POSTED_INTR_DESC_ADDR,
				     pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr));
		}
	}
	/* Use the MSR bitmap only if the L0/L1 bitmaps could be merged. */
	if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12))
		exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
	else
		exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
	return true;
}
3132 
3133 /*
3134  * Intel's VMX Instruction Reference specifies a common set of prerequisites
3135  * for running VMX instructions (except VMXON, whose prerequisites are
3136  * slightly different). It also specifies what exception to inject otherwise.
3137  * Note that many of these exceptions have priority over VM exits, so they
3138  * don't have to be checked again here.
3139  */
3140 static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
3141 {
3142 	if (!to_vmx(vcpu)->nested.vmxon) {
3143 		kvm_queue_exception(vcpu, UD_VECTOR);
3144 		return 0;
3145 	}
3146 
3147 	if (vmx_get_cpl(vcpu)) {
3148 		kvm_inject_gp(vcpu, 0);
3149 		return 0;
3150 	}
3151 
3152 	return 1;
3153 }
3154 
3155 static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu)
3156 {
3157 	u8 rvi = vmx_get_rvi();
3158 	u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI);
3159 
3160 	return ((rvi & 0xf0) > (vppr & 0xf0));
3161 }
3162 
3163 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
3164 				   struct vmcs12 *vmcs12);
3165 
3166 /*
3167  * If from_vmentry is false, this is being called from state restore (either RSM
3168  * or KVM_SET_NESTED_STATE).  Otherwise it's called from vmlaunch/vmresume.
3169  *
3170  * Returns:
3171  *	NVMX_ENTRY_SUCCESS: Entered VMX non-root mode
3172  *	NVMX_ENTRY_VMFAIL:  Consistency check VMFail
3173  *	NVMX_ENTRY_VMEXIT:  Consistency check VMExit
3174  *	NVMX_ENTRY_KVM_INTERNAL_ERROR: KVM internal error
3175  */
3176 enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
3177 							bool from_vmentry)
3178 {
3179 	struct vcpu_vmx *vmx = to_vmx(vcpu);
3180 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3181 	bool evaluate_pending_interrupts;
3182 	u32 exit_reason = EXIT_REASON_INVALID_STATE;
3183 	u32 exit_qual;
3184 
3185 	evaluate_pending_interrupts = exec_controls_get(vmx) &
3186 		(CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING);
3187 	if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu))
3188 		evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu);
3189 
3190 	if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
3191 		vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
3192 	if (kvm_mpx_supported() &&
3193 		!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
3194 		vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
3195 
3196 	/*
3197 	 * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and*
3198 	 * nested early checks are disabled.  In the event of a "late" VM-Fail,
3199 	 * i.e. a VM-Fail detected by hardware but not KVM, KVM must unwind its
3200 	 * software model to the pre-VMEntry host state.  When EPT is disabled,
3201 	 * GUEST_CR3 holds KVM's shadow CR3, not L1's "real" CR3, which causes
3202 	 * nested_vmx_restore_host_state() to corrupt vcpu->arch.cr3.  Stuffing
3203 	 * vmcs01.GUEST_CR3 results in the unwind naturally setting arch.cr3 to
3204 	 * the correct value.  Smashing vmcs01.GUEST_CR3 is safe because nested
3205 	 * VM-Exits, and the unwind, reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is
3206 	 * guaranteed to be overwritten with a shadow CR3 prior to re-entering
3207 	 * L1.  Don't stuff vmcs01.GUEST_CR3 when using nested early checks as
3208 	 * KVM modifies vcpu->arch.cr3 if and only if the early hardware checks
3209 	 * pass, and early VM-Fails do not reset KVM's MMU, i.e. the VM-Fail
3210 	 * path would need to manually save/restore vmcs01.GUEST_CR3.
3211 	 */
3212 	if (!enable_ept && !nested_early_check)
3213 		vmcs_writel(GUEST_CR3, vcpu->arch.cr3);
3214 
3215 	vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
3216 
3217 	prepare_vmcs02_early(vmx, vmcs12);
3218 
3219 	if (from_vmentry) {
3220 		if (unlikely(!nested_get_vmcs12_pages(vcpu)))
3221 			return NVMX_VMENTRY_KVM_INTERNAL_ERROR;
3222 
3223 		if (nested_vmx_check_vmentry_hw(vcpu)) {
3224 			vmx_switch_vmcs(vcpu, &vmx->vmcs01);
3225 			return NVMX_VMENTRY_VMFAIL;
3226 		}
3227 
3228 		if (nested_vmx_check_guest_state(vcpu, vmcs12, &exit_qual))
3229 			goto vmentry_fail_vmexit;
3230 	}
3231 
3232 	enter_guest_mode(vcpu);
3233 	if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING)
3234 		vcpu->arch.tsc_offset += vmcs12->tsc_offset;
3235 
3236 	if (prepare_vmcs02(vcpu, vmcs12, &exit_qual))
3237 		goto vmentry_fail_vmexit_guest_mode;
3238 
3239 	if (from_vmentry) {
3240 		exit_reason = EXIT_REASON_MSR_LOAD_FAIL;
3241 		exit_qual = nested_vmx_load_msr(vcpu,
3242 						vmcs12->vm_entry_msr_load_addr,
3243 						vmcs12->vm_entry_msr_load_count);
3244 		if (exit_qual)
3245 			goto vmentry_fail_vmexit_guest_mode;
3246 	} else {
3247 		/*
3248 		 * The MMU is not initialized to point at the right entities yet and
3249 		 * "get pages" would need to read data from the guest (i.e. we will
3250 		 * need to perform gpa to hpa translation). Request a call
3251 		 * to nested_get_vmcs12_pages before the next VM-entry.  The MSRs
3252 		 * have already been set at vmentry time and should not be reset.
3253 		 */
3254 		kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu);
3255 	}
3256 
3257 	/*
3258 	 * If L1 had a pending IRQ/NMI until it executed
3259 	 * VMLAUNCH/VMRESUME which wasn't delivered because it was
3260 	 * disallowed (e.g. interrupts disabled), L0 needs to
3261 	 * evaluate if this pending event should cause an exit from L2
3262 	 * to L1 or delivered directly to L2 (e.g. In case L1 don't
3263 	 * intercept EXTERNAL_INTERRUPT).
3264 	 *
3265 	 * Usually this would be handled by the processor noticing an
3266 	 * IRQ/NMI window request, or checking RVI during evaluation of
3267 	 * pending virtual interrupts.  However, this setting was done
3268 	 * on VMCS01 and now VMCS02 is active instead. Thus, we force L0
3269 	 * to perform pending event evaluation by requesting a KVM_REQ_EVENT.
3270 	 */
3271 	if (unlikely(evaluate_pending_interrupts))
3272 		kvm_make_request(KVM_REQ_EVENT, vcpu);
3273 
3274 	/*
3275 	 * Do not start the preemption timer hrtimer until after we know
3276 	 * we are successful, so that only nested_vmx_vmexit needs to cancel
3277 	 * the timer.
3278 	 */
3279 	vmx->nested.preemption_timer_expired = false;
3280 	if (nested_cpu_has_preemption_timer(vmcs12))
3281 		vmx_start_preemption_timer(vcpu);
3282 
3283 	/*
3284 	 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
3285 	 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
3286 	 * returned as far as L1 is concerned. It will only return (and set
3287 	 * the success flag) when L2 exits (see nested_vmx_vmexit()).
3288 	 */
3289 	return NVMX_VMENTRY_SUCCESS;
3290 
3291 	/*
3292 	 * A failed consistency check that leads to a VMExit during L1's
3293 	 * VMEnter to L2 is a variation of a normal VMexit, as explained in
3294 	 * 26.7 "VM-entry failures during or after loading guest state".
3295 	 */
3296 vmentry_fail_vmexit_guest_mode:
3297 	if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING)
3298 		vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
3299 	leave_guest_mode(vcpu);
3300 
3301 vmentry_fail_vmexit:
3302 	vmx_switch_vmcs(vcpu, &vmx->vmcs01);
3303 
3304 	if (!from_vmentry)
3305 		return NVMX_VMENTRY_VMEXIT;
3306 
3307 	load_vmcs12_host_state(vcpu, vmcs12);
3308 	vmcs12->vm_exit_reason = exit_reason | VMX_EXIT_REASONS_FAILED_VMENTRY;
3309 	vmcs12->exit_qualification = exit_qual;
3310 	if (enable_shadow_vmcs || vmx->nested.hv_evmcs)
3311 		vmx->nested.need_vmcs12_to_shadow_sync = true;
3312 	return NVMX_VMENTRY_VMEXIT;
3313 }
3314 
3315 /*
3316  * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1
3317  * for running an L2 nested guest.
3318  */
3319 static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
3320 {
3321 	struct vmcs12 *vmcs12;
3322 	enum nvmx_vmentry_status status;
3323 	struct vcpu_vmx *vmx = to_vmx(vcpu);
3324 	u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu);
3325 
3326 	if (!nested_vmx_check_permission(vcpu))
3327 		return 1;
3328 
3329 	if (!nested_vmx_handle_enlightened_vmptrld(vcpu, launch))
3330 		return 1;
3331 
3332 	if (!vmx->nested.hv_evmcs && vmx->nested.current_vmptr == -1ull)
3333 		return nested_vmx_failInvalid(vcpu);
3334 
3335 	vmcs12 = get_vmcs12(vcpu);
3336 
3337 	/*
3338 	 * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact
3339 	 * that there *is* a valid VMCS pointer, RFLAGS.CF is set
3340 	 * rather than RFLAGS.ZF, and no error number is stored to the
3341 	 * VM-instruction error field.
3342 	 */
3343 	if (vmcs12->hdr.shadow_vmcs)
3344 		return nested_vmx_failInvalid(vcpu);
3345 
3346 	if (vmx->nested.hv_evmcs) {
3347 		copy_enlightened_to_vmcs12(vmx);
3348 		/* Enlightened VMCS doesn't have launch state */
3349 		vmcs12->launch_state = !launch;
3350 	} else if (enable_shadow_vmcs) {
3351 		copy_shadow_to_vmcs12(vmx);
3352 	}
3353 
3354 	/*
3355 	 * The nested entry process starts with enforcing various prerequisites
3356 	 * on vmcs12 as required by the Intel SDM, and act appropriately when
3357 	 * they fail: As the SDM explains, some conditions should cause the
3358 	 * instruction to fail, while others will cause the instruction to seem
3359 	 * to succeed, but return an EXIT_REASON_INVALID_STATE.
3360 	 * To speed up the normal (success) code path, we should avoid checking
3361 	 * for misconfigurations which will anyway be caught by the processor
3362 	 * when using the merged vmcs02.
3363 	 */
3364 	if (interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS)
3365 		return nested_vmx_failValid(vcpu,
3366 			VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS);
3367 
3368 	if (vmcs12->launch_state == launch)
3369 		return nested_vmx_failValid(vcpu,
3370 			launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
3371 			       : VMXERR_VMRESUME_NONLAUNCHED_VMCS);
3372 
3373 	if (nested_vmx_check_controls(vcpu, vmcs12))
3374 		return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
3375 
3376 	if (nested_vmx_check_host_state(vcpu, vmcs12))
3377 		return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
3378 
3379 	/*
3380 	 * We're finally done with prerequisite checking, and can start with
3381 	 * the nested entry.
3382 	 */
3383 	vmx->nested.nested_run_pending = 1;
3384 	status = nested_vmx_enter_non_root_mode(vcpu, true);
3385 	if (unlikely(status != NVMX_VMENTRY_SUCCESS))
3386 		goto vmentry_failed;
3387 
3388 	/* Hide L1D cache contents from the nested guest.  */
3389 	vmx->vcpu.arch.l1tf_flush_l1d = true;
3390 
3391 	/*
3392 	 * Must happen outside of nested_vmx_enter_non_root_mode() as it will
3393 	 * also be used as part of restoring nVMX state for
3394 	 * snapshot restore (migration).
3395 	 *
3396 	 * In this flow, it is assumed that vmcs12 cache was
3397 	 * trasferred as part of captured nVMX state and should
3398 	 * therefore not be read from guest memory (which may not
3399 	 * exist on destination host yet).
3400 	 */
3401 	nested_cache_shadow_vmcs12(vcpu, vmcs12);
3402 
3403 	/*
3404 	 * If we're entering a halted L2 vcpu and the L2 vcpu won't be
3405 	 * awakened by event injection or by an NMI-window VM-exit or
3406 	 * by an interrupt-window VM-exit, halt the vcpu.
3407 	 */
3408 	if ((vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) &&
3409 	    !(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) &&
3410 	    !(vmcs12->cpu_based_vm_exec_control & CPU_BASED_NMI_WINDOW_EXITING) &&
3411 	    !((vmcs12->cpu_based_vm_exec_control & CPU_BASED_INTR_WINDOW_EXITING) &&
3412 	      (vmcs12->guest_rflags & X86_EFLAGS_IF))) {
3413 		vmx->nested.nested_run_pending = 0;
3414 		return kvm_vcpu_halt(vcpu);
3415 	}
3416 	return 1;
3417 
3418 vmentry_failed:
3419 	vmx->nested.nested_run_pending = 0;
3420 	if (status == NVMX_VMENTRY_KVM_INTERNAL_ERROR)
3421 		return 0;
3422 	if (status == NVMX_VMENTRY_VMEXIT)
3423 		return 1;
3424 	WARN_ON_ONCE(status != NVMX_VMENTRY_VMFAIL);
3425 	return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
3426 }
3427 
3428 /*
3429  * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date
3430  * because L2 may have changed some cr0 bits directly (CRO_GUEST_HOST_MASK).
3431  * This function returns the new value we should put in vmcs12.guest_cr0.
3432  * It's not enough to just return the vmcs02 GUEST_CR0. Rather,
3433  *  1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now
3434  *     available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0
3435  *     didn't trap the bit, because if L1 did, so would L0).
3436  *  2. Bits that L1 asked to trap (and therefore L0 also did) could not have
3437  *     been modified by L2, and L1 knows it. So just leave the old value of
3438  *     the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0
3439  *     isn't relevant, because if L0 traps this bit it can set it to anything.
3440  *  3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have
3441  *     changed these bits, and therefore they need to be updated, but L0
3442  *     didn't necessarily allow them to be changed in GUEST_CR0 - and rather
3443  *     put them in vmcs02 CR0_READ_SHADOW. So take these bits from there.
3444  */
3445 static inline unsigned long
3446 vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
3447 {
3448 	return
3449 	/*1*/	(vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) |
3450 	/*2*/	(vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) |
3451 	/*3*/	(vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask |
3452 			vcpu->arch.cr0_guest_owned_bits));
3453 }
3454 
3455 static inline unsigned long
3456 vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
3457 {
3458 	return
3459 	/*1*/	(vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) |
3460 	/*2*/	(vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) |
3461 	/*3*/	(vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask |
3462 			vcpu->arch.cr4_guest_owned_bits));
3463 }
3464 
/*
 * Record an injected-but-undelivered event (exception, NMI or interrupt)
 * into vmcs12's IDT-vectoring information fields on a nested VM-exit, so
 * L1 can re-inject it.
 */
static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
				      struct vmcs12 *vmcs12)
{
	u32 idt_vectoring;
	unsigned int nr;

	if (vcpu->arch.exception.injected) {
		nr = vcpu->arch.exception.nr;
		idt_vectoring = nr | VECTORING_INFO_VALID_MASK;

		if (kvm_exception_is_soft(nr)) {
			vmcs12->vm_exit_instruction_len =
				vcpu->arch.event_exit_inst_len;
			idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION;
		} else
			idt_vectoring |= INTR_TYPE_HARD_EXCEPTION;

		if (vcpu->arch.exception.has_error_code) {
			idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK;
			vmcs12->idt_vectoring_error_code =
				vcpu->arch.exception.error_code;
		}

		vmcs12->idt_vectoring_info_field = idt_vectoring;
	} else if (vcpu->arch.nmi_injected) {
		vmcs12->idt_vectoring_info_field =
			INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR;
	} else if (vcpu->arch.interrupt.injected) {
		nr = vcpu->arch.interrupt.nr;
		idt_vectoring = nr | VECTORING_INFO_VALID_MASK;

		if (vcpu->arch.interrupt.soft) {
			idt_vectoring |= INTR_TYPE_SOFT_INTR;
			/*
			 * NOTE(review): soft exceptions above store the
			 * instruction length in vm_exit_instruction_len,
			 * whereas soft interrupts store it in
			 * vm_entry_instruction_len — confirm this asymmetry
			 * is intentional.
			 */
			vmcs12->vm_entry_instruction_len =
				vcpu->arch.event_exit_inst_len;
		} else
			idt_vectoring |= INTR_TYPE_EXT_INTR;

		vmcs12->idt_vectoring_info_field = idt_vectoring;
	}
}
3506 
3507 
3508 static void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu)
3509 {
3510 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3511 	gfn_t gfn;
3512 
3513 	/*
3514 	 * Don't need to mark the APIC access page dirty; it is never
3515 	 * written to by the CPU during APIC virtualization.
3516 	 */
3517 
3518 	if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
3519 		gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT;
3520 		kvm_vcpu_mark_page_dirty(vcpu, gfn);
3521 	}
3522 
3523 	if (nested_cpu_has_posted_intr(vmcs12)) {
3524 		gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT;
3525 		kvm_vcpu_mark_page_dirty(vcpu, gfn);
3526 	}
3527 }
3528 
/*
 * Deliver any interrupts that were posted to L2's posted-interrupt
 * descriptor into the virtual-APIC page, raising GUEST_INTR_STATUS if a
 * higher-priority vector became pending.  No-op unless a nested posted
 * interrupt is actually pending.
 */
static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	int max_irr;
	void *vapic_page;
	u16 status;

	if (!vmx->nested.pi_desc || !vmx->nested.pi_pending)
		return;

	vmx->nested.pi_pending = false;
	/* Nothing to do unless the outstanding-notification bit was set. */
	if (!pi_test_and_clear_on(vmx->nested.pi_desc))
		return;

	/* find_last_bit() returns the bitmap size (256) when no bit is set. */
	max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256);
	if (max_irr != 256) {
		vapic_page = vmx->nested.virtual_apic_map.hva;
		if (!vapic_page)
			return;

		__kvm_apic_update_irr(vmx->nested.pi_desc->pir,
			vapic_page, &max_irr);
		status = vmcs_read16(GUEST_INTR_STATUS);
		/* Bump the low byte (RVI) only if the new vector is higher. */
		if ((u8)max_irr > ((u8)status & 0xff)) {
			status &= ~0xff;
			status |= (u8)max_irr;
			vmcs_write16(GUEST_INTR_STATUS, status);
		}
	}

	nested_mark_vmcs12_pages_dirty(vcpu);
}
3561 
3562 static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu,
3563 					       unsigned long exit_qual)
3564 {
3565 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3566 	unsigned int nr = vcpu->arch.exception.nr;
3567 	u32 intr_info = nr | INTR_INFO_VALID_MASK;
3568 
3569 	if (vcpu->arch.exception.has_error_code) {
3570 		vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code;
3571 		intr_info |= INTR_INFO_DELIVER_CODE_MASK;
3572 	}
3573 
3574 	if (kvm_exception_is_soft(nr))
3575 		intr_info |= INTR_TYPE_SOFT_EXCEPTION;
3576 	else
3577 		intr_info |= INTR_TYPE_HARD_EXCEPTION;
3578 
3579 	if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) &&
3580 	    vmx_get_nmi_mask(vcpu))
3581 		intr_info |= INTR_INFO_UNBLOCK_NMI;
3582 
3583 	nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual);
3584 }
3585 
/*
 * Check whether a pending event (INIT signal, exception, VMX preemption
 * timer, NMI or external interrupt) should cause a VM-exit from L2 to L1
 * and, if so, emulate that exit.  The checks are performed in priority
 * order; at most one exit is synthesized per call.
 *
 * Returns -EBUSY when an exit is wanted but currently blocked (a nested
 * VM-entry or an event injection is still in flight), 0 otherwise.
 */
static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned long exit_qual;
	bool block_nested_events =
	    vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu);
	struct kvm_lapic *apic = vcpu->arch.apic;

	if (lapic_in_kernel(vcpu) &&
		test_bit(KVM_APIC_INIT, &apic->pending_events)) {
		if (block_nested_events)
			return -EBUSY;
		clear_bit(KVM_APIC_INIT, &apic->pending_events);
		nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0);
		return 0;
	}

	if (vcpu->arch.exception.pending &&
		nested_vmx_check_exception(vcpu, &exit_qual)) {
		if (block_nested_events)
			return -EBUSY;
		nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
		return 0;
	}

	if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) &&
	    vmx->nested.preemption_timer_expired) {
		if (block_nested_events)
			return -EBUSY;
		nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0);
		return 0;
	}

	if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) {
		if (block_nested_events)
			return -EBUSY;
		nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
				  NMI_VECTOR | INTR_TYPE_NMI_INTR |
				  INTR_INFO_VALID_MASK, 0);
		/*
		 * The NMI-triggered VM exit counts as injection:
		 * clear this one and block further NMIs.
		 */
		vcpu->arch.nmi_pending = 0;
		vmx_set_nmi_mask(vcpu, true);
		return 0;
	}

	if ((kvm_cpu_has_interrupt(vcpu) || external_intr) &&
	    nested_exit_on_intr(vcpu)) {
		if (block_nested_events)
			return -EBUSY;
		nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
		return 0;
	}

	/* No exit was synthesized; deliver any pending posted interrupt. */
	vmx_complete_nested_posted_interrupt(vcpu);
	return 0;
}
3645 
3646 static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
3647 {
3648 	ktime_t remaining =
3649 		hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer);
3650 	u64 value;
3651 
3652 	if (ktime_to_ns(remaining) <= 0)
3653 		return 0;
3654 
3655 	value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz;
3656 	do_div(value, 1000000);
3657 	return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
3658 }
3659 
3660 static bool is_vmcs12_ext_field(unsigned long field)
3661 {
3662 	switch (field) {
3663 	case GUEST_ES_SELECTOR:
3664 	case GUEST_CS_SELECTOR:
3665 	case GUEST_SS_SELECTOR:
3666 	case GUEST_DS_SELECTOR:
3667 	case GUEST_FS_SELECTOR:
3668 	case GUEST_GS_SELECTOR:
3669 	case GUEST_LDTR_SELECTOR:
3670 	case GUEST_TR_SELECTOR:
3671 	case GUEST_ES_LIMIT:
3672 	case GUEST_CS_LIMIT:
3673 	case GUEST_SS_LIMIT:
3674 	case GUEST_DS_LIMIT:
3675 	case GUEST_FS_LIMIT:
3676 	case GUEST_GS_LIMIT:
3677 	case GUEST_LDTR_LIMIT:
3678 	case GUEST_TR_LIMIT:
3679 	case GUEST_GDTR_LIMIT:
3680 	case GUEST_IDTR_LIMIT:
3681 	case GUEST_ES_AR_BYTES:
3682 	case GUEST_DS_AR_BYTES:
3683 	case GUEST_FS_AR_BYTES:
3684 	case GUEST_GS_AR_BYTES:
3685 	case GUEST_LDTR_AR_BYTES:
3686 	case GUEST_TR_AR_BYTES:
3687 	case GUEST_ES_BASE:
3688 	case GUEST_CS_BASE:
3689 	case GUEST_SS_BASE:
3690 	case GUEST_DS_BASE:
3691 	case GUEST_FS_BASE:
3692 	case GUEST_GS_BASE:
3693 	case GUEST_LDTR_BASE:
3694 	case GUEST_TR_BASE:
3695 	case GUEST_GDTR_BASE:
3696 	case GUEST_IDTR_BASE:
3697 	case GUEST_PENDING_DBG_EXCEPTIONS:
3698 	case GUEST_BNDCFGS:
3699 		return true;
3700 	default:
3701 		break;
3702 	}
3703 
3704 	return false;
3705 }
3706 
/*
 * Copy the "rare" guest-state fields (see is_vmcs12_ext_field()) from the
 * currently loaded VMCS, which must be vmcs02, into vmcs12, and clear the
 * deferred-sync flag.  Callers running with vmcs01 loaded must go through
 * copy_vmcs02_to_vmcs12_rare() instead.
 */
static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
				       struct vmcs12 *vmcs12)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
	vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
	vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
	vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
	vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
	vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
	vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
	vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
	vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
	vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
	vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
	vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
	vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
	vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
	vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
	vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
	vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
	vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
	vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
	vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
	vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
	vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
	vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
	vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
	vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE);
	vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
	vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
	vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
	vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
	vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
	vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
	vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
	vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
	vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
	vmcs12->guest_pending_dbg_exceptions =
		vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
	/* GUEST_BNDCFGS only exists when MPX is supported. */
	if (kvm_mpx_supported())
		vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);

	vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false;
}
3753 
/*
 * Perform a deferred sync of the rare vmcs12 fields while vmcs01 is the
 * loaded VMCS: temporarily switch to vmcs02, copy the fields, then switch
 * back.  No-op when no deferred sync is pending.
 */
static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
				       struct vmcs12 *vmcs12)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	int cpu;

	if (!vmx->nested.need_sync_vmcs02_to_vmcs12_rare)
		return;


	WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01);

	/* Keep preemption disabled while the "wrong" VMCS is active. */
	cpu = get_cpu();
	vmx->loaded_vmcs = &vmx->nested.vmcs02;
	vmx_vcpu_load(&vmx->vcpu, cpu);

	sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);

	vmx->loaded_vmcs = &vmx->vmcs01;
	vmx_vcpu_load(&vmx->vcpu, cpu);
	put_cpu();
}
3776 
3777 /*
3778  * Update the guest state fields of vmcs12 to reflect changes that
3779  * occurred while L2 was running. (The "IA-32e mode guest" bit of the
3780  * VM-entry controls is also updated, since this is really a guest
3781  * state bit.)
3782  */
static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * With an enlightened VMCS, sync the rare fields eagerly; otherwise
	 * just flag them for lazy sync via copy_vmcs02_to_vmcs12_rare().
	 */
	if (vmx->nested.hv_evmcs)
		sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);

	vmx->nested.need_sync_vmcs02_to_vmcs12_rare = !vmx->nested.hv_evmcs;

	vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
	vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);

	vmcs12->guest_rsp = kvm_rsp_read(vcpu);
	vmcs12->guest_rip = kvm_rip_read(vcpu);
	vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS);

	vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
	vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);

	vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
	vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
	vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);

	vmcs12->guest_interruptibility_info =
		vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);

	if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
		vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT;
	else
		vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE;

	/* Save the remaining preemption-timer ticks if L1 asked for them. */
	if (nested_cpu_has_preemption_timer(vmcs12) &&
	    vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
			vmcs12->vmx_preemption_timer_value =
				vmx_get_preemption_timer_value(vcpu);

	/*
	 * In some cases (usually, nested EPT), L2 is allowed to change its
	 * own CR3 without exiting. If it has changed it, we must keep it.
	 * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined
	 * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12.
	 *
	 * Additionally, restore L2's PDPTR to vmcs12.
	 */
	if (enable_ept) {
		vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3);
		if (nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) {
			vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
			vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
			vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
			vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
		}
	}

	vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);

	if (nested_cpu_has_vid(vmcs12))
		vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS);

	/* "IA-32e mode guest" is really guest state; mirror it into vmcs12. */
	vmcs12->vm_entry_controls =
		(vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
		(vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);

	if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS)
		kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7);

	if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER)
		vmcs12->guest_ia32_efer = vcpu->arch.efer;
}
3852 
3853 /*
3854  * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits
3855  * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12),
3856  * and this function updates it to reflect the changes to the guest state while
3857  * L2 was running (and perhaps made some exits which were handled directly by L0
3858  * without going back to L1), and to reflect the exit reason.
3859  * Note that we do not have to copy here all VMCS fields, just those that
3860  * could have changed by the L2 guest or the exit - i.e., the guest-state and
3861  * exit-information fields only. Other fields are modified by L1 with VMWRITE,
3862  * which already writes to vmcs12 directly.
3863  */
static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
			   u32 exit_reason, u32 exit_intr_info,
			   unsigned long exit_qualification)
{
	/* update exit information fields: */
	vmcs12->vm_exit_reason = exit_reason;
	vmcs12->exit_qualification = exit_qualification;
	vmcs12->vm_exit_intr_info = exit_intr_info;

	vmcs12->idt_vectoring_info_field = 0;
	vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
	vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);

	/* The fields below are only updated for a genuine (non-failed) exit. */
	if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) {
		vmcs12->launch_state = 1;

		/* vm_entry_intr_info_field is cleared on exit. Emulate this
		 * instead of reading the real value. */
		vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK;

		/*
		 * Transfer the event that L0 or L1 may have wanted to inject
		 * into L2 to IDT_VECTORING_INFO_FIELD.
		 */
		vmcs12_save_pending_event(vcpu, vmcs12);

		/*
		 * According to spec, there's no need to store the guest's
		 * MSRs if the exit is due to a VM-entry failure that occurs
		 * during or after loading the guest state. Since this exit
		 * does not fall in that category, we need to save the MSRs.
		 */
		if (nested_vmx_store_msr(vcpu,
					 vmcs12->vm_exit_msr_store_addr,
					 vmcs12->vm_exit_msr_store_count))
			nested_vmx_abort(vcpu,
					 VMX_ABORT_SAVE_GUEST_MSR_FAIL);
	}

	/*
	 * Drop what we picked up for L2 via vmx_complete_interrupts. It is
	 * preserved above and would only end up incorrectly in L1.
	 */
	vcpu->arch.nmi_injected = false;
	kvm_clear_exception_queue(vcpu);
	kvm_clear_interrupt_queue(vcpu);
}
3911 
3912 /*
 * A part of what we need to do when the nested L2 guest exits and we want to
3914  * run its L1 parent, is to reset L1's guest state to the host state specified
3915  * in vmcs12.
3916  * This function is to be called not only on normal nested exit, but also on
3917  * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry
3918  * Failures During or After Loading Guest State").
3919  * This function should be called when the active VMCS is L1's (vmcs01).
3920  */
static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
				   struct vmcs12 *vmcs12)
{
	struct kvm_segment seg;
	u32 entry_failure_code;

	/* Load EFER first: vmx_set_cr0() below consults vcpu->arch.efer. */
	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
		vcpu->arch.efer = vmcs12->host_ia32_efer;
	else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
		vcpu->arch.efer |= (EFER_LMA | EFER_LME);
	else
		vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
	vmx_set_efer(vcpu, vcpu->arch.efer);

	kvm_rsp_write(vcpu, vmcs12->host_rsp);
	kvm_rip_write(vcpu, vmcs12->host_rip);
	vmx_set_rflags(vcpu, X86_EFLAGS_FIXED);
	vmx_set_interrupt_shadow(vcpu, 0);

	/*
	 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't
	 * actually changed, because vmx_set_cr0 refers to efer set above.
	 *
	 * CR0_GUEST_HOST_MASK is already set in the original vmcs01
	 * (KVM doesn't change it);
	 */
	vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
	vmx_set_cr0(vcpu, vmcs12->host_cr0);

	/* Same as above - no reason to call set_cr4_guest_host_mask().  */
	vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
	vmx_set_cr4(vcpu, vmcs12->host_cr4);

	nested_ept_uninit_mmu_context(vcpu);

	/*
	 * Only PDPTE load can fail as the value of cr3 was checked on entry and
	 * couldn't have changed.
	 */
	if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code))
		nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);

	if (!enable_ept)
		vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;

	/*
	 * If vmcs01 doesn't use VPID, CPU flushes TLB on every
	 * VMEntry/VMExit. Thus, no need to flush TLB.
	 *
	 * If vmcs12 doesn't use VPID, L1 expects TLB to be
	 * flushed on every VMEntry/VMExit.
	 *
	 * Otherwise, we can preserve TLB entries as long as we are
	 * able to tag L1 TLB entries differently than L2 TLB entries.
	 *
	 * If vmcs12 uses EPT, we need to execute this flush on EPTP01
	 * and therefore we request the TLB flush to happen only after VMCS EPTP
	 * has been set by KVM_REQ_LOAD_CR3.
	 */
	if (enable_vpid &&
	    (!nested_cpu_has_vpid(vmcs12) || !nested_has_guest_tlb_tag(vcpu))) {
		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
	}

	vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
	vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp);
	vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);
	vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
	vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
	vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF);
	vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF);

	/* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1.  */
	if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS)
		vmcs_write64(GUEST_BNDCFGS, 0);

	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) {
		vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
		vcpu->arch.pat = vmcs12->host_ia32_pat;
	}
	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
		SET_MSR_OR_WARN(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
				vmcs12->host_ia32_perf_global_ctrl);

	/*
	 * Set L1 segment info according to Intel SDM 27.5.2, "Loading
	 * Host Segment and Descriptor-Table Registers".
	 */
	seg = (struct kvm_segment) {
		.base = 0,
		.limit = 0xFFFFFFFF,
		.selector = vmcs12->host_cs_selector,
		.type = 11,
		.present = 1,
		.s = 1,
		.g = 1
	};
	if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
		seg.l = 1;
	else
		seg.db = 1;
	vmx_set_segment(vcpu, &seg, VCPU_SREG_CS);
	seg = (struct kvm_segment) {
		.base = 0,
		.limit = 0xFFFFFFFF,
		.type = 3,
		.present = 1,
		.s = 1,
		.db = 1,
		.g = 1
	};
	seg.selector = vmcs12->host_ds_selector;
	vmx_set_segment(vcpu, &seg, VCPU_SREG_DS);
	seg.selector = vmcs12->host_es_selector;
	vmx_set_segment(vcpu, &seg, VCPU_SREG_ES);
	seg.selector = vmcs12->host_ss_selector;
	vmx_set_segment(vcpu, &seg, VCPU_SREG_SS);
	seg.selector = vmcs12->host_fs_selector;
	seg.base = vmcs12->host_fs_base;
	vmx_set_segment(vcpu, &seg, VCPU_SREG_FS);
	seg.selector = vmcs12->host_gs_selector;
	seg.base = vmcs12->host_gs_base;
	vmx_set_segment(vcpu, &seg, VCPU_SREG_GS);
	seg = (struct kvm_segment) {
		.base = vmcs12->host_tr_base,
		.limit = 0x67,
		.selector = vmcs12->host_tr_selector,
		.type = 11,
		.present = 1
	};
	vmx_set_segment(vcpu, &seg, VCPU_SREG_TR);

	kvm_set_dr(vcpu, 7, 0x400);
	vmcs_write64(GUEST_IA32_DEBUGCTL, 0);

	if (cpu_has_vmx_msr_bitmap())
		vmx_update_msr_bitmap(vcpu);

	/* Process L1's VM-exit MSR-load list; failure triggers VMX abort. */
	if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
				vmcs12->vm_exit_msr_load_count))
		nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
}
4061 
4062 static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx)
4063 {
4064 	struct shared_msr_entry *efer_msr;
4065 	unsigned int i;
4066 
4067 	if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER)
4068 		return vmcs_read64(GUEST_IA32_EFER);
4069 
4070 	if (cpu_has_load_ia32_efer())
4071 		return host_efer;
4072 
4073 	for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) {
4074 		if (vmx->msr_autoload.guest.val[i].index == MSR_EFER)
4075 			return vmx->msr_autoload.guest.val[i].value;
4076 	}
4077 
4078 	efer_msr = find_msr_entry(vmx, MSR_EFER);
4079 	if (efer_msr)
4080 		return efer_msr->data;
4081 
4082 	return host_efer;
4083 }
4084 
/*
 * Restore L1's host state to KVM's software model after an early L2
 * VM-entry failure (VMFail).  Hardware never performed a real VM-exit in
 * this case, so instead of processing the exit "load host state" fields,
 * unwind the guest state that was already propagated into KVM's model.
 */
static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmx_msr_entry g, h;
	gpa_t gpa;
	u32 i, j;

	vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT);

	if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) {
		/*
		 * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set
		 * as vmcs01.GUEST_DR7 contains a userspace defined value
		 * and vcpu->arch.dr7 is not squirreled away before the
		 * nested VMENTER (not worth adding a variable in nested_vmx).
		 */
		if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
			kvm_set_dr(vcpu, 7, DR7_FIXED_1);
		else
			WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7)));
	}

	/*
	 * Note that calling vmx_set_{efer,cr0,cr4} is important as they
	 * handle a variety of side effects to KVM's software model.
	 */
	vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx));

	vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
	vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW));

	vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
	vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW));

	nested_ept_uninit_mmu_context(vcpu);
	vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
	kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);

	/*
	 * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs
	 * from vmcs01 (if necessary).  The PDPTRs are not loaded on
	 * VMFail, like everything else we just need to ensure our
	 * software model is up-to-date.
	 */
	if (enable_ept)
		ept_save_pdptrs(vcpu);

	kvm_mmu_reset_context(vcpu);

	if (cpu_has_vmx_msr_bitmap())
		vmx_update_msr_bitmap(vcpu);

	/*
	 * This nasty bit of open coding is a compromise between blindly
	 * loading L1's MSRs using the exit load lists (incorrect emulation
	 * of VMFail), leaving the nested VM's MSRs in the software model
	 * (incorrect behavior) and snapshotting the modified MSRs (too
	 * expensive since the lists are unbound by hardware).  For each
	 * MSR that was (prematurely) loaded from the nested VMEntry load
	 * list, reload it from the exit load list if it exists and differs
	 * from the guest value.  The intent is to stuff host state as
	 * silently as possible, not to fully process the exit load list.
	 */
	for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) {
		gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g));
		if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) {
			pr_debug_ratelimited(
				"%s read MSR index failed (%u, 0x%08llx)\n",
				__func__, i, gpa);
			goto vmabort;
		}

		for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) {
			gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h));
			if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) {
				pr_debug_ratelimited(
					"%s read MSR failed (%u, 0x%08llx)\n",
					__func__, j, gpa);
				goto vmabort;
			}
			if (h.index != g.index)
				continue;
			if (h.value == g.value)
				break;

			if (nested_vmx_load_msr_check(vcpu, &h)) {
				pr_debug_ratelimited(
					"%s check failed (%u, 0x%x, 0x%x)\n",
					__func__, j, h.index, h.reserved);
				goto vmabort;
			}

			if (kvm_set_msr(vcpu, h.index, h.value)) {
				pr_debug_ratelimited(
					"%s WRMSR failed (%u, 0x%x, 0x%llx)\n",
					__func__, j, h.index, h.value);
				goto vmabort;
			}
		}
	}

	return;

vmabort:
	nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
}
4192 
4193 /*
4194  * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1
4195  * and modify vmcs12 to make it see what it would expect to see there if
4196  * L2 was its real guest. Must only be called when in L2 (is_guest_mode())
4197  */
void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
		       u32 exit_intr_info, unsigned long exit_qualification)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	/* trying to cancel vmlaunch/vmresume is a bug */
	WARN_ON_ONCE(vmx->nested.nested_run_pending);

	leave_guest_mode(vcpu);

	if (nested_cpu_has_preemption_timer(vmcs12))
		hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer);

	/* Undo the TSC offset L1 requested for L2's execution. */
	if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING)
		vcpu->arch.tsc_offset -= vmcs12->tsc_offset;

	if (likely(!vmx->fail)) {
		sync_vmcs02_to_vmcs12(vcpu, vmcs12);

		/* exit_reason == -1 requests leaving L2 without a VM-exit. */
		if (exit_reason != -1)
			prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
				       exit_qualification);

		/*
		 * Must happen outside of sync_vmcs02_to_vmcs12() as it will
		 * also be used to capture vmcs12 cache as part of
		 * capturing nVMX state for snapshot (migration).
		 *
		 * Otherwise, this flush will dirty guest memory at a
		 * point it is already assumed by user-space to be
		 * immutable.
		 */
		nested_flush_cached_shadow_vmcs12(vcpu, vmcs12);
	} else {
		/*
		 * The only expected VM-instruction error is "VM entry with
		 * invalid control field(s)." Anything else indicates a
		 * problem with L0.  And we should never get here with a
		 * VMFail of any type if early consistency checks are enabled.
		 */
		WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) !=
			     VMXERR_ENTRY_INVALID_CONTROL_FIELD);
		WARN_ON_ONCE(nested_early_check);
	}

	vmx_switch_vmcs(vcpu, &vmx->vmcs01);

	/* Update any VMCS fields that might have changed while L2 ran */
	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
	vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
	if (vmx->nested.l1_tpr_threshold != -1)
		vmcs_write32(TPR_THRESHOLD, vmx->nested.l1_tpr_threshold);

	if (kvm_has_tsc_control)
		decache_tsc_multiplier(vmx);

	/* Apply any virtual-APIC mode change that was deferred while in L2. */
	if (vmx->nested.change_vmcs01_virtual_apic_mode) {
		vmx->nested.change_vmcs01_virtual_apic_mode = false;
		vmx_set_virtual_apic_mode(vcpu);
	}

	/* Unpin physical memory we referred to in vmcs02 */
	if (vmx->nested.apic_access_page) {
		kvm_release_page_clean(vmx->nested.apic_access_page);
		vmx->nested.apic_access_page = NULL;
	}
	kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
	kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
	vmx->nested.pi_desc = NULL;

	/*
	 * An mmu_notifier may have changed the hpa backing the APIC-access
	 * page while L2 ran; have it reloaded for L1 before entering L1.
	 */
	kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);

	if ((exit_reason != -1) && (enable_shadow_vmcs || vmx->nested.hv_evmcs))
		vmx->nested.need_vmcs12_to_shadow_sync = true;

	/* in case we halted in L2 */
	vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;

	if (likely(!vmx->fail)) {
		/*
		 * TODO: SDM says that with acknowledge interrupt on
		 * exit, bit 31 of the VM-exit interrupt information
		 * (valid interrupt) is always set to 1 on
		 * EXIT_REASON_EXTERNAL_INTERRUPT, so we shouldn't
		 * need kvm_cpu_has_interrupt().  See the commit
		 * message for details.
		 */
		if (nested_exit_intr_ack_set(vcpu) &&
		    exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
		    kvm_cpu_has_interrupt(vcpu)) {
			int irq = kvm_cpu_get_interrupt(vcpu);
			WARN_ON(irq < 0);
			vmcs12->vm_exit_intr_info = irq |
				INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR;
		}

		if (exit_reason != -1)
			trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
						       vmcs12->exit_qualification,
						       vmcs12->idt_vectoring_info_field,
						       vmcs12->vm_exit_intr_info,
						       vmcs12->vm_exit_intr_error_code,
						       KVM_ISA_VMX);

		load_vmcs12_host_state(vcpu, vmcs12);

		return;
	}

	/*
	 * After an early L2 VM-entry failure, we're now back
	 * in L1 which thinks it just finished a VMLAUNCH or
	 * VMRESUME instruction, so we need to set the failure
	 * flag and the VM-instruction error field of the VMCS
	 * accordingly, and skip the emulated instruction.
	 */
	(void)nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);

	/*
	 * Restore L1's host state to KVM's software model.  We're here
	 * because a consistency check was caught by hardware, which
	 * means some amount of guest state has been propagated to KVM's
	 * model and needs to be unwound to the host's state.
	 */
	nested_vmx_restore_host_state(vcpu);

	vmx->fail = 0;
}
4332 
4333 /*
4334  * Decode the memory-address operand of a vmx instruction, as recorded on an
4335  * exit caused by such an instruction (run by a guest hypervisor).
4336  * On success, returns 0. When the operand is invalid, returns 1 and throws
4337  * #UD or #GP.
4338  */
int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
			u32 vmx_instruction_info, bool wr, int len, gva_t *ret)
{
	gva_t off;
	bool exn;
	struct kvm_segment s;

	/*
	 * According to Vol. 3B, "Information for VM Exits Due to Instruction
	 * Execution", on an exit, vmx_instruction_info holds most of the
	 * addressing components of the operand. Only the displacement part
	 * is put in exit_qualification (see 3B, "Basic VM-Exit Information").
	 * For how an actual address is calculated from all these components,
	 * refer to Vol. 1, "Operand Addressing".
	 */
	/* Bit layout below follows the VM-exit instruction-information field. */
	int  scaling = vmx_instruction_info & 3;
	int  addr_size = (vmx_instruction_info >> 7) & 7;
	bool is_reg = vmx_instruction_info & (1u << 10);
	int  seg_reg = (vmx_instruction_info >> 15) & 7;
	int  index_reg = (vmx_instruction_info >> 18) & 0xf;
	bool index_is_valid = !(vmx_instruction_info & (1u << 22));
	int  base_reg       = (vmx_instruction_info >> 23) & 0xf;
	bool base_is_valid  = !(vmx_instruction_info & (1u << 27));

	/* A register operand is malformed for these instructions: #UD. */
	if (is_reg) {
		kvm_queue_exception(vcpu, UD_VECTOR);
		return 1;
	}

	/* Addr = segment_base + offset */
	/* offset = base + [index * scale] + displacement */
	off = exit_qualification; /* holds the displacement */
	if (addr_size == 1)
		off = (gva_t)sign_extend64(off, 31);
	else if (addr_size == 0)
		off = (gva_t)sign_extend64(off, 15);
	if (base_is_valid)
		off += kvm_register_read(vcpu, base_reg);
	if (index_is_valid)
		off += kvm_register_read(vcpu, index_reg)<<scaling;
	vmx_get_segment(vcpu, &s, seg_reg);

	/*
	 * The effective address, i.e. @off, of a memory operand is truncated
	 * based on the address size of the instruction.  Note that this is
	 * the *effective address*, i.e. the address prior to accounting for
	 * the segment's base.
	 */
	if (addr_size == 1) /* 32 bit */
		off &= 0xffffffff;
	else if (addr_size == 0) /* 16 bit */
		off &= 0xffff;

	/* Checks for #GP/#SS exceptions. */
	exn = false;
	if (is_long_mode(vcpu)) {
		/*
		 * The virtual/linear address is never truncated in 64-bit
		 * mode, e.g. a 32-bit address size can yield a 64-bit virtual
		 * address when using FS/GS with a non-zero base.
		 */
		if (seg_reg == VCPU_SREG_FS || seg_reg == VCPU_SREG_GS)
			*ret = s.base + off;
		else
			*ret = off;

		/* Long mode: #GP(0)/#SS(0) if the memory address is in a
		 * non-canonical form. This is the only check on the memory
		 * destination for long mode!
		 */
		exn = is_noncanonical_address(*ret, vcpu);
	} else {
		/*
		 * When not in long mode, the virtual/linear address is
		 * unconditionally truncated to 32 bits regardless of the
		 * address size.
		 */
		*ret = (s.base + off) & 0xffffffff;

		/* Protected mode: apply checks for segment validity in the
		 * following order:
		 * - segment type check (#GP(0) may be thrown)
		 * - usability check (#GP(0)/#SS(0))
		 * - limit check (#GP(0)/#SS(0))
		 */
		if (wr)
			/* #GP(0) if the destination operand is located in a
			 * read-only data segment or any code segment.
			 */
			exn = ((s.type & 0xa) == 0 || (s.type & 8));
		else
			/* #GP(0) if the source operand is located in an
			 * execute-only code segment
			 */
			exn = ((s.type & 0xa) == 8);
		if (exn) {
			kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
			return 1;
		}
		/* Protected mode: #GP(0)/#SS(0) if the segment is unusable.
		 */
		exn = (s.unusable != 0);

		/*
		 * Protected mode: #GP(0)/#SS(0) if the memory operand is
		 * outside the segment limit.  All CPUs that support VMX ignore
		 * limit checks for flat segments, i.e. segments with base==0,
		 * limit==0xffffffff and of type expand-up data or code.
		 */
		if (!(s.base == 0 && s.limit == 0xffffffff &&
		     ((s.type & 8) || !(s.type & 4))))
			exn = exn || ((u64)off + len - 1 > s.limit);
	}
	if (exn) {
		/* Faults through SS get #SS; all other segments get #GP. */
		kvm_queue_exception_e(vcpu,
				      seg_reg == VCPU_SREG_SS ?
						SS_VECTOR : GP_VECTOR,
				      0);
		return 1;
	}

	return 0;
}
4462 
4463 void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu)
4464 {
4465 	struct vcpu_vmx *vmx;
4466 
4467 	if (!nested_vmx_allowed(vcpu))
4468 		return;
4469 
4470 	vmx = to_vmx(vcpu);
4471 	if (kvm_x86_ops->pmu_ops->is_valid_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL)) {
4472 		vmx->nested.msrs.entry_ctls_high |=
4473 				VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
4474 		vmx->nested.msrs.exit_ctls_high |=
4475 				VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
4476 	} else {
4477 		vmx->nested.msrs.entry_ctls_high &=
4478 				~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
4479 		vmx->nested.msrs.exit_ctls_high &=
4480 				~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
4481 	}
4482 }
4483 
/*
 * Read the 64-bit VMCS pointer operand of the current VMX instruction from
 * guest memory into @vmpointer.  Returns 0 on success; returns 1 and queues
 * the appropriate exception (#UD/#GP/#SS or a page fault) on failure.
 */
static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer)
{
	gva_t gva;
	struct x86_exception e;

	/* Decode the memory operand's linear address from the exit info. */
	if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
				vmcs_read32(VMX_INSTRUCTION_INFO), false,
				sizeof(*vmpointer), &gva))
		return 1;

	/* Translation/permission fault is reflected back as a page fault. */
	if (kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e)) {
		kvm_inject_page_fault(vcpu, &e);
		return 1;
	}

	return 0;
}
4501 
4502 /*
4503  * Allocate a shadow VMCS and associate it with the currently loaded
4504  * VMCS, unless such a shadow VMCS already exists. The newly allocated
4505  * VMCS is also VMCLEARed, so that it is ready for use.
4506  */
static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs;

	/*
	 * We should allocate a shadow vmcs for vmcs01 only when L1
	 * executes VMXON and free it when L1 executes VMXOFF.
	 * As it is invalid to execute VMXON twice, we shouldn't reach
	 * here when vmcs01 already have an allocated shadow vmcs.
	 */
	WARN_ON(loaded_vmcs == &vmx->vmcs01 && loaded_vmcs->shadow_vmcs);

	if (!loaded_vmcs->shadow_vmcs) {
		loaded_vmcs->shadow_vmcs = alloc_vmcs(true);
		/* VMCLEAR the new shadow VMCS so it's ready for VMPTRLD. */
		if (loaded_vmcs->shadow_vmcs)
			vmcs_clear(loaded_vmcs->shadow_vmcs);
	}
	/* May be NULL if allocation failed; callers must check. */
	return loaded_vmcs->shadow_vmcs;
}
4527 
/*
 * Allocate and initialize all state needed to emulate VMX operation for L1
 * (called on emulated VMXON): vmcs02, the vmcs12 caches, an optional shadow
 * VMCS, the preemption timer and vpid02.  Returns 0 on success or -ENOMEM,
 * unwinding all prior allocations via the goto chain on failure.
 */
static int enter_vmx_operation(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	int r;

	r = alloc_loaded_vmcs(&vmx->nested.vmcs02);
	if (r < 0)
		goto out_vmcs02;

	vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
	if (!vmx->nested.cached_vmcs12)
		goto out_cached_vmcs12;

	vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
	if (!vmx->nested.cached_shadow_vmcs12)
		goto out_cached_shadow_vmcs12;

	if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu))
		goto out_shadow_vmcs;

	/* Timer used to emulate the VMX-preemption timer for L2. */
	hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
		     HRTIMER_MODE_REL_PINNED);
	vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;

	vmx->nested.vpid02 = allocate_vpid();

	vmx->nested.vmcs02_initialized = false;
	/* Point of no return: the vCPU is now in VMX root operation. */
	vmx->nested.vmxon = true;

	if (pt_mode == PT_MODE_HOST_GUEST) {
		vmx->pt_desc.guest.ctl = 0;
		pt_update_intercept_for_msr(vmx);
	}

	return 0;

out_shadow_vmcs:
	kfree(vmx->nested.cached_shadow_vmcs12);

out_cached_shadow_vmcs12:
	kfree(vmx->nested.cached_vmcs12);

out_cached_vmcs12:
	free_loaded_vmcs(&vmx->nested.vmcs02);

out_vmcs02:
	return -ENOMEM;
}
4576 
4577 /*
4578  * Emulate the VMXON instruction.
4579  * Currently, we just remember that VMX is active, and do not save or even
4580  * inspect the argument to VMXON (the so-called "VMXON pointer") because we
4581  * do not currently need to store anything in that guest-allocated memory
4582  * region. Consequently, VMCLEAR and VMPTRLD also do not verify that the their
4583  * argument is different from the VMXON pointer (which the spec says they do).
4584  */
static int handle_vmon(struct kvm_vcpu *vcpu)
{
	int ret;
	gpa_t vmptr;
	uint32_t revision;
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED
		| FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;

	/*
	 * The Intel VMX Instruction Reference lists a bunch of bits that are
	 * prerequisite to running VMXON, most notably cr4.VMXE must be set to
	 * 1 (see vmx_set_cr4() for when we allow the guest to set this).
	 * Otherwise, we should fail with #UD.  But most faulting conditions
	 * have already been checked by hardware, prior to the VM-exit for
	 * VMXON.  We do test guest cr4.VMXE because processor CR4 always has
	 * that bit set to 1 in non-root mode.
	 */
	if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) {
		kvm_queue_exception(vcpu, UD_VECTOR);
		return 1;
	}

	/* CPL=0 must be checked manually. */
	if (vmx_get_cpl(vcpu)) {
		kvm_inject_gp(vcpu, 0);
		return 1;
	}

	/* VMXON inside VMX root operation sets the VMfail error code. */
	if (vmx->nested.vmxon)
		return nested_vmx_failValid(vcpu,
			VMXERR_VMXON_IN_VMX_ROOT_OPERATION);

	/* IA32_FEATURE_CONTROL must be locked with VMXON enabled. */
	if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
			!= VMXON_NEEDED_FEATURES) {
		kvm_inject_gp(vcpu, 0);
		return 1;
	}

	if (nested_vmx_get_vmptr(vcpu, &vmptr))
		return 1;

	/*
	 * SDM 3: 24.11.5
	 * The first 4 bytes of VMXON region contain the supported
	 * VMCS revision identifier
	 *
	 * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case;
	 * which replaces physical address width with 32
	 */
	if (!page_address_valid(vcpu, vmptr))
		return nested_vmx_failInvalid(vcpu);

	if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) ||
	    revision != VMCS12_REVISION)
		return nested_vmx_failInvalid(vcpu);

	vmx->nested.vmxon_ptr = vmptr;
	ret = enter_vmx_operation(vcpu);
	if (ret)
		return ret;

	return nested_vmx_succeed(vcpu);
}
4649 
/*
 * Detach the current vmcs12: sync all cached/shadowed state back into the
 * software vmcs12 cache, flush the cache to guest memory, and invalidate
 * vmx->nested.current_vmptr.  No-op if no VMCS is currently loaded.
 */
static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (vmx->nested.current_vmptr == -1ull)
		return;

	/* Pull rarely-accessed fields out of vmcs02 before flushing. */
	copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu));

	if (enable_shadow_vmcs) {
		/* copy to memory all shadowed fields in case
		   they were modified */
		copy_shadow_to_vmcs12(vmx);
		vmx_disable_shadow_vmcs(vmx);
	}
	vmx->nested.posted_intr_nv = -1;

	/* Flush VMCS12 to guest memory */
	kvm_vcpu_write_guest_page(vcpu,
				  vmx->nested.current_vmptr >> PAGE_SHIFT,
				  vmx->nested.cached_vmcs12, 0, VMCS12_SIZE);

	kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);

	vmx->nested.current_vmptr = -1ull;
}
4676 
4677 /* Emulate the VMXOFF instruction */
static int handle_vmoff(struct kvm_vcpu *vcpu)
{
	/* #UD/#GP if not in VMX operation or CPL > 0. */
	if (!nested_vmx_check_permission(vcpu))
		return 1;

	/* Tear down all nested state allocated at VMXON. */
	free_nested(vcpu);

	/* Process a latched INIT during time CPU was in VMX operation */
	kvm_make_request(KVM_REQ_EVENT, vcpu);

	return nested_vmx_succeed(vcpu);
}
4690 
4691 /* Emulate the VMCLEAR instruction */
static int handle_vmclear(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u32 zero = 0;
	gpa_t vmptr;
	u64 evmcs_gpa;

	if (!nested_vmx_check_permission(vcpu))
		return 1;

	if (nested_vmx_get_vmptr(vcpu, &vmptr))
		return 1;

	/* Operand must be a naturally aligned, in-range physical address. */
	if (!page_address_valid(vcpu, vmptr))
		return nested_vmx_failValid(vcpu,
			VMXERR_VMCLEAR_INVALID_ADDRESS);

	/* VMCLEAR of the VMXON region is architecturally forbidden. */
	if (vmptr == vmx->nested.vmxon_ptr)
		return nested_vmx_failValid(vcpu,
			VMXERR_VMCLEAR_VMXON_POINTER);

	/*
	 * When Enlightened VMEntry is enabled on the calling CPU we treat
	 * memory area pointer by vmptr as Enlightened VMCS (as there's no good
	 * way to distinguish it from VMCS12) and we must not corrupt it by
	 * writing to the non-existent 'launch_state' field. The area doesn't
	 * have to be the currently active EVMCS on the calling CPU and there's
	 * nothing KVM has to do to transition it from 'active' to 'non-active'
	 * state. It is possible that the area will stay mapped as
	 * vmx->nested.hv_evmcs but this shouldn't be a problem.
	 */
	if (likely(!vmx->nested.enlightened_vmcs_enabled ||
		   !nested_enlightened_vmentry(vcpu, &evmcs_gpa))) {
		if (vmptr == vmx->nested.current_vmptr)
			nested_release_vmcs12(vcpu);

		/* Clearing launch_state emulates the "clear" VMCS state. */
		kvm_vcpu_write_guest(vcpu,
				     vmptr + offsetof(struct vmcs12,
						      launch_state),
				     &zero, sizeof(zero));
	}

	return nested_vmx_succeed(vcpu);
}
4736 
4737 static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch);
4738 
4739 /* Emulate the VMLAUNCH instruction */
static int handle_vmlaunch(struct kvm_vcpu *vcpu)
{
	/* launch=true: requires the VMCS to be in the "clear" state. */
	return nested_vmx_run(vcpu, true);
}
4744 
4745 /* Emulate the VMRESUME instruction */
4746 static int handle_vmresume(struct kvm_vcpu *vcpu)
4747 {
4748 
4749 	return nested_vmx_run(vcpu, false);
4750 }
4751 
/*
 * Emulate the VMREAD instruction.  When L2 executes VMREAD (with shadow
 * VMCS12 support), the read targets the shadow vmcs12; otherwise it targets
 * L1's current vmcs12.
 */
static int handle_vmread(struct kvm_vcpu *vcpu)
{
	struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu)
						    : get_vmcs12(vcpu);
	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
	u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct x86_exception e;
	unsigned long field;
	u64 value;
	gva_t gva = 0;
	short offset;
	int len;

	if (!nested_vmx_check_permission(vcpu))
		return 1;

	/*
	 * In VMX non-root operation, when the VMCS-link pointer is -1ull,
	 * any VMREAD sets the ALU flags for VMfailInvalid.
	 */
	if (vmx->nested.current_vmptr == -1ull ||
	    (is_guest_mode(vcpu) &&
	     get_vmcs12(vcpu)->vmcs_link_pointer == -1ull))
		return nested_vmx_failInvalid(vcpu);

	/* Decode instruction info and find the field to read */
	field = kvm_register_readl(vcpu, (((instr_info) >> 28) & 0xf));

	/* Negative offset means the field encoding is unsupported. */
	offset = vmcs_field_to_offset(field);
	if (offset < 0)
		return nested_vmx_failValid(vcpu,
			VMXERR_UNSUPPORTED_VMCS_COMPONENT);

	/* Rarely-synced fields may be stale in vmcs12; refresh on demand. */
	if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field))
		copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);

	/* Read the field, zero-extended to a u64 value */
	value = vmcs12_read_any(vmcs12, field, offset);

	/*
	 * Now copy part of this value to register or memory, as requested.
	 * Note that the number of bits actually copied is 32 or 64 depending
	 * on the guest's mode (32 or 64 bit), not on the given field's length.
	 */
	if (instr_info & BIT(10)) {
		/* Bit 10 set: destination is a register. */
		kvm_register_writel(vcpu, (((instr_info) >> 3) & 0xf), value);
	} else {
		len = is_64_bit_mode(vcpu) ? 8 : 4;
		if (get_vmx_mem_address(vcpu, exit_qualification,
					instr_info, true, len, &gva))
			return 1;
		/* _system ok, nested_vmx_check_permission has verified cpl=0 */
		if (kvm_write_guest_virt_system(vcpu, gva, &value, len, &e))
			kvm_inject_page_fault(vcpu, &e);
	}

	return nested_vmx_succeed(vcpu);
}
4811 
/*
 * Return true if @field is shadowed read/write, i.e. L1 can both read and
 * write it via the shadow VMCS without a VM-exit.  The case labels are
 * generated from the SHADOW_FIELD_RW entries in vmcs_shadow_fields.h.
 */
static bool is_shadow_field_rw(unsigned long field)
{
	switch (field) {
#define SHADOW_FIELD_RW(x, y) case x:
#include "vmcs_shadow_fields.h"
		return true;
	default:
		break;
	}
	return false;
}
4823 
/*
 * Return true if @field is shadowed read-only, i.e. L1 can read it via the
 * shadow VMCS without a VM-exit but writes still exit.  The case labels are
 * generated from the SHADOW_FIELD_RO entries in vmcs_shadow_fields.h.
 */
static bool is_shadow_field_ro(unsigned long field)
{
	switch (field) {
#define SHADOW_FIELD_RO(x, y) case x:
#include "vmcs_shadow_fields.h"
		return true;
	default:
		break;
	}
	return false;
}
4835 
/* Emulate the VMWRITE instruction. */
static int handle_vmwrite(struct kvm_vcpu *vcpu)
{
	struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu)
						    : get_vmcs12(vcpu);
	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
	u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct x86_exception e;
	unsigned long field;
	short offset;
	gva_t gva;
	int len;

	/*
	 * The value to write might be 32 or 64 bits, depending on L1's long
	 * mode, and eventually we need to write that into a field of several
	 * possible lengths. The code below first zero-extends the value to 64
	 * bit (value), and then copies only the appropriate number of
	 * bits into the vmcs12 field.
	 */
	u64 value = 0;

	if (!nested_vmx_check_permission(vcpu))
		return 1;

	/*
	 * In VMX non-root operation, when the VMCS-link pointer is -1ull,
	 * any VMWRITE sets the ALU flags for VMfailInvalid.
	 */
	if (vmx->nested.current_vmptr == -1ull ||
	    (is_guest_mode(vcpu) &&
	     get_vmcs12(vcpu)->vmcs_link_pointer == -1ull))
		return nested_vmx_failInvalid(vcpu);

	/* Bit 10 set: source operand is a register, else it's in memory. */
	if (instr_info & BIT(10))
		value = kvm_register_readl(vcpu, (((instr_info) >> 3) & 0xf));
	else {
		len = is_64_bit_mode(vcpu) ? 8 : 4;
		if (get_vmx_mem_address(vcpu, exit_qualification,
					instr_info, false, len, &gva))
			return 1;
		if (kvm_read_guest_virt(vcpu, gva, &value, len, &e)) {
			kvm_inject_page_fault(vcpu, &e);
			return 1;
		}
	}

	/* The field encoding is supplied in the register named by bits 31:28. */
	field = kvm_register_readl(vcpu, (((instr_info) >> 28) & 0xf));

	offset = vmcs_field_to_offset(field);
	if (offset < 0)
		return nested_vmx_failValid(vcpu,
			VMXERR_UNSUPPORTED_VMCS_COMPONENT);

	/*
	 * If the vCPU supports "VMWRITE to any supported field in the
	 * VMCS," then the "read-only" fields are actually read/write.
	 */
	if (vmcs_field_readonly(field) &&
	    !nested_cpu_has_vmwrite_any_field(vcpu))
		return nested_vmx_failValid(vcpu,
			VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);

	/*
	 * Ensure vmcs12 is up-to-date before any VMWRITE that dirties
	 * vmcs12, else we may crush a field or consume a stale value.
	 */
	if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field))
		copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);

	/*
	 * Some Intel CPUs intentionally drop the reserved bits of the AR byte
	 * fields on VMWRITE.  Emulate this behavior to ensure consistent KVM
	 * behavior regardless of the underlying hardware, e.g. if an AR_BYTE
	 * field is intercepted for VMWRITE but not VMREAD (in L1), then VMREAD
	 * from L1 will return a different value than VMREAD from L2 (L1 sees
	 * the stripped down value, L2 sees the full value as stored by KVM).
	 */
	if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES)
		value &= 0x1f0ff;

	vmcs12_write_any(vmcs12, field, offset, value);

	/*
	 * Do not track vmcs12 dirty-state if in guest-mode as we actually
	 * dirty shadow vmcs12 instead of vmcs12.  Fields that can be updated
	 * by L1 without a vmexit are always updated in the vmcs02, i.e. don't
	 * "dirty" vmcs12, all others go down the prepare_vmcs02() slow path.
	 */
	if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) {
		/*
		 * L1 can read these fields without exiting, ensure the
		 * shadow VMCS is up-to-date.
		 */
		if (enable_shadow_vmcs && is_shadow_field_ro(field)) {
			/*
			 * Temporarily switch the working VMCS to the shadow
			 * VMCS; preemption must be off while the wrong VMCS
			 * is loaded.
			 */
			preempt_disable();
			vmcs_load(vmx->vmcs01.shadow_vmcs);

			__vmcs_writel(field, value);

			vmcs_clear(vmx->vmcs01.shadow_vmcs);
			vmcs_load(vmx->loaded_vmcs->vmcs);
			preempt_enable();
		}
		vmx->nested.dirty_vmcs12 = true;
	}

	return nested_vmx_succeed(vcpu);
}
4945 
/*
 * Make @vmptr the current VMCS pointer and, when shadow VMCS is enabled,
 * arm shadowing so L1 VMREAD/VMWRITE of shadowed fields avoid VM-exits.
 */
static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr)
{
	vmx->nested.current_vmptr = vmptr;
	if (enable_shadow_vmcs) {
		secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
		vmcs_write64(VMCS_LINK_POINTER,
			     __pa(vmx->vmcs01.shadow_vmcs));
		/* Shadow VMCS must be repopulated from the new vmcs12. */
		vmx->nested.need_vmcs12_to_shadow_sync = true;
	}
	/* Force prepare_vmcs02() to rebuild vmcs02 from the new vmcs12. */
	vmx->nested.dirty_vmcs12 = true;
}
4957 
4958 /* Emulate the VMPTRLD instruction */
static int handle_vmptrld(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	gpa_t vmptr;

	if (!nested_vmx_check_permission(vcpu))
		return 1;

	if (nested_vmx_get_vmptr(vcpu, &vmptr))
		return 1;

	if (!page_address_valid(vcpu, vmptr))
		return nested_vmx_failValid(vcpu,
			VMXERR_VMPTRLD_INVALID_ADDRESS);

	/* VMPTRLD of the VMXON region is architecturally forbidden. */
	if (vmptr == vmx->nested.vmxon_ptr)
		return nested_vmx_failValid(vcpu,
			VMXERR_VMPTRLD_VMXON_POINTER);

	/* Forbid normal VMPTRLD if Enlightened version was used */
	if (vmx->nested.hv_evmcs)
		return 1;

	if (vmx->nested.current_vmptr != vmptr) {
		struct kvm_host_map map;
		struct vmcs12 *new_vmcs12;

		if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmptr), &map)) {
			/*
			 * Reads from an unbacked page return all 1s,
			 * which means that the 32 bits located at the
			 * given physical address won't match the required
			 * VMCS12_REVISION identifier.
			 */
			return nested_vmx_failValid(vcpu,
				VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
		}

		new_vmcs12 = map.hva;

		/* Reject mismatched revision or an unsupported shadow VMCS. */
		if (new_vmcs12->hdr.revision_id != VMCS12_REVISION ||
		    (new_vmcs12->hdr.shadow_vmcs &&
		     !nested_cpu_has_vmx_shadow_vmcs(vcpu))) {
			kvm_vcpu_unmap(vcpu, &map, false);
			return nested_vmx_failValid(vcpu,
				VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
		}

		/* Flush the previously current vmcs12 back to guest memory. */
		nested_release_vmcs12(vcpu);

		/*
		 * Load VMCS12 from guest memory since it is not already
		 * cached.
		 */
		memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE);
		kvm_vcpu_unmap(vcpu, &map, false);

		set_current_vmptr(vmx, vmptr);
	}

	return nested_vmx_succeed(vcpu);
}
5021 
5022 /* Emulate the VMPTRST instruction */
static int handle_vmptrst(struct kvm_vcpu *vcpu)
{
	unsigned long exit_qual = vmcs_readl(EXIT_QUALIFICATION);
	u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
	gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr;
	struct x86_exception e;
	gva_t gva;

	if (!nested_vmx_check_permission(vcpu))
		return 1;

	/* VMPTRST is not meaningful while an Enlightened VMCS is in use. */
	if (unlikely(to_vmx(vcpu)->nested.hv_evmcs))
		return 1;

	if (get_vmx_mem_address(vcpu, exit_qual, instr_info,
				true, sizeof(gpa_t), &gva))
		return 1;
	/* *_system ok, nested_vmx_check_permission has verified cpl=0 */
	if (kvm_write_guest_virt_system(vcpu, gva, (void *)&current_vmptr,
					sizeof(gpa_t), &e)) {
		kvm_inject_page_fault(vcpu, &e);
		return 1;
	}
	return nested_vmx_succeed(vcpu);
}
5048 
5049 /* Emulate the INVEPT instruction */
static int handle_invept(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u32 vmx_instruction_info, types;
	unsigned long type;
	gva_t gva;
	struct x86_exception e;
	struct {
		u64 eptp, gpa;
	} operand;

	/* #UD if INVEPT is not exposed to L1. */
	if (!(vmx->nested.msrs.secondary_ctls_high &
	      SECONDARY_EXEC_ENABLE_EPT) ||
	    !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) {
		kvm_queue_exception(vcpu, UD_VECTOR);
		return 1;
	}

	if (!nested_vmx_check_permission(vcpu))
		return 1;

	vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
	/* INVEPT type comes from the register named by bits 31:28. */
	type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);

	/* Bitmask of supported INVEPT types (single-context, global). */
	types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;

	if (type >= 32 || !(types & (1 << type)))
		return nested_vmx_failValid(vcpu,
				VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);

	/* According to the Intel VMX instruction reference, the memory
	 * operand is read even if it isn't needed (e.g., for type==global)
	 */
	if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
			vmx_instruction_info, false, sizeof(operand), &gva))
		return 1;
	if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
		kvm_inject_page_fault(vcpu, &e);
		return 1;
	}

	switch (type) {
	case VMX_EPT_EXTENT_GLOBAL:
	case VMX_EPT_EXTENT_CONTEXT:
	/*
	 * TODO: Sync the necessary shadow EPT roots here, rather than
	 * at the next emulated VM-entry.
	 */
		break;
	default:
		/* Unreachable: type was validated against 'types' above. */
		BUG_ON(1);
		break;
	}

	return nested_vmx_succeed(vcpu);
}
5106 
/* Emulate the INVVPID instruction on behalf of L1. */
static int handle_invvpid(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u32 vmx_instruction_info;
	unsigned long type, types;
	gva_t gva;
	struct x86_exception e;
	struct {
		u64 vpid;
		u64 gla;
	} operand;
	u16 vpid02;

	/* #UD if INVVPID is not exposed to L1. */
	if (!(vmx->nested.msrs.secondary_ctls_high &
	      SECONDARY_EXEC_ENABLE_VPID) ||
			!(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) {
		kvm_queue_exception(vcpu, UD_VECTOR);
		return 1;
	}

	if (!nested_vmx_check_permission(vcpu))
		return 1;

	vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
	/* INVVPID type comes from the register named by bits 31:28. */
	type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);

	/* Supported INVVPID types as advertised in the VPID capability MSR. */
	types = (vmx->nested.msrs.vpid_caps &
			VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8;

	if (type >= 32 || !(types & (1 << type)))
		return nested_vmx_failValid(vcpu,
			VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);

	/* according to the intel vmx instruction reference, the memory
	 * operand is read even if it isn't needed (e.g., for type==global)
	 */
	if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
			vmx_instruction_info, false, sizeof(operand), &gva))
		return 1;
	if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
		kvm_inject_page_fault(vcpu, &e);
		return 1;
	}
	/* VPIDs are 16 bits; reserved high bits must be zero. */
	if (operand.vpid >> 16)
		return nested_vmx_failValid(vcpu,
			VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);

	/* L1's flushes are applied to the vpid mapped for L2 (vpid02). */
	vpid02 = nested_get_vpid02(vcpu);
	switch (type) {
	case VMX_VPID_EXTENT_INDIVIDUAL_ADDR:
		if (!operand.vpid ||
		    is_noncanonical_address(operand.gla, vcpu))
			return nested_vmx_failValid(vcpu,
				VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
		if (cpu_has_vmx_invvpid_individual_addr()) {
			__invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR,
				vpid02, operand.gla);
		} else
			/* Fall back to a full flush of vpid02's mappings. */
			__vmx_flush_tlb(vcpu, vpid02, false);
		break;
	case VMX_VPID_EXTENT_SINGLE_CONTEXT:
	case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL:
		if (!operand.vpid)
			return nested_vmx_failValid(vcpu,
				VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
		__vmx_flush_tlb(vcpu, vpid02, false);
		break;
	case VMX_VPID_EXTENT_ALL_CONTEXT:
		__vmx_flush_tlb(vcpu, vpid02, false);
		break;
	default:
		/* Unreachable: type was validated against 'types' above. */
		WARN_ON_ONCE(1);
		return kvm_skip_emulated_instruction(vcpu);
	}

	return nested_vmx_succeed(vcpu);
}
5184 
/*
 * Emulate VMFUNC leaf 0 (EPTP switching) for L2: load the EPTP at RCX's
 * index from vmcs12's EPTP list and switch to it.  Returns 0 on success,
 * non-zero to make the caller reflect a VM-exit to L1.
 */
static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
				     struct vmcs12 *vmcs12)
{
	u32 index = kvm_rcx_read(vcpu);
	u64 address;
	bool accessed_dirty;
	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;

	/* L1 must have enabled both EPT and the EPTP-switching VM function. */
	if (!nested_cpu_has_eptp_switching(vmcs12) ||
	    !nested_cpu_has_ept(vmcs12))
		return 1;

	if (index >= VMFUNC_EPTP_ENTRIES)
		return 1;


	/* Fetch the selected EPTP (8 bytes each) from the guest's list page. */
	if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT,
				     &address, index * 8, 8))
		return 1;

	accessed_dirty = !!(address & VMX_EPTP_AD_ENABLE_BIT);

	/*
	 * If the (L2) guest does a vmfunc to the currently
	 * active ept pointer, we don't have to do anything else
	 */
	if (vmcs12->ept_pointer != address) {
		if (!valid_ept_address(vcpu, address))
			return 1;

		kvm_mmu_unload(vcpu);
		mmu->ept_ad = accessed_dirty;
		mmu->mmu_role.base.ad_disabled = !accessed_dirty;
		vmcs12->ept_pointer = address;
		/*
		 * TODO: Check what's the correct approach in case
		 * mmu reload fails. Currently, we just let the next
		 * reload potentially fail
		 */
		kvm_mmu_reload(vcpu);
	}

	return 0;
}
5229 
/* Emulate a VMFUNC executed by L2; failure reflects a VM-exit to L1. */
static int handle_vmfunc(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmcs12 *vmcs12;
	u32 function = kvm_rax_read(vcpu);

	/*
	 * VMFUNC is only supported for nested guests, but we always enable the
	 * secondary control for simplicity; for non-nested mode, fake that we
	 * didn't by injecting #UD.
	 */
	if (!is_guest_mode(vcpu)) {
		kvm_queue_exception(vcpu, UD_VECTOR);
		return 1;
	}

	vmcs12 = get_vmcs12(vcpu);
	/* The requested function must be enabled in VM-function controls. */
	if ((vmcs12->vm_function_control & (1 << function)) == 0)
		goto fail;

	switch (function) {
	case 0:
		/* Function 0 is EPTP switching. */
		if (nested_vmx_eptp_switching(vcpu, vmcs12))
			goto fail;
		break;
	default:
		goto fail;
	}
	return kvm_skip_emulated_instruction(vcpu);

fail:
	/* Reflect the VMFUNC exit to L1 with the original exit info. */
	nested_vmx_vmexit(vcpu, vmx->exit_reason,
			  vmcs_read32(VM_EXIT_INTR_INFO),
			  vmcs_readl(EXIT_QUALIFICATION));
	return 1;
}
5266 
5267 
/*
 * Return true if an I/O instruction executed by L2 should cause a VM-exit
 * to L1, per vmcs12's I/O bitmaps (or the unconditional-exiting control).
 * Checks every byte of the accessed port range.
 */
static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
				       struct vmcs12 *vmcs12)
{
	unsigned long exit_qualification;
	gpa_t bitmap, last_bitmap;
	unsigned int port;
	int size;
	u8 b;

	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
		return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);

	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);

	/* Exit qualification encodes port in bits 31:16, size-1 in bits 2:0. */
	port = exit_qualification >> 16;
	size = (exit_qualification & 7) + 1;

	last_bitmap = (gpa_t)-1;
	b = -1;

	/* Walk one port per iteration; a multi-byte access spans ports. */
	while (size > 0) {
		/* Bitmap A covers ports 0-0x7fff, bitmap B 0x8000-0xffff. */
		if (port < 0x8000)
			bitmap = vmcs12->io_bitmap_a;
		else if (port < 0x10000)
			bitmap = vmcs12->io_bitmap_b;
		else
			return true;
		bitmap += (port & 0x7fff) / 8;

		/* Re-read only when the byte address changes; on read
		 * failure, conservatively exit to L1. */
		if (last_bitmap != bitmap)
			if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1))
				return true;
		if (b & (1 << (port & 7)))
			return true;

		port++;
		size--;
		last_bitmap = bitmap;
	}

	return false;
}
5310 
/*
 * Return true if we should exit from L2 to L1 to handle an MSR access,
 * rather than handle it ourselves in L0. I.e., check whether L1 expressed
 * disinterest in the current event (read or write a specific MSR) by using an
 * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps.
 */
5317 static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
5318 	struct vmcs12 *vmcs12, u32 exit_reason)
5319 {
5320 	u32 msr_index = kvm_rcx_read(vcpu);
5321 	gpa_t bitmap;
5322 
5323 	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
5324 		return true;
5325 
5326 	/*
5327 	 * The MSR_BITMAP page is divided into four 1024-byte bitmaps,
5328 	 * for the four combinations of read/write and low/high MSR numbers.
5329 	 * First we need to figure out which of the four to use:
5330 	 */
5331 	bitmap = vmcs12->msr_bitmap;
5332 	if (exit_reason == EXIT_REASON_MSR_WRITE)
5333 		bitmap += 2048;
5334 	if (msr_index >= 0xc0000000) {
5335 		msr_index -= 0xc0000000;
5336 		bitmap += 1024;
5337 	}
5338 
5339 	/* Then read the msr_index'th bit from this bitmap: */
5340 	if (msr_index < 1024*8) {
5341 		unsigned char b;
5342 		if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1))
5343 			return true;
5344 		return 1 & (b >> (msr_index & 7));
5345 	} else
5346 		return true; /* let L1 handle the wrong parameter */
5347 }
5348 
/*
 * Return true if we should exit from L2 to L1 to handle a CR access exit,
 * rather than handle it ourselves in L0. I.e., check if L1 wanted to
 * intercept (via guest_host_mask, CR3 target values, etc.) the current event.
 */
static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
	struct vmcs12 *vmcs12)
{
	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
	int cr = exit_qualification & 15;
	int reg;
	unsigned long val;

	/* Bits 5:4 of the exit qualification encode the access type. */
	switch ((exit_qualification >> 4) & 3) {
	case 0: /* mov to cr */
		reg = (exit_qualification >> 8) & 15;
		val = kvm_register_readl(vcpu, reg);
		switch (cr) {
		case 0:
			/* Exit iff a bit L1 owns (guest/host mask) would change. */
			if (vmcs12->cr0_guest_host_mask &
			    (val ^ vmcs12->cr0_read_shadow))
				return true;
			break;
		case 3:
			/*
			 * Loading one of L1's configured CR3 target values
			 * never causes a VM-exit, even when CR3-load exiting
			 * is enabled.
			 */
			if ((vmcs12->cr3_target_count >= 1 &&
					vmcs12->cr3_target_value0 == val) ||
				(vmcs12->cr3_target_count >= 2 &&
					vmcs12->cr3_target_value1 == val) ||
				(vmcs12->cr3_target_count >= 3 &&
					vmcs12->cr3_target_value2 == val) ||
				(vmcs12->cr3_target_count >= 4 &&
					vmcs12->cr3_target_value3 == val))
				return false;
			if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING))
				return true;
			break;
		case 4:
			/* Same L1-owned-bit test as CR0, for CR4. */
			if (vmcs12->cr4_guest_host_mask &
			    (vmcs12->cr4_read_shadow ^ val))
				return true;
			break;
		case 8:
			if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING))
				return true;
			break;
		}
		break;
	case 2: /* clts */
		/* CLTS matters to L1 only if it owns CR0.TS and shadows it as set. */
		if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) &&
		    (vmcs12->cr0_read_shadow & X86_CR0_TS))
			return true;
		break;
	case 1: /* mov from cr */
		switch (cr) {
		case 3:
			if (vmcs12->cpu_based_vm_exec_control &
			    CPU_BASED_CR3_STORE_EXITING)
				return true;
			break;
		case 8:
			if (vmcs12->cpu_based_vm_exec_control &
			    CPU_BASED_CR8_STORE_EXITING)
				return true;
			break;
		}
		break;
	case 3: /* lmsw */
		/*
		 * lmsw can change bits 1..3 of cr0, and only set bit 0 of
		 * cr0. Other attempted changes are ignored, with no exit.
		 */
		val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
		/* Exit if an L1-owned bit among bits 1..3 would change. */
		if (vmcs12->cr0_guest_host_mask & 0xe &
		    (val ^ vmcs12->cr0_read_shadow))
			return true;
		/* Exit if LMSW would set L1-owned bit 0 (PE) from 0 to 1. */
		if ((vmcs12->cr0_guest_host_mask & 0x1) &&
		    !(vmcs12->cr0_read_shadow & 0x1) &&
		    (val & 0x1))
			return true;
		break;
	}
	return false;
}
5432 
5433 static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu,
5434 	struct vmcs12 *vmcs12, gpa_t bitmap)
5435 {
5436 	u32 vmx_instruction_info;
5437 	unsigned long field;
5438 	u8 b;
5439 
5440 	if (!nested_cpu_has_shadow_vmcs(vmcs12))
5441 		return true;
5442 
5443 	/* Decode instruction info and find the field to access */
5444 	vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5445 	field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
5446 
5447 	/* Out-of-range fields always cause a VM exit from L2 to L1 */
5448 	if (field >> 15)
5449 		return true;
5450 
5451 	if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1))
5452 		return true;
5453 
5454 	return 1 & (b >> (field & 7));
5455 }
5456 
/*
 * Return true if we should exit from L2 to L1 to handle an exit, or false if
 * we should handle it ourselves in L0 (and then continue L2). Only call this
 * when in is_guest_mode (L2).
 */
bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
{
	u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	/* L0 must finish the in-flight VM-entry before reflecting anything. */
	if (vmx->nested.nested_run_pending)
		return false;

	/* A hardware VM-instruction error is always surfaced to L1. */
	if (unlikely(vmx->fail)) {
		trace_kvm_nested_vmenter_failed(
			"hardware VM-instruction error: ",
			vmcs_read32(VM_INSTRUCTION_ERROR));
		return true;
	}

	/*
	 * The host physical addresses of some pages of guest memory
	 * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
	 * Page). The CPU may write to these pages via their host
	 * physical address while L2 is running, bypassing any
	 * address-translation-based dirty tracking (e.g. EPT write
	 * protection).
	 *
	 * Mark them dirty on every exit from L2 to prevent them from
	 * getting out of sync with dirty tracking.
	 */
	nested_mark_vmcs12_pages_dirty(vcpu);

	trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason,
				vmcs_readl(EXIT_QUALIFICATION),
				vmx->idt_vectoring_info,
				intr_info,
				vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
				KVM_ISA_VMX);

	switch (exit_reason) {
	case EXIT_REASON_EXCEPTION_NMI:
		if (is_nmi(intr_info))
			return false;
		else if (is_page_fault(intr_info))
			return !vmx->vcpu.arch.apf.host_apf_reason && enable_ept;
		else if (is_debug(intr_info) &&
			 vcpu->guest_debug &
			 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
			return false;
		else if (is_breakpoint(intr_info) &&
			 vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
			return false;
		/* Reflect iff L1 asked to intercept this exception vector. */
		return vmcs12->exception_bitmap &
				(1u << (intr_info & INTR_INFO_VECTOR_MASK));
	case EXIT_REASON_EXTERNAL_INTERRUPT:
		return false;
	case EXIT_REASON_TRIPLE_FAULT:
		return true;
	case EXIT_REASON_INTERRUPT_WINDOW:
		return nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING);
	case EXIT_REASON_NMI_WINDOW:
		return nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING);
	case EXIT_REASON_TASK_SWITCH:
		return true;
	case EXIT_REASON_CPUID:
		return true;
	case EXIT_REASON_HLT:
		return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING);
	case EXIT_REASON_INVD:
		return true;
	case EXIT_REASON_INVLPG:
		return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
	case EXIT_REASON_RDPMC:
		return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
	case EXIT_REASON_RDRAND:
		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING);
	case EXIT_REASON_RDSEED:
		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING);
	case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP:
		return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
	case EXIT_REASON_VMREAD:
		return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
			vmcs12->vmread_bitmap);
	case EXIT_REASON_VMWRITE:
		return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
			vmcs12->vmwrite_bitmap);
	case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
	case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD:
	case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME:
	case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
	case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID:
		/*
		 * VMX instructions trap unconditionally. This allows L1 to
		 * emulate them for its L2 guest, i.e., allows 3-level nesting!
		 */
		return true;
	case EXIT_REASON_CR_ACCESS:
		return nested_vmx_exit_handled_cr(vcpu, vmcs12);
	case EXIT_REASON_DR_ACCESS:
		return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING);
	case EXIT_REASON_IO_INSTRUCTION:
		return nested_vmx_exit_handled_io(vcpu, vmcs12);
	case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR:
		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC);
	case EXIT_REASON_MSR_READ:
	case EXIT_REASON_MSR_WRITE:
		return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
	case EXIT_REASON_INVALID_STATE:
		return true;
	case EXIT_REASON_MWAIT_INSTRUCTION:
		return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING);
	case EXIT_REASON_MONITOR_TRAP_FLAG:
		return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_TRAP_FLAG);
	case EXIT_REASON_MONITOR_INSTRUCTION:
		return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING);
	case EXIT_REASON_PAUSE_INSTRUCTION:
		return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) ||
			nested_cpu_has2(vmcs12,
				SECONDARY_EXEC_PAUSE_LOOP_EXITING);
	case EXIT_REASON_MCE_DURING_VMENTRY:
		return false;
	case EXIT_REASON_TPR_BELOW_THRESHOLD:
		return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW);
	case EXIT_REASON_APIC_ACCESS:
	case EXIT_REASON_APIC_WRITE:
	case EXIT_REASON_EOI_INDUCED:
		/*
		 * The controls for "virtualize APIC accesses," "APIC-
		 * register virtualization," and "virtual-interrupt
		 * delivery" only come from vmcs12.
		 */
		return true;
	case EXIT_REASON_EPT_VIOLATION:
		/*
		 * L0 always deals with the EPT violation. If nested EPT is
		 * used, and the nested mmu code discovers that the address is
		 * missing in the guest EPT table (EPT12), the EPT violation
		 * will be injected with nested_ept_inject_page_fault()
		 */
		return false;
	case EXIT_REASON_EPT_MISCONFIG:
		/*
		 * L2 never uses directly L1's EPT, but rather L0's own EPT
		 * table (shadow on EPT) or a merged EPT table that L0 built
		 * (EPT on EPT). So any problems with the structure of the
		 * table is L0's fault.
		 */
		return false;
	case EXIT_REASON_INVPCID:
		return
			nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) &&
			nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
	case EXIT_REASON_WBINVD:
		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
	case EXIT_REASON_XSETBV:
		return true;
	case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS:
		/*
		 * This should never happen, since it is not possible to
		 * set XSS to a non-zero value---neither in L1 nor in L2.
		 * If it were, XSS would have to be checked against
		 * the XSS exit bitmap in vmcs12.
		 */
		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
	case EXIT_REASON_PREEMPTION_TIMER:
		return false;
	case EXIT_REASON_PML_FULL:
		/* We emulate PML support to L1. */
		return false;
	case EXIT_REASON_VMFUNC:
		/* VM functions are emulated through L2->L0 vmexits. */
		return false;
	case EXIT_REASON_ENCLS:
		/* SGX is never exposed to L1 */
		return false;
	case EXIT_REASON_UMWAIT:
	case EXIT_REASON_TPAUSE:
		return nested_cpu_has2(vmcs12,
			SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE);
	default:
		return true;
	}
}
5642 
5643 
/*
 * Save the vCPU's nested VMX state to userspace (KVM_GET_NESTED_STATE).
 * Returns the total state size in bytes on success (which doubles as the
 * required-size probe when user_data_size is too small), or -EFAULT if a
 * copy to userspace fails.  A NULL vcpu only reports the maximum size.
 */
static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
				struct kvm_nested_state __user *user_kvm_nested_state,
				u32 user_data_size)
{
	struct vcpu_vmx *vmx;
	struct vmcs12 *vmcs12;
	struct kvm_nested_state kvm_state = {
		.flags = 0,
		.format = KVM_STATE_NESTED_FORMAT_VMX,
		.size = sizeof(kvm_state),
		.hdr.vmx.vmxon_pa = -1ull,
		.hdr.vmx.vmcs12_pa = -1ull,
	};
	struct kvm_vmx_nested_state_data __user *user_vmx_nested_state =
		&user_kvm_nested_state->data.vmx[0];

	/* Size probe: header plus the full VMX data payload. */
	if (!vcpu)
		return kvm_state.size + sizeof(*user_vmx_nested_state);

	vmx = to_vmx(vcpu);
	vmcs12 = get_vmcs12(vcpu);

	if (nested_vmx_allowed(vcpu) &&
	    (vmx->nested.vmxon || vmx->nested.smm.vmxon)) {
		kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr;
		kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr;

		/* Grow the reported size for each optional payload present. */
		if (vmx_has_valid_vmcs12(vcpu)) {
			kvm_state.size += sizeof(user_vmx_nested_state->vmcs12);

			if (vmx->nested.hv_evmcs)
				kvm_state.flags |= KVM_STATE_NESTED_EVMCS;

			if (is_guest_mode(vcpu) &&
			    nested_cpu_has_shadow_vmcs(vmcs12) &&
			    vmcs12->vmcs_link_pointer != -1ull)
				kvm_state.size += sizeof(user_vmx_nested_state->shadow_vmcs12);
		}

		if (vmx->nested.smm.vmxon)
			kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON;

		if (vmx->nested.smm.guest_mode)
			kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE;

		if (is_guest_mode(vcpu)) {
			kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE;

			if (vmx->nested.nested_run_pending)
				kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING;
		}
	}

	/* Buffer too small: report the required size without copying data. */
	if (user_data_size < kvm_state.size)
		goto out;

	if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state)))
		return -EFAULT;

	if (!vmx_has_valid_vmcs12(vcpu))
		goto out;

	/*
	 * When running L2, the authoritative vmcs12 state is in the
	 * vmcs02. When running L1, the authoritative vmcs12 state is
	 * in the shadow or enlightened vmcs linked to vmcs01, unless
	 * need_vmcs12_to_shadow_sync is set, in which case, the authoritative
	 * vmcs12 state is in the vmcs12 already.
	 */
	if (is_guest_mode(vcpu)) {
		sync_vmcs02_to_vmcs12(vcpu, vmcs12);
		sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
	} else if (!vmx->nested.need_vmcs12_to_shadow_sync) {
		if (vmx->nested.hv_evmcs)
			copy_enlightened_to_vmcs12(vmx);
		else if (enable_shadow_vmcs)
			copy_shadow_to_vmcs12(vmx);
	}

	BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE);
	BUILD_BUG_ON(sizeof(user_vmx_nested_state->shadow_vmcs12) < VMCS12_SIZE);

	/*
	 * Copy over the full allocated size of vmcs12 rather than just the size
	 * of the struct.
	 */
	if (copy_to_user(user_vmx_nested_state->vmcs12, vmcs12, VMCS12_SIZE))
		return -EFAULT;

	if (nested_cpu_has_shadow_vmcs(vmcs12) &&
	    vmcs12->vmcs_link_pointer != -1ull) {
		if (copy_to_user(user_vmx_nested_state->shadow_vmcs12,
				 get_shadow_vmcs12(vcpu), VMCS12_SIZE))
			return -EFAULT;
	}

out:
	return kvm_state.size;
}
5743 
5744 /*
5745  * Forcibly leave nested mode in order to be able to reset the VCPU later on.
5746  */
5747 void vmx_leave_nested(struct kvm_vcpu *vcpu)
5748 {
5749 	if (is_guest_mode(vcpu)) {
5750 		to_vmx(vcpu)->nested.nested_run_pending = 0;
5751 		nested_vmx_vmexit(vcpu, -1, 0, 0);
5752 	}
5753 	free_nested(vcpu);
5754 }
5755 
/*
 * Restore the vCPU's nested VMX state from userspace (KVM_SET_NESTED_STATE).
 * Validates the supplied header and flags, re-enters VMX operation, restores
 * vmcs12 (and a shadow vmcs12 if present), and re-enters non-root mode when
 * the saved state indicates the vCPU was running L2.  Returns 0 on success,
 * -EINVAL on malformed state, or -EFAULT on a failed copy from userspace.
 */
static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
				struct kvm_nested_state __user *user_kvm_nested_state,
				struct kvm_nested_state *kvm_state)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmcs12 *vmcs12;
	u32 exit_qual;
	struct kvm_vmx_nested_state_data __user *user_vmx_nested_state =
		&user_kvm_nested_state->data.vmx[0];
	int ret;

	if (kvm_state->format != KVM_STATE_NESTED_FORMAT_VMX)
		return -EINVAL;

	/* No VMXON region means no nested state; only eVMCS flag may be set. */
	if (kvm_state->hdr.vmx.vmxon_pa == -1ull) {
		if (kvm_state->hdr.vmx.smm.flags)
			return -EINVAL;

		if (kvm_state->hdr.vmx.vmcs12_pa != -1ull)
			return -EINVAL;

		/*
		 * KVM_STATE_NESTED_EVMCS used to signal that KVM should
		 * enable eVMCS capability on vCPU. However, since then
		 * code was changed such that flag signals vmcs12 should
		 * be copied into eVMCS in guest memory.
		 *
		 * To preserve backwards compatibility, allow user
		 * to set this flag even when there is no VMXON region.
		 */
		if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS)
			return -EINVAL;
	} else {
		if (!nested_vmx_allowed(vcpu))
			return -EINVAL;

		if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa))
			return -EINVAL;
	}

	/* Can't be in SMM-saved guest mode and non-SMM guest mode at once. */
	if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
	    (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
		return -EINVAL;

	if (kvm_state->hdr.vmx.smm.flags &
	    ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON))
		return -EINVAL;

	/*
	 * SMM temporarily disables VMX, so we cannot be in guest mode,
	 * nor can VMLAUNCH/VMRESUME be pending.  Outside SMM, SMM flags
	 * must be zero.
	 */
	if (is_smm(vcpu) ?
		(kvm_state->flags &
		 (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING))
		: kvm_state->hdr.vmx.smm.flags)
		return -EINVAL;

	/* SMM guest mode implies VMXON was active when SMM was entered. */
	if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
	    !(kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON))
		return -EINVAL;

	if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) &&
		(!nested_vmx_allowed(vcpu) || !vmx->nested.enlightened_vmcs_enabled))
			return -EINVAL;

	/* Drop any existing nested state before installing the new one. */
	vmx_leave_nested(vcpu);

	if (kvm_state->hdr.vmx.vmxon_pa == -1ull)
		return 0;

	vmx->nested.vmxon_ptr = kvm_state->hdr.vmx.vmxon_pa;
	ret = enter_vmx_operation(vcpu);
	if (ret)
		return ret;

	/* Empty 'VMXON' state is permitted */
	if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12))
		return 0;

	if (kvm_state->hdr.vmx.vmcs12_pa != -1ull) {
		/* vmcs12 must not alias the VMXON region and must be page aligned. */
		if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa ||
		    !page_address_valid(vcpu, kvm_state->hdr.vmx.vmcs12_pa))
			return -EINVAL;

		set_current_vmptr(vmx, kvm_state->hdr.vmx.vmcs12_pa);
	} else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) {
		/*
		 * Sync eVMCS upon entry as we may not have
		 * HV_X64_MSR_VP_ASSIST_PAGE set up yet.
		 */
		vmx->nested.need_vmcs12_to_shadow_sync = true;
	} else {
		return -EINVAL;
	}

	if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) {
		vmx->nested.smm.vmxon = true;
		vmx->nested.vmxon = false;

		if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE)
			vmx->nested.smm.guest_mode = true;
	}

	vmcs12 = get_vmcs12(vcpu);
	if (copy_from_user(vmcs12, user_vmx_nested_state->vmcs12, sizeof(*vmcs12)))
		return -EFAULT;

	if (vmcs12->hdr.revision_id != VMCS12_REVISION)
		return -EINVAL;

	if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
		return 0;

	vmx->nested.nested_run_pending =
		!!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);

	ret = -EINVAL;
	if (nested_cpu_has_shadow_vmcs(vmcs12) &&
	    vmcs12->vmcs_link_pointer != -1ull) {
		struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu);

		/* The payload must be large enough to hold the shadow vmcs12. */
		if (kvm_state->size <
		    sizeof(*kvm_state) +
		    sizeof(user_vmx_nested_state->vmcs12) + sizeof(*shadow_vmcs12))
			goto error_guest_mode;

		if (copy_from_user(shadow_vmcs12,
				   user_vmx_nested_state->shadow_vmcs12,
				   sizeof(*shadow_vmcs12))) {
			ret = -EFAULT;
			goto error_guest_mode;
		}

		if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION ||
		    !shadow_vmcs12->hdr.shadow_vmcs)
			goto error_guest_mode;
	}

	/* Re-run the VM-entry consistency checks on the restored vmcs12. */
	if (nested_vmx_check_controls(vcpu, vmcs12) ||
	    nested_vmx_check_host_state(vcpu, vmcs12) ||
	    nested_vmx_check_guest_state(vcpu, vmcs12, &exit_qual))
		goto error_guest_mode;

	vmx->nested.dirty_vmcs12 = true;
	ret = nested_vmx_enter_non_root_mode(vcpu, false);
	if (ret)
		goto error_guest_mode;

	return 0;

error_guest_mode:
	vmx->nested.nested_run_pending = 0;
	return ret;
}
5912 
5913 void nested_vmx_set_vmcs_shadowing_bitmap(void)
5914 {
5915 	if (enable_shadow_vmcs) {
5916 		vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
5917 		vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
5918 	}
5919 }
5920 
5921 /*
5922  * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
5923  * returned for the various VMX controls MSRs when nested VMX is enabled.
5924  * The same values should also be used to verify that vmcs12 control fields are
5925  * valid during nested entry from L1 to L2.
5926  * Each of these control msrs has a low and high 32-bit half: A low bit is on
5927  * if the corresponding bit in the (32-bit) control field *must* be on, and a
5928  * bit in the high half is on if the corresponding bit in the control field
5929  * may be on. See also vmx_control_verify().
5930  */
void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps,
				bool apicv)
{
	/*
	 * Note that as a general rule, the high half of the MSRs (bits in
	 * the control fields which may be 1) should be initialized by the
	 * intersection of the underlying hardware's MSR (i.e., features which
	 * can be supported) and the list of features we want to expose -
	 * because they are known to be properly supported in our code.
	 * Also, usually, the low half of the MSRs (bits which must be 1) can
	 * be set to 0, meaning that L1 may turn off any of these bits. The
	 * reason is that if one of these bits is necessary, it will appear
	 * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control
	 * fields of vmcs01 and vmcs02, will turn these bits off - and
	 * nested_vmx_exit_reflected() will not pass related exits to L1.
	 * These rules have exceptions below.
	 */

	/* pin-based controls */
	rdmsr(MSR_IA32_VMX_PINBASED_CTLS,
		msrs->pinbased_ctls_low,
		msrs->pinbased_ctls_high);
	msrs->pinbased_ctls_low |=
		PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
	msrs->pinbased_ctls_high &=
		PIN_BASED_EXT_INTR_MASK |
		PIN_BASED_NMI_EXITING |
		PIN_BASED_VIRTUAL_NMIS |
		(apicv ? PIN_BASED_POSTED_INTR : 0);
	/* The preemption timer is emulated, so advertise it unconditionally. */
	msrs->pinbased_ctls_high |=
		PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
		PIN_BASED_VMX_PREEMPTION_TIMER;

	/* exit controls */
	rdmsr(MSR_IA32_VMX_EXIT_CTLS,
		msrs->exit_ctls_low,
		msrs->exit_ctls_high);
	msrs->exit_ctls_low =
		VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;

	msrs->exit_ctls_high &=
#ifdef CONFIG_X86_64
		VM_EXIT_HOST_ADDR_SPACE_SIZE |
#endif
		VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
	msrs->exit_ctls_high |=
		VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
		VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
		VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;

	/* We support free control of debug control saving. */
	msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;

	/* entry controls */
	rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
		msrs->entry_ctls_low,
		msrs->entry_ctls_high);
	msrs->entry_ctls_low =
		VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
	msrs->entry_ctls_high &=
#ifdef CONFIG_X86_64
		VM_ENTRY_IA32E_MODE |
#endif
		VM_ENTRY_LOAD_IA32_PAT;
	msrs->entry_ctls_high |=
		(VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);

	/* We support free control of debug control loading. */
	msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;

	/* cpu-based controls */
	rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
		msrs->procbased_ctls_low,
		msrs->procbased_ctls_high);
	msrs->procbased_ctls_low =
		CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
	msrs->procbased_ctls_high &=
		CPU_BASED_INTR_WINDOW_EXITING |
		CPU_BASED_NMI_WINDOW_EXITING | CPU_BASED_USE_TSC_OFFSETTING |
		CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
		CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
		CPU_BASED_CR3_STORE_EXITING |
#ifdef CONFIG_X86_64
		CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |
#endif
		CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
		CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG |
		CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING |
		CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING |
		CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
	/*
	 * We can allow some features even when not supported by the
	 * hardware. For example, L1 can specify an MSR bitmap - and we
	 * can use it to avoid exits to L1 - even when L0 runs L2
	 * without MSR bitmaps.
	 */
	msrs->procbased_ctls_high |=
		CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
		CPU_BASED_USE_MSR_BITMAPS;

	/* We support free control of CR3 access interception. */
	msrs->procbased_ctls_low &=
		~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);

	/*
	 * secondary cpu-based controls.  Do not include those that
	 * depend on CPUID bits, they are added later by vmx_cpuid_update.
	 */
	if (msrs->procbased_ctls_high & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)
		rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
		      msrs->secondary_ctls_low,
		      msrs->secondary_ctls_high);

	msrs->secondary_ctls_low = 0;
	msrs->secondary_ctls_high &=
		SECONDARY_EXEC_DESC |
		SECONDARY_EXEC_RDTSCP |
		SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
		SECONDARY_EXEC_WBINVD_EXITING |
		SECONDARY_EXEC_APIC_REGISTER_VIRT |
		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
		SECONDARY_EXEC_RDRAND_EXITING |
		SECONDARY_EXEC_ENABLE_INVPCID |
		SECONDARY_EXEC_RDSEED_EXITING |
		SECONDARY_EXEC_XSAVES;

	/*
	 * We can emulate "VMCS shadowing," even if the hardware
	 * doesn't support it.
	 */
	msrs->secondary_ctls_high |=
		SECONDARY_EXEC_SHADOW_VMCS;

	if (enable_ept) {
		/* nested EPT: emulate EPT also to L1 */
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_ENABLE_EPT;
		msrs->ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
			 VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT;
		if (cpu_has_vmx_ept_execute_only())
			msrs->ept_caps |=
				VMX_EPT_EXECUTE_ONLY_BIT;
		msrs->ept_caps &= ept_caps;
		/* Extent and page-size bits are advertised past the hw mask. */
		msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
			VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT |
			VMX_EPT_1GB_PAGE_BIT;
		if (enable_ept_ad_bits) {
			msrs->secondary_ctls_high |=
				SECONDARY_EXEC_ENABLE_PML;
			msrs->ept_caps |= VMX_EPT_AD_BIT;
		}
	}

	if (cpu_has_vmx_vmfunc()) {
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_ENABLE_VMFUNC;
		/*
		 * Advertise EPTP switching unconditionally
		 * since we emulate it
		 */
		if (enable_ept)
			msrs->vmfunc_controls =
				VMX_VMFUNC_EPTP_SWITCHING;
	}

	/*
	 * Old versions of KVM use the single-context version without
	 * checking for support, so declare that it is supported even
	 * though it is treated as global context.  The alternative is
	 * not failing the single-context invvpid, and it is worse.
	 */
	if (enable_vpid) {
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_ENABLE_VPID;
		msrs->vpid_caps = VMX_VPID_INVVPID_BIT |
			VMX_VPID_EXTENT_SUPPORTED_MASK;
	}

	if (enable_unrestricted_guest)
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_UNRESTRICTED_GUEST;

	if (flexpriority_enabled)
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;

	/* miscellaneous data */
	rdmsr(MSR_IA32_VMX_MISC,
		msrs->misc_low,
		msrs->misc_high);
	msrs->misc_low &= VMX_MISC_SAVE_EFER_LMA;
	msrs->misc_low |=
		MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
		VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
		VMX_MISC_ACTIVITY_HLT;
	msrs->misc_high = 0;

	/*
	 * This MSR reports some information about VMX support. We
	 * should return information about the VMX we emulate for the
	 * guest, and the VMCS structure we give it - not about the
	 * VMX support of the underlying hardware.
	 */
	msrs->basic =
		VMCS12_REVISION |
		VMX_BASIC_TRUE_CTLS |
		((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
		(VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);

	if (cpu_has_vmx_basic_inout())
		msrs->basic |= VMX_BASIC_INOUT;

	/*
	 * These MSRs specify bits which the guest must keep fixed on
	 * while L1 is in VMXON mode (in L1's root mode, or running an L2).
	 * We picked the standard core2 setting.
	 */
#define VMXON_CR0_ALWAYSON     (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
#define VMXON_CR4_ALWAYSON     X86_CR4_VMXE
	msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON;
	msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON;

	/* These MSRs specify bits which the guest must keep fixed off. */
	rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1);
	rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1);

	/* highest index: VMX_PREEMPTION_TIMER_VALUE */
	msrs->vmcs_enum = VMCS12_MAX_FIELD_INDEX << 1;
}
6160 
6161 void nested_vmx_hardware_unsetup(void)
6162 {
6163 	int i;
6164 
6165 	if (enable_shadow_vmcs) {
6166 		for (i = 0; i < VMX_BITMAP_NR; i++)
6167 			free_page((unsigned long)vmx_bitmap[i]);
6168 	}
6169 }
6170 
/*
 * One-time initialization of nested VMX support: allocate the VMREAD/VMWRITE
 * shadowing bitmaps when shadow VMCS is usable, install the exit handlers for
 * the VMX instructions, and hook the nested-state callbacks into kvm_x86_ops.
 * Returns 0 on success or -ENOMEM if a bitmap page cannot be allocated.
 */
__init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
{
	int i;

	if (!cpu_has_vmx_shadow_vmcs())
		enable_shadow_vmcs = 0;
	if (enable_shadow_vmcs) {
		for (i = 0; i < VMX_BITMAP_NR; i++) {
			/*
			 * The vmx_bitmap is not tied to a VM and so should
			 * not be charged to a memcg.
			 */
			vmx_bitmap[i] = (unsigned long *)
				__get_free_page(GFP_KERNEL);
			if (!vmx_bitmap[i]) {
				/* Frees any bitmaps allocated so far. */
				nested_vmx_hardware_unsetup();
				return -ENOMEM;
			}
		}

		init_vmcs_shadow_fields();
	}

	exit_handlers[EXIT_REASON_VMCLEAR]	= handle_vmclear;
	exit_handlers[EXIT_REASON_VMLAUNCH]	= handle_vmlaunch;
	exit_handlers[EXIT_REASON_VMPTRLD]	= handle_vmptrld;
	exit_handlers[EXIT_REASON_VMPTRST]	= handle_vmptrst;
	exit_handlers[EXIT_REASON_VMREAD]	= handle_vmread;
	exit_handlers[EXIT_REASON_VMRESUME]	= handle_vmresume;
	exit_handlers[EXIT_REASON_VMWRITE]	= handle_vmwrite;
	exit_handlers[EXIT_REASON_VMOFF]	= handle_vmoff;
	exit_handlers[EXIT_REASON_VMON]		= handle_vmon;
	exit_handlers[EXIT_REASON_INVEPT]	= handle_invept;
	exit_handlers[EXIT_REASON_INVVPID]	= handle_invvpid;
	exit_handlers[EXIT_REASON_VMFUNC]	= handle_vmfunc;

	kvm_x86_ops->check_nested_events = vmx_check_nested_events;
	kvm_x86_ops->get_nested_state = vmx_get_nested_state;
	kvm_x86_ops->set_nested_state = vmx_set_nested_state;
	kvm_x86_ops->get_vmcs12_pages = nested_get_vmcs12_pages;
	kvm_x86_ops->nested_enable_evmcs = nested_enable_evmcs;
	kvm_x86_ops->nested_get_evmcs_version = nested_get_evmcs_version;

	return 0;
}
6216