xref: /linux/arch/x86/kvm/svm/nested.c (revision 4e6df939687caf878bb493570ff1c583bba86e7c)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Kernel-based Virtual Machine driver for Linux
4  *
5  * AMD SVM support
6  *
7  * Copyright (C) 2006 Qumranet, Inc.
8  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
9  *
10  * Authors:
11  *   Yaniv Kamay  <yaniv@qumranet.com>
12  *   Avi Kivity   <avi@qumranet.com>
13  */
14 
15 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
16 
17 #include <linux/kvm_types.h>
18 #include <linux/kvm_host.h>
19 #include <linux/kernel.h>
20 
21 #include <asm/msr-index.h>
22 #include <asm/debugreg.h>
23 
24 #include "kvm_emulate.h"
25 #include "trace.h"
26 #include "mmu.h"
27 #include "x86.h"
28 #include "smm.h"
29 #include "cpuid.h"
30 #include "lapic.h"
31 #include "svm.h"
32 #include "hyperv.h"
33 #include "pmu.h"
34 
/*
 * Short local alias for the nested VM-Enter consistency check wrapper; the
 * wrapper itself is defined outside this file.
 */
#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK
36 
37 static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
38 				       struct x86_exception *fault,
39 				       bool from_hardware)
40 {
41 	struct vcpu_svm *svm = to_svm(vcpu);
42 	struct vmcb *vmcb = svm->vmcb;
43 	u64 fault_stage;
44 
45 	/*
46 	 * For hardware NPF exits, the GUEST_FAULT_STAGE bits are only
47 	 * available in the hardware exit_info_1, since the guest_mmu
48 	 * walker doesn't know whether the faulting GPA was a page table
49 	 * page or final page from L2's perspective.
50 	 */
51 	if (from_hardware)
52 		fault_stage = vmcb->control.exit_info_1 &
53 			      PFERR_GUEST_FAULT_STAGE_MASK;
54 	else
55 		fault_stage = fault->error_code & PFERR_GUEST_FAULT_STAGE_MASK;
56 
57 	/*
58 	 * All nested page faults should be annotated as occurring on the
59 	 * final translation *or* the page walk. Arbitrarily choose "final"
60 	 * if KVM is buggy and enumerated both or neither.
61 	 */
62 	if (WARN_ON_ONCE(hweight64(fault_stage) != 1))
63 		fault_stage = PFERR_GUEST_FINAL_MASK;
64 
65 	vmcb->control.exit_code = SVM_EXIT_NPF;
66 	vmcb->control.exit_info_1 = fault_stage |
67 				    (fault->error_code & ~PFERR_GUEST_FAULT_STAGE_MASK);
68 	vmcb->control.exit_info_2 = fault->address;
69 
70 	nested_svm_vmexit(svm);
71 }
72 
73 static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index)
74 {
75 	struct vcpu_svm *svm = to_svm(vcpu);
76 	u64 cr3 = svm->nested.ctl.nested_cr3;
77 	u64 pdpte;
78 	int ret;
79 
80 	/*
81 	 * Note, nCR3 is "assumed" to be 32-byte aligned, i.e. the CPU ignores
82 	 * nCR3[4:0] when loading PDPTEs from memory.
83 	 */
84 	ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(cr3), &pdpte,
85 				       (cr3 & GENMASK(11, 5)) + index * 8, 8);
86 	if (ret)
87 		return 0;
88 	return pdpte;
89 }
90 
91 static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu)
92 {
93 	struct vcpu_svm *svm = to_svm(vcpu);
94 
95 	return svm->nested.ctl.nested_cr3;
96 }
97 
/*
 * Switch the vCPU over to the nested NPT configuration: make guest_mmu the
 * active MMU, initialize it as a shadow NPT MMU from vmcb01's CR4/EFER and
 * the cached vmcb12 nested CR3 / misc_ctl, install the nested SVM callbacks,
 * and use nested_mmu for walks.  Warns if the vCPU's MMU is already nested.
 */
static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	WARN_ON(mmu_is_nested(vcpu));

	/* Must be set first: the callbacks below are installed via arch.mmu. */
	vcpu->arch.mmu = &vcpu->arch.guest_mmu;

	/*
	 * The NPT format depends on L1's CR4 and EFER, which is in vmcb01.  Note,
	 * when called via KVM_SET_NESTED_STATE, that state may _not_ match current
	 * vCPU state.  CR0.WP is explicitly ignored, while CR0.PG is required.
	 */
	kvm_init_shadow_npt_mmu(vcpu, svm->vmcb01.ptr->save.cr4,
				svm->vmcb01.ptr->save.efer,
				svm->nested.ctl.nested_cr3,
				svm->nested.ctl.misc_ctl);
	vcpu->arch.mmu->get_guest_pgd     = nested_svm_get_tdp_cr3;
	vcpu->arch.mmu->get_pdptr         = nested_svm_get_tdp_pdptr;
	vcpu->arch.mmu->inject_page_fault = nested_svm_inject_npf_exit;
	vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
}
120 
121 static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
122 {
123 	vcpu->arch.mmu = &vcpu->arch.root_mmu;
124 	vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
125 }
126 
127 static bool nested_vmcb_needs_vls_intercept(struct vcpu_svm *svm)
128 {
129 	if (!guest_cpu_cap_has(&svm->vcpu, X86_FEATURE_V_VMSAVE_VMLOAD))
130 		return true;
131 
132 	if (!nested_npt_enabled(svm))
133 		return true;
134 
135 	if (!(svm->nested.ctl.misc_ctl2 & SVM_MISC2_ENABLE_V_VMLOAD_VMSAVE))
136 		return true;
137 
138 	return false;
139 }
140 
/*
 * Recompute vmcb02's intercept bitmap from vmcb01's intercepts and the cached
 * vmcb12 intercepts.  Does nothing (other than warn once) if vmcb02 is not
 * the currently active VMCB.
 *
 * The order matters: vmcb01's intercepts are copied first, selected KVM
 * intercepts are dropped, vmcb12's intercepts are OR-ed in, and finally a few
 * intercepts are forced off or on regardless of the merge result.
 */
void nested_vmcb02_recalc_intercepts(struct vcpu_svm *svm)
{
	struct vmcb_ctrl_area_cached *vmcb12_ctrl = &svm->nested.ctl;
	struct vmcb *vmcb02 = svm->nested.vmcb02.ptr;
	struct vmcb *vmcb01 = svm->vmcb01.ptr;
	unsigned int i;

	if (WARN_ON_ONCE(svm->vmcb != vmcb02))
		return;

	vmcb_mark_dirty(vmcb02, VMCB_INTERCEPTS);

	/* Start from KVM's (L0) own intercepts for L1. */
	for (i = 0; i < MAX_INTERCEPT; i++)
		vmcb02->control.intercepts[i] = vmcb01->control.intercepts[i];

	if (vmcb12_ctrl->int_ctl & V_INTR_MASKING_MASK) {
		/*
		 * If L2 is active and V_INTR_MASKING is enabled in vmcb12,
		 * disable intercept of CR8 writes as L2's CR8 does not affect
		 * any interrupt KVM may want to inject.
		 *
		 * Similarly, disable intercept of virtual interrupts (used to
		 * detect interrupt windows) if the saved RFLAGS.IF is '0', as
		 * the effective RFLAGS.IF for L1 interrupts will never be set
		 * while L2 is running (L2's RFLAGS.IF doesn't affect L1 IRQs).
		 */
		vmcb_clr_intercept(&vmcb02->control, INTERCEPT_CR8_WRITE);
		if (!(vmcb01->save.rflags & X86_EFLAGS_IF))
			vmcb_clr_intercept(&vmcb02->control, INTERCEPT_VINTR);
	}

	/* Add everything L1 asked to intercept. */
	for (i = 0; i < MAX_INTERCEPT; i++)
		vmcb02->control.intercepts[i] |= vmcb12_ctrl->intercepts[i];

	/* If SMI is not intercepted, ignore guest SMI intercept as well */
	if (!intercept_smi)
		vmcb_clr_intercept(&vmcb02->control, INTERCEPT_SMI);

	/*
	 * Intercept PAUSE if and only if L1 wants to.  KVM intercepts PAUSE so
	 * that a vCPU that may be spinning waiting for a lock can be scheduled
	 * out in favor of the vCPU that holds said lock.  KVM doesn't support
	 * yielding across L2 vCPUs, as KVM has limited visibility into which
	 * L2 vCPUs are in the same L2 VM, i.e. may be contending for locks.
	 */
	if (!vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_PAUSE))
		vmcb_clr_intercept(&vmcb02->control, INTERCEPT_PAUSE);

	if (nested_vmcb_needs_vls_intercept(svm)) {
		/*
		 * If the virtual VMLOAD/VMSAVE is not enabled for the L2,
		 * we must intercept these instructions to correctly
		 * emulate them in case L1 doesn't intercept them.
		 */
		vmcb_set_intercept(&vmcb02->control, INTERCEPT_VMLOAD);
		vmcb_set_intercept(&vmcb02->control, INTERCEPT_VMSAVE);
	} else {
		/* Sanity check: vmcb02 is expected to have the feature enabled. */
		WARN_ON_ONCE(!(vmcb02->control.misc_ctl2 & SVM_MISC2_ENABLE_V_VMLOAD_VMSAVE));
	}
}
201 
/*
 * This array (and its actual size) holds the set of offsets (indexing by chunk
 * size) to process when merging vmcb12's MSRPM with vmcb01's MSRPM.  Note, the
 * set of MSRs for which interception is disabled in vmcb01 is per-vCPU, e.g.
 * based on CPUID features.  This array only tracks MSRs that *might* be passed
 * through to the guest.
 *
 * Hardcode the capacity of the array based on the maximum number of _offsets_.
 * MSRs are batched together, so there are fewer offsets than MSRs.
 */
static int nested_svm_msrpm_merge_offsets[10] __ro_after_init;
/* Number of valid entries at the front of nested_svm_msrpm_merge_offsets. */
static int nested_svm_nr_msrpm_merge_offsets __ro_after_init;
/* Chunk type used to index and merge the MSR permission bitmaps. */
typedef unsigned long nsvm_msrpm_merge_t;
215 
216 int __init nested_svm_init_msrpm_merge_offsets(void)
217 {
218 	static const u32 merge_msrs[] __initconst = {
219 		MSR_STAR,
220 		MSR_IA32_SYSENTER_CS,
221 		MSR_IA32_SYSENTER_EIP,
222 		MSR_IA32_SYSENTER_ESP,
223 	#ifdef CONFIG_X86_64
224 		MSR_GS_BASE,
225 		MSR_FS_BASE,
226 		MSR_KERNEL_GS_BASE,
227 		MSR_LSTAR,
228 		MSR_CSTAR,
229 		MSR_SYSCALL_MASK,
230 	#endif
231 		MSR_IA32_SPEC_CTRL,
232 		MSR_IA32_PRED_CMD,
233 		MSR_IA32_FLUSH_CMD,
234 		MSR_IA32_APERF,
235 		MSR_IA32_MPERF,
236 		MSR_IA32_LASTBRANCHFROMIP,
237 		MSR_IA32_LASTBRANCHTOIP,
238 		MSR_IA32_LASTINTFROMIP,
239 		MSR_IA32_LASTINTTOIP,
240 
241 		MSR_K7_PERFCTR0,
242 		MSR_K7_PERFCTR1,
243 		MSR_K7_PERFCTR2,
244 		MSR_K7_PERFCTR3,
245 		MSR_F15H_PERF_CTR0,
246 		MSR_F15H_PERF_CTR1,
247 		MSR_F15H_PERF_CTR2,
248 		MSR_F15H_PERF_CTR3,
249 		MSR_F15H_PERF_CTR4,
250 		MSR_F15H_PERF_CTR5,
251 
252 		MSR_AMD64_PERF_CNTR_GLOBAL_CTL,
253 		MSR_AMD64_PERF_CNTR_GLOBAL_STATUS,
254 		MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR,
255 		MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET,
256 	};
257 	int i, j;
258 
259 	for (i = 0; i < ARRAY_SIZE(merge_msrs); i++) {
260 		int bit_nr = svm_msrpm_bit_nr(merge_msrs[i]);
261 		u32 offset;
262 
263 		if (WARN_ON(bit_nr < 0))
264 			return -EIO;
265 
266 		/*
267 		 * Merging is done in chunks to reduce the number of accesses
268 		 * to L1's bitmap.
269 		 */
270 		offset = bit_nr / BITS_PER_BYTE / sizeof(nsvm_msrpm_merge_t);
271 
272 		for (j = 0; j < nested_svm_nr_msrpm_merge_offsets; j++) {
273 			if (nested_svm_msrpm_merge_offsets[j] == offset)
274 				break;
275 		}
276 
277 		if (j < nested_svm_nr_msrpm_merge_offsets)
278 			continue;
279 
280 		if (WARN_ON(j >= ARRAY_SIZE(nested_svm_msrpm_merge_offsets)))
281 			return -EIO;
282 
283 		nested_svm_msrpm_merge_offsets[j] = offset;
284 		nested_svm_nr_msrpm_merge_offsets++;
285 	}
286 
287 	return 0;
288 }
289 
/*
 * Merge L0's (KVM) and L1's (Nested VMCB) MSR permission bitmaps. The function
 * is optimized in that it only merges the parts where KVM MSR permission bitmap
 * may contain zero bits.
 *
 * Returns true on success, including the cases where the merge is skipped, and
 * false if reading a chunk of L1's bitmap from guest memory fails.  Note that
 * when vmcb12 does not intercept MSR_PROT the function returns true without
 * touching the active VMCB's msrpm_base_pa.
 */
static bool nested_svm_merge_msrpm(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	nsvm_msrpm_merge_t *msrpm02 = svm->nested.msrpm;
	nsvm_msrpm_merge_t *msrpm01 = svm->msrpm;
	int i;

	/*
	 * MSR bitmap update can be skipped when:
	 * - MSR bitmap for L1 hasn't changed.
	 * - Nested hypervisor (L1) is attempting to launch the same L2 as
	 *   before.
	 * - Nested hypervisor (L1) is using Hyper-V emulation interface and
	 * tells KVM (L0) there were no changes in MSR bitmap for L2.
	 */
#ifdef CONFIG_KVM_HYPERV
	if (!svm->nested.force_msr_bitmap_recalc) {
		struct hv_vmcb_enlightenments *hve = &svm->nested.ctl.hv_enlightenments;

		if (kvm_hv_hypercall_enabled(vcpu) &&
		    hve->hv_enlightenments_control.msr_bitmap &&
		    (svm->nested.ctl.clean & BIT(HV_VMCB_NESTED_ENLIGHTENMENTS)))
			goto set_msrpm_base_pa;
	}
#endif

	if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)))
		return true;

	/* Only the pre-computed chunk offsets are read from L1 and merged. */
	for (i = 0; i < nested_svm_nr_msrpm_merge_offsets; i++) {
		const int p = nested_svm_msrpm_merge_offsets[i];
		nsvm_msrpm_merge_t l1_val;
		gpa_t gpa;

		gpa = svm->nested.ctl.msrpm_base_pa + (p * sizeof(l1_val));

		if (kvm_vcpu_read_guest(vcpu, gpa, &l1_val, sizeof(l1_val)))
			return false;

		/* A set bit in either bitmap ends up set in the merged bitmap. */
		msrpm02[p] = msrpm01[p] | l1_val;
	}

	svm->nested.force_msr_bitmap_recalc = false;

#ifdef CONFIG_KVM_HYPERV
set_msrpm_base_pa:
#endif
	/* Point the active VMCB at the merged (nested) bitmap. */
	svm->vmcb->control.msrpm_base_pa = __sme_set(__pa(svm->nested.msrpm));

	return true;
}
346 
347 /*
348  * Bits 11:0 of bitmap address are ignored by hardware
349  */
350 static bool nested_svm_check_bitmap_pa(struct kvm_vcpu *vcpu, u64 pa, u32 size)
351 {
352 	u64 addr = PAGE_ALIGN(pa);
353 
354 	return kvm_vcpu_is_legal_gpa(vcpu, addr) &&
355 	    kvm_vcpu_is_legal_gpa(vcpu, addr + size - 1);
356 }
357 
358 static bool nested_svm_event_inj_valid_exept(struct kvm_vcpu *vcpu, u8 vector)
359 {
360 	/*
361 	 * Vectors that do not correspond to a defined exception are invalid
362 	 * (including #NMI and reserved vectors). In a best effort to define
363 	 * valid exceptions based on the virtual CPU, make all exceptions always
364 	 * valid except those obviously tied to a CPU feature.
365 	 */
366 	switch (vector) {
367 	case DE_VECTOR: case DB_VECTOR: case BP_VECTOR: case OF_VECTOR:
368 	case BR_VECTOR: case UD_VECTOR: case NM_VECTOR: case DF_VECTOR:
369 	case TS_VECTOR: case NP_VECTOR: case SS_VECTOR: case GP_VECTOR:
370 	case PF_VECTOR: case MF_VECTOR: case AC_VECTOR: case MC_VECTOR:
371 	case XM_VECTOR: case HV_VECTOR: case SX_VECTOR:
372 		return true;
373 	case CP_VECTOR:
374 		return guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK);
375 	case VC_VECTOR:
376 		return guest_cpu_cap_has(vcpu, X86_FEATURE_SEV_ES);
377 	}
378 	return false;
379 }
380 
381 /*
382  * According to the APM, VMRUN exits with SVM_EXIT_ERR if SVM_EVTINJ_VALID is
383  * set and:
384  * - The type of event_inj is not one of the defined values.
385  * - The type is SVM_EVTINJ_TYPE_EXEPT, but the vector is not a valid exception.
386  */
387 static bool nested_svm_check_event_inj(struct kvm_vcpu *vcpu, u32 event_inj)
388 {
389 	u32 type = event_inj & SVM_EVTINJ_TYPE_MASK;
390 	u8 vector = event_inj & SVM_EVTINJ_VEC_MASK;
391 
392 	if (!(event_inj & SVM_EVTINJ_VALID))
393 		return true;
394 
395 	if (type != SVM_EVTINJ_TYPE_INTR && type != SVM_EVTINJ_TYPE_NMI &&
396 	    type != SVM_EVTINJ_TYPE_EXEPT && type != SVM_EVTINJ_TYPE_SOFT)
397 		return false;
398 
399 	if (type == SVM_EVTINJ_TYPE_EXEPT &&
400 	    !nested_svm_event_inj_valid_exept(vcpu, vector))
401 		return false;
402 
403 	return true;
404 }
405 
/*
 * Run the consistency checks on the cached vmcb12 control area.  Returns true
 * if all checks pass, false as soon as one fails.  Each check is wrapped in
 * CC(), whose definition lives outside this file.
 */
static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu,
				       struct vmcb_ctrl_area_cached *control)
{
	/* L1 must intercept VMRUN. */
	if (CC(!vmcb12_is_intercept(control, INTERCEPT_VMRUN)))
		return false;

	/* ASID 0 is rejected. */
	if (CC(control->asid == 0))
		return false;

	/* With nested paging enabled, nested CR3 must be a legal GPA. */
	if (CC((control->misc_ctl & SVM_MISC_ENABLE_NP) &&
	       !kvm_vcpu_is_legal_gpa(vcpu, control->nested_cr3)))
		return false;

	/* Both permission bitmaps must lie in legal guest physical memory. */
	if (CC(!nested_svm_check_bitmap_pa(vcpu, control->msrpm_base_pa,
					   MSRPM_SIZE)))
		return false;
	if (CC(!nested_svm_check_bitmap_pa(vcpu, control->iopm_base_pa,
					   IOPM_SIZE)))
		return false;

	/* Enabling virtual NMI requires intercepting NMIs. */
	if (CC((control->int_ctl & V_NMI_ENABLE_MASK) &&
	       !vmcb12_is_intercept(control, INTERCEPT_NMI))) {
		return false;
	}

	if (CC(!nested_svm_check_event_inj(vcpu, control->event_inj)))
		return false;

	return true;
}
436 
/*
 * Common checks that apply to both L1 and L2 state.
 *
 * Validates the cached save area @save and returns true if every check
 * passes, false on the first failure.  The guest PAT is only validated when
 * @check_gpat is true.
 */
static bool nested_vmcb_check_save(struct kvm_vcpu *vcpu,
				   struct vmcb_save_area_cached *save,
				   bool check_gpat)
{
	if (CC(!(save->efer & EFER_SVME)))
		return false;

	/* CR0.NW without CR0.CD is rejected, as are any CR0 bits above 31. */
	if (CC((save->cr0 & X86_CR0_CD) == 0 && (save->cr0 & X86_CR0_NW)) ||
	    CC(save->cr0 & ~0xffffffffULL))
		return false;

	if (CC(!kvm_dr6_valid(save->dr6)) || CC(!kvm_dr7_valid(save->dr7)))
		return false;

	/*
	 * These checks are also performed by KVM_SET_SREGS,
	 * except that EFER.LMA is not checked by SVM against
	 * CR0.PG && EFER.LME.
	 */
	if ((save->efer & EFER_LME) && (save->cr0 & X86_CR0_PG)) {
		if (CC(!(save->cr4 & X86_CR4_PAE)) ||
		    CC(!(save->cr0 & X86_CR0_PE)) ||
		    CC(!kvm_vcpu_is_legal_cr3(vcpu, save->cr3)))
			return false;

		/* CS.L and CS.D may not both be set. */
		if (CC((save->cs.attrib & SVM_SELECTOR_L_MASK) &&
		       (save->cs.attrib & SVM_SELECTOR_DB_MASK)))
			return false;
	}

	/* Note, SVM doesn't have any additional restrictions on CR4. */
	if (CC(!__kvm_is_valid_cr4(vcpu, save->cr4)))
		return false;

	if (CC(!kvm_valid_efer(vcpu, save->efer)))
		return false;

	/*
	 * If userspace contrives to get an invalid g_pat into vmcb02 by
	 * disabling KVM_X86_QUIRK_NESTED_SVM_SHARED_PAT in a race with
	 * this check, it should be prepared for the KVM_EXIT_FAIL_ENTRY
	 * that will follow.
	 */
	if (check_gpat && CC(!kvm_pat_valid(save->g_pat)))
		return false;

	return true;
}
486 
487 int nested_svm_check_cached_vmcb12(struct kvm_vcpu *vcpu)
488 {
489 	struct vcpu_svm *svm = to_svm(vcpu);
490 
491 	if (!nested_vmcb_check_save(vcpu, &svm->nested.save,
492 				    l2_has_separate_pat(vcpu)) ||
493 	    !nested_vmcb_check_controls(vcpu, &svm->nested.ctl))
494 		return -EINVAL;
495 
496 	return 0;
497 }
498 
/*
 * If a feature is not advertised to L1, clear the corresponding vmcb12
 * intercept.
 *
 * @fname is the X86_FEATURE_ suffix that is tested, @iname the INTERCEPT_
 * suffix that is cleared in @__control when the feature is absent.
 */
#define __nested_svm_sanitize_intercept(__vcpu, __control, fname, iname)	\
do {										\
	if (!guest_cpu_cap_has(__vcpu, X86_FEATURE_##fname))			\
		vmcb12_clr_intercept(__control, INTERCEPT_##iname);		\
} while (0)

/* Convenience form for when the feature and the intercept share a suffix. */
#define nested_svm_sanitize_intercept(__vcpu, __control, name)			\
	__nested_svm_sanitize_intercept(__vcpu, __control, name, name)
511 
/*
 * Copy the control area @from into the cached representation @to, sanitizing
 * along the way: intercepts and misc_ctl bits for features the guest CPU
 * doesn't have are cleared, and several fields are masked before being
 * stored.  The Hyper-V enlightenments are copied only when CONFIG_KVM_HYPERV
 * is set and Hyper-V hypercalls are enabled for @vcpu.
 */
static
void __nested_copy_vmcb_control_to_cache(struct kvm_vcpu *vcpu,
					 struct vmcb_ctrl_area_cached *to,
					 struct vmcb_control_area *from)
{
	unsigned int i;

	for (i = 0; i < MAX_INTERCEPT; i++)
		to->intercepts[i] = from->intercepts[i];

	/* Drop intercepts for instructions whose feature isn't exposed to L1. */
	__nested_svm_sanitize_intercept(vcpu, to, XSAVE, XSETBV);
	nested_svm_sanitize_intercept(vcpu, to, INVPCID);
	nested_svm_sanitize_intercept(vcpu, to, RDTSCP);
	nested_svm_sanitize_intercept(vcpu, to, SKINIT);
	nested_svm_sanitize_intercept(vcpu, to, RDPRU);

	/* Always clear misc_ctl bits that the guest cannot use */
	to->misc_ctl = from->misc_ctl;
	if (!guest_cpu_cap_has(vcpu, X86_FEATURE_NPT))
		to->misc_ctl &= ~SVM_MISC_ENABLE_NP;

	if (!gmet_enabled || !guest_cpu_cap_has(vcpu, X86_FEATURE_GMET))
		to->misc_ctl &= ~SVM_MISC_ENABLE_GMET;

	/* The bitmap addresses are stored page-aligned (low bits dropped). */
	to->iopm_base_pa        = from->iopm_base_pa & PAGE_MASK;
	to->msrpm_base_pa       = from->msrpm_base_pa & PAGE_MASK;
	to->tsc_offset          = from->tsc_offset;
	to->tlb_ctl             = from->tlb_ctl & TLB_CONTROL_MASK;
	to->erap_ctl            = from->erap_ctl;
	to->int_ctl             = from->int_ctl;
	to->int_vector          = from->int_vector & SVM_INT_VECTOR_MASK;
	to->int_state           = from->int_state & SVM_INTERRUPT_SHADOW_MASK;
	to->exit_code           = from->exit_code;
	to->exit_info_1         = from->exit_info_1;
	to->exit_info_2         = from->exit_info_2;
	to->exit_int_info       = from->exit_int_info;
	to->exit_int_info_err   = from->exit_int_info_err;
	to->event_inj           = from->event_inj & ~SVM_EVTINJ_RESERVED_BITS;
	to->event_inj_err       = from->event_inj_err;
	to->next_rip            = from->next_rip;
	to->nested_cr3          = from->nested_cr3;
	to->misc_ctl2		= from->misc_ctl2;
	to->pause_filter_count  = from->pause_filter_count;
	to->pause_filter_thresh = from->pause_filter_thresh;

	/* Copy asid here because nested_vmcb_check_controls() will check it */
	to->asid           = from->asid;
	to->clean = from->clean;

#ifdef CONFIG_KVM_HYPERV
	/* Hyper-V extensions (Enlightened VMCB) */
	if (kvm_hv_hypercall_enabled(vcpu)) {
		memcpy(&to->hv_enlightenments, &from->hv_enlightenments,
		       sizeof(to->hv_enlightenments));
	}
#endif
}
569 
570 void nested_copy_vmcb_control_to_cache(struct vcpu_svm *svm,
571 				       struct vmcb_control_area *control)
572 {
573 	__nested_copy_vmcb_control_to_cache(&svm->vcpu, &svm->nested.ctl, control);
574 }
575 
/*
 * Copy the subset of save-area fields that KVM caches from @from into @to:
 * segment and descriptor-table registers, CPL, EFER/CRx/DRx, RFLAGS/RIP/RSP,
 * the CET fields, RAX, CR2, the guest PAT and, via svm_copy_lbrs(), the LBR
 * state.  No sanitization is done here.
 */
static void __nested_copy_vmcb_save_to_cache(struct vmcb_save_area_cached *to,
					     struct vmcb_save_area *from)
{
	to->es = from->es;
	to->cs = from->cs;
	to->ss = from->ss;
	to->ds = from->ds;
	to->gdtr = from->gdtr;
	to->idtr = from->idtr;

	to->cpl = from->cpl;

	to->efer = from->efer;
	to->cr4 = from->cr4;
	to->cr3 = from->cr3;
	to->cr0 = from->cr0;
	to->dr7 = from->dr7;
	to->dr6 = from->dr6;

	to->rflags = from->rflags;
	to->rip = from->rip;
	to->rsp = from->rsp;

	to->s_cet = from->s_cet;
	to->ssp = from->ssp;
	to->isst_addr = from->isst_addr;

	to->rax = from->rax;
	to->cr2 = from->cr2;
	to->g_pat = from->g_pat;

	svm_copy_lbrs(to, from);
}
609 
610 void nested_copy_vmcb_save_to_cache(struct vcpu_svm *svm,
611 				    struct vmcb_save_area *save)
612 {
613 	__nested_copy_vmcb_save_to_cache(&svm->nested.save, save);
614 }
615 
616 /*
617  * Synchronize fields that are written by the processor, so that
618  * they can be copied back into the vmcb12.
619  */
620 void nested_sync_control_from_vmcb02(struct vcpu_svm *svm)
621 {
622 	u32 mask;
623 	svm->nested.ctl.event_inj      = svm->vmcb->control.event_inj;
624 	svm->nested.ctl.event_inj_err  = svm->vmcb->control.event_inj_err;
625 	svm->nested.ctl.int_state	= svm->vmcb->control.int_state;
626 
627 	/* Only a few fields of int_ctl are written by the processor.  */
628 	mask = V_IRQ_MASK | V_TPR_MASK;
629 	/*
630 	 * Don't sync vmcb02 V_IRQ back to vmcb12 if KVM (L0) is intercepting
631 	 * virtual interrupts in order to request an interrupt window, as KVM
632 	 * has usurped vmcb02's int_ctl.  If an interrupt window opens before
633 	 * the next VM-Exit, svm_clear_vintr() will restore vmcb12's int_ctl.
634 	 * If no window opens, V_IRQ will be correctly preserved in vmcb12's
635 	 * int_ctl (because it was never recognized while L2 was running).
636 	 */
637 	if (svm_is_intercept(svm, INTERCEPT_VINTR) &&
638 	    !vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_VINTR))
639 		mask &= ~V_IRQ_MASK;
640 
641 	if (nested_vgif_enabled(svm))
642 		mask |= V_GIF_MASK;
643 
644 	if (nested_vnmi_enabled(svm))
645 		mask |= V_NMI_BLOCKING_MASK | V_NMI_PENDING_MASK;
646 
647 	svm->nested.ctl.int_ctl        &= ~mask;
648 	svm->nested.ctl.int_ctl        |= svm->vmcb->control.int_ctl & mask;
649 }
650 
651 /*
652  * Transfer any event that L0 or L1 wanted to inject into L2 to
653  * EXIT_INT_INFO.
654  */
655 static void nested_save_pending_event_to_vmcb12(struct vcpu_svm *svm,
656 						struct vmcb *vmcb12)
657 {
658 	struct kvm_vcpu *vcpu = &svm->vcpu;
659 	u32 exit_int_info = 0;
660 	unsigned int nr;
661 
662 	if (vcpu->arch.exception.injected) {
663 		nr = vcpu->arch.exception.vector;
664 		exit_int_info = nr | SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_EXEPT;
665 
666 		if (vcpu->arch.exception.has_error_code) {
667 			exit_int_info |= SVM_EVTINJ_VALID_ERR;
668 			vmcb12->control.exit_int_info_err =
669 				vcpu->arch.exception.error_code;
670 		}
671 
672 	} else if (vcpu->arch.nmi_injected) {
673 		exit_int_info = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
674 
675 	} else if (vcpu->arch.interrupt.injected) {
676 		nr = vcpu->arch.interrupt.nr;
677 		exit_int_info = nr | SVM_EVTINJ_VALID;
678 
679 		if (vcpu->arch.interrupt.soft)
680 			exit_int_info |= SVM_EVTINJ_TYPE_SOFT;
681 		else
682 			exit_int_info |= SVM_EVTINJ_TYPE_INTR;
683 	}
684 
685 	vmcb12->control.exit_int_info = exit_int_info;
686 }
687 
/*
 * TLB/MMU maintenance for a nested transition: let the Hyper-V helper handle
 * its pending flush requests, then unconditionally request an MMU sync and a
 * flush of the current TLB context for @vcpu.
 */
static void nested_svm_transition_tlb_flush(struct kvm_vcpu *vcpu)
{
	/* Handle pending Hyper-V TLB flush requests */
	kvm_hv_nested_transtion_tlb_flush(vcpu, npt_enabled);

	/*
	 * TODO: optimize unconditional TLB flush/MMU sync.  A partial list of
	 * things to fix before this can be conditional:
	 *
	 *  - Flush TLBs for both L1 and L2 remote TLB flush
	 *  - Honor L1's request to flush an ASID on nested VMRUN
	 *  - Sync nested NPT MMU on VMRUN that flushes L2's ASID[*]
	 *  - Don't crush a pending TLB flush in vmcb02 on nested VMRUN
	 *  - Flush L1's ASID on KVM_REQ_TLB_FLUSH_GUEST
	 *
	 * [*] Unlike nested EPT, SVM's ASID management can invalidate nested
	 *     NPT guest-physical mappings on VMRUN.
	 */
	kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
	kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
}
709 
/*
 * Load guest's/host's cr3 on nested vmentry or vmexit. @nested_npt is true
 * if we are emulating VM-Entry into a guest with NPT enabled.
 *
 * When @reload_pdptrs is set and the vCPU uses PAE paging, the PDPTRs are
 * either marked for reload (@nested_npt) or loaded immediately from @cr3.
 *
 * Returns 0 on success, -EINVAL if @cr3 is not a legal CR3 value or if
 * loading the PDPTRs fails.
 */
static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
			       bool nested_npt, bool reload_pdptrs)
{
	if (CC(!kvm_vcpu_is_legal_cr3(vcpu, cr3)))
		return -EINVAL;

	if (reload_pdptrs && is_pae_paging(vcpu)) {
		if (nested_npt)
			kvm_register_mark_for_reload(vcpu, VCPU_REG_PDPTR);
		else if (CC(!load_pdptrs(vcpu, cr3)))
			return -EINVAL;
	}

	vcpu->arch.cr3 = cr3;

	/* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */
	kvm_init_mmu(vcpu);

	/* With nested NPT, the new CR3 is not installed as a new PGD here. */
	if (!nested_npt)
		kvm_mmu_new_pgd(vcpu, cr3);

	return 0;
}
737 
738 static bool nested_vmcb12_has_lbrv(struct kvm_vcpu *vcpu)
739 {
740 	return guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) &&
741 		(to_svm(vcpu)->nested.ctl.misc_ctl2 & SVM_MISC2_ENABLE_V_LBR);
742 }
743 
/*
 * Load L2's register state from the cached vmcb12 save area into vmcb02 and
 * into the vCPU's architectural state.
 *
 * Groups of fields guarded by vmcb12_is_dirty() are only re-copied when the
 * vmcb12 address differs from the previous one or the corresponding group is
 * reported dirty; the remaining state is loaded unconditionally.
 */
static void nested_vmcb02_prepare_save(struct vcpu_svm *svm)
{
	struct vmcb_ctrl_area_cached *control = &svm->nested.ctl;
	struct vmcb_save_area_cached *save = &svm->nested.save;
	bool new_vmcb12 = false;
	struct vmcb *vmcb01 = svm->vmcb01.ptr;
	struct vmcb *vmcb02 = svm->nested.vmcb02.ptr;
	struct kvm_vcpu *vcpu = &svm->vcpu;

	/* Load the nested guest state */
	if (svm->nested.vmcb12_gpa != svm->nested.last_vmcb12_gpa) {
		/* A different vmcb12 also forces the MSR bitmap to be re-merged. */
		new_vmcb12 = true;
		svm->nested.last_vmcb12_gpa = svm->nested.vmcb12_gpa;
		svm->nested.force_msr_bitmap_recalc = true;
	}

	if (unlikely(new_vmcb12 || vmcb12_is_dirty(control, VMCB_SEG))) {
		vmcb02->save.es = save->es;
		vmcb02->save.cs = save->cs;
		vmcb02->save.ss = save->ss;
		vmcb02->save.ds = save->ds;
		vmcb02->save.cpl = save->cpl;
		vmcb_mark_dirty(vmcb02, VMCB_SEG);
	}

	if (unlikely(new_vmcb12 || vmcb12_is_dirty(control, VMCB_DT))) {
		vmcb02->save.gdtr = save->gdtr;
		vmcb02->save.idtr = save->idtr;
		vmcb_mark_dirty(vmcb02, VMCB_DT);
	}

	/* CET state is only loaded if shadow stacks are exposed to the guest. */
	if (guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) &&
	    (unlikely(new_vmcb12 || vmcb12_is_dirty(control, VMCB_CET)))) {
		vmcb02->save.s_cet  = save->s_cet;
		vmcb02->save.isst_addr = save->isst_addr;
		vmcb02->save.ssp = save->ssp;
		vmcb_mark_dirty(vmcb02, VMCB_CET);
	}

	/* Guest PAT comes from vmcb12 if L2 has its own, else from the vCPU's PAT. */
	if (l2_has_separate_pat(vcpu)) {
		if (unlikely(new_vmcb12 || vmcb12_is_dirty(control, VMCB_NPT)))
			vmcb_set_gpat(vmcb02, svm->nested.save.g_pat);
	} else if (npt_enabled) {
		vmcb_set_gpat(vmcb02, vcpu->arch.pat);
	}

	kvm_set_rflags(vcpu, save->rflags | X86_EFLAGS_FIXED);

	svm_set_efer(vcpu, svm->nested.save.efer);

	svm_set_cr0(vcpu, svm->nested.save.cr0);
	svm_set_cr4(vcpu, svm->nested.save.cr4);

	svm->vcpu.arch.cr2 = save->cr2;

	kvm_rax_write_raw(vcpu, save->rax);
	kvm_rsp_write(vcpu, save->rsp);
	kvm_rip_write(vcpu, save->rip);

	/* In case we don't even reach vcpu_run, the fields are not updated */
	vmcb02->save.rax = save->rax;
	vmcb02->save.rsp = save->rsp;
	vmcb02->save.rip = save->rip;

	if (unlikely(new_vmcb12 || vmcb12_is_dirty(control, VMCB_DR))) {
		vmcb02->save.dr7 = svm->nested.save.dr7 | DR7_FIXED_1;
		svm->vcpu.arch.dr6  = svm->nested.save.dr6 | DR6_ACTIVE_LOW;
		vmcb_mark_dirty(vmcb02, VMCB_DR);
	}

	/* LBR state comes from vmcb12 if L1 enabled virtual LBR, else from vmcb01. */
	if (nested_vmcb12_has_lbrv(vcpu)) {
		/*
		 * Reserved bits of DEBUGCTL are ignored.  Be consistent with
		 * svm_set_msr's definition of reserved bits.
		 */
		svm_copy_lbrs(&vmcb02->save, save);
		vmcb02->save.dbgctl &= ~DEBUGCTL_RESERVED_BITS;
	} else {
		svm_copy_lbrs(&vmcb02->save, &vmcb01->save);
	}
	vmcb_mark_dirty(vmcb02, VMCB_LBR);
	svm_update_lbrv(&svm->vcpu);
}
827 
828 static inline bool is_evtinj_soft(u32 evtinj)
829 {
830 	u32 type = evtinj & SVM_EVTINJ_TYPE_MASK;
831 	u8 vector = evtinj & SVM_EVTINJ_VEC_MASK;
832 
833 	if (!(evtinj & SVM_EVTINJ_VALID))
834 		return false;
835 
836 	if (type == SVM_EVTINJ_TYPE_SOFT)
837 		return true;
838 
839 	return type == SVM_EVTINJ_TYPE_EXEPT && kvm_exception_is_soft(vector);
840 }
841 
842 static bool is_evtinj_nmi(u32 evtinj)
843 {
844 	u32 type = evtinj & SVM_EVTINJ_TYPE_MASK;
845 
846 	if (!(evtinj & SVM_EVTINJ_VALID))
847 		return false;
848 
849 	return type == SVM_EVTINJ_TYPE_NMI;
850 }
851 
852 static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
853 {
854 	u32 int_ctl_vmcb01_bits = V_INTR_MASKING_MASK;
855 	u32 int_ctl_vmcb12_bits = V_TPR_MASK | V_IRQ_INJECTION_BITS_MASK;
856 
857 	struct vmcb_ctrl_area_cached *vmcb12_ctrl = &svm->nested.ctl;
858 	struct vmcb *vmcb02 = svm->nested.vmcb02.ptr;
859 	struct vmcb *vmcb01 = svm->vmcb01.ptr;
860 	struct kvm_vcpu *vcpu = &svm->vcpu;
861 
862 	nested_svm_transition_tlb_flush(vcpu);
863 
864 	/* Enter Guest-Mode */
865 	enter_guest_mode(vcpu);
866 	svm_pmu_handle_nested_transition(svm);
867 
868 	/*
869 	 * Filled at exit: exit_code, exit_info_1, exit_info_2, exit_int_info,
870 	 * exit_int_info_err, next_rip, insn_len, insn_bytes.
871 	 */
872 
873 	if (guest_cpu_cap_has(vcpu, X86_FEATURE_VGIF) &&
874 	    (vmcb12_ctrl->int_ctl & V_GIF_ENABLE_MASK))
875 		int_ctl_vmcb12_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK);
876 	else
877 		int_ctl_vmcb01_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK);
878 
879 	if (vnmi) {
880 		if (vmcb01->control.int_ctl & V_NMI_PENDING_MASK) {
881 			svm->vcpu.arch.nmi_pending++;
882 			kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
883 		}
884 		if (nested_vnmi_enabled(svm))
885 			int_ctl_vmcb12_bits |= (V_NMI_PENDING_MASK |
886 						V_NMI_ENABLE_MASK |
887 						V_NMI_BLOCKING_MASK);
888 	}
889 
890 	/*
891 	 * Copied from vmcb01.  msrpm_base can be overwritten later.
892 	 *
893 	 * SVM_MISC_ENABLE_NP in vmcb12 is only used for consistency checks.  If
894 	 * L1 enables NPTs, KVM shadows L1's NPTs and uses those to run L2. If
895 	 * L1 disables NPT, KVM runs L2 with the same NPTs used to run L1. For
896 	 * the latter, L1 runs L2 with shadow page tables that translate L2 GVAs
897 	 * to L1 GPAs, so the same NPTs can be used for L1 and L2.
898 	 */
899 	vmcb02->control.misc_ctl = vmcb01->control.misc_ctl & (SVM_MISC_ENABLE_NP | SVM_MISC_ENABLE_GMET);
900 	vmcb02->control.iopm_base_pa = vmcb01->control.iopm_base_pa;
901 	vmcb02->control.msrpm_base_pa = vmcb01->control.msrpm_base_pa;
902 	vmcb_mark_dirty(vmcb02, VMCB_PERM_MAP);
903 
904 	/*
905 	 * Stash vmcb02's counter if the guest hasn't moved past the guilty
906 	 * instruction; otherwise, reset the counter to '0'.
907 	 *
908 	 * In order to detect if L2 has made forward progress or not, track the
909 	 * RIP at which a bus lock has occurred on a per-vmcb12 basis.  If RIP
910 	 * is changed, guest has clearly made forward progress, bus_lock_counter
911 	 * still remained '1', so reset bus_lock_counter to '0'. Eg. In the
912 	 * scenario, where a buslock happened in L1 before VMRUN, the bus lock
913 	 * firmly happened on an instruction in the past. Even if vmcb01's
914 	 * counter is still '1', (because the guilty instruction got patched),
915 	 * the vCPU has clearly made forward progress and so KVM should reset
916 	 * vmcb02's counter to '0'.
917 	 *
918 	 * If the RIP hasn't changed, stash the bus lock counter at nested VMRUN
919 	 * to prevent the same guilty instruction from triggering a VM-Exit. Eg.
920 	 * if userspace rate-limits the vCPU, then it's entirely possible that
921 	 * L1's tick interrupt is pending by the time userspace re-runs the
922 	 * vCPU.  If KVM unconditionally clears the counter on VMRUN, then when
923 	 * L1 re-enters L2, the same instruction will trigger a VM-Exit and the
924 	 * entire cycle start over.
925 	 */
926 	if (vmcb02->save.rip && (svm->nested.last_bus_lock_rip == vmcb02->save.rip))
927 		vmcb02->control.bus_lock_counter = 1;
928 	else
929 		vmcb02->control.bus_lock_counter = 0;
930 
931 	/* Done at vmrun: asid.  */
932 
933 	/* Also overwritten later if necessary.  */
934 	vmcb02->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
935 
936 	/* Use vmcb01 MMU and format if guest does not use nNPT */
937 	if (nested_npt_enabled(svm)) {
938 		vmcb02->control.misc_ctl &= ~SVM_MISC_ENABLE_GMET;
939 		vmcb02->control.misc_ctl |= (svm->nested.ctl.misc_ctl & SVM_MISC_ENABLE_GMET);
940 
941 		nested_svm_init_mmu_context(vcpu);
942 	}
943 
944 	vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(vcpu->arch.l1_tsc_offset,
945 							   vmcb12_ctrl->tsc_offset,
946 							   svm->tsc_ratio_msr);
947 
948 	vmcb02->control.tsc_offset = vcpu->arch.tsc_offset;
949 
950 	if (guest_cpu_cap_has(vcpu, X86_FEATURE_TSCRATEMSR) &&
951 	    svm->tsc_ratio_msr != kvm_caps.default_tsc_scaling_ratio)
952 		nested_svm_update_tsc_ratio_msr(vcpu);
953 
954 	vmcb02->control.int_ctl             =
955 		(vmcb12_ctrl->int_ctl & int_ctl_vmcb12_bits) |
956 		(vmcb01->control.int_ctl & int_ctl_vmcb01_bits);
957 
958 	vmcb02->control.int_vector          = vmcb12_ctrl->int_vector;
959 	vmcb02->control.int_state           = vmcb12_ctrl->int_state;
960 	vmcb02->control.event_inj           = vmcb12_ctrl->event_inj;
961 	vmcb02->control.event_inj_err       = vmcb12_ctrl->event_inj_err;
962 
963 	/*
964 	 * If nrips is exposed to L1, take NextRIP as-is.  Otherwise, L1
965 	 * advances L2's RIP before VMRUN instead of using NextRIP. KVM will
966 	 * stuff the current RIP as vmcb02's NextRIP before L2 is run.  After
967 	 * the first run of L2 (e.g. after save+restore), NextRIP is updated by
968 	 * the CPU and/or KVM and should be used regardless of L1's support.
969 	 */
970 	if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS) ||
971 	    !vcpu->arch.nested_run_pending)
972 		vmcb02->control.next_rip = vmcb12_ctrl->next_rip;
973 
974 	svm->nmi_l1_to_l2 = is_evtinj_nmi(vmcb02->control.event_inj);
975 
976 	/*
977 	 * soft_int_csbase, soft_int_old_rip, and soft_int_next_rip (if L1
978 	 * doesn't have NRIPS) are initialized later, before the vCPU is run.
979 	 */
980 	if (is_evtinj_soft(vmcb02->control.event_inj)) {
981 		svm->soft_int_injected = true;
982 		if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS) ||
983 		    !vcpu->arch.nested_run_pending)
984 			svm->soft_int_next_rip = vmcb12_ctrl->next_rip;
985 	}
986 
987 	/* SVM_MISC2_ENABLE_V_LBR is controlled by svm_update_lbrv() */
988 
989 	if (!nested_vmcb_needs_vls_intercept(svm))
990 		vmcb02->control.misc_ctl2 |= SVM_MISC2_ENABLE_V_VMLOAD_VMSAVE;
991 
992 	if (guest_cpu_cap_has(vcpu, X86_FEATURE_PAUSEFILTER))
993 		vmcb02->control.pause_filter_count = vmcb12_ctrl->pause_filter_count;
994 	else
995 		vmcb02->control.pause_filter_count = 0;
996 	if (guest_cpu_cap_has(vcpu, X86_FEATURE_PFTHRESHOLD))
997 		vmcb02->control.pause_filter_thresh = vmcb12_ctrl->pause_filter_thresh;
998 	else
999 		vmcb02->control.pause_filter_thresh = 0;
1000 
1001 	/*
1002 	 * Take ALLOW_LARGER_RAP from vmcb12 even though it should be safe to
1003 	 * let L2 use a larger RAP since KVM will emulate the necessary clears,
1004 	 * as it's possible L1 deliberately wants to restrict L2 to the legacy
1005 	 * RAP size.  Unconditionally clear the RAP on nested VMRUN, as KVM is
1006 	 * responsible for emulating the host vs. guest tags (L1 is the "host",
1007 	 * L2 is the "guest").
1008 	 */
1009 	if (guest_cpu_cap_has(vcpu, X86_FEATURE_ERAPS))
1010 		vmcb02->control.erap_ctl = (vmcb12_ctrl->erap_ctl &
1011 					    ERAP_CONTROL_ALLOW_LARGER_RAP) |
1012 					   ERAP_CONTROL_CLEAR_RAP;
1013 
1014 	/*
1015 	 * Merge guest and host intercepts - must be called with vcpu in
1016 	 * guest-mode to take effect.
1017 	 */
1018 	nested_vmcb02_recalc_intercepts(svm);
1019 }
1020 
1021 static void nested_svm_copy_common_state(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
1022 {
1023 	/*
1024 	 * Some VMCB state is shared between L1 and L2 and thus has to be
1025 	 * moved at the time of nested vmrun and vmexit.
1026 	 *
1027 	 * VMLOAD/VMSAVE state would also belong in this category, but KVM
1028 	 * always performs VMLOAD and VMSAVE from the VMCB01.
1029 	 */
1030 	to_vmcb->save.spec_ctrl = from_vmcb->save.spec_ctrl;
1031 }
1032 
1033 int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa, bool from_vmrun)
1034 {
1035 	struct vcpu_svm *svm = to_svm(vcpu);
1036 	struct vmcb_ctrl_area_cached *control = &svm->nested.ctl;
1037 	struct vmcb_save_area_cached *save = &svm->nested.save;
1038 	int ret;
1039 
1040 	trace_kvm_nested_vmenter(svm->vmcb->save.rip,
1041 				 vmcb12_gpa,
1042 				 save->rip,
1043 				 control->int_ctl,
1044 				 control->event_inj,
1045 				 control->misc_ctl,
1046 				 control->nested_cr3,
1047 				 save->cr3,
1048 				 KVM_ISA_SVM);
1049 
1050 	trace_kvm_nested_intercepts(control->intercepts[INTERCEPT_CR] & 0xffff,
1051 				    control->intercepts[INTERCEPT_CR] >> 16,
1052 				    control->intercepts[INTERCEPT_EXCEPTION],
1053 				    control->intercepts[INTERCEPT_WORD3],
1054 				    control->intercepts[INTERCEPT_WORD4],
1055 				    control->intercepts[INTERCEPT_WORD5]);
1056 
1057 
1058 	svm->nested.vmcb12_gpa = vmcb12_gpa;
1059 
1060 	WARN_ON(svm->vmcb == svm->nested.vmcb02.ptr);
1061 
1062 	nested_svm_copy_common_state(svm->vmcb01.ptr, svm->nested.vmcb02.ptr);
1063 
1064 	svm_switch_vmcb(svm, &svm->nested.vmcb02);
1065 	nested_vmcb02_prepare_control(svm);
1066 	nested_vmcb02_prepare_save(svm);
1067 
1068 	ret = nested_svm_load_cr3(&svm->vcpu, svm->nested.save.cr3,
1069 				  nested_npt_enabled(svm), from_vmrun);
1070 	if (ret)
1071 		return ret;
1072 
1073 	if (!from_vmrun)
1074 		kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
1075 
1076 	svm_set_gif(svm, true);
1077 
1078 	if (kvm_vcpu_apicv_active(vcpu))
1079 		kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
1080 
1081 	nested_svm_hv_update_vm_vp_ids(vcpu);
1082 
1083 	return 0;
1084 }
1085 
1086 static int nested_svm_copy_vmcb12_to_cache(struct kvm_vcpu *vcpu, u64 vmcb12_gpa)
1087 {
1088 	struct vcpu_svm *svm = to_svm(vcpu);
1089 	struct kvm_host_map map;
1090 	struct vmcb *vmcb12;
1091 	int r = 0;
1092 
1093 	if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcb12_gpa), &map))
1094 		return -EFAULT;
1095 
1096 	vmcb12 = map.hva;
1097 	nested_copy_vmcb_control_to_cache(svm, &vmcb12->control);
1098 	nested_copy_vmcb_save_to_cache(svm, &vmcb12->save);
1099 
1100 	if (nested_svm_check_cached_vmcb12(vcpu) < 0) {
1101 		vmcb12->control.exit_code = SVM_EXIT_ERR;
1102 		vmcb12->control.exit_info_1 = 0;
1103 		vmcb12->control.exit_info_2 = 0;
1104 		vmcb12->control.event_inj = 0;
1105 		vmcb12->control.event_inj_err = 0;
1106 		svm_set_gif(svm, false);
1107 		r = -EINVAL;
1108 	}
1109 
1110 	kvm_vcpu_unmap(vcpu, &map);
1111 	return r;
1112 }
1113 
1114 int nested_svm_vmrun(struct kvm_vcpu *vcpu)
1115 {
1116 	struct vcpu_svm *svm = to_svm(vcpu);
1117 	int ret;
1118 	u64 vmcb12_gpa;
1119 	struct vmcb *vmcb01 = svm->vmcb01.ptr;
1120 
1121 	if (!svm->nested.hsave_msr) {
1122 		kvm_inject_gp(vcpu, 0);
1123 		return 1;
1124 	}
1125 
1126 	if (is_smm(vcpu)) {
1127 		kvm_queue_exception(vcpu, UD_VECTOR);
1128 		return 1;
1129 	}
1130 
1131 	/* This fails when VP assist page is enabled but the supplied GPA is bogus */
1132 	ret = kvm_hv_verify_vp_assist(vcpu);
1133 	if (ret) {
1134 		kvm_inject_gp(vcpu, 0);
1135 		return ret;
1136 	}
1137 
1138 	if (WARN_ON_ONCE(!svm->nested.initialized))
1139 		return -EINVAL;
1140 
1141 	vmcb12_gpa = kvm_rax_read(vcpu);
1142 	if (!page_address_valid(vcpu, vmcb12_gpa)) {
1143 		kvm_inject_gp(vcpu, 0);
1144 		return 1;
1145 	}
1146 
1147 	ret = nested_svm_copy_vmcb12_to_cache(vcpu, vmcb12_gpa);
1148 	if (ret == -EFAULT)
1149 		return kvm_handle_memory_failure(vcpu, X86EMUL_IO_NEEDED, NULL);
1150 
1151 	/*
1152 	 * At this point, VMRUN is guaranteed to not fault; advance RIP. If
1153 	 * caching vmcb12 failed for other reasons, return immediately afterward
1154 	 * as a nested #VMEXIT was already set up.
1155 	 *
1156 	 * FIXME: If TF is set on VMRUN should inject a #DB (or handle guest
1157 	 * debugging) right after #VMEXIT, right now it's just ignored.
1158 	 */
1159 	if (!svm_skip_emulated_instruction(vcpu))
1160 		return 0;
1161 
1162 	if (ret)
1163 		goto insn_retired;
1164 
1165 	/*
1166 	 * Since vmcb01 is not in use, we can use it to store some of the L1
1167 	 * state.
1168 	 */
1169 	vmcb01->save.efer   = vcpu->arch.efer;
1170 	vmcb01->save.cr0    = kvm_read_cr0(vcpu);
1171 	vmcb01->save.cr4    = vcpu->arch.cr4;
1172 	vmcb01->save.rflags = kvm_get_rflags(vcpu);
1173 	vmcb01->save.rip    = kvm_rip_read(vcpu);
1174 
1175 	if (!npt_enabled)
1176 		vmcb01->save.cr3 = kvm_read_cr3(vcpu);
1177 
1178 	vcpu->arch.nested_run_pending = KVM_NESTED_RUN_PENDING;
1179 
1180 	if (enter_svm_guest_mode(vcpu, vmcb12_gpa, true) ||
1181 	    !nested_svm_merge_msrpm(vcpu)) {
1182 		vcpu->arch.nested_run_pending = 0;
1183 		svm->nmi_l1_to_l2 = false;
1184 		svm->soft_int_injected = false;
1185 
1186 		svm->vmcb->control.exit_code    = SVM_EXIT_ERR;
1187 		svm->vmcb->control.exit_info_1  = 0;
1188 		svm->vmcb->control.exit_info_2  = 0;
1189 
1190 		nested_svm_vmexit(svm);
1191 	}
1192 
1193 insn_retired:
1194 	/*
1195 	 * A successful VMRUN is counted by the PMU in guest mode, so only
1196 	 * retire the instruction after potentially entering guest mode.
1197 	 */
1198 	kvm_pmu_instruction_retired(vcpu);
1199 	return 1;
1200 }
1201 
1202 /* Copy state save area fields which are handled by VMRUN */
1203 void svm_copy_vmrun_state(struct vmcb_save_area *to_save,
1204 			  struct vmcb_save_area *from_save)
1205 {
1206 	to_save->es = from_save->es;
1207 	to_save->cs = from_save->cs;
1208 	to_save->ss = from_save->ss;
1209 	to_save->ds = from_save->ds;
1210 	to_save->gdtr = from_save->gdtr;
1211 	to_save->idtr = from_save->idtr;
1212 	to_save->rflags = from_save->rflags | X86_EFLAGS_FIXED;
1213 	to_save->efer = from_save->efer;
1214 	to_save->cr0 = from_save->cr0;
1215 	to_save->cr3 = from_save->cr3;
1216 	to_save->cr4 = from_save->cr4;
1217 	to_save->rax = from_save->rax;
1218 	to_save->rsp = from_save->rsp;
1219 	to_save->rip = from_save->rip;
1220 	to_save->cpl = 0;
1221 
1222 	if (kvm_cpu_cap_has(X86_FEATURE_SHSTK)) {
1223 		to_save->s_cet  = from_save->s_cet;
1224 		to_save->isst_addr = from_save->isst_addr;
1225 		to_save->ssp = from_save->ssp;
1226 	}
1227 
1228 	if (kvm_cpu_cap_has(X86_FEATURE_LBRV)) {
1229 		svm_copy_lbrs(to_save, from_save);
1230 		to_save->dbgctl &= ~DEBUGCTL_RESERVED_BITS;
1231 	}
1232 }
1233 
1234 void svm_copy_vmloadsave_state(struct vmcb *to_vmcb, struct vmcb *from_vmcb)
1235 {
1236 	to_vmcb->save.fs = from_vmcb->save.fs;
1237 	to_vmcb->save.gs = from_vmcb->save.gs;
1238 	to_vmcb->save.tr = from_vmcb->save.tr;
1239 	to_vmcb->save.ldtr = from_vmcb->save.ldtr;
1240 	to_vmcb->save.kernel_gs_base = from_vmcb->save.kernel_gs_base;
1241 	to_vmcb->save.star = from_vmcb->save.star;
1242 	to_vmcb->save.lstar = from_vmcb->save.lstar;
1243 	to_vmcb->save.cstar = from_vmcb->save.cstar;
1244 	to_vmcb->save.sfmask = from_vmcb->save.sfmask;
1245 	to_vmcb->save.sysenter_cs = from_vmcb->save.sysenter_cs;
1246 	to_vmcb->save.sysenter_esp = from_vmcb->save.sysenter_esp;
1247 	to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip;
1248 }
1249 
1250 static int nested_svm_vmexit_update_vmcb12(struct kvm_vcpu *vcpu)
1251 {
1252 	struct vcpu_svm *svm = to_svm(vcpu);
1253 	struct vmcb *vmcb02 = svm->nested.vmcb02.ptr;
1254 	struct kvm_host_map map;
1255 	struct vmcb *vmcb12;
1256 	int rc;
1257 
1258 	rc = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.vmcb12_gpa), &map);
1259 	if (rc)
1260 		return rc;
1261 
1262 	vmcb12 = map.hva;
1263 
1264 	vmcb12->save.es     = vmcb02->save.es;
1265 	vmcb12->save.cs     = vmcb02->save.cs;
1266 	vmcb12->save.ss     = vmcb02->save.ss;
1267 	vmcb12->save.ds     = vmcb02->save.ds;
1268 	vmcb12->save.gdtr   = vmcb02->save.gdtr;
1269 	vmcb12->save.idtr   = vmcb02->save.idtr;
1270 	vmcb12->save.efer   = svm->vcpu.arch.efer;
1271 	vmcb12->save.cr0    = kvm_read_cr0(vcpu);
1272 	vmcb12->save.cr3    = kvm_read_cr3(vcpu);
1273 	vmcb12->save.cr2    = vcpu->arch.cr2;
1274 	vmcb12->save.cr4    = svm->vcpu.arch.cr4;
1275 	vmcb12->save.rflags = kvm_get_rflags(vcpu);
1276 	vmcb12->save.rip    = kvm_rip_read(vcpu);
1277 	vmcb12->save.rsp    = kvm_rsp_read(vcpu);
1278 	vmcb12->save.rax    = kvm_rax_read_raw(vcpu);
1279 	vmcb12->save.dr7    = vmcb02->save.dr7;
1280 	vmcb12->save.dr6    = svm->vcpu.arch.dr6;
1281 	vmcb12->save.cpl    = vmcb02->save.cpl;
1282 
1283 	if (l2_has_separate_pat(vcpu))
1284 		vmcb12->save.g_pat = vmcb02->save.g_pat;
1285 
1286 	if (guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) {
1287 		vmcb12->save.s_cet	= vmcb02->save.s_cet;
1288 		vmcb12->save.isst_addr	= vmcb02->save.isst_addr;
1289 		vmcb12->save.ssp	= vmcb02->save.ssp;
1290 	}
1291 
1292 	vmcb12->control.int_state         = vmcb02->control.int_state;
1293 	vmcb12->control.exit_code         = vmcb02->control.exit_code;
1294 	vmcb12->control.exit_info_1       = vmcb02->control.exit_info_1;
1295 	vmcb12->control.exit_info_2       = vmcb02->control.exit_info_2;
1296 
1297 	if (!svm_is_vmrun_failure(vmcb12->control.exit_code))
1298 		nested_save_pending_event_to_vmcb12(svm, vmcb12);
1299 
1300 	if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS))
1301 		vmcb12->control.next_rip  = vmcb02->control.next_rip;
1302 
1303 	if (nested_vmcb12_has_lbrv(vcpu))
1304 		svm_copy_lbrs(&vmcb12->save, &vmcb02->save);
1305 
1306 	vmcb12->control.event_inj	  = 0;
1307 	vmcb12->control.event_inj_err	  = 0;
1308 	vmcb12->control.int_ctl           = svm->nested.ctl.int_ctl;
1309 
1310 	trace_kvm_nested_vmexit_inject(vmcb12->control.exit_code,
1311 				       vmcb12->control.exit_info_1,
1312 				       vmcb12->control.exit_info_2,
1313 				       vmcb12->control.exit_int_info,
1314 				       vmcb12->control.exit_int_info_err,
1315 				       KVM_ISA_SVM);
1316 
1317 	kvm_vcpu_unmap(vcpu, &map);
1318 	return 0;
1319 }
1320 
1321 void nested_svm_vmexit(struct vcpu_svm *svm)
1322 {
1323 	struct kvm_vcpu *vcpu = &svm->vcpu;
1324 	struct vmcb *vmcb01 = svm->vmcb01.ptr;
1325 	struct vmcb *vmcb02 = svm->nested.vmcb02.ptr;
1326 
1327 	if (nested_svm_vmexit_update_vmcb12(vcpu))
1328 		kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
1329 
1330 	/* Exit Guest-Mode */
1331 	leave_guest_mode(vcpu);
1332 	svm_pmu_handle_nested_transition(svm);
1333 
1334 	svm->nested.vmcb12_gpa = 0;
1335 
1336 	kvm_warn_on_nested_run_pending(vcpu);
1337 
1338 	kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
1339 
1340 	/* in case we halted in L2 */
1341 	kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);
1342 
1343 	/*
1344 	 * Invalidate last_bus_lock_rip unless KVM is still waiting for the
1345 	 * guest to make forward progress before re-enabling bus lock detection.
1346 	 */
1347 	if (!vmcb02->control.bus_lock_counter)
1348 		svm->nested.last_bus_lock_rip = INVALID_GPA;
1349 
1350 	nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr);
1351 
1352 	kvm_nested_vmexit_handle_ibrs(vcpu);
1353 
1354 	if (guest_cpu_cap_has(vcpu, X86_FEATURE_ERAPS))
1355 		vmcb01->control.erap_ctl |= ERAP_CONTROL_CLEAR_RAP;
1356 
1357 	svm_switch_vmcb(svm, &svm->vmcb01);
1358 
1359 	/*
1360 	 * Rules for synchronizing int_ctl bits from vmcb02 to vmcb01:
1361 	 *
1362 	 * V_IRQ, V_IRQ_VECTOR, V_INTR_PRIO_MASK, V_IGN_TPR:  If L1 doesn't
1363 	 * intercept interrupts, then KVM will use vmcb02's V_IRQ (and related
1364 	 * flags) to detect interrupt windows for L1 IRQs (even if L1 uses
1365 	 * virtual interrupt masking).  Raise KVM_REQ_EVENT to ensure that
1366 	 * KVM re-requests an interrupt window if necessary, which implicitly
1367 	 * copies this bits from vmcb02 to vmcb01.
1368 	 *
1369 	 * V_TPR: If L1 doesn't use virtual interrupt masking, then L1's vTPR
1370 	 * is stored in vmcb02, but its value doesn't need to be copied from/to
1371 	 * vmcb01 because it is copied from/to the virtual APIC's TPR register
1372 	 * on each VM entry/exit.
1373 	 *
1374 	 * V_GIF: If nested vGIF is not used, KVM uses vmcb02's V_GIF for L1's
1375 	 * V_GIF.  However, GIF is architecturally clear on each VM exit, thus
1376 	 * there is no need to copy V_GIF from vmcb02 to vmcb01.
1377 	 */
1378 	if (!nested_exit_on_intr(svm))
1379 		kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
1380 
1381 	if (!nested_vmcb12_has_lbrv(vcpu)) {
1382 		svm_copy_lbrs(&vmcb01->save, &vmcb02->save);
1383 		vmcb_mark_dirty(vmcb01, VMCB_LBR);
1384 	}
1385 
1386 	svm_update_lbrv(vcpu);
1387 
1388 	if (vnmi) {
1389 		if (vmcb02->control.int_ctl & V_NMI_BLOCKING_MASK)
1390 			vmcb01->control.int_ctl |= V_NMI_BLOCKING_MASK;
1391 		else
1392 			vmcb01->control.int_ctl &= ~V_NMI_BLOCKING_MASK;
1393 
1394 		if (vcpu->arch.nmi_pending) {
1395 			vcpu->arch.nmi_pending--;
1396 			vmcb01->control.int_ctl |= V_NMI_PENDING_MASK;
1397 		} else {
1398 			vmcb01->control.int_ctl &= ~V_NMI_PENDING_MASK;
1399 		}
1400 	}
1401 
1402 	/*
1403 	 * On vmexit the  GIF is set to false and
1404 	 * no event can be injected in L1.
1405 	 */
1406 	svm_set_gif(svm, false);
1407 	vmcb01->control.exit_int_info = 0;
1408 
1409 	svm->vcpu.arch.tsc_offset = svm->vcpu.arch.l1_tsc_offset;
1410 	if (vmcb01->control.tsc_offset != svm->vcpu.arch.tsc_offset) {
1411 		vmcb01->control.tsc_offset = svm->vcpu.arch.tsc_offset;
1412 		vmcb_mark_dirty(vmcb01, VMCB_INTERCEPTS);
1413 	}
1414 
1415 	if (kvm_caps.has_tsc_control &&
1416 	    vcpu->arch.tsc_scaling_ratio != vcpu->arch.l1_tsc_scaling_ratio) {
1417 		vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio;
1418 		svm_write_tsc_multiplier(vcpu);
1419 	}
1420 
1421 	svm->nested.ctl.nested_cr3 = 0;
1422 
1423 	/*
1424 	 * Restore processor state that had been saved in vmcb01
1425 	 */
1426 	kvm_set_rflags(vcpu, vmcb01->save.rflags);
1427 	svm_set_efer(vcpu, vmcb01->save.efer);
1428 	svm_set_cr0(vcpu, vmcb01->save.cr0 | X86_CR0_PE);
1429 	svm_set_cr4(vcpu, vmcb01->save.cr4);
1430 	kvm_rax_write_raw(vcpu, vmcb01->save.rax);
1431 	kvm_rsp_write(vcpu, vmcb01->save.rsp);
1432 	kvm_rip_write(vcpu, vmcb01->save.rip);
1433 
1434 	svm->vcpu.arch.dr7 = DR7_FIXED_1;
1435 	kvm_update_dr7(&svm->vcpu);
1436 
1437 	nested_svm_transition_tlb_flush(vcpu);
1438 
1439 	nested_svm_uninit_mmu_context(vcpu);
1440 
1441 	if (nested_svm_load_cr3(vcpu, vmcb01->save.cr3, false, true))
1442 		kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
1443 
1444 	/* Drop tracking for L1->L2 injected NMIs and soft IRQs */
1445 	svm->nmi_l1_to_l2 = false;
1446 	svm->soft_int_injected = false;
1447 
1448 	/*
1449 	 * Drop what we picked up for L2 via svm_complete_interrupts() so it
1450 	 * doesn't end up in L1.
1451 	 */
1452 	svm->vcpu.arch.nmi_injected = false;
1453 	kvm_clear_exception_queue(vcpu);
1454 	kvm_clear_interrupt_queue(vcpu);
1455 
1456 	/*
1457 	 * If we are here following the completion of a VMRUN that
1458 	 * is being single-stepped, queue the pending #DB intercept
1459 	 * right now so that it an be accounted for before we execute
1460 	 * L1's next instruction.
1461 	 */
1462 	if (unlikely(vmcb01->save.rflags & X86_EFLAGS_TF))
1463 		kvm_queue_exception(&(svm->vcpu), DB_VECTOR);
1464 
1465 	/*
1466 	 * Un-inhibit the AVIC right away, so that other vCPUs can start
1467 	 * to benefit from it right away.
1468 	 */
1469 	if (kvm_apicv_activated(vcpu->kvm))
1470 		__kvm_vcpu_update_apicv(vcpu);
1471 }
1472 
1473 static void nested_svm_triple_fault(struct kvm_vcpu *vcpu)
1474 {
1475 	struct vcpu_svm *svm = to_svm(vcpu);
1476 
1477 	if (!vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SHUTDOWN))
1478 		return;
1479 
1480 	kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu);
1481 	nested_svm_simple_vmexit(to_svm(vcpu), SVM_EXIT_SHUTDOWN);
1482 }
1483 
1484 int svm_allocate_nested(struct vcpu_svm *svm)
1485 {
1486 	struct page *vmcb02_page;
1487 
1488 	if (svm->nested.initialized)
1489 		return 0;
1490 
1491 	vmcb02_page = snp_safe_alloc_page();
1492 	if (!vmcb02_page)
1493 		return -ENOMEM;
1494 	svm->nested.vmcb02.ptr = page_address(vmcb02_page);
1495 	svm->nested.vmcb02.pa = __sme_set(page_to_pfn(vmcb02_page) << PAGE_SHIFT);
1496 
1497 	svm->nested.msrpm = svm_vcpu_alloc_msrpm();
1498 	if (!svm->nested.msrpm)
1499 		goto err_free_vmcb02;
1500 
1501 	svm->nested.initialized = true;
1502 	return 0;
1503 
1504 err_free_vmcb02:
1505 	__free_page(vmcb02_page);
1506 	return -ENOMEM;
1507 }
1508 
1509 void svm_free_nested(struct vcpu_svm *svm)
1510 {
1511 	if (!svm->nested.initialized)
1512 		return;
1513 
1514 	if (WARN_ON_ONCE(svm->vmcb != svm->vmcb01.ptr))
1515 		svm_switch_vmcb(svm, &svm->vmcb01);
1516 
1517 	svm_vcpu_free_msrpm(svm->nested.msrpm);
1518 	svm->nested.msrpm = NULL;
1519 
1520 	__free_page(virt_to_page(svm->nested.vmcb02.ptr));
1521 	svm->nested.vmcb02.ptr = NULL;
1522 
1523 	/*
1524 	 * When last_vmcb12_gpa matches the current vmcb12 gpa,
1525 	 * some vmcb12 fields are not loaded if they are marked clean
1526 	 * in the vmcb12, since in this case they are up to date already.
1527 	 *
1528 	 * When the vmcb02 is freed, this optimization becomes invalid.
1529 	 */
1530 	svm->nested.last_vmcb12_gpa = INVALID_GPA;
1531 
1532 	svm->nested.initialized = false;
1533 }
1534 
1535 void svm_leave_nested(struct kvm_vcpu *vcpu)
1536 {
1537 	struct vcpu_svm *svm = to_svm(vcpu);
1538 
1539 	if (is_guest_mode(vcpu)) {
1540 		vcpu->arch.nested_run_pending = 0;
1541 		svm->nested.vmcb12_gpa = INVALID_GPA;
1542 
1543 		leave_guest_mode(vcpu);
1544 
1545 		/*
1546 		 * Force leaving nested is a non-architectural flow so precision
1547 		 * isn't a priority.  Defer updating the PMU until the next vCPU
1548 		 * run, potentially tolerating some imprecision to avoid poking
1549 		 * into PMU state from arbitrary contexts (e.g. to avoid using
1550 		 * stale state).
1551 		 */
1552 		__svm_pmu_handle_nested_transition(svm, true);
1553 
1554 		svm_switch_vmcb(svm, &svm->vmcb01);
1555 
1556 		nested_svm_uninit_mmu_context(vcpu);
1557 		vmcb_mark_all_dirty(svm->vmcb);
1558 
1559 		svm_set_gif(svm, true);
1560 
1561 		if (kvm_apicv_activated(vcpu->kvm))
1562 			kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
1563 	}
1564 
1565 	kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
1566 }
1567 
1568 static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
1569 {
1570 	gpa_t base = svm->nested.ctl.msrpm_base_pa;
1571 	int write, bit_nr;
1572 	u8 value, mask;
1573 	u32 msr;
1574 
1575 	if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)))
1576 		return NESTED_EXIT_HOST;
1577 
1578 	msr    = svm->vcpu.arch.regs[VCPU_REGS_RCX];
1579 	bit_nr = svm_msrpm_bit_nr(msr);
1580 	write  = svm->vmcb->control.exit_info_1 & 1;
1581 
1582 	if (bit_nr < 0)
1583 		return NESTED_EXIT_DONE;
1584 
1585 	if (kvm_vcpu_read_guest(&svm->vcpu, base + bit_nr / BITS_PER_BYTE,
1586 				&value, sizeof(value)))
1587 		return NESTED_EXIT_DONE;
1588 
1589 	mask = BIT(write) << (bit_nr & (BITS_PER_BYTE - 1));
1590 	return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
1591 }
1592 
1593 static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
1594 {
1595 	unsigned port, size, iopm_len;
1596 	u16 val, mask;
1597 	u8 start_bit;
1598 	u64 gpa;
1599 
1600 	if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_IOIO_PROT)))
1601 		return NESTED_EXIT_HOST;
1602 
1603 	port = svm->vmcb->control.exit_info_1 >> 16;
1604 	size = (svm->vmcb->control.exit_info_1 & SVM_IOIO_SIZE_MASK) >>
1605 		SVM_IOIO_SIZE_SHIFT;
1606 	gpa  = svm->nested.ctl.iopm_base_pa + (port / 8);
1607 	start_bit = port % 8;
1608 	iopm_len = (start_bit + size > 8) ? 2 : 1;
1609 	mask = (0xf >> (4 - size)) << start_bit;
1610 	val = 0;
1611 
1612 	if (kvm_vcpu_read_guest(&svm->vcpu, gpa, &val, iopm_len))
1613 		return NESTED_EXIT_DONE;
1614 
1615 	return (val & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
1616 }
1617 
1618 static int nested_svm_intercept(struct vcpu_svm *svm)
1619 {
1620 	u64 exit_code = svm->vmcb->control.exit_code;
1621 	int vmexit = NESTED_EXIT_HOST;
1622 
1623 	if (svm_is_vmrun_failure(exit_code))
1624 		return NESTED_EXIT_DONE;
1625 
1626 	switch (exit_code) {
1627 	case SVM_EXIT_MSR:
1628 		vmexit = nested_svm_exit_handled_msr(svm);
1629 		break;
1630 	case SVM_EXIT_IOIO:
1631 		vmexit = nested_svm_intercept_ioio(svm);
1632 		break;
1633 	case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f:
1634 		/*
1635 		 * Host-intercepted exceptions have been checked already in
1636 		 * nested_svm_exit_special.  There is nothing to do here,
1637 		 * the vmexit is injected by svm_check_nested_events.
1638 		 */
1639 		vmexit = NESTED_EXIT_DONE;
1640 		break;
1641 	default:
1642 		if (vmcb12_is_intercept(&svm->nested.ctl, exit_code))
1643 			vmexit = NESTED_EXIT_DONE;
1644 		break;
1645 	}
1646 
1647 	return vmexit;
1648 }
1649 
1650 int nested_svm_exit_handled(struct vcpu_svm *svm)
1651 {
1652 	int vmexit;
1653 
1654 	vmexit = nested_svm_intercept(svm);
1655 
1656 	if (vmexit == NESTED_EXIT_DONE)
1657 		nested_svm_vmexit(svm);
1658 
1659 	return vmexit;
1660 }
1661 
1662 int nested_svm_check_permissions(struct kvm_vcpu *vcpu)
1663 {
1664 	if (!(vcpu->arch.efer & EFER_SVME) || !is_paging(vcpu)) {
1665 		kvm_queue_exception(vcpu, UD_VECTOR);
1666 		return 1;
1667 	}
1668 
1669 	if (to_svm(vcpu)->vmcb->save.cpl) {
1670 		kvm_inject_gp(vcpu, 0);
1671 		return 1;
1672 	}
1673 
1674 	return 0;
1675 }
1676 
1677 static bool nested_svm_is_exception_vmexit(struct kvm_vcpu *vcpu, u8 vector,
1678 					   u32 error_code)
1679 {
1680 	struct vcpu_svm *svm = to_svm(vcpu);
1681 
1682 	return (svm->nested.ctl.intercepts[INTERCEPT_EXCEPTION] & BIT(vector));
1683 }
1684 
1685 static void nested_svm_inject_exception_vmexit(struct kvm_vcpu *vcpu)
1686 {
1687 	struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit;
1688 	struct vcpu_svm *svm = to_svm(vcpu);
1689 	struct vmcb *vmcb = svm->vmcb;
1690 
1691 	vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + ex->vector;
1692 
1693 	if (ex->has_error_code)
1694 		vmcb->control.exit_info_1 = ex->error_code;
1695 
1696 	/*
1697 	 * EXITINFO2 is undefined for all exception intercepts other
1698 	 * than #PF.
1699 	 */
1700 	if (ex->vector == PF_VECTOR) {
1701 		if (ex->has_payload)
1702 			vmcb->control.exit_info_2 = ex->payload;
1703 		else
1704 			vmcb->control.exit_info_2 = vcpu->arch.cr2;
1705 	} else if (ex->vector == DB_VECTOR) {
1706 		/* See kvm_check_and_inject_events().  */
1707 		kvm_deliver_exception_payload(vcpu, ex);
1708 
1709 		if (vcpu->arch.dr7 & DR7_GD) {
1710 			vcpu->arch.dr7 &= ~DR7_GD;
1711 			kvm_update_dr7(vcpu);
1712 		}
1713 	} else {
1714 		WARN_ON(ex->has_payload);
1715 	}
1716 
1717 	nested_svm_vmexit(svm);
1718 }
1719 
1720 static inline bool nested_exit_on_init(struct vcpu_svm *svm)
1721 {
1722 	return vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_INIT);
1723 }
1724 
1725 static int svm_check_nested_events(struct kvm_vcpu *vcpu)
1726 {
1727 	struct kvm_lapic *apic = vcpu->arch.apic;
1728 	struct vcpu_svm *svm = to_svm(vcpu);
1729 	/*
1730 	 * Only a pending nested run blocks a pending exception.  If there is a
1731 	 * previously injected event, the pending exception occurred while said
1732 	 * event was being delivered and thus needs to be handled.
1733 	 */
1734 	bool block_nested_exceptions = vcpu->arch.nested_run_pending;
1735 	/*
1736 	 * New events (not exceptions) are only recognized at instruction
1737 	 * boundaries.  If an event needs reinjection, then KVM is handling a
1738 	 * VM-Exit that occurred _during_ instruction execution; new events are
1739 	 * blocked until the instruction completes.
1740 	 */
1741 	bool block_nested_events = block_nested_exceptions ||
1742 				   kvm_event_needs_reinjection(vcpu);
1743 
1744 	if (lapic_in_kernel(vcpu) &&
1745 	    test_bit(KVM_APIC_INIT, &apic->pending_events)) {
1746 		if (block_nested_events)
1747 			return -EBUSY;
1748 		if (!nested_exit_on_init(svm))
1749 			return 0;
1750 		nested_svm_simple_vmexit(svm, SVM_EXIT_INIT);
1751 		return 0;
1752 	}
1753 
1754 	if (vcpu->arch.exception_vmexit.pending) {
1755 		if (block_nested_exceptions)
1756                         return -EBUSY;
1757 		nested_svm_inject_exception_vmexit(vcpu);
1758 		return 0;
1759 	}
1760 
1761 	if (vcpu->arch.exception.pending) {
1762 		if (block_nested_exceptions)
1763 			return -EBUSY;
1764 		return 0;
1765 	}
1766 
1767 #ifdef CONFIG_KVM_SMM
1768 	if (vcpu->arch.smi_pending && !svm_smi_blocked(vcpu)) {
1769 		if (block_nested_events)
1770 			return -EBUSY;
1771 		if (!nested_exit_on_smi(svm))
1772 			return 0;
1773 		nested_svm_simple_vmexit(svm, SVM_EXIT_SMI);
1774 		return 0;
1775 	}
1776 #endif
1777 
1778 	if (vcpu->arch.nmi_pending && !svm_nmi_blocked(vcpu)) {
1779 		if (block_nested_events)
1780 			return -EBUSY;
1781 		if (!nested_exit_on_nmi(svm))
1782 			return 0;
1783 		nested_svm_simple_vmexit(svm, SVM_EXIT_NMI);
1784 		return 0;
1785 	}
1786 
1787 	if (kvm_cpu_has_interrupt(vcpu) && !svm_interrupt_blocked(vcpu)) {
1788 		if (block_nested_events)
1789 			return -EBUSY;
1790 		if (!nested_exit_on_intr(svm))
1791 			return 0;
1792 		trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
1793 		nested_svm_simple_vmexit(svm, SVM_EXIT_INTR);
1794 		return 0;
1795 	}
1796 
1797 	return 0;
1798 }
1799 
1800 int nested_svm_exit_special(struct vcpu_svm *svm)
1801 {
1802 	u32 exit_code = svm->vmcb->control.exit_code;
1803 	struct kvm_vcpu *vcpu = &svm->vcpu;
1804 
1805 	switch (exit_code) {
1806 	case SVM_EXIT_INTR:
1807 	case SVM_EXIT_NMI:
1808 	case SVM_EXIT_NPF:
1809 		return NESTED_EXIT_HOST;
1810 	case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
1811 		u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
1812 
1813 		if (svm->vmcb01.ptr->control.intercepts[INTERCEPT_EXCEPTION] &
1814 		    excp_bits)
1815 			return NESTED_EXIT_HOST;
1816 		else if (exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR &&
1817 			 svm->vcpu.arch.apf.host_apf_flags)
1818 			/* Trap async PF even if not shadowing */
1819 			return NESTED_EXIT_HOST;
1820 		break;
1821 	}
1822 	case SVM_EXIT_VMMCALL:
1823 		/* Hyper-V L2 TLB flush hypercall is handled by L0 */
1824 		if (nested_svm_is_l2_tlb_flush_hcall(vcpu))
1825 			return NESTED_EXIT_HOST;
1826 		break;
1827 	default:
1828 		break;
1829 	}
1830 
1831 	return NESTED_EXIT_CONTINUE;
1832 }
1833 
1834 void nested_svm_update_tsc_ratio_msr(struct kvm_vcpu *vcpu)
1835 {
1836 	struct vcpu_svm *svm = to_svm(vcpu);
1837 
1838 	vcpu->arch.tsc_scaling_ratio =
1839 		kvm_calc_nested_tsc_multiplier(vcpu->arch.l1_tsc_scaling_ratio,
1840 					       svm->tsc_ratio_msr);
1841 	svm_write_tsc_multiplier(vcpu);
1842 }
1843 
1844 /* Inverse operation of nested_copy_vmcb_control_to_cache(). asid is copied too. */
1845 static void nested_copy_vmcb_cache_to_control(struct vmcb_control_area *dst,
1846 					      struct vmcb_ctrl_area_cached *from)
1847 {
1848 	unsigned int i;
1849 
1850 	memset(dst, 0, sizeof(struct vmcb_control_area));
1851 
1852 	for (i = 0; i < MAX_INTERCEPT; i++)
1853 		dst->intercepts[i] = from->intercepts[i];
1854 
1855 	dst->iopm_base_pa         = from->iopm_base_pa;
1856 	dst->msrpm_base_pa        = from->msrpm_base_pa;
1857 	dst->tsc_offset           = from->tsc_offset;
1858 	dst->asid                 = from->asid;
1859 	dst->tlb_ctl              = from->tlb_ctl;
1860 	dst->erap_ctl             = from->erap_ctl;
1861 	dst->int_ctl              = from->int_ctl;
1862 	dst->int_vector           = from->int_vector;
1863 	dst->int_state            = from->int_state;
1864 	dst->exit_code            = from->exit_code;
1865 	dst->exit_info_1          = from->exit_info_1;
1866 	dst->exit_info_2          = from->exit_info_2;
1867 	dst->exit_int_info        = from->exit_int_info;
1868 	dst->exit_int_info_err    = from->exit_int_info_err;
1869 	dst->misc_ctl		  = from->misc_ctl;
1870 	dst->event_inj            = from->event_inj;
1871 	dst->event_inj_err        = from->event_inj_err;
1872 	dst->next_rip             = from->next_rip;
1873 	dst->nested_cr3		  = from->nested_cr3;
1874 	dst->misc_ctl2		  = from->misc_ctl2;
1875 	dst->pause_filter_count   = from->pause_filter_count;
1876 	dst->pause_filter_thresh  = from->pause_filter_thresh;
1877 	/* 'clean' and 'hv_enlightenments' are not changed by KVM */
1878 }
1879 
1880 static int svm_get_nested_state(struct kvm_vcpu *vcpu,
1881 				struct kvm_nested_state __user *user_kvm_nested_state,
1882 				u32 user_data_size)
1883 {
1884 	struct vcpu_svm *svm;
1885 	struct vmcb_control_area *ctl;
1886 	unsigned long r;
1887 	struct kvm_nested_state kvm_state = {
1888 		.flags = 0,
1889 		.format = KVM_STATE_NESTED_FORMAT_SVM,
1890 		.size = sizeof(kvm_state),
1891 	};
1892 	struct vmcb __user *user_vmcb = (struct vmcb __user *)
1893 		&user_kvm_nested_state->data.svm[0];
1894 
1895 	if (!vcpu)
1896 		return kvm_state.size + KVM_STATE_NESTED_SVM_VMCB_SIZE;
1897 
1898 	svm = to_svm(vcpu);
1899 
1900 	if (user_data_size < kvm_state.size)
1901 		goto out;
1902 
1903 	/* First fill in the header and copy it out.  */
1904 	if (is_guest_mode(vcpu)) {
1905 		kvm_state.hdr.svm.vmcb_pa = svm->nested.vmcb12_gpa;
1906 		kvm_state.hdr.svm.gpat = 0;
1907 		if (l2_has_separate_pat(vcpu))
1908 			kvm_state.hdr.svm.gpat = svm->vmcb->save.g_pat;
1909 		kvm_state.size += KVM_STATE_NESTED_SVM_VMCB_SIZE;
1910 		kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE;
1911 
1912 		if (vcpu->arch.nested_run_pending)
1913 			kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING;
1914 	}
1915 
1916 	if (gif_set(svm))
1917 		kvm_state.flags |= KVM_STATE_NESTED_GIF_SET;
1918 
1919 	if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state)))
1920 		return -EFAULT;
1921 
1922 	if (!is_guest_mode(vcpu))
1923 		goto out;
1924 
1925 	/*
1926 	 * Copy over the full size of the VMCB rather than just the size
1927 	 * of the structs.
1928 	 */
1929 	if (clear_user(user_vmcb, KVM_STATE_NESTED_SVM_VMCB_SIZE))
1930 		return -EFAULT;
1931 
1932 	ctl = kzalloc_obj(*ctl);
1933 	if (!ctl)
1934 		return -ENOMEM;
1935 
1936 	nested_copy_vmcb_cache_to_control(ctl, &svm->nested.ctl);
1937 	r = copy_to_user(&user_vmcb->control, ctl,
1938 			 sizeof(user_vmcb->control));
1939 	kfree(ctl);
1940 	if (r)
1941 		return -EFAULT;
1942 
1943 	if (copy_to_user(&user_vmcb->save, &svm->vmcb01.ptr->save,
1944 			 sizeof(user_vmcb->save)))
1945 		return -EFAULT;
1946 out:
1947 	return kvm_state.size;
1948 }
1949 
1950 static int svm_set_nested_state(struct kvm_vcpu *vcpu,
1951 				struct kvm_nested_state __user *user_kvm_nested_state,
1952 				struct kvm_nested_state *kvm_state)
1953 {
1954 	struct vcpu_svm *svm = to_svm(vcpu);
1955 	struct vmcb __user *user_vmcb = (struct vmcb __user *)
1956 		&user_kvm_nested_state->data.svm[0];
1957 	struct vmcb_control_area *ctl;
1958 	struct vmcb_save_area *save;
1959 	struct vmcb_save_area_cached save_cached;
1960 	struct vmcb_ctrl_area_cached ctl_cached;
1961 	bool use_separate_l2_pat;
1962 	unsigned long cr0;
1963 	int ret;
1964 
1965 	BUILD_BUG_ON(sizeof(struct vmcb_control_area) + sizeof(struct vmcb_save_area) >
1966 		     KVM_STATE_NESTED_SVM_VMCB_SIZE);
1967 
1968 	if (kvm_state->format != KVM_STATE_NESTED_FORMAT_SVM)
1969 		return -EINVAL;
1970 
1971 	if (kvm_state->flags & ~(KVM_STATE_NESTED_GUEST_MODE |
1972 				 KVM_STATE_NESTED_RUN_PENDING |
1973 				 KVM_STATE_NESTED_GIF_SET))
1974 		return -EINVAL;
1975 
1976 	/*
1977 	 * If in guest mode, vcpu->arch.efer actually refers to the L2 guest's
1978 	 * EFER.SVME, but EFER.SVME still has to be 1 for VMRUN to succeed.
1979 	 * If SVME is disabled, the only valid states are "none" and GIF=1
1980 	 * (clearing SVME does NOT set GIF, i.e. GIF=0 is allowed).
1981 	 */
1982 	if (!(vcpu->arch.efer & EFER_SVME) && kvm_state->flags &&
1983 	    kvm_state->flags != KVM_STATE_NESTED_GIF_SET)
1984 		return -EINVAL;
1985 
1986 	/* SMM temporarily disables SVM, so we cannot be in guest mode.  */
1987 	if (is_smm(vcpu) && (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
1988 		return -EINVAL;
1989 
1990 	if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) {
1991 		svm_leave_nested(vcpu);
1992 		svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET));
1993 		return 0;
1994 	}
1995 
1996 	if (!page_address_valid(vcpu, kvm_state->hdr.svm.vmcb_pa))
1997 		return -EINVAL;
1998 	if (kvm_state->size < sizeof(*kvm_state) + KVM_STATE_NESTED_SVM_VMCB_SIZE)
1999 		return -EINVAL;
2000 
2001 	ctl = memdup_user(&user_vmcb->control, sizeof(*ctl));
2002 	if (IS_ERR(ctl))
2003 		return PTR_ERR(ctl);
2004 
2005 	save = memdup_user(&user_vmcb->save, sizeof(*save));
2006 	if (IS_ERR(save)) {
2007 		kfree(ctl);
2008 		return PTR_ERR(save);
2009 	}
2010 
2011 	ret = -EINVAL;
2012 	__nested_copy_vmcb_control_to_cache(vcpu, &ctl_cached, ctl);
2013 	if (!nested_vmcb_check_controls(vcpu, &ctl_cached))
2014 		goto out_free;
2015 
2016 	/*
2017 	 * Processor state contains L2 state.  Check that it is
2018 	 * valid for guest mode (see nested_vmcb_check_save()).
2019 	 */
2020 	cr0 = kvm_read_cr0(vcpu);
2021         if (((cr0 & X86_CR0_CD) == 0) && (cr0 & X86_CR0_NW))
2022 		goto out_free;
2023 
2024 	/*
2025 	 * Validate host state saved from before VMRUN (see
2026 	 * nested_svm_check_permissions). Note that the g_pat field is not
2027 	 * validated, because (a) it may have been clobbered by SMM before
2028 	 * KVM_GET_NESTED_STATE, and (b) it is not loaded at emulated
2029 	 * #VMEXIT.
2030 	 */
2031 	__nested_copy_vmcb_save_to_cache(&save_cached, save);
2032 	if (!(save->cr0 & X86_CR0_PG) ||
2033 	    !(save->cr0 & X86_CR0_PE) ||
2034 	    (save->rflags & X86_EFLAGS_VM) ||
2035 	    !nested_vmcb_check_save(vcpu, &save_cached, false))
2036 		goto out_free;
2037 
2038 	/*
2039 	 * Validate gPAT when the shared PAT quirk is disabled (i.e. L2
2040 	 * has its own gPAT). This is done separately from the
2041 	 * vmcb_save_area_cached validation above, because gPAT is L2
2042 	 * state, but the vmcb_save_area_cached is populated with L1 state.
2043 	 */
2044 	use_separate_l2_pat = (ctl_cached.misc_ctl & SVM_MISC_ENABLE_NP) &&
2045 			      !kvm_check_has_quirk(vcpu->kvm,
2046 						   KVM_X86_QUIRK_NESTED_SVM_SHARED_PAT);
2047 	if (use_separate_l2_pat && !kvm_pat_valid(kvm_state->hdr.svm.gpat))
2048 		goto out_free;
2049 
2050 	/*
2051 	 * All checks done, we can enter guest mode. Userspace provides
2052 	 * vmcb12.control, which will be combined with L1 and stored into
2053 	 * vmcb02, and the L1 save state which we store in vmcb01.
2054 	 * L2 registers if needed are moved from the current VMCB to VMCB02.
2055 	 */
2056 
2057 	if (is_guest_mode(vcpu))
2058 		svm_leave_nested(vcpu);
2059 	else
2060 		svm->nested.vmcb02.ptr->save = svm->vmcb01.ptr->save;
2061 
2062 	svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET));
2063 
2064 	if (kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING)
2065 		vcpu->arch.nested_run_pending = KVM_NESTED_RUN_PENDING_UNTRUSTED;
2066 	else
2067 		vcpu->arch.nested_run_pending = 0;
2068 
2069 	svm->nested.vmcb12_gpa = kvm_state->hdr.svm.vmcb_pa;
2070 
2071 	svm_copy_vmrun_state(&svm->vmcb01.ptr->save, save);
2072 	nested_copy_vmcb_control_to_cache(svm, ctl);
2073 
2074 	svm_switch_vmcb(svm, &svm->nested.vmcb02);
2075 
2076 	if (use_separate_l2_pat)
2077 		vmcb_set_gpat(svm->vmcb, kvm_state->hdr.svm.gpat);
2078 
2079 	nested_vmcb02_prepare_control(svm);
2080 
2081 	/*
2082 	 * Any previously restored state (e.g. KVM_SET_SREGS) would mark fields
2083 	 * dirty in vmcb01 instead of vmcb02, so mark all of vmcb02 dirty here.
2084 	 */
2085 	vmcb_mark_all_dirty(svm->vmcb);
2086 
2087 	/*
2088 	 * While the nested guest CR3 is already checked and set by
2089 	 * KVM_SET_SREGS, it was set when nested state was yet loaded,
2090 	 * thus MMU might not be initialized correctly.
2091 	 * Set it again to fix this.
2092 	 */
2093 	ret = nested_svm_load_cr3(&svm->vcpu, vcpu->arch.cr3,
2094 				  nested_npt_enabled(svm), false);
2095 	if (ret)
2096 		goto out_free;
2097 
2098 	svm->nested.force_msr_bitmap_recalc = true;
2099 
2100 	if (kvm_vcpu_apicv_active(vcpu))
2101 		kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
2102 
2103 	kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
2104 	ret = 0;
2105 out_free:
2106 	kfree(save);
2107 	kfree(ctl);
2108 
2109 	return ret;
2110 }
2111 
2112 static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu)
2113 {
2114 	if (WARN_ON(!is_guest_mode(vcpu)))
2115 		return true;
2116 
2117 	if (is_pae_paging(vcpu)) {
2118 		/*
2119 		 * After migration, CR3 may have been restored before
2120 		 * KVM_SET_NESTED_STATE, so the PDPTR load into mmu->pdptrs[]
2121 		 * may have treated CR3 as an L1 GPA. For nNPT, drop the
2122 		 * cache so the next access reloads them with the proper
2123 		 * nGPA translation. For !nNPT, reload eagerly unless userspace
2124 		 * already supplied authoritative PDPTRs via KVM_SET_SREGS2.
2125 		 */
2126 		if (nested_npt_enabled(to_svm(vcpu)))
2127 			kvm_register_mark_for_reload(vcpu, VCPU_REG_PDPTR);
2128 		else if (!vcpu->arch.pdptrs_from_userspace &&
2129 			 CC(!load_pdptrs(vcpu, vcpu->arch.cr3)))
2130 			return false;
2131 	}
2132 
2133 	if (!nested_svm_merge_msrpm(vcpu)) {
2134 		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
2135 		vcpu->run->internal.suberror =
2136 			KVM_INTERNAL_ERROR_EMULATION;
2137 		vcpu->run->internal.ndata = 0;
2138 		return false;
2139 	}
2140 
2141 	if (kvm_hv_verify_vp_assist(vcpu))
2142 		return false;
2143 
2144 	return true;
2145 }
2146 
2147 static gpa_t svm_translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa,
2148 				      u64 access,
2149 				      struct x86_exception *exception,
2150 				      u64 pte_access)
2151 {
2152 	struct vcpu_svm *svm = to_svm(vcpu);
2153 	struct kvm_mmu *mmu = vcpu->arch.mmu;
2154 
2155 	BUG_ON(!mmu_is_nested(vcpu));
2156 
2157 	/* Non-GMET walks are always user-walks */
2158 	if (!(svm->nested.ctl.misc_ctl & SVM_MISC_ENABLE_GMET))
2159 		access |= PFERR_USER_MASK;
2160 
2161 	return mmu->gva_to_gpa(vcpu, mmu, gpa, access, exception);
2162 }
2163 
2164 struct kvm_x86_nested_ops svm_nested_ops = {
2165 	.leave_nested = svm_leave_nested,
2166 	.translate_nested_gpa = svm_translate_nested_gpa,
2167 	.is_exception_vmexit = nested_svm_is_exception_vmexit,
2168 	.check_events = svm_check_nested_events,
2169 	.triple_fault = nested_svm_triple_fault,
2170 	.get_nested_state_pages = svm_get_nested_state_pages,
2171 	.get_state = svm_get_nested_state,
2172 	.set_state = svm_set_nested_state,
2173 	.hv_inject_synthetic_vmexit_post_tlb_flush = svm_hv_inject_synthetic_vmexit_post_tlb_flush,
2174 };
2175