1 // SPDX-License-Identifier: GPL-2.0 2 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 3 4 #include <linux/objtool.h> 5 #include <linux/percpu.h> 6 7 #include <asm/debugreg.h> 8 #include <asm/mmu_context.h> 9 10 #include "x86.h" 11 #include "cpuid.h" 12 #include "hyperv.h" 13 #include "mmu.h" 14 #include "nested.h" 15 #include "pmu.h" 16 #include "posted_intr.h" 17 #include "sgx.h" 18 #include "trace.h" 19 #include "vmx.h" 20 #include "smm.h" 21 22 static bool __read_mostly enable_shadow_vmcs = 1; 23 module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO); 24 25 static bool __read_mostly nested_early_check = 0; 26 module_param(nested_early_check, bool, S_IRUGO); 27 28 #define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK 29 30 /* 31 * Hyper-V requires all of these, so mark them as supported even though 32 * they are just treated the same as all-context. 33 */ 34 #define VMX_VPID_EXTENT_SUPPORTED_MASK \ 35 (VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT | \ 36 VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT | \ 37 VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT | \ 38 VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT) 39 40 #define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5 41 42 enum { 43 VMX_VMREAD_BITMAP, 44 VMX_VMWRITE_BITMAP, 45 VMX_BITMAP_NR 46 }; 47 static unsigned long *vmx_bitmap[VMX_BITMAP_NR]; 48 49 #define vmx_vmread_bitmap (vmx_bitmap[VMX_VMREAD_BITMAP]) 50 #define vmx_vmwrite_bitmap (vmx_bitmap[VMX_VMWRITE_BITMAP]) 51 52 struct shadow_vmcs_field { 53 u16 encoding; 54 u16 offset; 55 }; 56 static struct shadow_vmcs_field shadow_read_only_fields[] = { 57 #define SHADOW_FIELD_RO(x, y) { x, offsetof(struct vmcs12, y) }, 58 #include "vmcs_shadow_fields.h" 59 }; 60 static int max_shadow_read_only_fields = 61 ARRAY_SIZE(shadow_read_only_fields); 62 63 static struct shadow_vmcs_field shadow_read_write_fields[] = { 64 #define SHADOW_FIELD_RW(x, y) { x, offsetof(struct vmcs12, y) }, 65 #include "vmcs_shadow_fields.h" 66 }; 67 static int max_shadow_read_write_fields = 68 ARRAY_SIZE(shadow_read_write_fields); 69 init_vmcs_shadow_fields(void)70 static void init_vmcs_shadow_fields(void) 71 { 72 int i, j; 73 74 memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); 75 memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); 76 77 for (i = j = 0; i < max_shadow_read_only_fields; i++) { 78 struct shadow_vmcs_field entry = shadow_read_only_fields[i]; 79 u16 field = entry.encoding; 80 81 if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 && 82 (i + 1 == max_shadow_read_only_fields || 83 shadow_read_only_fields[i + 1].encoding != field + 1)) 84 pr_err("Missing field from shadow_read_only_field %x\n", 85 field + 1); 86 87 clear_bit(field, vmx_vmread_bitmap); 88 if (field & 1) 89 #ifdef CONFIG_X86_64 90 continue; 91 #else 92 entry.offset += sizeof(u32); 93 #endif 94 shadow_read_only_fields[j++] = entry; 95 } 96 max_shadow_read_only_fields = j; 97 98 for (i = j = 0; i < max_shadow_read_write_fields; i++) { 99 struct shadow_vmcs_field entry = shadow_read_write_fields[i]; 100 u16 field = entry.encoding; 101 102 if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 && 103 (i + 1 == max_shadow_read_write_fields || 104 shadow_read_write_fields[i + 1].encoding != field + 1)) 105 pr_err("Missing field from shadow_read_write_field %x\n", 106 field + 1); 107 108 WARN_ONCE(field >= GUEST_ES_AR_BYTES && 109 field <= GUEST_TR_AR_BYTES, 110 "Update vmcs12_write_any() to drop reserved bits from AR_BYTES"); 111 112 /* 113 * PML and the preemption timer can be emulated, but the 114 * processor cannot vmwrite to fields that don't exist 115 * on bare metal. 
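		 *
		 * Hence, when the underlying CPU lacks the feature, the field
		 * is simply not shadowed (its bit stays set in the VMREAD/
		 * VMWRITE bitmaps below), so L1's accesses to it trap to L0
		 * and are emulated against the cached vmcs12 instead.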
116 */ 117 switch (field) { 118 case GUEST_PML_INDEX: 119 if (!cpu_has_vmx_pml()) 120 continue; 121 break; 122 case VMX_PREEMPTION_TIMER_VALUE: 123 if (!cpu_has_vmx_preemption_timer()) 124 continue; 125 break; 126 case GUEST_INTR_STATUS: 127 if (!cpu_has_vmx_apicv()) 128 continue; 129 break; 130 default: 131 break; 132 } 133 134 clear_bit(field, vmx_vmwrite_bitmap); 135 clear_bit(field, vmx_vmread_bitmap); 136 if (field & 1) 137 #ifdef CONFIG_X86_64 138 continue; 139 #else 140 entry.offset += sizeof(u32); 141 #endif 142 shadow_read_write_fields[j++] = entry; 143 } 144 max_shadow_read_write_fields = j; 145 } 146 147 /* 148 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(), 149 * set the success or error code of an emulated VMX instruction (as specified 150 * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated 151 * instruction. 152 */ nested_vmx_succeed(struct kvm_vcpu * vcpu)153 static int nested_vmx_succeed(struct kvm_vcpu *vcpu) 154 { 155 vmx_set_rflags(vcpu, vmx_get_rflags(vcpu) 156 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | 157 X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF)); 158 return kvm_skip_emulated_instruction(vcpu); 159 } 160 nested_vmx_failInvalid(struct kvm_vcpu * vcpu)161 static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu) 162 { 163 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) 164 & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF | 165 X86_EFLAGS_SF | X86_EFLAGS_OF)) 166 | X86_EFLAGS_CF); 167 return kvm_skip_emulated_instruction(vcpu); 168 } 169 nested_vmx_failValid(struct kvm_vcpu * vcpu,u32 vm_instruction_error)170 static int nested_vmx_failValid(struct kvm_vcpu *vcpu, 171 u32 vm_instruction_error) 172 { 173 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) 174 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | 175 X86_EFLAGS_SF | X86_EFLAGS_OF)) 176 | X86_EFLAGS_ZF); 177 get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error; 178 /* 179 * We don't need to force sync to shadow VMCS because 180 * VM_INSTRUCTION_ERROR is not shadowed. Enlightened VMCS 'shadows' all 181 * fields and thus must be synced. 182 */ 183 if (nested_vmx_is_evmptr12_set(to_vmx(vcpu))) 184 to_vmx(vcpu)->nested.need_vmcs12_to_shadow_sync = true; 185 186 return kvm_skip_emulated_instruction(vcpu); 187 } 188 nested_vmx_fail(struct kvm_vcpu * vcpu,u32 vm_instruction_error)189 static int nested_vmx_fail(struct kvm_vcpu *vcpu, u32 vm_instruction_error) 190 { 191 struct vcpu_vmx *vmx = to_vmx(vcpu); 192 193 /* 194 * failValid writes the error number to the current VMCS, which 195 * can't be done if there isn't a current VMCS. 196 */ 197 if (vmx->nested.current_vmptr == INVALID_GPA && 198 !nested_vmx_is_evmptr12_valid(vmx)) 199 return nested_vmx_failInvalid(vcpu); 200 201 return nested_vmx_failValid(vcpu, vm_instruction_error); 202 } 203 nested_vmx_abort(struct kvm_vcpu * vcpu,u32 indicator)204 static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator) 205 { 206 /* TODO: not to reset guest simply here. 
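	 * (Currently the abort is emulated by requesting a triple fault,
	 * i.e. the guest is reset, and the abort indicator is only logged
	 * rather than being reported to L1.)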
	 */
	kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
	pr_debug_ratelimited("nested vmx abort, indicator %d\n", indicator);
}

static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
{
	return fixed_bits_valid(control, low, high);
}

static inline u64 vmx_control_msr(u32 low, u32 high)
{
	return low | ((u64)high << 32);
}

static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
{
	secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
	vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA);
	vmx->nested.need_vmcs12_to_shadow_sync = false;
}

static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
{
#ifdef CONFIG_KVM_HYPERV
	struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map);
	vmx->nested.hv_evmcs = NULL;
	vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;

	if (hv_vcpu) {
		hv_vcpu->nested.pa_page_gpa = INVALID_GPA;
		hv_vcpu->nested.vm_id = 0;
		hv_vcpu->nested.vp_id = 0;
	}
#endif
}

static bool nested_evmcs_handle_vmclear(struct kvm_vcpu *vcpu, gpa_t vmptr)
{
#ifdef CONFIG_KVM_HYPERV
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	/*
	 * When Enlightened VMEntry is enabled on the calling CPU, we treat
	 * the memory area pointed to by vmptr as an Enlightened VMCS (as
	 * there's no good way to distinguish it from a VMCS12) and we must
	 * not corrupt it by writing to the non-existent 'launch_state' field.
	 * The area doesn't have to be the currently active EVMCS on the
	 * calling CPU and there's nothing KVM has to do to transition it from
	 * 'active' to 'non-active' state.  It is possible that the area will
	 * stay mapped as vmx->nested.hv_evmcs but this shouldn't be a problem.
259 */ 260 if (!guest_cpu_cap_has_evmcs(vcpu) || 261 !evmptr_is_valid(nested_get_evmptr(vcpu))) 262 return false; 263 264 if (nested_vmx_evmcs(vmx) && vmptr == vmx->nested.hv_evmcs_vmptr) 265 nested_release_evmcs(vcpu); 266 267 return true; 268 #else 269 return false; 270 #endif 271 } 272 vmx_sync_vmcs_host_state(struct vcpu_vmx * vmx,struct loaded_vmcs * prev)273 static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx, 274 struct loaded_vmcs *prev) 275 { 276 struct vmcs_host_state *dest, *src; 277 278 if (unlikely(!vmx->guest_state_loaded)) 279 return; 280 281 src = &prev->host_state; 282 dest = &vmx->loaded_vmcs->host_state; 283 284 vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base); 285 dest->ldt_sel = src->ldt_sel; 286 #ifdef CONFIG_X86_64 287 dest->ds_sel = src->ds_sel; 288 dest->es_sel = src->es_sel; 289 #endif 290 } 291 vmx_switch_vmcs(struct kvm_vcpu * vcpu,struct loaded_vmcs * vmcs)292 static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) 293 { 294 struct vcpu_vmx *vmx = to_vmx(vcpu); 295 struct loaded_vmcs *prev; 296 int cpu; 297 298 if (WARN_ON_ONCE(vmx->loaded_vmcs == vmcs)) 299 return; 300 301 cpu = get_cpu(); 302 prev = vmx->loaded_vmcs; 303 vmx->loaded_vmcs = vmcs; 304 vmx_vcpu_load_vmcs(vcpu, cpu, prev); 305 vmx_sync_vmcs_host_state(vmx, prev); 306 put_cpu(); 307 308 vcpu->arch.regs_avail = ~VMX_REGS_LAZY_LOAD_SET; 309 310 /* 311 * All lazily updated registers will be reloaded from VMCS12 on both 312 * vmentry and vmexit. 313 */ 314 vcpu->arch.regs_dirty = 0; 315 } 316 nested_put_vmcs12_pages(struct kvm_vcpu * vcpu)317 static void nested_put_vmcs12_pages(struct kvm_vcpu *vcpu) 318 { 319 struct vcpu_vmx *vmx = to_vmx(vcpu); 320 321 kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map); 322 kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map); 323 kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map); 324 vmx->nested.pi_desc = NULL; 325 } 326 327 /* 328 * Free whatever needs to be freed from vmx->nested when L1 goes down, or 329 * just stops using VMX. 330 */ free_nested(struct kvm_vcpu * vcpu)331 static void free_nested(struct kvm_vcpu *vcpu) 332 { 333 struct vcpu_vmx *vmx = to_vmx(vcpu); 334 335 if (WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01)) 336 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 337 338 if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon) 339 return; 340 341 kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); 342 343 vmx->nested.vmxon = false; 344 vmx->nested.smm.vmxon = false; 345 vmx->nested.vmxon_ptr = INVALID_GPA; 346 free_vpid(vmx->nested.vpid02); 347 vmx->nested.posted_intr_nv = -1; 348 vmx->nested.current_vmptr = INVALID_GPA; 349 if (enable_shadow_vmcs) { 350 vmx_disable_shadow_vmcs(vmx); 351 vmcs_clear(vmx->vmcs01.shadow_vmcs); 352 free_vmcs(vmx->vmcs01.shadow_vmcs); 353 vmx->vmcs01.shadow_vmcs = NULL; 354 } 355 kfree(vmx->nested.cached_vmcs12); 356 vmx->nested.cached_vmcs12 = NULL; 357 kfree(vmx->nested.cached_shadow_vmcs12); 358 vmx->nested.cached_shadow_vmcs12 = NULL; 359 360 nested_put_vmcs12_pages(vcpu); 361 362 kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); 363 364 nested_release_evmcs(vcpu); 365 366 free_loaded_vmcs(&vmx->nested.vmcs02); 367 } 368 369 /* 370 * Ensure that the current vmcs of the logical processor is the 371 * vmcs01 of the vcpu before calling free_nested(). 
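 * free_nested() frees vmcs02, so it must not be reached while vmcs02 is
 * still the loaded VMCS; vmx_leave_nested() below forces a nested VM-exit
 * (and thus a switch back to vmcs01) if the vCPU is still in guest mode.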
372 */ nested_vmx_free_vcpu(struct kvm_vcpu * vcpu)373 void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu) 374 { 375 vcpu_load(vcpu); 376 vmx_leave_nested(vcpu); 377 vcpu_put(vcpu); 378 } 379 380 #define EPTP_PA_MASK GENMASK_ULL(51, 12) 381 nested_ept_root_matches(hpa_t root_hpa,u64 root_eptp,u64 eptp)382 static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp) 383 { 384 return VALID_PAGE(root_hpa) && 385 ((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK)); 386 } 387 nested_ept_invalidate_addr(struct kvm_vcpu * vcpu,gpa_t eptp,gpa_t addr)388 static void nested_ept_invalidate_addr(struct kvm_vcpu *vcpu, gpa_t eptp, 389 gpa_t addr) 390 { 391 unsigned long roots = 0; 392 uint i; 393 struct kvm_mmu_root_info *cached_root; 394 395 WARN_ON_ONCE(!mmu_is_nested(vcpu)); 396 397 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { 398 cached_root = &vcpu->arch.mmu->prev_roots[i]; 399 400 if (nested_ept_root_matches(cached_root->hpa, cached_root->pgd, 401 eptp)) 402 roots |= KVM_MMU_ROOT_PREVIOUS(i); 403 } 404 if (roots) 405 kvm_mmu_invalidate_addr(vcpu, vcpu->arch.mmu, addr, roots); 406 } 407 nested_ept_inject_page_fault(struct kvm_vcpu * vcpu,struct x86_exception * fault)408 static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, 409 struct x86_exception *fault) 410 { 411 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 412 struct vcpu_vmx *vmx = to_vmx(vcpu); 413 unsigned long exit_qualification; 414 u32 vm_exit_reason; 415 416 if (vmx->nested.pml_full) { 417 vm_exit_reason = EXIT_REASON_PML_FULL; 418 vmx->nested.pml_full = false; 419 420 /* 421 * It should be impossible to trigger a nested PML Full VM-Exit 422 * for anything other than an EPT Violation from L2. KVM *can* 423 * trigger nEPT page fault injection in response to an EPT 424 * Misconfig, e.g. if the MMIO SPTE was stale and L1's EPT 425 * tables also changed, but KVM should not treat EPT Misconfig 426 * VM-Exits as writes. 427 */ 428 WARN_ON_ONCE(vmx->exit_reason.basic != EXIT_REASON_EPT_VIOLATION); 429 430 /* 431 * PML Full and EPT Violation VM-Exits both use bit 12 to report 432 * "NMI unblocking due to IRET", i.e. the bit can be propagated 433 * as-is from the original EXIT_QUALIFICATION. 434 */ 435 exit_qualification = vmx_get_exit_qual(vcpu) & INTR_INFO_UNBLOCK_NMI; 436 } else { 437 if (fault->error_code & PFERR_RSVD_MASK) { 438 vm_exit_reason = EXIT_REASON_EPT_MISCONFIG; 439 exit_qualification = 0; 440 } else { 441 exit_qualification = fault->exit_qualification; 442 exit_qualification |= vmx_get_exit_qual(vcpu) & 443 (EPT_VIOLATION_GVA_IS_VALID | 444 EPT_VIOLATION_GVA_TRANSLATED); 445 vm_exit_reason = EXIT_REASON_EPT_VIOLATION; 446 } 447 448 /* 449 * Although the caller (kvm_inject_emulated_page_fault) would 450 * have already synced the faulting address in the shadow EPT 451 * tables for the current EPTP12, we also need to sync it for 452 * any other cached EPTP02s based on the same EP4TA, since the 453 * TLB associates mappings to the EP4TA rather than the full EPTP. 
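		 * (The EP4TA is the EPT root address portion of the EPTP,
		 * i.e. bits 51:12, matching EPTP_PA_MASK above; attribute
		 * bits such as the memory type and A/D-enable are not part
		 * of the TLB tag.)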
		 */
		nested_ept_invalidate_addr(vcpu, vmcs12->ept_pointer,
					   fault->address);
	}

	nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification);
	vmcs12->guest_physical_address = fault->address;
}

static void nested_ept_new_eptp(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	bool execonly = vmx->nested.msrs.ept_caps & VMX_EPT_EXECUTE_ONLY_BIT;
	int ept_lpage_level = ept_caps_to_lpage_level(vmx->nested.msrs.ept_caps);

	kvm_init_shadow_ept_mmu(vcpu, execonly, ept_lpage_level,
				nested_ept_ad_enabled(vcpu),
				nested_ept_get_eptp(vcpu));
}

static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
{
	WARN_ON(mmu_is_nested(vcpu));

	vcpu->arch.mmu = &vcpu->arch.guest_mmu;
	nested_ept_new_eptp(vcpu);
	vcpu->arch.mmu->get_guest_pgd = nested_ept_get_eptp;
	vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault;
	vcpu->arch.mmu->get_pdptr = kvm_pdptr_read;

	vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
}

static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
{
	vcpu->arch.mmu = &vcpu->arch.root_mmu;
	vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
}

static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
					    u16 error_code)
{
	bool inequality, bit;

	bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0;
	inequality =
		(error_code & vmcs12->page_fault_error_code_mask) !=
		 vmcs12->page_fault_error_code_match;
	return inequality ^ bit;
}

static bool nested_vmx_is_exception_vmexit(struct kvm_vcpu *vcpu, u8 vector,
					   u32 error_code)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	/*
	 * Drop bits 31:16 of the error code when performing the #PF mask+match
	 * check.  All VMCS fields involved are 32 bits, but Intel CPUs never
	 * set bits 31:16 and VMX disallows setting bits 31:16 in the injected
	 * error code.  Including the to-be-dropped bits in the check might
	 * result in an "impossible" or missed exit from L1's perspective.
516 */ 517 if (vector == PF_VECTOR) 518 return nested_vmx_is_page_fault_vmexit(vmcs12, (u16)error_code); 519 520 return (vmcs12->exception_bitmap & (1u << vector)); 521 } 522 nested_vmx_check_io_bitmap_controls(struct kvm_vcpu * vcpu,struct vmcs12 * vmcs12)523 static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu, 524 struct vmcs12 *vmcs12) 525 { 526 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) 527 return 0; 528 529 if (CC(!page_address_valid(vcpu, vmcs12->io_bitmap_a)) || 530 CC(!page_address_valid(vcpu, vmcs12->io_bitmap_b))) 531 return -EINVAL; 532 533 return 0; 534 } 535 nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu * vcpu,struct vmcs12 * vmcs12)536 static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu, 537 struct vmcs12 *vmcs12) 538 { 539 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) 540 return 0; 541 542 if (CC(!page_address_valid(vcpu, vmcs12->msr_bitmap))) 543 return -EINVAL; 544 545 return 0; 546 } 547 nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu * vcpu,struct vmcs12 * vmcs12)548 static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu, 549 struct vmcs12 *vmcs12) 550 { 551 if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) 552 return 0; 553 554 if (CC(!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr))) 555 return -EINVAL; 556 557 return 0; 558 } 559 560 /* 561 * For x2APIC MSRs, ignore the vmcs01 bitmap. L1 can enable x2APIC without L1 562 * itself utilizing x2APIC. All MSRs were previously set to be intercepted, 563 * only the "disable intercept" case needs to be handled. 564 */ nested_vmx_disable_intercept_for_x2apic_msr(unsigned long * msr_bitmap_l1,unsigned long * msr_bitmap_l0,u32 msr,int type)565 static void nested_vmx_disable_intercept_for_x2apic_msr(unsigned long *msr_bitmap_l1, 566 unsigned long *msr_bitmap_l0, 567 u32 msr, int type) 568 { 569 if (type & MSR_TYPE_R && !vmx_test_msr_bitmap_read(msr_bitmap_l1, msr)) 570 vmx_clear_msr_bitmap_read(msr_bitmap_l0, msr); 571 572 if (type & MSR_TYPE_W && !vmx_test_msr_bitmap_write(msr_bitmap_l1, msr)) 573 vmx_clear_msr_bitmap_write(msr_bitmap_l0, msr); 574 } 575 enable_x2apic_msr_intercepts(unsigned long * msr_bitmap)576 static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap) 577 { 578 int msr; 579 580 for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { 581 unsigned word = msr / BITS_PER_LONG; 582 583 msr_bitmap[word] = ~0; 584 msr_bitmap[word + (0x800 / sizeof(long))] = ~0; 585 } 586 } 587 588 #define BUILD_NVMX_MSR_INTERCEPT_HELPER(rw) \ 589 static inline \ 590 void nested_vmx_set_msr_##rw##_intercept(struct vcpu_vmx *vmx, \ 591 unsigned long *msr_bitmap_l1, \ 592 unsigned long *msr_bitmap_l0, u32 msr) \ 593 { \ 594 if (vmx_test_msr_bitmap_##rw(vmx->vmcs01.msr_bitmap, msr) || \ 595 vmx_test_msr_bitmap_##rw(msr_bitmap_l1, msr)) \ 596 vmx_set_msr_bitmap_##rw(msr_bitmap_l0, msr); \ 597 else \ 598 vmx_clear_msr_bitmap_##rw(msr_bitmap_l0, msr); \ 599 } 600 BUILD_NVMX_MSR_INTERCEPT_HELPER(read) BUILD_NVMX_MSR_INTERCEPT_HELPER(write)601 BUILD_NVMX_MSR_INTERCEPT_HELPER(write) 602 603 static inline void nested_vmx_set_intercept_for_msr(struct vcpu_vmx *vmx, 604 unsigned long *msr_bitmap_l1, 605 unsigned long *msr_bitmap_l0, 606 u32 msr, int types) 607 { 608 if (types & MSR_TYPE_R) 609 nested_vmx_set_msr_read_intercept(vmx, msr_bitmap_l1, 610 msr_bitmap_l0, msr); 611 if (types & MSR_TYPE_W) 612 nested_vmx_set_msr_write_intercept(vmx, msr_bitmap_l1, 613 msr_bitmap_l0, msr); 614 } 615 616 /* 617 * Merge L0's and L1's MSR bitmap, 
return false to indicate that 618 * we do not use the hardware. 619 */ nested_vmx_prepare_msr_bitmap(struct kvm_vcpu * vcpu,struct vmcs12 * vmcs12)620 static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, 621 struct vmcs12 *vmcs12) 622 { 623 struct vcpu_vmx *vmx = to_vmx(vcpu); 624 int msr; 625 unsigned long *msr_bitmap_l1; 626 unsigned long *msr_bitmap_l0 = vmx->nested.vmcs02.msr_bitmap; 627 struct kvm_host_map map; 628 629 /* Nothing to do if the MSR bitmap is not in use. */ 630 if (!cpu_has_vmx_msr_bitmap() || 631 !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) 632 return false; 633 634 /* 635 * MSR bitmap update can be skipped when: 636 * - MSR bitmap for L1 hasn't changed. 637 * - Nested hypervisor (L1) is attempting to launch the same L2 as 638 * before. 639 * - Nested hypervisor (L1) has enabled 'Enlightened MSR Bitmap' feature 640 * and tells KVM (L0) there were no changes in MSR bitmap for L2. 641 */ 642 if (!vmx->nested.force_msr_bitmap_recalc) { 643 struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); 644 645 if (evmcs && evmcs->hv_enlightenments_control.msr_bitmap && 646 evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP) 647 return true; 648 } 649 650 if (kvm_vcpu_map_readonly(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), &map)) 651 return false; 652 653 msr_bitmap_l1 = (unsigned long *)map.hva; 654 655 /* 656 * To keep the control flow simple, pay eight 8-byte writes (sixteen 657 * 4-byte writes on 32-bit systems) up front to enable intercepts for 658 * the x2APIC MSR range and selectively toggle those relevant to L2. 659 */ 660 enable_x2apic_msr_intercepts(msr_bitmap_l0); 661 662 if (nested_cpu_has_virt_x2apic_mode(vmcs12)) { 663 if (nested_cpu_has_apic_reg_virt(vmcs12)) { 664 /* 665 * L0 need not intercept reads for MSRs between 0x800 666 * and 0x8ff, it just lets the processor take the value 667 * from the virtual-APIC page; take those 256 bits 668 * directly from the L1 bitmap. 669 */ 670 for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { 671 unsigned word = msr / BITS_PER_LONG; 672 673 msr_bitmap_l0[word] = msr_bitmap_l1[word]; 674 } 675 } 676 677 nested_vmx_disable_intercept_for_x2apic_msr( 678 msr_bitmap_l1, msr_bitmap_l0, 679 X2APIC_MSR(APIC_TASKPRI), 680 MSR_TYPE_R | MSR_TYPE_W); 681 682 if (nested_cpu_has_vid(vmcs12)) { 683 nested_vmx_disable_intercept_for_x2apic_msr( 684 msr_bitmap_l1, msr_bitmap_l0, 685 X2APIC_MSR(APIC_EOI), 686 MSR_TYPE_W); 687 nested_vmx_disable_intercept_for_x2apic_msr( 688 msr_bitmap_l1, msr_bitmap_l0, 689 X2APIC_MSR(APIC_SELF_IPI), 690 MSR_TYPE_W); 691 } 692 } 693 694 /* 695 * Always check vmcs01's bitmap to honor userspace MSR filters and any 696 * other runtime changes to vmcs01's bitmap, e.g. dynamic pass-through. 
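	 * In other words, for the MSRs handled below L2 only gets direct
	 * access when both vmcs01 (L0's own policy, including userspace MSR
	 * filters) and vmcs12 (L1's policy) have the intercept cleared; see
	 * BUILD_NVMX_MSR_INTERCEPT_HELPER above.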
697 */ 698 #ifdef CONFIG_X86_64 699 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 700 MSR_FS_BASE, MSR_TYPE_RW); 701 702 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 703 MSR_GS_BASE, MSR_TYPE_RW); 704 705 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 706 MSR_KERNEL_GS_BASE, MSR_TYPE_RW); 707 #endif 708 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 709 MSR_IA32_SPEC_CTRL, MSR_TYPE_RW); 710 711 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 712 MSR_IA32_PRED_CMD, MSR_TYPE_W); 713 714 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 715 MSR_IA32_FLUSH_CMD, MSR_TYPE_W); 716 717 kvm_vcpu_unmap(vcpu, &map); 718 719 vmx->nested.force_msr_bitmap_recalc = false; 720 721 return true; 722 } 723 nested_cache_shadow_vmcs12(struct kvm_vcpu * vcpu,struct vmcs12 * vmcs12)724 static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu, 725 struct vmcs12 *vmcs12) 726 { 727 struct vcpu_vmx *vmx = to_vmx(vcpu); 728 struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache; 729 730 if (!nested_cpu_has_shadow_vmcs(vmcs12) || 731 vmcs12->vmcs_link_pointer == INVALID_GPA) 732 return; 733 734 if (ghc->gpa != vmcs12->vmcs_link_pointer && 735 kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, 736 vmcs12->vmcs_link_pointer, VMCS12_SIZE)) 737 return; 738 739 kvm_read_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu), 740 VMCS12_SIZE); 741 } 742 nested_flush_cached_shadow_vmcs12(struct kvm_vcpu * vcpu,struct vmcs12 * vmcs12)743 static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu, 744 struct vmcs12 *vmcs12) 745 { 746 struct vcpu_vmx *vmx = to_vmx(vcpu); 747 struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache; 748 749 if (!nested_cpu_has_shadow_vmcs(vmcs12) || 750 vmcs12->vmcs_link_pointer == INVALID_GPA) 751 return; 752 753 if (ghc->gpa != vmcs12->vmcs_link_pointer && 754 kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, 755 vmcs12->vmcs_link_pointer, VMCS12_SIZE)) 756 return; 757 758 kvm_write_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu), 759 VMCS12_SIZE); 760 } 761 762 /* 763 * In nested virtualization, check if L1 has set 764 * VM_EXIT_ACK_INTR_ON_EXIT 765 */ nested_exit_intr_ack_set(struct kvm_vcpu * vcpu)766 static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu) 767 { 768 return get_vmcs12(vcpu)->vm_exit_controls & 769 VM_EXIT_ACK_INTR_ON_EXIT; 770 } 771 nested_vmx_check_apic_access_controls(struct kvm_vcpu * vcpu,struct vmcs12 * vmcs12)772 static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu, 773 struct vmcs12 *vmcs12) 774 { 775 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) && 776 CC(!page_address_valid(vcpu, vmcs12->apic_access_addr))) 777 return -EINVAL; 778 else 779 return 0; 780 } 781 nested_vmx_check_apicv_controls(struct kvm_vcpu * vcpu,struct vmcs12 * vmcs12)782 static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, 783 struct vmcs12 *vmcs12) 784 { 785 if (!nested_cpu_has_virt_x2apic_mode(vmcs12) && 786 !nested_cpu_has_apic_reg_virt(vmcs12) && 787 !nested_cpu_has_vid(vmcs12) && 788 !nested_cpu_has_posted_intr(vmcs12)) 789 return 0; 790 791 /* 792 * If virtualize x2apic mode is enabled, 793 * virtualize apic access must be disabled. 794 */ 795 if (CC(nested_cpu_has_virt_x2apic_mode(vmcs12) && 796 nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))) 797 return -EINVAL; 798 799 /* 800 * If virtual interrupt delivery is enabled, 801 * we must exit on external interrupts. 
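	 * (This mirrors the architectural VM-entry check: the "virtual-
	 * interrupt delivery" control requires the "external-interrupt
	 * exiting" pin-based control to be 1.)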
802 */ 803 if (CC(nested_cpu_has_vid(vmcs12) && !nested_exit_on_intr(vcpu))) 804 return -EINVAL; 805 806 /* 807 * bits 15:8 should be zero in posted_intr_nv, 808 * the descriptor address has been already checked 809 * in nested_get_vmcs12_pages. 810 * 811 * bits 5:0 of posted_intr_desc_addr should be zero. 812 */ 813 if (nested_cpu_has_posted_intr(vmcs12) && 814 (CC(!nested_cpu_has_vid(vmcs12)) || 815 CC(!nested_exit_intr_ack_set(vcpu)) || 816 CC((vmcs12->posted_intr_nv & 0xff00)) || 817 CC(!kvm_vcpu_is_legal_aligned_gpa(vcpu, vmcs12->posted_intr_desc_addr, 64)))) 818 return -EINVAL; 819 820 /* tpr shadow is needed by all apicv features. */ 821 if (CC(!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))) 822 return -EINVAL; 823 824 return 0; 825 } 826 nested_vmx_check_msr_switch(struct kvm_vcpu * vcpu,u32 count,u64 addr)827 static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, 828 u32 count, u64 addr) 829 { 830 if (count == 0) 831 return 0; 832 833 if (!kvm_vcpu_is_legal_aligned_gpa(vcpu, addr, 16) || 834 !kvm_vcpu_is_legal_gpa(vcpu, (addr + count * sizeof(struct vmx_msr_entry) - 1))) 835 return -EINVAL; 836 837 return 0; 838 } 839 nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu * vcpu,struct vmcs12 * vmcs12)840 static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu, 841 struct vmcs12 *vmcs12) 842 { 843 if (CC(nested_vmx_check_msr_switch(vcpu, 844 vmcs12->vm_exit_msr_load_count, 845 vmcs12->vm_exit_msr_load_addr)) || 846 CC(nested_vmx_check_msr_switch(vcpu, 847 vmcs12->vm_exit_msr_store_count, 848 vmcs12->vm_exit_msr_store_addr))) 849 return -EINVAL; 850 851 return 0; 852 } 853 nested_vmx_check_entry_msr_switch_controls(struct kvm_vcpu * vcpu,struct vmcs12 * vmcs12)854 static int nested_vmx_check_entry_msr_switch_controls(struct kvm_vcpu *vcpu, 855 struct vmcs12 *vmcs12) 856 { 857 if (CC(nested_vmx_check_msr_switch(vcpu, 858 vmcs12->vm_entry_msr_load_count, 859 vmcs12->vm_entry_msr_load_addr))) 860 return -EINVAL; 861 862 return 0; 863 } 864 nested_vmx_check_pml_controls(struct kvm_vcpu * vcpu,struct vmcs12 * vmcs12)865 static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu, 866 struct vmcs12 *vmcs12) 867 { 868 if (!nested_cpu_has_pml(vmcs12)) 869 return 0; 870 871 if (CC(!nested_cpu_has_ept(vmcs12)) || 872 CC(!page_address_valid(vcpu, vmcs12->pml_address))) 873 return -EINVAL; 874 875 return 0; 876 } 877 nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu * vcpu,struct vmcs12 * vmcs12)878 static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu, 879 struct vmcs12 *vmcs12) 880 { 881 if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) && 882 !nested_cpu_has_ept(vmcs12))) 883 return -EINVAL; 884 return 0; 885 } 886 nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu * vcpu,struct vmcs12 * vmcs12)887 static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu, 888 struct vmcs12 *vmcs12) 889 { 890 if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) && 891 !nested_cpu_has_ept(vmcs12))) 892 return -EINVAL; 893 return 0; 894 } 895 nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu * vcpu,struct vmcs12 * vmcs12)896 static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu, 897 struct vmcs12 *vmcs12) 898 { 899 if (!nested_cpu_has_shadow_vmcs(vmcs12)) 900 return 0; 901 902 if (CC(!page_address_valid(vcpu, vmcs12->vmread_bitmap)) || 903 CC(!page_address_valid(vcpu, vmcs12->vmwrite_bitmap))) 904 return -EINVAL; 905 906 return 0; 907 } 908 
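
/*
 * Each entry in a VM-entry/VM-exit MSR load/store list is a 16-byte
 * vmx_msr_entry: a 32-bit MSR index, 32 reserved bits that must be zero,
 * and the 64-bit MSR data.  The helpers below validate individual entries;
 * the reserved-bits check in nested_vmx_msr_check_common() is what rejects
 * malformed entries.
 */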
nested_vmx_msr_check_common(struct kvm_vcpu * vcpu,struct vmx_msr_entry * e)909 static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu, 910 struct vmx_msr_entry *e) 911 { 912 /* x2APIC MSR accesses are not allowed */ 913 if (CC(vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8)) 914 return -EINVAL; 915 if (CC(e->index == MSR_IA32_UCODE_WRITE) || /* SDM Table 35-2 */ 916 CC(e->index == MSR_IA32_UCODE_REV)) 917 return -EINVAL; 918 if (CC(e->reserved != 0)) 919 return -EINVAL; 920 return 0; 921 } 922 nested_vmx_load_msr_check(struct kvm_vcpu * vcpu,struct vmx_msr_entry * e)923 static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu, 924 struct vmx_msr_entry *e) 925 { 926 if (CC(e->index == MSR_FS_BASE) || 927 CC(e->index == MSR_GS_BASE) || 928 CC(e->index == MSR_IA32_SMM_MONITOR_CTL) || /* SMM is not supported */ 929 nested_vmx_msr_check_common(vcpu, e)) 930 return -EINVAL; 931 return 0; 932 } 933 nested_vmx_store_msr_check(struct kvm_vcpu * vcpu,struct vmx_msr_entry * e)934 static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu, 935 struct vmx_msr_entry *e) 936 { 937 if (CC(e->index == MSR_IA32_SMBASE) || /* SMM is not supported */ 938 nested_vmx_msr_check_common(vcpu, e)) 939 return -EINVAL; 940 return 0; 941 } 942 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu * vcpu)943 static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu) 944 { 945 struct vcpu_vmx *vmx = to_vmx(vcpu); 946 u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low, 947 vmx->nested.msrs.misc_high); 948 949 return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER; 950 } 951 952 /* 953 * Load guest's/host's msr at nested entry/exit. 954 * return 0 for success, entry index for failure. 955 * 956 * One of the failure modes for MSR load/store is when a list exceeds the 957 * virtual hardware's capacity. To maintain compatibility with hardware inasmuch 958 * as possible, process all valid entries before failing rather than precheck 959 * for a capacity violation. 960 */ nested_vmx_load_msr(struct kvm_vcpu * vcpu,u64 gpa,u32 count)961 static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) 962 { 963 u32 i; 964 struct vmx_msr_entry e; 965 u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu); 966 967 for (i = 0; i < count; i++) { 968 if (unlikely(i >= max_msr_list_size)) 969 goto fail; 970 971 if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e), 972 &e, sizeof(e))) { 973 pr_debug_ratelimited( 974 "%s cannot read MSR entry (%u, 0x%08llx)\n", 975 __func__, i, gpa + i * sizeof(e)); 976 goto fail; 977 } 978 if (nested_vmx_load_msr_check(vcpu, &e)) { 979 pr_debug_ratelimited( 980 "%s check failed (%u, 0x%x, 0x%x)\n", 981 __func__, i, e.index, e.reserved); 982 goto fail; 983 } 984 if (kvm_set_msr_with_filter(vcpu, e.index, e.value)) { 985 pr_debug_ratelimited( 986 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", 987 __func__, i, e.index, e.value); 988 goto fail; 989 } 990 } 991 return 0; 992 fail: 993 /* Note, max_msr_list_size is at most 4096, i.e. this can't wrap. */ 994 return i + 1; 995 } 996 nested_vmx_get_vmexit_msr_value(struct kvm_vcpu * vcpu,u32 msr_index,u64 * data)997 static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu, 998 u32 msr_index, 999 u64 *data) 1000 { 1001 struct vcpu_vmx *vmx = to_vmx(vcpu); 1002 1003 /* 1004 * If the L0 hypervisor stored a more accurate value for the TSC that 1005 * does not include the time taken for emulation of the L2->L1 1006 * VM-exit in L0, use the more accurate value. 
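	 * That value comes from the vmcs02 VM-exit MSR-store area
	 * (vmx->msr_autostore.guest), which the CPU filled at the actual
	 * L2->L1 VM-exit; kvm_read_l1_tsc() then converts it from a raw (L0)
	 * TSC reading into L1's view of the TSC.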
1007 */ 1008 if (msr_index == MSR_IA32_TSC) { 1009 int i = vmx_find_loadstore_msr_slot(&vmx->msr_autostore.guest, 1010 MSR_IA32_TSC); 1011 1012 if (i >= 0) { 1013 u64 val = vmx->msr_autostore.guest.val[i].value; 1014 1015 *data = kvm_read_l1_tsc(vcpu, val); 1016 return true; 1017 } 1018 } 1019 1020 if (kvm_get_msr_with_filter(vcpu, msr_index, data)) { 1021 pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__, 1022 msr_index); 1023 return false; 1024 } 1025 return true; 1026 } 1027 read_and_check_msr_entry(struct kvm_vcpu * vcpu,u64 gpa,int i,struct vmx_msr_entry * e)1028 static bool read_and_check_msr_entry(struct kvm_vcpu *vcpu, u64 gpa, int i, 1029 struct vmx_msr_entry *e) 1030 { 1031 if (kvm_vcpu_read_guest(vcpu, 1032 gpa + i * sizeof(*e), 1033 e, 2 * sizeof(u32))) { 1034 pr_debug_ratelimited( 1035 "%s cannot read MSR entry (%u, 0x%08llx)\n", 1036 __func__, i, gpa + i * sizeof(*e)); 1037 return false; 1038 } 1039 if (nested_vmx_store_msr_check(vcpu, e)) { 1040 pr_debug_ratelimited( 1041 "%s check failed (%u, 0x%x, 0x%x)\n", 1042 __func__, i, e->index, e->reserved); 1043 return false; 1044 } 1045 return true; 1046 } 1047 nested_vmx_store_msr(struct kvm_vcpu * vcpu,u64 gpa,u32 count)1048 static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) 1049 { 1050 u64 data; 1051 u32 i; 1052 struct vmx_msr_entry e; 1053 u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu); 1054 1055 for (i = 0; i < count; i++) { 1056 if (unlikely(i >= max_msr_list_size)) 1057 return -EINVAL; 1058 1059 if (!read_and_check_msr_entry(vcpu, gpa, i, &e)) 1060 return -EINVAL; 1061 1062 if (!nested_vmx_get_vmexit_msr_value(vcpu, e.index, &data)) 1063 return -EINVAL; 1064 1065 if (kvm_vcpu_write_guest(vcpu, 1066 gpa + i * sizeof(e) + 1067 offsetof(struct vmx_msr_entry, value), 1068 &data, sizeof(data))) { 1069 pr_debug_ratelimited( 1070 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", 1071 __func__, i, e.index, data); 1072 return -EINVAL; 1073 } 1074 } 1075 return 0; 1076 } 1077 nested_msr_store_list_has_msr(struct kvm_vcpu * vcpu,u32 msr_index)1078 static bool nested_msr_store_list_has_msr(struct kvm_vcpu *vcpu, u32 msr_index) 1079 { 1080 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 1081 u32 count = vmcs12->vm_exit_msr_store_count; 1082 u64 gpa = vmcs12->vm_exit_msr_store_addr; 1083 struct vmx_msr_entry e; 1084 u32 i; 1085 1086 for (i = 0; i < count; i++) { 1087 if (!read_and_check_msr_entry(vcpu, gpa, i, &e)) 1088 return false; 1089 1090 if (e.index == msr_index) 1091 return true; 1092 } 1093 return false; 1094 } 1095 prepare_vmx_msr_autostore_list(struct kvm_vcpu * vcpu,u32 msr_index)1096 static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu, 1097 u32 msr_index) 1098 { 1099 struct vcpu_vmx *vmx = to_vmx(vcpu); 1100 struct vmx_msrs *autostore = &vmx->msr_autostore.guest; 1101 bool in_vmcs12_store_list; 1102 int msr_autostore_slot; 1103 bool in_autostore_list; 1104 int last; 1105 1106 msr_autostore_slot = vmx_find_loadstore_msr_slot(autostore, msr_index); 1107 in_autostore_list = msr_autostore_slot >= 0; 1108 in_vmcs12_store_list = nested_msr_store_list_has_msr(vcpu, msr_index); 1109 1110 if (in_vmcs12_store_list && !in_autostore_list) { 1111 if (autostore->nr == MAX_NR_LOADSTORE_MSRS) { 1112 /* 1113 * Emulated VMEntry does not fail here. Instead a less 1114 * accurate value will be returned by 1115 * nested_vmx_get_vmexit_msr_value() by reading KVM's 1116 * internal MSR state instead of reading the value from 1117 * the vmcs02 VMExit MSR-store area. 
1118 */ 1119 pr_warn_ratelimited( 1120 "Not enough msr entries in msr_autostore. Can't add msr %x\n", 1121 msr_index); 1122 return; 1123 } 1124 last = autostore->nr++; 1125 autostore->val[last].index = msr_index; 1126 } else if (!in_vmcs12_store_list && in_autostore_list) { 1127 last = --autostore->nr; 1128 autostore->val[msr_autostore_slot] = autostore->val[last]; 1129 } 1130 } 1131 1132 /* 1133 * Load guest's/host's cr3 at nested entry/exit. @nested_ept is true if we are 1134 * emulating VM-Entry into a guest with EPT enabled. On failure, the expected 1135 * Exit Qualification (for a VM-Entry consistency check VM-Exit) is assigned to 1136 * @entry_failure_code. 1137 */ nested_vmx_load_cr3(struct kvm_vcpu * vcpu,unsigned long cr3,bool nested_ept,bool reload_pdptrs,enum vm_entry_failure_code * entry_failure_code)1138 static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, 1139 bool nested_ept, bool reload_pdptrs, 1140 enum vm_entry_failure_code *entry_failure_code) 1141 { 1142 if (CC(!kvm_vcpu_is_legal_cr3(vcpu, cr3))) { 1143 *entry_failure_code = ENTRY_FAIL_DEFAULT; 1144 return -EINVAL; 1145 } 1146 1147 /* 1148 * If PAE paging and EPT are both on, CR3 is not used by the CPU and 1149 * must not be dereferenced. 1150 */ 1151 if (reload_pdptrs && !nested_ept && is_pae_paging(vcpu) && 1152 CC(!load_pdptrs(vcpu, cr3))) { 1153 *entry_failure_code = ENTRY_FAIL_PDPTE; 1154 return -EINVAL; 1155 } 1156 1157 vcpu->arch.cr3 = cr3; 1158 kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3); 1159 1160 /* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */ 1161 kvm_init_mmu(vcpu); 1162 1163 if (!nested_ept) 1164 kvm_mmu_new_pgd(vcpu, cr3); 1165 1166 return 0; 1167 } 1168 1169 /* 1170 * Returns if KVM is able to config CPU to tag TLB entries 1171 * populated by L2 differently than TLB entries populated 1172 * by L1. 1173 * 1174 * If L0 uses EPT, L1 and L2 run with different EPTP because 1175 * guest_mode is part of kvm_mmu_page_role. Thus, TLB entries 1176 * are tagged with different EPTP. 1177 * 1178 * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged 1179 * with different VPID (L1 entries are tagged with vmx->vpid 1180 * while L2 entries are tagged with vmx->nested.vpid02). 1181 */ nested_has_guest_tlb_tag(struct kvm_vcpu * vcpu)1182 static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu) 1183 { 1184 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 1185 1186 return enable_ept || 1187 (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02); 1188 } 1189 nested_vmx_transition_tlb_flush(struct kvm_vcpu * vcpu,struct vmcs12 * vmcs12,bool is_vmenter)1190 static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu, 1191 struct vmcs12 *vmcs12, 1192 bool is_vmenter) 1193 { 1194 struct vcpu_vmx *vmx = to_vmx(vcpu); 1195 1196 /* Handle pending Hyper-V TLB flush requests */ 1197 kvm_hv_nested_transtion_tlb_flush(vcpu, enable_ept); 1198 1199 /* 1200 * If VPID is disabled, then guest TLB accesses use VPID=0, i.e. the 1201 * same VPID as the host, and so architecturally, linear and combined 1202 * mappings for VPID=0 must be flushed at VM-Enter and VM-Exit. KVM 1203 * emulates L2 sharing L1's VPID=0 by using vpid01 while running L2, 1204 * and so KVM must also emulate TLB flush of VPID=0, i.e. vpid01. This 1205 * is required if VPID is disabled in KVM, as a TLB flush (there are no 1206 * VPIDs) still occurs from L1's perspective, and KVM may need to 1207 * synchronize the MMU in response to the guest TLB flush. 
	 *
	 * Note, using TLB_FLUSH_GUEST is correct even if nested EPT is in use.
	 * EPT is a special snowflake, as guest-physical mappings aren't
	 * flushed on VPID invalidations, including VM-Enter or VM-Exit with
	 * VPID disabled.  As a result, KVM _never_ needs to sync nEPT
	 * entries on VM-Enter because L1 can't rely on VM-Enter to flush
	 * those mappings.
	 */
	if (!nested_cpu_has_vpid(vmcs12)) {
		kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
		return;
	}

	/* L2 should never have a VPID if VPID is disabled. */
	WARN_ON(!enable_vpid);

	/*
	 * VPID is enabled and in use by vmcs12.  If vpid12 is changing, then
	 * emulate a guest TLB flush as KVM does not track vpid12 history nor
	 * is the VPID incorporated into the MMU context.  I.e. KVM must assume
	 * that the new vpid12 has never been used and thus represents a new
	 * guest ASID that cannot have entries in the TLB.
	 */
	if (is_vmenter && vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
		vmx->nested.last_vpid = vmcs12->virtual_processor_id;
		kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
		return;
	}

	/*
	 * If VPID is enabled, used by vmcs12, and vpid12 is not changing but
	 * does not have a unique TLB tag (ASID), i.e. EPT is disabled and
	 * KVM was unable to allocate a VPID for L2, flush the current context
	 * as the effective ASID is common to both L1 and L2.
	 */
	if (!nested_has_guest_tlb_tag(vcpu))
		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
}

static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
{
	superset &= mask;
	subset &= mask;

	return (superset | subset) == superset;
}

static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
{
	const u64 feature_bits = VMX_BASIC_DUAL_MONITOR_TREATMENT |
				 VMX_BASIC_INOUT |
				 VMX_BASIC_TRUE_CTLS;

	const u64 reserved_bits = GENMASK_ULL(63, 56) |
				  GENMASK_ULL(47, 45) |
				  BIT_ULL(31);

	u64 vmx_basic = vmcs_config.nested.basic;

	BUILD_BUG_ON(feature_bits & reserved_bits);

	/*
	 * Except for 32BIT_PHYS_ADDR_ONLY, which is an anti-feature bit (has
	 * inverted polarity), the incoming value must not set feature bits or
	 * reserved bits that aren't allowed/supported by KVM.  Fields, i.e.
	 * multi-bit values, are explicitly checked below.
	 */
	if (!is_bitwise_subset(vmx_basic, data, feature_bits | reserved_bits))
		return -EINVAL;

	/*
	 * KVM does not emulate a version of VMX that constrains physical
	 * addresses of VMX structures (e.g. VMCS) to 32-bits.
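	 * (VMX_BASIC_32BIT_PHYS_ADDR_ONLY is bit 48 of IA32_VMX_BASIC; because
	 * it advertises a restriction rather than a feature, userspace may
	 * leave it clear but must not set it.)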
1281 */ 1282 if (data & VMX_BASIC_32BIT_PHYS_ADDR_ONLY) 1283 return -EINVAL; 1284 1285 if (vmx_basic_vmcs_revision_id(vmx_basic) != 1286 vmx_basic_vmcs_revision_id(data)) 1287 return -EINVAL; 1288 1289 if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data)) 1290 return -EINVAL; 1291 1292 vmx->nested.msrs.basic = data; 1293 return 0; 1294 } 1295 vmx_get_control_msr(struct nested_vmx_msrs * msrs,u32 msr_index,u32 ** low,u32 ** high)1296 static void vmx_get_control_msr(struct nested_vmx_msrs *msrs, u32 msr_index, 1297 u32 **low, u32 **high) 1298 { 1299 switch (msr_index) { 1300 case MSR_IA32_VMX_TRUE_PINBASED_CTLS: 1301 *low = &msrs->pinbased_ctls_low; 1302 *high = &msrs->pinbased_ctls_high; 1303 break; 1304 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: 1305 *low = &msrs->procbased_ctls_low; 1306 *high = &msrs->procbased_ctls_high; 1307 break; 1308 case MSR_IA32_VMX_TRUE_EXIT_CTLS: 1309 *low = &msrs->exit_ctls_low; 1310 *high = &msrs->exit_ctls_high; 1311 break; 1312 case MSR_IA32_VMX_TRUE_ENTRY_CTLS: 1313 *low = &msrs->entry_ctls_low; 1314 *high = &msrs->entry_ctls_high; 1315 break; 1316 case MSR_IA32_VMX_PROCBASED_CTLS2: 1317 *low = &msrs->secondary_ctls_low; 1318 *high = &msrs->secondary_ctls_high; 1319 break; 1320 default: 1321 BUG(); 1322 } 1323 } 1324 1325 static int vmx_restore_control_msr(struct vcpu_vmx * vmx,u32 msr_index,u64 data)1326 vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) 1327 { 1328 u32 *lowp, *highp; 1329 u64 supported; 1330 1331 vmx_get_control_msr(&vmcs_config.nested, msr_index, &lowp, &highp); 1332 1333 supported = vmx_control_msr(*lowp, *highp); 1334 1335 /* Check must-be-1 bits are still 1. */ 1336 if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0))) 1337 return -EINVAL; 1338 1339 /* Check must-be-0 bits are still 0. */ 1340 if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32))) 1341 return -EINVAL; 1342 1343 vmx_get_control_msr(&vmx->nested.msrs, msr_index, &lowp, &highp); 1344 *lowp = data; 1345 *highp = data >> 32; 1346 return 0; 1347 } 1348 vmx_restore_vmx_misc(struct vcpu_vmx * vmx,u64 data)1349 static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data) 1350 { 1351 const u64 feature_bits = VMX_MISC_SAVE_EFER_LMA | 1352 VMX_MISC_ACTIVITY_HLT | 1353 VMX_MISC_ACTIVITY_SHUTDOWN | 1354 VMX_MISC_ACTIVITY_WAIT_SIPI | 1355 VMX_MISC_INTEL_PT | 1356 VMX_MISC_RDMSR_IN_SMM | 1357 VMX_MISC_VMWRITE_SHADOW_RO_FIELDS | 1358 VMX_MISC_VMXOFF_BLOCK_SMI | 1359 VMX_MISC_ZERO_LEN_INS; 1360 1361 const u64 reserved_bits = BIT_ULL(31) | GENMASK_ULL(13, 9); 1362 1363 u64 vmx_misc = vmx_control_msr(vmcs_config.nested.misc_low, 1364 vmcs_config.nested.misc_high); 1365 1366 BUILD_BUG_ON(feature_bits & reserved_bits); 1367 1368 /* 1369 * The incoming value must not set feature bits or reserved bits that 1370 * aren't allowed/supported by KVM. Fields, i.e. multi-bit values, are 1371 * explicitly checked below. 
1372 */ 1373 if (!is_bitwise_subset(vmx_misc, data, feature_bits | reserved_bits)) 1374 return -EINVAL; 1375 1376 if ((vmx->nested.msrs.pinbased_ctls_high & 1377 PIN_BASED_VMX_PREEMPTION_TIMER) && 1378 vmx_misc_preemption_timer_rate(data) != 1379 vmx_misc_preemption_timer_rate(vmx_misc)) 1380 return -EINVAL; 1381 1382 if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc)) 1383 return -EINVAL; 1384 1385 if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc)) 1386 return -EINVAL; 1387 1388 if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc)) 1389 return -EINVAL; 1390 1391 vmx->nested.msrs.misc_low = data; 1392 vmx->nested.msrs.misc_high = data >> 32; 1393 1394 return 0; 1395 } 1396 vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx * vmx,u64 data)1397 static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data) 1398 { 1399 u64 vmx_ept_vpid_cap = vmx_control_msr(vmcs_config.nested.ept_caps, 1400 vmcs_config.nested.vpid_caps); 1401 1402 /* Every bit is either reserved or a feature bit. */ 1403 if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL)) 1404 return -EINVAL; 1405 1406 vmx->nested.msrs.ept_caps = data; 1407 vmx->nested.msrs.vpid_caps = data >> 32; 1408 return 0; 1409 } 1410 vmx_get_fixed0_msr(struct nested_vmx_msrs * msrs,u32 msr_index)1411 static u64 *vmx_get_fixed0_msr(struct nested_vmx_msrs *msrs, u32 msr_index) 1412 { 1413 switch (msr_index) { 1414 case MSR_IA32_VMX_CR0_FIXED0: 1415 return &msrs->cr0_fixed0; 1416 case MSR_IA32_VMX_CR4_FIXED0: 1417 return &msrs->cr4_fixed0; 1418 default: 1419 BUG(); 1420 } 1421 } 1422 vmx_restore_fixed0_msr(struct vcpu_vmx * vmx,u32 msr_index,u64 data)1423 static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) 1424 { 1425 const u64 *msr = vmx_get_fixed0_msr(&vmcs_config.nested, msr_index); 1426 1427 /* 1428 * 1 bits (which indicates bits which "must-be-1" during VMX operation) 1429 * must be 1 in the restored value. 1430 */ 1431 if (!is_bitwise_subset(data, *msr, -1ULL)) 1432 return -EINVAL; 1433 1434 *vmx_get_fixed0_msr(&vmx->nested.msrs, msr_index) = data; 1435 return 0; 1436 } 1437 1438 /* 1439 * Called when userspace is restoring VMX MSRs. 1440 * 1441 * Returns 0 on success, non-0 otherwise. 1442 */ vmx_set_vmx_msr(struct kvm_vcpu * vcpu,u32 msr_index,u64 data)1443 int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) 1444 { 1445 struct vcpu_vmx *vmx = to_vmx(vcpu); 1446 1447 /* 1448 * Don't allow changes to the VMX capability MSRs while the vCPU 1449 * is in VMX operation. 1450 */ 1451 if (vmx->nested.vmxon) 1452 return -EBUSY; 1453 1454 switch (msr_index) { 1455 case MSR_IA32_VMX_BASIC: 1456 return vmx_restore_vmx_basic(vmx, data); 1457 case MSR_IA32_VMX_PINBASED_CTLS: 1458 case MSR_IA32_VMX_PROCBASED_CTLS: 1459 case MSR_IA32_VMX_EXIT_CTLS: 1460 case MSR_IA32_VMX_ENTRY_CTLS: 1461 /* 1462 * The "non-true" VMX capability MSRs are generated from the 1463 * "true" MSRs, so we do not support restoring them directly. 1464 * 1465 * If userspace wants to emulate VMX_BASIC[55]=0, userspace 1466 * should restore the "true" MSRs with the must-be-1 bits 1467 * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND 1468 * DEFAULT SETTINGS". 
1469 */ 1470 return -EINVAL; 1471 case MSR_IA32_VMX_TRUE_PINBASED_CTLS: 1472 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: 1473 case MSR_IA32_VMX_TRUE_EXIT_CTLS: 1474 case MSR_IA32_VMX_TRUE_ENTRY_CTLS: 1475 case MSR_IA32_VMX_PROCBASED_CTLS2: 1476 return vmx_restore_control_msr(vmx, msr_index, data); 1477 case MSR_IA32_VMX_MISC: 1478 return vmx_restore_vmx_misc(vmx, data); 1479 case MSR_IA32_VMX_CR0_FIXED0: 1480 case MSR_IA32_VMX_CR4_FIXED0: 1481 return vmx_restore_fixed0_msr(vmx, msr_index, data); 1482 case MSR_IA32_VMX_CR0_FIXED1: 1483 case MSR_IA32_VMX_CR4_FIXED1: 1484 /* 1485 * These MSRs are generated based on the vCPU's CPUID, so we 1486 * do not support restoring them directly. 1487 */ 1488 return -EINVAL; 1489 case MSR_IA32_VMX_EPT_VPID_CAP: 1490 return vmx_restore_vmx_ept_vpid_cap(vmx, data); 1491 case MSR_IA32_VMX_VMCS_ENUM: 1492 vmx->nested.msrs.vmcs_enum = data; 1493 return 0; 1494 case MSR_IA32_VMX_VMFUNC: 1495 if (data & ~vmcs_config.nested.vmfunc_controls) 1496 return -EINVAL; 1497 vmx->nested.msrs.vmfunc_controls = data; 1498 return 0; 1499 default: 1500 /* 1501 * The rest of the VMX capability MSRs do not support restore. 1502 */ 1503 return -EINVAL; 1504 } 1505 } 1506 1507 /* Returns 0 on success, non-0 otherwise. */ vmx_get_vmx_msr(struct nested_vmx_msrs * msrs,u32 msr_index,u64 * pdata)1508 int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata) 1509 { 1510 switch (msr_index) { 1511 case MSR_IA32_VMX_BASIC: 1512 *pdata = msrs->basic; 1513 break; 1514 case MSR_IA32_VMX_TRUE_PINBASED_CTLS: 1515 case MSR_IA32_VMX_PINBASED_CTLS: 1516 *pdata = vmx_control_msr( 1517 msrs->pinbased_ctls_low, 1518 msrs->pinbased_ctls_high); 1519 if (msr_index == MSR_IA32_VMX_PINBASED_CTLS) 1520 *pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 1521 break; 1522 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: 1523 case MSR_IA32_VMX_PROCBASED_CTLS: 1524 *pdata = vmx_control_msr( 1525 msrs->procbased_ctls_low, 1526 msrs->procbased_ctls_high); 1527 if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS) 1528 *pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 1529 break; 1530 case MSR_IA32_VMX_TRUE_EXIT_CTLS: 1531 case MSR_IA32_VMX_EXIT_CTLS: 1532 *pdata = vmx_control_msr( 1533 msrs->exit_ctls_low, 1534 msrs->exit_ctls_high); 1535 if (msr_index == MSR_IA32_VMX_EXIT_CTLS) 1536 *pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; 1537 break; 1538 case MSR_IA32_VMX_TRUE_ENTRY_CTLS: 1539 case MSR_IA32_VMX_ENTRY_CTLS: 1540 *pdata = vmx_control_msr( 1541 msrs->entry_ctls_low, 1542 msrs->entry_ctls_high); 1543 if (msr_index == MSR_IA32_VMX_ENTRY_CTLS) 1544 *pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; 1545 break; 1546 case MSR_IA32_VMX_MISC: 1547 *pdata = vmx_control_msr( 1548 msrs->misc_low, 1549 msrs->misc_high); 1550 break; 1551 case MSR_IA32_VMX_CR0_FIXED0: 1552 *pdata = msrs->cr0_fixed0; 1553 break; 1554 case MSR_IA32_VMX_CR0_FIXED1: 1555 *pdata = msrs->cr0_fixed1; 1556 break; 1557 case MSR_IA32_VMX_CR4_FIXED0: 1558 *pdata = msrs->cr4_fixed0; 1559 break; 1560 case MSR_IA32_VMX_CR4_FIXED1: 1561 *pdata = msrs->cr4_fixed1; 1562 break; 1563 case MSR_IA32_VMX_VMCS_ENUM: 1564 *pdata = msrs->vmcs_enum; 1565 break; 1566 case MSR_IA32_VMX_PROCBASED_CTLS2: 1567 *pdata = vmx_control_msr( 1568 msrs->secondary_ctls_low, 1569 msrs->secondary_ctls_high); 1570 break; 1571 case MSR_IA32_VMX_EPT_VPID_CAP: 1572 *pdata = msrs->ept_caps | 1573 ((u64)msrs->vpid_caps << 32); 1574 break; 1575 case MSR_IA32_VMX_VMFUNC: 1576 *pdata = msrs->vmfunc_controls; 1577 break; 1578 default: 1579 return 1; 1580 } 1581 1582 return 0; 1583 } 1584 1585 /* 1586 
* Copy the writable VMCS shadow fields back to the VMCS12, in case they have 1587 * been modified by the L1 guest. Note, "writable" in this context means 1588 * "writable by the guest", i.e. tagged SHADOW_FIELD_RW; the set of 1589 * fields tagged SHADOW_FIELD_RO may or may not align with the "read-only" 1590 * VM-exit information fields (which are actually writable if the vCPU is 1591 * configured to support "VMWRITE to any supported field in the VMCS"). 1592 */ copy_shadow_to_vmcs12(struct vcpu_vmx * vmx)1593 static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) 1594 { 1595 struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; 1596 struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu); 1597 struct shadow_vmcs_field field; 1598 unsigned long val; 1599 int i; 1600 1601 if (WARN_ON(!shadow_vmcs)) 1602 return; 1603 1604 preempt_disable(); 1605 1606 vmcs_load(shadow_vmcs); 1607 1608 for (i = 0; i < max_shadow_read_write_fields; i++) { 1609 field = shadow_read_write_fields[i]; 1610 val = __vmcs_readl(field.encoding); 1611 vmcs12_write_any(vmcs12, field.encoding, field.offset, val); 1612 } 1613 1614 vmcs_clear(shadow_vmcs); 1615 vmcs_load(vmx->loaded_vmcs->vmcs); 1616 1617 preempt_enable(); 1618 } 1619 copy_vmcs12_to_shadow(struct vcpu_vmx * vmx)1620 static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) 1621 { 1622 const struct shadow_vmcs_field *fields[] = { 1623 shadow_read_write_fields, 1624 shadow_read_only_fields 1625 }; 1626 const int max_fields[] = { 1627 max_shadow_read_write_fields, 1628 max_shadow_read_only_fields 1629 }; 1630 struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; 1631 struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu); 1632 struct shadow_vmcs_field field; 1633 unsigned long val; 1634 int i, q; 1635 1636 if (WARN_ON(!shadow_vmcs)) 1637 return; 1638 1639 vmcs_load(shadow_vmcs); 1640 1641 for (q = 0; q < ARRAY_SIZE(fields); q++) { 1642 for (i = 0; i < max_fields[q]; i++) { 1643 field = fields[q][i]; 1644 val = vmcs12_read_any(vmcs12, field.encoding, 1645 field.offset); 1646 __vmcs_writel(field.encoding, val); 1647 } 1648 } 1649 1650 vmcs_clear(shadow_vmcs); 1651 vmcs_load(vmx->loaded_vmcs->vmcs); 1652 } 1653 copy_enlightened_to_vmcs12(struct vcpu_vmx * vmx,u32 hv_clean_fields)1654 static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields) 1655 { 1656 #ifdef CONFIG_KVM_HYPERV 1657 struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; 1658 struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); 1659 struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(&vmx->vcpu); 1660 1661 /* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */ 1662 vmcs12->tpr_threshold = evmcs->tpr_threshold; 1663 vmcs12->guest_rip = evmcs->guest_rip; 1664 1665 if (unlikely(!(hv_clean_fields & 1666 HV_VMX_ENLIGHTENED_CLEAN_FIELD_ENLIGHTENMENTSCONTROL))) { 1667 hv_vcpu->nested.pa_page_gpa = evmcs->partition_assist_page; 1668 hv_vcpu->nested.vm_id = evmcs->hv_vm_id; 1669 hv_vcpu->nested.vp_id = evmcs->hv_vp_id; 1670 } 1671 1672 if (unlikely(!(hv_clean_fields & 1673 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) { 1674 vmcs12->guest_rsp = evmcs->guest_rsp; 1675 vmcs12->guest_rflags = evmcs->guest_rflags; 1676 vmcs12->guest_interruptibility_info = 1677 evmcs->guest_interruptibility_info; 1678 /* 1679 * Not present in struct vmcs12: 1680 * vmcs12->guest_ssp = evmcs->guest_ssp; 1681 */ 1682 } 1683 1684 if (unlikely(!(hv_clean_fields & 1685 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) { 1686 vmcs12->cpu_based_vm_exec_control = 1687 evmcs->cpu_based_vm_exec_control; 1688 } 1689 1690 if (unlikely(!(hv_clean_fields & 1691 
HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN))) { 1692 vmcs12->exception_bitmap = evmcs->exception_bitmap; 1693 } 1694 1695 if (unlikely(!(hv_clean_fields & 1696 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) { 1697 vmcs12->vm_entry_controls = evmcs->vm_entry_controls; 1698 } 1699 1700 if (unlikely(!(hv_clean_fields & 1701 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) { 1702 vmcs12->vm_entry_intr_info_field = 1703 evmcs->vm_entry_intr_info_field; 1704 vmcs12->vm_entry_exception_error_code = 1705 evmcs->vm_entry_exception_error_code; 1706 vmcs12->vm_entry_instruction_len = 1707 evmcs->vm_entry_instruction_len; 1708 } 1709 1710 if (unlikely(!(hv_clean_fields & 1711 HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) { 1712 vmcs12->host_ia32_pat = evmcs->host_ia32_pat; 1713 vmcs12->host_ia32_efer = evmcs->host_ia32_efer; 1714 vmcs12->host_cr0 = evmcs->host_cr0; 1715 vmcs12->host_cr3 = evmcs->host_cr3; 1716 vmcs12->host_cr4 = evmcs->host_cr4; 1717 vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp; 1718 vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip; 1719 vmcs12->host_rip = evmcs->host_rip; 1720 vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs; 1721 vmcs12->host_es_selector = evmcs->host_es_selector; 1722 vmcs12->host_cs_selector = evmcs->host_cs_selector; 1723 vmcs12->host_ss_selector = evmcs->host_ss_selector; 1724 vmcs12->host_ds_selector = evmcs->host_ds_selector; 1725 vmcs12->host_fs_selector = evmcs->host_fs_selector; 1726 vmcs12->host_gs_selector = evmcs->host_gs_selector; 1727 vmcs12->host_tr_selector = evmcs->host_tr_selector; 1728 vmcs12->host_ia32_perf_global_ctrl = evmcs->host_ia32_perf_global_ctrl; 1729 /* 1730 * Not present in struct vmcs12: 1731 * vmcs12->host_ia32_s_cet = evmcs->host_ia32_s_cet; 1732 * vmcs12->host_ssp = evmcs->host_ssp; 1733 * vmcs12->host_ia32_int_ssp_table_addr = evmcs->host_ia32_int_ssp_table_addr; 1734 */ 1735 } 1736 1737 if (unlikely(!(hv_clean_fields & 1738 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1))) { 1739 vmcs12->pin_based_vm_exec_control = 1740 evmcs->pin_based_vm_exec_control; 1741 vmcs12->vm_exit_controls = evmcs->vm_exit_controls; 1742 vmcs12->secondary_vm_exec_control = 1743 evmcs->secondary_vm_exec_control; 1744 } 1745 1746 if (unlikely(!(hv_clean_fields & 1747 HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) { 1748 vmcs12->io_bitmap_a = evmcs->io_bitmap_a; 1749 vmcs12->io_bitmap_b = evmcs->io_bitmap_b; 1750 } 1751 1752 if (unlikely(!(hv_clean_fields & 1753 HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) { 1754 vmcs12->msr_bitmap = evmcs->msr_bitmap; 1755 } 1756 1757 if (unlikely(!(hv_clean_fields & 1758 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) { 1759 vmcs12->guest_es_base = evmcs->guest_es_base; 1760 vmcs12->guest_cs_base = evmcs->guest_cs_base; 1761 vmcs12->guest_ss_base = evmcs->guest_ss_base; 1762 vmcs12->guest_ds_base = evmcs->guest_ds_base; 1763 vmcs12->guest_fs_base = evmcs->guest_fs_base; 1764 vmcs12->guest_gs_base = evmcs->guest_gs_base; 1765 vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base; 1766 vmcs12->guest_tr_base = evmcs->guest_tr_base; 1767 vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base; 1768 vmcs12->guest_idtr_base = evmcs->guest_idtr_base; 1769 vmcs12->guest_es_limit = evmcs->guest_es_limit; 1770 vmcs12->guest_cs_limit = evmcs->guest_cs_limit; 1771 vmcs12->guest_ss_limit = evmcs->guest_ss_limit; 1772 vmcs12->guest_ds_limit = evmcs->guest_ds_limit; 1773 vmcs12->guest_fs_limit = evmcs->guest_fs_limit; 1774 vmcs12->guest_gs_limit = evmcs->guest_gs_limit; 1775 vmcs12->guest_ldtr_limit = 
evmcs->guest_ldtr_limit; 1776 vmcs12->guest_tr_limit = evmcs->guest_tr_limit; 1777 vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit; 1778 vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit; 1779 vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes; 1780 vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes; 1781 vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes; 1782 vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes; 1783 vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes; 1784 vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes; 1785 vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes; 1786 vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes; 1787 vmcs12->guest_es_selector = evmcs->guest_es_selector; 1788 vmcs12->guest_cs_selector = evmcs->guest_cs_selector; 1789 vmcs12->guest_ss_selector = evmcs->guest_ss_selector; 1790 vmcs12->guest_ds_selector = evmcs->guest_ds_selector; 1791 vmcs12->guest_fs_selector = evmcs->guest_fs_selector; 1792 vmcs12->guest_gs_selector = evmcs->guest_gs_selector; 1793 vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector; 1794 vmcs12->guest_tr_selector = evmcs->guest_tr_selector; 1795 } 1796 1797 if (unlikely(!(hv_clean_fields & 1798 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) { 1799 vmcs12->tsc_offset = evmcs->tsc_offset; 1800 vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr; 1801 vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap; 1802 vmcs12->encls_exiting_bitmap = evmcs->encls_exiting_bitmap; 1803 vmcs12->tsc_multiplier = evmcs->tsc_multiplier; 1804 } 1805 1806 if (unlikely(!(hv_clean_fields & 1807 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) { 1808 vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask; 1809 vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask; 1810 vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow; 1811 vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow; 1812 vmcs12->guest_cr0 = evmcs->guest_cr0; 1813 vmcs12->guest_cr3 = evmcs->guest_cr3; 1814 vmcs12->guest_cr4 = evmcs->guest_cr4; 1815 vmcs12->guest_dr7 = evmcs->guest_dr7; 1816 } 1817 1818 if (unlikely(!(hv_clean_fields & 1819 HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) { 1820 vmcs12->host_fs_base = evmcs->host_fs_base; 1821 vmcs12->host_gs_base = evmcs->host_gs_base; 1822 vmcs12->host_tr_base = evmcs->host_tr_base; 1823 vmcs12->host_gdtr_base = evmcs->host_gdtr_base; 1824 vmcs12->host_idtr_base = evmcs->host_idtr_base; 1825 vmcs12->host_rsp = evmcs->host_rsp; 1826 } 1827 1828 if (unlikely(!(hv_clean_fields & 1829 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) { 1830 vmcs12->ept_pointer = evmcs->ept_pointer; 1831 vmcs12->virtual_processor_id = evmcs->virtual_processor_id; 1832 } 1833 1834 if (unlikely(!(hv_clean_fields & 1835 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) { 1836 vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer; 1837 vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl; 1838 vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat; 1839 vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer; 1840 vmcs12->guest_pdptr0 = evmcs->guest_pdptr0; 1841 vmcs12->guest_pdptr1 = evmcs->guest_pdptr1; 1842 vmcs12->guest_pdptr2 = evmcs->guest_pdptr2; 1843 vmcs12->guest_pdptr3 = evmcs->guest_pdptr3; 1844 vmcs12->guest_pending_dbg_exceptions = 1845 evmcs->guest_pending_dbg_exceptions; 1846 vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp; 1847 vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip; 1848 vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs; 1849 vmcs12->guest_activity_state = evmcs->guest_activity_state; 1850 vmcs12->guest_sysenter_cs = 
evmcs->guest_sysenter_cs; 1851 vmcs12->guest_ia32_perf_global_ctrl = evmcs->guest_ia32_perf_global_ctrl; 1852 /* 1853 * Not present in struct vmcs12: 1854 * vmcs12->guest_ia32_s_cet = evmcs->guest_ia32_s_cet; 1855 * vmcs12->guest_ia32_lbr_ctl = evmcs->guest_ia32_lbr_ctl; 1856 * vmcs12->guest_ia32_int_ssp_table_addr = evmcs->guest_ia32_int_ssp_table_addr; 1857 */ 1858 } 1859 1860 /* 1861 * Not used? 1862 * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr; 1863 * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr; 1864 * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr; 1865 * vmcs12->page_fault_error_code_mask = 1866 * evmcs->page_fault_error_code_mask; 1867 * vmcs12->page_fault_error_code_match = 1868 * evmcs->page_fault_error_code_match; 1869 * vmcs12->cr3_target_count = evmcs->cr3_target_count; 1870 * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count; 1871 * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count; 1872 * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count; 1873 */ 1874 1875 /* 1876 * Read only fields: 1877 * vmcs12->guest_physical_address = evmcs->guest_physical_address; 1878 * vmcs12->vm_instruction_error = evmcs->vm_instruction_error; 1879 * vmcs12->vm_exit_reason = evmcs->vm_exit_reason; 1880 * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info; 1881 * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code; 1882 * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field; 1883 * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code; 1884 * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len; 1885 * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info; 1886 * vmcs12->exit_qualification = evmcs->exit_qualification; 1887 * vmcs12->guest_linear_address = evmcs->guest_linear_address; 1888 * 1889 * Not present in struct vmcs12: 1890 * vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx; 1891 * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi; 1892 * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi; 1893 * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip; 1894 */ 1895 1896 return; 1897 #else /* CONFIG_KVM_HYPERV */ 1898 KVM_BUG_ON(1, vmx->vcpu.kvm); 1899 #endif /* CONFIG_KVM_HYPERV */ 1900 } 1901 copy_vmcs12_to_enlightened(struct vcpu_vmx * vmx)1902 static void copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx) 1903 { 1904 #ifdef CONFIG_KVM_HYPERV 1905 struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; 1906 struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); 1907 1908 /* 1909 * Should not be changed by KVM: 1910 * 1911 * evmcs->host_es_selector = vmcs12->host_es_selector; 1912 * evmcs->host_cs_selector = vmcs12->host_cs_selector; 1913 * evmcs->host_ss_selector = vmcs12->host_ss_selector; 1914 * evmcs->host_ds_selector = vmcs12->host_ds_selector; 1915 * evmcs->host_fs_selector = vmcs12->host_fs_selector; 1916 * evmcs->host_gs_selector = vmcs12->host_gs_selector; 1917 * evmcs->host_tr_selector = vmcs12->host_tr_selector; 1918 * evmcs->host_ia32_pat = vmcs12->host_ia32_pat; 1919 * evmcs->host_ia32_efer = vmcs12->host_ia32_efer; 1920 * evmcs->host_cr0 = vmcs12->host_cr0; 1921 * evmcs->host_cr3 = vmcs12->host_cr3; 1922 * evmcs->host_cr4 = vmcs12->host_cr4; 1923 * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp; 1924 * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip; 1925 * evmcs->host_rip = vmcs12->host_rip; 1926 * evmcs->host_ia32_sysenter_cs = 
vmcs12->host_ia32_sysenter_cs; 1927 * evmcs->host_fs_base = vmcs12->host_fs_base; 1928 * evmcs->host_gs_base = vmcs12->host_gs_base; 1929 * evmcs->host_tr_base = vmcs12->host_tr_base; 1930 * evmcs->host_gdtr_base = vmcs12->host_gdtr_base; 1931 * evmcs->host_idtr_base = vmcs12->host_idtr_base; 1932 * evmcs->host_rsp = vmcs12->host_rsp; 1933 * sync_vmcs02_to_vmcs12() doesn't read these: 1934 * evmcs->io_bitmap_a = vmcs12->io_bitmap_a; 1935 * evmcs->io_bitmap_b = vmcs12->io_bitmap_b; 1936 * evmcs->msr_bitmap = vmcs12->msr_bitmap; 1937 * evmcs->ept_pointer = vmcs12->ept_pointer; 1938 * evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap; 1939 * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr; 1940 * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr; 1941 * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr; 1942 * evmcs->tpr_threshold = vmcs12->tpr_threshold; 1943 * evmcs->virtual_processor_id = vmcs12->virtual_processor_id; 1944 * evmcs->exception_bitmap = vmcs12->exception_bitmap; 1945 * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer; 1946 * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control; 1947 * evmcs->vm_exit_controls = vmcs12->vm_exit_controls; 1948 * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control; 1949 * evmcs->page_fault_error_code_mask = 1950 * vmcs12->page_fault_error_code_mask; 1951 * evmcs->page_fault_error_code_match = 1952 * vmcs12->page_fault_error_code_match; 1953 * evmcs->cr3_target_count = vmcs12->cr3_target_count; 1954 * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr; 1955 * evmcs->tsc_offset = vmcs12->tsc_offset; 1956 * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl; 1957 * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask; 1958 * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask; 1959 * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow; 1960 * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow; 1961 * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count; 1962 * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count; 1963 * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count; 1964 * evmcs->guest_ia32_perf_global_ctrl = vmcs12->guest_ia32_perf_global_ctrl; 1965 * evmcs->host_ia32_perf_global_ctrl = vmcs12->host_ia32_perf_global_ctrl; 1966 * evmcs->encls_exiting_bitmap = vmcs12->encls_exiting_bitmap; 1967 * evmcs->tsc_multiplier = vmcs12->tsc_multiplier; 1968 * 1969 * Not present in struct vmcs12: 1970 * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx; 1971 * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi; 1972 * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi; 1973 * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip; 1974 * evmcs->host_ia32_s_cet = vmcs12->host_ia32_s_cet; 1975 * evmcs->host_ssp = vmcs12->host_ssp; 1976 * evmcs->host_ia32_int_ssp_table_addr = vmcs12->host_ia32_int_ssp_table_addr; 1977 * evmcs->guest_ia32_s_cet = vmcs12->guest_ia32_s_cet; 1978 * evmcs->guest_ia32_lbr_ctl = vmcs12->guest_ia32_lbr_ctl; 1979 * evmcs->guest_ia32_int_ssp_table_addr = vmcs12->guest_ia32_int_ssp_table_addr; 1980 * evmcs->guest_ssp = vmcs12->guest_ssp; 1981 */ 1982 1983 evmcs->guest_es_selector = vmcs12->guest_es_selector; 1984 evmcs->guest_cs_selector = vmcs12->guest_cs_selector; 1985 evmcs->guest_ss_selector = vmcs12->guest_ss_selector; 1986 evmcs->guest_ds_selector = vmcs12->guest_ds_selector; 1987 evmcs->guest_fs_selector = vmcs12->guest_fs_selector; 1988 
evmcs->guest_gs_selector = vmcs12->guest_gs_selector; 1989 evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector; 1990 evmcs->guest_tr_selector = vmcs12->guest_tr_selector; 1991 1992 evmcs->guest_es_limit = vmcs12->guest_es_limit; 1993 evmcs->guest_cs_limit = vmcs12->guest_cs_limit; 1994 evmcs->guest_ss_limit = vmcs12->guest_ss_limit; 1995 evmcs->guest_ds_limit = vmcs12->guest_ds_limit; 1996 evmcs->guest_fs_limit = vmcs12->guest_fs_limit; 1997 evmcs->guest_gs_limit = vmcs12->guest_gs_limit; 1998 evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit; 1999 evmcs->guest_tr_limit = vmcs12->guest_tr_limit; 2000 evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit; 2001 evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit; 2002 2003 evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes; 2004 evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes; 2005 evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes; 2006 evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes; 2007 evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes; 2008 evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes; 2009 evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes; 2010 evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes; 2011 2012 evmcs->guest_es_base = vmcs12->guest_es_base; 2013 evmcs->guest_cs_base = vmcs12->guest_cs_base; 2014 evmcs->guest_ss_base = vmcs12->guest_ss_base; 2015 evmcs->guest_ds_base = vmcs12->guest_ds_base; 2016 evmcs->guest_fs_base = vmcs12->guest_fs_base; 2017 evmcs->guest_gs_base = vmcs12->guest_gs_base; 2018 evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base; 2019 evmcs->guest_tr_base = vmcs12->guest_tr_base; 2020 evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base; 2021 evmcs->guest_idtr_base = vmcs12->guest_idtr_base; 2022 2023 evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat; 2024 evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer; 2025 2026 evmcs->guest_pdptr0 = vmcs12->guest_pdptr0; 2027 evmcs->guest_pdptr1 = vmcs12->guest_pdptr1; 2028 evmcs->guest_pdptr2 = vmcs12->guest_pdptr2; 2029 evmcs->guest_pdptr3 = vmcs12->guest_pdptr3; 2030 2031 evmcs->guest_pending_dbg_exceptions = 2032 vmcs12->guest_pending_dbg_exceptions; 2033 evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp; 2034 evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip; 2035 2036 evmcs->guest_activity_state = vmcs12->guest_activity_state; 2037 evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs; 2038 2039 evmcs->guest_cr0 = vmcs12->guest_cr0; 2040 evmcs->guest_cr3 = vmcs12->guest_cr3; 2041 evmcs->guest_cr4 = vmcs12->guest_cr4; 2042 evmcs->guest_dr7 = vmcs12->guest_dr7; 2043 2044 evmcs->guest_physical_address = vmcs12->guest_physical_address; 2045 2046 evmcs->vm_instruction_error = vmcs12->vm_instruction_error; 2047 evmcs->vm_exit_reason = vmcs12->vm_exit_reason; 2048 evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info; 2049 evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code; 2050 evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field; 2051 evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code; 2052 evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len; 2053 evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info; 2054 2055 evmcs->exit_qualification = vmcs12->exit_qualification; 2056 2057 evmcs->guest_linear_address = vmcs12->guest_linear_address; 2058 evmcs->guest_rsp = vmcs12->guest_rsp; 2059 evmcs->guest_rflags = vmcs12->guest_rflags; 2060 2061 evmcs->guest_interruptibility_info = 2062 vmcs12->guest_interruptibility_info; 2063 evmcs->cpu_based_vm_exec_control = 
vmcs12->cpu_based_vm_exec_control; 2064 evmcs->vm_entry_controls = vmcs12->vm_entry_controls; 2065 evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field; 2066 evmcs->vm_entry_exception_error_code = 2067 vmcs12->vm_entry_exception_error_code; 2068 evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len; 2069 2070 evmcs->guest_rip = vmcs12->guest_rip; 2071 2072 evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs; 2073 2074 return; 2075 #else /* CONFIG_KVM_HYPERV */ 2076 KVM_BUG_ON(1, vmx->vcpu.kvm); 2077 #endif /* CONFIG_KVM_HYPERV */ 2078 } 2079 2080 /* 2081 * This is an equivalent of the nested hypervisor executing the vmptrld 2082 * instruction. 2083 */ nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu * vcpu,bool from_launch)2084 static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld( 2085 struct kvm_vcpu *vcpu, bool from_launch) 2086 { 2087 #ifdef CONFIG_KVM_HYPERV 2088 struct vcpu_vmx *vmx = to_vmx(vcpu); 2089 bool evmcs_gpa_changed = false; 2090 u64 evmcs_gpa; 2091 2092 if (likely(!guest_cpu_cap_has_evmcs(vcpu))) 2093 return EVMPTRLD_DISABLED; 2094 2095 evmcs_gpa = nested_get_evmptr(vcpu); 2096 if (!evmptr_is_valid(evmcs_gpa)) { 2097 nested_release_evmcs(vcpu); 2098 return EVMPTRLD_DISABLED; 2099 } 2100 2101 if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) { 2102 vmx->nested.current_vmptr = INVALID_GPA; 2103 2104 nested_release_evmcs(vcpu); 2105 2106 if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa), 2107 &vmx->nested.hv_evmcs_map)) 2108 return EVMPTRLD_ERROR; 2109 2110 vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva; 2111 2112 /* 2113 * Currently, KVM only supports eVMCS version 1 2114 * (== KVM_EVMCS_VERSION) and thus we expect guest to set this 2115 * value to first u32 field of eVMCS which should specify eVMCS 2116 * VersionNumber. 2117 * 2118 * Guest should be aware of supported eVMCS versions by host by 2119 * examining CPUID.0x4000000A.EAX[0:15]. Host userspace VMM is 2120 * expected to set this CPUID leaf according to the value 2121 * returned in vmcs_version from nested_enable_evmcs(). 2122 * 2123 * However, it turns out that Microsoft Hyper-V fails to comply 2124 * to their own invented interface: When Hyper-V use eVMCS, it 2125 * just sets first u32 field of eVMCS to revision_id specified 2126 * in MSR_IA32_VMX_BASIC. Instead of used eVMCS version number 2127 * which is one of the supported versions specified in 2128 * CPUID.0x4000000A.EAX[0:15]. 2129 * 2130 * To overcome Hyper-V bug, we accept here either a supported 2131 * eVMCS version or VMCS12 revision_id as valid values for first 2132 * u32 field of eVMCS. 2133 */ 2134 if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) && 2135 (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) { 2136 nested_release_evmcs(vcpu); 2137 return EVMPTRLD_VMFAIL; 2138 } 2139 2140 vmx->nested.hv_evmcs_vmptr = evmcs_gpa; 2141 2142 evmcs_gpa_changed = true; 2143 /* 2144 * Unlike normal vmcs12, enlightened vmcs12 is not fully 2145 * reloaded from guest's memory (read only fields, fields not 2146 * present in struct hv_enlightened_vmcs, ...). Make sure there 2147 * are no leftovers. 2148 */ 2149 if (from_launch) { 2150 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 2151 memset(vmcs12, 0, sizeof(*vmcs12)); 2152 vmcs12->hdr.revision_id = VMCS12_REVISION; 2153 } 2154 2155 } 2156 2157 /* 2158 * Clean fields data can't be used on VMLAUNCH and when we switch 2159 * between different L2 guests as KVM keeps a single VMCS12 per L1. 
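 *
 * Clearing all of the clean bits below forces the next
 * copy_enlightened_to_vmcs12() to re-read every field group from the
 * eVMCS, since each group there is guarded by a check of the form
 * (sketch, <GROUP> stands for the specific clean-field bit):
 *
 *	if (!(hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_<GROUP>))
 *		copy that group from the eVMCS into vmcs12;
 *
 * The MSR bitmap merge is likewise forced to be redone.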
2160 */ 2161 if (from_launch || evmcs_gpa_changed) { 2162 vmx->nested.hv_evmcs->hv_clean_fields &= 2163 ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; 2164 2165 vmx->nested.force_msr_bitmap_recalc = true; 2166 } 2167 2168 return EVMPTRLD_SUCCEEDED; 2169 #else 2170 return EVMPTRLD_DISABLED; 2171 #endif 2172 } 2173 nested_sync_vmcs12_to_shadow(struct kvm_vcpu * vcpu)2174 void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu) 2175 { 2176 struct vcpu_vmx *vmx = to_vmx(vcpu); 2177 2178 if (nested_vmx_is_evmptr12_valid(vmx)) 2179 copy_vmcs12_to_enlightened(vmx); 2180 else 2181 copy_vmcs12_to_shadow(vmx); 2182 2183 vmx->nested.need_vmcs12_to_shadow_sync = false; 2184 } 2185 vmx_preemption_timer_fn(struct hrtimer * timer)2186 static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) 2187 { 2188 struct vcpu_vmx *vmx = 2189 container_of(timer, struct vcpu_vmx, nested.preemption_timer); 2190 2191 vmx->nested.preemption_timer_expired = true; 2192 kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu); 2193 kvm_vcpu_kick(&vmx->vcpu); 2194 2195 return HRTIMER_NORESTART; 2196 } 2197 vmx_calc_preemption_timer_value(struct kvm_vcpu * vcpu)2198 static u64 vmx_calc_preemption_timer_value(struct kvm_vcpu *vcpu) 2199 { 2200 struct vcpu_vmx *vmx = to_vmx(vcpu); 2201 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 2202 2203 u64 l1_scaled_tsc = kvm_read_l1_tsc(vcpu, rdtsc()) >> 2204 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 2205 2206 if (!vmx->nested.has_preemption_timer_deadline) { 2207 vmx->nested.preemption_timer_deadline = 2208 vmcs12->vmx_preemption_timer_value + l1_scaled_tsc; 2209 vmx->nested.has_preemption_timer_deadline = true; 2210 } 2211 return vmx->nested.preemption_timer_deadline - l1_scaled_tsc; 2212 } 2213 vmx_start_preemption_timer(struct kvm_vcpu * vcpu,u64 preemption_timeout)2214 static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu, 2215 u64 preemption_timeout) 2216 { 2217 struct vcpu_vmx *vmx = to_vmx(vcpu); 2218 2219 /* 2220 * A timer value of zero is architecturally guaranteed to cause 2221 * a VMExit prior to executing any instructions in the guest. 2222 */ 2223 if (preemption_timeout == 0) { 2224 vmx_preemption_timer_fn(&vmx->nested.preemption_timer); 2225 return; 2226 } 2227 2228 if (vcpu->arch.virtual_tsc_khz == 0) 2229 return; 2230 2231 preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 2232 preemption_timeout *= 1000000; 2233 do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz); 2234 hrtimer_start(&vmx->nested.preemption_timer, 2235 ktime_add_ns(ktime_get(), preemption_timeout), 2236 HRTIMER_MODE_ABS_PINNED); 2237 } 2238 nested_vmx_calc_efer(struct vcpu_vmx * vmx,struct vmcs12 * vmcs12)2239 static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) 2240 { 2241 if (vmx->nested.nested_run_pending && 2242 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) 2243 return vmcs12->guest_ia32_efer; 2244 else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) 2245 return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME); 2246 else 2247 return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME); 2248 } 2249 prepare_vmcs02_constant_state(struct vcpu_vmx * vmx)2250 static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) 2251 { 2252 struct kvm *kvm = vmx->vcpu.kvm; 2253 2254 /* 2255 * If vmcs02 hasn't been initialized, set the constant vmcs02 state 2256 * according to L0's settings (vmcs12 is irrelevant here). Host 2257 * fields that come from L0 and are not constant, e.g. HOST_CR3, 2258 * will be set as needed prior to VMLAUNCH/VMRESUME. 
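 *
 * vmcs02_initialized acts as a one-shot latch: everything written in
 * this function holds for the lifetime of the current vmcs02 and is
 * only redone if the flag is cleared again (presumably when the nested
 * state is torn down and rebuilt; the reset site is not shown here).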
2259 */ 2260 if (vmx->nested.vmcs02_initialized) 2261 return; 2262 vmx->nested.vmcs02_initialized = true; 2263 2264 /* 2265 * We don't care what the EPTP value is we just need to guarantee 2266 * it's valid so we don't get a false positive when doing early 2267 * consistency checks. 2268 */ 2269 if (enable_ept && nested_early_check) 2270 vmcs_write64(EPT_POINTER, 2271 construct_eptp(&vmx->vcpu, 0, PT64_ROOT_4LEVEL)); 2272 2273 if (vmx->ve_info) 2274 vmcs_write64(VE_INFORMATION_ADDRESS, __pa(vmx->ve_info)); 2275 2276 /* All VMFUNCs are currently emulated through L0 vmexits. */ 2277 if (cpu_has_vmx_vmfunc()) 2278 vmcs_write64(VM_FUNCTION_CONTROL, 0); 2279 2280 if (cpu_has_vmx_posted_intr()) 2281 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR); 2282 2283 if (cpu_has_vmx_msr_bitmap()) 2284 vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap)); 2285 2286 /* 2287 * PML is emulated for L2, but never enabled in hardware as the MMU 2288 * handles A/D emulation. Disabling PML for L2 also avoids having to 2289 * deal with filtering out L2 GPAs from the buffer. 2290 */ 2291 if (enable_pml) { 2292 vmcs_write64(PML_ADDRESS, 0); 2293 vmcs_write16(GUEST_PML_INDEX, -1); 2294 } 2295 2296 if (cpu_has_vmx_encls_vmexit()) 2297 vmcs_write64(ENCLS_EXITING_BITMAP, INVALID_GPA); 2298 2299 if (kvm_notify_vmexit_enabled(kvm)) 2300 vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window); 2301 2302 /* 2303 * Set the MSR load/store lists to match L0's settings. Only the 2304 * addresses are constant (for vmcs02), the counts can change based 2305 * on L2's behavior, e.g. switching to/from long mode. 2306 */ 2307 vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.guest.val)); 2308 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); 2309 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); 2310 2311 vmx_set_constant_host_state(vmx); 2312 } 2313 prepare_vmcs02_early_rare(struct vcpu_vmx * vmx,struct vmcs12 * vmcs12)2314 static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx, 2315 struct vmcs12 *vmcs12) 2316 { 2317 prepare_vmcs02_constant_state(vmx); 2318 2319 vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); 2320 2321 /* 2322 * If VPID is disabled, then guest TLB accesses use VPID=0, i.e. the 2323 * same VPID as the host. Emulate this behavior by using vpid01 for L2 2324 * if VPID is disabled in vmcs12. Note, if VPID is disabled, VM-Enter 2325 * and VM-Exit are architecturally required to flush VPID=0, but *only* 2326 * VPID=0. I.e. using vpid02 would be ok (so long as KVM emulates the 2327 * required flushes), but doing so would cause KVM to over-flush. E.g. 2328 * if L1 runs L2 X with VPID12=1, then runs L2 Y with VPID12 disabled, 2329 * and then runs L2 X again, then KVM can and should retain TLB entries 2330 * for VPID12=1. 
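 *
 * Roughly, the selection below boils down to (sketch):
 *
 *	vmcs12 enables VPID and vpid02 is allocated -> run L2 with vpid02
 *	otherwise                                   -> run L2 with vpid01 (vmx->vpid)
 *
 * i.e. L2 only shares L1's TLB tag when L1 itself asked for the
 * VPID-disabled behaviour described above.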
2331 */ 2332 if (enable_vpid) { 2333 if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) 2334 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02); 2335 else 2336 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); 2337 } 2338 } 2339 prepare_vmcs02_early(struct vcpu_vmx * vmx,struct loaded_vmcs * vmcs01,struct vmcs12 * vmcs12)2340 static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs01, 2341 struct vmcs12 *vmcs12) 2342 { 2343 u32 exec_control; 2344 u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12); 2345 2346 if (vmx->nested.dirty_vmcs12 || nested_vmx_is_evmptr12_valid(vmx)) 2347 prepare_vmcs02_early_rare(vmx, vmcs12); 2348 2349 /* 2350 * PIN CONTROLS 2351 */ 2352 exec_control = __pin_controls_get(vmcs01); 2353 exec_control |= (vmcs12->pin_based_vm_exec_control & 2354 ~PIN_BASED_VMX_PREEMPTION_TIMER); 2355 2356 /* Posted interrupts setting is only taken from vmcs12. */ 2357 vmx->nested.pi_pending = false; 2358 if (nested_cpu_has_posted_intr(vmcs12)) { 2359 vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; 2360 } else { 2361 vmx->nested.posted_intr_nv = -1; 2362 exec_control &= ~PIN_BASED_POSTED_INTR; 2363 } 2364 pin_controls_set(vmx, exec_control); 2365 2366 /* 2367 * EXEC CONTROLS 2368 */ 2369 exec_control = __exec_controls_get(vmcs01); /* L0's desires */ 2370 exec_control &= ~CPU_BASED_INTR_WINDOW_EXITING; 2371 exec_control &= ~CPU_BASED_NMI_WINDOW_EXITING; 2372 exec_control &= ~CPU_BASED_TPR_SHADOW; 2373 exec_control |= vmcs12->cpu_based_vm_exec_control; 2374 2375 vmx->nested.l1_tpr_threshold = -1; 2376 if (exec_control & CPU_BASED_TPR_SHADOW) 2377 vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); 2378 #ifdef CONFIG_X86_64 2379 else 2380 exec_control |= CPU_BASED_CR8_LOAD_EXITING | 2381 CPU_BASED_CR8_STORE_EXITING; 2382 #endif 2383 2384 /* 2385 * A vmexit (to either L1 hypervisor or L0 userspace) is always needed 2386 * for I/O port accesses. 2387 */ 2388 exec_control |= CPU_BASED_UNCOND_IO_EXITING; 2389 exec_control &= ~CPU_BASED_USE_IO_BITMAPS; 2390 2391 /* 2392 * This bit will be computed in nested_get_vmcs12_pages, because 2393 * we do not have access to L1's MSR bitmap yet. For now, keep 2394 * the same bit as before, hoping to avoid multiple VMWRITEs that 2395 * only set/clear this bit. 2396 */ 2397 exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; 2398 exec_control |= exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS; 2399 2400 exec_controls_set(vmx, exec_control); 2401 2402 /* 2403 * SECONDARY EXEC CONTROLS 2404 */ 2405 if (cpu_has_secondary_exec_ctrls()) { 2406 exec_control = __secondary_exec_controls_get(vmcs01); 2407 2408 /* Take the following fields only from vmcs12 */ 2409 exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 2410 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 2411 SECONDARY_EXEC_ENABLE_INVPCID | 2412 SECONDARY_EXEC_ENABLE_RDTSCP | 2413 SECONDARY_EXEC_ENABLE_XSAVES | 2414 SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | 2415 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 2416 SECONDARY_EXEC_APIC_REGISTER_VIRT | 2417 SECONDARY_EXEC_ENABLE_VMFUNC | 2418 SECONDARY_EXEC_DESC); 2419 2420 if (nested_cpu_has(vmcs12, 2421 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) 2422 exec_control |= vmcs12->secondary_vm_exec_control; 2423 2424 /* PML is emulated and never enabled in hardware for L2. 
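 * The bit is stripped here so vmcs02 never enables it; PML_ADDRESS and
 * GUEST_PML_INDEX were already given benign values in
 * prepare_vmcs02_constant_state().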
*/ 2425 exec_control &= ~SECONDARY_EXEC_ENABLE_PML; 2426 2427 /* VMCS shadowing for L2 is emulated for now */ 2428 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; 2429 2430 /* 2431 * Preset *DT exiting when emulating UMIP, so that vmx_set_cr4() 2432 * will not have to rewrite the controls just for this bit. 2433 */ 2434 if (vmx_umip_emulated() && (vmcs12->guest_cr4 & X86_CR4_UMIP)) 2435 exec_control |= SECONDARY_EXEC_DESC; 2436 2437 if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) 2438 vmcs_write16(GUEST_INTR_STATUS, 2439 vmcs12->guest_intr_status); 2440 2441 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) 2442 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; 2443 2444 if (exec_control & SECONDARY_EXEC_ENCLS_EXITING) 2445 vmx_write_encls_bitmap(&vmx->vcpu, vmcs12); 2446 2447 secondary_exec_controls_set(vmx, exec_control); 2448 } 2449 2450 /* 2451 * ENTRY CONTROLS 2452 * 2453 * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE 2454 * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate 2455 * on the related bits (if supported by the CPU) in the hope that 2456 * we can avoid VMWrites during vmx_set_efer(). 2457 * 2458 * Similarly, take vmcs01's PERF_GLOBAL_CTRL in the hope that if KVM is 2459 * loading PERF_GLOBAL_CTRL via the VMCS for L1, then KVM will want to 2460 * do the same for L2. 2461 */ 2462 exec_control = __vm_entry_controls_get(vmcs01); 2463 exec_control |= (vmcs12->vm_entry_controls & 2464 ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL); 2465 exec_control &= ~(VM_ENTRY_IA32E_MODE | VM_ENTRY_LOAD_IA32_EFER); 2466 if (cpu_has_load_ia32_efer()) { 2467 if (guest_efer & EFER_LMA) 2468 exec_control |= VM_ENTRY_IA32E_MODE; 2469 if (guest_efer != kvm_host.efer) 2470 exec_control |= VM_ENTRY_LOAD_IA32_EFER; 2471 } 2472 vm_entry_controls_set(vmx, exec_control); 2473 2474 /* 2475 * EXIT CONTROLS 2476 * 2477 * L2->L1 exit controls are emulated - the hardware exit is to L0 so 2478 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER 2479 * bits may be modified by vmx_set_efer() in prepare_vmcs02(). 
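 *
 * The only tweak applied below on top of vmcs01's exit controls is
 * whether to have hardware reload the host's EFER on VM-Exit (sketch):
 *
 *	VM_EXIT_LOAD_IA32_EFER  <=>  guest_efer != kvm_host.efer
 *
 * (and only if the CPU supports the control), mirroring the entry-side
 * handling of VM_ENTRY_LOAD_IA32_EFER above.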
2480 */ 2481 exec_control = __vm_exit_controls_get(vmcs01); 2482 if (cpu_has_load_ia32_efer() && guest_efer != kvm_host.efer) 2483 exec_control |= VM_EXIT_LOAD_IA32_EFER; 2484 else 2485 exec_control &= ~VM_EXIT_LOAD_IA32_EFER; 2486 vm_exit_controls_set(vmx, exec_control); 2487 2488 /* 2489 * Interrupt/Exception Fields 2490 */ 2491 if (vmx->nested.nested_run_pending) { 2492 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 2493 vmcs12->vm_entry_intr_info_field); 2494 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 2495 vmcs12->vm_entry_exception_error_code); 2496 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 2497 vmcs12->vm_entry_instruction_len); 2498 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 2499 vmcs12->guest_interruptibility_info); 2500 vmx->loaded_vmcs->nmi_known_unmasked = 2501 !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI); 2502 } else { 2503 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); 2504 } 2505 } 2506 prepare_vmcs02_rare(struct vcpu_vmx * vmx,struct vmcs12 * vmcs12)2507 static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) 2508 { 2509 struct hv_enlightened_vmcs *hv_evmcs = nested_vmx_evmcs(vmx); 2510 2511 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & 2512 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) { 2513 2514 vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); 2515 vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); 2516 vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector); 2517 vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector); 2518 vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector); 2519 vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector); 2520 vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector); 2521 vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector); 2522 vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit); 2523 vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit); 2524 vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit); 2525 vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit); 2526 vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit); 2527 vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit); 2528 vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit); 2529 vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit); 2530 vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit); 2531 vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit); 2532 vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes); 2533 vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes); 2534 vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes); 2535 vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes); 2536 vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes); 2537 vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes); 2538 vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes); 2539 vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes); 2540 vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base); 2541 vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base); 2542 vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base); 2543 vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base); 2544 vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base); 2545 vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base); 2546 vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base); 2547 vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base); 2548 vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); 2549 vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); 2550 2551 vmx_segment_cache_clear(vmx); 2552 } 2553 2554 if (!hv_evmcs || 
!(hv_evmcs->hv_clean_fields & 2555 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) { 2556 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); 2557 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 2558 vmcs12->guest_pending_dbg_exceptions); 2559 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); 2560 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip); 2561 2562 /* 2563 * L1 may access the L2's PDPTR, so save them to construct 2564 * vmcs12 2565 */ 2566 if (enable_ept) { 2567 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); 2568 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); 2569 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); 2570 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); 2571 } 2572 2573 if (kvm_mpx_supported() && vmx->nested.nested_run_pending && 2574 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) 2575 vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); 2576 } 2577 2578 if (nested_cpu_has_xsaves(vmcs12)) 2579 vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap); 2580 2581 /* 2582 * Whether page-faults are trapped is determined by a combination of 2583 * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. If L0 2584 * doesn't care about page faults then we should set all of these to 2585 * L1's desires. However, if L0 does care about (some) page faults, it 2586 * is not easy (if at all possible?) to merge L0 and L1's desires, we 2587 * simply ask to exit on each and every L2 page fault. This is done by 2588 * setting MASK=MATCH=0 and (see below) EB.PF=1. 2589 * Note that below we don't need special code to set EB.PF beyond the 2590 * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept, 2591 * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when 2592 * !enable_ept, EB.PF is 1, so the "or" will always be 1. 2593 */ 2594 if (vmx_need_pf_intercept(&vmx->vcpu)) { 2595 /* 2596 * TODO: if both L0 and L1 need the same MASK and MATCH, 2597 * go ahead and use it? 2598 */ 2599 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); 2600 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); 2601 } else { 2602 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, vmcs12->page_fault_error_code_mask); 2603 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, vmcs12->page_fault_error_code_match); 2604 } 2605 2606 if (cpu_has_vmx_apicv()) { 2607 vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0); 2608 vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1); 2609 vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2); 2610 vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3); 2611 } 2612 2613 /* 2614 * Make sure the msr_autostore list is up to date before we set the 2615 * count in the vmcs02. 2616 */ 2617 prepare_vmx_msr_autostore_list(&vmx->vcpu, MSR_IA32_TSC); 2618 2619 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.guest.nr); 2620 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 2621 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 2622 2623 set_cr4_guest_host_mask(vmx); 2624 } 2625 2626 /* 2627 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested 2628 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it 2629 * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2 2630 * guest in a way that will both be appropriate to L1's requests, and our 2631 * needs. In addition to modifying the active vmcs (which is vmcs02), this 2632 * function also has additional necessary side-effects, like setting various 2633 * vcpu->arch fields. 
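 * Roughly, the order of operations is: the "rare"/dirty fields via
 * prepare_vmcs02_rare(), then DR7/DEBUGCTL/BNDCFGS/PAT depending on the
 * VM-entry controls, TSC offset and multiplier, the TLB flush for the
 * nested transition, EPT MMU context, CR0/CR4/EFER plus their read
 * shadows, GUEST_CR3 and the PDPTRs, PERF_GLOBAL_CTRL, and finally
 * L2's RSP/RIP.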
2634 * Returns 0 on success, 1 on failure. Invalid state exit qualification code 2635 * is assigned to entry_failure_code on failure. 2636 */ prepare_vmcs02(struct kvm_vcpu * vcpu,struct vmcs12 * vmcs12,bool from_vmentry,enum vm_entry_failure_code * entry_failure_code)2637 static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, 2638 bool from_vmentry, 2639 enum vm_entry_failure_code *entry_failure_code) 2640 { 2641 struct vcpu_vmx *vmx = to_vmx(vcpu); 2642 struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); 2643 bool load_guest_pdptrs_vmcs12 = false; 2644 2645 if (vmx->nested.dirty_vmcs12 || nested_vmx_is_evmptr12_valid(vmx)) { 2646 prepare_vmcs02_rare(vmx, vmcs12); 2647 vmx->nested.dirty_vmcs12 = false; 2648 2649 load_guest_pdptrs_vmcs12 = !nested_vmx_is_evmptr12_valid(vmx) || 2650 !(evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1); 2651 } 2652 2653 if (vmx->nested.nested_run_pending && 2654 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) { 2655 kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); 2656 vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl); 2657 } else { 2658 kvm_set_dr(vcpu, 7, vcpu->arch.dr7); 2659 vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.pre_vmenter_debugctl); 2660 } 2661 if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending || 2662 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) 2663 vmcs_write64(GUEST_BNDCFGS, vmx->nested.pre_vmenter_bndcfgs); 2664 vmx_set_rflags(vcpu, vmcs12->guest_rflags); 2665 2666 /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the 2667 * bitwise-or of what L1 wants to trap for L2, and what we want to 2668 * trap. Note that CR0.TS also needs updating - we do this later. 2669 */ 2670 vmx_update_exception_bitmap(vcpu); 2671 vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask; 2672 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); 2673 2674 if (vmx->nested.nested_run_pending && 2675 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) { 2676 vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat); 2677 vcpu->arch.pat = vmcs12->guest_ia32_pat; 2678 } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { 2679 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); 2680 } 2681 2682 vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset( 2683 vcpu->arch.l1_tsc_offset, 2684 vmx_get_l2_tsc_offset(vcpu), 2685 vmx_get_l2_tsc_multiplier(vcpu)); 2686 2687 vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier( 2688 vcpu->arch.l1_tsc_scaling_ratio, 2689 vmx_get_l2_tsc_multiplier(vcpu)); 2690 2691 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); 2692 if (kvm_caps.has_tsc_control) 2693 vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); 2694 2695 nested_vmx_transition_tlb_flush(vcpu, vmcs12, true); 2696 2697 if (nested_cpu_has_ept(vmcs12)) 2698 nested_ept_init_mmu_context(vcpu); 2699 2700 /* 2701 * Override the CR0/CR4 read shadows after setting the effective guest 2702 * CR0/CR4. The common helpers also set the shadows, but they don't 2703 * account for vmcs12's cr0/4_guest_host_mask. 
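 *
 * nested_read_cr0()/nested_read_cr4() (not shown here) effectively
 * compute "what L2 observes when it reads the register", roughly:
 *
 *	shadow = (guest_cr0 & ~cr0_guest_host_mask) |
 *		 (cr0_read_shadow & cr0_guest_host_mask)
 *
 * i.e. L1-owned bits read back as L1's shadow value and guest-owned
 * bits read back as the real value.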
2704 */ 2705 vmx_set_cr0(vcpu, vmcs12->guest_cr0); 2706 vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12)); 2707 2708 vmx_set_cr4(vcpu, vmcs12->guest_cr4); 2709 vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12)); 2710 2711 vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12); 2712 /* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */ 2713 vmx_set_efer(vcpu, vcpu->arch.efer); 2714 2715 /* 2716 * Guest state is invalid and unrestricted guest is disabled, 2717 * which means L1 attempted VMEntry to L2 with invalid state. 2718 * Fail the VMEntry. 2719 * 2720 * However when force loading the guest state (SMM exit or 2721 * loading nested state after migration, it is possible to 2722 * have invalid guest state now, which will be later fixed by 2723 * restoring L2 register state 2724 */ 2725 if (CC(from_vmentry && !vmx_guest_state_valid(vcpu))) { 2726 *entry_failure_code = ENTRY_FAIL_DEFAULT; 2727 return -EINVAL; 2728 } 2729 2730 /* Shadow page tables on either EPT or shadow page tables. */ 2731 if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12), 2732 from_vmentry, entry_failure_code)) 2733 return -EINVAL; 2734 2735 /* 2736 * Immediately write vmcs02.GUEST_CR3. It will be propagated to vmcs12 2737 * on nested VM-Exit, which can occur without actually running L2 and 2738 * thus without hitting vmx_load_mmu_pgd(), e.g. if L1 is entering L2 with 2739 * vmcs12.GUEST_ACTIVITYSTATE=HLT, in which case KVM will intercept the 2740 * transition to HLT instead of running L2. 2741 */ 2742 if (enable_ept) 2743 vmcs_writel(GUEST_CR3, vmcs12->guest_cr3); 2744 2745 /* Late preparation of GUEST_PDPTRs now that EFER and CRs are set. */ 2746 if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) && 2747 is_pae_paging(vcpu)) { 2748 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); 2749 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); 2750 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); 2751 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); 2752 } 2753 2754 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && 2755 kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)) && 2756 WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, 2757 vmcs12->guest_ia32_perf_global_ctrl))) { 2758 *entry_failure_code = ENTRY_FAIL_DEFAULT; 2759 return -EINVAL; 2760 } 2761 2762 kvm_rsp_write(vcpu, vmcs12->guest_rsp); 2763 kvm_rip_write(vcpu, vmcs12->guest_rip); 2764 2765 /* 2766 * It was observed that genuine Hyper-V running in L1 doesn't reset 2767 * 'hv_clean_fields' by itself, it only sets the corresponding dirty 2768 * bits when it changes a field in eVMCS. Mark all fields as clean 2769 * here. 
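 * Once vmcs02 has consumed the current contents, marking everything
 * clean means subsequent copy_enlightened_to_vmcs12() calls only
 * re-read the groups whose clean bit L1 has cleared since, which is
 * the point of the clean-fields protocol.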
2770 */ 2771 if (nested_vmx_is_evmptr12_valid(vmx)) 2772 evmcs->hv_clean_fields |= HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; 2773 2774 return 0; 2775 } 2776 nested_vmx_check_nmi_controls(struct vmcs12 * vmcs12)2777 static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12) 2778 { 2779 if (CC(!nested_cpu_has_nmi_exiting(vmcs12) && 2780 nested_cpu_has_virtual_nmis(vmcs12))) 2781 return -EINVAL; 2782 2783 if (CC(!nested_cpu_has_virtual_nmis(vmcs12) && 2784 nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING))) 2785 return -EINVAL; 2786 2787 return 0; 2788 } 2789 nested_vmx_check_eptp(struct kvm_vcpu * vcpu,u64 new_eptp)2790 static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp) 2791 { 2792 struct vcpu_vmx *vmx = to_vmx(vcpu); 2793 2794 /* Check for memory type validity */ 2795 switch (new_eptp & VMX_EPTP_MT_MASK) { 2796 case VMX_EPTP_MT_UC: 2797 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT))) 2798 return false; 2799 break; 2800 case VMX_EPTP_MT_WB: 2801 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT))) 2802 return false; 2803 break; 2804 default: 2805 return false; 2806 } 2807 2808 /* Page-walk levels validity. */ 2809 switch (new_eptp & VMX_EPTP_PWL_MASK) { 2810 case VMX_EPTP_PWL_5: 2811 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_5_BIT))) 2812 return false; 2813 break; 2814 case VMX_EPTP_PWL_4: 2815 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_4_BIT))) 2816 return false; 2817 break; 2818 default: 2819 return false; 2820 } 2821 2822 /* Reserved bits should not be set */ 2823 if (CC(!kvm_vcpu_is_legal_gpa(vcpu, new_eptp) || ((new_eptp >> 7) & 0x1f))) 2824 return false; 2825 2826 /* AD, if set, should be supported */ 2827 if (new_eptp & VMX_EPTP_AD_ENABLE_BIT) { 2828 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT))) 2829 return false; 2830 } 2831 2832 return true; 2833 } 2834 2835 /* 2836 * Checks related to VM-Execution Control Fields 2837 */ nested_check_vm_execution_controls(struct kvm_vcpu * vcpu,struct vmcs12 * vmcs12)2838 static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu, 2839 struct vmcs12 *vmcs12) 2840 { 2841 struct vcpu_vmx *vmx = to_vmx(vcpu); 2842 2843 if (CC(!vmx_control_verify(vmcs12->pin_based_vm_exec_control, 2844 vmx->nested.msrs.pinbased_ctls_low, 2845 vmx->nested.msrs.pinbased_ctls_high)) || 2846 CC(!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, 2847 vmx->nested.msrs.procbased_ctls_low, 2848 vmx->nested.msrs.procbased_ctls_high))) 2849 return -EINVAL; 2850 2851 if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && 2852 CC(!vmx_control_verify(vmcs12->secondary_vm_exec_control, 2853 vmx->nested.msrs.secondary_ctls_low, 2854 vmx->nested.msrs.secondary_ctls_high))) 2855 return -EINVAL; 2856 2857 if (CC(vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) || 2858 nested_vmx_check_io_bitmap_controls(vcpu, vmcs12) || 2859 nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12) || 2860 nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12) || 2861 nested_vmx_check_apic_access_controls(vcpu, vmcs12) || 2862 nested_vmx_check_apicv_controls(vcpu, vmcs12) || 2863 nested_vmx_check_nmi_controls(vmcs12) || 2864 nested_vmx_check_pml_controls(vcpu, vmcs12) || 2865 nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12) || 2866 nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12) || 2867 nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12) || 2868 CC(nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id)) 2869 return -EINVAL; 2870 2871 if (!nested_cpu_has_preemption_timer(vmcs12) 
&& 2872 nested_cpu_has_save_preemption_timer(vmcs12)) 2873 return -EINVAL; 2874 2875 if (nested_cpu_has_ept(vmcs12) && 2876 CC(!nested_vmx_check_eptp(vcpu, vmcs12->ept_pointer))) 2877 return -EINVAL; 2878 2879 if (nested_cpu_has_vmfunc(vmcs12)) { 2880 if (CC(vmcs12->vm_function_control & 2881 ~vmx->nested.msrs.vmfunc_controls)) 2882 return -EINVAL; 2883 2884 if (nested_cpu_has_eptp_switching(vmcs12)) { 2885 if (CC(!nested_cpu_has_ept(vmcs12)) || 2886 CC(!page_address_valid(vcpu, vmcs12->eptp_list_address))) 2887 return -EINVAL; 2888 } 2889 } 2890 2891 return 0; 2892 } 2893 2894 /* 2895 * Checks related to VM-Exit Control Fields 2896 */ nested_check_vm_exit_controls(struct kvm_vcpu * vcpu,struct vmcs12 * vmcs12)2897 static int nested_check_vm_exit_controls(struct kvm_vcpu *vcpu, 2898 struct vmcs12 *vmcs12) 2899 { 2900 struct vcpu_vmx *vmx = to_vmx(vcpu); 2901 2902 if (CC(!vmx_control_verify(vmcs12->vm_exit_controls, 2903 vmx->nested.msrs.exit_ctls_low, 2904 vmx->nested.msrs.exit_ctls_high)) || 2905 CC(nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12))) 2906 return -EINVAL; 2907 2908 return 0; 2909 } 2910 2911 /* 2912 * Checks related to VM-Entry Control Fields 2913 */ nested_check_vm_entry_controls(struct kvm_vcpu * vcpu,struct vmcs12 * vmcs12)2914 static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu, 2915 struct vmcs12 *vmcs12) 2916 { 2917 struct vcpu_vmx *vmx = to_vmx(vcpu); 2918 2919 if (CC(!vmx_control_verify(vmcs12->vm_entry_controls, 2920 vmx->nested.msrs.entry_ctls_low, 2921 vmx->nested.msrs.entry_ctls_high))) 2922 return -EINVAL; 2923 2924 /* 2925 * From the Intel SDM, volume 3: 2926 * Fields relevant to VM-entry event injection must be set properly. 2927 * These fields are the VM-entry interruption-information field, the 2928 * VM-entry exception error code, and the VM-entry instruction length. 
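 *
 * For reference, the interruption-information field decoded below is
 * laid out as (per the SDM):
 *
 *	bits  7:0	vector
 *	bits 10:8	interruption type
 *	bit  11		deliver error code
 *	bits 30:12	reserved, must be zero
 *	bit  31		valid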
2929 */ 2930 if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) { 2931 u32 intr_info = vmcs12->vm_entry_intr_info_field; 2932 u8 vector = intr_info & INTR_INFO_VECTOR_MASK; 2933 u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK; 2934 bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK; 2935 bool should_have_error_code; 2936 bool urg = nested_cpu_has2(vmcs12, 2937 SECONDARY_EXEC_UNRESTRICTED_GUEST); 2938 bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE; 2939 2940 /* VM-entry interruption-info field: interruption type */ 2941 if (CC(intr_type == INTR_TYPE_RESERVED) || 2942 CC(intr_type == INTR_TYPE_OTHER_EVENT && 2943 !nested_cpu_supports_monitor_trap_flag(vcpu))) 2944 return -EINVAL; 2945 2946 /* VM-entry interruption-info field: vector */ 2947 if (CC(intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) || 2948 CC(intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) || 2949 CC(intr_type == INTR_TYPE_OTHER_EVENT && vector != 0)) 2950 return -EINVAL; 2951 2952 /* VM-entry interruption-info field: deliver error code */ 2953 should_have_error_code = 2954 intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode && 2955 x86_exception_has_error_code(vector); 2956 if (CC(has_error_code != should_have_error_code)) 2957 return -EINVAL; 2958 2959 /* VM-entry exception error code */ 2960 if (CC(has_error_code && 2961 vmcs12->vm_entry_exception_error_code & GENMASK(31, 16))) 2962 return -EINVAL; 2963 2964 /* VM-entry interruption-info field: reserved bits */ 2965 if (CC(intr_info & INTR_INFO_RESVD_BITS_MASK)) 2966 return -EINVAL; 2967 2968 /* VM-entry instruction length */ 2969 switch (intr_type) { 2970 case INTR_TYPE_SOFT_EXCEPTION: 2971 case INTR_TYPE_SOFT_INTR: 2972 case INTR_TYPE_PRIV_SW_EXCEPTION: 2973 if (CC(vmcs12->vm_entry_instruction_len > X86_MAX_INSTRUCTION_LENGTH) || 2974 CC(vmcs12->vm_entry_instruction_len == 0 && 2975 CC(!nested_cpu_has_zero_length_injection(vcpu)))) 2976 return -EINVAL; 2977 } 2978 } 2979 2980 if (nested_vmx_check_entry_msr_switch_controls(vcpu, vmcs12)) 2981 return -EINVAL; 2982 2983 return 0; 2984 } 2985 nested_vmx_check_controls(struct kvm_vcpu * vcpu,struct vmcs12 * vmcs12)2986 static int nested_vmx_check_controls(struct kvm_vcpu *vcpu, 2987 struct vmcs12 *vmcs12) 2988 { 2989 if (nested_check_vm_execution_controls(vcpu, vmcs12) || 2990 nested_check_vm_exit_controls(vcpu, vmcs12) || 2991 nested_check_vm_entry_controls(vcpu, vmcs12)) 2992 return -EINVAL; 2993 2994 #ifdef CONFIG_KVM_HYPERV 2995 if (guest_cpu_cap_has_evmcs(vcpu)) 2996 return nested_evmcs_check_controls(vmcs12); 2997 #endif 2998 2999 return 0; 3000 } 3001 nested_vmx_check_address_space_size(struct kvm_vcpu * vcpu,struct vmcs12 * vmcs12)3002 static int nested_vmx_check_address_space_size(struct kvm_vcpu *vcpu, 3003 struct vmcs12 *vmcs12) 3004 { 3005 #ifdef CONFIG_X86_64 3006 if (CC(!!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) != 3007 !!(vcpu->arch.efer & EFER_LMA))) 3008 return -EINVAL; 3009 #endif 3010 return 0; 3011 } 3012 is_l1_noncanonical_address_on_vmexit(u64 la,struct vmcs12 * vmcs12)3013 static bool is_l1_noncanonical_address_on_vmexit(u64 la, struct vmcs12 *vmcs12) 3014 { 3015 /* 3016 * Check that the given linear address is canonical after a VM exit 3017 * from L2, based on HOST_CR4.LA57 value that will be loaded for L1. 3018 */ 3019 u8 l1_address_bits_on_exit = (vmcs12->host_cr4 & X86_CR4_LA57) ? 
57 : 48; 3020 3021 return !__is_canonical_address(la, l1_address_bits_on_exit); 3022 } 3023 nested_vmx_check_host_state(struct kvm_vcpu * vcpu,struct vmcs12 * vmcs12)3024 static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, 3025 struct vmcs12 *vmcs12) 3026 { 3027 bool ia32e = !!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE); 3028 3029 if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) || 3030 CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) || 3031 CC(!kvm_vcpu_is_legal_cr3(vcpu, vmcs12->host_cr3))) 3032 return -EINVAL; 3033 3034 if (CC(is_noncanonical_msr_address(vmcs12->host_ia32_sysenter_esp, vcpu)) || 3035 CC(is_noncanonical_msr_address(vmcs12->host_ia32_sysenter_eip, vcpu))) 3036 return -EINVAL; 3037 3038 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) && 3039 CC(!kvm_pat_valid(vmcs12->host_ia32_pat))) 3040 return -EINVAL; 3041 3042 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) && 3043 CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu), 3044 vmcs12->host_ia32_perf_global_ctrl))) 3045 return -EINVAL; 3046 3047 if (ia32e) { 3048 if (CC(!(vmcs12->host_cr4 & X86_CR4_PAE))) 3049 return -EINVAL; 3050 } else { 3051 if (CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) || 3052 CC(vmcs12->host_cr4 & X86_CR4_PCIDE) || 3053 CC((vmcs12->host_rip) >> 32)) 3054 return -EINVAL; 3055 } 3056 3057 if (CC(vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3058 CC(vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3059 CC(vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3060 CC(vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3061 CC(vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3062 CC(vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3063 CC(vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3064 CC(vmcs12->host_cs_selector == 0) || 3065 CC(vmcs12->host_tr_selector == 0) || 3066 CC(vmcs12->host_ss_selector == 0 && !ia32e)) 3067 return -EINVAL; 3068 3069 if (CC(is_noncanonical_base_address(vmcs12->host_fs_base, vcpu)) || 3070 CC(is_noncanonical_base_address(vmcs12->host_gs_base, vcpu)) || 3071 CC(is_noncanonical_base_address(vmcs12->host_gdtr_base, vcpu)) || 3072 CC(is_noncanonical_base_address(vmcs12->host_idtr_base, vcpu)) || 3073 CC(is_noncanonical_base_address(vmcs12->host_tr_base, vcpu)) || 3074 CC(is_l1_noncanonical_address_on_vmexit(vmcs12->host_rip, vmcs12))) 3075 return -EINVAL; 3076 3077 /* 3078 * If the load IA32_EFER VM-exit control is 1, bits reserved in the 3079 * IA32_EFER MSR must be 0 in the field for that register. In addition, 3080 * the values of the LMA and LME bits in the field must each be that of 3081 * the host address-space size VM-exit control. 
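 *
 * Concretely, the check below requires: if the "host address-space
 * size" exit control is 1 (64-bit host after the exit), host_ia32_efer
 * must have both LMA and LME set; if it is 0, both must be clear.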
3082 */ 3083 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) { 3084 if (CC(!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer)) || 3085 CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA)) || 3086 CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))) 3087 return -EINVAL; 3088 } 3089 3090 return 0; 3091 } 3092 nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu * vcpu,struct vmcs12 * vmcs12)3093 static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, 3094 struct vmcs12 *vmcs12) 3095 { 3096 struct vcpu_vmx *vmx = to_vmx(vcpu); 3097 struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache; 3098 struct vmcs_hdr hdr; 3099 3100 if (vmcs12->vmcs_link_pointer == INVALID_GPA) 3101 return 0; 3102 3103 if (CC(!page_address_valid(vcpu, vmcs12->vmcs_link_pointer))) 3104 return -EINVAL; 3105 3106 if (ghc->gpa != vmcs12->vmcs_link_pointer && 3107 CC(kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, 3108 vmcs12->vmcs_link_pointer, VMCS12_SIZE))) 3109 return -EINVAL; 3110 3111 if (CC(kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr, 3112 offsetof(struct vmcs12, hdr), 3113 sizeof(hdr)))) 3114 return -EINVAL; 3115 3116 if (CC(hdr.revision_id != VMCS12_REVISION) || 3117 CC(hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12))) 3118 return -EINVAL; 3119 3120 return 0; 3121 } 3122 3123 /* 3124 * Checks related to Guest Non-register State 3125 */ nested_check_guest_non_reg_state(struct vmcs12 * vmcs12)3126 static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12) 3127 { 3128 if (CC(vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && 3129 vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT && 3130 vmcs12->guest_activity_state != GUEST_ACTIVITY_WAIT_SIPI)) 3131 return -EINVAL; 3132 3133 return 0; 3134 } 3135 nested_vmx_check_guest_state(struct kvm_vcpu * vcpu,struct vmcs12 * vmcs12,enum vm_entry_failure_code * entry_failure_code)3136 static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, 3137 struct vmcs12 *vmcs12, 3138 enum vm_entry_failure_code *entry_failure_code) 3139 { 3140 bool ia32e = !!(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE); 3141 3142 *entry_failure_code = ENTRY_FAIL_DEFAULT; 3143 3144 if (CC(!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0)) || 3145 CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))) 3146 return -EINVAL; 3147 3148 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) && 3149 CC(!kvm_dr7_valid(vmcs12->guest_dr7))) 3150 return -EINVAL; 3151 3152 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) && 3153 CC(!kvm_pat_valid(vmcs12->guest_ia32_pat))) 3154 return -EINVAL; 3155 3156 if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) { 3157 *entry_failure_code = ENTRY_FAIL_VMCS_LINK_PTR; 3158 return -EINVAL; 3159 } 3160 3161 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && 3162 CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu), 3163 vmcs12->guest_ia32_perf_global_ctrl))) 3164 return -EINVAL; 3165 3166 if (CC((vmcs12->guest_cr0 & (X86_CR0_PG | X86_CR0_PE)) == X86_CR0_PG)) 3167 return -EINVAL; 3168 3169 if (CC(ia32e && !(vmcs12->guest_cr4 & X86_CR4_PAE)) || 3170 CC(ia32e && !(vmcs12->guest_cr0 & X86_CR0_PG))) 3171 return -EINVAL; 3172 3173 /* 3174 * If the load IA32_EFER VM-entry control is 1, the following checks 3175 * are performed on the field for the IA32_EFER MSR: 3176 * - Bits reserved in the IA32_EFER MSR must be 0. 3177 * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of 3178 * the IA-32e mode guest VM-exit control. 
It must also be identical 3179 * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to 3180 * CR0.PG) is 1. 3181 */ 3182 if (to_vmx(vcpu)->nested.nested_run_pending && 3183 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) { 3184 if (CC(!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer)) || 3185 CC(ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA)) || 3186 CC(((vmcs12->guest_cr0 & X86_CR0_PG) && 3187 ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME)))) 3188 return -EINVAL; 3189 } 3190 3191 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) && 3192 (CC(is_noncanonical_msr_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) || 3193 CC((vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD)))) 3194 return -EINVAL; 3195 3196 if (nested_check_guest_non_reg_state(vmcs12)) 3197 return -EINVAL; 3198 3199 return 0; 3200 } 3201 nested_vmx_check_vmentry_hw(struct kvm_vcpu * vcpu)3202 static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) 3203 { 3204 struct vcpu_vmx *vmx = to_vmx(vcpu); 3205 unsigned long cr3, cr4; 3206 bool vm_fail; 3207 3208 if (!nested_early_check) 3209 return 0; 3210 3211 if (vmx->msr_autoload.host.nr) 3212 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); 3213 if (vmx->msr_autoload.guest.nr) 3214 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); 3215 3216 preempt_disable(); 3217 3218 vmx_prepare_switch_to_guest(vcpu); 3219 3220 /* 3221 * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS, 3222 * which is reserved to '1' by hardware. GUEST_RFLAGS is guaranteed to 3223 * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e. 3224 * there is no need to preserve other bits or save/restore the field. 3225 */ 3226 vmcs_writel(GUEST_RFLAGS, 0); 3227 3228 cr3 = __get_current_cr3_fast(); 3229 if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { 3230 vmcs_writel(HOST_CR3, cr3); 3231 vmx->loaded_vmcs->host_state.cr3 = cr3; 3232 } 3233 3234 cr4 = cr4_read_shadow(); 3235 if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { 3236 vmcs_writel(HOST_CR4, cr4); 3237 vmx->loaded_vmcs->host_state.cr4 = cr4; 3238 } 3239 3240 vm_fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs, 3241 __vmx_vcpu_run_flags(vmx)); 3242 3243 if (vmx->msr_autoload.host.nr) 3244 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 3245 if (vmx->msr_autoload.guest.nr) 3246 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 3247 3248 if (vm_fail) { 3249 u32 error = vmcs_read32(VM_INSTRUCTION_ERROR); 3250 3251 preempt_enable(); 3252 3253 trace_kvm_nested_vmenter_failed( 3254 "early hardware check VM-instruction error: ", error); 3255 WARN_ON_ONCE(error != VMXERR_ENTRY_INVALID_CONTROL_FIELD); 3256 return 1; 3257 } 3258 3259 /* 3260 * VMExit clears RFLAGS.IF and DR7, even on a consistency check. 3261 */ 3262 if (hw_breakpoint_active()) 3263 set_debugreg(__this_cpu_read(cpu_dr7), 7); 3264 local_irq_enable(); 3265 preempt_enable(); 3266 3267 /* 3268 * A non-failing VMEntry means we somehow entered guest mode with 3269 * an illegal RIP, and that's just the tip of the iceberg. There 3270 * is no telling what memory has been modified or what state has 3271 * been exposed to unknown code. Hitting this all but guarantees 3272 * a (very critical) hardware issue. 
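	 *
	 * The WARN below checks that the recorded exit reason has the
	 * "failed VM-entry" bit set, i.e. that the early VM-Enter really did
	 * bounce back with a consistency-check VM-Exit instead of running L2.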
	 */
	WARN_ON(!(vmcs_read32(VM_EXIT_REASON) &
		  VMX_EXIT_REASONS_FAILED_VMENTRY));

	return 0;
}

#ifdef CONFIG_KVM_HYPERV
static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * hv_evmcs may end up not being mapped after migration (when
	 * L2 was running), map it here to make sure vmcs12 changes are
	 * properly reflected.
	 */
	if (guest_cpu_cap_has_evmcs(vcpu) &&
	    vmx->nested.hv_evmcs_vmptr == EVMPTR_MAP_PENDING) {
		enum nested_evmptrld_status evmptrld_status =
			nested_vmx_handle_enlightened_vmptrld(vcpu, false);

		if (evmptrld_status == EVMPTRLD_VMFAIL ||
		    evmptrld_status == EVMPTRLD_ERROR)
			return false;

		/*
		 * Post migration VMCS12 always provides the most up-to-date
		 * information, copy it to eVMCS upon entry.
		 */
		vmx->nested.need_vmcs12_to_shadow_sync = true;
	}

	return true;
}
#endif

static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct kvm_host_map *map;

	if (!vcpu->arch.pdptrs_from_userspace &&
	    !nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) {
		/*
		 * Reload the guest's PDPTRs since after a migration
		 * the guest CR3 might be restored prior to setting the nested
		 * state, which can lead to a load of wrong PDPTRs.
		 */
		if (CC(!load_pdptrs(vcpu, vcpu->arch.cr3)))
			return false;
	}

	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
		map = &vmx->nested.apic_access_page_map;

		if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->apic_access_addr), map)) {
			vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(map->pfn));
		} else {
			pr_debug_ratelimited("%s: no backing for APIC-access address in vmcs12\n",
					     __func__);
			vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
			vcpu->run->internal.suberror =
				KVM_INTERNAL_ERROR_EMULATION;
			vcpu->run->internal.ndata = 0;
			return false;
		}
	}

	if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
		map = &vmx->nested.virtual_apic_map;

		if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) {
			vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn));
		} else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) &&
			   nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) &&
			   !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
			/*
			 * The processor will never use the TPR shadow, simply
			 * clear the bit from the execution control.  Such a
			 * configuration is useless, but it happens in tests.
			 * For any other configuration, failing the vm entry is
			 * _not_ what the processor does but it's basically the
			 * only possibility we have.
			 */
			exec_controls_clearbit(vmx, CPU_BASED_TPR_SHADOW);
		} else {
			/*
			 * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to
			 * force VM-Entry to fail.
3365 */ 3366 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, INVALID_GPA); 3367 } 3368 } 3369 3370 if (nested_cpu_has_posted_intr(vmcs12)) { 3371 map = &vmx->nested.pi_desc_map; 3372 3373 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) { 3374 vmx->nested.pi_desc = 3375 (struct pi_desc *)(((void *)map->hva) + 3376 offset_in_page(vmcs12->posted_intr_desc_addr)); 3377 vmcs_write64(POSTED_INTR_DESC_ADDR, 3378 pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr)); 3379 } else { 3380 /* 3381 * Defer the KVM_INTERNAL_EXIT until KVM tries to 3382 * access the contents of the VMCS12 posted interrupt 3383 * descriptor. (Note that KVM may do this when it 3384 * should not, per the architectural specification.) 3385 */ 3386 vmx->nested.pi_desc = NULL; 3387 pin_controls_clearbit(vmx, PIN_BASED_POSTED_INTR); 3388 } 3389 } 3390 if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12)) 3391 exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS); 3392 else 3393 exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS); 3394 3395 return true; 3396 } 3397 vmx_get_nested_state_pages(struct kvm_vcpu * vcpu)3398 static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu) 3399 { 3400 #ifdef CONFIG_KVM_HYPERV 3401 /* 3402 * Note: nested_get_evmcs_page() also updates 'vp_assist_page' copy 3403 * in 'struct kvm_vcpu_hv' in case eVMCS is in use, this is mandatory 3404 * to make nested_evmcs_l2_tlb_flush_enabled() work correctly post 3405 * migration. 3406 */ 3407 if (!nested_get_evmcs_page(vcpu)) { 3408 pr_debug_ratelimited("%s: enlightened vmptrld failed\n", 3409 __func__); 3410 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3411 vcpu->run->internal.suberror = 3412 KVM_INTERNAL_ERROR_EMULATION; 3413 vcpu->run->internal.ndata = 0; 3414 3415 return false; 3416 } 3417 #endif 3418 3419 if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu)) 3420 return false; 3421 3422 return true; 3423 } 3424 nested_vmx_write_pml_buffer(struct kvm_vcpu * vcpu,gpa_t gpa)3425 static int nested_vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa) 3426 { 3427 struct vmcs12 *vmcs12; 3428 struct vcpu_vmx *vmx = to_vmx(vcpu); 3429 gpa_t dst; 3430 3431 if (WARN_ON_ONCE(!is_guest_mode(vcpu))) 3432 return 0; 3433 3434 if (WARN_ON_ONCE(vmx->nested.pml_full)) 3435 return 1; 3436 3437 /* 3438 * Check if PML is enabled for the nested guest. Whether eptp bit 6 is 3439 * set is already checked as part of A/D emulation. 3440 */ 3441 vmcs12 = get_vmcs12(vcpu); 3442 if (!nested_cpu_has_pml(vmcs12)) 3443 return 0; 3444 3445 if (vmcs12->guest_pml_index >= PML_LOG_NR_ENTRIES) { 3446 vmx->nested.pml_full = true; 3447 return 1; 3448 } 3449 3450 gpa &= ~0xFFFull; 3451 dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index; 3452 3453 if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa, 3454 offset_in_page(dst), sizeof(gpa))) 3455 return 0; 3456 3457 vmcs12->guest_pml_index--; 3458 3459 return 0; 3460 } 3461 3462 /* 3463 * Intel's VMX Instruction Reference specifies a common set of prerequisites 3464 * for running VMX instructions (except VMXON, whose prerequisites are 3465 * slightly different). It also specifies what exception to inject otherwise. 3466 * Note that many of these exceptions have priority over VM exits, so they 3467 * don't have to be checked again here. 
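 *
 * The two prerequisites that do need software checks are emulated below:
 * #UD if the vCPU has not executed VMXON, and #GP(0) if CPL > 0.  In both
 * cases the exception is queued and 0 is returned, so the caller bails out
 * without emulating the instruction itself.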
3468 */ nested_vmx_check_permission(struct kvm_vcpu * vcpu)3469 static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) 3470 { 3471 if (!to_vmx(vcpu)->nested.vmxon) { 3472 kvm_queue_exception(vcpu, UD_VECTOR); 3473 return 0; 3474 } 3475 3476 if (vmx_get_cpl(vcpu)) { 3477 kvm_inject_gp(vcpu, 0); 3478 return 0; 3479 } 3480 3481 return 1; 3482 } 3483 3484 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, 3485 struct vmcs12 *vmcs12); 3486 3487 /* 3488 * If from_vmentry is false, this is being called from state restore (either RSM 3489 * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume. 3490 * 3491 * Returns: 3492 * NVMX_VMENTRY_SUCCESS: Entered VMX non-root mode 3493 * NVMX_VMENTRY_VMFAIL: Consistency check VMFail 3494 * NVMX_VMENTRY_VMEXIT: Consistency check VMExit 3495 * NVMX_VMENTRY_KVM_INTERNAL_ERROR: KVM internal error 3496 */ nested_vmx_enter_non_root_mode(struct kvm_vcpu * vcpu,bool from_vmentry)3497 enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, 3498 bool from_vmentry) 3499 { 3500 struct vcpu_vmx *vmx = to_vmx(vcpu); 3501 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3502 enum vm_entry_failure_code entry_failure_code; 3503 union vmx_exit_reason exit_reason = { 3504 .basic = EXIT_REASON_INVALID_STATE, 3505 .failed_vmentry = 1, 3506 }; 3507 u32 failed_index; 3508 3509 trace_kvm_nested_vmenter(kvm_rip_read(vcpu), 3510 vmx->nested.current_vmptr, 3511 vmcs12->guest_rip, 3512 vmcs12->guest_intr_status, 3513 vmcs12->vm_entry_intr_info_field, 3514 vmcs12->secondary_vm_exec_control & SECONDARY_EXEC_ENABLE_EPT, 3515 vmcs12->ept_pointer, 3516 vmcs12->guest_cr3, 3517 KVM_ISA_VMX); 3518 3519 kvm_service_local_tlb_flush_requests(vcpu); 3520 3521 if (!vmx->nested.nested_run_pending || 3522 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) 3523 vmx->nested.pre_vmenter_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); 3524 if (kvm_mpx_supported() && 3525 (!vmx->nested.nested_run_pending || 3526 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) 3527 vmx->nested.pre_vmenter_bndcfgs = vmcs_read64(GUEST_BNDCFGS); 3528 3529 /* 3530 * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and* 3531 * nested early checks are disabled. In the event of a "late" VM-Fail, 3532 * i.e. a VM-Fail detected by hardware but not KVM, KVM must unwind its 3533 * software model to the pre-VMEntry host state. When EPT is disabled, 3534 * GUEST_CR3 holds KVM's shadow CR3, not L1's "real" CR3, which causes 3535 * nested_vmx_restore_host_state() to corrupt vcpu->arch.cr3. Stuffing 3536 * vmcs01.GUEST_CR3 results in the unwind naturally setting arch.cr3 to 3537 * the correct value. Smashing vmcs01.GUEST_CR3 is safe because nested 3538 * VM-Exits, and the unwind, reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is 3539 * guaranteed to be overwritten with a shadow CR3 prior to re-entering 3540 * L1. Don't stuff vmcs01.GUEST_CR3 when using nested early checks as 3541 * KVM modifies vcpu->arch.cr3 if and only if the early hardware checks 3542 * pass, and early VM-Fails do not reset KVM's MMU, i.e. the VM-Fail 3543 * path would need to manually save/restore vmcs01.GUEST_CR3. 
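	 *
	 * In short: vmcs01.GUEST_CR3 is stuffed with vcpu->arch.cr3 below if
	 * and only if both EPT and nested early checks are disabled.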
3544 */ 3545 if (!enable_ept && !nested_early_check) 3546 vmcs_writel(GUEST_CR3, vcpu->arch.cr3); 3547 3548 vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02); 3549 3550 prepare_vmcs02_early(vmx, &vmx->vmcs01, vmcs12); 3551 3552 if (from_vmentry) { 3553 if (unlikely(!nested_get_vmcs12_pages(vcpu))) { 3554 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 3555 return NVMX_VMENTRY_KVM_INTERNAL_ERROR; 3556 } 3557 3558 if (nested_vmx_check_vmentry_hw(vcpu)) { 3559 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 3560 return NVMX_VMENTRY_VMFAIL; 3561 } 3562 3563 if (nested_vmx_check_guest_state(vcpu, vmcs12, 3564 &entry_failure_code)) { 3565 exit_reason.basic = EXIT_REASON_INVALID_STATE; 3566 vmcs12->exit_qualification = entry_failure_code; 3567 goto vmentry_fail_vmexit; 3568 } 3569 } 3570 3571 enter_guest_mode(vcpu); 3572 3573 if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &entry_failure_code)) { 3574 exit_reason.basic = EXIT_REASON_INVALID_STATE; 3575 vmcs12->exit_qualification = entry_failure_code; 3576 goto vmentry_fail_vmexit_guest_mode; 3577 } 3578 3579 if (from_vmentry) { 3580 failed_index = nested_vmx_load_msr(vcpu, 3581 vmcs12->vm_entry_msr_load_addr, 3582 vmcs12->vm_entry_msr_load_count); 3583 if (failed_index) { 3584 exit_reason.basic = EXIT_REASON_MSR_LOAD_FAIL; 3585 vmcs12->exit_qualification = failed_index; 3586 goto vmentry_fail_vmexit_guest_mode; 3587 } 3588 } else { 3589 /* 3590 * The MMU is not initialized to point at the right entities yet and 3591 * "get pages" would need to read data from the guest (i.e. we will 3592 * need to perform gpa to hpa translation). Request a call 3593 * to nested_get_vmcs12_pages before the next VM-entry. The MSRs 3594 * have already been set at vmentry time and should not be reset. 3595 */ 3596 kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); 3597 } 3598 3599 /* 3600 * Re-evaluate pending events if L1 had a pending IRQ/NMI/INIT/SIPI 3601 * when it executed VMLAUNCH/VMRESUME, as entering non-root mode can 3602 * effectively unblock various events, e.g. INIT/SIPI cause VM-Exit 3603 * unconditionally. Take care to pull data from vmcs01 as appropriate, 3604 * e.g. when checking for interrupt windows, as vmcs02 is now loaded. 3605 */ 3606 if ((__exec_controls_get(&vmx->vmcs01) & (CPU_BASED_INTR_WINDOW_EXITING | 3607 CPU_BASED_NMI_WINDOW_EXITING)) || 3608 kvm_apic_has_pending_init_or_sipi(vcpu) || 3609 kvm_apic_has_interrupt(vcpu)) 3610 kvm_make_request(KVM_REQ_EVENT, vcpu); 3611 3612 /* 3613 * Do not start the preemption timer hrtimer until after we know 3614 * we are successful, so that only nested_vmx_vmexit needs to cancel 3615 * the timer. 3616 */ 3617 vmx->nested.preemption_timer_expired = false; 3618 if (nested_cpu_has_preemption_timer(vmcs12)) { 3619 u64 timer_value = vmx_calc_preemption_timer_value(vcpu); 3620 vmx_start_preemption_timer(vcpu, timer_value); 3621 } 3622 3623 /* 3624 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point 3625 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet 3626 * returned as far as L1 is concerned. It will only return (and set 3627 * the success flag) when L2 exits (see nested_vmx_vmexit()). 3628 */ 3629 return NVMX_VMENTRY_SUCCESS; 3630 3631 /* 3632 * A failed consistency check that leads to a VMExit during L1's 3633 * VMEnter to L2 is a variation of a normal VMexit, as explained in 3634 * 26.7 "VM-entry failures during or after loading guest state". 
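	 *
	 * vmentry_fail_vmexit_guest_mode additionally unwinds the TSC offset
	 * and leaves guest mode before falling into the common handling,
	 * which switches back to vmcs01 and, for a real VMLAUNCH/VMRESUME
	 * (from_vmentry), loads vmcs12's host state and records the synthetic
	 * exit reason so that L1 observes the failed VM-Entry as a VM-Exit.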
	 */
vmentry_fail_vmexit_guest_mode:
	if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING)
		vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
	leave_guest_mode(vcpu);

vmentry_fail_vmexit:
	vmx_switch_vmcs(vcpu, &vmx->vmcs01);

	if (!from_vmentry)
		return NVMX_VMENTRY_VMEXIT;

	load_vmcs12_host_state(vcpu, vmcs12);
	vmcs12->vm_exit_reason = exit_reason.full;
	if (enable_shadow_vmcs || nested_vmx_is_evmptr12_valid(vmx))
		vmx->nested.need_vmcs12_to_shadow_sync = true;
	return NVMX_VMENTRY_VMEXIT;
}

/*
 * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1
 * for running an L2 nested guest.
 */
static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
{
	struct vmcs12 *vmcs12;
	enum nvmx_vmentry_status status;
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu);
	enum nested_evmptrld_status evmptrld_status;

	if (!nested_vmx_check_permission(vcpu))
		return 1;

	evmptrld_status = nested_vmx_handle_enlightened_vmptrld(vcpu, launch);
	if (evmptrld_status == EVMPTRLD_ERROR) {
		kvm_queue_exception(vcpu, UD_VECTOR);
		return 1;
	}

	kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED);

	if (CC(evmptrld_status == EVMPTRLD_VMFAIL))
		return nested_vmx_failInvalid(vcpu);

	if (CC(!nested_vmx_is_evmptr12_valid(vmx) &&
	       vmx->nested.current_vmptr == INVALID_GPA))
		return nested_vmx_failInvalid(vcpu);

	vmcs12 = get_vmcs12(vcpu);

	/*
	 * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact
	 * that there *is* a valid VMCS pointer, RFLAGS.CF is set
	 * rather than RFLAGS.ZF, and no error number is stored to the
	 * VM-instruction error field.
	 */
	if (CC(vmcs12->hdr.shadow_vmcs))
		return nested_vmx_failInvalid(vcpu);

	if (nested_vmx_is_evmptr12_valid(vmx)) {
		struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx);

		copy_enlightened_to_vmcs12(vmx, evmcs->hv_clean_fields);
		/* Enlightened VMCS doesn't have launch state */
		vmcs12->launch_state = !launch;
	} else if (enable_shadow_vmcs) {
		copy_shadow_to_vmcs12(vmx);
	}

	/*
	 * The nested entry process starts with enforcing various prerequisites
	 * on vmcs12 as required by the Intel SDM, and acting appropriately
	 * when they fail: As the SDM explains, some conditions should cause
	 * the instruction to fail, while others will cause the instruction to
	 * seem to succeed, but return an EXIT_REASON_INVALID_STATE.
	 * To speed up the normal (success) code path, we should avoid checking
	 * for misconfigurations which will be caught by the processor anyway
	 * when using the merged vmcs02.
	 */
	if (CC(interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS))
		return nested_vmx_fail(vcpu, VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS);

	if (CC(vmcs12->launch_state == launch))
		return nested_vmx_fail(vcpu,
			launch ?
VMXERR_VMLAUNCH_NONCLEAR_VMCS 3721 : VMXERR_VMRESUME_NONLAUNCHED_VMCS); 3722 3723 if (nested_vmx_check_controls(vcpu, vmcs12)) 3724 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 3725 3726 if (nested_vmx_check_address_space_size(vcpu, vmcs12)) 3727 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); 3728 3729 if (nested_vmx_check_host_state(vcpu, vmcs12)) 3730 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); 3731 3732 /* 3733 * We're finally done with prerequisite checking, and can start with 3734 * the nested entry. 3735 */ 3736 vmx->nested.nested_run_pending = 1; 3737 vmx->nested.has_preemption_timer_deadline = false; 3738 status = nested_vmx_enter_non_root_mode(vcpu, true); 3739 if (unlikely(status != NVMX_VMENTRY_SUCCESS)) 3740 goto vmentry_failed; 3741 3742 /* Hide L1D cache contents from the nested guest. */ 3743 vmx->vcpu.arch.l1tf_flush_l1d = true; 3744 3745 /* 3746 * Must happen outside of nested_vmx_enter_non_root_mode() as it will 3747 * also be used as part of restoring nVMX state for 3748 * snapshot restore (migration). 3749 * 3750 * In this flow, it is assumed that vmcs12 cache was 3751 * transferred as part of captured nVMX state and should 3752 * therefore not be read from guest memory (which may not 3753 * exist on destination host yet). 3754 */ 3755 nested_cache_shadow_vmcs12(vcpu, vmcs12); 3756 3757 switch (vmcs12->guest_activity_state) { 3758 case GUEST_ACTIVITY_HLT: 3759 /* 3760 * If we're entering a halted L2 vcpu and the L2 vcpu won't be 3761 * awakened by event injection or by an NMI-window VM-exit or 3762 * by an interrupt-window VM-exit, halt the vcpu. 3763 */ 3764 if (!(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) && 3765 !nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING) && 3766 !(nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING) && 3767 (vmcs12->guest_rflags & X86_EFLAGS_IF))) { 3768 vmx->nested.nested_run_pending = 0; 3769 return kvm_emulate_halt_noskip(vcpu); 3770 } 3771 break; 3772 case GUEST_ACTIVITY_WAIT_SIPI: 3773 vmx->nested.nested_run_pending = 0; 3774 kvm_set_mp_state(vcpu, KVM_MP_STATE_INIT_RECEIVED); 3775 break; 3776 default: 3777 break; 3778 } 3779 3780 return 1; 3781 3782 vmentry_failed: 3783 vmx->nested.nested_run_pending = 0; 3784 if (status == NVMX_VMENTRY_KVM_INTERNAL_ERROR) 3785 return 0; 3786 if (status == NVMX_VMENTRY_VMEXIT) 3787 return 1; 3788 WARN_ON_ONCE(status != NVMX_VMENTRY_VMFAIL); 3789 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 3790 } 3791 3792 /* 3793 * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date 3794 * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK). 3795 * This function returns the new value we should put in vmcs12.guest_cr0. 3796 * It's not enough to just return the vmcs02 GUEST_CR0. Rather, 3797 * 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now 3798 * available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0 3799 * didn't trap the bit, because if L1 did, so would L0). 3800 * 2. Bits that L1 asked to trap (and therefore L0 also did) could not have 3801 * been modified by L2, and L1 knows it. So just leave the old value of 3802 * the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0 3803 * isn't relevant, because if L0 traps this bit it can set it to anything. 3804 * 3. Bits that L1 didn't trap, but L0 did. 
L1 believes the guest could have 3805 * changed these bits, and therefore they need to be updated, but L0 3806 * didn't necessarily allow them to be changed in GUEST_CR0 - and rather 3807 * put them in vmcs02 CR0_READ_SHADOW. So take these bits from there. 3808 */ 3809 static inline unsigned long vmcs12_guest_cr0(struct kvm_vcpu * vcpu,struct vmcs12 * vmcs12)3810 vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 3811 { 3812 return 3813 /*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) | 3814 /*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) | 3815 /*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask | 3816 vcpu->arch.cr0_guest_owned_bits)); 3817 } 3818 3819 static inline unsigned long vmcs12_guest_cr4(struct kvm_vcpu * vcpu,struct vmcs12 * vmcs12)3820 vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 3821 { 3822 return 3823 /*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) | 3824 /*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) | 3825 /*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask | 3826 vcpu->arch.cr4_guest_owned_bits)); 3827 } 3828 vmcs12_save_pending_event(struct kvm_vcpu * vcpu,struct vmcs12 * vmcs12,u32 vm_exit_reason,u32 exit_intr_info)3829 static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, 3830 struct vmcs12 *vmcs12, 3831 u32 vm_exit_reason, u32 exit_intr_info) 3832 { 3833 u32 idt_vectoring; 3834 unsigned int nr; 3835 3836 /* 3837 * Per the SDM, VM-Exits due to double and triple faults are never 3838 * considered to occur during event delivery, even if the double/triple 3839 * fault is the result of an escalating vectoring issue. 3840 * 3841 * Note, the SDM qualifies the double fault behavior with "The original 3842 * event results in a double-fault exception". It's unclear why the 3843 * qualification exists since exits due to double fault can occur only 3844 * while vectoring a different exception (injected events are never 3845 * subject to interception), i.e. there's _always_ an original event. 3846 * 3847 * The SDM also uses NMI as a confusing example for the "original event 3848 * causes the VM exit directly" clause. NMI isn't special in any way, 3849 * the same rule applies to all events that cause an exit directly. 3850 * NMI is an odd choice for the example because NMIs can only occur on 3851 * instruction boundaries, i.e. they _can't_ occur during vectoring. 
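	 *
	 * Thus: clear idt_vectoring_info_field for triple-fault VM-Exits and
	 * for exception VM-Exits due to #DF, and otherwise rebuild it from
	 * whichever event KVM had injected (exception, NMI, or interrupt),
	 * including the event type and any error code or instruction length.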
3852 */ 3853 if ((u16)vm_exit_reason == EXIT_REASON_TRIPLE_FAULT || 3854 ((u16)vm_exit_reason == EXIT_REASON_EXCEPTION_NMI && 3855 is_double_fault(exit_intr_info))) { 3856 vmcs12->idt_vectoring_info_field = 0; 3857 } else if (vcpu->arch.exception.injected) { 3858 nr = vcpu->arch.exception.vector; 3859 idt_vectoring = nr | VECTORING_INFO_VALID_MASK; 3860 3861 if (kvm_exception_is_soft(nr)) { 3862 vmcs12->vm_exit_instruction_len = 3863 vcpu->arch.event_exit_inst_len; 3864 idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION; 3865 } else 3866 idt_vectoring |= INTR_TYPE_HARD_EXCEPTION; 3867 3868 if (vcpu->arch.exception.has_error_code) { 3869 idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK; 3870 vmcs12->idt_vectoring_error_code = 3871 vcpu->arch.exception.error_code; 3872 } 3873 3874 vmcs12->idt_vectoring_info_field = idt_vectoring; 3875 } else if (vcpu->arch.nmi_injected) { 3876 vmcs12->idt_vectoring_info_field = 3877 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR; 3878 } else if (vcpu->arch.interrupt.injected) { 3879 nr = vcpu->arch.interrupt.nr; 3880 idt_vectoring = nr | VECTORING_INFO_VALID_MASK; 3881 3882 if (vcpu->arch.interrupt.soft) { 3883 idt_vectoring |= INTR_TYPE_SOFT_INTR; 3884 vmcs12->vm_entry_instruction_len = 3885 vcpu->arch.event_exit_inst_len; 3886 } else 3887 idt_vectoring |= INTR_TYPE_EXT_INTR; 3888 3889 vmcs12->idt_vectoring_info_field = idt_vectoring; 3890 } else { 3891 vmcs12->idt_vectoring_info_field = 0; 3892 } 3893 } 3894 3895 nested_mark_vmcs12_pages_dirty(struct kvm_vcpu * vcpu)3896 void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu) 3897 { 3898 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3899 gfn_t gfn; 3900 3901 /* 3902 * Don't need to mark the APIC access page dirty; it is never 3903 * written to by the CPU during APIC virtualization. 
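	 *
	 * The virtual-APIC page and the posted-interrupt descriptor, on the
	 * other hand, are written during virtualization (e.g. virtualized TPR
	 * updates and posted-interrupt processing), so mark them dirty below
	 * when the corresponding controls are enabled.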
3904 */ 3905 3906 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { 3907 gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT; 3908 kvm_vcpu_mark_page_dirty(vcpu, gfn); 3909 } 3910 3911 if (nested_cpu_has_posted_intr(vmcs12)) { 3912 gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT; 3913 kvm_vcpu_mark_page_dirty(vcpu, gfn); 3914 } 3915 } 3916 vmx_complete_nested_posted_interrupt(struct kvm_vcpu * vcpu)3917 static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) 3918 { 3919 struct vcpu_vmx *vmx = to_vmx(vcpu); 3920 int max_irr; 3921 void *vapic_page; 3922 u16 status; 3923 3924 if (!vmx->nested.pi_pending) 3925 return 0; 3926 3927 if (!vmx->nested.pi_desc) 3928 goto mmio_needed; 3929 3930 vmx->nested.pi_pending = false; 3931 3932 if (!pi_test_and_clear_on(vmx->nested.pi_desc)) 3933 return 0; 3934 3935 max_irr = pi_find_highest_vector(vmx->nested.pi_desc); 3936 if (max_irr > 0) { 3937 vapic_page = vmx->nested.virtual_apic_map.hva; 3938 if (!vapic_page) 3939 goto mmio_needed; 3940 3941 __kvm_apic_update_irr(vmx->nested.pi_desc->pir, 3942 vapic_page, &max_irr); 3943 status = vmcs_read16(GUEST_INTR_STATUS); 3944 if ((u8)max_irr > ((u8)status & 0xff)) { 3945 status &= ~0xff; 3946 status |= (u8)max_irr; 3947 vmcs_write16(GUEST_INTR_STATUS, status); 3948 } 3949 } 3950 3951 nested_mark_vmcs12_pages_dirty(vcpu); 3952 return 0; 3953 3954 mmio_needed: 3955 kvm_handle_memory_failure(vcpu, X86EMUL_IO_NEEDED, NULL); 3956 return -ENXIO; 3957 } 3958 nested_vmx_inject_exception_vmexit(struct kvm_vcpu * vcpu)3959 static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu) 3960 { 3961 struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit; 3962 u32 intr_info = ex->vector | INTR_INFO_VALID_MASK; 3963 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3964 unsigned long exit_qual; 3965 3966 if (ex->has_payload) { 3967 exit_qual = ex->payload; 3968 } else if (ex->vector == PF_VECTOR) { 3969 exit_qual = vcpu->arch.cr2; 3970 } else if (ex->vector == DB_VECTOR) { 3971 exit_qual = vcpu->arch.dr6; 3972 exit_qual &= ~DR6_BT; 3973 exit_qual ^= DR6_ACTIVE_LOW; 3974 } else { 3975 exit_qual = 0; 3976 } 3977 3978 /* 3979 * Unlike AMD's Paged Real Mode, which reports an error code on #PF 3980 * VM-Exits even if the CPU is in Real Mode, Intel VMX never sets the 3981 * "has error code" flags on VM-Exit if the CPU is in Real Mode. 3982 */ 3983 if (ex->has_error_code && is_protmode(vcpu)) { 3984 /* 3985 * Intel CPUs do not generate error codes with bits 31:16 set, 3986 * and more importantly VMX disallows setting bits 31:16 in the 3987 * injected error code for VM-Entry. Drop the bits to mimic 3988 * hardware and avoid inducing failure on nested VM-Entry if L1 3989 * chooses to inject the exception back to L2. AMD CPUs _do_ 3990 * generate "full" 32-bit error codes, so KVM allows userspace 3991 * to inject exception error codes with bits 31:16 set. 3992 */ 3993 vmcs12->vm_exit_intr_error_code = (u16)ex->error_code; 3994 intr_info |= INTR_INFO_DELIVER_CODE_MASK; 3995 } 3996 3997 if (kvm_exception_is_soft(ex->vector)) 3998 intr_info |= INTR_TYPE_SOFT_EXCEPTION; 3999 else 4000 intr_info |= INTR_TYPE_HARD_EXCEPTION; 4001 4002 if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) && 4003 vmx_get_nmi_mask(vcpu)) 4004 intr_info |= INTR_INFO_UNBLOCK_NMI; 4005 4006 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual); 4007 } 4008 4009 /* 4010 * Returns true if a debug trap is (likely) pending delivery. Infer the class 4011 * of a #DB (trap-like vs. 
fault-like) from the exception payload (to-be-DR6). 4012 * Using the payload is flawed because code breakpoints (fault-like) and data 4013 * breakpoints (trap-like) set the same bits in DR6 (breakpoint detected), i.e. 4014 * this will return false positives if a to-be-injected code breakpoint #DB is 4015 * pending (from KVM's perspective, but not "pending" across an instruction 4016 * boundary). ICEBP, a.k.a. INT1, is also not reflected here even though it 4017 * too is trap-like. 4018 * 4019 * KVM "works" despite these flaws as ICEBP isn't currently supported by the 4020 * emulator, Monitor Trap Flag is not marked pending on intercepted #DBs (the 4021 * #DB has already happened), and MTF isn't marked pending on code breakpoints 4022 * from the emulator (because such #DBs are fault-like and thus don't trigger 4023 * actions that fire on instruction retire). 4024 */ vmx_get_pending_dbg_trap(struct kvm_queued_exception * ex)4025 static unsigned long vmx_get_pending_dbg_trap(struct kvm_queued_exception *ex) 4026 { 4027 if (!ex->pending || ex->vector != DB_VECTOR) 4028 return 0; 4029 4030 /* General Detect #DBs are always fault-like. */ 4031 return ex->payload & ~DR6_BD; 4032 } 4033 4034 /* 4035 * Returns true if there's a pending #DB exception that is lower priority than 4036 * a pending Monitor Trap Flag VM-Exit. TSS T-flag #DBs are not emulated by 4037 * KVM, but could theoretically be injected by userspace. Note, this code is 4038 * imperfect, see above. 4039 */ vmx_is_low_priority_db_trap(struct kvm_queued_exception * ex)4040 static bool vmx_is_low_priority_db_trap(struct kvm_queued_exception *ex) 4041 { 4042 return vmx_get_pending_dbg_trap(ex) & ~DR6_BT; 4043 } 4044 4045 /* 4046 * Certain VM-exits set the 'pending debug exceptions' field to indicate a 4047 * recognized #DB (data or single-step) that has yet to be delivered. Since KVM 4048 * represents these debug traps with a payload that is said to be compatible 4049 * with the 'pending debug exceptions' field, write the payload to the VMCS 4050 * field if a VM-exit is delivered before the debug trap. 4051 */ nested_vmx_update_pending_dbg(struct kvm_vcpu * vcpu)4052 static void nested_vmx_update_pending_dbg(struct kvm_vcpu *vcpu) 4053 { 4054 unsigned long pending_dbg; 4055 4056 pending_dbg = vmx_get_pending_dbg_trap(&vcpu->arch.exception); 4057 if (pending_dbg) 4058 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, pending_dbg); 4059 } 4060 nested_vmx_preemption_timer_pending(struct kvm_vcpu * vcpu)4061 static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu) 4062 { 4063 return nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) && 4064 to_vmx(vcpu)->nested.preemption_timer_expired; 4065 } 4066 vmx_has_nested_events(struct kvm_vcpu * vcpu,bool for_injection)4067 static bool vmx_has_nested_events(struct kvm_vcpu *vcpu, bool for_injection) 4068 { 4069 struct vcpu_vmx *vmx = to_vmx(vcpu); 4070 void *vapic = vmx->nested.virtual_apic_map.hva; 4071 int max_irr, vppr; 4072 4073 if (nested_vmx_preemption_timer_pending(vcpu) || 4074 vmx->nested.mtf_pending) 4075 return true; 4076 4077 /* 4078 * Virtual Interrupt Delivery doesn't require manual injection. Either 4079 * the interrupt is already in GUEST_RVI and will be recognized by CPU 4080 * at VM-Entry, or there is a KVM_REQ_EVENT pending and KVM will move 4081 * the interrupt from the PIR to RVI prior to entering the guest. 
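	 *
	 * The remainder of this function therefore only matters when the
	 * caller isn't asking about injectable events: it approximates the
	 * CPU's evaluation of pending virtual interrupts by comparing
	 * priority classes, i.e. (RVI & 0xf0) > (VPPR & 0xf0), first against
	 * vmcs02's RVI and then against the highest vector in the PIR.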
4082 */ 4083 if (for_injection) 4084 return false; 4085 4086 if (!nested_cpu_has_vid(get_vmcs12(vcpu)) || 4087 __vmx_interrupt_blocked(vcpu)) 4088 return false; 4089 4090 if (!vapic) 4091 return false; 4092 4093 vppr = *((u32 *)(vapic + APIC_PROCPRI)); 4094 4095 max_irr = vmx_get_rvi(); 4096 if ((max_irr & 0xf0) > (vppr & 0xf0)) 4097 return true; 4098 4099 if (vmx->nested.pi_pending && vmx->nested.pi_desc && 4100 pi_test_on(vmx->nested.pi_desc)) { 4101 max_irr = pi_find_highest_vector(vmx->nested.pi_desc); 4102 if (max_irr > 0 && (max_irr & 0xf0) > (vppr & 0xf0)) 4103 return true; 4104 } 4105 4106 return false; 4107 } 4108 4109 /* 4110 * Per the Intel SDM's table "Priority Among Concurrent Events", with minor 4111 * edits to fill in missing examples, e.g. #DB due to split-lock accesses, 4112 * and less minor edits to splice in the priority of VMX Non-Root specific 4113 * events, e.g. MTF and NMI/INTR-window exiting. 4114 * 4115 * 1 Hardware Reset and Machine Checks 4116 * - RESET 4117 * - Machine Check 4118 * 4119 * 2 Trap on Task Switch 4120 * - T flag in TSS is set (on task switch) 4121 * 4122 * 3 External Hardware Interventions 4123 * - FLUSH 4124 * - STOPCLK 4125 * - SMI 4126 * - INIT 4127 * 4128 * 3.5 Monitor Trap Flag (MTF) VM-exit[1] 4129 * 4130 * 4 Traps on Previous Instruction 4131 * - Breakpoints 4132 * - Trap-class Debug Exceptions (#DB due to TF flag set, data/I-O 4133 * breakpoint, or #DB due to a split-lock access) 4134 * 4135 * 4.3 VMX-preemption timer expired VM-exit 4136 * 4137 * 4.6 NMI-window exiting VM-exit[2] 4138 * 4139 * 5 Nonmaskable Interrupts (NMI) 4140 * 4141 * 5.5 Interrupt-window exiting VM-exit and Virtual-interrupt delivery 4142 * 4143 * 6 Maskable Hardware Interrupts 4144 * 4145 * 7 Code Breakpoint Fault 4146 * 4147 * 8 Faults from Fetching Next Instruction 4148 * - Code-Segment Limit Violation 4149 * - Code Page Fault 4150 * - Control protection exception (missing ENDBRANCH at target of indirect 4151 * call or jump) 4152 * 4153 * 9 Faults from Decoding Next Instruction 4154 * - Instruction length > 15 bytes 4155 * - Invalid Opcode 4156 * - Coprocessor Not Available 4157 * 4158 *10 Faults on Executing Instruction 4159 * - Overflow 4160 * - Bound error 4161 * - Invalid TSS 4162 * - Segment Not Present 4163 * - Stack fault 4164 * - General Protection 4165 * - Data Page Fault 4166 * - Alignment Check 4167 * - x86 FPU Floating-point exception 4168 * - SIMD floating-point exception 4169 * - Virtualization exception 4170 * - Control protection exception 4171 * 4172 * [1] Per the "Monitor Trap Flag" section: System-management interrupts (SMIs), 4173 * INIT signals, and higher priority events take priority over MTF VM exits. 4174 * MTF VM exits take priority over debug-trap exceptions and lower priority 4175 * events. 4176 * 4177 * [2] Debug-trap exceptions and higher priority events take priority over VM exits 4178 * caused by the VMX-preemption timer. VM exits caused by the VMX-preemption 4179 * timer take priority over VM exits caused by the "NMI-window exiting" 4180 * VM-execution control and lower priority events. 4181 * 4182 * [3] Debug-trap exceptions and higher priority events take priority over VM exits 4183 * caused by "NMI-window exiting". VM exits caused by this control take 4184 * priority over non-maskable interrupts (NMIs) and lower priority events. 4185 * 4186 * [4] Virtual-interrupt delivery has the same priority as that of VM exits due to 4187 * the 1-setting of the "interrupt-window exiting" VM-execution control. 
Thus, 4188 * non-maskable interrupts (NMIs) and higher priority events take priority over 4189 * delivery of a virtual interrupt; delivery of a virtual interrupt takes 4190 * priority over external interrupts and lower priority events. 4191 */ vmx_check_nested_events(struct kvm_vcpu * vcpu)4192 static int vmx_check_nested_events(struct kvm_vcpu *vcpu) 4193 { 4194 struct kvm_lapic *apic = vcpu->arch.apic; 4195 struct vcpu_vmx *vmx = to_vmx(vcpu); 4196 /* 4197 * Only a pending nested run blocks a pending exception. If there is a 4198 * previously injected event, the pending exception occurred while said 4199 * event was being delivered and thus needs to be handled. 4200 */ 4201 bool block_nested_exceptions = vmx->nested.nested_run_pending; 4202 /* 4203 * Events that don't require injection, i.e. that are virtualized by 4204 * hardware, aren't blocked by a pending VM-Enter as KVM doesn't need 4205 * to regain control in order to deliver the event, and hardware will 4206 * handle event ordering, e.g. with respect to injected exceptions. 4207 * 4208 * But, new events (not exceptions) are only recognized at instruction 4209 * boundaries. If an event needs reinjection, then KVM is handling a 4210 * VM-Exit that occurred _during_ instruction execution; new events, 4211 * irrespective of whether or not they're injected, are blocked until 4212 * the instruction completes. 4213 */ 4214 bool block_non_injected_events = kvm_event_needs_reinjection(vcpu); 4215 /* 4216 * Inject events are blocked by nested VM-Enter, as KVM is responsible 4217 * for managing priority between concurrent events, i.e. KVM needs to 4218 * wait until after VM-Enter completes to deliver injected events. 4219 */ 4220 bool block_nested_events = block_nested_exceptions || 4221 block_non_injected_events; 4222 4223 if (lapic_in_kernel(vcpu) && 4224 test_bit(KVM_APIC_INIT, &apic->pending_events)) { 4225 if (block_nested_events) 4226 return -EBUSY; 4227 nested_vmx_update_pending_dbg(vcpu); 4228 clear_bit(KVM_APIC_INIT, &apic->pending_events); 4229 if (vcpu->arch.mp_state != KVM_MP_STATE_INIT_RECEIVED) 4230 nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0); 4231 4232 /* MTF is discarded if the vCPU is in WFS. */ 4233 vmx->nested.mtf_pending = false; 4234 return 0; 4235 } 4236 4237 if (lapic_in_kernel(vcpu) && 4238 test_bit(KVM_APIC_SIPI, &apic->pending_events)) { 4239 if (block_nested_events) 4240 return -EBUSY; 4241 4242 clear_bit(KVM_APIC_SIPI, &apic->pending_events); 4243 if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { 4244 nested_vmx_vmexit(vcpu, EXIT_REASON_SIPI_SIGNAL, 0, 4245 apic->sipi_vector & 0xFFUL); 4246 return 0; 4247 } 4248 /* Fallthrough, the SIPI is completely ignored. */ 4249 } 4250 4251 /* 4252 * Process exceptions that are higher priority than Monitor Trap Flag: 4253 * fault-like exceptions, TSS T flag #DB (not emulated by KVM, but 4254 * could theoretically come in from userspace), and ICEBP (INT1). 4255 * 4256 * TODO: SMIs have higher priority than MTF and trap-like #DBs (except 4257 * for TSS T flag #DBs). KVM also doesn't save/restore pending MTF 4258 * across SMI/RSM as it should; that needs to be addressed in order to 4259 * prioritize SMI over MTF and trap-like #DBs. 
4260 */ 4261 if (vcpu->arch.exception_vmexit.pending && 4262 !vmx_is_low_priority_db_trap(&vcpu->arch.exception_vmexit)) { 4263 if (block_nested_exceptions) 4264 return -EBUSY; 4265 4266 nested_vmx_inject_exception_vmexit(vcpu); 4267 return 0; 4268 } 4269 4270 if (vcpu->arch.exception.pending && 4271 !vmx_is_low_priority_db_trap(&vcpu->arch.exception)) { 4272 if (block_nested_exceptions) 4273 return -EBUSY; 4274 goto no_vmexit; 4275 } 4276 4277 if (vmx->nested.mtf_pending) { 4278 if (block_nested_events) 4279 return -EBUSY; 4280 nested_vmx_update_pending_dbg(vcpu); 4281 nested_vmx_vmexit(vcpu, EXIT_REASON_MONITOR_TRAP_FLAG, 0, 0); 4282 return 0; 4283 } 4284 4285 if (vcpu->arch.exception_vmexit.pending) { 4286 if (block_nested_exceptions) 4287 return -EBUSY; 4288 4289 nested_vmx_inject_exception_vmexit(vcpu); 4290 return 0; 4291 } 4292 4293 if (vcpu->arch.exception.pending) { 4294 if (block_nested_exceptions) 4295 return -EBUSY; 4296 goto no_vmexit; 4297 } 4298 4299 if (nested_vmx_preemption_timer_pending(vcpu)) { 4300 if (block_nested_events) 4301 return -EBUSY; 4302 nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0); 4303 return 0; 4304 } 4305 4306 if (vcpu->arch.smi_pending && !is_smm(vcpu)) { 4307 if (block_nested_events) 4308 return -EBUSY; 4309 goto no_vmexit; 4310 } 4311 4312 if (vcpu->arch.nmi_pending && !vmx_nmi_blocked(vcpu)) { 4313 if (block_nested_events) 4314 return -EBUSY; 4315 if (!nested_exit_on_nmi(vcpu)) 4316 goto no_vmexit; 4317 4318 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, 4319 NMI_VECTOR | INTR_TYPE_NMI_INTR | 4320 INTR_INFO_VALID_MASK, 0); 4321 /* 4322 * The NMI-triggered VM exit counts as injection: 4323 * clear this one and block further NMIs. 4324 */ 4325 vcpu->arch.nmi_pending = 0; 4326 vmx_set_nmi_mask(vcpu, true); 4327 return 0; 4328 } 4329 4330 if (kvm_cpu_has_interrupt(vcpu) && !vmx_interrupt_blocked(vcpu)) { 4331 int irq; 4332 4333 if (!nested_exit_on_intr(vcpu)) { 4334 if (block_nested_events) 4335 return -EBUSY; 4336 4337 goto no_vmexit; 4338 } 4339 4340 if (!nested_exit_intr_ack_set(vcpu)) { 4341 if (block_nested_events) 4342 return -EBUSY; 4343 4344 nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); 4345 return 0; 4346 } 4347 4348 irq = kvm_cpu_get_extint(vcpu); 4349 if (irq != -1) { 4350 if (block_nested_events) 4351 return -EBUSY; 4352 4353 nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 4354 INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | irq, 0); 4355 return 0; 4356 } 4357 4358 irq = kvm_apic_has_interrupt(vcpu); 4359 if (WARN_ON_ONCE(irq < 0)) 4360 goto no_vmexit; 4361 4362 /* 4363 * If the IRQ is L2's PI notification vector, process posted 4364 * interrupts for L2 instead of injecting VM-Exit, as the 4365 * detection/morphing architecturally occurs when the IRQ is 4366 * delivered to the CPU. Note, only interrupts that are routed 4367 * through the local APIC trigger posted interrupt processing, 4368 * and enabling posted interrupts requires ACK-on-exit. 4369 */ 4370 if (irq == vmx->nested.posted_intr_nv) { 4371 /* 4372 * Nested posted interrupts are delivered via RVI, i.e. 4373 * aren't injected by KVM, and so can be queued even if 4374 * manual event injection is disallowed. 
4375 */ 4376 if (block_non_injected_events) 4377 return -EBUSY; 4378 4379 vmx->nested.pi_pending = true; 4380 kvm_apic_clear_irr(vcpu, irq); 4381 goto no_vmexit; 4382 } 4383 4384 if (block_nested_events) 4385 return -EBUSY; 4386 4387 nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 4388 INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | irq, 0); 4389 4390 /* 4391 * ACK the interrupt _after_ emulating VM-Exit, as the IRQ must 4392 * be marked as in-service in vmcs01.GUEST_INTERRUPT_STATUS.SVI 4393 * if APICv is active. 4394 */ 4395 kvm_apic_ack_interrupt(vcpu, irq); 4396 return 0; 4397 } 4398 4399 no_vmexit: 4400 return vmx_complete_nested_posted_interrupt(vcpu); 4401 } 4402 vmx_get_preemption_timer_value(struct kvm_vcpu * vcpu)4403 static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) 4404 { 4405 ktime_t remaining = 4406 hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer); 4407 u64 value; 4408 4409 if (ktime_to_ns(remaining) <= 0) 4410 return 0; 4411 4412 value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz; 4413 do_div(value, 1000000); 4414 return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 4415 } 4416 is_vmcs12_ext_field(unsigned long field)4417 static bool is_vmcs12_ext_field(unsigned long field) 4418 { 4419 switch (field) { 4420 case GUEST_ES_SELECTOR: 4421 case GUEST_CS_SELECTOR: 4422 case GUEST_SS_SELECTOR: 4423 case GUEST_DS_SELECTOR: 4424 case GUEST_FS_SELECTOR: 4425 case GUEST_GS_SELECTOR: 4426 case GUEST_LDTR_SELECTOR: 4427 case GUEST_TR_SELECTOR: 4428 case GUEST_ES_LIMIT: 4429 case GUEST_CS_LIMIT: 4430 case GUEST_SS_LIMIT: 4431 case GUEST_DS_LIMIT: 4432 case GUEST_FS_LIMIT: 4433 case GUEST_GS_LIMIT: 4434 case GUEST_LDTR_LIMIT: 4435 case GUEST_TR_LIMIT: 4436 case GUEST_GDTR_LIMIT: 4437 case GUEST_IDTR_LIMIT: 4438 case GUEST_ES_AR_BYTES: 4439 case GUEST_DS_AR_BYTES: 4440 case GUEST_FS_AR_BYTES: 4441 case GUEST_GS_AR_BYTES: 4442 case GUEST_LDTR_AR_BYTES: 4443 case GUEST_TR_AR_BYTES: 4444 case GUEST_ES_BASE: 4445 case GUEST_CS_BASE: 4446 case GUEST_SS_BASE: 4447 case GUEST_DS_BASE: 4448 case GUEST_FS_BASE: 4449 case GUEST_GS_BASE: 4450 case GUEST_LDTR_BASE: 4451 case GUEST_TR_BASE: 4452 case GUEST_GDTR_BASE: 4453 case GUEST_IDTR_BASE: 4454 case GUEST_PENDING_DBG_EXCEPTIONS: 4455 case GUEST_BNDCFGS: 4456 return true; 4457 default: 4458 break; 4459 } 4460 4461 return false; 4462 } 4463 sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu * vcpu,struct vmcs12 * vmcs12)4464 static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, 4465 struct vmcs12 *vmcs12) 4466 { 4467 struct vcpu_vmx *vmx = to_vmx(vcpu); 4468 4469 vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR); 4470 vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR); 4471 vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR); 4472 vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR); 4473 vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR); 4474 vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR); 4475 vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR); 4476 vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR); 4477 vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT); 4478 vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT); 4479 vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT); 4480 vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT); 4481 vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT); 4482 vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT); 4483 vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT); 4484 
vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT); 4485 vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT); 4486 vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT); 4487 vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES); 4488 vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES); 4489 vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES); 4490 vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES); 4491 vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES); 4492 vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES); 4493 vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE); 4494 vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE); 4495 vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE); 4496 vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE); 4497 vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE); 4498 vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE); 4499 vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE); 4500 vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE); 4501 vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE); 4502 vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE); 4503 vmcs12->guest_pending_dbg_exceptions = 4504 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); 4505 4506 vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false; 4507 } 4508 copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu * vcpu,struct vmcs12 * vmcs12)4509 static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, 4510 struct vmcs12 *vmcs12) 4511 { 4512 struct vcpu_vmx *vmx = to_vmx(vcpu); 4513 int cpu; 4514 4515 if (!vmx->nested.need_sync_vmcs02_to_vmcs12_rare) 4516 return; 4517 4518 4519 WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01); 4520 4521 cpu = get_cpu(); 4522 vmx->loaded_vmcs = &vmx->nested.vmcs02; 4523 vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->vmcs01); 4524 4525 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 4526 4527 vmx->loaded_vmcs = &vmx->vmcs01; 4528 vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->nested.vmcs02); 4529 put_cpu(); 4530 } 4531 4532 /* 4533 * Update the guest state fields of vmcs12 to reflect changes that 4534 * occurred while L2 was running. (The "IA-32e mode guest" bit of the 4535 * VM-entry controls is also updated, since this is really a guest 4536 * state bit.) 
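 *
 * The rarely-read guest fields (segments, descriptor tables, etc.) are only
 * copied immediately when an enlightened VMCS is in use; otherwise the copy
 * is deferred via need_sync_vmcs02_to_vmcs12_rare and performed on demand by
 * copy_vmcs02_to_vmcs12_rare(), which temporarily reloads vmcs02.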
4537 */ sync_vmcs02_to_vmcs12(struct kvm_vcpu * vcpu,struct vmcs12 * vmcs12)4538 static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 4539 { 4540 struct vcpu_vmx *vmx = to_vmx(vcpu); 4541 4542 if (nested_vmx_is_evmptr12_valid(vmx)) 4543 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 4544 4545 vmx->nested.need_sync_vmcs02_to_vmcs12_rare = 4546 !nested_vmx_is_evmptr12_valid(vmx); 4547 4548 vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); 4549 vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); 4550 4551 vmcs12->guest_rsp = kvm_rsp_read(vcpu); 4552 vmcs12->guest_rip = kvm_rip_read(vcpu); 4553 vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS); 4554 4555 vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES); 4556 vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES); 4557 4558 vmcs12->guest_interruptibility_info = 4559 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); 4560 4561 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) 4562 vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT; 4563 else if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) 4564 vmcs12->guest_activity_state = GUEST_ACTIVITY_WAIT_SIPI; 4565 else 4566 vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE; 4567 4568 if (nested_cpu_has_preemption_timer(vmcs12) && 4569 vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER && 4570 !vmx->nested.nested_run_pending) 4571 vmcs12->vmx_preemption_timer_value = 4572 vmx_get_preemption_timer_value(vcpu); 4573 4574 /* 4575 * In some cases (usually, nested EPT), L2 is allowed to change its 4576 * own CR3 without exiting. If it has changed it, we must keep it. 4577 * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined 4578 * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12. 4579 * 4580 * Additionally, restore L2's PDPTR to vmcs12. 4581 */ 4582 if (enable_ept) { 4583 vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3); 4584 if (nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { 4585 vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0); 4586 vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1); 4587 vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2); 4588 vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); 4589 } 4590 } 4591 4592 vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS); 4593 4594 if (nested_cpu_has_vid(vmcs12)) 4595 vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS); 4596 4597 vmcs12->vm_entry_controls = 4598 (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | 4599 (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); 4600 4601 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) 4602 vmcs12->guest_dr7 = vcpu->arch.dr7; 4603 4604 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) 4605 vmcs12->guest_ia32_efer = vcpu->arch.efer; 4606 } 4607 4608 /* 4609 * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits 4610 * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), 4611 * and this function updates it to reflect the changes to the guest state while 4612 * L2 was running (and perhaps made some exits which were handled directly by L0 4613 * without going back to L1), and to reflect the exit reason. 4614 * Note that we do not have to copy here all VMCS fields, just those that 4615 * could have changed by the L2 guest or the exit - i.e., the guest-state and 4616 * exit-information fields only. Other fields are modified by L1 with VMWRITE, 4617 * which already writes to vmcs12 directly. 
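 *
 * Also note the asymmetry for failed VM-Entries: when the exit reason has the
 * "failed VM-Entry" bit set, only the exit reason and exit qualification are
 * updated and launch_state is left untouched, mirroring hardware behavior.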
 */
static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
			   u32 vm_exit_reason, u32 exit_intr_info,
			   unsigned long exit_qualification, u32 exit_insn_len)
{
	/* update exit information fields: */
	vmcs12->vm_exit_reason = vm_exit_reason;
	if (to_vmx(vcpu)->exit_reason.enclave_mode)
		vmcs12->vm_exit_reason |= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE;
	vmcs12->exit_qualification = exit_qualification;

	/*
	 * On VM-Exit due to a failed VM-Entry, the VMCS isn't marked launched
	 * and only EXIT_REASON and EXIT_QUALIFICATION are updated, all other
	 * exit info fields are unmodified.
	 */
	if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) {
		vmcs12->launch_state = 1;

		/*
		 * vm_entry_intr_info_field is cleared on exit. Emulate this
		 * instead of reading the real value.
		 */
		vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK;

		/*
		 * Transfer the event that L0 or L1 may have wanted to inject
		 * into L2 to IDT_VECTORING_INFO_FIELD.
		 */
		vmcs12_save_pending_event(vcpu, vmcs12,
					  vm_exit_reason, exit_intr_info);

		vmcs12->vm_exit_intr_info = exit_intr_info;
		vmcs12->vm_exit_instruction_len = exit_insn_len;
		vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);

		/*
		 * According to spec, there's no need to store the guest's
		 * MSRs if the exit is due to a VM-entry failure that occurs
		 * during or after loading the guest state. Since this exit
		 * does not fall in that category, we need to save the MSRs.
		 */
		if (nested_vmx_store_msr(vcpu,
					 vmcs12->vm_exit_msr_store_addr,
					 vmcs12->vm_exit_msr_store_count))
			nested_vmx_abort(vcpu,
					 VMX_ABORT_SAVE_GUEST_MSR_FAIL);
	}
}

/*
 * A part of what we need to do when the nested L2 guest exits and we want to
 * run its L1 parent, is to reset L1's guest state to the host state specified
 * in vmcs12.
 * This function is to be called not only on normal nested exit, but also on
 * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry
 * Failures During or After Loading Guest State").
 * This function should be called when the active VMCS is L1's (vmcs01).
 */
static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
				   struct vmcs12 *vmcs12)
{
	enum vm_entry_failure_code ignored;
	struct kvm_segment seg;

	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
		vcpu->arch.efer = vmcs12->host_ia32_efer;
	else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
		vcpu->arch.efer |= (EFER_LMA | EFER_LME);
	else
		vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
	vmx_set_efer(vcpu, vcpu->arch.efer);

	kvm_rsp_write(vcpu, vmcs12->host_rsp);
	kvm_rip_write(vcpu, vmcs12->host_rip);
	vmx_set_rflags(vcpu, X86_EFLAGS_FIXED);
	vmx_set_interrupt_shadow(vcpu, 0);

	/*
	 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't
	 * actually changed, because vmx_set_cr0 refers to efer set above.
4697 * 4698 * CR0_GUEST_HOST_MASK is already set in the original vmcs01 4699 * (KVM doesn't change it); 4700 */ 4701 vcpu->arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits(); 4702 vmx_set_cr0(vcpu, vmcs12->host_cr0); 4703 4704 /* Same as above - no reason to call set_cr4_guest_host_mask(). */ 4705 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); 4706 vmx_set_cr4(vcpu, vmcs12->host_cr4); 4707 4708 nested_ept_uninit_mmu_context(vcpu); 4709 4710 /* 4711 * Only PDPTE load can fail as the value of cr3 was checked on entry and 4712 * couldn't have changed. 4713 */ 4714 if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, true, &ignored)) 4715 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL); 4716 4717 nested_vmx_transition_tlb_flush(vcpu, vmcs12, false); 4718 4719 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs); 4720 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp); 4721 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); 4722 vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); 4723 vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); 4724 vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF); 4725 vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF); 4726 4727 /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */ 4728 if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS) 4729 vmcs_write64(GUEST_BNDCFGS, 0); 4730 4731 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) { 4732 vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); 4733 vcpu->arch.pat = vmcs12->host_ia32_pat; 4734 } 4735 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) && 4736 kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu))) 4737 WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, 4738 vmcs12->host_ia32_perf_global_ctrl)); 4739 4740 /* Set L1 segment info according to Intel SDM 4741 27.5.2 Loading Host Segment and Descriptor-Table Registers */ 4742 seg = (struct kvm_segment) { 4743 .base = 0, 4744 .limit = 0xFFFFFFFF, 4745 .selector = vmcs12->host_cs_selector, 4746 .type = 11, 4747 .present = 1, 4748 .s = 1, 4749 .g = 1 4750 }; 4751 if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) 4752 seg.l = 1; 4753 else 4754 seg.db = 1; 4755 __vmx_set_segment(vcpu, &seg, VCPU_SREG_CS); 4756 seg = (struct kvm_segment) { 4757 .base = 0, 4758 .limit = 0xFFFFFFFF, 4759 .type = 3, 4760 .present = 1, 4761 .s = 1, 4762 .db = 1, 4763 .g = 1 4764 }; 4765 seg.selector = vmcs12->host_ds_selector; 4766 __vmx_set_segment(vcpu, &seg, VCPU_SREG_DS); 4767 seg.selector = vmcs12->host_es_selector; 4768 __vmx_set_segment(vcpu, &seg, VCPU_SREG_ES); 4769 seg.selector = vmcs12->host_ss_selector; 4770 __vmx_set_segment(vcpu, &seg, VCPU_SREG_SS); 4771 seg.selector = vmcs12->host_fs_selector; 4772 seg.base = vmcs12->host_fs_base; 4773 __vmx_set_segment(vcpu, &seg, VCPU_SREG_FS); 4774 seg.selector = vmcs12->host_gs_selector; 4775 seg.base = vmcs12->host_gs_base; 4776 __vmx_set_segment(vcpu, &seg, VCPU_SREG_GS); 4777 seg = (struct kvm_segment) { 4778 .base = vmcs12->host_tr_base, 4779 .limit = 0x67, 4780 .selector = vmcs12->host_tr_selector, 4781 .type = 11, 4782 .present = 1 4783 }; 4784 __vmx_set_segment(vcpu, &seg, VCPU_SREG_TR); 4785 4786 memset(&seg, 0, sizeof(seg)); 4787 seg.unusable = 1; 4788 __vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR); 4789 4790 kvm_set_dr(vcpu, 7, 0x400); 4791 vmcs_write64(GUEST_IA32_DEBUGCTL, 0); 4792 4793 if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, 4794 vmcs12->vm_exit_msr_load_count)) 4795 nested_vmx_abort(vcpu, 
VMX_ABORT_LOAD_HOST_MSR_FAIL); 4796 4797 to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu); 4798 } 4799 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx * vmx)4800 static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx) 4801 { 4802 struct vmx_uret_msr *efer_msr; 4803 unsigned int i; 4804 4805 if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER) 4806 return vmcs_read64(GUEST_IA32_EFER); 4807 4808 if (cpu_has_load_ia32_efer()) 4809 return kvm_host.efer; 4810 4811 for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) { 4812 if (vmx->msr_autoload.guest.val[i].index == MSR_EFER) 4813 return vmx->msr_autoload.guest.val[i].value; 4814 } 4815 4816 efer_msr = vmx_find_uret_msr(vmx, MSR_EFER); 4817 if (efer_msr) 4818 return efer_msr->data; 4819 4820 return kvm_host.efer; 4821 } 4822 nested_vmx_restore_host_state(struct kvm_vcpu * vcpu)4823 static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) 4824 { 4825 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 4826 struct vcpu_vmx *vmx = to_vmx(vcpu); 4827 struct vmx_msr_entry g, h; 4828 gpa_t gpa; 4829 u32 i, j; 4830 4831 vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT); 4832 4833 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) { 4834 /* 4835 * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set 4836 * as vmcs01.GUEST_DR7 contains a userspace defined value 4837 * and vcpu->arch.dr7 is not squirreled away before the 4838 * nested VMENTER (not worth adding a variable in nested_vmx). 4839 */ 4840 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) 4841 kvm_set_dr(vcpu, 7, DR7_FIXED_1); 4842 else 4843 WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7))); 4844 } 4845 4846 /* 4847 * Note that calling vmx_set_{efer,cr0,cr4} is important as they 4848 * handle a variety of side effects to KVM's software model. 4849 */ 4850 vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx)); 4851 4852 vcpu->arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits(); 4853 vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW)); 4854 4855 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); 4856 vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW)); 4857 4858 nested_ept_uninit_mmu_context(vcpu); 4859 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); 4860 kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); 4861 4862 /* 4863 * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs 4864 * from vmcs01 (if necessary). The PDPTRs are not loaded on 4865 * VMFail, like everything else we just need to ensure our 4866 * software model is up-to-date. 4867 */ 4868 if (enable_ept && is_pae_paging(vcpu)) 4869 ept_save_pdptrs(vcpu); 4870 4871 kvm_mmu_reset_context(vcpu); 4872 4873 /* 4874 * This nasty bit of open coding is a compromise between blindly 4875 * loading L1's MSRs using the exit load lists (incorrect emulation 4876 * of VMFail), leaving the nested VM's MSRs in the software model 4877 * (incorrect behavior) and snapshotting the modified MSRs (too 4878 * expensive since the lists are unbound by hardware). For each 4879 * MSR that was (prematurely) loaded from the nested VMEntry load 4880 * list, reload it from the exit load list if it exists and differs 4881 * from the guest value. The intent is to stuff host state as 4882 * silently as possible, not to fully process the exit load list. 
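 * Concretely: for each MSR in the VM-entry load list, look for a matching
 * index in the VM-exit load list and WRMSR the exit-list value only if it
 * differs from the value that was loaded on entry.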
4883 */ 4884 for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) { 4885 gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g)); 4886 if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) { 4887 pr_debug_ratelimited( 4888 "%s read MSR index failed (%u, 0x%08llx)\n", 4889 __func__, i, gpa); 4890 goto vmabort; 4891 } 4892 4893 for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) { 4894 gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h)); 4895 if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) { 4896 pr_debug_ratelimited( 4897 "%s read MSR failed (%u, 0x%08llx)\n", 4898 __func__, j, gpa); 4899 goto vmabort; 4900 } 4901 if (h.index != g.index) 4902 continue; 4903 if (h.value == g.value) 4904 break; 4905 4906 if (nested_vmx_load_msr_check(vcpu, &h)) { 4907 pr_debug_ratelimited( 4908 "%s check failed (%u, 0x%x, 0x%x)\n", 4909 __func__, j, h.index, h.reserved); 4910 goto vmabort; 4911 } 4912 4913 if (kvm_set_msr_with_filter(vcpu, h.index, h.value)) { 4914 pr_debug_ratelimited( 4915 "%s WRMSR failed (%u, 0x%x, 0x%llx)\n", 4916 __func__, j, h.index, h.value); 4917 goto vmabort; 4918 } 4919 } 4920 } 4921 4922 return; 4923 4924 vmabort: 4925 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); 4926 } 4927 4928 /* 4929 * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1 4930 * and modify vmcs12 to make it see what it would expect to see there if 4931 * L2 was its real guest. Must only be called when in L2 (is_guest_mode()) 4932 */ __nested_vmx_vmexit(struct kvm_vcpu * vcpu,u32 vm_exit_reason,u32 exit_intr_info,unsigned long exit_qualification,u32 exit_insn_len)4933 void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, 4934 u32 exit_intr_info, unsigned long exit_qualification, 4935 u32 exit_insn_len) 4936 { 4937 struct vcpu_vmx *vmx = to_vmx(vcpu); 4938 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 4939 4940 /* Pending MTF traps are discarded on VM-Exit. */ 4941 vmx->nested.mtf_pending = false; 4942 4943 /* trying to cancel vmlaunch/vmresume is a bug */ 4944 WARN_ON_ONCE(vmx->nested.nested_run_pending); 4945 4946 #ifdef CONFIG_KVM_HYPERV 4947 if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) { 4948 /* 4949 * KVM_REQ_GET_NESTED_STATE_PAGES is also used to map 4950 * Enlightened VMCS after migration and we still need to 4951 * do that when something is forcing L2->L1 exit prior to 4952 * the first L2 run. 4953 */ 4954 (void)nested_get_evmcs_page(vcpu); 4955 } 4956 #endif 4957 4958 /* Service pending TLB flush requests for L2 before switching to L1. */ 4959 kvm_service_local_tlb_flush_requests(vcpu); 4960 4961 /* 4962 * VCPU_EXREG_PDPTR will be clobbered in arch/x86/kvm/vmx/vmx.h between 4963 * now and the new vmentry. Ensure that the VMCS02 PDPTR fields are 4964 * up-to-date before switching to L1. 
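 * This only matters when EPT is enabled and L2 was using PAE paging, hence
 * the check below.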
4965 */ 4966 if (enable_ept && is_pae_paging(vcpu)) 4967 vmx_ept_load_pdptrs(vcpu); 4968 4969 leave_guest_mode(vcpu); 4970 4971 if (nested_cpu_has_preemption_timer(vmcs12)) 4972 hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); 4973 4974 if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING)) { 4975 vcpu->arch.tsc_offset = vcpu->arch.l1_tsc_offset; 4976 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING)) 4977 vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio; 4978 } 4979 4980 if (likely(!vmx->fail)) { 4981 sync_vmcs02_to_vmcs12(vcpu, vmcs12); 4982 4983 if (vm_exit_reason != -1) 4984 prepare_vmcs12(vcpu, vmcs12, vm_exit_reason, 4985 exit_intr_info, exit_qualification, 4986 exit_insn_len); 4987 4988 /* 4989 * Must happen outside of sync_vmcs02_to_vmcs12() as it will 4990 * also be used to capture vmcs12 cache as part of 4991 * capturing nVMX state for snapshot (migration). 4992 * 4993 * Otherwise, this flush will dirty guest memory at a 4994 * point it is already assumed by user-space to be 4995 * immutable. 4996 */ 4997 nested_flush_cached_shadow_vmcs12(vcpu, vmcs12); 4998 } else { 4999 /* 5000 * The only expected VM-instruction error is "VM entry with 5001 * invalid control field(s)." Anything else indicates a 5002 * problem with L0. And we should never get here with a 5003 * VMFail of any type if early consistency checks are enabled. 5004 */ 5005 WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) != 5006 VMXERR_ENTRY_INVALID_CONTROL_FIELD); 5007 WARN_ON_ONCE(nested_early_check); 5008 } 5009 5010 /* 5011 * Drop events/exceptions that were queued for re-injection to L2 5012 * (picked up via vmx_complete_interrupts()), as well as exceptions 5013 * that were pending for L2. Note, this must NOT be hoisted above 5014 * prepare_vmcs12(), events/exceptions queued for re-injection need to 5015 * be captured in vmcs12 (see vmcs12_save_pending_event()). 5016 */ 5017 vcpu->arch.nmi_injected = false; 5018 kvm_clear_exception_queue(vcpu); 5019 kvm_clear_interrupt_queue(vcpu); 5020 5021 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 5022 5023 /* 5024 * If IBRS is advertised to the vCPU, KVM must flush the indirect 5025 * branch predictors when transitioning from L2 to L1, as L1 expects 5026 * hardware (KVM in this case) to provide separate predictor modes. 5027 * Bare metal isolates VMX root (host) from VMX non-root (guest), but 5028 * doesn't isolate different VMCSs, i.e. in this case, doesn't provide 5029 * separate modes for L2 vs L1. 
5030 */ 5031 if (guest_cpu_cap_has(vcpu, X86_FEATURE_SPEC_CTRL)) 5032 indirect_branch_prediction_barrier(); 5033 5034 /* Update any VMCS fields that might have changed while L2 ran */ 5035 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 5036 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 5037 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); 5038 if (kvm_caps.has_tsc_control) 5039 vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); 5040 5041 if (vmx->nested.l1_tpr_threshold != -1) 5042 vmcs_write32(TPR_THRESHOLD, vmx->nested.l1_tpr_threshold); 5043 5044 if (vmx->nested.change_vmcs01_virtual_apic_mode) { 5045 vmx->nested.change_vmcs01_virtual_apic_mode = false; 5046 vmx_set_virtual_apic_mode(vcpu); 5047 } 5048 5049 if (vmx->nested.update_vmcs01_cpu_dirty_logging) { 5050 vmx->nested.update_vmcs01_cpu_dirty_logging = false; 5051 vmx_update_cpu_dirty_logging(vcpu); 5052 } 5053 5054 nested_put_vmcs12_pages(vcpu); 5055 5056 if (vmx->nested.reload_vmcs01_apic_access_page) { 5057 vmx->nested.reload_vmcs01_apic_access_page = false; 5058 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); 5059 } 5060 5061 if (vmx->nested.update_vmcs01_apicv_status) { 5062 vmx->nested.update_vmcs01_apicv_status = false; 5063 kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu); 5064 } 5065 5066 if (vmx->nested.update_vmcs01_hwapic_isr) { 5067 vmx->nested.update_vmcs01_hwapic_isr = false; 5068 kvm_apic_update_hwapic_isr(vcpu); 5069 } 5070 5071 if ((vm_exit_reason != -1) && 5072 (enable_shadow_vmcs || nested_vmx_is_evmptr12_valid(vmx))) 5073 vmx->nested.need_vmcs12_to_shadow_sync = true; 5074 5075 /* in case we halted in L2 */ 5076 kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); 5077 5078 if (likely(!vmx->fail)) { 5079 if (vm_exit_reason != -1) 5080 trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, 5081 vmcs12->exit_qualification, 5082 vmcs12->idt_vectoring_info_field, 5083 vmcs12->vm_exit_intr_info, 5084 vmcs12->vm_exit_intr_error_code, 5085 KVM_ISA_VMX); 5086 5087 load_vmcs12_host_state(vcpu, vmcs12); 5088 5089 /* 5090 * Process events if an injectable IRQ or NMI is pending, even 5091 * if the event is blocked (RFLAGS.IF is cleared on VM-Exit). 5092 * If an event became pending while L2 was active, KVM needs to 5093 * either inject the event or request an IRQ/NMI window. SMIs 5094 * don't need to be processed as SMM is mutually exclusive with 5095 * non-root mode. INIT/SIPI don't need to be checked as INIT 5096 * is blocked post-VMXON, and SIPIs are ignored. 5097 */ 5098 if (kvm_cpu_has_injectable_intr(vcpu) || vcpu->arch.nmi_pending) 5099 kvm_make_request(KVM_REQ_EVENT, vcpu); 5100 return; 5101 } 5102 5103 /* 5104 * After an early L2 VM-entry failure, we're now back 5105 * in L1 which thinks it just finished a VMLAUNCH or 5106 * VMRESUME instruction, so we need to set the failure 5107 * flag and the VM-instruction error field of the VMCS 5108 * accordingly, and skip the emulated instruction. 5109 */ 5110 (void)nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 5111 5112 /* 5113 * Restore L1's host state to KVM's software model. We're here 5114 * because a consistency check was caught by hardware, which 5115 * means some amount of guest state has been propagated to KVM's 5116 * model and needs to be unwound to the host's state. 
5117 */ 5118 nested_vmx_restore_host_state(vcpu); 5119 5120 vmx->fail = 0; 5121 } 5122 nested_vmx_triple_fault(struct kvm_vcpu * vcpu)5123 static void nested_vmx_triple_fault(struct kvm_vcpu *vcpu) 5124 { 5125 kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu); 5126 nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0); 5127 } 5128 5129 /* 5130 * Decode the memory-address operand of a vmx instruction, as recorded on an 5131 * exit caused by such an instruction (run by a guest hypervisor). 5132 * On success, returns 0. When the operand is invalid, returns 1 and throws 5133 * #UD, #GP, or #SS. 5134 */ get_vmx_mem_address(struct kvm_vcpu * vcpu,unsigned long exit_qualification,u32 vmx_instruction_info,bool wr,int len,gva_t * ret)5135 int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, 5136 u32 vmx_instruction_info, bool wr, int len, gva_t *ret) 5137 { 5138 gva_t off; 5139 bool exn; 5140 struct kvm_segment s; 5141 5142 /* 5143 * According to Vol. 3B, "Information for VM Exits Due to Instruction 5144 * Execution", on an exit, vmx_instruction_info holds most of the 5145 * addressing components of the operand. Only the displacement part 5146 * is put in exit_qualification (see 3B, "Basic VM-Exit Information"). 5147 * For how an actual address is calculated from all these components, 5148 * refer to Vol. 1, "Operand Addressing". 5149 */ 5150 int scaling = vmx_instruction_info & 3; 5151 int addr_size = (vmx_instruction_info >> 7) & 7; 5152 bool is_reg = vmx_instruction_info & (1u << 10); 5153 int seg_reg = (vmx_instruction_info >> 15) & 7; 5154 int index_reg = (vmx_instruction_info >> 18) & 0xf; 5155 bool index_is_valid = !(vmx_instruction_info & (1u << 22)); 5156 int base_reg = (vmx_instruction_info >> 23) & 0xf; 5157 bool base_is_valid = !(vmx_instruction_info & (1u << 27)); 5158 5159 if (is_reg) { 5160 kvm_queue_exception(vcpu, UD_VECTOR); 5161 return 1; 5162 } 5163 5164 /* Addr = segment_base + offset */ 5165 /* offset = base + [index * scale] + displacement */ 5166 off = exit_qualification; /* holds the displacement */ 5167 if (addr_size == 1) 5168 off = (gva_t)sign_extend64(off, 31); 5169 else if (addr_size == 0) 5170 off = (gva_t)sign_extend64(off, 15); 5171 if (base_is_valid) 5172 off += kvm_register_read(vcpu, base_reg); 5173 if (index_is_valid) 5174 off += kvm_register_read(vcpu, index_reg) << scaling; 5175 vmx_get_segment(vcpu, &s, seg_reg); 5176 5177 /* 5178 * The effective address, i.e. @off, of a memory operand is truncated 5179 * based on the address size of the instruction. Note that this is 5180 * the *effective address*, i.e. the address prior to accounting for 5181 * the segment's base. 5182 */ 5183 if (addr_size == 1) /* 32 bit */ 5184 off &= 0xffffffff; 5185 else if (addr_size == 0) /* 16 bit */ 5186 off &= 0xffff; 5187 5188 /* Checks for #GP/#SS exceptions. */ 5189 exn = false; 5190 if (is_long_mode(vcpu)) { 5191 /* 5192 * The virtual/linear address is never truncated in 64-bit 5193 * mode, e.g. a 32-bit address size can yield a 64-bit virtual 5194 * address when using FS/GS with a non-zero base. 5195 */ 5196 if (seg_reg == VCPU_SREG_FS || seg_reg == VCPU_SREG_GS) 5197 *ret = s.base + off; 5198 else 5199 *ret = off; 5200 5201 *ret = vmx_get_untagged_addr(vcpu, *ret, 0); 5202 /* Long mode: #GP(0)/#SS(0) if the memory address is in a 5203 * non-canonical form. This is the only check on the memory 5204 * destination for long mode! 
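 * (Only the FS and GS bases were applied above; in 64-bit mode the bases of
 * the other segment registers are architecturally treated as zero.)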
5205 */ 5206 exn = is_noncanonical_address(*ret, vcpu, 0); 5207 } else { 5208 /* 5209 * When not in long mode, the virtual/linear address is 5210 * unconditionally truncated to 32 bits regardless of the 5211 * address size. 5212 */ 5213 *ret = (s.base + off) & 0xffffffff; 5214 5215 /* Protected mode: apply checks for segment validity in the 5216 * following order: 5217 * - segment type check (#GP(0) may be thrown) 5218 * - usability check (#GP(0)/#SS(0)) 5219 * - limit check (#GP(0)/#SS(0)) 5220 */ 5221 if (wr) 5222 /* #GP(0) if the destination operand is located in a 5223 * read-only data segment or any code segment. 5224 */ 5225 exn = ((s.type & 0xa) == 0 || (s.type & 8)); 5226 else 5227 /* #GP(0) if the source operand is located in an 5228 * execute-only code segment 5229 */ 5230 exn = ((s.type & 0xa) == 8); 5231 if (exn) { 5232 kvm_queue_exception_e(vcpu, GP_VECTOR, 0); 5233 return 1; 5234 } 5235 /* Protected mode: #GP(0)/#SS(0) if the segment is unusable. 5236 */ 5237 exn = (s.unusable != 0); 5238 5239 /* 5240 * Protected mode: #GP(0)/#SS(0) if the memory operand is 5241 * outside the segment limit. All CPUs that support VMX ignore 5242 * limit checks for flat segments, i.e. segments with base==0, 5243 * limit==0xffffffff and of type expand-up data or code. 5244 */ 5245 if (!(s.base == 0 && s.limit == 0xffffffff && 5246 ((s.type & 8) || !(s.type & 4)))) 5247 exn = exn || ((u64)off + len - 1 > s.limit); 5248 } 5249 if (exn) { 5250 kvm_queue_exception_e(vcpu, 5251 seg_reg == VCPU_SREG_SS ? 5252 SS_VECTOR : GP_VECTOR, 5253 0); 5254 return 1; 5255 } 5256 5257 return 0; 5258 } 5259 nested_vmx_get_vmptr(struct kvm_vcpu * vcpu,gpa_t * vmpointer,int * ret)5260 static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer, 5261 int *ret) 5262 { 5263 gva_t gva; 5264 struct x86_exception e; 5265 int r; 5266 5267 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 5268 vmcs_read32(VMX_INSTRUCTION_INFO), false, 5269 sizeof(*vmpointer), &gva)) { 5270 *ret = 1; 5271 return -EINVAL; 5272 } 5273 5274 r = kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e); 5275 if (r != X86EMUL_CONTINUE) { 5276 *ret = kvm_handle_memory_failure(vcpu, r, &e); 5277 return -EINVAL; 5278 } 5279 5280 return 0; 5281 } 5282 5283 /* 5284 * Allocate a shadow VMCS and associate it with the currently loaded 5285 * VMCS, unless such a shadow VMCS already exists. The newly allocated 5286 * VMCS is also VMCLEARed, so that it is ready for use. 5287 */ alloc_shadow_vmcs(struct kvm_vcpu * vcpu)5288 static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu) 5289 { 5290 struct vcpu_vmx *vmx = to_vmx(vcpu); 5291 struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs; 5292 5293 /* 5294 * KVM allocates a shadow VMCS only when L1 executes VMXON and frees it 5295 * when L1 executes VMXOFF or the vCPU is forced out of nested 5296 * operation. VMXON faults if the CPU is already post-VMXON, so it 5297 * should be impossible to already have an allocated shadow VMCS. KVM 5298 * doesn't support virtualization of VMCS shadowing, so vmcs01 should 5299 * always be the loaded VMCS. 
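 * Thus, barring the WARN below, a NULL return means the allocation itself
 * failed; the caller (enter_vmx_operation()) then unwinds and returns -ENOMEM.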
5300 */ 5301 if (WARN_ON(loaded_vmcs != &vmx->vmcs01 || loaded_vmcs->shadow_vmcs)) 5302 return loaded_vmcs->shadow_vmcs; 5303 5304 loaded_vmcs->shadow_vmcs = alloc_vmcs(true); 5305 if (loaded_vmcs->shadow_vmcs) 5306 vmcs_clear(loaded_vmcs->shadow_vmcs); 5307 5308 return loaded_vmcs->shadow_vmcs; 5309 } 5310 enter_vmx_operation(struct kvm_vcpu * vcpu)5311 static int enter_vmx_operation(struct kvm_vcpu *vcpu) 5312 { 5313 struct vcpu_vmx *vmx = to_vmx(vcpu); 5314 int r; 5315 5316 r = alloc_loaded_vmcs(&vmx->nested.vmcs02); 5317 if (r < 0) 5318 goto out_vmcs02; 5319 5320 vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); 5321 if (!vmx->nested.cached_vmcs12) 5322 goto out_cached_vmcs12; 5323 5324 vmx->nested.shadow_vmcs12_cache.gpa = INVALID_GPA; 5325 vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); 5326 if (!vmx->nested.cached_shadow_vmcs12) 5327 goto out_cached_shadow_vmcs12; 5328 5329 if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu)) 5330 goto out_shadow_vmcs; 5331 5332 hrtimer_setup(&vmx->nested.preemption_timer, vmx_preemption_timer_fn, CLOCK_MONOTONIC, 5333 HRTIMER_MODE_ABS_PINNED); 5334 5335 vmx->nested.vpid02 = allocate_vpid(); 5336 5337 vmx->nested.vmcs02_initialized = false; 5338 vmx->nested.vmxon = true; 5339 5340 if (vmx_pt_mode_is_host_guest()) { 5341 vmx->pt_desc.guest.ctl = 0; 5342 pt_update_intercept_for_msr(vcpu); 5343 } 5344 5345 return 0; 5346 5347 out_shadow_vmcs: 5348 kfree(vmx->nested.cached_shadow_vmcs12); 5349 5350 out_cached_shadow_vmcs12: 5351 kfree(vmx->nested.cached_vmcs12); 5352 5353 out_cached_vmcs12: 5354 free_loaded_vmcs(&vmx->nested.vmcs02); 5355 5356 out_vmcs02: 5357 return -ENOMEM; 5358 } 5359 5360 /* Emulate the VMXON instruction. */ handle_vmxon(struct kvm_vcpu * vcpu)5361 static int handle_vmxon(struct kvm_vcpu *vcpu) 5362 { 5363 int ret; 5364 gpa_t vmptr; 5365 uint32_t revision; 5366 struct vcpu_vmx *vmx = to_vmx(vcpu); 5367 const u64 VMXON_NEEDED_FEATURES = FEAT_CTL_LOCKED 5368 | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; 5369 5370 /* 5371 * Manually check CR4.VMXE checks, KVM must force CR4.VMXE=1 to enter 5372 * the guest and so cannot rely on hardware to perform the check, 5373 * which has higher priority than VM-Exit (see Intel SDM's pseudocode 5374 * for VMXON). 5375 * 5376 * Rely on hardware for the other pre-VM-Exit checks, CR0.PE=1, !VM86 5377 * and !COMPATIBILITY modes. For an unrestricted guest, KVM doesn't 5378 * force any of the relevant guest state. For a restricted guest, KVM 5379 * does force CR0.PE=1, but only to also force VM86 in order to emulate 5380 * Real Mode, and so there's no need to check CR0.PE manually. 5381 */ 5382 if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_VMXE)) { 5383 kvm_queue_exception(vcpu, UD_VECTOR); 5384 return 1; 5385 } 5386 5387 /* 5388 * The CPL is checked for "not in VMX operation" and for "in VMX root", 5389 * and has higher priority than the VM-Fail due to being post-VMXON, 5390 * i.e. VMXON #GPs outside of VMX non-root if CPL!=0. In VMX non-root, 5391 * VMXON causes VM-Exit and KVM unconditionally forwards VMXON VM-Exits 5392 * from L2 to L1, i.e. there's no need to check for the vCPU being in 5393 * VMX non-root. 5394 * 5395 * Forwarding the VM-Exit unconditionally, i.e. without performing the 5396 * #UD checks (see above), is functionally ok because KVM doesn't allow 5397 * L1 to run L2 without CR4.VMXE=0, and because KVM never modifies L2's 5398 * CR0 or CR4, i.e. it's L2's responsibility to emulate #UDs that are 5399 * missed by hardware due to shadowing CR0 and/or CR4. 
5400 */ 5401 if (vmx_get_cpl(vcpu)) { 5402 kvm_inject_gp(vcpu, 0); 5403 return 1; 5404 } 5405 5406 if (vmx->nested.vmxon) 5407 return nested_vmx_fail(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); 5408 5409 /* 5410 * Invalid CR0/CR4 generates #GP. These checks are performed if and 5411 * only if the vCPU isn't already in VMX operation, i.e. effectively 5412 * have lower priority than the VM-Fail above. 5413 */ 5414 if (!nested_host_cr0_valid(vcpu, kvm_read_cr0(vcpu)) || 5415 !nested_host_cr4_valid(vcpu, kvm_read_cr4(vcpu))) { 5416 kvm_inject_gp(vcpu, 0); 5417 return 1; 5418 } 5419 5420 if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES) 5421 != VMXON_NEEDED_FEATURES) { 5422 kvm_inject_gp(vcpu, 0); 5423 return 1; 5424 } 5425 5426 if (nested_vmx_get_vmptr(vcpu, &vmptr, &ret)) 5427 return ret; 5428 5429 /* 5430 * SDM 3: 24.11.5 5431 * The first 4 bytes of VMXON region contain the supported 5432 * VMCS revision identifier 5433 * 5434 * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case; 5435 * which replaces physical address width with 32 5436 */ 5437 if (!page_address_valid(vcpu, vmptr)) 5438 return nested_vmx_failInvalid(vcpu); 5439 5440 if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) || 5441 revision != VMCS12_REVISION) 5442 return nested_vmx_failInvalid(vcpu); 5443 5444 vmx->nested.vmxon_ptr = vmptr; 5445 ret = enter_vmx_operation(vcpu); 5446 if (ret) 5447 return ret; 5448 5449 return nested_vmx_succeed(vcpu); 5450 } 5451 nested_release_vmcs12(struct kvm_vcpu * vcpu)5452 static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu) 5453 { 5454 struct vcpu_vmx *vmx = to_vmx(vcpu); 5455 5456 if (vmx->nested.current_vmptr == INVALID_GPA) 5457 return; 5458 5459 copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu)); 5460 5461 if (enable_shadow_vmcs) { 5462 /* copy to memory all shadowed fields in case 5463 they were modified */ 5464 copy_shadow_to_vmcs12(vmx); 5465 vmx_disable_shadow_vmcs(vmx); 5466 } 5467 vmx->nested.posted_intr_nv = -1; 5468 5469 /* Flush VMCS12 to guest memory */ 5470 kvm_vcpu_write_guest_page(vcpu, 5471 vmx->nested.current_vmptr >> PAGE_SHIFT, 5472 vmx->nested.cached_vmcs12, 0, VMCS12_SIZE); 5473 5474 kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); 5475 5476 vmx->nested.current_vmptr = INVALID_GPA; 5477 } 5478 5479 /* Emulate the VMXOFF instruction */ handle_vmxoff(struct kvm_vcpu * vcpu)5480 static int handle_vmxoff(struct kvm_vcpu *vcpu) 5481 { 5482 if (!nested_vmx_check_permission(vcpu)) 5483 return 1; 5484 5485 free_nested(vcpu); 5486 5487 if (kvm_apic_has_pending_init_or_sipi(vcpu)) 5488 kvm_make_request(KVM_REQ_EVENT, vcpu); 5489 5490 return nested_vmx_succeed(vcpu); 5491 } 5492 5493 /* Emulate the VMCLEAR instruction */ handle_vmclear(struct kvm_vcpu * vcpu)5494 static int handle_vmclear(struct kvm_vcpu *vcpu) 5495 { 5496 struct vcpu_vmx *vmx = to_vmx(vcpu); 5497 u32 zero = 0; 5498 gpa_t vmptr; 5499 int r; 5500 5501 if (!nested_vmx_check_permission(vcpu)) 5502 return 1; 5503 5504 if (nested_vmx_get_vmptr(vcpu, &vmptr, &r)) 5505 return r; 5506 5507 if (!page_address_valid(vcpu, vmptr)) 5508 return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS); 5509 5510 if (vmptr == vmx->nested.vmxon_ptr) 5511 return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_VMXON_POINTER); 5512 5513 if (likely(!nested_evmcs_handle_vmclear(vcpu, vmptr))) { 5514 if (vmptr == vmx->nested.current_vmptr) 5515 nested_release_vmcs12(vcpu); 5516 5517 /* 5518 * Silently ignore memory errors on VMCLEAR, Intel's pseudocode 5519 * for VMCLEAR includes a 
"ensure that data for VMCS referenced 5520 * by the operand is in memory" clause that guards writes to 5521 * memory, i.e. doing nothing for I/O is architecturally valid. 5522 * 5523 * FIXME: Suppress failures if and only if no memslot is found, 5524 * i.e. exit to userspace if __copy_to_user() fails. 5525 */ 5526 (void)kvm_vcpu_write_guest(vcpu, 5527 vmptr + offsetof(struct vmcs12, 5528 launch_state), 5529 &zero, sizeof(zero)); 5530 } 5531 5532 return nested_vmx_succeed(vcpu); 5533 } 5534 5535 /* Emulate the VMLAUNCH instruction */ handle_vmlaunch(struct kvm_vcpu * vcpu)5536 static int handle_vmlaunch(struct kvm_vcpu *vcpu) 5537 { 5538 return nested_vmx_run(vcpu, true); 5539 } 5540 5541 /* Emulate the VMRESUME instruction */ handle_vmresume(struct kvm_vcpu * vcpu)5542 static int handle_vmresume(struct kvm_vcpu *vcpu) 5543 { 5544 5545 return nested_vmx_run(vcpu, false); 5546 } 5547 handle_vmread(struct kvm_vcpu * vcpu)5548 static int handle_vmread(struct kvm_vcpu *vcpu) 5549 { 5550 struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) 5551 : get_vmcs12(vcpu); 5552 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5553 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5554 struct vcpu_vmx *vmx = to_vmx(vcpu); 5555 struct x86_exception e; 5556 unsigned long field; 5557 u64 value; 5558 gva_t gva = 0; 5559 short offset; 5560 int len, r; 5561 5562 if (!nested_vmx_check_permission(vcpu)) 5563 return 1; 5564 5565 /* Decode instruction info and find the field to read */ 5566 field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf)); 5567 5568 if (!nested_vmx_is_evmptr12_valid(vmx)) { 5569 /* 5570 * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA, 5571 * any VMREAD sets the ALU flags for VMfailInvalid. 5572 */ 5573 if (vmx->nested.current_vmptr == INVALID_GPA || 5574 (is_guest_mode(vcpu) && 5575 get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA)) 5576 return nested_vmx_failInvalid(vcpu); 5577 5578 offset = get_vmcs12_field_offset(field); 5579 if (offset < 0) 5580 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 5581 5582 if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field)) 5583 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 5584 5585 /* Read the field, zero-extended to a u64 value */ 5586 value = vmcs12_read_any(vmcs12, field, offset); 5587 } else { 5588 /* 5589 * Hyper-V TLFS (as of 6.0b) explicitly states, that while an 5590 * enlightened VMCS is active VMREAD/VMWRITE instructions are 5591 * unsupported. Unfortunately, certain versions of Windows 11 5592 * don't comply with this requirement which is not enforced in 5593 * genuine Hyper-V. Allow VMREAD from an enlightened VMCS as a 5594 * workaround, as misbehaving guests will panic on VM-Fail. 5595 * Note, enlightened VMCS is incompatible with shadow VMCS so 5596 * all VMREADs from L2 should go to L1. 5597 */ 5598 if (WARN_ON_ONCE(is_guest_mode(vcpu))) 5599 return nested_vmx_failInvalid(vcpu); 5600 5601 offset = evmcs_field_offset(field, NULL); 5602 if (offset < 0) 5603 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 5604 5605 /* Read the field, zero-extended to a u64 value */ 5606 value = evmcs_read_any(nested_vmx_evmcs(vmx), field, offset); 5607 } 5608 5609 /* 5610 * Now copy part of this value to register or memory, as requested. 5611 * Note that the number of bits actually copied is 32 or 64 depending 5612 * on the guest's mode (32 or 64 bit), not on the given field's length. 
5613 */ 5614 if (instr_info & BIT(10)) { 5615 kvm_register_write(vcpu, (((instr_info) >> 3) & 0xf), value); 5616 } else { 5617 len = is_64_bit_mode(vcpu) ? 8 : 4; 5618 if (get_vmx_mem_address(vcpu, exit_qualification, 5619 instr_info, true, len, &gva)) 5620 return 1; 5621 /* _system ok, nested_vmx_check_permission has verified cpl=0 */ 5622 r = kvm_write_guest_virt_system(vcpu, gva, &value, len, &e); 5623 if (r != X86EMUL_CONTINUE) 5624 return kvm_handle_memory_failure(vcpu, r, &e); 5625 } 5626 5627 return nested_vmx_succeed(vcpu); 5628 } 5629 is_shadow_field_rw(unsigned long field)5630 static bool is_shadow_field_rw(unsigned long field) 5631 { 5632 switch (field) { 5633 #define SHADOW_FIELD_RW(x, y) case x: 5634 #include "vmcs_shadow_fields.h" 5635 return true; 5636 default: 5637 break; 5638 } 5639 return false; 5640 } 5641 is_shadow_field_ro(unsigned long field)5642 static bool is_shadow_field_ro(unsigned long field) 5643 { 5644 switch (field) { 5645 #define SHADOW_FIELD_RO(x, y) case x: 5646 #include "vmcs_shadow_fields.h" 5647 return true; 5648 default: 5649 break; 5650 } 5651 return false; 5652 } 5653 handle_vmwrite(struct kvm_vcpu * vcpu)5654 static int handle_vmwrite(struct kvm_vcpu *vcpu) 5655 { 5656 struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) 5657 : get_vmcs12(vcpu); 5658 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5659 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5660 struct vcpu_vmx *vmx = to_vmx(vcpu); 5661 struct x86_exception e; 5662 unsigned long field; 5663 short offset; 5664 gva_t gva; 5665 int len, r; 5666 5667 /* 5668 * The value to write might be 32 or 64 bits, depending on L1's long 5669 * mode, and eventually we need to write that into a field of several 5670 * possible lengths. The code below first zero-extends the value to 64 5671 * bit (value), and then copies only the appropriate number of 5672 * bits into the vmcs12 field. 5673 */ 5674 u64 value = 0; 5675 5676 if (!nested_vmx_check_permission(vcpu)) 5677 return 1; 5678 5679 /* 5680 * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA, 5681 * any VMWRITE sets the ALU flags for VMfailInvalid. 5682 */ 5683 if (vmx->nested.current_vmptr == INVALID_GPA || 5684 (is_guest_mode(vcpu) && 5685 get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA)) 5686 return nested_vmx_failInvalid(vcpu); 5687 5688 if (instr_info & BIT(10)) 5689 value = kvm_register_read(vcpu, (((instr_info) >> 3) & 0xf)); 5690 else { 5691 len = is_64_bit_mode(vcpu) ? 8 : 4; 5692 if (get_vmx_mem_address(vcpu, exit_qualification, 5693 instr_info, false, len, &gva)) 5694 return 1; 5695 r = kvm_read_guest_virt(vcpu, gva, &value, len, &e); 5696 if (r != X86EMUL_CONTINUE) 5697 return kvm_handle_memory_failure(vcpu, r, &e); 5698 } 5699 5700 field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf)); 5701 5702 offset = get_vmcs12_field_offset(field); 5703 if (offset < 0) 5704 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 5705 5706 /* 5707 * If the vCPU supports "VMWRITE to any supported field in the 5708 * VMCS," then the "read-only" fields are actually read/write. 5709 */ 5710 if (vmcs_field_readonly(field) && 5711 !nested_cpu_has_vmwrite_any_field(vcpu)) 5712 return nested_vmx_fail(vcpu, VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); 5713 5714 /* 5715 * Ensure vmcs12 is up-to-date before any VMWRITE that dirties 5716 * vmcs12, else we may crush a field or consume a stale value. 
5717 */ 5718 if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) 5719 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 5720 5721 /* 5722 * Some Intel CPUs intentionally drop the reserved bits of the AR byte 5723 * fields on VMWRITE. Emulate this behavior to ensure consistent KVM 5724 * behavior regardless of the underlying hardware, e.g. if an AR_BYTE 5725 * field is intercepted for VMWRITE but not VMREAD (in L1), then VMREAD 5726 * from L1 will return a different value than VMREAD from L2 (L1 sees 5727 * the stripped down value, L2 sees the full value as stored by KVM). 5728 */ 5729 if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES) 5730 value &= 0x1f0ff; 5731 5732 vmcs12_write_any(vmcs12, field, offset, value); 5733 5734 /* 5735 * Do not track vmcs12 dirty-state if in guest-mode as we actually 5736 * dirty shadow vmcs12 instead of vmcs12. Fields that can be updated 5737 * by L1 without a vmexit are always updated in the vmcs02, i.e. don't 5738 * "dirty" vmcs12, all others go down the prepare_vmcs02() slow path. 5739 */ 5740 if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) { 5741 /* 5742 * L1 can read these fields without exiting, ensure the 5743 * shadow VMCS is up-to-date. 5744 */ 5745 if (enable_shadow_vmcs && is_shadow_field_ro(field)) { 5746 preempt_disable(); 5747 vmcs_load(vmx->vmcs01.shadow_vmcs); 5748 5749 __vmcs_writel(field, value); 5750 5751 vmcs_clear(vmx->vmcs01.shadow_vmcs); 5752 vmcs_load(vmx->loaded_vmcs->vmcs); 5753 preempt_enable(); 5754 } 5755 vmx->nested.dirty_vmcs12 = true; 5756 } 5757 5758 return nested_vmx_succeed(vcpu); 5759 } 5760 set_current_vmptr(struct vcpu_vmx * vmx,gpa_t vmptr)5761 static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr) 5762 { 5763 vmx->nested.current_vmptr = vmptr; 5764 if (enable_shadow_vmcs) { 5765 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_SHADOW_VMCS); 5766 vmcs_write64(VMCS_LINK_POINTER, 5767 __pa(vmx->vmcs01.shadow_vmcs)); 5768 vmx->nested.need_vmcs12_to_shadow_sync = true; 5769 } 5770 vmx->nested.dirty_vmcs12 = true; 5771 vmx->nested.force_msr_bitmap_recalc = true; 5772 } 5773 5774 /* Emulate the VMPTRLD instruction */ handle_vmptrld(struct kvm_vcpu * vcpu)5775 static int handle_vmptrld(struct kvm_vcpu *vcpu) 5776 { 5777 struct vcpu_vmx *vmx = to_vmx(vcpu); 5778 gpa_t vmptr; 5779 int r; 5780 5781 if (!nested_vmx_check_permission(vcpu)) 5782 return 1; 5783 5784 if (nested_vmx_get_vmptr(vcpu, &vmptr, &r)) 5785 return r; 5786 5787 if (!page_address_valid(vcpu, vmptr)) 5788 return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS); 5789 5790 if (vmptr == vmx->nested.vmxon_ptr) 5791 return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_VMXON_POINTER); 5792 5793 /* Forbid normal VMPTRLD if Enlightened version was used */ 5794 if (nested_vmx_is_evmptr12_valid(vmx)) 5795 return 1; 5796 5797 if (vmx->nested.current_vmptr != vmptr) { 5798 struct gfn_to_hva_cache *ghc = &vmx->nested.vmcs12_cache; 5799 struct vmcs_hdr hdr; 5800 5801 if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, vmptr, VMCS12_SIZE)) { 5802 /* 5803 * Reads from an unbacked page return all 1s, 5804 * which means that the 32 bits located at the 5805 * given physical address won't match the required 5806 * VMCS12_REVISION identifier. 
5807 */ 5808 return nested_vmx_fail(vcpu, 5809 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5810 } 5811 5812 if (kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr, 5813 offsetof(struct vmcs12, hdr), 5814 sizeof(hdr))) { 5815 return nested_vmx_fail(vcpu, 5816 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5817 } 5818 5819 if (hdr.revision_id != VMCS12_REVISION || 5820 (hdr.shadow_vmcs && 5821 !nested_cpu_has_vmx_shadow_vmcs(vcpu))) { 5822 return nested_vmx_fail(vcpu, 5823 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5824 } 5825 5826 nested_release_vmcs12(vcpu); 5827 5828 /* 5829 * Load VMCS12 from guest memory since it is not already 5830 * cached. 5831 */ 5832 if (kvm_read_guest_cached(vcpu->kvm, ghc, vmx->nested.cached_vmcs12, 5833 VMCS12_SIZE)) { 5834 return nested_vmx_fail(vcpu, 5835 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5836 } 5837 5838 set_current_vmptr(vmx, vmptr); 5839 } 5840 5841 return nested_vmx_succeed(vcpu); 5842 } 5843 5844 /* Emulate the VMPTRST instruction */ handle_vmptrst(struct kvm_vcpu * vcpu)5845 static int handle_vmptrst(struct kvm_vcpu *vcpu) 5846 { 5847 unsigned long exit_qual = vmx_get_exit_qual(vcpu); 5848 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5849 gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr; 5850 struct x86_exception e; 5851 gva_t gva; 5852 int r; 5853 5854 if (!nested_vmx_check_permission(vcpu)) 5855 return 1; 5856 5857 if (unlikely(nested_vmx_is_evmptr12_valid(to_vmx(vcpu)))) 5858 return 1; 5859 5860 if (get_vmx_mem_address(vcpu, exit_qual, instr_info, 5861 true, sizeof(gpa_t), &gva)) 5862 return 1; 5863 /* *_system ok, nested_vmx_check_permission has verified cpl=0 */ 5864 r = kvm_write_guest_virt_system(vcpu, gva, (void *)¤t_vmptr, 5865 sizeof(gpa_t), &e); 5866 if (r != X86EMUL_CONTINUE) 5867 return kvm_handle_memory_failure(vcpu, r, &e); 5868 5869 return nested_vmx_succeed(vcpu); 5870 } 5871 5872 /* Emulate the INVEPT instruction */ handle_invept(struct kvm_vcpu * vcpu)5873 static int handle_invept(struct kvm_vcpu *vcpu) 5874 { 5875 struct vcpu_vmx *vmx = to_vmx(vcpu); 5876 u32 vmx_instruction_info, types; 5877 unsigned long type, roots_to_free; 5878 struct kvm_mmu *mmu; 5879 gva_t gva; 5880 struct x86_exception e; 5881 struct { 5882 u64 eptp, gpa; 5883 } operand; 5884 int i, r, gpr_index; 5885 5886 if (!(vmx->nested.msrs.secondary_ctls_high & 5887 SECONDARY_EXEC_ENABLE_EPT) || 5888 !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) { 5889 kvm_queue_exception(vcpu, UD_VECTOR); 5890 return 1; 5891 } 5892 5893 if (!nested_vmx_check_permission(vcpu)) 5894 return 1; 5895 5896 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5897 gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); 5898 type = kvm_register_read(vcpu, gpr_index); 5899 5900 types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; 5901 5902 if (type >= 32 || !(types & (1 << type))) 5903 return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5904 5905 /* According to the Intel VMX instruction reference, the memory 5906 * operand is read even if it isn't needed (e.g., for type==global) 5907 */ 5908 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 5909 vmx_instruction_info, false, sizeof(operand), &gva)) 5910 return 1; 5911 r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); 5912 if (r != X86EMUL_CONTINUE) 5913 return kvm_handle_memory_failure(vcpu, r, &e); 5914 5915 /* 5916 * Nested EPT roots are always held through guest_mmu, 5917 * not root_mmu. 
5918 */ 5919 mmu = &vcpu->arch.guest_mmu; 5920 5921 switch (type) { 5922 case VMX_EPT_EXTENT_CONTEXT: 5923 if (!nested_vmx_check_eptp(vcpu, operand.eptp)) 5924 return nested_vmx_fail(vcpu, 5925 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5926 5927 roots_to_free = 0; 5928 if (nested_ept_root_matches(mmu->root.hpa, mmu->root.pgd, 5929 operand.eptp)) 5930 roots_to_free |= KVM_MMU_ROOT_CURRENT; 5931 5932 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { 5933 if (nested_ept_root_matches(mmu->prev_roots[i].hpa, 5934 mmu->prev_roots[i].pgd, 5935 operand.eptp)) 5936 roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); 5937 } 5938 break; 5939 case VMX_EPT_EXTENT_GLOBAL: 5940 roots_to_free = KVM_MMU_ROOTS_ALL; 5941 break; 5942 default: 5943 BUG(); 5944 break; 5945 } 5946 5947 if (roots_to_free) 5948 kvm_mmu_free_roots(vcpu->kvm, mmu, roots_to_free); 5949 5950 return nested_vmx_succeed(vcpu); 5951 } 5952 handle_invvpid(struct kvm_vcpu * vcpu)5953 static int handle_invvpid(struct kvm_vcpu *vcpu) 5954 { 5955 struct vcpu_vmx *vmx = to_vmx(vcpu); 5956 u32 vmx_instruction_info; 5957 unsigned long type, types; 5958 gva_t gva; 5959 struct x86_exception e; 5960 struct { 5961 u64 vpid; 5962 u64 gla; 5963 } operand; 5964 u16 vpid02; 5965 int r, gpr_index; 5966 5967 if (!(vmx->nested.msrs.secondary_ctls_high & 5968 SECONDARY_EXEC_ENABLE_VPID) || 5969 !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) { 5970 kvm_queue_exception(vcpu, UD_VECTOR); 5971 return 1; 5972 } 5973 5974 if (!nested_vmx_check_permission(vcpu)) 5975 return 1; 5976 5977 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5978 gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); 5979 type = kvm_register_read(vcpu, gpr_index); 5980 5981 types = (vmx->nested.msrs.vpid_caps & 5982 VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8; 5983 5984 if (type >= 32 || !(types & (1 << type))) 5985 return nested_vmx_fail(vcpu, 5986 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5987 5988 /* according to the intel vmx instruction reference, the memory 5989 * operand is read even if it isn't needed (e.g., for type==global) 5990 */ 5991 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 5992 vmx_instruction_info, false, sizeof(operand), &gva)) 5993 return 1; 5994 r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); 5995 if (r != X86EMUL_CONTINUE) 5996 return kvm_handle_memory_failure(vcpu, r, &e); 5997 5998 if (operand.vpid >> 16) 5999 return nested_vmx_fail(vcpu, 6000 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 6001 6002 /* 6003 * Always flush the effective vpid02, i.e. never flush the current VPID 6004 * and never explicitly flush vpid01. INVVPID targets a VPID, not a 6005 * VMCS, and so whether or not the current vmcs12 has VPID enabled is 6006 * irrelevant (and there may not be a loaded vmcs12). 6007 */ 6008 vpid02 = nested_get_vpid02(vcpu); 6009 switch (type) { 6010 case VMX_VPID_EXTENT_INDIVIDUAL_ADDR: 6011 /* 6012 * LAM doesn't apply to addresses that are inputs to TLB 6013 * invalidation. 
6014 */ 6015 if (!operand.vpid || 6016 is_noncanonical_invlpg_address(operand.gla, vcpu)) 6017 return nested_vmx_fail(vcpu, 6018 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 6019 vpid_sync_vcpu_addr(vpid02, operand.gla); 6020 break; 6021 case VMX_VPID_EXTENT_SINGLE_CONTEXT: 6022 case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL: 6023 if (!operand.vpid) 6024 return nested_vmx_fail(vcpu, 6025 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 6026 vpid_sync_context(vpid02); 6027 break; 6028 case VMX_VPID_EXTENT_ALL_CONTEXT: 6029 vpid_sync_context(vpid02); 6030 break; 6031 default: 6032 WARN_ON_ONCE(1); 6033 return kvm_skip_emulated_instruction(vcpu); 6034 } 6035 6036 /* 6037 * Sync the shadow page tables if EPT is disabled, L1 is invalidating 6038 * linear mappings for L2 (tagged with L2's VPID). Free all guest 6039 * roots as VPIDs are not tracked in the MMU role. 6040 * 6041 * Note, this operates on root_mmu, not guest_mmu, as L1 and L2 share 6042 * an MMU when EPT is disabled. 6043 * 6044 * TODO: sync only the affected SPTEs for INVDIVIDUAL_ADDR. 6045 */ 6046 if (!enable_ept) 6047 kvm_mmu_free_guest_mode_roots(vcpu->kvm, &vcpu->arch.root_mmu); 6048 6049 return nested_vmx_succeed(vcpu); 6050 } 6051 nested_vmx_eptp_switching(struct kvm_vcpu * vcpu,struct vmcs12 * vmcs12)6052 static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu, 6053 struct vmcs12 *vmcs12) 6054 { 6055 u32 index = kvm_rcx_read(vcpu); 6056 u64 new_eptp; 6057 6058 if (WARN_ON_ONCE(!nested_cpu_has_ept(vmcs12))) 6059 return 1; 6060 if (index >= VMFUNC_EPTP_ENTRIES) 6061 return 1; 6062 6063 if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT, 6064 &new_eptp, index * 8, 8)) 6065 return 1; 6066 6067 /* 6068 * If the (L2) guest does a vmfunc to the currently 6069 * active ept pointer, we don't have to do anything else 6070 */ 6071 if (vmcs12->ept_pointer != new_eptp) { 6072 if (!nested_vmx_check_eptp(vcpu, new_eptp)) 6073 return 1; 6074 6075 vmcs12->ept_pointer = new_eptp; 6076 nested_ept_new_eptp(vcpu); 6077 6078 if (!nested_cpu_has_vpid(vmcs12)) 6079 kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); 6080 } 6081 6082 return 0; 6083 } 6084 handle_vmfunc(struct kvm_vcpu * vcpu)6085 static int handle_vmfunc(struct kvm_vcpu *vcpu) 6086 { 6087 struct vcpu_vmx *vmx = to_vmx(vcpu); 6088 struct vmcs12 *vmcs12; 6089 u32 function = kvm_rax_read(vcpu); 6090 6091 /* 6092 * VMFUNC should never execute cleanly while L1 is active; KVM supports 6093 * VMFUNC for nested VMs, but not for L1. 6094 */ 6095 if (WARN_ON_ONCE(!is_guest_mode(vcpu))) { 6096 kvm_queue_exception(vcpu, UD_VECTOR); 6097 return 1; 6098 } 6099 6100 vmcs12 = get_vmcs12(vcpu); 6101 6102 /* 6103 * #UD on out-of-bounds function has priority over VM-Exit, and VMFUNC 6104 * is enabled in vmcs02 if and only if it's enabled in vmcs12. 6105 */ 6106 if (WARN_ON_ONCE((function > 63) || !nested_cpu_has_vmfunc(vmcs12))) { 6107 kvm_queue_exception(vcpu, UD_VECTOR); 6108 return 1; 6109 } 6110 6111 if (!(vmcs12->vm_function_control & BIT_ULL(function))) 6112 goto fail; 6113 6114 switch (function) { 6115 case 0: 6116 if (nested_vmx_eptp_switching(vcpu, vmcs12)) 6117 goto fail; 6118 break; 6119 default: 6120 goto fail; 6121 } 6122 return kvm_skip_emulated_instruction(vcpu); 6123 6124 fail: 6125 /* 6126 * This is effectively a reflected VM-Exit, as opposed to a synthesized 6127 * nested VM-Exit. Pass the original exit reason, i.e. don't hardcode 6128 * EXIT_REASON_VMFUNC as the exit reason. 
6129 */ 6130 nested_vmx_vmexit(vcpu, vmx->exit_reason.full, 6131 vmx_get_intr_info(vcpu), 6132 vmx_get_exit_qual(vcpu)); 6133 return 1; 6134 } 6135 6136 /* 6137 * Return true if an IO instruction with the specified port and size should cause 6138 * a VM-exit into L1. 6139 */ nested_vmx_check_io_bitmaps(struct kvm_vcpu * vcpu,unsigned int port,int size)6140 bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port, 6141 int size) 6142 { 6143 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 6144 gpa_t bitmap, last_bitmap; 6145 u8 b; 6146 6147 last_bitmap = INVALID_GPA; 6148 b = -1; 6149 6150 while (size > 0) { 6151 if (port < 0x8000) 6152 bitmap = vmcs12->io_bitmap_a; 6153 else if (port < 0x10000) 6154 bitmap = vmcs12->io_bitmap_b; 6155 else 6156 return true; 6157 bitmap += (port & 0x7fff) / 8; 6158 6159 if (last_bitmap != bitmap) 6160 if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1)) 6161 return true; 6162 if (b & (1 << (port & 7))) 6163 return true; 6164 6165 port++; 6166 size--; 6167 last_bitmap = bitmap; 6168 } 6169 6170 return false; 6171 } 6172 nested_vmx_exit_handled_io(struct kvm_vcpu * vcpu,struct vmcs12 * vmcs12)6173 static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, 6174 struct vmcs12 *vmcs12) 6175 { 6176 unsigned long exit_qualification; 6177 unsigned short port; 6178 int size; 6179 6180 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) 6181 return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); 6182 6183 exit_qualification = vmx_get_exit_qual(vcpu); 6184 6185 port = exit_qualification >> 16; 6186 size = (exit_qualification & 7) + 1; 6187 6188 return nested_vmx_check_io_bitmaps(vcpu, port, size); 6189 } 6190 6191 /* 6192 * Return 1 if we should exit from L2 to L1 to handle an MSR access, 6193 * rather than handle it ourselves in L0. I.e., check whether L1 expressed 6194 * disinterest in the current event (read or write a specific MSR) by using an 6195 * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps. 6196 */ nested_vmx_exit_handled_msr(struct kvm_vcpu * vcpu,struct vmcs12 * vmcs12,union vmx_exit_reason exit_reason)6197 static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, 6198 struct vmcs12 *vmcs12, 6199 union vmx_exit_reason exit_reason) 6200 { 6201 u32 msr_index = kvm_rcx_read(vcpu); 6202 gpa_t bitmap; 6203 6204 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) 6205 return true; 6206 6207 /* 6208 * The MSR_BITMAP page is divided into four 1024-byte bitmaps, 6209 * for the four combinations of read/write and low/high MSR numbers. 6210 * First we need to figure out which of the four to use: 6211 */ 6212 bitmap = vmcs12->msr_bitmap; 6213 if (exit_reason.basic == EXIT_REASON_MSR_WRITE) 6214 bitmap += 2048; 6215 if (msr_index >= 0xc0000000) { 6216 msr_index -= 0xc0000000; 6217 bitmap += 1024; 6218 } 6219 6220 /* Then read the msr_index'th bit from this bitmap: */ 6221 if (msr_index < 1024*8) { 6222 unsigned char b; 6223 if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1)) 6224 return true; 6225 return 1 & (b >> (msr_index & 7)); 6226 } else 6227 return true; /* let L1 handle the wrong parameter */ 6228 } 6229 6230 /* 6231 * Return 1 if we should exit from L2 to L1 to handle a CR access exit, 6232 * rather than handle it ourselves in L0. I.e., check if L1 wanted to 6233 * intercept (via guest_host_mask etc.) the current event. 
6234 */ nested_vmx_exit_handled_cr(struct kvm_vcpu * vcpu,struct vmcs12 * vmcs12)6235 static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, 6236 struct vmcs12 *vmcs12) 6237 { 6238 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 6239 int cr = exit_qualification & 15; 6240 int reg; 6241 unsigned long val; 6242 6243 switch ((exit_qualification >> 4) & 3) { 6244 case 0: /* mov to cr */ 6245 reg = (exit_qualification >> 8) & 15; 6246 val = kvm_register_read(vcpu, reg); 6247 switch (cr) { 6248 case 0: 6249 if (vmcs12->cr0_guest_host_mask & 6250 (val ^ vmcs12->cr0_read_shadow)) 6251 return true; 6252 break; 6253 case 3: 6254 if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING)) 6255 return true; 6256 break; 6257 case 4: 6258 if (vmcs12->cr4_guest_host_mask & 6259 (vmcs12->cr4_read_shadow ^ val)) 6260 return true; 6261 break; 6262 case 8: 6263 if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING)) 6264 return true; 6265 break; 6266 } 6267 break; 6268 case 2: /* clts */ 6269 if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) && 6270 (vmcs12->cr0_read_shadow & X86_CR0_TS)) 6271 return true; 6272 break; 6273 case 1: /* mov from cr */ 6274 switch (cr) { 6275 case 3: 6276 if (vmcs12->cpu_based_vm_exec_control & 6277 CPU_BASED_CR3_STORE_EXITING) 6278 return true; 6279 break; 6280 case 8: 6281 if (vmcs12->cpu_based_vm_exec_control & 6282 CPU_BASED_CR8_STORE_EXITING) 6283 return true; 6284 break; 6285 } 6286 break; 6287 case 3: /* lmsw */ 6288 /* 6289 * lmsw can change bits 1..3 of cr0, and only set bit 0 of 6290 * cr0. Other attempted changes are ignored, with no exit. 6291 */ 6292 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; 6293 if (vmcs12->cr0_guest_host_mask & 0xe & 6294 (val ^ vmcs12->cr0_read_shadow)) 6295 return true; 6296 if ((vmcs12->cr0_guest_host_mask & 0x1) && 6297 !(vmcs12->cr0_read_shadow & 0x1) && 6298 (val & 0x1)) 6299 return true; 6300 break; 6301 } 6302 return false; 6303 } 6304 nested_vmx_exit_handled_encls(struct kvm_vcpu * vcpu,struct vmcs12 * vmcs12)6305 static bool nested_vmx_exit_handled_encls(struct kvm_vcpu *vcpu, 6306 struct vmcs12 *vmcs12) 6307 { 6308 u32 encls_leaf; 6309 6310 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SGX) || 6311 !nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING)) 6312 return false; 6313 6314 encls_leaf = kvm_rax_read(vcpu); 6315 if (encls_leaf > 62) 6316 encls_leaf = 63; 6317 return vmcs12->encls_exiting_bitmap & BIT_ULL(encls_leaf); 6318 } 6319 nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu * vcpu,struct vmcs12 * vmcs12,gpa_t bitmap)6320 static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu, 6321 struct vmcs12 *vmcs12, gpa_t bitmap) 6322 { 6323 u32 vmx_instruction_info; 6324 unsigned long field; 6325 u8 b; 6326 6327 if (!nested_cpu_has_shadow_vmcs(vmcs12)) 6328 return true; 6329 6330 /* Decode instruction info and find the field to access */ 6331 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 6332 field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); 6333 6334 /* Out-of-range fields always cause a VM exit from L2 to L1 */ 6335 if (field >> 15) 6336 return true; 6337 6338 if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1)) 6339 return true; 6340 6341 return 1 & (b >> (field & 7)); 6342 } 6343 nested_vmx_exit_handled_mtf(struct vmcs12 * vmcs12)6344 static bool nested_vmx_exit_handled_mtf(struct vmcs12 *vmcs12) 6345 { 6346 u32 entry_intr_info = vmcs12->vm_entry_intr_info_field; 6347 6348 if (nested_cpu_has_mtf(vmcs12)) 6349 return true; 6350 6351 /* 6352 * An MTF 
VM-exit may be injected into the guest by setting the 6353 * interruption-type to 7 (other event) and the vector field to 0. Such 6354 * is the case regardless of the 'monitor trap flag' VM-execution 6355 * control. 6356 */ 6357 return entry_intr_info == (INTR_INFO_VALID_MASK 6358 | INTR_TYPE_OTHER_EVENT); 6359 } 6360 6361 /* 6362 * Return true if L0 wants to handle an exit from L2 regardless of whether or not 6363 * L1 wants the exit. Only call this when in is_guest_mode (L2). 6364 */ nested_vmx_l0_wants_exit(struct kvm_vcpu * vcpu,union vmx_exit_reason exit_reason)6365 static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, 6366 union vmx_exit_reason exit_reason) 6367 { 6368 u32 intr_info; 6369 6370 switch ((u16)exit_reason.basic) { 6371 case EXIT_REASON_EXCEPTION_NMI: 6372 intr_info = vmx_get_intr_info(vcpu); 6373 if (is_nmi(intr_info)) 6374 return true; 6375 else if (is_page_fault(intr_info)) 6376 return vcpu->arch.apf.host_apf_flags || 6377 vmx_need_pf_intercept(vcpu); 6378 else if (is_debug(intr_info) && 6379 vcpu->guest_debug & 6380 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) 6381 return true; 6382 else if (is_breakpoint(intr_info) && 6383 vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) 6384 return true; 6385 else if (is_alignment_check(intr_info) && 6386 !vmx_guest_inject_ac(vcpu)) 6387 return true; 6388 else if (is_ve_fault(intr_info)) 6389 return true; 6390 return false; 6391 case EXIT_REASON_EXTERNAL_INTERRUPT: 6392 return true; 6393 case EXIT_REASON_MCE_DURING_VMENTRY: 6394 return true; 6395 case EXIT_REASON_EPT_VIOLATION: 6396 /* 6397 * L0 always deals with the EPT violation. If nested EPT is 6398 * used, and the nested mmu code discovers that the address is 6399 * missing in the guest EPT table (EPT12), the EPT violation 6400 * will be injected with nested_ept_inject_page_fault() 6401 */ 6402 return true; 6403 case EXIT_REASON_EPT_MISCONFIG: 6404 /* 6405 * L2 never uses directly L1's EPT, but rather L0's own EPT 6406 * table (shadow on EPT) or a merged EPT table that L0 built 6407 * (EPT on EPT). So any problems with the structure of the 6408 * table is L0's fault. 6409 */ 6410 return true; 6411 case EXIT_REASON_PREEMPTION_TIMER: 6412 return true; 6413 case EXIT_REASON_PML_FULL: 6414 /* 6415 * PML is emulated for an L1 VMM and should never be enabled in 6416 * vmcs02, always "handle" PML_FULL by exiting to userspace. 6417 */ 6418 return true; 6419 case EXIT_REASON_VMFUNC: 6420 /* VM functions are emulated through L2->L0 vmexits. */ 6421 return true; 6422 case EXIT_REASON_BUS_LOCK: 6423 /* 6424 * At present, bus lock VM exit is never exposed to L1. 6425 * Handle L2's bus locks in L0 directly. 6426 */ 6427 return true; 6428 #ifdef CONFIG_KVM_HYPERV 6429 case EXIT_REASON_VMCALL: 6430 /* Hyper-V L2 TLB flush hypercall is handled by L0 */ 6431 return guest_hv_cpuid_has_l2_tlb_flush(vcpu) && 6432 nested_evmcs_l2_tlb_flush_enabled(vcpu) && 6433 kvm_hv_is_tlb_flush_hcall(vcpu); 6434 #endif 6435 default: 6436 break; 6437 } 6438 return false; 6439 } 6440 6441 /* 6442 * Return 1 if L1 wants to intercept an exit from L2. Only call this when in 6443 * is_guest_mode (L2). 
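 * Most cases consult the corresponding vmcs12 control (execution controls,
 * exception bitmap, I/O and MSR bitmaps); exits that trap unconditionally,
 * and unknown exit reasons, simply return true and are reflected to L1.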
 */
static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu,
				     union vmx_exit_reason exit_reason)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	u32 intr_info;

	switch ((u16)exit_reason.basic) {
	case EXIT_REASON_EXCEPTION_NMI:
		intr_info = vmx_get_intr_info(vcpu);
		if (is_nmi(intr_info))
			return true;
		else if (is_page_fault(intr_info))
			return true;
		return vmcs12->exception_bitmap &
				(1u << (intr_info & INTR_INFO_VECTOR_MASK));
	case EXIT_REASON_EXTERNAL_INTERRUPT:
		return nested_exit_on_intr(vcpu);
	case EXIT_REASON_TRIPLE_FAULT:
		return true;
	case EXIT_REASON_INTERRUPT_WINDOW:
		return nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING);
	case EXIT_REASON_NMI_WINDOW:
		return nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING);
	case EXIT_REASON_TASK_SWITCH:
		return true;
	case EXIT_REASON_CPUID:
		return true;
	case EXIT_REASON_HLT:
		return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING);
	case EXIT_REASON_INVD:
		return true;
	case EXIT_REASON_INVLPG:
		return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
	case EXIT_REASON_RDPMC:
		return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
	case EXIT_REASON_RDRAND:
		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING);
	case EXIT_REASON_RDSEED:
		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING);
	case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP:
		return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
	case EXIT_REASON_VMREAD:
		return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
			vmcs12->vmread_bitmap);
	case EXIT_REASON_VMWRITE:
		return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
			vmcs12->vmwrite_bitmap);
	case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
	case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD:
	case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME:
	case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
	case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID:
		/*
		 * VMX instructions trap unconditionally. This allows L1 to
		 * emulate them for its L2 guest, i.e., allows 3-level nesting!
		 */
		return true;
	case EXIT_REASON_CR_ACCESS:
		return nested_vmx_exit_handled_cr(vcpu, vmcs12);
	case EXIT_REASON_DR_ACCESS:
		return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING);
	case EXIT_REASON_IO_INSTRUCTION:
		return nested_vmx_exit_handled_io(vcpu, vmcs12);
	case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR:
		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC);
	case EXIT_REASON_MSR_READ:
	case EXIT_REASON_MSR_WRITE:
		return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
	case EXIT_REASON_INVALID_STATE:
		return true;
	case EXIT_REASON_MWAIT_INSTRUCTION:
		return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING);
	case EXIT_REASON_MONITOR_TRAP_FLAG:
		return nested_vmx_exit_handled_mtf(vmcs12);
	case EXIT_REASON_MONITOR_INSTRUCTION:
		return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING);
	case EXIT_REASON_PAUSE_INSTRUCTION:
		return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) ||
			nested_cpu_has2(vmcs12,
				SECONDARY_EXEC_PAUSE_LOOP_EXITING);
	case EXIT_REASON_MCE_DURING_VMENTRY:
		return true;
	case EXIT_REASON_TPR_BELOW_THRESHOLD:
		return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW);
	case EXIT_REASON_APIC_ACCESS:
	case EXIT_REASON_APIC_WRITE:
	case EXIT_REASON_EOI_INDUCED:
		/*
		 * The controls for "virtualize APIC accesses," "APIC-
		 * register virtualization," and "virtual-interrupt
		 * delivery" only come from vmcs12.
		 */
		return true;
	case EXIT_REASON_INVPCID:
		return
			nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) &&
			nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
	case EXIT_REASON_WBINVD:
		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
	case EXIT_REASON_XSETBV:
		return true;
	case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS:
		/*
		 * This should never happen, since it is not possible to
		 * set XSS to a non-zero value---neither in L1 nor in L2.
		 * If it were, XSS would have to be checked against
		 * the XSS exit bitmap in vmcs12.
		 */
		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_XSAVES);
	case EXIT_REASON_UMWAIT:
	case EXIT_REASON_TPAUSE:
		return nested_cpu_has2(vmcs12,
			SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE);
	case EXIT_REASON_ENCLS:
		return nested_vmx_exit_handled_encls(vcpu, vmcs12);
	case EXIT_REASON_NOTIFY:
		/* Notify VM exit is not exposed to L1 */
		return false;
	default:
		return true;
	}
}
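/*
 * Taken together, the two predicates above drive the reflection decision made
 * by nested_vmx_reflect_vmexit() below:
 *
 *	nested_vmx_l0_wants_exit() == true	-> L0 handles the exit
 *	nested_vmx_l1_wants_exit() == true	-> the exit is reflected to L1
 *	neither wants the exit			-> L0 handles the exit
 */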
/*
 * Conditionally reflect a VM-Exit into L1.  Returns %true if the VM-Exit was
 * reflected into L1.
 */
bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	union vmx_exit_reason exit_reason = vmx->exit_reason;
	unsigned long exit_qual;
	u32 exit_intr_info;

	WARN_ON_ONCE(vmx->nested.nested_run_pending);

	/*
	 * Late nested VM-Fail shares the same flow as nested VM-Exit since KVM
	 * has already loaded L2's state.
	 */
	if (unlikely(vmx->fail)) {
		trace_kvm_nested_vmenter_failed(
			"hardware VM-instruction error: ",
			vmcs_read32(VM_INSTRUCTION_ERROR));
		exit_intr_info = 0;
		exit_qual = 0;
		goto reflect_vmexit;
	}

	trace_kvm_nested_vmexit(vcpu, KVM_ISA_VMX);

	/* If L0 (KVM) wants the exit, it trumps L1's desires. */
	if (nested_vmx_l0_wants_exit(vcpu, exit_reason))
		return false;

	/* If L1 doesn't want the exit, handle it in L0. */
	if (!nested_vmx_l1_wants_exit(vcpu, exit_reason))
		return false;

	/*
	 * vmcs.VM_EXIT_INTR_INFO is only valid for EXCEPTION_NMI exits.  For
	 * EXTERNAL_INTERRUPT, the value for vmcs12->vm_exit_intr_info would
	 * need to be synthesized by querying the in-kernel LAPIC, but external
	 * interrupts are never reflected to L1 so it's a non-issue.
	 */
	exit_intr_info = vmx_get_intr_info(vcpu);
	if (is_exception_with_error_code(exit_intr_info)) {
		struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

		vmcs12->vm_exit_intr_error_code =
			vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
	}
	exit_qual = vmx_get_exit_qual(vcpu);

reflect_vmexit:
	nested_vmx_vmexit(vcpu, exit_reason.full, exit_intr_info, exit_qual);
	return true;
}
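/*
 * vmx_get_nested_state() and vmx_set_nested_state() below back the
 * KVM_GET_NESTED_STATE and KVM_SET_NESTED_STATE vCPU ioctls.  A rough sketch
 * of the typical userspace save/restore flow (error handling omitted; the
 * required buffer size is reported by KVM_CHECK_EXTENSION on
 * KVM_CAP_NESTED_STATE):
 *
 *	int max = ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_NESTED_STATE);
 *	struct kvm_nested_state *state = calloc(1, max);
 *
 *	state->size = max;
 *	ioctl(vcpu_fd, KVM_GET_NESTED_STATE, state);
 *	... save/migrate ...
 *	ioctl(vcpu_fd, KVM_SET_NESTED_STATE, state);
 */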
static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
				struct kvm_nested_state __user *user_kvm_nested_state,
				u32 user_data_size)
{
	struct vcpu_vmx *vmx;
	struct vmcs12 *vmcs12;
	struct kvm_nested_state kvm_state = {
		.flags = 0,
		.format = KVM_STATE_NESTED_FORMAT_VMX,
		.size = sizeof(kvm_state),
		.hdr.vmx.flags = 0,
		.hdr.vmx.vmxon_pa = INVALID_GPA,
		.hdr.vmx.vmcs12_pa = INVALID_GPA,
		.hdr.vmx.preemption_timer_deadline = 0,
	};
	struct kvm_vmx_nested_state_data __user *user_vmx_nested_state =
		&user_kvm_nested_state->data.vmx[0];

	if (!vcpu)
		return kvm_state.size + sizeof(*user_vmx_nested_state);

	vmx = to_vmx(vcpu);
	vmcs12 = get_vmcs12(vcpu);

	if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX) &&
	    (vmx->nested.vmxon || vmx->nested.smm.vmxon)) {
		kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr;
		kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr;

		if (vmx_has_valid_vmcs12(vcpu)) {
			kvm_state.size += sizeof(user_vmx_nested_state->vmcs12);

			/* 'hv_evmcs_vmptr' can also be EVMPTR_MAP_PENDING here */
			if (nested_vmx_is_evmptr12_set(vmx))
				kvm_state.flags |= KVM_STATE_NESTED_EVMCS;

			if (is_guest_mode(vcpu) &&
			    nested_cpu_has_shadow_vmcs(vmcs12) &&
			    vmcs12->vmcs_link_pointer != INVALID_GPA)
				kvm_state.size += sizeof(user_vmx_nested_state->shadow_vmcs12);
		}

		if (vmx->nested.smm.vmxon)
			kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON;

		if (vmx->nested.smm.guest_mode)
			kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE;

		if (is_guest_mode(vcpu)) {
			kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE;

			if (vmx->nested.nested_run_pending)
				kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING;

			if (vmx->nested.mtf_pending)
				kvm_state.flags |= KVM_STATE_NESTED_MTF_PENDING;

			if (nested_cpu_has_preemption_timer(vmcs12) &&
			    vmx->nested.has_preemption_timer_deadline) {
				kvm_state.hdr.vmx.flags |=
					KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE;
				kvm_state.hdr.vmx.preemption_timer_deadline =
					vmx->nested.preemption_timer_deadline;
			}
		}
	}

	if (user_data_size < kvm_state.size)
		goto out;

	if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state)))
		return -EFAULT;

	if (!vmx_has_valid_vmcs12(vcpu))
		goto out;

	/*
	 * When running L2, the authoritative vmcs12 state is in the
	 * vmcs02.  When running L1, the authoritative vmcs12 state is
	 * in the shadow or enlightened vmcs linked to vmcs01, unless
	 * need_vmcs12_to_shadow_sync is set, in which case the authoritative
	 * vmcs12 state is in the vmcs12 already.
	 */
	if (is_guest_mode(vcpu)) {
		sync_vmcs02_to_vmcs12(vcpu, vmcs12);
		sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
	} else {
		copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu));
		if (!vmx->nested.need_vmcs12_to_shadow_sync) {
			if (nested_vmx_is_evmptr12_valid(vmx))
				/*
				 * The L1 hypervisor is not obliged to keep the
				 * eVMCS clean-fields data up-to-date while not
				 * in guest mode; 'hv_clean_fields' is only
				 * meaningful at VM-entry, so ignore it here
				 * and do a full copy.
				 */
				copy_enlightened_to_vmcs12(vmx, 0);
			else if (enable_shadow_vmcs)
				copy_shadow_to_vmcs12(vmx);
		}
	}

	BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE);
	BUILD_BUG_ON(sizeof(user_vmx_nested_state->shadow_vmcs12) < VMCS12_SIZE);

	/*
	 * Copy over the full allocated size of vmcs12 rather than just the size
	 * of the struct.
	 */
	if (copy_to_user(user_vmx_nested_state->vmcs12, vmcs12, VMCS12_SIZE))
		return -EFAULT;

	if (nested_cpu_has_shadow_vmcs(vmcs12) &&
	    vmcs12->vmcs_link_pointer != INVALID_GPA) {
		if (copy_to_user(user_vmx_nested_state->shadow_vmcs12,
				 get_shadow_vmcs12(vcpu), VMCS12_SIZE))
			return -EFAULT;
	}
out:
	return kvm_state.size;
}

void vmx_leave_nested(struct kvm_vcpu *vcpu)
{
	if (is_guest_mode(vcpu)) {
		to_vmx(vcpu)->nested.nested_run_pending = 0;
		nested_vmx_vmexit(vcpu, -1, 0, 0);
	}
	free_nested(vcpu);
}

static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
				struct kvm_nested_state __user *user_kvm_nested_state,
				struct kvm_nested_state *kvm_state)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmcs12 *vmcs12;
	enum vm_entry_failure_code ignored;
	struct kvm_vmx_nested_state_data __user *user_vmx_nested_state =
		&user_kvm_nested_state->data.vmx[0];
	int ret;

	if (kvm_state->format != KVM_STATE_NESTED_FORMAT_VMX)
		return -EINVAL;

	if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA) {
		if (kvm_state->hdr.vmx.smm.flags)
			return -EINVAL;

		if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA)
			return -EINVAL;

		/*
		 * KVM_STATE_NESTED_EVMCS used to signal that KVM should
		 * enable the eVMCS capability on the vCPU.  However, the code
		 * has since been changed so that the flag signals that vmcs12
		 * should be copied into the eVMCS in guest memory.
		 *
		 * To preserve backwards compatibility, allow userspace to set
		 * this flag even when there is no VMXON region.
		 */
		if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS)
			return -EINVAL;
	} else {
		if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX))
			return -EINVAL;

		if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa))
			return -EINVAL;
	}

	if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
	    (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
		return -EINVAL;

	if (kvm_state->hdr.vmx.smm.flags &
	    ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON))
		return -EINVAL;

	if (kvm_state->hdr.vmx.flags & ~KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE)
		return -EINVAL;

	/*
	 * SMM temporarily disables VMX, so we cannot be in guest mode,
	 * nor can VMLAUNCH/VMRESUME be pending.  Outside SMM, SMM flags
	 * must be zero.
	 */
	if (is_smm(vcpu) ?
		(kvm_state->flags &
		 (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING))
		: kvm_state->hdr.vmx.smm.flags)
		return -EINVAL;

	if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
	    !(kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON))
		return -EINVAL;

	if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) &&
	    (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX) ||
	     !vmx->nested.enlightened_vmcs_enabled))
		return -EINVAL;

	vmx_leave_nested(vcpu);

	if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA)
		return 0;

	vmx->nested.vmxon_ptr = kvm_state->hdr.vmx.vmxon_pa;
	ret = enter_vmx_operation(vcpu);
	if (ret)
		return ret;

	/* Empty 'VMXON' state is permitted if no VMCS loaded */
	if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12)) {
		/* See vmx_has_valid_vmcs12. */
		if ((kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE) ||
		    (kvm_state->flags & KVM_STATE_NESTED_EVMCS) ||
		    (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA))
			return -EINVAL;
		else
			return 0;
	}

	if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA) {
		if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa ||
		    !page_address_valid(vcpu, kvm_state->hdr.vmx.vmcs12_pa))
			return -EINVAL;

		set_current_vmptr(vmx, kvm_state->hdr.vmx.vmcs12_pa);
#ifdef CONFIG_KVM_HYPERV
	} else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) {
		/*
		 * nested_vmx_handle_enlightened_vmptrld() cannot be called
		 * directly from here as HV_X64_MSR_VP_ASSIST_PAGE may not be
		 * restored yet.  The eVMCS will be mapped from
		 * nested_get_vmcs12_pages().
		 */
		vmx->nested.hv_evmcs_vmptr = EVMPTR_MAP_PENDING;
		kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
#endif
	} else {
		return -EINVAL;
	}

	if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) {
		vmx->nested.smm.vmxon = true;
		vmx->nested.vmxon = false;

		if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE)
			vmx->nested.smm.guest_mode = true;
	}

	vmcs12 = get_vmcs12(vcpu);
	if (copy_from_user(vmcs12, user_vmx_nested_state->vmcs12, sizeof(*vmcs12)))
		return -EFAULT;

	if (vmcs12->hdr.revision_id != VMCS12_REVISION)
		return -EINVAL;

	if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
		return 0;

	vmx->nested.nested_run_pending =
		!!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);

	vmx->nested.mtf_pending =
		!!(kvm_state->flags & KVM_STATE_NESTED_MTF_PENDING);

	ret = -EINVAL;
	if (nested_cpu_has_shadow_vmcs(vmcs12) &&
	    vmcs12->vmcs_link_pointer != INVALID_GPA) {
		struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu);

		if (kvm_state->size <
		    sizeof(*kvm_state) +
		    sizeof(user_vmx_nested_state->vmcs12) + sizeof(*shadow_vmcs12))
			goto error_guest_mode;

		if (copy_from_user(shadow_vmcs12,
				   user_vmx_nested_state->shadow_vmcs12,
				   sizeof(*shadow_vmcs12))) {
			ret = -EFAULT;
			goto error_guest_mode;
		}

		if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION ||
		    !shadow_vmcs12->hdr.shadow_vmcs)
			goto error_guest_mode;
	}

	vmx->nested.has_preemption_timer_deadline = false;
	if (kvm_state->hdr.vmx.flags & KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) {
		vmx->nested.has_preemption_timer_deadline = true;
		vmx->nested.preemption_timer_deadline =
			kvm_state->hdr.vmx.preemption_timer_deadline;
	}

	if (nested_vmx_check_controls(vcpu, vmcs12) ||
	    nested_vmx_check_host_state(vcpu, vmcs12) ||
	    nested_vmx_check_guest_state(vcpu, vmcs12, &ignored))
		goto error_guest_mode;

	vmx->nested.dirty_vmcs12 = true;
	vmx->nested.force_msr_bitmap_recalc = true;
	ret = nested_vmx_enter_non_root_mode(vcpu, false);
	if (ret)
		goto error_guest_mode;

	if (vmx->nested.mtf_pending)
		kvm_make_request(KVM_REQ_EVENT, vcpu);

	return 0;

error_guest_mode:
	vmx->nested.nested_run_pending = 0;
	return ret;
}

void nested_vmx_set_vmcs_shadowing_bitmap(void)
{
	if (enable_shadow_vmcs) {
		vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
		vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
	}
}

/*
 * Indexing into the vmcs12 uses the VMCS encoding rotated left by 6.  Undo
 * that madness to get the encoding for comparison.
 */
#define VMCS12_IDX_TO_ENC(idx) ((u16)(((u16)(idx) >> 6) | ((u16)(idx) << 10)))
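/*
 * For example, table index 0x0091 maps back to the encoding of VM_EXIT_REASON
 * (0x4402): (0x0091 >> 6) | (u16)(0x0091 << 10) = 0x0002 | 0x4400 = 0x4402.
 */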
static u64 nested_vmx_calc_vmcs_enum_msr(void)
{
	/*
	 * Note these are the so called "index" of the VMCS field encoding, not
	 * the index into vmcs12.
	 */
	unsigned int max_idx, idx;
	int i;

	/*
	 * For better or worse, KVM allows VMREAD/VMWRITE to all fields in
	 * vmcs12, regardless of whether or not the associated feature is
	 * exposed to L1.  Simply find the field with the highest index.
	 */
	max_idx = 0;
	for (i = 0; i < nr_vmcs12_fields; i++) {
		/* The vmcs12 table is very, very sparsely populated. */
		if (!vmcs12_field_offsets[i])
			continue;

		idx = vmcs_field_index(VMCS12_IDX_TO_ENC(i));
		if (idx > max_idx)
			max_idx = idx;
	}

	return (u64)max_idx << VMCS_FIELD_INDEX_SHIFT;
}

static void nested_vmx_setup_pinbased_ctls(struct vmcs_config *vmcs_conf,
					   struct nested_vmx_msrs *msrs)
{
	msrs->pinbased_ctls_low =
		PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;

	msrs->pinbased_ctls_high = vmcs_conf->pin_based_exec_ctrl;
	msrs->pinbased_ctls_high &=
		PIN_BASED_EXT_INTR_MASK |
		PIN_BASED_NMI_EXITING |
		PIN_BASED_VIRTUAL_NMIS |
		(enable_apicv ? PIN_BASED_POSTED_INTR : 0);
	msrs->pinbased_ctls_high |=
		PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
		PIN_BASED_VMX_PREEMPTION_TIMER;
}

static void nested_vmx_setup_exit_ctls(struct vmcs_config *vmcs_conf,
				       struct nested_vmx_msrs *msrs)
{
	msrs->exit_ctls_low =
		VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;

	msrs->exit_ctls_high = vmcs_conf->vmexit_ctrl;
	msrs->exit_ctls_high &=
#ifdef CONFIG_X86_64
		VM_EXIT_HOST_ADDR_SPACE_SIZE |
#endif
		VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
		VM_EXIT_CLEAR_BNDCFGS;
	msrs->exit_ctls_high |=
		VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
		VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
		VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT |
		VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;

	/* We support free control of debug control saving. */
	msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;
}

static void nested_vmx_setup_entry_ctls(struct vmcs_config *vmcs_conf,
					struct nested_vmx_msrs *msrs)
{
	msrs->entry_ctls_low =
		VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;

	msrs->entry_ctls_high = vmcs_conf->vmentry_ctrl;
	msrs->entry_ctls_high &=
#ifdef CONFIG_X86_64
		VM_ENTRY_IA32E_MODE |
#endif
		VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS;
	msrs->entry_ctls_high |=
		(VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER |
		 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL);

	/* We support free control of debug control loading. */
	msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
}

static void nested_vmx_setup_cpubased_ctls(struct vmcs_config *vmcs_conf,
					   struct nested_vmx_msrs *msrs)
{
	msrs->procbased_ctls_low =
		CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;

	msrs->procbased_ctls_high = vmcs_conf->cpu_based_exec_ctrl;
	msrs->procbased_ctls_high &=
		CPU_BASED_INTR_WINDOW_EXITING |
		CPU_BASED_NMI_WINDOW_EXITING | CPU_BASED_USE_TSC_OFFSETTING |
		CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
		CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
		CPU_BASED_CR3_STORE_EXITING |
#ifdef CONFIG_X86_64
		CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |
#endif
		CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
		CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG |
		CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING |
		CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING |
		CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
	/*
	 * We can allow some features even when not supported by the
	 * hardware. For example, L1 can specify an MSR bitmap - and we
	 * can use it to avoid exits to L1 - even when L0 runs L2
	 * without MSR bitmaps.
	 */
	msrs->procbased_ctls_high |=
		CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
		CPU_BASED_USE_MSR_BITMAPS;

	/* We support free control of CR3 access interception. */
	msrs->procbased_ctls_low &=
		~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
}

static void nested_vmx_setup_secondary_ctls(u32 ept_caps,
					    struct vmcs_config *vmcs_conf,
					    struct nested_vmx_msrs *msrs)
{
	msrs->secondary_ctls_low = 0;

	msrs->secondary_ctls_high = vmcs_conf->cpu_based_2nd_exec_ctrl;
	msrs->secondary_ctls_high &=
		SECONDARY_EXEC_DESC |
		SECONDARY_EXEC_ENABLE_RDTSCP |
		SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
		SECONDARY_EXEC_WBINVD_EXITING |
		SECONDARY_EXEC_APIC_REGISTER_VIRT |
		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
		SECONDARY_EXEC_RDRAND_EXITING |
		SECONDARY_EXEC_ENABLE_INVPCID |
		SECONDARY_EXEC_ENABLE_VMFUNC |
		SECONDARY_EXEC_RDSEED_EXITING |
		SECONDARY_EXEC_ENABLE_XSAVES |
		SECONDARY_EXEC_TSC_SCALING |
		SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;

	/*
	 * We can emulate "VMCS shadowing," even if the hardware
	 * doesn't support it.
	 */
	msrs->secondary_ctls_high |=
		SECONDARY_EXEC_SHADOW_VMCS;

	if (enable_ept) {
		/* nested EPT: emulate EPT also to L1 */
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_ENABLE_EPT;
		msrs->ept_caps =
			VMX_EPT_PAGE_WALK_4_BIT |
			VMX_EPT_PAGE_WALK_5_BIT |
			VMX_EPTP_WB_BIT |
			VMX_EPT_INVEPT_BIT |
			VMX_EPT_EXECUTE_ONLY_BIT;

		msrs->ept_caps &= ept_caps;
		msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
			VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT |
			VMX_EPT_1GB_PAGE_BIT;
		if (enable_ept_ad_bits) {
			msrs->secondary_ctls_high |=
				SECONDARY_EXEC_ENABLE_PML;
			msrs->ept_caps |= VMX_EPT_AD_BIT;
		}

		/*
		 * Advertise EPTP switching irrespective of hardware support,
		 * KVM emulates it in software so long as VMFUNC is supported.
		 */
		if (cpu_has_vmx_vmfunc())
			msrs->vmfunc_controls = VMX_VMFUNC_EPTP_SWITCHING;
	}

	/*
	 * Old versions of KVM use the single-context version without
	 * checking for support, so declare that it is supported even
	 * though it is treated as global context.  The alternative is
	 * not failing the single-context invvpid, and it is worse.
	 */
	if (enable_vpid) {
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_ENABLE_VPID;
		msrs->vpid_caps = VMX_VPID_INVVPID_BIT |
			VMX_VPID_EXTENT_SUPPORTED_MASK;
	}

	if (enable_unrestricted_guest)
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_UNRESTRICTED_GUEST;

	if (flexpriority_enabled)
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;

	if (enable_sgx)
		msrs->secondary_ctls_high |= SECONDARY_EXEC_ENCLS_EXITING;
}

static void nested_vmx_setup_misc_data(struct vmcs_config *vmcs_conf,
				       struct nested_vmx_msrs *msrs)
{
	msrs->misc_low = (u32)vmcs_conf->misc & VMX_MISC_SAVE_EFER_LMA;
	msrs->misc_low |=
		VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
		VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
		VMX_MISC_ACTIVITY_HLT |
		VMX_MISC_ACTIVITY_WAIT_SIPI;
	msrs->misc_high = 0;
}

static void nested_vmx_setup_basic(struct nested_vmx_msrs *msrs)
{
	/*
	 * This MSR reports some information about VMX support. We
	 * should return information about the VMX we emulate for the
	 * guest, and the VMCS structure we give it - not about the
	 * VMX support of the underlying hardware.
	 */
	msrs->basic = vmx_basic_encode_vmcs_info(VMCS12_REVISION, VMCS12_SIZE,
						 X86_MEMTYPE_WB);

	msrs->basic |= VMX_BASIC_TRUE_CTLS;
	if (cpu_has_vmx_basic_inout())
		msrs->basic |= VMX_BASIC_INOUT;
}

static void nested_vmx_setup_cr_fixed(struct nested_vmx_msrs *msrs)
{
	/*
	 * These MSRs specify bits which the guest must keep fixed on
	 * while L1 is in VMXON mode (in L1's root mode, or running an L2).
	 * We picked the standard core2 setting.
	 */
#define VMXON_CR0_ALWAYSON	(X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
#define VMXON_CR4_ALWAYSON	X86_CR4_VMXE
	msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON;
	msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON;

	/* These MSRs specify bits which the guest must keep fixed off. */
	rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1);
	rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1);

	if (vmx_umip_emulated())
		msrs->cr4_fixed1 |= X86_CR4_UMIP;
}
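/*
 * Reminder on the architectural convention used above: a CR bit may be 0 only
 * if it is clear in the FIXED0 MSR, and may be 1 only if it is set in the
 * FIXED1 MSR.  Setting X86_CR4_UMIP in cr4_fixed1, for instance, merely
 * permits L1 to set CR4.UMIP; it does not require it.
 */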
/*
 * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
 * returned for the various VMX controls MSRs when nested VMX is enabled.
 * The same values should also be used to verify that vmcs12 control fields are
 * valid during nested entry from L1 to L2.
 * Each of these control msrs has a low and high 32-bit half: A low bit is on
 * if the corresponding bit in the (32-bit) control field *must* be on, and a
 * bit in the high half is on if the corresponding bit in the control field
 * may be on. See also vmx_control_verify().
 */
void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps)
{
	struct nested_vmx_msrs *msrs = &vmcs_conf->nested;

	/*
	 * Note that as a general rule, the high half of the MSRs (bits in
	 * the control fields which may be 1) should be initialized by the
	 * intersection of the underlying hardware's MSR (i.e., features which
	 * can be supported) and the list of features we want to expose -
	 * because they are known to be properly supported in our code.
	 * Also, usually, the low half of the MSRs (bits which must be 1) can
	 * be set to 0, meaning that L1 may turn off any of these bits.  The
	 * reason is that if one of these bits is necessary, it will appear in
	 * vmcs01, and prepare_vmcs02, which bitwise-ORs the control fields of
	 * vmcs01 and vmcs12, will keep it set in vmcs02 - while
	 * nested_vmx_l1_wants_exit() will not pass the related exits to L1.
	 * These rules have exceptions below.
	 */
	nested_vmx_setup_pinbased_ctls(vmcs_conf, msrs);

	nested_vmx_setup_exit_ctls(vmcs_conf, msrs);

	nested_vmx_setup_entry_ctls(vmcs_conf, msrs);

	nested_vmx_setup_cpubased_ctls(vmcs_conf, msrs);

	nested_vmx_setup_secondary_ctls(ept_caps, vmcs_conf, msrs);

	nested_vmx_setup_misc_data(vmcs_conf, msrs);

	nested_vmx_setup_basic(msrs);

	nested_vmx_setup_cr_fixed(msrs);

	msrs->vmcs_enum = nested_vmx_calc_vmcs_enum_msr();
}

void nested_vmx_hardware_unsetup(void)
{
	int i;

	if (enable_shadow_vmcs) {
		for (i = 0; i < VMX_BITMAP_NR; i++)
			free_page((unsigned long)vmx_bitmap[i]);
	}
}

__init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
{
	int i;

	if (!cpu_has_vmx_shadow_vmcs())
		enable_shadow_vmcs = 0;
	if (enable_shadow_vmcs) {
		for (i = 0; i < VMX_BITMAP_NR; i++) {
			/*
			 * The vmx_bitmap is not tied to a VM and so should
			 * not be charged to a memcg.
			 */
			vmx_bitmap[i] = (unsigned long *)
				__get_free_page(GFP_KERNEL);
			if (!vmx_bitmap[i]) {
				nested_vmx_hardware_unsetup();
				return -ENOMEM;
			}
		}

		init_vmcs_shadow_fields();
	}

	exit_handlers[EXIT_REASON_VMCLEAR] = handle_vmclear;
	exit_handlers[EXIT_REASON_VMLAUNCH] = handle_vmlaunch;
	exit_handlers[EXIT_REASON_VMPTRLD] = handle_vmptrld;
	exit_handlers[EXIT_REASON_VMPTRST] = handle_vmptrst;
	exit_handlers[EXIT_REASON_VMREAD] = handle_vmread;
	exit_handlers[EXIT_REASON_VMRESUME] = handle_vmresume;
	exit_handlers[EXIT_REASON_VMWRITE] = handle_vmwrite;
	exit_handlers[EXIT_REASON_VMOFF] = handle_vmxoff;
	exit_handlers[EXIT_REASON_VMON] = handle_vmxon;
	exit_handlers[EXIT_REASON_INVEPT] = handle_invept;
	exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid;
	exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc;

	return 0;
}

struct kvm_x86_nested_ops vmx_nested_ops = {
	.leave_nested = vmx_leave_nested,
	.is_exception_vmexit = nested_vmx_is_exception_vmexit,
	.check_events = vmx_check_nested_events,
	.has_events = vmx_has_nested_events,
	.triple_fault = nested_vmx_triple_fault,
	.get_state = vmx_get_nested_state,
	.set_state = vmx_set_nested_state,
	.get_nested_state_pages = vmx_get_nested_state_pages,
	.write_log_dirty = nested_vmx_write_pml_buffer,
#ifdef CONFIG_KVM_HYPERV
	.enable_evmcs = nested_enable_evmcs,
	.get_evmcs_version = nested_get_evmcs_version,
	.hv_inject_synthetic_vmexit_post_tlb_flush = vmx_hv_inject_synthetic_vmexit_post_tlb_flush,
#endif
};