// SPDX-License-Identifier: GPL-2.0
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/objtool.h>
#include <linux/percpu.h>

#include <asm/debugreg.h>
#include <asm/mmu_context.h>

#include "cpuid.h"
#include "hyperv.h"
#include "mmu.h"
#include "nested.h"
#include "pmu.h"
#include "sgx.h"
#include "trace.h"
#include "vmx.h"
#include "x86.h"
#include "smm.h"

static bool __read_mostly enable_shadow_vmcs = 1;
module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);

static bool __read_mostly nested_early_check = 0;
module_param(nested_early_check, bool, S_IRUGO);

#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK

/*
 * Hyper-V requires all of these, so mark them as supported even though
 * they are just treated the same as all-context.
 */
#define VMX_VPID_EXTENT_SUPPORTED_MASK		\
	(VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT |	\
	VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT |	\
	VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT |	\
	VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT)

#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5

enum {
	VMX_VMREAD_BITMAP,
	VMX_VMWRITE_BITMAP,
	VMX_BITMAP_NR
};
static unsigned long *vmx_bitmap[VMX_BITMAP_NR];

#define vmx_vmread_bitmap	(vmx_bitmap[VMX_VMREAD_BITMAP])
#define vmx_vmwrite_bitmap	(vmx_bitmap[VMX_VMWRITE_BITMAP])

struct shadow_vmcs_field {
	u16	encoding;
	u16	offset;
};
static struct shadow_vmcs_field shadow_read_only_fields[] = {
#define SHADOW_FIELD_RO(x, y) { x, offsetof(struct vmcs12, y) },
#include "vmcs_shadow_fields.h"
};
static int max_shadow_read_only_fields =
	ARRAY_SIZE(shadow_read_only_fields);

static struct shadow_vmcs_field shadow_read_write_fields[] = {
#define SHADOW_FIELD_RW(x, y) { x, offsetof(struct vmcs12, y) },
#include "vmcs_shadow_fields.h"
};
static int max_shadow_read_write_fields =
	ARRAY_SIZE(shadow_read_write_fields);

static void init_vmcs_shadow_fields(void)
{
	int i, j;

	memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
	memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);

	for (i = j = 0; i < max_shadow_read_only_fields; i++) {
		struct shadow_vmcs_field entry = shadow_read_only_fields[i];
		u16 field = entry.encoding;

		if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
		    (i + 1 == max_shadow_read_only_fields ||
		     shadow_read_only_fields[i + 1].encoding != field + 1))
			pr_err("Missing field from shadow_read_only_field %x\n",
			       field + 1);

		clear_bit(field, vmx_vmread_bitmap);
		if (field & 1)
#ifdef CONFIG_X86_64
			continue;
#else
			entry.offset += sizeof(u32);
#endif
		shadow_read_only_fields[j++] = entry;
	}
	max_shadow_read_only_fields = j;

	for (i = j = 0; i < max_shadow_read_write_fields; i++) {
		struct shadow_vmcs_field entry = shadow_read_write_fields[i];
		u16 field = entry.encoding;

		if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
		    (i + 1 == max_shadow_read_write_fields ||
		     shadow_read_write_fields[i + 1].encoding != field + 1))
			pr_err("Missing field from shadow_read_write_field %x\n",
			       field + 1);

		WARN_ONCE(field >= GUEST_ES_AR_BYTES &&
			  field <= GUEST_TR_AR_BYTES,
			  "Update vmcs12_write_any() to drop reserved bits from AR_BYTES");

		/*
		 * PML and the preemption timer can be emulated, but the
		 * processor cannot vmwrite to fields that don't exist
		 * on bare metal.
		 */
		switch (field) {
		case GUEST_PML_INDEX:
			if (!cpu_has_vmx_pml())
				continue;
			break;
		case VMX_PREEMPTION_TIMER_VALUE:
			if (!cpu_has_vmx_preemption_timer())
				continue;
			break;
		case GUEST_INTR_STATUS:
			if (!cpu_has_vmx_apicv())
				continue;
			break;
		default:
			break;
		}

		clear_bit(field, vmx_vmwrite_bitmap);
		clear_bit(field, vmx_vmread_bitmap);
		if (field & 1)
#ifdef CONFIG_X86_64
			continue;
#else
			entry.offset += sizeof(u32);
#endif
		shadow_read_write_fields[j++] = entry;
	}
	max_shadow_read_write_fields = j;
}

/*
 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
 * set the success or error code of an emulated VMX instruction (as specified
 * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated
 * instruction.
 */
static int nested_vmx_succeed(struct kvm_vcpu *vcpu)
{
	vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
			    X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
	return kvm_skip_emulated_instruction(vcpu);
}

static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
{
	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
			    X86_EFLAGS_SF | X86_EFLAGS_OF))
			| X86_EFLAGS_CF);
	return kvm_skip_emulated_instruction(vcpu);
}

static int nested_vmx_failValid(struct kvm_vcpu *vcpu,
				u32 vm_instruction_error)
{
	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
			    X86_EFLAGS_SF | X86_EFLAGS_OF))
			| X86_EFLAGS_ZF);
	get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
	/*
	 * We don't need to force sync to shadow VMCS because
	 * VM_INSTRUCTION_ERROR is not shadowed. Enlightened VMCS 'shadows' all
	 * fields and thus must be synced.
	 */
	if (nested_vmx_is_evmptr12_set(to_vmx(vcpu)))
		to_vmx(vcpu)->nested.need_vmcs12_to_shadow_sync = true;

	return kvm_skip_emulated_instruction(vcpu);
}

static int nested_vmx_fail(struct kvm_vcpu *vcpu, u32 vm_instruction_error)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * failValid writes the error number to the current VMCS, which
	 * can't be done if there isn't a current VMCS.
	 */
	if (vmx->nested.current_vmptr == INVALID_GPA &&
	    !nested_vmx_is_evmptr12_valid(vmx))
		return nested_vmx_failInvalid(vcpu);

	return nested_vmx_failValid(vcpu, vm_instruction_error);
}

static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
{
	/* TODO: not to reset guest simply here. */
	kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
	pr_debug_ratelimited("nested vmx abort, indicator %d\n", indicator);
}

static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
{
	return fixed_bits_valid(control, low, high);
}

static inline u64 vmx_control_msr(u32 low, u32 high)
{
	return low | ((u64)high << 32);
}

static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
{
	secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
	vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA);
	vmx->nested.need_vmcs12_to_shadow_sync = false;
}

static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
{
#ifdef CONFIG_KVM_HYPERV
	struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (nested_vmx_is_evmptr12_valid(vmx)) {
		kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true);
		vmx->nested.hv_evmcs = NULL;
	}

	vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;

	if (hv_vcpu) {
		hv_vcpu->nested.pa_page_gpa = INVALID_GPA;
		hv_vcpu->nested.vm_id = 0;
		hv_vcpu->nested.vp_id = 0;
	}
#endif
}

static bool nested_evmcs_handle_vmclear(struct kvm_vcpu *vcpu, gpa_t vmptr)
{
#ifdef CONFIG_KVM_HYPERV
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	/*
	 * When Enlightened VMEntry is enabled on the calling CPU we treat the
	 * memory area pointed to by vmptr as an Enlightened VMCS (as there's
	 * no good way to distinguish it from VMCS12) and we must not corrupt
	 * it by writing to the non-existent 'launch_state' field. The area
	 * doesn't have to be the currently active EVMCS on the calling CPU and
	 * there's nothing KVM has to do to transition it from 'active' to
	 * 'non-active' state. It is possible that the area will stay mapped as
	 * vmx->nested.hv_evmcs but this shouldn't be a problem.
	 */
	if (!guest_cpuid_has_evmcs(vcpu) ||
	    !evmptr_is_valid(nested_get_evmptr(vcpu)))
		return false;

	if (nested_vmx_evmcs(vmx) && vmptr == vmx->nested.hv_evmcs_vmptr)
		nested_release_evmcs(vcpu);

	return true;
#else
	return false;
#endif
}
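
/*
 * The vmcs_host_state cache tracks which host state has been set up for the
 * currently loaded VMCS.  When switching to a different VMCS while guest
 * state is already loaded, copy that state over so the new VMCS's notion of
 * the host FS/GS/LDT/DS/ES state matches what is actually loaded and gets
 * restored correctly after the next VM-Exit.
 */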
static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
				     struct loaded_vmcs *prev)
{
	struct vmcs_host_state *dest, *src;

	if (unlikely(!vmx->guest_state_loaded))
		return;

	src = &prev->host_state;
	dest = &vmx->loaded_vmcs->host_state;

	vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base);
	dest->ldt_sel = src->ldt_sel;
#ifdef CONFIG_X86_64
	dest->ds_sel = src->ds_sel;
	dest->es_sel = src->es_sel;
#endif
}

static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct loaded_vmcs *prev;
	int cpu;

	if (WARN_ON_ONCE(vmx->loaded_vmcs == vmcs))
		return;

	cpu = get_cpu();
	prev = vmx->loaded_vmcs;
	vmx->loaded_vmcs = vmcs;
	vmx_vcpu_load_vmcs(vcpu, cpu, prev);
	vmx_sync_vmcs_host_state(vmx, prev);
	put_cpu();

	vcpu->arch.regs_avail = ~VMX_REGS_LAZY_LOAD_SET;

	/*
	 * All lazily updated registers will be reloaded from VMCS12 on both
	 * vmentry and vmexit.
	 */
	vcpu->arch.regs_dirty = 0;
}

/*
 * Free whatever needs to be freed from vmx->nested when L1 goes down, or
 * just stops using VMX.
 */
static void free_nested(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01))
		vmx_switch_vmcs(vcpu, &vmx->vmcs01);

	if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
		return;

	kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);

	vmx->nested.vmxon = false;
	vmx->nested.smm.vmxon = false;
	vmx->nested.vmxon_ptr = INVALID_GPA;
	free_vpid(vmx->nested.vpid02);
	vmx->nested.posted_intr_nv = -1;
	vmx->nested.current_vmptr = INVALID_GPA;
	if (enable_shadow_vmcs) {
		vmx_disable_shadow_vmcs(vmx);
		vmcs_clear(vmx->vmcs01.shadow_vmcs);
		free_vmcs(vmx->vmcs01.shadow_vmcs);
		vmx->vmcs01.shadow_vmcs = NULL;
	}
	kfree(vmx->nested.cached_vmcs12);
	vmx->nested.cached_vmcs12 = NULL;
	kfree(vmx->nested.cached_shadow_vmcs12);
	vmx->nested.cached_shadow_vmcs12 = NULL;
	/*
	 * Unpin physical memory we referred to in the vmcs02. The APIC access
	 * page's backing page (yeah, confusing) shouldn't actually be accessed,
	 * and if it is written, the contents are irrelevant.
	 */
	kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map, false);
	kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
	kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
	vmx->nested.pi_desc = NULL;

	kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);

	nested_release_evmcs(vcpu);

	free_loaded_vmcs(&vmx->nested.vmcs02);
}

/*
 * Ensure that the current vmcs of the logical processor is the
 * vmcs01 of the vcpu before calling free_nested().
 */
void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu)
{
	vcpu_load(vcpu);
	vmx_leave_nested(vcpu);
	vcpu_put(vcpu);
}

#define EPTP_PA_MASK	GENMASK_ULL(51, 12)

static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp)
{
	return VALID_PAGE(root_hpa) &&
	       ((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK));
}

static void nested_ept_invalidate_addr(struct kvm_vcpu *vcpu, gpa_t eptp,
				       gpa_t addr)
{
	unsigned long roots = 0;
	uint i;
	struct kvm_mmu_root_info *cached_root;

	WARN_ON_ONCE(!mmu_is_nested(vcpu));

	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
		cached_root = &vcpu->arch.mmu->prev_roots[i];

		if (nested_ept_root_matches(cached_root->hpa, cached_root->pgd,
					    eptp))
			roots |= KVM_MMU_ROOT_PREVIOUS(i);
	}
	if (roots)
		kvm_mmu_invalidate_addr(vcpu, vcpu->arch.mmu, addr, roots);
}

static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
					 struct x86_exception *fault)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u32 vm_exit_reason;
	unsigned long exit_qualification = vcpu->arch.exit_qualification;

	if (vmx->nested.pml_full) {
		vm_exit_reason = EXIT_REASON_PML_FULL;
		vmx->nested.pml_full = false;
		exit_qualification &= INTR_INFO_UNBLOCK_NMI;
	} else {
		if (fault->error_code & PFERR_RSVD_MASK)
			vm_exit_reason = EXIT_REASON_EPT_MISCONFIG;
		else
			vm_exit_reason = EXIT_REASON_EPT_VIOLATION;

		/*
		 * Although the caller (kvm_inject_emulated_page_fault) would
		 * have already synced the faulting address in the shadow EPT
		 * tables for the current EPTP12, we also need to sync it for
		 * any other cached EPTP02s based on the same EP4TA, since the
		 * TLB associates mappings to the EP4TA rather than the full EPTP.
		 */
		nested_ept_invalidate_addr(vcpu, vmcs12->ept_pointer,
					   fault->address);
	}

	nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification);
	vmcs12->guest_physical_address = fault->address;
}

static void nested_ept_new_eptp(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	bool execonly = vmx->nested.msrs.ept_caps & VMX_EPT_EXECUTE_ONLY_BIT;
	int ept_lpage_level = ept_caps_to_lpage_level(vmx->nested.msrs.ept_caps);

	kvm_init_shadow_ept_mmu(vcpu, execonly, ept_lpage_level,
				nested_ept_ad_enabled(vcpu),
				nested_ept_get_eptp(vcpu));
}

static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
{
	WARN_ON(mmu_is_nested(vcpu));

	vcpu->arch.mmu = &vcpu->arch.guest_mmu;
	nested_ept_new_eptp(vcpu);
	vcpu->arch.mmu->get_guest_pgd = nested_ept_get_eptp;
	vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault;
	vcpu->arch.mmu->get_pdptr = kvm_pdptr_read;

	vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
}

static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
{
	vcpu->arch.mmu = &vcpu->arch.root_mmu;
	vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
}

static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
					    u16 error_code)
{
	bool inequality, bit;

	bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0;
	inequality =
		(error_code & vmcs12->page_fault_error_code_mask) !=
		 vmcs12->page_fault_error_code_match;
	return inequality ^ bit;
}

static bool nested_vmx_is_exception_vmexit(struct kvm_vcpu *vcpu, u8 vector,
					   u32 error_code)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	/*
	 * Drop bits 31:16 of the error code when performing the #PF mask+match
	 * check. All VMCS fields involved are 32 bits, but Intel CPUs never
	 * set bits 31:16 and VMX disallows setting bits 31:16 in the injected
	 * error code. Including the to-be-dropped bits in the check might
	 * result in an "impossible" or missed exit from L1's perspective.
	 */
	if (vector == PF_VECTOR)
		return nested_vmx_is_page_fault_vmexit(vmcs12, (u16)error_code);

	return (vmcs12->exception_bitmap & (1u << vector));
}

static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu,
					       struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->io_bitmap_a)) ||
	    CC(!page_address_valid(vcpu, vmcs12->io_bitmap_b)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
						struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->msr_bitmap)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu,
						struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr)))
		return -EINVAL;

	return 0;
}

/*
 * For x2APIC MSRs, ignore the vmcs01 bitmap. L1 can enable x2APIC without L1
 * itself utilizing x2APIC. All MSRs were previously set to be intercepted,
 * only the "disable intercept" case needs to be handled.
 */
static void nested_vmx_disable_intercept_for_x2apic_msr(unsigned long *msr_bitmap_l1,
							 unsigned long *msr_bitmap_l0,
							 u32 msr, int type)
{
	if (type & MSR_TYPE_R && !vmx_test_msr_bitmap_read(msr_bitmap_l1, msr))
		vmx_clear_msr_bitmap_read(msr_bitmap_l0, msr);

	if (type & MSR_TYPE_W && !vmx_test_msr_bitmap_write(msr_bitmap_l1, msr))
		vmx_clear_msr_bitmap_write(msr_bitmap_l0, msr);
}

static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap)
{
	int msr;

	for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
		unsigned word = msr / BITS_PER_LONG;

		msr_bitmap[word] = ~0;
		msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
	}
}
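
/*
 * Generate nested_vmx_set_msr_{read,write}_intercept().  L0 must intercept an
 * MSR access for L2 if either vmcs01 (KVM's own policy, e.g. userspace MSR
 * filters) or vmcs12 (L1's policy) intercepts it; only when both bitmaps pass
 * the access through is the intercept cleared in the merged vmcs02 bitmap.
 */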
#define BUILD_NVMX_MSR_INTERCEPT_HELPER(rw)					\
static inline									\
void nested_vmx_set_msr_##rw##_intercept(struct vcpu_vmx *vmx,			\
					 unsigned long *msr_bitmap_l1,		\
					 unsigned long *msr_bitmap_l0, u32 msr)	\
{										\
	if (vmx_test_msr_bitmap_##rw(vmx->vmcs01.msr_bitmap, msr) ||		\
	    vmx_test_msr_bitmap_##rw(msr_bitmap_l1, msr))			\
		vmx_set_msr_bitmap_##rw(msr_bitmap_l0, msr);			\
	else									\
		vmx_clear_msr_bitmap_##rw(msr_bitmap_l0, msr);			\
}
BUILD_NVMX_MSR_INTERCEPT_HELPER(read)
BUILD_NVMX_MSR_INTERCEPT_HELPER(write)

static inline void nested_vmx_set_intercept_for_msr(struct vcpu_vmx *vmx,
						    unsigned long *msr_bitmap_l1,
						    unsigned long *msr_bitmap_l0,
						    u32 msr, int types)
{
	if (types & MSR_TYPE_R)
		nested_vmx_set_msr_read_intercept(vmx, msr_bitmap_l1,
						  msr_bitmap_l0, msr);
	if (types & MSR_TYPE_W)
		nested_vmx_set_msr_write_intercept(vmx, msr_bitmap_l1,
						   msr_bitmap_l0, msr);
}

/*
 * Merge L0's and L1's MSR bitmap, return false to indicate that
 * we do not use the hardware.
 */
static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
						 struct vmcs12 *vmcs12)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	int msr;
	unsigned long *msr_bitmap_l1;
	unsigned long *msr_bitmap_l0 = vmx->nested.vmcs02.msr_bitmap;
	struct kvm_host_map *map = &vmx->nested.msr_bitmap_map;

	/* Nothing to do if the MSR bitmap is not in use. */
	if (!cpu_has_vmx_msr_bitmap() ||
	    !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
		return false;

	/*
	 * MSR bitmap update can be skipped when:
	 * - MSR bitmap for L1 hasn't changed.
	 * - Nested hypervisor (L1) is attempting to launch the same L2 as
	 *   before.
	 * - Nested hypervisor (L1) has enabled 'Enlightened MSR Bitmap' feature
	 *   and tells KVM (L0) there were no changes in MSR bitmap for L2.
	 */
	if (!vmx->nested.force_msr_bitmap_recalc) {
		struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx);

		if (evmcs && evmcs->hv_enlightenments_control.msr_bitmap &&
		    evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP)
			return true;
	}

	if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map))
		return false;

	msr_bitmap_l1 = (unsigned long *)map->hva;

	/*
	 * To keep the control flow simple, pay eight 8-byte writes (sixteen
	 * 4-byte writes on 32-bit systems) up front to enable intercepts for
	 * the x2APIC MSR range and selectively toggle those relevant to L2.
	 */
	enable_x2apic_msr_intercepts(msr_bitmap_l0);

	if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
		if (nested_cpu_has_apic_reg_virt(vmcs12)) {
			/*
			 * L0 need not intercept reads for MSRs between 0x800
			 * and 0x8ff, it just lets the processor take the value
			 * from the virtual-APIC page; take those 256 bits
			 * directly from the L1 bitmap.
			 */
			for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
				unsigned word = msr / BITS_PER_LONG;

				msr_bitmap_l0[word] = msr_bitmap_l1[word];
			}
		}

		nested_vmx_disable_intercept_for_x2apic_msr(
			msr_bitmap_l1, msr_bitmap_l0,
			X2APIC_MSR(APIC_TASKPRI),
			MSR_TYPE_R | MSR_TYPE_W);

		if (nested_cpu_has_vid(vmcs12)) {
			nested_vmx_disable_intercept_for_x2apic_msr(
				msr_bitmap_l1, msr_bitmap_l0,
				X2APIC_MSR(APIC_EOI),
				MSR_TYPE_W);
			nested_vmx_disable_intercept_for_x2apic_msr(
				msr_bitmap_l1, msr_bitmap_l0,
				X2APIC_MSR(APIC_SELF_IPI),
				MSR_TYPE_W);
		}
	}

	/*
	 * Always check vmcs01's bitmap to honor userspace MSR filters and any
	 * other runtime changes to vmcs01's bitmap, e.g. dynamic pass-through.
	 */
#ifdef CONFIG_X86_64
	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_FS_BASE, MSR_TYPE_RW);

	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_GS_BASE, MSR_TYPE_RW);

	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
#endif
	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_IA32_SPEC_CTRL, MSR_TYPE_RW);

	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_IA32_PRED_CMD, MSR_TYPE_W);

	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_IA32_FLUSH_CMD, MSR_TYPE_W);

	kvm_vcpu_unmap(vcpu, &vmx->nested.msr_bitmap_map, false);

	vmx->nested.force_msr_bitmap_recalc = false;

	return true;
}

static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu,
				       struct vmcs12 *vmcs12)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache;

	if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
	    vmcs12->vmcs_link_pointer == INVALID_GPA)
		return;

	if (ghc->gpa != vmcs12->vmcs_link_pointer &&
	    kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc,
				      vmcs12->vmcs_link_pointer, VMCS12_SIZE))
		return;

	kvm_read_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu),
			      VMCS12_SIZE);
}

static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu,
					      struct vmcs12 *vmcs12)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache;

	if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
	    vmcs12->vmcs_link_pointer == INVALID_GPA)
		return;

	if (ghc->gpa != vmcs12->vmcs_link_pointer &&
	    kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc,
				      vmcs12->vmcs_link_pointer, VMCS12_SIZE))
		return;

	kvm_write_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu),
			       VMCS12_SIZE);
}

/*
 * In nested virtualization, check if L1 has set
 * VM_EXIT_ACK_INTR_ON_EXIT
 */
static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu)
{
	return get_vmcs12(vcpu)->vm_exit_controls &
		VM_EXIT_ACK_INTR_ON_EXIT;
}

static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu,
						 struct vmcs12 *vmcs12)
{
	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
	    CC(!page_address_valid(vcpu, vmcs12->apic_access_addr)))
		return -EINVAL;
	else
		return 0;
}

static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
					   struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
	    !nested_cpu_has_apic_reg_virt(vmcs12) &&
	    !nested_cpu_has_vid(vmcs12) &&
	    !nested_cpu_has_posted_intr(vmcs12))
		return 0;

	/*
	 * If virtualize x2apic mode is enabled,
	 * virtualize apic access must be disabled.
	 */
	if (CC(nested_cpu_has_virt_x2apic_mode(vmcs12) &&
	       nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)))
		return -EINVAL;

	/*
	 * If virtual interrupt delivery is enabled,
	 * we must exit on external interrupts.
	 */
	if (CC(nested_cpu_has_vid(vmcs12) && !nested_exit_on_intr(vcpu)))
		return -EINVAL;

	/*
	 * bits 15:8 should be zero in posted_intr_nv,
	 * the descriptor address has been already checked
	 * in nested_get_vmcs12_pages.
	 *
	 * bits 5:0 of posted_intr_desc_addr should be zero.
	 */
	if (nested_cpu_has_posted_intr(vmcs12) &&
	    (CC(!nested_cpu_has_vid(vmcs12)) ||
	     CC(!nested_exit_intr_ack_set(vcpu)) ||
	     CC((vmcs12->posted_intr_nv & 0xff00)) ||
	     CC(!kvm_vcpu_is_legal_aligned_gpa(vcpu, vmcs12->posted_intr_desc_addr, 64))))
		return -EINVAL;

	/* tpr shadow is needed by all apicv features. */
	if (CC(!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
				       u32 count, u64 addr)
{
	if (count == 0)
		return 0;

	if (!kvm_vcpu_is_legal_aligned_gpa(vcpu, addr, 16) ||
	    !kvm_vcpu_is_legal_gpa(vcpu, (addr + count * sizeof(struct vmx_msr_entry) - 1)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu,
						     struct vmcs12 *vmcs12)
{
	if (CC(nested_vmx_check_msr_switch(vcpu,
					   vmcs12->vm_exit_msr_load_count,
					   vmcs12->vm_exit_msr_load_addr)) ||
	    CC(nested_vmx_check_msr_switch(vcpu,
					   vmcs12->vm_exit_msr_store_count,
					   vmcs12->vm_exit_msr_store_addr)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_entry_msr_switch_controls(struct kvm_vcpu *vcpu,
						      struct vmcs12 *vmcs12)
{
	if (CC(nested_vmx_check_msr_switch(vcpu,
					   vmcs12->vm_entry_msr_load_count,
					   vmcs12->vm_entry_msr_load_addr)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu,
					 struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has_pml(vmcs12))
		return 0;

	if (CC(!nested_cpu_has_ept(vmcs12)) ||
	    CC(!page_address_valid(vcpu, vmcs12->pml_address)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu,
							struct vmcs12 *vmcs12)
{
	if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) &&
	       !nested_cpu_has_ept(vmcs12)))
		return -EINVAL;
	return 0;
}

static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu,
							  struct vmcs12 *vmcs12)
{
	if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) &&
	       !nested_cpu_has_ept(vmcs12)))
		return -EINVAL;
	return 0;
}

static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu,
						 struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has_shadow_vmcs(vmcs12))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->vmread_bitmap)) ||
	    CC(!page_address_valid(vcpu, vmcs12->vmwrite_bitmap)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
				       struct vmx_msr_entry *e)
{
	/* x2APIC MSR accesses are not allowed */
	if (CC(vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8))
		return -EINVAL;
	if (CC(e->index == MSR_IA32_UCODE_WRITE) || /* SDM Table 35-2 */
	    CC(e->index == MSR_IA32_UCODE_REV))
		return -EINVAL;
	if (CC(e->reserved != 0))
		return -EINVAL;
	return 0;
}

static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu,
				     struct vmx_msr_entry *e)
{
	if (CC(e->index == MSR_FS_BASE) ||
	    CC(e->index == MSR_GS_BASE) ||
	    CC(e->index == MSR_IA32_SMM_MONITOR_CTL) || /* SMM is not supported */
	    nested_vmx_msr_check_common(vcpu, e))
		return -EINVAL;
	return 0;
}

static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu,
				      struct vmx_msr_entry *e)
{
	if (CC(e->index == MSR_IA32_SMBASE) || /* SMM is not supported */
	    nested_vmx_msr_check_common(vcpu, e))
		return -EINVAL;
	return 0;
}
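
/*
 * Bits 27:25 of IA32_VMX_MISC advertise the recommended maximum number of
 * entries in a VM-entry/VM-exit MSR load/store list as 512 * (N + 1); KVM
 * treats this value as a hard limit when emulating the atomic MSR switch.
 */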
static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
				       vmx->nested.msrs.misc_high);

	return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER;
}

/*
 * Load guest's/host's msr at nested entry/exit.
 * return 0 for success, entry index for failure.
 *
 * One of the failure modes for MSR load/store is when a list exceeds the
 * virtual hardware's capacity. To maintain compatibility with hardware inasmuch
 * as possible, process all valid entries before failing rather than precheck
 * for a capacity violation.
 */
static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
{
	u32 i;
	struct vmx_msr_entry e;
	u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);

	for (i = 0; i < count; i++) {
		if (unlikely(i >= max_msr_list_size))
			goto fail;

		if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
					&e, sizeof(e))) {
			pr_debug_ratelimited(
				"%s cannot read MSR entry (%u, 0x%08llx)\n",
				__func__, i, gpa + i * sizeof(e));
			goto fail;
		}
		if (nested_vmx_load_msr_check(vcpu, &e)) {
			pr_debug_ratelimited(
				"%s check failed (%u, 0x%x, 0x%x)\n",
				__func__, i, e.index, e.reserved);
			goto fail;
		}
		if (kvm_set_msr(vcpu, e.index, e.value)) {
			pr_debug_ratelimited(
				"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
				__func__, i, e.index, e.value);
			goto fail;
		}
	}
	return 0;
fail:
	/* Note, max_msr_list_size is at most 4096, i.e. this can't wrap. */
	return i + 1;
}

static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu,
					    u32 msr_index,
					    u64 *data)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * If the L0 hypervisor stored a more accurate value for the TSC that
	 * does not include the time taken for emulation of the L2->L1
	 * VM-exit in L0, use the more accurate value.
	 */
	if (msr_index == MSR_IA32_TSC) {
		int i = vmx_find_loadstore_msr_slot(&vmx->msr_autostore.guest,
						    MSR_IA32_TSC);

		if (i >= 0) {
			u64 val = vmx->msr_autostore.guest.val[i].value;

			*data = kvm_read_l1_tsc(vcpu, val);
			return true;
		}
	}

	if (kvm_get_msr(vcpu, msr_index, data)) {
		pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__,
				     msr_index);
		return false;
	}
	return true;
}

static bool read_and_check_msr_entry(struct kvm_vcpu *vcpu, u64 gpa, int i,
				     struct vmx_msr_entry *e)
{
	if (kvm_vcpu_read_guest(vcpu,
				gpa + i * sizeof(*e),
				e, 2 * sizeof(u32))) {
		pr_debug_ratelimited(
			"%s cannot read MSR entry (%u, 0x%08llx)\n",
			__func__, i, gpa + i * sizeof(*e));
		return false;
	}
	if (nested_vmx_store_msr_check(vcpu, e)) {
		pr_debug_ratelimited(
			"%s check failed (%u, 0x%x, 0x%x)\n",
			__func__, i, e->index, e->reserved);
		return false;
	}
	return true;
}

static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
{
	u64 data;
	u32 i;
	struct vmx_msr_entry e;
	u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);

	for (i = 0; i < count; i++) {
		if (unlikely(i >= max_msr_list_size))
			return -EINVAL;

		if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
			return -EINVAL;

		if (!nested_vmx_get_vmexit_msr_value(vcpu, e.index, &data))
			return -EINVAL;

		if (kvm_vcpu_write_guest(vcpu,
					 gpa + i * sizeof(e) +
						offsetof(struct vmx_msr_entry, value),
					 &data, sizeof(data))) {
			pr_debug_ratelimited(
				"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
				__func__, i, e.index, data);
			return -EINVAL;
		}
	}
	return 0;
}

static bool nested_msr_store_list_has_msr(struct kvm_vcpu *vcpu, u32 msr_index)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	u32 count = vmcs12->vm_exit_msr_store_count;
	u64 gpa = vmcs12->vm_exit_msr_store_addr;
	struct vmx_msr_entry e;
	u32 i;

	for (i = 0; i < count; i++) {
		if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
			return false;

		if (e.index == msr_index)
			return true;
	}
	return false;
}

static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu,
					   u32 msr_index)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmx_msrs *autostore = &vmx->msr_autostore.guest;
	bool in_vmcs12_store_list;
	int msr_autostore_slot;
	bool in_autostore_list;
	int last;

	msr_autostore_slot = vmx_find_loadstore_msr_slot(autostore, msr_index);
	in_autostore_list = msr_autostore_slot >= 0;
	in_vmcs12_store_list = nested_msr_store_list_has_msr(vcpu, msr_index);

	if (in_vmcs12_store_list && !in_autostore_list) {
		if (autostore->nr == MAX_NR_LOADSTORE_MSRS) {
			/*
			 * Emulated VMEntry does not fail here. Instead a less
			 * accurate value will be returned by
			 * nested_vmx_get_vmexit_msr_value() using kvm_get_msr()
			 * instead of reading the value from the vmcs02 VMExit
			 * MSR-store area.
			 */
			pr_warn_ratelimited(
				"Not enough msr entries in msr_autostore. Can't add msr %x\n",
				msr_index);
			return;
		}
		last = autostore->nr++;
		autostore->val[last].index = msr_index;
	} else if (!in_vmcs12_store_list && in_autostore_list) {
		last = --autostore->nr;
		autostore->val[msr_autostore_slot] = autostore->val[last];
	}
}

/*
 * Load guest's/host's cr3 at nested entry/exit. @nested_ept is true if we are
 * emulating VM-Entry into a guest with EPT enabled. On failure, the expected
 * Exit Qualification (for a VM-Entry consistency check VM-Exit) is assigned to
 * @entry_failure_code.
 */
static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
			       bool nested_ept, bool reload_pdptrs,
			       enum vm_entry_failure_code *entry_failure_code)
{
	if (CC(!kvm_vcpu_is_legal_cr3(vcpu, cr3))) {
		*entry_failure_code = ENTRY_FAIL_DEFAULT;
		return -EINVAL;
	}

	/*
	 * If PAE paging and EPT are both on, CR3 is not used by the CPU and
	 * must not be dereferenced.
	 */
	if (reload_pdptrs && !nested_ept && is_pae_paging(vcpu) &&
	    CC(!load_pdptrs(vcpu, cr3))) {
		*entry_failure_code = ENTRY_FAIL_PDPTE;
		return -EINVAL;
	}

	vcpu->arch.cr3 = cr3;
	kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);

	/* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */
	kvm_init_mmu(vcpu);

	if (!nested_ept)
		kvm_mmu_new_pgd(vcpu, cr3);

	return 0;
}

/*
 * Returns true if KVM is able to configure the CPU to tag TLB entries
 * populated by L2 differently than TLB entries populated
 * by L1.
 *
 * If L0 uses EPT, L1 and L2 run with different EPTP because
 * guest_mode is part of kvm_mmu_page_role. Thus, TLB entries
 * are tagged with different EPTP.
 *
 * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged
 * with different VPID (L1 entries are tagged with vmx->vpid
 * while L2 entries are tagged with vmx->nested.vpid02).
 */
static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	return enable_ept ||
	       (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02);
}

static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu,
					    struct vmcs12 *vmcs12,
					    bool is_vmenter)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/* Handle pending Hyper-V TLB flush requests */
	kvm_hv_nested_transtion_tlb_flush(vcpu, enable_ept);

	/*
	 * If vmcs12 doesn't use VPID, L1 expects linear and combined mappings
	 * for *all* contexts to be flushed on VM-Enter/VM-Exit, i.e. it's a
	 * full TLB flush from the guest's perspective. This is required even
	 * if VPID is disabled in the host as KVM may need to synchronize the
	 * MMU in response to the guest TLB flush.
	 *
	 * Note, using TLB_FLUSH_GUEST is correct even if nested EPT is in use.
	 * EPT is a special snowflake, as guest-physical mappings aren't
	 * flushed on VPID invalidations, including VM-Enter or VM-Exit with
	 * VPID disabled. As a result, KVM _never_ needs to sync nEPT
	 * entries on VM-Enter because L1 can't rely on VM-Enter to flush
	 * those mappings.
	 */
	if (!nested_cpu_has_vpid(vmcs12)) {
		kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
		return;
	}

	/* L2 should never have a VPID if VPID is disabled. */
	WARN_ON(!enable_vpid);

	/*
	 * VPID is enabled and in use by vmcs12. If vpid12 is changing, then
	 * emulate a guest TLB flush as KVM does not track vpid12 history nor
	 * is the VPID incorporated into the MMU context. I.e. KVM must assume
	 * that the new vpid12 has never been used and thus represents a new
	 * guest ASID that cannot have entries in the TLB.
	 */
	if (is_vmenter && vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
		vmx->nested.last_vpid = vmcs12->virtual_processor_id;
		kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
		return;
	}

	/*
	 * If VPID is enabled, used by vmcs12, and vpid12 is not changing but
	 * does not have a unique TLB tag (ASID), i.e. EPT is disabled and
	 * KVM was unable to allocate a VPID for L2, flush the current context
	 * as the effective ASID is common to both L1 and L2.
	 */
	if (!nested_has_guest_tlb_tag(vcpu))
		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
}
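
/*
 * Returns true if every bit set in @subset (within @mask) is also set in
 * @superset, i.e. restoring @subset would not grant any capability beyond
 * what @superset already allows.  Used below to validate userspace-provided
 * VMX capability MSR values against what KVM itself supports.
 */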
static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
{
	superset &= mask;
	subset &= mask;

	return (superset | subset) == superset;
}

static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
{
	const u64 feature_and_reserved =
		/* feature (except bit 48; see below) */
		BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) |
		/* reserved */
		BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56);
	u64 vmx_basic = vmcs_config.nested.basic;

	if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved))
		return -EINVAL;

	/*
	 * KVM does not emulate a version of VMX that constrains physical
	 * addresses of VMX structures (e.g. VMCS) to 32-bits.
	 */
	if (data & BIT_ULL(48))
		return -EINVAL;

	if (vmx_basic_vmcs_revision_id(vmx_basic) !=
	    vmx_basic_vmcs_revision_id(data))
		return -EINVAL;

	if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data))
		return -EINVAL;

	vmx->nested.msrs.basic = data;
	return 0;
}

static void vmx_get_control_msr(struct nested_vmx_msrs *msrs, u32 msr_index,
				u32 **low, u32 **high)
{
	switch (msr_index) {
	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
		*low = &msrs->pinbased_ctls_low;
		*high = &msrs->pinbased_ctls_high;
		break;
	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
		*low = &msrs->procbased_ctls_low;
		*high = &msrs->procbased_ctls_high;
		break;
	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
		*low = &msrs->exit_ctls_low;
		*high = &msrs->exit_ctls_high;
		break;
	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
		*low = &msrs->entry_ctls_low;
		*high = &msrs->entry_ctls_high;
		break;
	case MSR_IA32_VMX_PROCBASED_CTLS2:
		*low = &msrs->secondary_ctls_low;
		*high = &msrs->secondary_ctls_high;
		break;
	default:
		BUG();
	}
}

static int
vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
{
	u32 *lowp, *highp;
	u64 supported;

	vmx_get_control_msr(&vmcs_config.nested, msr_index, &lowp, &highp);

	supported = vmx_control_msr(*lowp, *highp);

	/* Check must-be-1 bits are still 1. */
	if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0)))
		return -EINVAL;

	/* Check must-be-0 bits are still 0. */
	if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32)))
		return -EINVAL;

	vmx_get_control_msr(&vmx->nested.msrs, msr_index, &lowp, &highp);
	*lowp = data;
	*highp = data >> 32;
	return 0;
}

static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
{
	const u64 feature_and_reserved_bits =
		/* feature */
		BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) |
		BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) |
		/* reserved */
		GENMASK_ULL(13, 9) | BIT_ULL(31);
	u64 vmx_misc = vmx_control_msr(vmcs_config.nested.misc_low,
				       vmcs_config.nested.misc_high);

	if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits))
		return -EINVAL;

	if ((vmx->nested.msrs.pinbased_ctls_high &
	     PIN_BASED_VMX_PREEMPTION_TIMER) &&
	    vmx_misc_preemption_timer_rate(data) !=
	    vmx_misc_preemption_timer_rate(vmx_misc))
		return -EINVAL;

	if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc))
		return -EINVAL;

	if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc))
		return -EINVAL;

	if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc))
		return -EINVAL;

	vmx->nested.msrs.misc_low = data;
	vmx->nested.msrs.misc_high = data >> 32;

	return 0;
}

static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
{
	u64 vmx_ept_vpid_cap = vmx_control_msr(vmcs_config.nested.ept_caps,
					       vmcs_config.nested.vpid_caps);

	/* Every bit is either reserved or a feature bit. */
	if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL))
		return -EINVAL;

	vmx->nested.msrs.ept_caps = data;
	vmx->nested.msrs.vpid_caps = data >> 32;
	return 0;
}

static u64 *vmx_get_fixed0_msr(struct nested_vmx_msrs *msrs, u32 msr_index)
{
	switch (msr_index) {
	case MSR_IA32_VMX_CR0_FIXED0:
		return &msrs->cr0_fixed0;
	case MSR_IA32_VMX_CR4_FIXED0:
		return &msrs->cr4_fixed0;
	default:
		BUG();
	}
}

static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
{
	const u64 *msr = vmx_get_fixed0_msr(&vmcs_config.nested, msr_index);

	/*
	 * 1 bits (which indicates bits which "must-be-1" during VMX operation)
	 * must be 1 in the restored value.
	 */
	if (!is_bitwise_subset(data, *msr, -1ULL))
		return -EINVAL;

	*vmx_get_fixed0_msr(&vmx->nested.msrs, msr_index) = data;
	return 0;
}

/*
 * Called when userspace is restoring VMX MSRs.
 *
 * Returns 0 on success, non-0 otherwise.
 */
int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * Don't allow changes to the VMX capability MSRs while the vCPU
	 * is in VMX operation.
	 */
	if (vmx->nested.vmxon)
		return -EBUSY;

	switch (msr_index) {
	case MSR_IA32_VMX_BASIC:
		return vmx_restore_vmx_basic(vmx, data);
	case MSR_IA32_VMX_PINBASED_CTLS:
	case MSR_IA32_VMX_PROCBASED_CTLS:
	case MSR_IA32_VMX_EXIT_CTLS:
	case MSR_IA32_VMX_ENTRY_CTLS:
		/*
		 * The "non-true" VMX capability MSRs are generated from the
		 * "true" MSRs, so we do not support restoring them directly.
		 *
		 * If userspace wants to emulate VMX_BASIC[55]=0, userspace
		 * should restore the "true" MSRs with the must-be-1 bits
		 * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND
		 * DEFAULT SETTINGS".
		 */
		return -EINVAL;
	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
	case MSR_IA32_VMX_PROCBASED_CTLS2:
		return vmx_restore_control_msr(vmx, msr_index, data);
	case MSR_IA32_VMX_MISC:
		return vmx_restore_vmx_misc(vmx, data);
	case MSR_IA32_VMX_CR0_FIXED0:
	case MSR_IA32_VMX_CR4_FIXED0:
		return vmx_restore_fixed0_msr(vmx, msr_index, data);
	case MSR_IA32_VMX_CR0_FIXED1:
	case MSR_IA32_VMX_CR4_FIXED1:
		/*
		 * These MSRs are generated based on the vCPU's CPUID, so we
		 * do not support restoring them directly.
		 */
		return -EINVAL;
	case MSR_IA32_VMX_EPT_VPID_CAP:
		return vmx_restore_vmx_ept_vpid_cap(vmx, data);
	case MSR_IA32_VMX_VMCS_ENUM:
		vmx->nested.msrs.vmcs_enum = data;
		return 0;
	case MSR_IA32_VMX_VMFUNC:
		if (data & ~vmcs_config.nested.vmfunc_controls)
			return -EINVAL;
		vmx->nested.msrs.vmfunc_controls = data;
		return 0;
	default:
		/*
		 * The rest of the VMX capability MSRs do not support restore.
		 */
		return -EINVAL;
	}
}

/* Returns 0 on success, non-0 otherwise. */
int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata)
{
	switch (msr_index) {
	case MSR_IA32_VMX_BASIC:
		*pdata = msrs->basic;
		break;
	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
	case MSR_IA32_VMX_PINBASED_CTLS:
		*pdata = vmx_control_msr(
			msrs->pinbased_ctls_low,
			msrs->pinbased_ctls_high);
		if (msr_index == MSR_IA32_VMX_PINBASED_CTLS)
			*pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
	case MSR_IA32_VMX_PROCBASED_CTLS:
		*pdata = vmx_control_msr(
			msrs->procbased_ctls_low,
			msrs->procbased_ctls_high);
		if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS)
			*pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
	case MSR_IA32_VMX_EXIT_CTLS:
		*pdata = vmx_control_msr(
			msrs->exit_ctls_low,
			msrs->exit_ctls_high);
		if (msr_index == MSR_IA32_VMX_EXIT_CTLS)
			*pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
	case MSR_IA32_VMX_ENTRY_CTLS:
		*pdata = vmx_control_msr(
			msrs->entry_ctls_low,
			msrs->entry_ctls_high);
		if (msr_index == MSR_IA32_VMX_ENTRY_CTLS)
			*pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_MISC:
		*pdata = vmx_control_msr(
			msrs->misc_low,
			msrs->misc_high);
		break;
	case MSR_IA32_VMX_CR0_FIXED0:
		*pdata = msrs->cr0_fixed0;
		break;
	case MSR_IA32_VMX_CR0_FIXED1:
		*pdata = msrs->cr0_fixed1;
		break;
	case MSR_IA32_VMX_CR4_FIXED0:
		*pdata = msrs->cr4_fixed0;
		break;
	case MSR_IA32_VMX_CR4_FIXED1:
		*pdata = msrs->cr4_fixed1;
		break;
	case MSR_IA32_VMX_VMCS_ENUM:
		*pdata = msrs->vmcs_enum;
		break;
	case MSR_IA32_VMX_PROCBASED_CTLS2:
		*pdata = vmx_control_msr(
			msrs->secondary_ctls_low,
			msrs->secondary_ctls_high);
		break;
	case MSR_IA32_VMX_EPT_VPID_CAP:
		*pdata = msrs->ept_caps |
			((u64)msrs->vpid_caps << 32);
		break;
	case MSR_IA32_VMX_VMFUNC:
		*pdata = msrs->vmfunc_controls;
		break;
	default:
		return 1;
	}

	return 0;
}

/*
 * Copy the writable VMCS shadow fields back to the VMCS12, in case they have
 * been modified by the L1 guest. Note, "writable" in this context means
 * "writable by the guest", i.e. tagged SHADOW_FIELD_RW; the set of
 * fields tagged SHADOW_FIELD_RO may or may not align with the "read-only"
 * VM-exit information fields (which are actually writable if the vCPU is
 * configured to support "VMWRITE to any supported field in the VMCS").
 */
static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
{
	struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
	struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
	struct shadow_vmcs_field field;
	unsigned long val;
	int i;

	if (WARN_ON(!shadow_vmcs))
		return;

	preempt_disable();

	vmcs_load(shadow_vmcs);

	for (i = 0; i < max_shadow_read_write_fields; i++) {
		field = shadow_read_write_fields[i];
		val = __vmcs_readl(field.encoding);
		vmcs12_write_any(vmcs12, field.encoding, field.offset, val);
	}

	vmcs_clear(shadow_vmcs);
	vmcs_load(vmx->loaded_vmcs->vmcs);

	preempt_enable();
}

static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
{
	const struct shadow_vmcs_field *fields[] = {
		shadow_read_write_fields,
		shadow_read_only_fields
	};
	const int max_fields[] = {
		max_shadow_read_write_fields,
		max_shadow_read_only_fields
	};
	struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
	struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
	struct shadow_vmcs_field field;
	unsigned long val;
	int i, q;

	if (WARN_ON(!shadow_vmcs))
		return;

	vmcs_load(shadow_vmcs);

	for (q = 0; q < ARRAY_SIZE(fields); q++) {
		for (i = 0; i < max_fields[q]; i++) {
			field = fields[q][i];
			val = vmcs12_read_any(vmcs12, field.encoding,
					      field.offset);
			__vmcs_writel(field.encoding, val);
		}
	}

	vmcs_clear(shadow_vmcs);
	vmcs_load(vmx->loaded_vmcs->vmcs);
}

static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields)
{
#ifdef CONFIG_KVM_HYPERV
	struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
	struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx);
	struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(&vmx->vcpu);

	/* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */
	vmcs12->tpr_threshold = evmcs->tpr_threshold;
	vmcs12->guest_rip = evmcs->guest_rip;

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_ENLIGHTENMENTSCONTROL))) {
		hv_vcpu->nested.pa_page_gpa = evmcs->partition_assist_page;
		hv_vcpu->nested.vm_id = evmcs->hv_vm_id;
		hv_vcpu->nested.vp_id = evmcs->hv_vp_id;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) {
		vmcs12->guest_rsp = evmcs->guest_rsp;
		vmcs12->guest_rflags = evmcs->guest_rflags;
		vmcs12->guest_interruptibility_info =
			evmcs->guest_interruptibility_info;
		/*
		 * Not present in struct vmcs12:
		 * vmcs12->guest_ssp = evmcs->guest_ssp;
		 */
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) {
		vmcs12->cpu_based_vm_exec_control =
			evmcs->cpu_based_vm_exec_control;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN))) {
		vmcs12->exception_bitmap = evmcs->exception_bitmap;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) {
		vmcs12->vm_entry_controls = evmcs->vm_entry_controls;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) {
		vmcs12->vm_entry_intr_info_field =
			evmcs->vm_entry_intr_info_field;
		vmcs12->vm_entry_exception_error_code =
			evmcs->vm_entry_exception_error_code;
		vmcs12->vm_entry_instruction_len =
			evmcs->vm_entry_instruction_len;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) {
		vmcs12->host_ia32_pat = evmcs->host_ia32_pat;
		vmcs12->host_ia32_efer = evmcs->host_ia32_efer;
		vmcs12->host_cr0 = evmcs->host_cr0;
		vmcs12->host_cr3 = evmcs->host_cr3;
		vmcs12->host_cr4 = evmcs->host_cr4;
		vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp;
		vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip;
		vmcs12->host_rip = evmcs->host_rip;
		vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs;
		vmcs12->host_es_selector = evmcs->host_es_selector;
		vmcs12->host_cs_selector = evmcs->host_cs_selector;
		vmcs12->host_ss_selector = evmcs->host_ss_selector;
		vmcs12->host_ds_selector = evmcs->host_ds_selector;
		vmcs12->host_fs_selector = evmcs->host_fs_selector;
		vmcs12->host_gs_selector = evmcs->host_gs_selector;
		vmcs12->host_tr_selector = evmcs->host_tr_selector;
		vmcs12->host_ia32_perf_global_ctrl = evmcs->host_ia32_perf_global_ctrl;
		/*
		 * Not present in struct vmcs12:
		 * vmcs12->host_ia32_s_cet = evmcs->host_ia32_s_cet;
		 * vmcs12->host_ssp = evmcs->host_ssp;
		 * vmcs12->host_ia32_int_ssp_table_addr = evmcs->host_ia32_int_ssp_table_addr;
		 */
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1))) {
		vmcs12->pin_based_vm_exec_control =
			evmcs->pin_based_vm_exec_control;
		vmcs12->vm_exit_controls = evmcs->vm_exit_controls;
		vmcs12->secondary_vm_exec_control =
			evmcs->secondary_vm_exec_control;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) {
		vmcs12->io_bitmap_a = evmcs->io_bitmap_a;
		vmcs12->io_bitmap_b = evmcs->io_bitmap_b;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) {
		vmcs12->msr_bitmap = evmcs->msr_bitmap;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) {
		vmcs12->guest_es_base = evmcs->guest_es_base;
		vmcs12->guest_cs_base = evmcs->guest_cs_base;
		vmcs12->guest_ss_base = evmcs->guest_ss_base;
		vmcs12->guest_ds_base = evmcs->guest_ds_base;
		vmcs12->guest_fs_base = evmcs->guest_fs_base;
		vmcs12->guest_gs_base = evmcs->guest_gs_base;
		vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base;
		vmcs12->guest_tr_base = evmcs->guest_tr_base;
		vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base;
		vmcs12->guest_idtr_base = evmcs->guest_idtr_base;
		vmcs12->guest_es_limit = evmcs->guest_es_limit;
		vmcs12->guest_cs_limit = evmcs->guest_cs_limit;
		vmcs12->guest_ss_limit = evmcs->guest_ss_limit;
		vmcs12->guest_ds_limit = evmcs->guest_ds_limit;
		vmcs12->guest_fs_limit = evmcs->guest_fs_limit;
		vmcs12->guest_gs_limit = evmcs->guest_gs_limit;
		vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit;
		vmcs12->guest_tr_limit = evmcs->guest_tr_limit;
		vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit;
		vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit;
		vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes;
		vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes;
		vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes;
		vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes;
		vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes;
		vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes;
		vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes;
		vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes;
		vmcs12->guest_es_selector = evmcs->guest_es_selector;
		vmcs12->guest_cs_selector = evmcs->guest_cs_selector;
		vmcs12->guest_ss_selector = evmcs->guest_ss_selector;
		vmcs12->guest_ds_selector = evmcs->guest_ds_selector;
		vmcs12->guest_fs_selector = evmcs->guest_fs_selector;
		vmcs12->guest_gs_selector = evmcs->guest_gs_selector;
		vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector;
		vmcs12->guest_tr_selector = evmcs->guest_tr_selector;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) {
		vmcs12->tsc_offset = evmcs->tsc_offset;
		vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr;
		vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap;
		vmcs12->encls_exiting_bitmap = evmcs->encls_exiting_bitmap;
		vmcs12->tsc_multiplier = evmcs->tsc_multiplier;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) {
		vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask;
		vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask;
		vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow;
		vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow;
		vmcs12->guest_cr0 = evmcs->guest_cr0;
		vmcs12->guest_cr3 = evmcs->guest_cr3;
		vmcs12->guest_cr4 = evmcs->guest_cr4;
		vmcs12->guest_dr7 = evmcs->guest_dr7;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) {
		vmcs12->host_fs_base = evmcs->host_fs_base;
		vmcs12->host_gs_base = evmcs->host_gs_base;
		vmcs12->host_tr_base = evmcs->host_tr_base;
		vmcs12->host_gdtr_base = evmcs->host_gdtr_base;
		vmcs12->host_idtr_base = evmcs->host_idtr_base;
		vmcs12->host_rsp = evmcs->host_rsp;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) {
		vmcs12->ept_pointer = evmcs->ept_pointer;
		vmcs12->virtual_processor_id = evmcs->virtual_processor_id;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) {
		vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer;
		vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl;
		vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat;
		vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer;
		vmcs12->guest_pdptr0 = evmcs->guest_pdptr0;
		vmcs12->guest_pdptr1 = evmcs->guest_pdptr1;
		vmcs12->guest_pdptr2 = evmcs->guest_pdptr2;
		vmcs12->guest_pdptr3 = evmcs->guest_pdptr3;
		vmcs12->guest_pending_dbg_exceptions =
			evmcs->guest_pending_dbg_exceptions;
		vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp;
		vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip;
		vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs;
		vmcs12->guest_activity_state = evmcs->guest_activity_state;
		vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs;
		vmcs12->guest_ia32_perf_global_ctrl = evmcs->guest_ia32_perf_global_ctrl;
		/*
		 * Not present in struct vmcs12:
		 * vmcs12->guest_ia32_s_cet = evmcs->guest_ia32_s_cet;
evmcs->guest_ia32_s_cet; 1805 * vmcs12->guest_ia32_lbr_ctl = evmcs->guest_ia32_lbr_ctl; 1806 * vmcs12->guest_ia32_int_ssp_table_addr = evmcs->guest_ia32_int_ssp_table_addr; 1807 */ 1808 } 1809 1810 /* 1811 * Not used? 1812 * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr; 1813 * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr; 1814 * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr; 1815 * vmcs12->page_fault_error_code_mask = 1816 * evmcs->page_fault_error_code_mask; 1817 * vmcs12->page_fault_error_code_match = 1818 * evmcs->page_fault_error_code_match; 1819 * vmcs12->cr3_target_count = evmcs->cr3_target_count; 1820 * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count; 1821 * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count; 1822 * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count; 1823 */ 1824 1825 /* 1826 * Read only fields: 1827 * vmcs12->guest_physical_address = evmcs->guest_physical_address; 1828 * vmcs12->vm_instruction_error = evmcs->vm_instruction_error; 1829 * vmcs12->vm_exit_reason = evmcs->vm_exit_reason; 1830 * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info; 1831 * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code; 1832 * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field; 1833 * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code; 1834 * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len; 1835 * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info; 1836 * vmcs12->exit_qualification = evmcs->exit_qualification; 1837 * vmcs12->guest_linear_address = evmcs->guest_linear_address; 1838 * 1839 * Not present in struct vmcs12: 1840 * vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx; 1841 * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi; 1842 * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi; 1843 * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip; 1844 */ 1845 1846 return; 1847 #else /* CONFIG_KVM_HYPERV */ 1848 KVM_BUG_ON(1, vmx->vcpu.kvm); 1849 #endif /* CONFIG_KVM_HYPERV */ 1850 } 1851 1852 static void copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx) 1853 { 1854 #ifdef CONFIG_KVM_HYPERV 1855 struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; 1856 struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); 1857 1858 /* 1859 * Should not be changed by KVM: 1860 * 1861 * evmcs->host_es_selector = vmcs12->host_es_selector; 1862 * evmcs->host_cs_selector = vmcs12->host_cs_selector; 1863 * evmcs->host_ss_selector = vmcs12->host_ss_selector; 1864 * evmcs->host_ds_selector = vmcs12->host_ds_selector; 1865 * evmcs->host_fs_selector = vmcs12->host_fs_selector; 1866 * evmcs->host_gs_selector = vmcs12->host_gs_selector; 1867 * evmcs->host_tr_selector = vmcs12->host_tr_selector; 1868 * evmcs->host_ia32_pat = vmcs12->host_ia32_pat; 1869 * evmcs->host_ia32_efer = vmcs12->host_ia32_efer; 1870 * evmcs->host_cr0 = vmcs12->host_cr0; 1871 * evmcs->host_cr3 = vmcs12->host_cr3; 1872 * evmcs->host_cr4 = vmcs12->host_cr4; 1873 * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp; 1874 * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip; 1875 * evmcs->host_rip = vmcs12->host_rip; 1876 * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs; 1877 * evmcs->host_fs_base = vmcs12->host_fs_base; 1878 * evmcs->host_gs_base = vmcs12->host_gs_base; 1879 * evmcs->host_tr_base = vmcs12->host_tr_base; 1880 * evmcs->host_gdtr_base = vmcs12->host_gdtr_base; 1881 * 
evmcs->host_idtr_base = vmcs12->host_idtr_base; 1882 * evmcs->host_rsp = vmcs12->host_rsp; 1883 * sync_vmcs02_to_vmcs12() doesn't read these: 1884 * evmcs->io_bitmap_a = vmcs12->io_bitmap_a; 1885 * evmcs->io_bitmap_b = vmcs12->io_bitmap_b; 1886 * evmcs->msr_bitmap = vmcs12->msr_bitmap; 1887 * evmcs->ept_pointer = vmcs12->ept_pointer; 1888 * evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap; 1889 * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr; 1890 * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr; 1891 * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr; 1892 * evmcs->tpr_threshold = vmcs12->tpr_threshold; 1893 * evmcs->virtual_processor_id = vmcs12->virtual_processor_id; 1894 * evmcs->exception_bitmap = vmcs12->exception_bitmap; 1895 * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer; 1896 * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control; 1897 * evmcs->vm_exit_controls = vmcs12->vm_exit_controls; 1898 * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control; 1899 * evmcs->page_fault_error_code_mask = 1900 * vmcs12->page_fault_error_code_mask; 1901 * evmcs->page_fault_error_code_match = 1902 * vmcs12->page_fault_error_code_match; 1903 * evmcs->cr3_target_count = vmcs12->cr3_target_count; 1904 * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr; 1905 * evmcs->tsc_offset = vmcs12->tsc_offset; 1906 * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl; 1907 * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask; 1908 * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask; 1909 * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow; 1910 * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow; 1911 * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count; 1912 * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count; 1913 * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count; 1914 * evmcs->guest_ia32_perf_global_ctrl = vmcs12->guest_ia32_perf_global_ctrl; 1915 * evmcs->host_ia32_perf_global_ctrl = vmcs12->host_ia32_perf_global_ctrl; 1916 * evmcs->encls_exiting_bitmap = vmcs12->encls_exiting_bitmap; 1917 * evmcs->tsc_multiplier = vmcs12->tsc_multiplier; 1918 * 1919 * Not present in struct vmcs12: 1920 * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx; 1921 * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi; 1922 * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi; 1923 * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip; 1924 * evmcs->host_ia32_s_cet = vmcs12->host_ia32_s_cet; 1925 * evmcs->host_ssp = vmcs12->host_ssp; 1926 * evmcs->host_ia32_int_ssp_table_addr = vmcs12->host_ia32_int_ssp_table_addr; 1927 * evmcs->guest_ia32_s_cet = vmcs12->guest_ia32_s_cet; 1928 * evmcs->guest_ia32_lbr_ctl = vmcs12->guest_ia32_lbr_ctl; 1929 * evmcs->guest_ia32_int_ssp_table_addr = vmcs12->guest_ia32_int_ssp_table_addr; 1930 * evmcs->guest_ssp = vmcs12->guest_ssp; 1931 */ 1932 1933 evmcs->guest_es_selector = vmcs12->guest_es_selector; 1934 evmcs->guest_cs_selector = vmcs12->guest_cs_selector; 1935 evmcs->guest_ss_selector = vmcs12->guest_ss_selector; 1936 evmcs->guest_ds_selector = vmcs12->guest_ds_selector; 1937 evmcs->guest_fs_selector = vmcs12->guest_fs_selector; 1938 evmcs->guest_gs_selector = vmcs12->guest_gs_selector; 1939 evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector; 1940 evmcs->guest_tr_selector = vmcs12->guest_tr_selector; 1941 1942 evmcs->guest_es_limit = vmcs12->guest_es_limit; 1943 
evmcs->guest_cs_limit = vmcs12->guest_cs_limit; 1944 evmcs->guest_ss_limit = vmcs12->guest_ss_limit; 1945 evmcs->guest_ds_limit = vmcs12->guest_ds_limit; 1946 evmcs->guest_fs_limit = vmcs12->guest_fs_limit; 1947 evmcs->guest_gs_limit = vmcs12->guest_gs_limit; 1948 evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit; 1949 evmcs->guest_tr_limit = vmcs12->guest_tr_limit; 1950 evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit; 1951 evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit; 1952 1953 evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes; 1954 evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes; 1955 evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes; 1956 evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes; 1957 evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes; 1958 evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes; 1959 evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes; 1960 evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes; 1961 1962 evmcs->guest_es_base = vmcs12->guest_es_base; 1963 evmcs->guest_cs_base = vmcs12->guest_cs_base; 1964 evmcs->guest_ss_base = vmcs12->guest_ss_base; 1965 evmcs->guest_ds_base = vmcs12->guest_ds_base; 1966 evmcs->guest_fs_base = vmcs12->guest_fs_base; 1967 evmcs->guest_gs_base = vmcs12->guest_gs_base; 1968 evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base; 1969 evmcs->guest_tr_base = vmcs12->guest_tr_base; 1970 evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base; 1971 evmcs->guest_idtr_base = vmcs12->guest_idtr_base; 1972 1973 evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat; 1974 evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer; 1975 1976 evmcs->guest_pdptr0 = vmcs12->guest_pdptr0; 1977 evmcs->guest_pdptr1 = vmcs12->guest_pdptr1; 1978 evmcs->guest_pdptr2 = vmcs12->guest_pdptr2; 1979 evmcs->guest_pdptr3 = vmcs12->guest_pdptr3; 1980 1981 evmcs->guest_pending_dbg_exceptions = 1982 vmcs12->guest_pending_dbg_exceptions; 1983 evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp; 1984 evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip; 1985 1986 evmcs->guest_activity_state = vmcs12->guest_activity_state; 1987 evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs; 1988 1989 evmcs->guest_cr0 = vmcs12->guest_cr0; 1990 evmcs->guest_cr3 = vmcs12->guest_cr3; 1991 evmcs->guest_cr4 = vmcs12->guest_cr4; 1992 evmcs->guest_dr7 = vmcs12->guest_dr7; 1993 1994 evmcs->guest_physical_address = vmcs12->guest_physical_address; 1995 1996 evmcs->vm_instruction_error = vmcs12->vm_instruction_error; 1997 evmcs->vm_exit_reason = vmcs12->vm_exit_reason; 1998 evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info; 1999 evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code; 2000 evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field; 2001 evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code; 2002 evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len; 2003 evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info; 2004 2005 evmcs->exit_qualification = vmcs12->exit_qualification; 2006 2007 evmcs->guest_linear_address = vmcs12->guest_linear_address; 2008 evmcs->guest_rsp = vmcs12->guest_rsp; 2009 evmcs->guest_rflags = vmcs12->guest_rflags; 2010 2011 evmcs->guest_interruptibility_info = 2012 vmcs12->guest_interruptibility_info; 2013 evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control; 2014 evmcs->vm_entry_controls = vmcs12->vm_entry_controls; 2015 evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field; 2016 evmcs->vm_entry_exception_error_code = 2017 vmcs12->vm_entry_exception_error_code; 2018 
	evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len;

	evmcs->guest_rip = vmcs12->guest_rip;

	evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs;

	return;
#else /* CONFIG_KVM_HYPERV */
	KVM_BUG_ON(1, vmx->vcpu.kvm);
#endif /* CONFIG_KVM_HYPERV */
}

/*
 * This is an equivalent of the nested hypervisor executing the vmptrld
 * instruction.
 */
static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
	struct kvm_vcpu *vcpu, bool from_launch)
{
#ifdef CONFIG_KVM_HYPERV
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	bool evmcs_gpa_changed = false;
	u64 evmcs_gpa;

	if (likely(!guest_cpuid_has_evmcs(vcpu)))
		return EVMPTRLD_DISABLED;

	evmcs_gpa = nested_get_evmptr(vcpu);
	if (!evmptr_is_valid(evmcs_gpa)) {
		nested_release_evmcs(vcpu);
		return EVMPTRLD_DISABLED;
	}

	if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) {
		vmx->nested.current_vmptr = INVALID_GPA;

		nested_release_evmcs(vcpu);

		if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa),
				 &vmx->nested.hv_evmcs_map))
			return EVMPTRLD_ERROR;

		vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva;

		/*
		 * Currently, KVM only supports eVMCS version 1
		 * (== KVM_EVMCS_VERSION) and thus expects the guest to set the
		 * first u32 field of the eVMCS, which specifies the eVMCS
		 * VersionNumber, to that value.
		 *
		 * The guest learns the eVMCS versions supported by the host by
		 * examining CPUID.0x4000000A.EAX[0:15]. The host userspace VMM
		 * is expected to set this CPUID leaf according to the value
		 * returned in vmcs_version by nested_enable_evmcs().
		 *
		 * However, it turns out that Microsoft Hyper-V fails to comply
		 * with its own invented interface: when Hyper-V uses eVMCS, it
		 * sets the first u32 field of the eVMCS to the revision_id
		 * specified in MSR_IA32_VMX_BASIC instead of the eVMCS version
		 * number, i.e. one of the supported versions reported in
		 * CPUID.0x4000000A.EAX[0:15].
		 *
		 * To work around this Hyper-V bug, accept either a supported
		 * eVMCS version or the VMCS12 revision_id as valid values for
		 * the first u32 field of the eVMCS.
		 */
		if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) &&
		    (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) {
			nested_release_evmcs(vcpu);
			return EVMPTRLD_VMFAIL;
		}

		vmx->nested.hv_evmcs_vmptr = evmcs_gpa;

		evmcs_gpa_changed = true;
		/*
		 * Unlike a normal vmcs12, an enlightened vmcs12 is not fully
		 * reloaded from guest memory (read-only fields, fields not
		 * present in struct hv_enlightened_vmcs, ...). Make sure there
		 * are no leftovers.
		 */
		if (from_launch) {
			struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
			memset(vmcs12, 0, sizeof(*vmcs12));
			vmcs12->hdr.revision_id = VMCS12_REVISION;
		}

	}

	/*
	 * Clean fields data can't be used on VMLAUNCH or when switching
	 * between different L2 guests, as KVM keeps a single VMCS12 per L1.
2110 */ 2111 if (from_launch || evmcs_gpa_changed) { 2112 vmx->nested.hv_evmcs->hv_clean_fields &= 2113 ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; 2114 2115 vmx->nested.force_msr_bitmap_recalc = true; 2116 } 2117 2118 return EVMPTRLD_SUCCEEDED; 2119 #else 2120 return EVMPTRLD_DISABLED; 2121 #endif 2122 } 2123 2124 void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu) 2125 { 2126 struct vcpu_vmx *vmx = to_vmx(vcpu); 2127 2128 if (nested_vmx_is_evmptr12_valid(vmx)) 2129 copy_vmcs12_to_enlightened(vmx); 2130 else 2131 copy_vmcs12_to_shadow(vmx); 2132 2133 vmx->nested.need_vmcs12_to_shadow_sync = false; 2134 } 2135 2136 static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) 2137 { 2138 struct vcpu_vmx *vmx = 2139 container_of(timer, struct vcpu_vmx, nested.preemption_timer); 2140 2141 vmx->nested.preemption_timer_expired = true; 2142 kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu); 2143 kvm_vcpu_kick(&vmx->vcpu); 2144 2145 return HRTIMER_NORESTART; 2146 } 2147 2148 static u64 vmx_calc_preemption_timer_value(struct kvm_vcpu *vcpu) 2149 { 2150 struct vcpu_vmx *vmx = to_vmx(vcpu); 2151 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 2152 2153 u64 l1_scaled_tsc = kvm_read_l1_tsc(vcpu, rdtsc()) >> 2154 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 2155 2156 if (!vmx->nested.has_preemption_timer_deadline) { 2157 vmx->nested.preemption_timer_deadline = 2158 vmcs12->vmx_preemption_timer_value + l1_scaled_tsc; 2159 vmx->nested.has_preemption_timer_deadline = true; 2160 } 2161 return vmx->nested.preemption_timer_deadline - l1_scaled_tsc; 2162 } 2163 2164 static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu, 2165 u64 preemption_timeout) 2166 { 2167 struct vcpu_vmx *vmx = to_vmx(vcpu); 2168 2169 /* 2170 * A timer value of zero is architecturally guaranteed to cause 2171 * a VMExit prior to executing any instructions in the guest. 2172 */ 2173 if (preemption_timeout == 0) { 2174 vmx_preemption_timer_fn(&vmx->nested.preemption_timer); 2175 return; 2176 } 2177 2178 if (vcpu->arch.virtual_tsc_khz == 0) 2179 return; 2180 2181 preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 2182 preemption_timeout *= 1000000; 2183 do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz); 2184 hrtimer_start(&vmx->nested.preemption_timer, 2185 ktime_add_ns(ktime_get(), preemption_timeout), 2186 HRTIMER_MODE_ABS_PINNED); 2187 } 2188 2189 static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) 2190 { 2191 if (vmx->nested.nested_run_pending && 2192 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) 2193 return vmcs12->guest_ia32_efer; 2194 else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) 2195 return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME); 2196 else 2197 return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME); 2198 } 2199 2200 static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) 2201 { 2202 struct kvm *kvm = vmx->vcpu.kvm; 2203 2204 /* 2205 * If vmcs02 hasn't been initialized, set the constant vmcs02 state 2206 * according to L0's settings (vmcs12 is irrelevant here). Host 2207 * fields that come from L0 and are not constant, e.g. HOST_CR3, 2208 * will be set as needed prior to VMLAUNCH/VMRESUME. 2209 */ 2210 if (vmx->nested.vmcs02_initialized) 2211 return; 2212 vmx->nested.vmcs02_initialized = true; 2213 2214 /* 2215 * We don't care what the EPTP value is we just need to guarantee 2216 * it's valid so we don't get a false positive when doing early 2217 * consistency checks. 
2218 */ 2219 if (enable_ept && nested_early_check) 2220 vmcs_write64(EPT_POINTER, 2221 construct_eptp(&vmx->vcpu, 0, PT64_ROOT_4LEVEL)); 2222 2223 /* All VMFUNCs are currently emulated through L0 vmexits. */ 2224 if (cpu_has_vmx_vmfunc()) 2225 vmcs_write64(VM_FUNCTION_CONTROL, 0); 2226 2227 if (cpu_has_vmx_posted_intr()) 2228 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR); 2229 2230 if (cpu_has_vmx_msr_bitmap()) 2231 vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap)); 2232 2233 /* 2234 * PML is emulated for L2, but never enabled in hardware as the MMU 2235 * handles A/D emulation. Disabling PML for L2 also avoids having to 2236 * deal with filtering out L2 GPAs from the buffer. 2237 */ 2238 if (enable_pml) { 2239 vmcs_write64(PML_ADDRESS, 0); 2240 vmcs_write16(GUEST_PML_INDEX, -1); 2241 } 2242 2243 if (cpu_has_vmx_encls_vmexit()) 2244 vmcs_write64(ENCLS_EXITING_BITMAP, INVALID_GPA); 2245 2246 if (kvm_notify_vmexit_enabled(kvm)) 2247 vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window); 2248 2249 /* 2250 * Set the MSR load/store lists to match L0's settings. Only the 2251 * addresses are constant (for vmcs02), the counts can change based 2252 * on L2's behavior, e.g. switching to/from long mode. 2253 */ 2254 vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.guest.val)); 2255 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); 2256 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); 2257 2258 vmx_set_constant_host_state(vmx); 2259 } 2260 2261 static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx, 2262 struct vmcs12 *vmcs12) 2263 { 2264 prepare_vmcs02_constant_state(vmx); 2265 2266 vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); 2267 2268 if (enable_vpid) { 2269 if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) 2270 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02); 2271 else 2272 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); 2273 } 2274 } 2275 2276 static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs01, 2277 struct vmcs12 *vmcs12) 2278 { 2279 u32 exec_control; 2280 u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12); 2281 2282 if (vmx->nested.dirty_vmcs12 || nested_vmx_is_evmptr12_valid(vmx)) 2283 prepare_vmcs02_early_rare(vmx, vmcs12); 2284 2285 /* 2286 * PIN CONTROLS 2287 */ 2288 exec_control = __pin_controls_get(vmcs01); 2289 exec_control |= (vmcs12->pin_based_vm_exec_control & 2290 ~PIN_BASED_VMX_PREEMPTION_TIMER); 2291 2292 /* Posted interrupts setting is only taken from vmcs12. */ 2293 vmx->nested.pi_pending = false; 2294 if (nested_cpu_has_posted_intr(vmcs12)) 2295 vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; 2296 else 2297 exec_control &= ~PIN_BASED_POSTED_INTR; 2298 pin_controls_set(vmx, exec_control); 2299 2300 /* 2301 * EXEC CONTROLS 2302 */ 2303 exec_control = __exec_controls_get(vmcs01); /* L0's desires */ 2304 exec_control &= ~CPU_BASED_INTR_WINDOW_EXITING; 2305 exec_control &= ~CPU_BASED_NMI_WINDOW_EXITING; 2306 exec_control &= ~CPU_BASED_TPR_SHADOW; 2307 exec_control |= vmcs12->cpu_based_vm_exec_control; 2308 2309 vmx->nested.l1_tpr_threshold = -1; 2310 if (exec_control & CPU_BASED_TPR_SHADOW) 2311 vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); 2312 #ifdef CONFIG_X86_64 2313 else 2314 exec_control |= CPU_BASED_CR8_LOAD_EXITING | 2315 CPU_BASED_CR8_STORE_EXITING; 2316 #endif 2317 2318 /* 2319 * A vmexit (to either L1 hypervisor or L0 userspace) is always needed 2320 * for I/O port accesses. 
2321 */ 2322 exec_control |= CPU_BASED_UNCOND_IO_EXITING; 2323 exec_control &= ~CPU_BASED_USE_IO_BITMAPS; 2324 2325 /* 2326 * This bit will be computed in nested_get_vmcs12_pages, because 2327 * we do not have access to L1's MSR bitmap yet. For now, keep 2328 * the same bit as before, hoping to avoid multiple VMWRITEs that 2329 * only set/clear this bit. 2330 */ 2331 exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; 2332 exec_control |= exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS; 2333 2334 exec_controls_set(vmx, exec_control); 2335 2336 /* 2337 * SECONDARY EXEC CONTROLS 2338 */ 2339 if (cpu_has_secondary_exec_ctrls()) { 2340 exec_control = __secondary_exec_controls_get(vmcs01); 2341 2342 /* Take the following fields only from vmcs12 */ 2343 exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 2344 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 2345 SECONDARY_EXEC_ENABLE_INVPCID | 2346 SECONDARY_EXEC_ENABLE_RDTSCP | 2347 SECONDARY_EXEC_ENABLE_XSAVES | 2348 SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | 2349 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 2350 SECONDARY_EXEC_APIC_REGISTER_VIRT | 2351 SECONDARY_EXEC_ENABLE_VMFUNC | 2352 SECONDARY_EXEC_DESC); 2353 2354 if (nested_cpu_has(vmcs12, 2355 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) 2356 exec_control |= vmcs12->secondary_vm_exec_control; 2357 2358 /* PML is emulated and never enabled in hardware for L2. */ 2359 exec_control &= ~SECONDARY_EXEC_ENABLE_PML; 2360 2361 /* VMCS shadowing for L2 is emulated for now */ 2362 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; 2363 2364 /* 2365 * Preset *DT exiting when emulating UMIP, so that vmx_set_cr4() 2366 * will not have to rewrite the controls just for this bit. 2367 */ 2368 if (vmx_umip_emulated() && (vmcs12->guest_cr4 & X86_CR4_UMIP)) 2369 exec_control |= SECONDARY_EXEC_DESC; 2370 2371 if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) 2372 vmcs_write16(GUEST_INTR_STATUS, 2373 vmcs12->guest_intr_status); 2374 2375 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) 2376 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; 2377 2378 if (exec_control & SECONDARY_EXEC_ENCLS_EXITING) 2379 vmx_write_encls_bitmap(&vmx->vcpu, vmcs12); 2380 2381 secondary_exec_controls_set(vmx, exec_control); 2382 } 2383 2384 /* 2385 * ENTRY CONTROLS 2386 * 2387 * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE 2388 * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate 2389 * on the related bits (if supported by the CPU) in the hope that 2390 * we can avoid VMWrites during vmx_set_efer(). 2391 * 2392 * Similarly, take vmcs01's PERF_GLOBAL_CTRL in the hope that if KVM is 2393 * loading PERF_GLOBAL_CTRL via the VMCS for L1, then KVM will want to 2394 * do the same for L2. 2395 */ 2396 exec_control = __vm_entry_controls_get(vmcs01); 2397 exec_control |= (vmcs12->vm_entry_controls & 2398 ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL); 2399 exec_control &= ~(VM_ENTRY_IA32E_MODE | VM_ENTRY_LOAD_IA32_EFER); 2400 if (cpu_has_load_ia32_efer()) { 2401 if (guest_efer & EFER_LMA) 2402 exec_control |= VM_ENTRY_IA32E_MODE; 2403 if (guest_efer != host_efer) 2404 exec_control |= VM_ENTRY_LOAD_IA32_EFER; 2405 } 2406 vm_entry_controls_set(vmx, exec_control); 2407 2408 /* 2409 * EXIT CONTROLS 2410 * 2411 * L2->L1 exit controls are emulated - the hardware exit is to L0 so 2412 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER 2413 * bits may be modified by vmx_set_efer() in prepare_vmcs02(). 
2414 */ 2415 exec_control = __vm_exit_controls_get(vmcs01); 2416 if (cpu_has_load_ia32_efer() && guest_efer != host_efer) 2417 exec_control |= VM_EXIT_LOAD_IA32_EFER; 2418 else 2419 exec_control &= ~VM_EXIT_LOAD_IA32_EFER; 2420 vm_exit_controls_set(vmx, exec_control); 2421 2422 /* 2423 * Interrupt/Exception Fields 2424 */ 2425 if (vmx->nested.nested_run_pending) { 2426 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 2427 vmcs12->vm_entry_intr_info_field); 2428 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 2429 vmcs12->vm_entry_exception_error_code); 2430 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 2431 vmcs12->vm_entry_instruction_len); 2432 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 2433 vmcs12->guest_interruptibility_info); 2434 vmx->loaded_vmcs->nmi_known_unmasked = 2435 !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI); 2436 } else { 2437 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); 2438 } 2439 } 2440 2441 static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) 2442 { 2443 struct hv_enlightened_vmcs *hv_evmcs = nested_vmx_evmcs(vmx); 2444 2445 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & 2446 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) { 2447 vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); 2448 vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); 2449 vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector); 2450 vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector); 2451 vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector); 2452 vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector); 2453 vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector); 2454 vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector); 2455 vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit); 2456 vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit); 2457 vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit); 2458 vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit); 2459 vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit); 2460 vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit); 2461 vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit); 2462 vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit); 2463 vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit); 2464 vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit); 2465 vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes); 2466 vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes); 2467 vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes); 2468 vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes); 2469 vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes); 2470 vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes); 2471 vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes); 2472 vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes); 2473 vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base); 2474 vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base); 2475 vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base); 2476 vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base); 2477 vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base); 2478 vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base); 2479 vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base); 2480 vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base); 2481 vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); 2482 vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); 2483 2484 vmx->segment_cache.bitmask = 0; 2485 } 2486 2487 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & 2488 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) { 
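		/*
		 * The fields below belong to the eVMCS GUEST_GRP1 clean-field
		 * group: SYSENTER state, pending debug exceptions, PDPTRs and
		 * BNDCFGS. The block is skipped when L1's eVMCS marks the
		 * group as clean, i.e. unchanged since the last VM-Entry.
		 */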
		vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
		vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
			    vmcs12->guest_pending_dbg_exceptions);
		vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
		vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);

		/*
		 * L1 may access L2's PDPTRs, so save them in order to
		 * construct vmcs12.
		 */
		if (enable_ept) {
			vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
			vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
			vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
			vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
		}

		if (kvm_mpx_supported() && vmx->nested.nested_run_pending &&
		    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
			vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
	}

	if (nested_cpu_has_xsaves(vmcs12))
		vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap);

	/*
	 * Whether page faults are trapped is determined by a combination of
	 * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. If L0
	 * doesn't care about page faults then we should set all of these to
	 * L1's desires. However, if L0 does care about (some) page faults, it
	 * is not easy (if at all possible?) to merge L0's and L1's desires, so
	 * we simply ask to exit on each and every L2 page fault. This is done
	 * by setting MASK=MATCH=0 and (see below) EB.PF=1.
	 * Note that below we don't need special code to set EB.PF beyond the
	 * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
	 * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
	 * !enable_ept, EB.PF is 1, so the "or" will always be 1.
	 */
	if (vmx_need_pf_intercept(&vmx->vcpu)) {
		/*
		 * TODO: if both L0 and L1 need the same MASK and MATCH,
		 * go ahead and use it?
		 */
		vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
		vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
	} else {
		vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, vmcs12->page_fault_error_code_mask);
		vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, vmcs12->page_fault_error_code_match);
	}

	if (cpu_has_vmx_apicv()) {
		vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0);
		vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1);
		vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2);
		vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3);
	}

	/*
	 * Make sure the msr_autostore list is up to date before we set the
	 * count in the vmcs02.
	 */
	prepare_vmx_msr_autostore_list(&vmx->vcpu, MSR_IA32_TSC);

	vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.guest.nr);
	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);

	set_cr4_guest_host_mask(vmx);
}

/*
 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
 * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2
 * guest in a way that is appropriate to both L1's requests and our needs.
 * In addition to modifying the active vmcs (which is vmcs02), this function
 * also has necessary side effects, like setting various vcpu->arch fields.
 * Returns 0 on success, -EINVAL on failure.
Invalid state exit qualification code 2568 * is assigned to entry_failure_code on failure. 2569 */ 2570 static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, 2571 bool from_vmentry, 2572 enum vm_entry_failure_code *entry_failure_code) 2573 { 2574 struct vcpu_vmx *vmx = to_vmx(vcpu); 2575 struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); 2576 bool load_guest_pdptrs_vmcs12 = false; 2577 2578 if (vmx->nested.dirty_vmcs12 || nested_vmx_is_evmptr12_valid(vmx)) { 2579 prepare_vmcs02_rare(vmx, vmcs12); 2580 vmx->nested.dirty_vmcs12 = false; 2581 2582 load_guest_pdptrs_vmcs12 = !nested_vmx_is_evmptr12_valid(vmx) || 2583 !(evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1); 2584 } 2585 2586 if (vmx->nested.nested_run_pending && 2587 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) { 2588 kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); 2589 vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl); 2590 } else { 2591 kvm_set_dr(vcpu, 7, vcpu->arch.dr7); 2592 vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.pre_vmenter_debugctl); 2593 } 2594 if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending || 2595 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) 2596 vmcs_write64(GUEST_BNDCFGS, vmx->nested.pre_vmenter_bndcfgs); 2597 vmx_set_rflags(vcpu, vmcs12->guest_rflags); 2598 2599 /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the 2600 * bitwise-or of what L1 wants to trap for L2, and what we want to 2601 * trap. Note that CR0.TS also needs updating - we do this later. 2602 */ 2603 vmx_update_exception_bitmap(vcpu); 2604 vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask; 2605 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); 2606 2607 if (vmx->nested.nested_run_pending && 2608 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) { 2609 vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat); 2610 vcpu->arch.pat = vmcs12->guest_ia32_pat; 2611 } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { 2612 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); 2613 } 2614 2615 vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset( 2616 vcpu->arch.l1_tsc_offset, 2617 vmx_get_l2_tsc_offset(vcpu), 2618 vmx_get_l2_tsc_multiplier(vcpu)); 2619 2620 vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier( 2621 vcpu->arch.l1_tsc_scaling_ratio, 2622 vmx_get_l2_tsc_multiplier(vcpu)); 2623 2624 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); 2625 if (kvm_caps.has_tsc_control) 2626 vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); 2627 2628 nested_vmx_transition_tlb_flush(vcpu, vmcs12, true); 2629 2630 if (nested_cpu_has_ept(vmcs12)) 2631 nested_ept_init_mmu_context(vcpu); 2632 2633 /* 2634 * Override the CR0/CR4 read shadows after setting the effective guest 2635 * CR0/CR4. The common helpers also set the shadows, but they don't 2636 * account for vmcs12's cr0/4_guest_host_mask. 2637 */ 2638 vmx_set_cr0(vcpu, vmcs12->guest_cr0); 2639 vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12)); 2640 2641 vmx_set_cr4(vcpu, vmcs12->guest_cr4); 2642 vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12)); 2643 2644 vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12); 2645 /* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */ 2646 vmx_set_efer(vcpu, vcpu->arch.efer); 2647 2648 /* 2649 * Guest state is invalid and unrestricted guest is disabled, 2650 * which means L1 attempted VMEntry to L2 with invalid state. 2651 * Fail the VMEntry. 
	 *
	 * However, when force loading the guest state (on SMM exit or when
	 * loading nested state after migration), it is possible to have
	 * invalid guest state at this point; it will be fixed up later when
	 * L2's register state is restored.
	 */
	if (CC(from_vmentry && !vmx_guest_state_valid(vcpu))) {
		*entry_failure_code = ENTRY_FAIL_DEFAULT;
		return -EINVAL;
	}

	/* Load the guest's CR3, whether backed by EPT or by shadow page tables. */
	if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12),
				from_vmentry, entry_failure_code))
		return -EINVAL;

	/*
	 * Immediately write vmcs02.GUEST_CR3. It will be propagated to vmcs12
	 * on nested VM-Exit, which can occur without actually running L2 and
	 * thus without hitting vmx_load_mmu_pgd(), e.g. if L1 is entering L2 with
	 * vmcs12.GUEST_ACTIVITYSTATE=HLT, in which case KVM will intercept the
	 * transition to HLT instead of running L2.
	 */
	if (enable_ept)
		vmcs_writel(GUEST_CR3, vmcs12->guest_cr3);

	/* Late preparation of GUEST_PDPTRs now that EFER and CRs are set. */
	if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) &&
	    is_pae_paging(vcpu)) {
		vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
		vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
		vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
		vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
	}

	if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
	    kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)) &&
	    WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
				     vmcs12->guest_ia32_perf_global_ctrl))) {
		*entry_failure_code = ENTRY_FAIL_DEFAULT;
		return -EINVAL;
	}

	kvm_rsp_write(vcpu, vmcs12->guest_rsp);
	kvm_rip_write(vcpu, vmcs12->guest_rip);

	/*
	 * It was observed that genuine Hyper-V running in L1 doesn't reset
	 * 'hv_clean_fields' by itself; it only sets the corresponding dirty
	 * bits when it changes a field in the eVMCS. Mark all fields as clean
	 * here.
	 */
	if (nested_vmx_is_evmptr12_valid(vmx))
		evmcs->hv_clean_fields |= HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;

	return 0;
}

static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12)
{
	if (CC(!nested_cpu_has_nmi_exiting(vmcs12) &&
	       nested_cpu_has_virtual_nmis(vmcs12)))
		return -EINVAL;

	if (CC(!nested_cpu_has_virtual_nmis(vmcs12) &&
	       nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING)))
		return -EINVAL;

	return 0;
}

static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/* Check for memory type validity. */
	switch (new_eptp & VMX_EPTP_MT_MASK) {
	case VMX_EPTP_MT_UC:
		if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT)))
			return false;
		break;
	case VMX_EPTP_MT_WB:
		if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT)))
			return false;
		break;
	default:
		return false;
	}

	/* Page-walk levels validity.
*/ 2742 switch (new_eptp & VMX_EPTP_PWL_MASK) { 2743 case VMX_EPTP_PWL_5: 2744 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_5_BIT))) 2745 return false; 2746 break; 2747 case VMX_EPTP_PWL_4: 2748 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_4_BIT))) 2749 return false; 2750 break; 2751 default: 2752 return false; 2753 } 2754 2755 /* Reserved bits should not be set */ 2756 if (CC(!kvm_vcpu_is_legal_gpa(vcpu, new_eptp) || ((new_eptp >> 7) & 0x1f))) 2757 return false; 2758 2759 /* AD, if set, should be supported */ 2760 if (new_eptp & VMX_EPTP_AD_ENABLE_BIT) { 2761 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT))) 2762 return false; 2763 } 2764 2765 return true; 2766 } 2767 2768 /* 2769 * Checks related to VM-Execution Control Fields 2770 */ 2771 static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu, 2772 struct vmcs12 *vmcs12) 2773 { 2774 struct vcpu_vmx *vmx = to_vmx(vcpu); 2775 2776 if (CC(!vmx_control_verify(vmcs12->pin_based_vm_exec_control, 2777 vmx->nested.msrs.pinbased_ctls_low, 2778 vmx->nested.msrs.pinbased_ctls_high)) || 2779 CC(!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, 2780 vmx->nested.msrs.procbased_ctls_low, 2781 vmx->nested.msrs.procbased_ctls_high))) 2782 return -EINVAL; 2783 2784 if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && 2785 CC(!vmx_control_verify(vmcs12->secondary_vm_exec_control, 2786 vmx->nested.msrs.secondary_ctls_low, 2787 vmx->nested.msrs.secondary_ctls_high))) 2788 return -EINVAL; 2789 2790 if (CC(vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) || 2791 nested_vmx_check_io_bitmap_controls(vcpu, vmcs12) || 2792 nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12) || 2793 nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12) || 2794 nested_vmx_check_apic_access_controls(vcpu, vmcs12) || 2795 nested_vmx_check_apicv_controls(vcpu, vmcs12) || 2796 nested_vmx_check_nmi_controls(vmcs12) || 2797 nested_vmx_check_pml_controls(vcpu, vmcs12) || 2798 nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12) || 2799 nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12) || 2800 nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12) || 2801 CC(nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id)) 2802 return -EINVAL; 2803 2804 if (!nested_cpu_has_preemption_timer(vmcs12) && 2805 nested_cpu_has_save_preemption_timer(vmcs12)) 2806 return -EINVAL; 2807 2808 if (nested_cpu_has_ept(vmcs12) && 2809 CC(!nested_vmx_check_eptp(vcpu, vmcs12->ept_pointer))) 2810 return -EINVAL; 2811 2812 if (nested_cpu_has_vmfunc(vmcs12)) { 2813 if (CC(vmcs12->vm_function_control & 2814 ~vmx->nested.msrs.vmfunc_controls)) 2815 return -EINVAL; 2816 2817 if (nested_cpu_has_eptp_switching(vmcs12)) { 2818 if (CC(!nested_cpu_has_ept(vmcs12)) || 2819 CC(!page_address_valid(vcpu, vmcs12->eptp_list_address))) 2820 return -EINVAL; 2821 } 2822 } 2823 2824 return 0; 2825 } 2826 2827 /* 2828 * Checks related to VM-Exit Control Fields 2829 */ 2830 static int nested_check_vm_exit_controls(struct kvm_vcpu *vcpu, 2831 struct vmcs12 *vmcs12) 2832 { 2833 struct vcpu_vmx *vmx = to_vmx(vcpu); 2834 2835 if (CC(!vmx_control_verify(vmcs12->vm_exit_controls, 2836 vmx->nested.msrs.exit_ctls_low, 2837 vmx->nested.msrs.exit_ctls_high)) || 2838 CC(nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12))) 2839 return -EINVAL; 2840 2841 return 0; 2842 } 2843 2844 /* 2845 * Checks related to VM-Entry Control Fields 2846 */ 2847 static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu, 2848 struct vmcs12 *vmcs12) 2849 { 2850 struct 
vcpu_vmx *vmx = to_vmx(vcpu); 2851 2852 if (CC(!vmx_control_verify(vmcs12->vm_entry_controls, 2853 vmx->nested.msrs.entry_ctls_low, 2854 vmx->nested.msrs.entry_ctls_high))) 2855 return -EINVAL; 2856 2857 /* 2858 * From the Intel SDM, volume 3: 2859 * Fields relevant to VM-entry event injection must be set properly. 2860 * These fields are the VM-entry interruption-information field, the 2861 * VM-entry exception error code, and the VM-entry instruction length. 2862 */ 2863 if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) { 2864 u32 intr_info = vmcs12->vm_entry_intr_info_field; 2865 u8 vector = intr_info & INTR_INFO_VECTOR_MASK; 2866 u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK; 2867 bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK; 2868 bool should_have_error_code; 2869 bool urg = nested_cpu_has2(vmcs12, 2870 SECONDARY_EXEC_UNRESTRICTED_GUEST); 2871 bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE; 2872 2873 /* VM-entry interruption-info field: interruption type */ 2874 if (CC(intr_type == INTR_TYPE_RESERVED) || 2875 CC(intr_type == INTR_TYPE_OTHER_EVENT && 2876 !nested_cpu_supports_monitor_trap_flag(vcpu))) 2877 return -EINVAL; 2878 2879 /* VM-entry interruption-info field: vector */ 2880 if (CC(intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) || 2881 CC(intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) || 2882 CC(intr_type == INTR_TYPE_OTHER_EVENT && vector != 0)) 2883 return -EINVAL; 2884 2885 /* VM-entry interruption-info field: deliver error code */ 2886 should_have_error_code = 2887 intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode && 2888 x86_exception_has_error_code(vector); 2889 if (CC(has_error_code != should_have_error_code)) 2890 return -EINVAL; 2891 2892 /* VM-entry exception error code */ 2893 if (CC(has_error_code && 2894 vmcs12->vm_entry_exception_error_code & GENMASK(31, 16))) 2895 return -EINVAL; 2896 2897 /* VM-entry interruption-info field: reserved bits */ 2898 if (CC(intr_info & INTR_INFO_RESVD_BITS_MASK)) 2899 return -EINVAL; 2900 2901 /* VM-entry instruction length */ 2902 switch (intr_type) { 2903 case INTR_TYPE_SOFT_EXCEPTION: 2904 case INTR_TYPE_SOFT_INTR: 2905 case INTR_TYPE_PRIV_SW_EXCEPTION: 2906 if (CC(vmcs12->vm_entry_instruction_len > 15) || 2907 CC(vmcs12->vm_entry_instruction_len == 0 && 2908 CC(!nested_cpu_has_zero_length_injection(vcpu)))) 2909 return -EINVAL; 2910 } 2911 } 2912 2913 if (nested_vmx_check_entry_msr_switch_controls(vcpu, vmcs12)) 2914 return -EINVAL; 2915 2916 return 0; 2917 } 2918 2919 static int nested_vmx_check_controls(struct kvm_vcpu *vcpu, 2920 struct vmcs12 *vmcs12) 2921 { 2922 if (nested_check_vm_execution_controls(vcpu, vmcs12) || 2923 nested_check_vm_exit_controls(vcpu, vmcs12) || 2924 nested_check_vm_entry_controls(vcpu, vmcs12)) 2925 return -EINVAL; 2926 2927 #ifdef CONFIG_KVM_HYPERV 2928 if (guest_cpuid_has_evmcs(vcpu)) 2929 return nested_evmcs_check_controls(vmcs12); 2930 #endif 2931 2932 return 0; 2933 } 2934 2935 static int nested_vmx_check_address_space_size(struct kvm_vcpu *vcpu, 2936 struct vmcs12 *vmcs12) 2937 { 2938 #ifdef CONFIG_X86_64 2939 if (CC(!!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) != 2940 !!(vcpu->arch.efer & EFER_LMA))) 2941 return -EINVAL; 2942 #endif 2943 return 0; 2944 } 2945 2946 static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, 2947 struct vmcs12 *vmcs12) 2948 { 2949 bool ia32e = !!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE); 2950 2951 if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) || 2952 
CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) || 2953 CC(!kvm_vcpu_is_legal_cr3(vcpu, vmcs12->host_cr3))) 2954 return -EINVAL; 2955 2956 if (CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu)) || 2957 CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu))) 2958 return -EINVAL; 2959 2960 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) && 2961 CC(!kvm_pat_valid(vmcs12->host_ia32_pat))) 2962 return -EINVAL; 2963 2964 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) && 2965 CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu), 2966 vmcs12->host_ia32_perf_global_ctrl))) 2967 return -EINVAL; 2968 2969 if (ia32e) { 2970 if (CC(!(vmcs12->host_cr4 & X86_CR4_PAE))) 2971 return -EINVAL; 2972 } else { 2973 if (CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) || 2974 CC(vmcs12->host_cr4 & X86_CR4_PCIDE) || 2975 CC((vmcs12->host_rip) >> 32)) 2976 return -EINVAL; 2977 } 2978 2979 if (CC(vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2980 CC(vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2981 CC(vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2982 CC(vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2983 CC(vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2984 CC(vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2985 CC(vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2986 CC(vmcs12->host_cs_selector == 0) || 2987 CC(vmcs12->host_tr_selector == 0) || 2988 CC(vmcs12->host_ss_selector == 0 && !ia32e)) 2989 return -EINVAL; 2990 2991 if (CC(is_noncanonical_address(vmcs12->host_fs_base, vcpu)) || 2992 CC(is_noncanonical_address(vmcs12->host_gs_base, vcpu)) || 2993 CC(is_noncanonical_address(vmcs12->host_gdtr_base, vcpu)) || 2994 CC(is_noncanonical_address(vmcs12->host_idtr_base, vcpu)) || 2995 CC(is_noncanonical_address(vmcs12->host_tr_base, vcpu)) || 2996 CC(is_noncanonical_address(vmcs12->host_rip, vcpu))) 2997 return -EINVAL; 2998 2999 /* 3000 * If the load IA32_EFER VM-exit control is 1, bits reserved in the 3001 * IA32_EFER MSR must be 0 in the field for that register. In addition, 3002 * the values of the LMA and LME bits in the field must each be that of 3003 * the host address-space size VM-exit control. 
3004 */ 3005 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) { 3006 if (CC(!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer)) || 3007 CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA)) || 3008 CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))) 3009 return -EINVAL; 3010 } 3011 3012 return 0; 3013 } 3014 3015 static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, 3016 struct vmcs12 *vmcs12) 3017 { 3018 struct vcpu_vmx *vmx = to_vmx(vcpu); 3019 struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache; 3020 struct vmcs_hdr hdr; 3021 3022 if (vmcs12->vmcs_link_pointer == INVALID_GPA) 3023 return 0; 3024 3025 if (CC(!page_address_valid(vcpu, vmcs12->vmcs_link_pointer))) 3026 return -EINVAL; 3027 3028 if (ghc->gpa != vmcs12->vmcs_link_pointer && 3029 CC(kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, 3030 vmcs12->vmcs_link_pointer, VMCS12_SIZE))) 3031 return -EINVAL; 3032 3033 if (CC(kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr, 3034 offsetof(struct vmcs12, hdr), 3035 sizeof(hdr)))) 3036 return -EINVAL; 3037 3038 if (CC(hdr.revision_id != VMCS12_REVISION) || 3039 CC(hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12))) 3040 return -EINVAL; 3041 3042 return 0; 3043 } 3044 3045 /* 3046 * Checks related to Guest Non-register State 3047 */ 3048 static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12) 3049 { 3050 if (CC(vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && 3051 vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT && 3052 vmcs12->guest_activity_state != GUEST_ACTIVITY_WAIT_SIPI)) 3053 return -EINVAL; 3054 3055 return 0; 3056 } 3057 3058 static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, 3059 struct vmcs12 *vmcs12, 3060 enum vm_entry_failure_code *entry_failure_code) 3061 { 3062 bool ia32e = !!(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE); 3063 3064 *entry_failure_code = ENTRY_FAIL_DEFAULT; 3065 3066 if (CC(!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0)) || 3067 CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))) 3068 return -EINVAL; 3069 3070 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) && 3071 CC(!kvm_dr7_valid(vmcs12->guest_dr7))) 3072 return -EINVAL; 3073 3074 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) && 3075 CC(!kvm_pat_valid(vmcs12->guest_ia32_pat))) 3076 return -EINVAL; 3077 3078 if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) { 3079 *entry_failure_code = ENTRY_FAIL_VMCS_LINK_PTR; 3080 return -EINVAL; 3081 } 3082 3083 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && 3084 CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu), 3085 vmcs12->guest_ia32_perf_global_ctrl))) 3086 return -EINVAL; 3087 3088 if (CC((vmcs12->guest_cr0 & (X86_CR0_PG | X86_CR0_PE)) == X86_CR0_PG)) 3089 return -EINVAL; 3090 3091 if (CC(ia32e && !(vmcs12->guest_cr4 & X86_CR4_PAE)) || 3092 CC(ia32e && !(vmcs12->guest_cr0 & X86_CR0_PG))) 3093 return -EINVAL; 3094 3095 /* 3096 * If the load IA32_EFER VM-entry control is 1, the following checks 3097 * are performed on the field for the IA32_EFER MSR: 3098 * - Bits reserved in the IA32_EFER MSR must be 0. 3099 * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of 3100 * the IA-32e mode guest VM-exit control. It must also be identical 3101 * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to 3102 * CR0.PG) is 1. 
3103 */ 3104 if (to_vmx(vcpu)->nested.nested_run_pending && 3105 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) { 3106 if (CC(!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer)) || 3107 CC(ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA)) || 3108 CC(((vmcs12->guest_cr0 & X86_CR0_PG) && 3109 ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME)))) 3110 return -EINVAL; 3111 } 3112 3113 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) && 3114 (CC(is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) || 3115 CC((vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD)))) 3116 return -EINVAL; 3117 3118 if (nested_check_guest_non_reg_state(vmcs12)) 3119 return -EINVAL; 3120 3121 return 0; 3122 } 3123 3124 static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) 3125 { 3126 struct vcpu_vmx *vmx = to_vmx(vcpu); 3127 unsigned long cr3, cr4; 3128 bool vm_fail; 3129 3130 if (!nested_early_check) 3131 return 0; 3132 3133 if (vmx->msr_autoload.host.nr) 3134 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); 3135 if (vmx->msr_autoload.guest.nr) 3136 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); 3137 3138 preempt_disable(); 3139 3140 vmx_prepare_switch_to_guest(vcpu); 3141 3142 /* 3143 * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS, 3144 * which is reserved to '1' by hardware. GUEST_RFLAGS is guaranteed to 3145 * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e. 3146 * there is no need to preserve other bits or save/restore the field. 3147 */ 3148 vmcs_writel(GUEST_RFLAGS, 0); 3149 3150 cr3 = __get_current_cr3_fast(); 3151 if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { 3152 vmcs_writel(HOST_CR3, cr3); 3153 vmx->loaded_vmcs->host_state.cr3 = cr3; 3154 } 3155 3156 cr4 = cr4_read_shadow(); 3157 if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { 3158 vmcs_writel(HOST_CR4, cr4); 3159 vmx->loaded_vmcs->host_state.cr4 = cr4; 3160 } 3161 3162 vm_fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs, 3163 __vmx_vcpu_run_flags(vmx)); 3164 3165 if (vmx->msr_autoload.host.nr) 3166 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 3167 if (vmx->msr_autoload.guest.nr) 3168 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 3169 3170 if (vm_fail) { 3171 u32 error = vmcs_read32(VM_INSTRUCTION_ERROR); 3172 3173 preempt_enable(); 3174 3175 trace_kvm_nested_vmenter_failed( 3176 "early hardware check VM-instruction error: ", error); 3177 WARN_ON_ONCE(error != VMXERR_ENTRY_INVALID_CONTROL_FIELD); 3178 return 1; 3179 } 3180 3181 /* 3182 * VMExit clears RFLAGS.IF and DR7, even on a consistency check. 3183 */ 3184 if (hw_breakpoint_active()) 3185 set_debugreg(__this_cpu_read(cpu_dr7), 7); 3186 local_irq_enable(); 3187 preempt_enable(); 3188 3189 /* 3190 * A non-failing VMEntry means we somehow entered guest mode with 3191 * an illegal RIP, and that's just the tip of the iceberg. There 3192 * is no telling what memory has been modified or what state has 3193 * been exposed to unknown code. Hitting this all but guarantees 3194 * a (very critical) hardware issue. 3195 */ 3196 WARN_ON(!(vmcs_read32(VM_EXIT_REASON) & 3197 VMX_EXIT_REASONS_FAILED_VMENTRY)); 3198 3199 return 0; 3200 } 3201 3202 #ifdef CONFIG_KVM_HYPERV 3203 static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu) 3204 { 3205 struct vcpu_vmx *vmx = to_vmx(vcpu); 3206 3207 /* 3208 * hv_evmcs may end up being not mapped after migration (when 3209 * L2 was running), map it here to make sure vmcs12 changes are 3210 * properly reflected. 
3211 */ 3212 if (guest_cpuid_has_evmcs(vcpu) && 3213 vmx->nested.hv_evmcs_vmptr == EVMPTR_MAP_PENDING) { 3214 enum nested_evmptrld_status evmptrld_status = 3215 nested_vmx_handle_enlightened_vmptrld(vcpu, false); 3216 3217 if (evmptrld_status == EVMPTRLD_VMFAIL || 3218 evmptrld_status == EVMPTRLD_ERROR) 3219 return false; 3220 3221 /* 3222 * Post migration VMCS12 always provides the most actual 3223 * information, copy it to eVMCS upon entry. 3224 */ 3225 vmx->nested.need_vmcs12_to_shadow_sync = true; 3226 } 3227 3228 return true; 3229 } 3230 #endif 3231 3232 static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) 3233 { 3234 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3235 struct vcpu_vmx *vmx = to_vmx(vcpu); 3236 struct kvm_host_map *map; 3237 3238 if (!vcpu->arch.pdptrs_from_userspace && 3239 !nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { 3240 /* 3241 * Reload the guest's PDPTRs since after a migration 3242 * the guest CR3 might be restored prior to setting the nested 3243 * state which can lead to a load of wrong PDPTRs. 3244 */ 3245 if (CC(!load_pdptrs(vcpu, vcpu->arch.cr3))) 3246 return false; 3247 } 3248 3249 3250 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { 3251 map = &vmx->nested.apic_access_page_map; 3252 3253 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->apic_access_addr), map)) { 3254 vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(map->pfn)); 3255 } else { 3256 pr_debug_ratelimited("%s: no backing for APIC-access address in vmcs12\n", 3257 __func__); 3258 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3259 vcpu->run->internal.suberror = 3260 KVM_INTERNAL_ERROR_EMULATION; 3261 vcpu->run->internal.ndata = 0; 3262 return false; 3263 } 3264 } 3265 3266 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { 3267 map = &vmx->nested.virtual_apic_map; 3268 3269 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) { 3270 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn)); 3271 } else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) && 3272 nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) && 3273 !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { 3274 /* 3275 * The processor will never use the TPR shadow, simply 3276 * clear the bit from the execution control. Such a 3277 * configuration is useless, but it happens in tests. 3278 * For any other configuration, failing the vm entry is 3279 * _not_ what the processor does but it's basically the 3280 * only possibility we have. 3281 */ 3282 exec_controls_clearbit(vmx, CPU_BASED_TPR_SHADOW); 3283 } else { 3284 /* 3285 * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to 3286 * force VM-Entry to fail. 3287 */ 3288 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, INVALID_GPA); 3289 } 3290 } 3291 3292 if (nested_cpu_has_posted_intr(vmcs12)) { 3293 map = &vmx->nested.pi_desc_map; 3294 3295 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) { 3296 vmx->nested.pi_desc = 3297 (struct pi_desc *)(((void *)map->hva) + 3298 offset_in_page(vmcs12->posted_intr_desc_addr)); 3299 vmcs_write64(POSTED_INTR_DESC_ADDR, 3300 pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr)); 3301 } else { 3302 /* 3303 * Defer the KVM_INTERNAL_EXIT until KVM tries to 3304 * access the contents of the VMCS12 posted interrupt 3305 * descriptor. (Note that KVM may do this when it 3306 * should not, per the architectural specification.) 
3307 */ 3308 vmx->nested.pi_desc = NULL; 3309 pin_controls_clearbit(vmx, PIN_BASED_POSTED_INTR); 3310 } 3311 } 3312 if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12)) 3313 exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS); 3314 else 3315 exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS); 3316 3317 return true; 3318 } 3319 3320 static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu) 3321 { 3322 #ifdef CONFIG_KVM_HYPERV 3323 /* 3324 * Note: nested_get_evmcs_page() also updates 'vp_assist_page' copy 3325 * in 'struct kvm_vcpu_hv' in case eVMCS is in use, this is mandatory 3326 * to make nested_evmcs_l2_tlb_flush_enabled() work correctly post 3327 * migration. 3328 */ 3329 if (!nested_get_evmcs_page(vcpu)) { 3330 pr_debug_ratelimited("%s: enlightened vmptrld failed\n", 3331 __func__); 3332 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3333 vcpu->run->internal.suberror = 3334 KVM_INTERNAL_ERROR_EMULATION; 3335 vcpu->run->internal.ndata = 0; 3336 3337 return false; 3338 } 3339 #endif 3340 3341 if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu)) 3342 return false; 3343 3344 return true; 3345 } 3346 3347 static int nested_vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa) 3348 { 3349 struct vmcs12 *vmcs12; 3350 struct vcpu_vmx *vmx = to_vmx(vcpu); 3351 gpa_t dst; 3352 3353 if (WARN_ON_ONCE(!is_guest_mode(vcpu))) 3354 return 0; 3355 3356 if (WARN_ON_ONCE(vmx->nested.pml_full)) 3357 return 1; 3358 3359 /* 3360 * Check if PML is enabled for the nested guest. Whether eptp bit 6 is 3361 * set is already checked as part of A/D emulation. 3362 */ 3363 vmcs12 = get_vmcs12(vcpu); 3364 if (!nested_cpu_has_pml(vmcs12)) 3365 return 0; 3366 3367 if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) { 3368 vmx->nested.pml_full = true; 3369 return 1; 3370 } 3371 3372 gpa &= ~0xFFFull; 3373 dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index; 3374 3375 if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa, 3376 offset_in_page(dst), sizeof(gpa))) 3377 return 0; 3378 3379 vmcs12->guest_pml_index--; 3380 3381 return 0; 3382 } 3383 3384 /* 3385 * Intel's VMX Instruction Reference specifies a common set of prerequisites 3386 * for running VMX instructions (except VMXON, whose prerequisites are 3387 * slightly different). It also specifies what exception to inject otherwise. 3388 * Note that many of these exceptions have priority over VM exits, so they 3389 * don't have to be checked again here. 3390 */ 3391 static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) 3392 { 3393 if (!to_vmx(vcpu)->nested.vmxon) { 3394 kvm_queue_exception(vcpu, UD_VECTOR); 3395 return 0; 3396 } 3397 3398 if (vmx_get_cpl(vcpu)) { 3399 kvm_inject_gp(vcpu, 0); 3400 return 0; 3401 } 3402 3403 return 1; 3404 } 3405 3406 static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu) 3407 { 3408 u8 rvi = vmx_get_rvi(); 3409 u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI); 3410 3411 return ((rvi & 0xf0) > (vppr & 0xf0)); 3412 } 3413 3414 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, 3415 struct vmcs12 *vmcs12); 3416 3417 /* 3418 * If from_vmentry is false, this is being called from state restore (either RSM 3419 * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume. 
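 * In the !from_vmentry case the guest-state consistency checks, the early
 * hardware check and the VM-entry MSR-load list are skipped, and mapping
 * of the vmcs12 pages is deferred via KVM_REQ_GET_NESTED_STATE_PAGES.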
3420 * 3421 * Returns: 3422 * NVMX_VMENTRY_SUCCESS: Entered VMX non-root mode 3423 * NVMX_VMENTRY_VMFAIL: Consistency check VMFail 3424 * NVMX_VMENTRY_VMEXIT: Consistency check VMExit 3425 * NVMX_VMENTRY_KVM_INTERNAL_ERROR: KVM internal error 3426 */ 3427 enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, 3428 bool from_vmentry) 3429 { 3430 struct vcpu_vmx *vmx = to_vmx(vcpu); 3431 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3432 enum vm_entry_failure_code entry_failure_code; 3433 bool evaluate_pending_interrupts; 3434 union vmx_exit_reason exit_reason = { 3435 .basic = EXIT_REASON_INVALID_STATE, 3436 .failed_vmentry = 1, 3437 }; 3438 u32 failed_index; 3439 3440 trace_kvm_nested_vmenter(kvm_rip_read(vcpu), 3441 vmx->nested.current_vmptr, 3442 vmcs12->guest_rip, 3443 vmcs12->guest_intr_status, 3444 vmcs12->vm_entry_intr_info_field, 3445 vmcs12->secondary_vm_exec_control & SECONDARY_EXEC_ENABLE_EPT, 3446 vmcs12->ept_pointer, 3447 vmcs12->guest_cr3, 3448 KVM_ISA_VMX); 3449 3450 kvm_service_local_tlb_flush_requests(vcpu); 3451 3452 evaluate_pending_interrupts = exec_controls_get(vmx) & 3453 (CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING); 3454 if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu)) 3455 evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu); 3456 if (!evaluate_pending_interrupts) 3457 evaluate_pending_interrupts |= kvm_apic_has_pending_init_or_sipi(vcpu); 3458 3459 if (!vmx->nested.nested_run_pending || 3460 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) 3461 vmx->nested.pre_vmenter_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); 3462 if (kvm_mpx_supported() && 3463 (!vmx->nested.nested_run_pending || 3464 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) 3465 vmx->nested.pre_vmenter_bndcfgs = vmcs_read64(GUEST_BNDCFGS); 3466 3467 /* 3468 * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and* 3469 * nested early checks are disabled. In the event of a "late" VM-Fail, 3470 * i.e. a VM-Fail detected by hardware but not KVM, KVM must unwind its 3471 * software model to the pre-VMEntry host state. When EPT is disabled, 3472 * GUEST_CR3 holds KVM's shadow CR3, not L1's "real" CR3, which causes 3473 * nested_vmx_restore_host_state() to corrupt vcpu->arch.cr3. Stuffing 3474 * vmcs01.GUEST_CR3 results in the unwind naturally setting arch.cr3 to 3475 * the correct value. Smashing vmcs01.GUEST_CR3 is safe because nested 3476 * VM-Exits, and the unwind, reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is 3477 * guaranteed to be overwritten with a shadow CR3 prior to re-entering 3478 * L1. Don't stuff vmcs01.GUEST_CR3 when using nested early checks as 3479 * KVM modifies vcpu->arch.cr3 if and only if the early hardware checks 3480 * pass, and early VM-Fails do not reset KVM's MMU, i.e. the VM-Fail 3481 * path would need to manually save/restore vmcs01.GUEST_CR3. 
3482 */ 3483 if (!enable_ept && !nested_early_check) 3484 vmcs_writel(GUEST_CR3, vcpu->arch.cr3); 3485 3486 vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02); 3487 3488 prepare_vmcs02_early(vmx, &vmx->vmcs01, vmcs12); 3489 3490 if (from_vmentry) { 3491 if (unlikely(!nested_get_vmcs12_pages(vcpu))) { 3492 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 3493 return NVMX_VMENTRY_KVM_INTERNAL_ERROR; 3494 } 3495 3496 if (nested_vmx_check_vmentry_hw(vcpu)) { 3497 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 3498 return NVMX_VMENTRY_VMFAIL; 3499 } 3500 3501 if (nested_vmx_check_guest_state(vcpu, vmcs12, 3502 &entry_failure_code)) { 3503 exit_reason.basic = EXIT_REASON_INVALID_STATE; 3504 vmcs12->exit_qualification = entry_failure_code; 3505 goto vmentry_fail_vmexit; 3506 } 3507 } 3508 3509 enter_guest_mode(vcpu); 3510 3511 if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &entry_failure_code)) { 3512 exit_reason.basic = EXIT_REASON_INVALID_STATE; 3513 vmcs12->exit_qualification = entry_failure_code; 3514 goto vmentry_fail_vmexit_guest_mode; 3515 } 3516 3517 if (from_vmentry) { 3518 failed_index = nested_vmx_load_msr(vcpu, 3519 vmcs12->vm_entry_msr_load_addr, 3520 vmcs12->vm_entry_msr_load_count); 3521 if (failed_index) { 3522 exit_reason.basic = EXIT_REASON_MSR_LOAD_FAIL; 3523 vmcs12->exit_qualification = failed_index; 3524 goto vmentry_fail_vmexit_guest_mode; 3525 } 3526 } else { 3527 /* 3528 * The MMU is not initialized to point at the right entities yet and 3529 * "get pages" would need to read data from the guest (i.e. we will 3530 * need to perform gpa to hpa translation). Request a call 3531 * to nested_get_vmcs12_pages before the next VM-entry. The MSRs 3532 * have already been set at vmentry time and should not be reset. 3533 */ 3534 kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); 3535 } 3536 3537 /* 3538 * Re-evaluate pending events if L1 had a pending IRQ/NMI/INIT/SIPI 3539 * when it executed VMLAUNCH/VMRESUME, as entering non-root mode can 3540 * effectively unblock various events, e.g. INIT/SIPI cause VM-Exit 3541 * unconditionally. 3542 */ 3543 if (unlikely(evaluate_pending_interrupts)) 3544 kvm_make_request(KVM_REQ_EVENT, vcpu); 3545 3546 /* 3547 * Do not start the preemption timer hrtimer until after we know 3548 * we are successful, so that only nested_vmx_vmexit needs to cancel 3549 * the timer. 3550 */ 3551 vmx->nested.preemption_timer_expired = false; 3552 if (nested_cpu_has_preemption_timer(vmcs12)) { 3553 u64 timer_value = vmx_calc_preemption_timer_value(vcpu); 3554 vmx_start_preemption_timer(vcpu, timer_value); 3555 } 3556 3557 /* 3558 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point 3559 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet 3560 * returned as far as L1 is concerned. It will only return (and set 3561 * the success flag) when L2 exits (see nested_vmx_vmexit()). 3562 */ 3563 return NVMX_VMENTRY_SUCCESS; 3564 3565 /* 3566 * A failed consistency check that leads to a VMExit during L1's 3567 * VMEnter to L2 is a variation of a normal VMexit, as explained in 3568 * 26.7 "VM-entry failures during or after loading guest state". 
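 * On a true VM-Enter, the exit reason (with the failed-VM-entry bit set)
 * and the exit qualification are recorded in vmcs12, launch_state is left
 * untouched, and L1's host state is loaded directly rather than via
 * nested_vmx_vmexit().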
3569 */ 3570 vmentry_fail_vmexit_guest_mode: 3571 if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING) 3572 vcpu->arch.tsc_offset -= vmcs12->tsc_offset; 3573 leave_guest_mode(vcpu); 3574 3575 vmentry_fail_vmexit: 3576 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 3577 3578 if (!from_vmentry) 3579 return NVMX_VMENTRY_VMEXIT; 3580 3581 load_vmcs12_host_state(vcpu, vmcs12); 3582 vmcs12->vm_exit_reason = exit_reason.full; 3583 if (enable_shadow_vmcs || nested_vmx_is_evmptr12_valid(vmx)) 3584 vmx->nested.need_vmcs12_to_shadow_sync = true; 3585 return NVMX_VMENTRY_VMEXIT; 3586 } 3587 3588 /* 3589 * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1 3590 * for running an L2 nested guest. 3591 */ 3592 static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) 3593 { 3594 struct vmcs12 *vmcs12; 3595 enum nvmx_vmentry_status status; 3596 struct vcpu_vmx *vmx = to_vmx(vcpu); 3597 u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu); 3598 enum nested_evmptrld_status evmptrld_status; 3599 3600 if (!nested_vmx_check_permission(vcpu)) 3601 return 1; 3602 3603 evmptrld_status = nested_vmx_handle_enlightened_vmptrld(vcpu, launch); 3604 if (evmptrld_status == EVMPTRLD_ERROR) { 3605 kvm_queue_exception(vcpu, UD_VECTOR); 3606 return 1; 3607 } 3608 3609 kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED); 3610 3611 if (CC(evmptrld_status == EVMPTRLD_VMFAIL)) 3612 return nested_vmx_failInvalid(vcpu); 3613 3614 if (CC(!nested_vmx_is_evmptr12_valid(vmx) && 3615 vmx->nested.current_vmptr == INVALID_GPA)) 3616 return nested_vmx_failInvalid(vcpu); 3617 3618 vmcs12 = get_vmcs12(vcpu); 3619 3620 /* 3621 * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact 3622 * that there *is* a valid VMCS pointer, RFLAGS.CF is set 3623 * rather than RFLAGS.ZF, and no error number is stored to the 3624 * VM-instruction error field. 3625 */ 3626 if (CC(vmcs12->hdr.shadow_vmcs)) 3627 return nested_vmx_failInvalid(vcpu); 3628 3629 if (nested_vmx_is_evmptr12_valid(vmx)) { 3630 struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); 3631 3632 copy_enlightened_to_vmcs12(vmx, evmcs->hv_clean_fields); 3633 /* Enlightened VMCS doesn't have launch state */ 3634 vmcs12->launch_state = !launch; 3635 } else if (enable_shadow_vmcs) { 3636 copy_shadow_to_vmcs12(vmx); 3637 } 3638 3639 /* 3640 * The nested entry process starts with enforcing various prerequisites 3641 * on vmcs12 as required by the Intel SDM, and act appropriately when 3642 * they fail: As the SDM explains, some conditions should cause the 3643 * instruction to fail, while others will cause the instruction to seem 3644 * to succeed, but return an EXIT_REASON_INVALID_STATE. 3645 * To speed up the normal (success) code path, we should avoid checking 3646 * for misconfigurations which will anyway be caught by the processor 3647 * when using the merged vmcs02. 3648 */ 3649 if (CC(interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS)) 3650 return nested_vmx_fail(vcpu, VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS); 3651 3652 if (CC(vmcs12->launch_state == launch)) 3653 return nested_vmx_fail(vcpu, 3654 launch ? 
VMXERR_VMLAUNCH_NONCLEAR_VMCS 3655 : VMXERR_VMRESUME_NONLAUNCHED_VMCS); 3656 3657 if (nested_vmx_check_controls(vcpu, vmcs12)) 3658 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 3659 3660 if (nested_vmx_check_address_space_size(vcpu, vmcs12)) 3661 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); 3662 3663 if (nested_vmx_check_host_state(vcpu, vmcs12)) 3664 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); 3665 3666 /* 3667 * We're finally done with prerequisite checking, and can start with 3668 * the nested entry. 3669 */ 3670 vmx->nested.nested_run_pending = 1; 3671 vmx->nested.has_preemption_timer_deadline = false; 3672 status = nested_vmx_enter_non_root_mode(vcpu, true); 3673 if (unlikely(status != NVMX_VMENTRY_SUCCESS)) 3674 goto vmentry_failed; 3675 3676 /* Emulate processing of posted interrupts on VM-Enter. */ 3677 if (nested_cpu_has_posted_intr(vmcs12) && 3678 kvm_apic_has_interrupt(vcpu) == vmx->nested.posted_intr_nv) { 3679 vmx->nested.pi_pending = true; 3680 kvm_make_request(KVM_REQ_EVENT, vcpu); 3681 kvm_apic_clear_irr(vcpu, vmx->nested.posted_intr_nv); 3682 } 3683 3684 /* Hide L1D cache contents from the nested guest. */ 3685 vmx->vcpu.arch.l1tf_flush_l1d = true; 3686 3687 /* 3688 * Must happen outside of nested_vmx_enter_non_root_mode() as it will 3689 * also be used as part of restoring nVMX state for 3690 * snapshot restore (migration). 3691 * 3692 * In this flow, it is assumed that vmcs12 cache was 3693 * transferred as part of captured nVMX state and should 3694 * therefore not be read from guest memory (which may not 3695 * exist on destination host yet). 3696 */ 3697 nested_cache_shadow_vmcs12(vcpu, vmcs12); 3698 3699 switch (vmcs12->guest_activity_state) { 3700 case GUEST_ACTIVITY_HLT: 3701 /* 3702 * If we're entering a halted L2 vcpu and the L2 vcpu won't be 3703 * awakened by event injection or by an NMI-window VM-exit or 3704 * by an interrupt-window VM-exit, halt the vcpu. 3705 */ 3706 if (!(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) && 3707 !nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING) && 3708 !(nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING) && 3709 (vmcs12->guest_rflags & X86_EFLAGS_IF))) { 3710 vmx->nested.nested_run_pending = 0; 3711 return kvm_emulate_halt_noskip(vcpu); 3712 } 3713 break; 3714 case GUEST_ACTIVITY_WAIT_SIPI: 3715 vmx->nested.nested_run_pending = 0; 3716 vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; 3717 break; 3718 default: 3719 break; 3720 } 3721 3722 return 1; 3723 3724 vmentry_failed: 3725 vmx->nested.nested_run_pending = 0; 3726 if (status == NVMX_VMENTRY_KVM_INTERNAL_ERROR) 3727 return 0; 3728 if (status == NVMX_VMENTRY_VMEXIT) 3729 return 1; 3730 WARN_ON_ONCE(status != NVMX_VMENTRY_VMFAIL); 3731 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 3732 } 3733 3734 /* 3735 * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date 3736 * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK). 3737 * This function returns the new value we should put in vmcs12.guest_cr0. 3738 * It's not enough to just return the vmcs02 GUEST_CR0. Rather, 3739 * 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now 3740 * available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0 3741 * didn't trap the bit, because if L1 did, so would L0). 3742 * 2. Bits that L1 asked to trap (and therefore L0 also did) could not have 3743 * been modified by L2, and L1 knows it. 
So just leave the old value of 3744 * the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0 3745 * isn't relevant, because if L0 traps this bit it can set it to anything. 3746 * 3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have 3747 * changed these bits, and therefore they need to be updated, but L0 3748 * didn't necessarily allow them to be changed in GUEST_CR0 - and rather 3749 * put them in vmcs02 CR0_READ_SHADOW. So take these bits from there. 3750 */ 3751 static inline unsigned long 3752 vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 3753 { 3754 return 3755 /*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) | 3756 /*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) | 3757 /*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask | 3758 vcpu->arch.cr0_guest_owned_bits)); 3759 } 3760 3761 static inline unsigned long 3762 vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 3763 { 3764 return 3765 /*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) | 3766 /*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) | 3767 /*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask | 3768 vcpu->arch.cr4_guest_owned_bits)); 3769 } 3770 3771 static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, 3772 struct vmcs12 *vmcs12, 3773 u32 vm_exit_reason, u32 exit_intr_info) 3774 { 3775 u32 idt_vectoring; 3776 unsigned int nr; 3777 3778 /* 3779 * Per the SDM, VM-Exits due to double and triple faults are never 3780 * considered to occur during event delivery, even if the double/triple 3781 * fault is the result of an escalating vectoring issue. 3782 * 3783 * Note, the SDM qualifies the double fault behavior with "The original 3784 * event results in a double-fault exception". It's unclear why the 3785 * qualification exists since exits due to double fault can occur only 3786 * while vectoring a different exception (injected events are never 3787 * subject to interception), i.e. there's _always_ an original event. 3788 * 3789 * The SDM also uses NMI as a confusing example for the "original event 3790 * causes the VM exit directly" clause. NMI isn't special in any way, 3791 * the same rule applies to all events that cause an exit directly. 3792 * NMI is an odd choice for the example because NMIs can only occur on 3793 * instruction boundaries, i.e. they _can't_ occur during vectoring. 
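 * Aside from those carve-outs, mirror whichever event was actually
 * injected (not merely pending) into idt_vectoring_info_field, checking
 * exceptions first, then NMI, then external/soft interrupts.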
3794 */ 3795 if ((u16)vm_exit_reason == EXIT_REASON_TRIPLE_FAULT || 3796 ((u16)vm_exit_reason == EXIT_REASON_EXCEPTION_NMI && 3797 is_double_fault(exit_intr_info))) { 3798 vmcs12->idt_vectoring_info_field = 0; 3799 } else if (vcpu->arch.exception.injected) { 3800 nr = vcpu->arch.exception.vector; 3801 idt_vectoring = nr | VECTORING_INFO_VALID_MASK; 3802 3803 if (kvm_exception_is_soft(nr)) { 3804 vmcs12->vm_exit_instruction_len = 3805 vcpu->arch.event_exit_inst_len; 3806 idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION; 3807 } else 3808 idt_vectoring |= INTR_TYPE_HARD_EXCEPTION; 3809 3810 if (vcpu->arch.exception.has_error_code) { 3811 idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK; 3812 vmcs12->idt_vectoring_error_code = 3813 vcpu->arch.exception.error_code; 3814 } 3815 3816 vmcs12->idt_vectoring_info_field = idt_vectoring; 3817 } else if (vcpu->arch.nmi_injected) { 3818 vmcs12->idt_vectoring_info_field = 3819 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR; 3820 } else if (vcpu->arch.interrupt.injected) { 3821 nr = vcpu->arch.interrupt.nr; 3822 idt_vectoring = nr | VECTORING_INFO_VALID_MASK; 3823 3824 if (vcpu->arch.interrupt.soft) { 3825 idt_vectoring |= INTR_TYPE_SOFT_INTR; 3826 vmcs12->vm_entry_instruction_len = 3827 vcpu->arch.event_exit_inst_len; 3828 } else 3829 idt_vectoring |= INTR_TYPE_EXT_INTR; 3830 3831 vmcs12->idt_vectoring_info_field = idt_vectoring; 3832 } else { 3833 vmcs12->idt_vectoring_info_field = 0; 3834 } 3835 } 3836 3837 3838 void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu) 3839 { 3840 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3841 gfn_t gfn; 3842 3843 /* 3844 * Don't need to mark the APIC access page dirty; it is never 3845 * written to by the CPU during APIC virtualization. 3846 */ 3847 3848 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { 3849 gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT; 3850 kvm_vcpu_mark_page_dirty(vcpu, gfn); 3851 } 3852 3853 if (nested_cpu_has_posted_intr(vmcs12)) { 3854 gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT; 3855 kvm_vcpu_mark_page_dirty(vcpu, gfn); 3856 } 3857 } 3858 3859 static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) 3860 { 3861 struct vcpu_vmx *vmx = to_vmx(vcpu); 3862 int max_irr; 3863 void *vapic_page; 3864 u16 status; 3865 3866 if (!vmx->nested.pi_pending) 3867 return 0; 3868 3869 if (!vmx->nested.pi_desc) 3870 goto mmio_needed; 3871 3872 vmx->nested.pi_pending = false; 3873 3874 if (!pi_test_and_clear_on(vmx->nested.pi_desc)) 3875 return 0; 3876 3877 max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256); 3878 if (max_irr != 256) { 3879 vapic_page = vmx->nested.virtual_apic_map.hva; 3880 if (!vapic_page) 3881 goto mmio_needed; 3882 3883 __kvm_apic_update_irr(vmx->nested.pi_desc->pir, 3884 vapic_page, &max_irr); 3885 status = vmcs_read16(GUEST_INTR_STATUS); 3886 if ((u8)max_irr > ((u8)status & 0xff)) { 3887 status &= ~0xff; 3888 status |= (u8)max_irr; 3889 vmcs_write16(GUEST_INTR_STATUS, status); 3890 } 3891 } 3892 3893 nested_mark_vmcs12_pages_dirty(vcpu); 3894 return 0; 3895 3896 mmio_needed: 3897 kvm_handle_memory_failure(vcpu, X86EMUL_IO_NEEDED, NULL); 3898 return -ENXIO; 3899 } 3900 3901 static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu) 3902 { 3903 struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit; 3904 u32 intr_info = ex->vector | INTR_INFO_VALID_MASK; 3905 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3906 unsigned long exit_qual; 3907 3908 if (ex->has_payload) { 3909 exit_qual = ex->payload; 3910 } else if (ex->vector == 
PF_VECTOR) { 3911 exit_qual = vcpu->arch.cr2; 3912 } else if (ex->vector == DB_VECTOR) { 3913 exit_qual = vcpu->arch.dr6; 3914 exit_qual &= ~DR6_BT; 3915 exit_qual ^= DR6_ACTIVE_LOW; 3916 } else { 3917 exit_qual = 0; 3918 } 3919 3920 /* 3921 * Unlike AMD's Paged Real Mode, which reports an error code on #PF 3922 * VM-Exits even if the CPU is in Real Mode, Intel VMX never sets the 3923 * "has error code" flags on VM-Exit if the CPU is in Real Mode. 3924 */ 3925 if (ex->has_error_code && is_protmode(vcpu)) { 3926 /* 3927 * Intel CPUs do not generate error codes with bits 31:16 set, 3928 * and more importantly VMX disallows setting bits 31:16 in the 3929 * injected error code for VM-Entry. Drop the bits to mimic 3930 * hardware and avoid inducing failure on nested VM-Entry if L1 3931 * chooses to inject the exception back to L2. AMD CPUs _do_ 3932 * generate "full" 32-bit error codes, so KVM allows userspace 3933 * to inject exception error codes with bits 31:16 set. 3934 */ 3935 vmcs12->vm_exit_intr_error_code = (u16)ex->error_code; 3936 intr_info |= INTR_INFO_DELIVER_CODE_MASK; 3937 } 3938 3939 if (kvm_exception_is_soft(ex->vector)) 3940 intr_info |= INTR_TYPE_SOFT_EXCEPTION; 3941 else 3942 intr_info |= INTR_TYPE_HARD_EXCEPTION; 3943 3944 if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) && 3945 vmx_get_nmi_mask(vcpu)) 3946 intr_info |= INTR_INFO_UNBLOCK_NMI; 3947 3948 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual); 3949 } 3950 3951 /* 3952 * Returns true if a debug trap is (likely) pending delivery. Infer the class 3953 * of a #DB (trap-like vs. fault-like) from the exception payload (to-be-DR6). 3954 * Using the payload is flawed because code breakpoints (fault-like) and data 3955 * breakpoints (trap-like) set the same bits in DR6 (breakpoint detected), i.e. 3956 * this will return false positives if a to-be-injected code breakpoint #DB is 3957 * pending (from KVM's perspective, but not "pending" across an instruction 3958 * boundary). ICEBP, a.k.a. INT1, is also not reflected here even though it 3959 * too is trap-like. 3960 * 3961 * KVM "works" despite these flaws as ICEBP isn't currently supported by the 3962 * emulator, Monitor Trap Flag is not marked pending on intercepted #DBs (the 3963 * #DB has already happened), and MTF isn't marked pending on code breakpoints 3964 * from the emulator (because such #DBs are fault-like and thus don't trigger 3965 * actions that fire on instruction retire). 3966 */ 3967 static unsigned long vmx_get_pending_dbg_trap(struct kvm_queued_exception *ex) 3968 { 3969 if (!ex->pending || ex->vector != DB_VECTOR) 3970 return 0; 3971 3972 /* General Detect #DBs are always fault-like. */ 3973 return ex->payload & ~DR6_BD; 3974 } 3975 3976 /* 3977 * Returns true if there's a pending #DB exception that is lower priority than 3978 * a pending Monitor Trap Flag VM-Exit. TSS T-flag #DBs are not emulated by 3979 * KVM, but could theoretically be injected by userspace. Note, this code is 3980 * imperfect, see above. 3981 */ 3982 static bool vmx_is_low_priority_db_trap(struct kvm_queued_exception *ex) 3983 { 3984 return vmx_get_pending_dbg_trap(ex) & ~DR6_BT; 3985 } 3986 3987 /* 3988 * Certain VM-exits set the 'pending debug exceptions' field to indicate a 3989 * recognized #DB (data or single-step) that has yet to be delivered. 
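 * (This is done before synthesizing INIT and MTF VM-exits, see
 * vmx_check_nested_events().)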
Since KVM 3990 * represents these debug traps with a payload that is said to be compatible 3991 * with the 'pending debug exceptions' field, write the payload to the VMCS 3992 * field if a VM-exit is delivered before the debug trap. 3993 */ 3994 static void nested_vmx_update_pending_dbg(struct kvm_vcpu *vcpu) 3995 { 3996 unsigned long pending_dbg; 3997 3998 pending_dbg = vmx_get_pending_dbg_trap(&vcpu->arch.exception); 3999 if (pending_dbg) 4000 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, pending_dbg); 4001 } 4002 4003 static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu) 4004 { 4005 return nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) && 4006 to_vmx(vcpu)->nested.preemption_timer_expired; 4007 } 4008 4009 static bool vmx_has_nested_events(struct kvm_vcpu *vcpu) 4010 { 4011 return nested_vmx_preemption_timer_pending(vcpu) || 4012 to_vmx(vcpu)->nested.mtf_pending; 4013 } 4014 4015 /* 4016 * Per the Intel SDM's table "Priority Among Concurrent Events", with minor 4017 * edits to fill in missing examples, e.g. #DB due to split-lock accesses, 4018 * and less minor edits to splice in the priority of VMX Non-Root specific 4019 * events, e.g. MTF and NMI/INTR-window exiting. 4020 * 4021 * 1 Hardware Reset and Machine Checks 4022 * - RESET 4023 * - Machine Check 4024 * 4025 * 2 Trap on Task Switch 4026 * - T flag in TSS is set (on task switch) 4027 * 4028 * 3 External Hardware Interventions 4029 * - FLUSH 4030 * - STOPCLK 4031 * - SMI 4032 * - INIT 4033 * 4034 * 3.5 Monitor Trap Flag (MTF) VM-exit[1] 4035 * 4036 * 4 Traps on Previous Instruction 4037 * - Breakpoints 4038 * - Trap-class Debug Exceptions (#DB due to TF flag set, data/I-O 4039 * breakpoint, or #DB due to a split-lock access) 4040 * 4041 * 4.3 VMX-preemption timer expired VM-exit 4042 * 4043 * 4.6 NMI-window exiting VM-exit[2] 4044 * 4045 * 5 Nonmaskable Interrupts (NMI) 4046 * 4047 * 5.5 Interrupt-window exiting VM-exit and Virtual-interrupt delivery 4048 * 4049 * 6 Maskable Hardware Interrupts 4050 * 4051 * 7 Code Breakpoint Fault 4052 * 4053 * 8 Faults from Fetching Next Instruction 4054 * - Code-Segment Limit Violation 4055 * - Code Page Fault 4056 * - Control protection exception (missing ENDBRANCH at target of indirect 4057 * call or jump) 4058 * 4059 * 9 Faults from Decoding Next Instruction 4060 * - Instruction length > 15 bytes 4061 * - Invalid Opcode 4062 * - Coprocessor Not Available 4063 * 4064 *10 Faults on Executing Instruction 4065 * - Overflow 4066 * - Bound error 4067 * - Invalid TSS 4068 * - Segment Not Present 4069 * - Stack fault 4070 * - General Protection 4071 * - Data Page Fault 4072 * - Alignment Check 4073 * - x86 FPU Floating-point exception 4074 * - SIMD floating-point exception 4075 * - Virtualization exception 4076 * - Control protection exception 4077 * 4078 * [1] Per the "Monitor Trap Flag" section: System-management interrupts (SMIs), 4079 * INIT signals, and higher priority events take priority over MTF VM exits. 4080 * MTF VM exits take priority over debug-trap exceptions and lower priority 4081 * events. 4082 * 4083 * [2] Debug-trap exceptions and higher priority events take priority over VM exits 4084 * caused by the VMX-preemption timer. VM exits caused by the VMX-preemption 4085 * timer take priority over VM exits caused by the "NMI-window exiting" 4086 * VM-execution control and lower priority events. 4087 * 4088 * [3] Debug-trap exceptions and higher priority events take priority over VM exits 4089 * caused by "NMI-window exiting". 
VM exits caused by this control take 4090 * priority over non-maskable interrupts (NMIs) and lower priority events. 4091 * 4092 * [4] Virtual-interrupt delivery has the same priority as that of VM exits due to 4093 * the 1-setting of the "interrupt-window exiting" VM-execution control. Thus, 4094 * non-maskable interrupts (NMIs) and higher priority events take priority over 4095 * delivery of a virtual interrupt; delivery of a virtual interrupt takes 4096 * priority over external interrupts and lower priority events. 4097 */ 4098 static int vmx_check_nested_events(struct kvm_vcpu *vcpu) 4099 { 4100 struct kvm_lapic *apic = vcpu->arch.apic; 4101 struct vcpu_vmx *vmx = to_vmx(vcpu); 4102 /* 4103 * Only a pending nested run blocks a pending exception. If there is a 4104 * previously injected event, the pending exception occurred while said 4105 * event was being delivered and thus needs to be handled. 4106 */ 4107 bool block_nested_exceptions = vmx->nested.nested_run_pending; 4108 /* 4109 * New events (not exceptions) are only recognized at instruction 4110 * boundaries. If an event needs reinjection, then KVM is handling a 4111 * VM-Exit that occurred _during_ instruction execution; new events are 4112 * blocked until the instruction completes. 4113 */ 4114 bool block_nested_events = block_nested_exceptions || 4115 kvm_event_needs_reinjection(vcpu); 4116 4117 if (lapic_in_kernel(vcpu) && 4118 test_bit(KVM_APIC_INIT, &apic->pending_events)) { 4119 if (block_nested_events) 4120 return -EBUSY; 4121 nested_vmx_update_pending_dbg(vcpu); 4122 clear_bit(KVM_APIC_INIT, &apic->pending_events); 4123 if (vcpu->arch.mp_state != KVM_MP_STATE_INIT_RECEIVED) 4124 nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0); 4125 4126 /* MTF is discarded if the vCPU is in WFS. */ 4127 vmx->nested.mtf_pending = false; 4128 return 0; 4129 } 4130 4131 if (lapic_in_kernel(vcpu) && 4132 test_bit(KVM_APIC_SIPI, &apic->pending_events)) { 4133 if (block_nested_events) 4134 return -EBUSY; 4135 4136 clear_bit(KVM_APIC_SIPI, &apic->pending_events); 4137 if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { 4138 nested_vmx_vmexit(vcpu, EXIT_REASON_SIPI_SIGNAL, 0, 4139 apic->sipi_vector & 0xFFUL); 4140 return 0; 4141 } 4142 /* Fallthrough, the SIPI is completely ignored. */ 4143 } 4144 4145 /* 4146 * Process exceptions that are higher priority than Monitor Trap Flag: 4147 * fault-like exceptions, TSS T flag #DB (not emulated by KVM, but 4148 * could theoretically come in from userspace), and ICEBP (INT1). 4149 * 4150 * TODO: SMIs have higher priority than MTF and trap-like #DBs (except 4151 * for TSS T flag #DBs). KVM also doesn't save/restore pending MTF 4152 * across SMI/RSM as it should; that needs to be addressed in order to 4153 * prioritize SMI over MTF and trap-like #DBs. 
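 * Hence the two-pass structure below: high priority exceptions, then MTF,
 * then the remaining (low priority #DB) exceptions, and only then the
 * preemption timer, SMI, NMI and interrupt windows.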
4154 */ 4155 if (vcpu->arch.exception_vmexit.pending && 4156 !vmx_is_low_priority_db_trap(&vcpu->arch.exception_vmexit)) { 4157 if (block_nested_exceptions) 4158 return -EBUSY; 4159 4160 nested_vmx_inject_exception_vmexit(vcpu); 4161 return 0; 4162 } 4163 4164 if (vcpu->arch.exception.pending && 4165 !vmx_is_low_priority_db_trap(&vcpu->arch.exception)) { 4166 if (block_nested_exceptions) 4167 return -EBUSY; 4168 goto no_vmexit; 4169 } 4170 4171 if (vmx->nested.mtf_pending) { 4172 if (block_nested_events) 4173 return -EBUSY; 4174 nested_vmx_update_pending_dbg(vcpu); 4175 nested_vmx_vmexit(vcpu, EXIT_REASON_MONITOR_TRAP_FLAG, 0, 0); 4176 return 0; 4177 } 4178 4179 if (vcpu->arch.exception_vmexit.pending) { 4180 if (block_nested_exceptions) 4181 return -EBUSY; 4182 4183 nested_vmx_inject_exception_vmexit(vcpu); 4184 return 0; 4185 } 4186 4187 if (vcpu->arch.exception.pending) { 4188 if (block_nested_exceptions) 4189 return -EBUSY; 4190 goto no_vmexit; 4191 } 4192 4193 if (nested_vmx_preemption_timer_pending(vcpu)) { 4194 if (block_nested_events) 4195 return -EBUSY; 4196 nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0); 4197 return 0; 4198 } 4199 4200 if (vcpu->arch.smi_pending && !is_smm(vcpu)) { 4201 if (block_nested_events) 4202 return -EBUSY; 4203 goto no_vmexit; 4204 } 4205 4206 if (vcpu->arch.nmi_pending && !vmx_nmi_blocked(vcpu)) { 4207 if (block_nested_events) 4208 return -EBUSY; 4209 if (!nested_exit_on_nmi(vcpu)) 4210 goto no_vmexit; 4211 4212 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, 4213 NMI_VECTOR | INTR_TYPE_NMI_INTR | 4214 INTR_INFO_VALID_MASK, 0); 4215 /* 4216 * The NMI-triggered VM exit counts as injection: 4217 * clear this one and block further NMIs. 4218 */ 4219 vcpu->arch.nmi_pending = 0; 4220 vmx_set_nmi_mask(vcpu, true); 4221 return 0; 4222 } 4223 4224 if (kvm_cpu_has_interrupt(vcpu) && !vmx_interrupt_blocked(vcpu)) { 4225 if (block_nested_events) 4226 return -EBUSY; 4227 if (!nested_exit_on_intr(vcpu)) 4228 goto no_vmexit; 4229 nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); 4230 return 0; 4231 } 4232 4233 no_vmexit: 4234 return vmx_complete_nested_posted_interrupt(vcpu); 4235 } 4236 4237 static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) 4238 { 4239 ktime_t remaining = 4240 hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer); 4241 u64 value; 4242 4243 if (ktime_to_ns(remaining) <= 0) 4244 return 0; 4245 4246 value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz; 4247 do_div(value, 1000000); 4248 return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 4249 } 4250 4251 static bool is_vmcs12_ext_field(unsigned long field) 4252 { 4253 switch (field) { 4254 case GUEST_ES_SELECTOR: 4255 case GUEST_CS_SELECTOR: 4256 case GUEST_SS_SELECTOR: 4257 case GUEST_DS_SELECTOR: 4258 case GUEST_FS_SELECTOR: 4259 case GUEST_GS_SELECTOR: 4260 case GUEST_LDTR_SELECTOR: 4261 case GUEST_TR_SELECTOR: 4262 case GUEST_ES_LIMIT: 4263 case GUEST_CS_LIMIT: 4264 case GUEST_SS_LIMIT: 4265 case GUEST_DS_LIMIT: 4266 case GUEST_FS_LIMIT: 4267 case GUEST_GS_LIMIT: 4268 case GUEST_LDTR_LIMIT: 4269 case GUEST_TR_LIMIT: 4270 case GUEST_GDTR_LIMIT: 4271 case GUEST_IDTR_LIMIT: 4272 case GUEST_ES_AR_BYTES: 4273 case GUEST_DS_AR_BYTES: 4274 case GUEST_FS_AR_BYTES: 4275 case GUEST_GS_AR_BYTES: 4276 case GUEST_LDTR_AR_BYTES: 4277 case GUEST_TR_AR_BYTES: 4278 case GUEST_ES_BASE: 4279 case GUEST_CS_BASE: 4280 case GUEST_SS_BASE: 4281 case GUEST_DS_BASE: 4282 case GUEST_FS_BASE: 4283 case GUEST_GS_BASE: 4284 case GUEST_LDTR_BASE: 4285 case 
GUEST_TR_BASE: 4286 case GUEST_GDTR_BASE: 4287 case GUEST_IDTR_BASE: 4288 case GUEST_PENDING_DBG_EXCEPTIONS: 4289 case GUEST_BNDCFGS: 4290 return true; 4291 default: 4292 break; 4293 } 4294 4295 return false; 4296 } 4297 4298 static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, 4299 struct vmcs12 *vmcs12) 4300 { 4301 struct vcpu_vmx *vmx = to_vmx(vcpu); 4302 4303 vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR); 4304 vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR); 4305 vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR); 4306 vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR); 4307 vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR); 4308 vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR); 4309 vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR); 4310 vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR); 4311 vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT); 4312 vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT); 4313 vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT); 4314 vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT); 4315 vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT); 4316 vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT); 4317 vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT); 4318 vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT); 4319 vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT); 4320 vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT); 4321 vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES); 4322 vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES); 4323 vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES); 4324 vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES); 4325 vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES); 4326 vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES); 4327 vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE); 4328 vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE); 4329 vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE); 4330 vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE); 4331 vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE); 4332 vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE); 4333 vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE); 4334 vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE); 4335 vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE); 4336 vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE); 4337 vmcs12->guest_pending_dbg_exceptions = 4338 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); 4339 4340 vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false; 4341 } 4342 4343 static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, 4344 struct vmcs12 *vmcs12) 4345 { 4346 struct vcpu_vmx *vmx = to_vmx(vcpu); 4347 int cpu; 4348 4349 if (!vmx->nested.need_sync_vmcs02_to_vmcs12_rare) 4350 return; 4351 4352 4353 WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01); 4354 4355 cpu = get_cpu(); 4356 vmx->loaded_vmcs = &vmx->nested.vmcs02; 4357 vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->vmcs01); 4358 4359 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 4360 4361 vmx->loaded_vmcs = &vmx->vmcs01; 4362 vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->nested.vmcs02); 4363 put_cpu(); 4364 } 4365 4366 /* 4367 * Update the guest state fields of vmcs12 to reflect changes that 4368 * occurred while L2 was running. (The "IA-32e mode guest" bit of the 4369 * VM-entry controls is also updated, since this is really a guest 4370 * state bit.) 
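 * Rarely-accessed guest fields (segment selectors, limits, bases, etc.)
 * are synced by sync_vmcs02_to_vmcs12_rare(); unless an enlightened VMCS
 * is in use, that sync is deferred (see need_sync_vmcs02_to_vmcs12_rare
 * and copy_vmcs02_to_vmcs12_rare()).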
4371 */ 4372 static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 4373 { 4374 struct vcpu_vmx *vmx = to_vmx(vcpu); 4375 4376 if (nested_vmx_is_evmptr12_valid(vmx)) 4377 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 4378 4379 vmx->nested.need_sync_vmcs02_to_vmcs12_rare = 4380 !nested_vmx_is_evmptr12_valid(vmx); 4381 4382 vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); 4383 vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); 4384 4385 vmcs12->guest_rsp = kvm_rsp_read(vcpu); 4386 vmcs12->guest_rip = kvm_rip_read(vcpu); 4387 vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS); 4388 4389 vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES); 4390 vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES); 4391 4392 vmcs12->guest_interruptibility_info = 4393 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); 4394 4395 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) 4396 vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT; 4397 else if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) 4398 vmcs12->guest_activity_state = GUEST_ACTIVITY_WAIT_SIPI; 4399 else 4400 vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE; 4401 4402 if (nested_cpu_has_preemption_timer(vmcs12) && 4403 vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER && 4404 !vmx->nested.nested_run_pending) 4405 vmcs12->vmx_preemption_timer_value = 4406 vmx_get_preemption_timer_value(vcpu); 4407 4408 /* 4409 * In some cases (usually, nested EPT), L2 is allowed to change its 4410 * own CR3 without exiting. If it has changed it, we must keep it. 4411 * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined 4412 * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12. 4413 * 4414 * Additionally, restore L2's PDPTR to vmcs12. 4415 */ 4416 if (enable_ept) { 4417 vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3); 4418 if (nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { 4419 vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0); 4420 vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1); 4421 vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2); 4422 vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); 4423 } 4424 } 4425 4426 vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS); 4427 4428 if (nested_cpu_has_vid(vmcs12)) 4429 vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS); 4430 4431 vmcs12->vm_entry_controls = 4432 (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | 4433 (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); 4434 4435 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) 4436 vmcs12->guest_dr7 = vcpu->arch.dr7; 4437 4438 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) 4439 vmcs12->guest_ia32_efer = vcpu->arch.efer; 4440 } 4441 4442 /* 4443 * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits 4444 * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), 4445 * and this function updates it to reflect the changes to the guest state while 4446 * L2 was running (and perhaps made some exits which were handled directly by L0 4447 * without going back to L1), and to reflect the exit reason. 4448 * Note that we do not have to copy here all VMCS fields, just those that 4449 * could have changed by the L2 guest or the exit - i.e., the guest-state and 4450 * exit-information fields only. Other fields are modified by L1 with VMWRITE, 4451 * which already writes to vmcs12 directly. 
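 * The guest-state half of that work is done by sync_vmcs02_to_vmcs12();
 * this function records the exit-information fields and, unless the exit
 * is a failed VM-Entry, marks the VMCS launched and stores guest MSRs per
 * the VM-exit MSR-store list.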
4452 */ 4453 static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, 4454 u32 vm_exit_reason, u32 exit_intr_info, 4455 unsigned long exit_qualification) 4456 { 4457 /* update exit information fields: */ 4458 vmcs12->vm_exit_reason = vm_exit_reason; 4459 if (to_vmx(vcpu)->exit_reason.enclave_mode) 4460 vmcs12->vm_exit_reason |= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE; 4461 vmcs12->exit_qualification = exit_qualification; 4462 4463 /* 4464 * On VM-Exit due to a failed VM-Entry, the VMCS isn't marked launched 4465 * and only EXIT_REASON and EXIT_QUALIFICATION are updated, all other 4466 * exit info fields are unmodified. 4467 */ 4468 if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) { 4469 vmcs12->launch_state = 1; 4470 4471 /* vm_entry_intr_info_field is cleared on exit. Emulate this 4472 * instead of reading the real value. */ 4473 vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK; 4474 4475 /* 4476 * Transfer the event that L0 or L1 may wanted to inject into 4477 * L2 to IDT_VECTORING_INFO_FIELD. 4478 */ 4479 vmcs12_save_pending_event(vcpu, vmcs12, 4480 vm_exit_reason, exit_intr_info); 4481 4482 vmcs12->vm_exit_intr_info = exit_intr_info; 4483 vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 4484 vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 4485 4486 /* 4487 * According to spec, there's no need to store the guest's 4488 * MSRs if the exit is due to a VM-entry failure that occurs 4489 * during or after loading the guest state. Since this exit 4490 * does not fall in that category, we need to save the MSRs. 4491 */ 4492 if (nested_vmx_store_msr(vcpu, 4493 vmcs12->vm_exit_msr_store_addr, 4494 vmcs12->vm_exit_msr_store_count)) 4495 nested_vmx_abort(vcpu, 4496 VMX_ABORT_SAVE_GUEST_MSR_FAIL); 4497 } 4498 } 4499 4500 /* 4501 * A part of what we need to when the nested L2 guest exits and we want to 4502 * run its L1 parent, is to reset L1's guest state to the host state specified 4503 * in vmcs12. 4504 * This function is to be called not only on normal nested exit, but also on 4505 * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry 4506 * Failures During or After Loading Guest State"). 4507 * This function should be called when the active VMCS is L1's (vmcs01). 4508 */ 4509 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, 4510 struct vmcs12 *vmcs12) 4511 { 4512 enum vm_entry_failure_code ignored; 4513 struct kvm_segment seg; 4514 4515 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) 4516 vcpu->arch.efer = vmcs12->host_ia32_efer; 4517 else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) 4518 vcpu->arch.efer |= (EFER_LMA | EFER_LME); 4519 else 4520 vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); 4521 vmx_set_efer(vcpu, vcpu->arch.efer); 4522 4523 kvm_rsp_write(vcpu, vmcs12->host_rsp); 4524 kvm_rip_write(vcpu, vmcs12->host_rip); 4525 vmx_set_rflags(vcpu, X86_EFLAGS_FIXED); 4526 vmx_set_interrupt_shadow(vcpu, 0); 4527 4528 /* 4529 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't 4530 * actually changed, because vmx_set_cr0 refers to efer set above. 4531 * 4532 * CR0_GUEST_HOST_MASK is already set in the original vmcs01 4533 * (KVM doesn't change it); 4534 */ 4535 vcpu->arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits(); 4536 vmx_set_cr0(vcpu, vmcs12->host_cr0); 4537 4538 /* Same as above - no reason to call set_cr4_guest_host_mask(). 
*/ 4539 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); 4540 vmx_set_cr4(vcpu, vmcs12->host_cr4); 4541 4542 nested_ept_uninit_mmu_context(vcpu); 4543 4544 /* 4545 * Only PDPTE load can fail as the value of cr3 was checked on entry and 4546 * couldn't have changed. 4547 */ 4548 if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, true, &ignored)) 4549 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL); 4550 4551 nested_vmx_transition_tlb_flush(vcpu, vmcs12, false); 4552 4553 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs); 4554 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp); 4555 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); 4556 vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); 4557 vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); 4558 vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF); 4559 vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF); 4560 4561 /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */ 4562 if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS) 4563 vmcs_write64(GUEST_BNDCFGS, 0); 4564 4565 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) { 4566 vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); 4567 vcpu->arch.pat = vmcs12->host_ia32_pat; 4568 } 4569 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) && 4570 kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu))) 4571 WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, 4572 vmcs12->host_ia32_perf_global_ctrl)); 4573 4574 /* Set L1 segment info according to Intel SDM 4575 27.5.2 Loading Host Segment and Descriptor-Table Registers */ 4576 seg = (struct kvm_segment) { 4577 .base = 0, 4578 .limit = 0xFFFFFFFF, 4579 .selector = vmcs12->host_cs_selector, 4580 .type = 11, 4581 .present = 1, 4582 .s = 1, 4583 .g = 1 4584 }; 4585 if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) 4586 seg.l = 1; 4587 else 4588 seg.db = 1; 4589 __vmx_set_segment(vcpu, &seg, VCPU_SREG_CS); 4590 seg = (struct kvm_segment) { 4591 .base = 0, 4592 .limit = 0xFFFFFFFF, 4593 .type = 3, 4594 .present = 1, 4595 .s = 1, 4596 .db = 1, 4597 .g = 1 4598 }; 4599 seg.selector = vmcs12->host_ds_selector; 4600 __vmx_set_segment(vcpu, &seg, VCPU_SREG_DS); 4601 seg.selector = vmcs12->host_es_selector; 4602 __vmx_set_segment(vcpu, &seg, VCPU_SREG_ES); 4603 seg.selector = vmcs12->host_ss_selector; 4604 __vmx_set_segment(vcpu, &seg, VCPU_SREG_SS); 4605 seg.selector = vmcs12->host_fs_selector; 4606 seg.base = vmcs12->host_fs_base; 4607 __vmx_set_segment(vcpu, &seg, VCPU_SREG_FS); 4608 seg.selector = vmcs12->host_gs_selector; 4609 seg.base = vmcs12->host_gs_base; 4610 __vmx_set_segment(vcpu, &seg, VCPU_SREG_GS); 4611 seg = (struct kvm_segment) { 4612 .base = vmcs12->host_tr_base, 4613 .limit = 0x67, 4614 .selector = vmcs12->host_tr_selector, 4615 .type = 11, 4616 .present = 1 4617 }; 4618 __vmx_set_segment(vcpu, &seg, VCPU_SREG_TR); 4619 4620 memset(&seg, 0, sizeof(seg)); 4621 seg.unusable = 1; 4622 __vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR); 4623 4624 kvm_set_dr(vcpu, 7, 0x400); 4625 vmcs_write64(GUEST_IA32_DEBUGCTL, 0); 4626 4627 if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, 4628 vmcs12->vm_exit_msr_load_count)) 4629 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); 4630 4631 to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu); 4632 } 4633 4634 static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx) 4635 { 4636 struct vmx_uret_msr *efer_msr; 4637 unsigned int i; 4638 4639 if (vm_entry_controls_get(vmx) & 
VM_ENTRY_LOAD_IA32_EFER) 4640 return vmcs_read64(GUEST_IA32_EFER); 4641 4642 if (cpu_has_load_ia32_efer()) 4643 return host_efer; 4644 4645 for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) { 4646 if (vmx->msr_autoload.guest.val[i].index == MSR_EFER) 4647 return vmx->msr_autoload.guest.val[i].value; 4648 } 4649 4650 efer_msr = vmx_find_uret_msr(vmx, MSR_EFER); 4651 if (efer_msr) 4652 return efer_msr->data; 4653 4654 return host_efer; 4655 } 4656 4657 static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) 4658 { 4659 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 4660 struct vcpu_vmx *vmx = to_vmx(vcpu); 4661 struct vmx_msr_entry g, h; 4662 gpa_t gpa; 4663 u32 i, j; 4664 4665 vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT); 4666 4667 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) { 4668 /* 4669 * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set 4670 * as vmcs01.GUEST_DR7 contains a userspace defined value 4671 * and vcpu->arch.dr7 is not squirreled away before the 4672 * nested VMENTER (not worth adding a variable in nested_vmx). 4673 */ 4674 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) 4675 kvm_set_dr(vcpu, 7, DR7_FIXED_1); 4676 else 4677 WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7))); 4678 } 4679 4680 /* 4681 * Note that calling vmx_set_{efer,cr0,cr4} is important as they 4682 * handle a variety of side effects to KVM's software model. 4683 */ 4684 vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx)); 4685 4686 vcpu->arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits(); 4687 vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW)); 4688 4689 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); 4690 vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW)); 4691 4692 nested_ept_uninit_mmu_context(vcpu); 4693 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); 4694 kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); 4695 4696 /* 4697 * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs 4698 * from vmcs01 (if necessary). The PDPTRs are not loaded on 4699 * VMFail, like everything else we just need to ensure our 4700 * software model is up-to-date. 4701 */ 4702 if (enable_ept && is_pae_paging(vcpu)) 4703 ept_save_pdptrs(vcpu); 4704 4705 kvm_mmu_reset_context(vcpu); 4706 4707 /* 4708 * This nasty bit of open coding is a compromise between blindly 4709 * loading L1's MSRs using the exit load lists (incorrect emulation 4710 * of VMFail), leaving the nested VM's MSRs in the software model 4711 * (incorrect behavior) and snapshotting the modified MSRs (too 4712 * expensive since the lists are unbound by hardware). For each 4713 * MSR that was (prematurely) loaded from the nested VMEntry load 4714 * list, reload it from the exit load list if it exists and differs 4715 * from the guest value. The intent is to stuff host state as 4716 * silently as possible, not to fully process the exit load list. 
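 * Concretely: walk the VM-entry MSR-load list and, for each MSR that also
 * appears in the VM-exit MSR-load list with a different value, re-load
 * the VM-exit value.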
4717 */ 4718 for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) { 4719 gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g)); 4720 if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) { 4721 pr_debug_ratelimited( 4722 "%s read MSR index failed (%u, 0x%08llx)\n", 4723 __func__, i, gpa); 4724 goto vmabort; 4725 } 4726 4727 for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) { 4728 gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h)); 4729 if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) { 4730 pr_debug_ratelimited( 4731 "%s read MSR failed (%u, 0x%08llx)\n", 4732 __func__, j, gpa); 4733 goto vmabort; 4734 } 4735 if (h.index != g.index) 4736 continue; 4737 if (h.value == g.value) 4738 break; 4739 4740 if (nested_vmx_load_msr_check(vcpu, &h)) { 4741 pr_debug_ratelimited( 4742 "%s check failed (%u, 0x%x, 0x%x)\n", 4743 __func__, j, h.index, h.reserved); 4744 goto vmabort; 4745 } 4746 4747 if (kvm_set_msr(vcpu, h.index, h.value)) { 4748 pr_debug_ratelimited( 4749 "%s WRMSR failed (%u, 0x%x, 0x%llx)\n", 4750 __func__, j, h.index, h.value); 4751 goto vmabort; 4752 } 4753 } 4754 } 4755 4756 return; 4757 4758 vmabort: 4759 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); 4760 } 4761 4762 /* 4763 * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1 4764 * and modify vmcs12 to make it see what it would expect to see there if 4765 * L2 was its real guest. Must only be called when in L2 (is_guest_mode()) 4766 */ 4767 void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, 4768 u32 exit_intr_info, unsigned long exit_qualification) 4769 { 4770 struct vcpu_vmx *vmx = to_vmx(vcpu); 4771 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 4772 4773 /* Pending MTF traps are discarded on VM-Exit. */ 4774 vmx->nested.mtf_pending = false; 4775 4776 /* trying to cancel vmlaunch/vmresume is a bug */ 4777 WARN_ON_ONCE(vmx->nested.nested_run_pending); 4778 4779 #ifdef CONFIG_KVM_HYPERV 4780 if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) { 4781 /* 4782 * KVM_REQ_GET_NESTED_STATE_PAGES is also used to map 4783 * Enlightened VMCS after migration and we still need to 4784 * do that when something is forcing L2->L1 exit prior to 4785 * the first L2 run. 4786 */ 4787 (void)nested_get_evmcs_page(vcpu); 4788 } 4789 #endif 4790 4791 /* Service pending TLB flush requests for L2 before switching to L1. */ 4792 kvm_service_local_tlb_flush_requests(vcpu); 4793 4794 /* 4795 * VCPU_EXREG_PDPTR will be clobbered in arch/x86/kvm/vmx/vmx.h between 4796 * now and the new vmentry. Ensure that the VMCS02 PDPTR fields are 4797 * up-to-date before switching to L1. 4798 */ 4799 if (enable_ept && is_pae_paging(vcpu)) 4800 vmx_ept_load_pdptrs(vcpu); 4801 4802 leave_guest_mode(vcpu); 4803 4804 if (nested_cpu_has_preemption_timer(vmcs12)) 4805 hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); 4806 4807 if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING)) { 4808 vcpu->arch.tsc_offset = vcpu->arch.l1_tsc_offset; 4809 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING)) 4810 vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio; 4811 } 4812 4813 if (likely(!vmx->fail)) { 4814 sync_vmcs02_to_vmcs12(vcpu, vmcs12); 4815 4816 if (vm_exit_reason != -1) 4817 prepare_vmcs12(vcpu, vmcs12, vm_exit_reason, 4818 exit_intr_info, exit_qualification); 4819 4820 /* 4821 * Must happen outside of sync_vmcs02_to_vmcs12() as it will 4822 * also be used to capture vmcs12 cache as part of 4823 * capturing nVMX state for snapshot (migration). 
4824 * 4825 * Otherwise, this flush will dirty guest memory at a 4826 * point it is already assumed by user-space to be 4827 * immutable. 4828 */ 4829 nested_flush_cached_shadow_vmcs12(vcpu, vmcs12); 4830 } else { 4831 /* 4832 * The only expected VM-instruction error is "VM entry with 4833 * invalid control field(s)." Anything else indicates a 4834 * problem with L0. And we should never get here with a 4835 * VMFail of any type if early consistency checks are enabled. 4836 */ 4837 WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) != 4838 VMXERR_ENTRY_INVALID_CONTROL_FIELD); 4839 WARN_ON_ONCE(nested_early_check); 4840 } 4841 4842 /* 4843 * Drop events/exceptions that were queued for re-injection to L2 4844 * (picked up via vmx_complete_interrupts()), as well as exceptions 4845 * that were pending for L2. Note, this must NOT be hoisted above 4846 * prepare_vmcs12(), events/exceptions queued for re-injection need to 4847 * be captured in vmcs12 (see vmcs12_save_pending_event()). 4848 */ 4849 vcpu->arch.nmi_injected = false; 4850 kvm_clear_exception_queue(vcpu); 4851 kvm_clear_interrupt_queue(vcpu); 4852 4853 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 4854 4855 /* 4856 * If IBRS is advertised to the vCPU, KVM must flush the indirect 4857 * branch predictors when transitioning from L2 to L1, as L1 expects 4858 * hardware (KVM in this case) to provide separate predictor modes. 4859 * Bare metal isolates VMX root (host) from VMX non-root (guest), but 4860 * doesn't isolate different VMCSs, i.e. in this case, doesn't provide 4861 * separate modes for L2 vs L1. 4862 */ 4863 if (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) 4864 indirect_branch_prediction_barrier(); 4865 4866 /* Update any VMCS fields that might have changed while L2 ran */ 4867 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 4868 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 4869 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); 4870 if (kvm_caps.has_tsc_control) 4871 vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); 4872 4873 if (vmx->nested.l1_tpr_threshold != -1) 4874 vmcs_write32(TPR_THRESHOLD, vmx->nested.l1_tpr_threshold); 4875 4876 if (vmx->nested.change_vmcs01_virtual_apic_mode) { 4877 vmx->nested.change_vmcs01_virtual_apic_mode = false; 4878 vmx_set_virtual_apic_mode(vcpu); 4879 } 4880 4881 if (vmx->nested.update_vmcs01_cpu_dirty_logging) { 4882 vmx->nested.update_vmcs01_cpu_dirty_logging = false; 4883 vmx_update_cpu_dirty_logging(vcpu); 4884 } 4885 4886 /* Unpin physical memory we referred to in vmcs02 */ 4887 kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map, false); 4888 kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); 4889 kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true); 4890 vmx->nested.pi_desc = NULL; 4891 4892 if (vmx->nested.reload_vmcs01_apic_access_page) { 4893 vmx->nested.reload_vmcs01_apic_access_page = false; 4894 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); 4895 } 4896 4897 if (vmx->nested.update_vmcs01_apicv_status) { 4898 vmx->nested.update_vmcs01_apicv_status = false; 4899 kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu); 4900 } 4901 4902 if ((vm_exit_reason != -1) && 4903 (enable_shadow_vmcs || nested_vmx_is_evmptr12_valid(vmx))) 4904 vmx->nested.need_vmcs12_to_shadow_sync = true; 4905 4906 /* in case we halted in L2 */ 4907 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 4908 4909 if (likely(!vmx->fail)) { 4910 if ((u16)vm_exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT && 4911 nested_exit_intr_ack_set(vcpu)) { 4912 int irq = 
kvm_cpu_get_interrupt(vcpu); 4913 WARN_ON(irq < 0); 4914 vmcs12->vm_exit_intr_info = irq | 4915 INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR; 4916 } 4917 4918 if (vm_exit_reason != -1) 4919 trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, 4920 vmcs12->exit_qualification, 4921 vmcs12->idt_vectoring_info_field, 4922 vmcs12->vm_exit_intr_info, 4923 vmcs12->vm_exit_intr_error_code, 4924 KVM_ISA_VMX); 4925 4926 load_vmcs12_host_state(vcpu, vmcs12); 4927 4928 return; 4929 } 4930 4931 /* 4932 * After an early L2 VM-entry failure, we're now back 4933 * in L1 which thinks it just finished a VMLAUNCH or 4934 * VMRESUME instruction, so we need to set the failure 4935 * flag and the VM-instruction error field of the VMCS 4936 * accordingly, and skip the emulated instruction. 4937 */ 4938 (void)nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 4939 4940 /* 4941 * Restore L1's host state to KVM's software model. We're here 4942 * because a consistency check was caught by hardware, which 4943 * means some amount of guest state has been propagated to KVM's 4944 * model and needs to be unwound to the host's state. 4945 */ 4946 nested_vmx_restore_host_state(vcpu); 4947 4948 vmx->fail = 0; 4949 } 4950 4951 static void nested_vmx_triple_fault(struct kvm_vcpu *vcpu) 4952 { 4953 kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu); 4954 nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0); 4955 } 4956 4957 /* 4958 * Decode the memory-address operand of a vmx instruction, as recorded on an 4959 * exit caused by such an instruction (run by a guest hypervisor). 4960 * On success, returns 0. When the operand is invalid, returns 1 and throws 4961 * #UD, #GP, or #SS. 4962 */ 4963 int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, 4964 u32 vmx_instruction_info, bool wr, int len, gva_t *ret) 4965 { 4966 gva_t off; 4967 bool exn; 4968 struct kvm_segment s; 4969 4970 /* 4971 * According to Vol. 3B, "Information for VM Exits Due to Instruction 4972 * Execution", on an exit, vmx_instruction_info holds most of the 4973 * addressing components of the operand. Only the displacement part 4974 * is put in exit_qualification (see 3B, "Basic VM-Exit Information"). 4975 * For how an actual address is calculated from all these components, 4976 * refer to Vol. 1, "Operand Addressing". 4977 */ 4978 int scaling = vmx_instruction_info & 3; 4979 int addr_size = (vmx_instruction_info >> 7) & 7; 4980 bool is_reg = vmx_instruction_info & (1u << 10); 4981 int seg_reg = (vmx_instruction_info >> 15) & 7; 4982 int index_reg = (vmx_instruction_info >> 18) & 0xf; 4983 bool index_is_valid = !(vmx_instruction_info & (1u << 22)); 4984 int base_reg = (vmx_instruction_info >> 23) & 0xf; 4985 bool base_is_valid = !(vmx_instruction_info & (1u << 27)); 4986 4987 if (is_reg) { 4988 kvm_queue_exception(vcpu, UD_VECTOR); 4989 return 1; 4990 } 4991 4992 /* Addr = segment_base + offset */ 4993 /* offset = base + [index * scale] + displacement */ 4994 off = exit_qualification; /* holds the displacement */ 4995 if (addr_size == 1) 4996 off = (gva_t)sign_extend64(off, 31); 4997 else if (addr_size == 0) 4998 off = (gva_t)sign_extend64(off, 15); 4999 if (base_is_valid) 5000 off += kvm_register_read(vcpu, base_reg); 5001 if (index_is_valid) 5002 off += kvm_register_read(vcpu, index_reg) << scaling; 5003 vmx_get_segment(vcpu, &s, seg_reg); 5004 5005 /* 5006 * The effective address, i.e. @off, of a memory operand is truncated 5007 * based on the address size of the instruction. 
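 * For example, with a 32-bit address size, a base of 0xfffffff0 plus a displacement of 0x20 wraps to an effective address of 0x10 rather than 0x100000010.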
Note that this is 5008 * the *effective address*, i.e. the address prior to accounting for 5009 * the segment's base. 5010 */ 5011 if (addr_size == 1) /* 32 bit */ 5012 off &= 0xffffffff; 5013 else if (addr_size == 0) /* 16 bit */ 5014 off &= 0xffff; 5015 5016 /* Checks for #GP/#SS exceptions. */ 5017 exn = false; 5018 if (is_long_mode(vcpu)) { 5019 /* 5020 * The virtual/linear address is never truncated in 64-bit 5021 * mode, e.g. a 32-bit address size can yield a 64-bit virtual 5022 * address when using FS/GS with a non-zero base. 5023 */ 5024 if (seg_reg == VCPU_SREG_FS || seg_reg == VCPU_SREG_GS) 5025 *ret = s.base + off; 5026 else 5027 *ret = off; 5028 5029 *ret = vmx_get_untagged_addr(vcpu, *ret, 0); 5030 /* Long mode: #GP(0)/#SS(0) if the memory address is in a 5031 * non-canonical form. This is the only check on the memory 5032 * destination for long mode! 5033 */ 5034 exn = is_noncanonical_address(*ret, vcpu); 5035 } else { 5036 /* 5037 * When not in long mode, the virtual/linear address is 5038 * unconditionally truncated to 32 bits regardless of the 5039 * address size. 5040 */ 5041 *ret = (s.base + off) & 0xffffffff; 5042 5043 /* Protected mode: apply checks for segment validity in the 5044 * following order: 5045 * - segment type check (#GP(0) may be thrown) 5046 * - usability check (#GP(0)/#SS(0)) 5047 * - limit check (#GP(0)/#SS(0)) 5048 */ 5049 if (wr) 5050 /* #GP(0) if the destination operand is located in a 5051 * read-only data segment or any code segment. 5052 */ 5053 exn = ((s.type & 0xa) == 0 || (s.type & 8)); 5054 else 5055 /* #GP(0) if the source operand is located in an 5056 * execute-only code segment 5057 */ 5058 exn = ((s.type & 0xa) == 8); 5059 if (exn) { 5060 kvm_queue_exception_e(vcpu, GP_VECTOR, 0); 5061 return 1; 5062 } 5063 /* Protected mode: #GP(0)/#SS(0) if the segment is unusable. 5064 */ 5065 exn = (s.unusable != 0); 5066 5067 /* 5068 * Protected mode: #GP(0)/#SS(0) if the memory operand is 5069 * outside the segment limit. All CPUs that support VMX ignore 5070 * limit checks for flat segments, i.e. segments with base==0, 5071 * limit==0xffffffff and of type expand-up data or code. 5072 */ 5073 if (!(s.base == 0 && s.limit == 0xffffffff && 5074 ((s.type & 8) || !(s.type & 4)))) 5075 exn = exn || ((u64)off + len - 1 > s.limit); 5076 } 5077 if (exn) { 5078 kvm_queue_exception_e(vcpu, 5079 seg_reg == VCPU_SREG_SS ? 5080 SS_VECTOR : GP_VECTOR, 5081 0); 5082 return 1; 5083 } 5084 5085 return 0; 5086 } 5087 5088 static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer, 5089 int *ret) 5090 { 5091 gva_t gva; 5092 struct x86_exception e; 5093 int r; 5094 5095 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 5096 vmcs_read32(VMX_INSTRUCTION_INFO), false, 5097 sizeof(*vmpointer), &gva)) { 5098 *ret = 1; 5099 return -EINVAL; 5100 } 5101 5102 r = kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e); 5103 if (r != X86EMUL_CONTINUE) { 5104 *ret = kvm_handle_memory_failure(vcpu, r, &e); 5105 return -EINVAL; 5106 } 5107 5108 return 0; 5109 } 5110 5111 /* 5112 * Allocate a shadow VMCS and associate it with the currently loaded 5113 * VMCS, unless such a shadow VMCS already exists. The newly allocated 5114 * VMCS is also VMCLEARed, so that it is ready for use. 
5115 */ 5116 static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu) 5117 { 5118 struct vcpu_vmx *vmx = to_vmx(vcpu); 5119 struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs; 5120 5121 /* 5122 * KVM allocates a shadow VMCS only when L1 executes VMXON and frees it 5123 * when L1 executes VMXOFF or the vCPU is forced out of nested 5124 * operation. VMXON faults if the CPU is already post-VMXON, so it 5125 * should be impossible to already have an allocated shadow VMCS. KVM 5126 * doesn't support virtualization of VMCS shadowing, so vmcs01 should 5127 * always be the loaded VMCS. 5128 */ 5129 if (WARN_ON(loaded_vmcs != &vmx->vmcs01 || loaded_vmcs->shadow_vmcs)) 5130 return loaded_vmcs->shadow_vmcs; 5131 5132 loaded_vmcs->shadow_vmcs = alloc_vmcs(true); 5133 if (loaded_vmcs->shadow_vmcs) 5134 vmcs_clear(loaded_vmcs->shadow_vmcs); 5135 5136 return loaded_vmcs->shadow_vmcs; 5137 } 5138 5139 static int enter_vmx_operation(struct kvm_vcpu *vcpu) 5140 { 5141 struct vcpu_vmx *vmx = to_vmx(vcpu); 5142 int r; 5143 5144 r = alloc_loaded_vmcs(&vmx->nested.vmcs02); 5145 if (r < 0) 5146 goto out_vmcs02; 5147 5148 vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); 5149 if (!vmx->nested.cached_vmcs12) 5150 goto out_cached_vmcs12; 5151 5152 vmx->nested.shadow_vmcs12_cache.gpa = INVALID_GPA; 5153 vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); 5154 if (!vmx->nested.cached_shadow_vmcs12) 5155 goto out_cached_shadow_vmcs12; 5156 5157 if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu)) 5158 goto out_shadow_vmcs; 5159 5160 hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, 5161 HRTIMER_MODE_ABS_PINNED); 5162 vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; 5163 5164 vmx->nested.vpid02 = allocate_vpid(); 5165 5166 vmx->nested.vmcs02_initialized = false; 5167 vmx->nested.vmxon = true; 5168 5169 if (vmx_pt_mode_is_host_guest()) { 5170 vmx->pt_desc.guest.ctl = 0; 5171 pt_update_intercept_for_msr(vcpu); 5172 } 5173 5174 return 0; 5175 5176 out_shadow_vmcs: 5177 kfree(vmx->nested.cached_shadow_vmcs12); 5178 5179 out_cached_shadow_vmcs12: 5180 kfree(vmx->nested.cached_vmcs12); 5181 5182 out_cached_vmcs12: 5183 free_loaded_vmcs(&vmx->nested.vmcs02); 5184 5185 out_vmcs02: 5186 return -ENOMEM; 5187 } 5188 5189 /* Emulate the VMXON instruction. */ 5190 static int handle_vmxon(struct kvm_vcpu *vcpu) 5191 { 5192 int ret; 5193 gpa_t vmptr; 5194 uint32_t revision; 5195 struct vcpu_vmx *vmx = to_vmx(vcpu); 5196 const u64 VMXON_NEEDED_FEATURES = FEAT_CTL_LOCKED 5197 | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; 5198 5199 /* 5200 * Manually check CR4.VMXE checks, KVM must force CR4.VMXE=1 to enter 5201 * the guest and so cannot rely on hardware to perform the check, 5202 * which has higher priority than VM-Exit (see Intel SDM's pseudocode 5203 * for VMXON). 5204 * 5205 * Rely on hardware for the other pre-VM-Exit checks, CR0.PE=1, !VM86 5206 * and !COMPATIBILITY modes. For an unrestricted guest, KVM doesn't 5207 * force any of the relevant guest state. For a restricted guest, KVM 5208 * does force CR0.PE=1, but only to also force VM86 in order to emulate 5209 * Real Mode, and so there's no need to check CR0.PE manually. 5210 */ 5211 if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_VMXE)) { 5212 kvm_queue_exception(vcpu, UD_VECTOR); 5213 return 1; 5214 } 5215 5216 /* 5217 * The CPL is checked for "not in VMX operation" and for "in VMX root", 5218 * and has higher priority than the VM-Fail due to being post-VMXON, 5219 * i.e. VMXON #GPs outside of VMX non-root if CPL!=0. 
In VMX non-root, 5220 * VMXON causes VM-Exit and KVM unconditionally forwards VMXON VM-Exits 5221 * from L2 to L1, i.e. there's no need to check for the vCPU being in 5222 * VMX non-root. 5223 * 5224 * Forwarding the VM-Exit unconditionally, i.e. without performing the 5225 * #UD checks (see above), is functionally ok because KVM doesn't allow 5226 * L1 to run L2 without CR4.VMXE=0, and because KVM never modifies L2's 5227 * CR0 or CR4, i.e. it's L2's responsibility to emulate #UDs that are 5228 * missed by hardware due to shadowing CR0 and/or CR4. 5229 */ 5230 if (vmx_get_cpl(vcpu)) { 5231 kvm_inject_gp(vcpu, 0); 5232 return 1; 5233 } 5234 5235 if (vmx->nested.vmxon) 5236 return nested_vmx_fail(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); 5237 5238 /* 5239 * Invalid CR0/CR4 generates #GP. These checks are performed if and 5240 * only if the vCPU isn't already in VMX operation, i.e. effectively 5241 * have lower priority than the VM-Fail above. 5242 */ 5243 if (!nested_host_cr0_valid(vcpu, kvm_read_cr0(vcpu)) || 5244 !nested_host_cr4_valid(vcpu, kvm_read_cr4(vcpu))) { 5245 kvm_inject_gp(vcpu, 0); 5246 return 1; 5247 } 5248 5249 if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES) 5250 != VMXON_NEEDED_FEATURES) { 5251 kvm_inject_gp(vcpu, 0); 5252 return 1; 5253 } 5254 5255 if (nested_vmx_get_vmptr(vcpu, &vmptr, &ret)) 5256 return ret; 5257 5258 /* 5259 * SDM 3: 24.11.5 5260 * The first 4 bytes of VMXON region contain the supported 5261 * VMCS revision identifier 5262 * 5263 * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case; 5264 * which replaces physical address width with 32 5265 */ 5266 if (!page_address_valid(vcpu, vmptr)) 5267 return nested_vmx_failInvalid(vcpu); 5268 5269 if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) || 5270 revision != VMCS12_REVISION) 5271 return nested_vmx_failInvalid(vcpu); 5272 5273 vmx->nested.vmxon_ptr = vmptr; 5274 ret = enter_vmx_operation(vcpu); 5275 if (ret) 5276 return ret; 5277 5278 return nested_vmx_succeed(vcpu); 5279 } 5280 5281 static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu) 5282 { 5283 struct vcpu_vmx *vmx = to_vmx(vcpu); 5284 5285 if (vmx->nested.current_vmptr == INVALID_GPA) 5286 return; 5287 5288 copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu)); 5289 5290 if (enable_shadow_vmcs) { 5291 /* copy to memory all shadowed fields in case 5292 they were modified */ 5293 copy_shadow_to_vmcs12(vmx); 5294 vmx_disable_shadow_vmcs(vmx); 5295 } 5296 vmx->nested.posted_intr_nv = -1; 5297 5298 /* Flush VMCS12 to guest memory */ 5299 kvm_vcpu_write_guest_page(vcpu, 5300 vmx->nested.current_vmptr >> PAGE_SHIFT, 5301 vmx->nested.cached_vmcs12, 0, VMCS12_SIZE); 5302 5303 kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); 5304 5305 vmx->nested.current_vmptr = INVALID_GPA; 5306 } 5307 5308 /* Emulate the VMXOFF instruction */ 5309 static int handle_vmxoff(struct kvm_vcpu *vcpu) 5310 { 5311 if (!nested_vmx_check_permission(vcpu)) 5312 return 1; 5313 5314 free_nested(vcpu); 5315 5316 if (kvm_apic_has_pending_init_or_sipi(vcpu)) 5317 kvm_make_request(KVM_REQ_EVENT, vcpu); 5318 5319 return nested_vmx_succeed(vcpu); 5320 } 5321 5322 /* Emulate the VMCLEAR instruction */ 5323 static int handle_vmclear(struct kvm_vcpu *vcpu) 5324 { 5325 struct vcpu_vmx *vmx = to_vmx(vcpu); 5326 u32 zero = 0; 5327 gpa_t vmptr; 5328 int r; 5329 5330 if (!nested_vmx_check_permission(vcpu)) 5331 return 1; 5332 5333 if (nested_vmx_get_vmptr(vcpu, &vmptr, &r)) 5334 return r; 5335 5336 if (!page_address_valid(vcpu, vmptr)) 
5337 return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS); 5338 5339 if (vmptr == vmx->nested.vmxon_ptr) 5340 return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_VMXON_POINTER); 5341 5342 if (likely(!nested_evmcs_handle_vmclear(vcpu, vmptr))) { 5343 if (vmptr == vmx->nested.current_vmptr) 5344 nested_release_vmcs12(vcpu); 5345 5346 /* 5347 * Silently ignore memory errors on VMCLEAR, Intel's pseudocode 5348 * for VMCLEAR includes a "ensure that data for VMCS referenced 5349 * by the operand is in memory" clause that guards writes to 5350 * memory, i.e. doing nothing for I/O is architecturally valid. 5351 * 5352 * FIXME: Suppress failures if and only if no memslot is found, 5353 * i.e. exit to userspace if __copy_to_user() fails. 5354 */ 5355 (void)kvm_vcpu_write_guest(vcpu, 5356 vmptr + offsetof(struct vmcs12, 5357 launch_state), 5358 &zero, sizeof(zero)); 5359 } 5360 5361 return nested_vmx_succeed(vcpu); 5362 } 5363 5364 /* Emulate the VMLAUNCH instruction */ 5365 static int handle_vmlaunch(struct kvm_vcpu *vcpu) 5366 { 5367 return nested_vmx_run(vcpu, true); 5368 } 5369 5370 /* Emulate the VMRESUME instruction */ 5371 static int handle_vmresume(struct kvm_vcpu *vcpu) 5372 { 5373 5374 return nested_vmx_run(vcpu, false); 5375 } 5376 5377 static int handle_vmread(struct kvm_vcpu *vcpu) 5378 { 5379 struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) 5380 : get_vmcs12(vcpu); 5381 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5382 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5383 struct vcpu_vmx *vmx = to_vmx(vcpu); 5384 struct x86_exception e; 5385 unsigned long field; 5386 u64 value; 5387 gva_t gva = 0; 5388 short offset; 5389 int len, r; 5390 5391 if (!nested_vmx_check_permission(vcpu)) 5392 return 1; 5393 5394 /* Decode instruction info and find the field to read */ 5395 field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf)); 5396 5397 if (!nested_vmx_is_evmptr12_valid(vmx)) { 5398 /* 5399 * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA, 5400 * any VMREAD sets the ALU flags for VMfailInvalid. 5401 */ 5402 if (vmx->nested.current_vmptr == INVALID_GPA || 5403 (is_guest_mode(vcpu) && 5404 get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA)) 5405 return nested_vmx_failInvalid(vcpu); 5406 5407 offset = get_vmcs12_field_offset(field); 5408 if (offset < 0) 5409 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 5410 5411 if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field)) 5412 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 5413 5414 /* Read the field, zero-extended to a u64 value */ 5415 value = vmcs12_read_any(vmcs12, field, offset); 5416 } else { 5417 /* 5418 * Hyper-V TLFS (as of 6.0b) explicitly states, that while an 5419 * enlightened VMCS is active VMREAD/VMWRITE instructions are 5420 * unsupported. Unfortunately, certain versions of Windows 11 5421 * don't comply with this requirement which is not enforced in 5422 * genuine Hyper-V. Allow VMREAD from an enlightened VMCS as a 5423 * workaround, as misbehaving guests will panic on VM-Fail. 5424 * Note, enlightened VMCS is incompatible with shadow VMCS so 5425 * all VMREADs from L2 should go to L1. 
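 * Hence the WARN below: without shadow VMCS, a VMREAD executed by L2 is always reflected to L1, so reaching this point while in guest mode should be impossible.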
5426 */ 5427 if (WARN_ON_ONCE(is_guest_mode(vcpu))) 5428 return nested_vmx_failInvalid(vcpu); 5429 5430 offset = evmcs_field_offset(field, NULL); 5431 if (offset < 0) 5432 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 5433 5434 /* Read the field, zero-extended to a u64 value */ 5435 value = evmcs_read_any(nested_vmx_evmcs(vmx), field, offset); 5436 } 5437 5438 /* 5439 * Now copy part of this value to register or memory, as requested. 5440 * Note that the number of bits actually copied is 32 or 64 depending 5441 * on the guest's mode (32 or 64 bit), not on the given field's length. 5442 */ 5443 if (instr_info & BIT(10)) { 5444 kvm_register_write(vcpu, (((instr_info) >> 3) & 0xf), value); 5445 } else { 5446 len = is_64_bit_mode(vcpu) ? 8 : 4; 5447 if (get_vmx_mem_address(vcpu, exit_qualification, 5448 instr_info, true, len, &gva)) 5449 return 1; 5450 /* _system ok, nested_vmx_check_permission has verified cpl=0 */ 5451 r = kvm_write_guest_virt_system(vcpu, gva, &value, len, &e); 5452 if (r != X86EMUL_CONTINUE) 5453 return kvm_handle_memory_failure(vcpu, r, &e); 5454 } 5455 5456 return nested_vmx_succeed(vcpu); 5457 } 5458 5459 static bool is_shadow_field_rw(unsigned long field) 5460 { 5461 switch (field) { 5462 #define SHADOW_FIELD_RW(x, y) case x: 5463 #include "vmcs_shadow_fields.h" 5464 return true; 5465 default: 5466 break; 5467 } 5468 return false; 5469 } 5470 5471 static bool is_shadow_field_ro(unsigned long field) 5472 { 5473 switch (field) { 5474 #define SHADOW_FIELD_RO(x, y) case x: 5475 #include "vmcs_shadow_fields.h" 5476 return true; 5477 default: 5478 break; 5479 } 5480 return false; 5481 } 5482 5483 static int handle_vmwrite(struct kvm_vcpu *vcpu) 5484 { 5485 struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) 5486 : get_vmcs12(vcpu); 5487 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5488 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5489 struct vcpu_vmx *vmx = to_vmx(vcpu); 5490 struct x86_exception e; 5491 unsigned long field; 5492 short offset; 5493 gva_t gva; 5494 int len, r; 5495 5496 /* 5497 * The value to write might be 32 or 64 bits, depending on L1's long 5498 * mode, and eventually we need to write that into a field of several 5499 * possible lengths. The code below first zero-extends the value to 64 5500 * bit (value), and then copies only the appropriate number of 5501 * bits into the vmcs12 field. 5502 */ 5503 u64 value = 0; 5504 5505 if (!nested_vmx_check_permission(vcpu)) 5506 return 1; 5507 5508 /* 5509 * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA, 5510 * any VMWRITE sets the ALU flags for VMfailInvalid. 5511 */ 5512 if (vmx->nested.current_vmptr == INVALID_GPA || 5513 (is_guest_mode(vcpu) && 5514 get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA)) 5515 return nested_vmx_failInvalid(vcpu); 5516 5517 if (instr_info & BIT(10)) 5518 value = kvm_register_read(vcpu, (((instr_info) >> 3) & 0xf)); 5519 else { 5520 len = is_64_bit_mode(vcpu) ? 
8 : 4; 5521 if (get_vmx_mem_address(vcpu, exit_qualification, 5522 instr_info, false, len, &gva)) 5523 return 1; 5524 r = kvm_read_guest_virt(vcpu, gva, &value, len, &e); 5525 if (r != X86EMUL_CONTINUE) 5526 return kvm_handle_memory_failure(vcpu, r, &e); 5527 } 5528 5529 field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf)); 5530 5531 offset = get_vmcs12_field_offset(field); 5532 if (offset < 0) 5533 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 5534 5535 /* 5536 * If the vCPU supports "VMWRITE to any supported field in the 5537 * VMCS," then the "read-only" fields are actually read/write. 5538 */ 5539 if (vmcs_field_readonly(field) && 5540 !nested_cpu_has_vmwrite_any_field(vcpu)) 5541 return nested_vmx_fail(vcpu, VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); 5542 5543 /* 5544 * Ensure vmcs12 is up-to-date before any VMWRITE that dirties 5545 * vmcs12, else we may crush a field or consume a stale value. 5546 */ 5547 if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) 5548 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 5549 5550 /* 5551 * Some Intel CPUs intentionally drop the reserved bits of the AR byte 5552 * fields on VMWRITE. Emulate this behavior to ensure consistent KVM 5553 * behavior regardless of the underlying hardware, e.g. if an AR_BYTE 5554 * field is intercepted for VMWRITE but not VMREAD (in L1), then VMREAD 5555 * from L1 will return a different value than VMREAD from L2 (L1 sees 5556 * the stripped down value, L2 sees the full value as stored by KVM). 5557 */ 5558 if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES) 5559 value &= 0x1f0ff; 5560 5561 vmcs12_write_any(vmcs12, field, offset, value); 5562 5563 /* 5564 * Do not track vmcs12 dirty-state if in guest-mode as we actually 5565 * dirty shadow vmcs12 instead of vmcs12. Fields that can be updated 5566 * by L1 without a vmexit are always updated in the vmcs02, i.e. don't 5567 * "dirty" vmcs12, all others go down the prepare_vmcs02() slow path. 5568 */ 5569 if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) { 5570 /* 5571 * L1 can read these fields without exiting, ensure the 5572 * shadow VMCS is up-to-date. 
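 * The update below temporarily makes vmcs01's shadow VMCS the current VMCS on this CPU, hence the preempt_disable()/preempt_enable() pair around it.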
5573 */ 5574 if (enable_shadow_vmcs && is_shadow_field_ro(field)) { 5575 preempt_disable(); 5576 vmcs_load(vmx->vmcs01.shadow_vmcs); 5577 5578 __vmcs_writel(field, value); 5579 5580 vmcs_clear(vmx->vmcs01.shadow_vmcs); 5581 vmcs_load(vmx->loaded_vmcs->vmcs); 5582 preempt_enable(); 5583 } 5584 vmx->nested.dirty_vmcs12 = true; 5585 } 5586 5587 return nested_vmx_succeed(vcpu); 5588 } 5589 5590 static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr) 5591 { 5592 vmx->nested.current_vmptr = vmptr; 5593 if (enable_shadow_vmcs) { 5594 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_SHADOW_VMCS); 5595 vmcs_write64(VMCS_LINK_POINTER, 5596 __pa(vmx->vmcs01.shadow_vmcs)); 5597 vmx->nested.need_vmcs12_to_shadow_sync = true; 5598 } 5599 vmx->nested.dirty_vmcs12 = true; 5600 vmx->nested.force_msr_bitmap_recalc = true; 5601 } 5602 5603 /* Emulate the VMPTRLD instruction */ 5604 static int handle_vmptrld(struct kvm_vcpu *vcpu) 5605 { 5606 struct vcpu_vmx *vmx = to_vmx(vcpu); 5607 gpa_t vmptr; 5608 int r; 5609 5610 if (!nested_vmx_check_permission(vcpu)) 5611 return 1; 5612 5613 if (nested_vmx_get_vmptr(vcpu, &vmptr, &r)) 5614 return r; 5615 5616 if (!page_address_valid(vcpu, vmptr)) 5617 return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS); 5618 5619 if (vmptr == vmx->nested.vmxon_ptr) 5620 return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_VMXON_POINTER); 5621 5622 /* Forbid normal VMPTRLD if Enlightened version was used */ 5623 if (nested_vmx_is_evmptr12_valid(vmx)) 5624 return 1; 5625 5626 if (vmx->nested.current_vmptr != vmptr) { 5627 struct gfn_to_hva_cache *ghc = &vmx->nested.vmcs12_cache; 5628 struct vmcs_hdr hdr; 5629 5630 if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, vmptr, VMCS12_SIZE)) { 5631 /* 5632 * Reads from an unbacked page return all 1s, 5633 * which means that the 32 bits located at the 5634 * given physical address won't match the required 5635 * VMCS12_REVISION identifier. 5636 */ 5637 return nested_vmx_fail(vcpu, 5638 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5639 } 5640 5641 if (kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr, 5642 offsetof(struct vmcs12, hdr), 5643 sizeof(hdr))) { 5644 return nested_vmx_fail(vcpu, 5645 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5646 } 5647 5648 if (hdr.revision_id != VMCS12_REVISION || 5649 (hdr.shadow_vmcs && 5650 !nested_cpu_has_vmx_shadow_vmcs(vcpu))) { 5651 return nested_vmx_fail(vcpu, 5652 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5653 } 5654 5655 nested_release_vmcs12(vcpu); 5656 5657 /* 5658 * Load VMCS12 from guest memory since it is not already 5659 * cached. 
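 * The read covers the full VMCS12_SIZE allocation rather than just sizeof(struct vmcs12), mirroring what nested_release_vmcs12() flushes back to guest memory.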
5660 */ 5661 if (kvm_read_guest_cached(vcpu->kvm, ghc, vmx->nested.cached_vmcs12, 5662 VMCS12_SIZE)) { 5663 return nested_vmx_fail(vcpu, 5664 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5665 } 5666 5667 set_current_vmptr(vmx, vmptr); 5668 } 5669 5670 return nested_vmx_succeed(vcpu); 5671 } 5672 5673 /* Emulate the VMPTRST instruction */ 5674 static int handle_vmptrst(struct kvm_vcpu *vcpu) 5675 { 5676 unsigned long exit_qual = vmx_get_exit_qual(vcpu); 5677 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5678 gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr; 5679 struct x86_exception e; 5680 gva_t gva; 5681 int r; 5682 5683 if (!nested_vmx_check_permission(vcpu)) 5684 return 1; 5685 5686 if (unlikely(nested_vmx_is_evmptr12_valid(to_vmx(vcpu)))) 5687 return 1; 5688 5689 if (get_vmx_mem_address(vcpu, exit_qual, instr_info, 5690 true, sizeof(gpa_t), &gva)) 5691 return 1; 5692 /* *_system ok, nested_vmx_check_permission has verified cpl=0 */ 5693 r = kvm_write_guest_virt_system(vcpu, gva, (void *)&current_vmptr, 5694 sizeof(gpa_t), &e); 5695 if (r != X86EMUL_CONTINUE) 5696 return kvm_handle_memory_failure(vcpu, r, &e); 5697 5698 return nested_vmx_succeed(vcpu); 5699 } 5700 5701 /* Emulate the INVEPT instruction */ 5702 static int handle_invept(struct kvm_vcpu *vcpu) 5703 { 5704 struct vcpu_vmx *vmx = to_vmx(vcpu); 5705 u32 vmx_instruction_info, types; 5706 unsigned long type, roots_to_free; 5707 struct kvm_mmu *mmu; 5708 gva_t gva; 5709 struct x86_exception e; 5710 struct { 5711 u64 eptp, gpa; 5712 } operand; 5713 int i, r, gpr_index; 5714 5715 if (!(vmx->nested.msrs.secondary_ctls_high & 5716 SECONDARY_EXEC_ENABLE_EPT) || 5717 !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) { 5718 kvm_queue_exception(vcpu, UD_VECTOR); 5719 return 1; 5720 } 5721 5722 if (!nested_vmx_check_permission(vcpu)) 5723 return 1; 5724 5725 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5726 gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); 5727 type = kvm_register_read(vcpu, gpr_index); 5728 5729 types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; 5730 5731 if (type >= 32 || !(types & (1 << type))) 5732 return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5733 5734 /* According to the Intel VMX instruction reference, the memory 5735 * operand is read even if it isn't needed (e.g., for type==global) 5736 */ 5737 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 5738 vmx_instruction_info, false, sizeof(operand), &gva)) 5739 return 1; 5740 r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); 5741 if (r != X86EMUL_CONTINUE) 5742 return kvm_handle_memory_failure(vcpu, r, &e); 5743 5744 /* 5745 * Nested EPT roots are always held through guest_mmu, 5746 * not root_mmu.
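 * (L1's own roots live in root_mmu and are not touched by the invalidation below.)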
5747 */ 5748 mmu = &vcpu->arch.guest_mmu; 5749 5750 switch (type) { 5751 case VMX_EPT_EXTENT_CONTEXT: 5752 if (!nested_vmx_check_eptp(vcpu, operand.eptp)) 5753 return nested_vmx_fail(vcpu, 5754 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5755 5756 roots_to_free = 0; 5757 if (nested_ept_root_matches(mmu->root.hpa, mmu->root.pgd, 5758 operand.eptp)) 5759 roots_to_free |= KVM_MMU_ROOT_CURRENT; 5760 5761 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { 5762 if (nested_ept_root_matches(mmu->prev_roots[i].hpa, 5763 mmu->prev_roots[i].pgd, 5764 operand.eptp)) 5765 roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); 5766 } 5767 break; 5768 case VMX_EPT_EXTENT_GLOBAL: 5769 roots_to_free = KVM_MMU_ROOTS_ALL; 5770 break; 5771 default: 5772 BUG(); 5773 break; 5774 } 5775 5776 if (roots_to_free) 5777 kvm_mmu_free_roots(vcpu->kvm, mmu, roots_to_free); 5778 5779 return nested_vmx_succeed(vcpu); 5780 } 5781 5782 static int handle_invvpid(struct kvm_vcpu *vcpu) 5783 { 5784 struct vcpu_vmx *vmx = to_vmx(vcpu); 5785 u32 vmx_instruction_info; 5786 unsigned long type, types; 5787 gva_t gva; 5788 struct x86_exception e; 5789 struct { 5790 u64 vpid; 5791 u64 gla; 5792 } operand; 5793 u16 vpid02; 5794 int r, gpr_index; 5795 5796 if (!(vmx->nested.msrs.secondary_ctls_high & 5797 SECONDARY_EXEC_ENABLE_VPID) || 5798 !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) { 5799 kvm_queue_exception(vcpu, UD_VECTOR); 5800 return 1; 5801 } 5802 5803 if (!nested_vmx_check_permission(vcpu)) 5804 return 1; 5805 5806 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5807 gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); 5808 type = kvm_register_read(vcpu, gpr_index); 5809 5810 types = (vmx->nested.msrs.vpid_caps & 5811 VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8; 5812 5813 if (type >= 32 || !(types & (1 << type))) 5814 return nested_vmx_fail(vcpu, 5815 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5816 5817 /* according to the intel vmx instruction reference, the memory 5818 * operand is read even if it isn't needed (e.g., for type==global) 5819 */ 5820 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 5821 vmx_instruction_info, false, sizeof(operand), &gva)) 5822 return 1; 5823 r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); 5824 if (r != X86EMUL_CONTINUE) 5825 return kvm_handle_memory_failure(vcpu, r, &e); 5826 5827 if (operand.vpid >> 16) 5828 return nested_vmx_fail(vcpu, 5829 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5830 5831 vpid02 = nested_get_vpid02(vcpu); 5832 switch (type) { 5833 case VMX_VPID_EXTENT_INDIVIDUAL_ADDR: 5834 /* 5835 * LAM doesn't apply to addresses that are inputs to TLB 5836 * invalidation. 5837 */ 5838 if (!operand.vpid || 5839 is_noncanonical_address(operand.gla, vcpu)) 5840 return nested_vmx_fail(vcpu, 5841 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5842 vpid_sync_vcpu_addr(vpid02, operand.gla); 5843 break; 5844 case VMX_VPID_EXTENT_SINGLE_CONTEXT: 5845 case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL: 5846 if (!operand.vpid) 5847 return nested_vmx_fail(vcpu, 5848 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5849 vpid_sync_context(vpid02); 5850 break; 5851 case VMX_VPID_EXTENT_ALL_CONTEXT: 5852 vpid_sync_context(vpid02); 5853 break; 5854 default: 5855 WARN_ON_ONCE(1); 5856 return kvm_skip_emulated_instruction(vcpu); 5857 } 5858 5859 /* 5860 * Sync the shadow page tables if EPT is disabled, L1 is invalidating 5861 * linear mappings for L2 (tagged with L2's VPID). Free all guest 5862 * roots as VPIDs are not tracked in the MMU role. 
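 * Freeing the roots forces the shadow page tables to be rebuilt on the next L2 entry, which drops the translations the INVVPID was meant to purge.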
5863 * 5864 * Note, this operates on root_mmu, not guest_mmu, as L1 and L2 share 5865 * an MMU when EPT is disabled. 5866 * 5867 * TODO: sync only the affected SPTEs for INVDIVIDUAL_ADDR. 5868 */ 5869 if (!enable_ept) 5870 kvm_mmu_free_guest_mode_roots(vcpu->kvm, &vcpu->arch.root_mmu); 5871 5872 return nested_vmx_succeed(vcpu); 5873 } 5874 5875 static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu, 5876 struct vmcs12 *vmcs12) 5877 { 5878 u32 index = kvm_rcx_read(vcpu); 5879 u64 new_eptp; 5880 5881 if (WARN_ON_ONCE(!nested_cpu_has_ept(vmcs12))) 5882 return 1; 5883 if (index >= VMFUNC_EPTP_ENTRIES) 5884 return 1; 5885 5886 if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT, 5887 &new_eptp, index * 8, 8)) 5888 return 1; 5889 5890 /* 5891 * If the (L2) guest does a vmfunc to the currently 5892 * active ept pointer, we don't have to do anything else 5893 */ 5894 if (vmcs12->ept_pointer != new_eptp) { 5895 if (!nested_vmx_check_eptp(vcpu, new_eptp)) 5896 return 1; 5897 5898 vmcs12->ept_pointer = new_eptp; 5899 nested_ept_new_eptp(vcpu); 5900 5901 if (!nested_cpu_has_vpid(vmcs12)) 5902 kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); 5903 } 5904 5905 return 0; 5906 } 5907 5908 static int handle_vmfunc(struct kvm_vcpu *vcpu) 5909 { 5910 struct vcpu_vmx *vmx = to_vmx(vcpu); 5911 struct vmcs12 *vmcs12; 5912 u32 function = kvm_rax_read(vcpu); 5913 5914 /* 5915 * VMFUNC should never execute cleanly while L1 is active; KVM supports 5916 * VMFUNC for nested VMs, but not for L1. 5917 */ 5918 if (WARN_ON_ONCE(!is_guest_mode(vcpu))) { 5919 kvm_queue_exception(vcpu, UD_VECTOR); 5920 return 1; 5921 } 5922 5923 vmcs12 = get_vmcs12(vcpu); 5924 5925 /* 5926 * #UD on out-of-bounds function has priority over VM-Exit, and VMFUNC 5927 * is enabled in vmcs02 if and only if it's enabled in vmcs12. 5928 */ 5929 if (WARN_ON_ONCE((function > 63) || !nested_cpu_has_vmfunc(vmcs12))) { 5930 kvm_queue_exception(vcpu, UD_VECTOR); 5931 return 1; 5932 } 5933 5934 if (!(vmcs12->vm_function_control & BIT_ULL(function))) 5935 goto fail; 5936 5937 switch (function) { 5938 case 0: 5939 if (nested_vmx_eptp_switching(vcpu, vmcs12)) 5940 goto fail; 5941 break; 5942 default: 5943 goto fail; 5944 } 5945 return kvm_skip_emulated_instruction(vcpu); 5946 5947 fail: 5948 /* 5949 * This is effectively a reflected VM-Exit, as opposed to a synthesized 5950 * nested VM-Exit. Pass the original exit reason, i.e. don't hardcode 5951 * EXIT_REASON_VMFUNC as the exit reason. 5952 */ 5953 nested_vmx_vmexit(vcpu, vmx->exit_reason.full, 5954 vmx_get_intr_info(vcpu), 5955 vmx_get_exit_qual(vcpu)); 5956 return 1; 5957 } 5958 5959 /* 5960 * Return true if an IO instruction with the specified port and size should cause 5961 * a VM-exit into L1. 
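 * The bitmaps follow the architectural layout: bitmap A covers ports 0x0000-0x7fff and bitmap B covers ports 0x8000-0xffff, one bit per port, so a multi-byte access is intercepted if the bit for any byte of the access is set.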
5962 */ 5963 bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port, 5964 int size) 5965 { 5966 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 5967 gpa_t bitmap, last_bitmap; 5968 u8 b; 5969 5970 last_bitmap = INVALID_GPA; 5971 b = -1; 5972 5973 while (size > 0) { 5974 if (port < 0x8000) 5975 bitmap = vmcs12->io_bitmap_a; 5976 else if (port < 0x10000) 5977 bitmap = vmcs12->io_bitmap_b; 5978 else 5979 return true; 5980 bitmap += (port & 0x7fff) / 8; 5981 5982 if (last_bitmap != bitmap) 5983 if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1)) 5984 return true; 5985 if (b & (1 << (port & 7))) 5986 return true; 5987 5988 port++; 5989 size--; 5990 last_bitmap = bitmap; 5991 } 5992 5993 return false; 5994 } 5995 5996 static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, 5997 struct vmcs12 *vmcs12) 5998 { 5999 unsigned long exit_qualification; 6000 unsigned short port; 6001 int size; 6002 6003 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) 6004 return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); 6005 6006 exit_qualification = vmx_get_exit_qual(vcpu); 6007 6008 port = exit_qualification >> 16; 6009 size = (exit_qualification & 7) + 1; 6010 6011 return nested_vmx_check_io_bitmaps(vcpu, port, size); 6012 } 6013 6014 /* 6015 * Return 1 if we should exit from L2 to L1 to handle an MSR access, 6016 * rather than handle it ourselves in L0. I.e., check whether L1 expressed 6017 * disinterest in the current event (read or write a specific MSR) by using an 6018 * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps. 6019 */ 6020 static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, 6021 struct vmcs12 *vmcs12, 6022 union vmx_exit_reason exit_reason) 6023 { 6024 u32 msr_index = kvm_rcx_read(vcpu); 6025 gpa_t bitmap; 6026 6027 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) 6028 return true; 6029 6030 /* 6031 * The MSR_BITMAP page is divided into four 1024-byte bitmaps, 6032 * for the four combinations of read/write and low/high MSR numbers. 6033 * First we need to figure out which of the four to use: 6034 */ 6035 bitmap = vmcs12->msr_bitmap; 6036 if (exit_reason.basic == EXIT_REASON_MSR_WRITE) 6037 bitmap += 2048; 6038 if (msr_index >= 0xc0000000) { 6039 msr_index -= 0xc0000000; 6040 bitmap += 1024; 6041 } 6042 6043 /* Then read the msr_index'th bit from this bitmap: */ 6044 if (msr_index < 1024*8) { 6045 unsigned char b; 6046 if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1)) 6047 return true; 6048 return 1 & (b >> (msr_index & 7)); 6049 } else 6050 return true; /* let L1 handle the wrong parameter */ 6051 } 6052 6053 /* 6054 * Return 1 if we should exit from L2 to L1 to handle a CR access exit, 6055 * rather than handle it ourselves in L0. I.e., check if L1 wanted to 6056 * intercept (via guest_host_mask etc.) the current event. 
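 * For MOV-to-CR0/CR4 this means the exit is reflected only when the write would change a bit L1 owns via cr0/cr4_guest_host_mask, i.e. when the new value differs from the read shadow for an owned bit.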
6057 */ 6058 static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, 6059 struct vmcs12 *vmcs12) 6060 { 6061 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 6062 int cr = exit_qualification & 15; 6063 int reg; 6064 unsigned long val; 6065 6066 switch ((exit_qualification >> 4) & 3) { 6067 case 0: /* mov to cr */ 6068 reg = (exit_qualification >> 8) & 15; 6069 val = kvm_register_read(vcpu, reg); 6070 switch (cr) { 6071 case 0: 6072 if (vmcs12->cr0_guest_host_mask & 6073 (val ^ vmcs12->cr0_read_shadow)) 6074 return true; 6075 break; 6076 case 3: 6077 if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING)) 6078 return true; 6079 break; 6080 case 4: 6081 if (vmcs12->cr4_guest_host_mask & 6082 (vmcs12->cr4_read_shadow ^ val)) 6083 return true; 6084 break; 6085 case 8: 6086 if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING)) 6087 return true; 6088 break; 6089 } 6090 break; 6091 case 2: /* clts */ 6092 if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) && 6093 (vmcs12->cr0_read_shadow & X86_CR0_TS)) 6094 return true; 6095 break; 6096 case 1: /* mov from cr */ 6097 switch (cr) { 6098 case 3: 6099 if (vmcs12->cpu_based_vm_exec_control & 6100 CPU_BASED_CR3_STORE_EXITING) 6101 return true; 6102 break; 6103 case 8: 6104 if (vmcs12->cpu_based_vm_exec_control & 6105 CPU_BASED_CR8_STORE_EXITING) 6106 return true; 6107 break; 6108 } 6109 break; 6110 case 3: /* lmsw */ 6111 /* 6112 * lmsw can change bits 1..3 of cr0, and only set bit 0 of 6113 * cr0. Other attempted changes are ignored, with no exit. 6114 */ 6115 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; 6116 if (vmcs12->cr0_guest_host_mask & 0xe & 6117 (val ^ vmcs12->cr0_read_shadow)) 6118 return true; 6119 if ((vmcs12->cr0_guest_host_mask & 0x1) && 6120 !(vmcs12->cr0_read_shadow & 0x1) && 6121 (val & 0x1)) 6122 return true; 6123 break; 6124 } 6125 return false; 6126 } 6127 6128 static bool nested_vmx_exit_handled_encls(struct kvm_vcpu *vcpu, 6129 struct vmcs12 *vmcs12) 6130 { 6131 u32 encls_leaf; 6132 6133 if (!guest_cpuid_has(vcpu, X86_FEATURE_SGX) || 6134 !nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING)) 6135 return false; 6136 6137 encls_leaf = kvm_rax_read(vcpu); 6138 if (encls_leaf > 62) 6139 encls_leaf = 63; 6140 return vmcs12->encls_exiting_bitmap & BIT_ULL(encls_leaf); 6141 } 6142 6143 static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu, 6144 struct vmcs12 *vmcs12, gpa_t bitmap) 6145 { 6146 u32 vmx_instruction_info; 6147 unsigned long field; 6148 u8 b; 6149 6150 if (!nested_cpu_has_shadow_vmcs(vmcs12)) 6151 return true; 6152 6153 /* Decode instruction info and find the field to access */ 6154 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 6155 field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); 6156 6157 /* Out-of-range fields always cause a VM exit from L2 to L1 */ 6158 if (field >> 15) 6159 return true; 6160 6161 if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1)) 6162 return true; 6163 6164 return 1 & (b >> (field & 7)); 6165 } 6166 6167 static bool nested_vmx_exit_handled_mtf(struct vmcs12 *vmcs12) 6168 { 6169 u32 entry_intr_info = vmcs12->vm_entry_intr_info_field; 6170 6171 if (nested_cpu_has_mtf(vmcs12)) 6172 return true; 6173 6174 /* 6175 * An MTF VM-exit may be injected into the guest by setting the 6176 * interruption-type to 7 (other event) and the vector field to 0. Such 6177 * is the case regardless of the 'monitor trap flag' VM-execution 6178 * control. 
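 * The comparison below therefore requires an exact match: a valid event of type 'other event' with vector 0 and no other bits set.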
6179 */ 6180 return entry_intr_info == (INTR_INFO_VALID_MASK 6181 | INTR_TYPE_OTHER_EVENT); 6182 } 6183 6184 /* 6185 * Return true if L0 wants to handle an exit from L2 regardless of whether or not 6186 * L1 wants the exit. Only call this when in is_guest_mode (L2). 6187 */ 6188 static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, 6189 union vmx_exit_reason exit_reason) 6190 { 6191 u32 intr_info; 6192 6193 switch ((u16)exit_reason.basic) { 6194 case EXIT_REASON_EXCEPTION_NMI: 6195 intr_info = vmx_get_intr_info(vcpu); 6196 if (is_nmi(intr_info)) 6197 return true; 6198 else if (is_page_fault(intr_info)) 6199 return vcpu->arch.apf.host_apf_flags || 6200 vmx_need_pf_intercept(vcpu); 6201 else if (is_debug(intr_info) && 6202 vcpu->guest_debug & 6203 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) 6204 return true; 6205 else if (is_breakpoint(intr_info) && 6206 vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) 6207 return true; 6208 else if (is_alignment_check(intr_info) && 6209 !vmx_guest_inject_ac(vcpu)) 6210 return true; 6211 return false; 6212 case EXIT_REASON_EXTERNAL_INTERRUPT: 6213 return true; 6214 case EXIT_REASON_MCE_DURING_VMENTRY: 6215 return true; 6216 case EXIT_REASON_EPT_VIOLATION: 6217 /* 6218 * L0 always deals with the EPT violation. If nested EPT is 6219 * used, and the nested mmu code discovers that the address is 6220 * missing in the guest EPT table (EPT12), the EPT violation 6221 * will be injected with nested_ept_inject_page_fault() 6222 */ 6223 return true; 6224 case EXIT_REASON_EPT_MISCONFIG: 6225 /* 6226 * L2 never uses directly L1's EPT, but rather L0's own EPT 6227 * table (shadow on EPT) or a merged EPT table that L0 built 6228 * (EPT on EPT). So any problems with the structure of the 6229 * table is L0's fault. 6230 */ 6231 return true; 6232 case EXIT_REASON_PREEMPTION_TIMER: 6233 return true; 6234 case EXIT_REASON_PML_FULL: 6235 /* 6236 * PML is emulated for an L1 VMM and should never be enabled in 6237 * vmcs02, always "handle" PML_FULL by exiting to userspace. 6238 */ 6239 return true; 6240 case EXIT_REASON_VMFUNC: 6241 /* VM functions are emulated through L2->L0 vmexits. */ 6242 return true; 6243 case EXIT_REASON_BUS_LOCK: 6244 /* 6245 * At present, bus lock VM exit is never exposed to L1. 6246 * Handle L2's bus locks in L0 directly. 6247 */ 6248 return true; 6249 #ifdef CONFIG_KVM_HYPERV 6250 case EXIT_REASON_VMCALL: 6251 /* Hyper-V L2 TLB flush hypercall is handled by L0 */ 6252 return guest_hv_cpuid_has_l2_tlb_flush(vcpu) && 6253 nested_evmcs_l2_tlb_flush_enabled(vcpu) && 6254 kvm_hv_is_tlb_flush_hcall(vcpu); 6255 #endif 6256 default: 6257 break; 6258 } 6259 return false; 6260 } 6261 6262 /* 6263 * Return 1 if L1 wants to intercept an exit from L2. Only call this when in 6264 * is_guest_mode (L2). 
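 * In the reflection path this runs only after nested_vmx_l0_wants_exit() has declined the exit; returning false here means the exit is handled by L0.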
6265 */ 6266 static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, 6267 union vmx_exit_reason exit_reason) 6268 { 6269 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 6270 u32 intr_info; 6271 6272 switch ((u16)exit_reason.basic) { 6273 case EXIT_REASON_EXCEPTION_NMI: 6274 intr_info = vmx_get_intr_info(vcpu); 6275 if (is_nmi(intr_info)) 6276 return true; 6277 else if (is_page_fault(intr_info)) 6278 return true; 6279 return vmcs12->exception_bitmap & 6280 (1u << (intr_info & INTR_INFO_VECTOR_MASK)); 6281 case EXIT_REASON_EXTERNAL_INTERRUPT: 6282 return nested_exit_on_intr(vcpu); 6283 case EXIT_REASON_TRIPLE_FAULT: 6284 return true; 6285 case EXIT_REASON_INTERRUPT_WINDOW: 6286 return nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING); 6287 case EXIT_REASON_NMI_WINDOW: 6288 return nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING); 6289 case EXIT_REASON_TASK_SWITCH: 6290 return true; 6291 case EXIT_REASON_CPUID: 6292 return true; 6293 case EXIT_REASON_HLT: 6294 return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING); 6295 case EXIT_REASON_INVD: 6296 return true; 6297 case EXIT_REASON_INVLPG: 6298 return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); 6299 case EXIT_REASON_RDPMC: 6300 return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING); 6301 case EXIT_REASON_RDRAND: 6302 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING); 6303 case EXIT_REASON_RDSEED: 6304 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING); 6305 case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP: 6306 return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING); 6307 case EXIT_REASON_VMREAD: 6308 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, 6309 vmcs12->vmread_bitmap); 6310 case EXIT_REASON_VMWRITE: 6311 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, 6312 vmcs12->vmwrite_bitmap); 6313 case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: 6314 case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD: 6315 case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME: 6316 case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: 6317 case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID: 6318 /* 6319 * VMX instructions trap unconditionally. This allows L1 to 6320 * emulate them for its L2 guest, i.e., allows 3-level nesting! 
6321 */ 6322 return true; 6323 case EXIT_REASON_CR_ACCESS: 6324 return nested_vmx_exit_handled_cr(vcpu, vmcs12); 6325 case EXIT_REASON_DR_ACCESS: 6326 return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING); 6327 case EXIT_REASON_IO_INSTRUCTION: 6328 return nested_vmx_exit_handled_io(vcpu, vmcs12); 6329 case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR: 6330 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC); 6331 case EXIT_REASON_MSR_READ: 6332 case EXIT_REASON_MSR_WRITE: 6333 return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason); 6334 case EXIT_REASON_INVALID_STATE: 6335 return true; 6336 case EXIT_REASON_MWAIT_INSTRUCTION: 6337 return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING); 6338 case EXIT_REASON_MONITOR_TRAP_FLAG: 6339 return nested_vmx_exit_handled_mtf(vmcs12); 6340 case EXIT_REASON_MONITOR_INSTRUCTION: 6341 return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING); 6342 case EXIT_REASON_PAUSE_INSTRUCTION: 6343 return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) || 6344 nested_cpu_has2(vmcs12, 6345 SECONDARY_EXEC_PAUSE_LOOP_EXITING); 6346 case EXIT_REASON_MCE_DURING_VMENTRY: 6347 return true; 6348 case EXIT_REASON_TPR_BELOW_THRESHOLD: 6349 return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW); 6350 case EXIT_REASON_APIC_ACCESS: 6351 case EXIT_REASON_APIC_WRITE: 6352 case EXIT_REASON_EOI_INDUCED: 6353 /* 6354 * The controls for "virtualize APIC accesses," "APIC- 6355 * register virtualization," and "virtual-interrupt 6356 * delivery" only come from vmcs12. 6357 */ 6358 return true; 6359 case EXIT_REASON_INVPCID: 6360 return 6361 nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) && 6362 nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); 6363 case EXIT_REASON_WBINVD: 6364 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); 6365 case EXIT_REASON_XSETBV: 6366 return true; 6367 case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS: 6368 /* 6369 * This should never happen, since it is not possible to 6370 * set XSS to a non-zero value---neither in L1 nor in L2. 6371 * If if it were, XSS would have to be checked against 6372 * the XSS exit bitmap in vmcs12. 6373 */ 6374 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_XSAVES); 6375 case EXIT_REASON_UMWAIT: 6376 case EXIT_REASON_TPAUSE: 6377 return nested_cpu_has2(vmcs12, 6378 SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE); 6379 case EXIT_REASON_ENCLS: 6380 return nested_vmx_exit_handled_encls(vcpu, vmcs12); 6381 case EXIT_REASON_NOTIFY: 6382 /* Notify VM exit is not exposed to L1 */ 6383 return false; 6384 default: 6385 return true; 6386 } 6387 } 6388 6389 /* 6390 * Conditionally reflect a VM-Exit into L1. Returns %true if the VM-Exit was 6391 * reflected into L1. 6392 */ 6393 bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu) 6394 { 6395 struct vcpu_vmx *vmx = to_vmx(vcpu); 6396 union vmx_exit_reason exit_reason = vmx->exit_reason; 6397 unsigned long exit_qual; 6398 u32 exit_intr_info; 6399 6400 WARN_ON_ONCE(vmx->nested.nested_run_pending); 6401 6402 /* 6403 * Late nested VM-Fail shares the same flow as nested VM-Exit since KVM 6404 * has already loaded L2's state. 6405 */ 6406 if (unlikely(vmx->fail)) { 6407 trace_kvm_nested_vmenter_failed( 6408 "hardware VM-instruction error: ", 6409 vmcs_read32(VM_INSTRUCTION_ERROR)); 6410 exit_intr_info = 0; 6411 exit_qual = 0; 6412 goto reflect_vmexit; 6413 } 6414 6415 trace_kvm_nested_vmexit(vcpu, KVM_ISA_VMX); 6416 6417 /* If L0 (KVM) wants the exit, it trumps L1's desires. 
*/ 6418 if (nested_vmx_l0_wants_exit(vcpu, exit_reason)) 6419 return false; 6420 6421 /* If L1 doesn't want the exit, handle it in L0. */ 6422 if (!nested_vmx_l1_wants_exit(vcpu, exit_reason)) 6423 return false; 6424 6425 /* 6426 * vmcs.VM_EXIT_INTR_INFO is only valid for EXCEPTION_NMI exits. For 6427 * EXTERNAL_INTERRUPT, the value for vmcs12->vm_exit_intr_info would 6428 * need to be synthesized by querying the in-kernel LAPIC, but external 6429 * interrupts are never reflected to L1 so it's a non-issue. 6430 */ 6431 exit_intr_info = vmx_get_intr_info(vcpu); 6432 if (is_exception_with_error_code(exit_intr_info)) { 6433 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 6434 6435 vmcs12->vm_exit_intr_error_code = 6436 vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 6437 } 6438 exit_qual = vmx_get_exit_qual(vcpu); 6439 6440 reflect_vmexit: 6441 nested_vmx_vmexit(vcpu, exit_reason.full, exit_intr_info, exit_qual); 6442 return true; 6443 } 6444 6445 static int vmx_get_nested_state(struct kvm_vcpu *vcpu, 6446 struct kvm_nested_state __user *user_kvm_nested_state, 6447 u32 user_data_size) 6448 { 6449 struct vcpu_vmx *vmx; 6450 struct vmcs12 *vmcs12; 6451 struct kvm_nested_state kvm_state = { 6452 .flags = 0, 6453 .format = KVM_STATE_NESTED_FORMAT_VMX, 6454 .size = sizeof(kvm_state), 6455 .hdr.vmx.flags = 0, 6456 .hdr.vmx.vmxon_pa = INVALID_GPA, 6457 .hdr.vmx.vmcs12_pa = INVALID_GPA, 6458 .hdr.vmx.preemption_timer_deadline = 0, 6459 }; 6460 struct kvm_vmx_nested_state_data __user *user_vmx_nested_state = 6461 &user_kvm_nested_state->data.vmx[0]; 6462 6463 if (!vcpu) 6464 return kvm_state.size + sizeof(*user_vmx_nested_state); 6465 6466 vmx = to_vmx(vcpu); 6467 vmcs12 = get_vmcs12(vcpu); 6468 6469 if (guest_can_use(vcpu, X86_FEATURE_VMX) && 6470 (vmx->nested.vmxon || vmx->nested.smm.vmxon)) { 6471 kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr; 6472 kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr; 6473 6474 if (vmx_has_valid_vmcs12(vcpu)) { 6475 kvm_state.size += sizeof(user_vmx_nested_state->vmcs12); 6476 6477 /* 'hv_evmcs_vmptr' can also be EVMPTR_MAP_PENDING here */ 6478 if (nested_vmx_is_evmptr12_set(vmx)) 6479 kvm_state.flags |= KVM_STATE_NESTED_EVMCS; 6480 6481 if (is_guest_mode(vcpu) && 6482 nested_cpu_has_shadow_vmcs(vmcs12) && 6483 vmcs12->vmcs_link_pointer != INVALID_GPA) 6484 kvm_state.size += sizeof(user_vmx_nested_state->shadow_vmcs12); 6485 } 6486 6487 if (vmx->nested.smm.vmxon) 6488 kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON; 6489 6490 if (vmx->nested.smm.guest_mode) 6491 kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE; 6492 6493 if (is_guest_mode(vcpu)) { 6494 kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE; 6495 6496 if (vmx->nested.nested_run_pending) 6497 kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING; 6498 6499 if (vmx->nested.mtf_pending) 6500 kvm_state.flags |= KVM_STATE_NESTED_MTF_PENDING; 6501 6502 if (nested_cpu_has_preemption_timer(vmcs12) && 6503 vmx->nested.has_preemption_timer_deadline) { 6504 kvm_state.hdr.vmx.flags |= 6505 KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE; 6506 kvm_state.hdr.vmx.preemption_timer_deadline = 6507 vmx->nested.preemption_timer_deadline; 6508 } 6509 } 6510 } 6511 6512 if (user_data_size < kvm_state.size) 6513 goto out; 6514 6515 if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state))) 6516 return -EFAULT; 6517 6518 if (!vmx_has_valid_vmcs12(vcpu)) 6519 goto out; 6520 6521 /* 6522 * When running L2, the authoritative vmcs12 state is in the 6523 * vmcs02. 
When running L1, the authoritative vmcs12 state is 6524 * in the shadow or enlightened vmcs linked to vmcs01, unless 6525 * need_vmcs12_to_shadow_sync is set, in which case, the authoritative 6526 * vmcs12 state is in the vmcs12 already. 6527 */ 6528 if (is_guest_mode(vcpu)) { 6529 sync_vmcs02_to_vmcs12(vcpu, vmcs12); 6530 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 6531 } else { 6532 copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu)); 6533 if (!vmx->nested.need_vmcs12_to_shadow_sync) { 6534 if (nested_vmx_is_evmptr12_valid(vmx)) 6535 /* 6536 * L1 hypervisor is not obliged to keep eVMCS 6537 * clean fields data always up-to-date while 6538 * not in guest mode, 'hv_clean_fields' is only 6539 * supposed to be actual upon vmentry so we need 6540 * to ignore it here and do full copy. 6541 */ 6542 copy_enlightened_to_vmcs12(vmx, 0); 6543 else if (enable_shadow_vmcs) 6544 copy_shadow_to_vmcs12(vmx); 6545 } 6546 } 6547 6548 BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE); 6549 BUILD_BUG_ON(sizeof(user_vmx_nested_state->shadow_vmcs12) < VMCS12_SIZE); 6550 6551 /* 6552 * Copy over the full allocated size of vmcs12 rather than just the size 6553 * of the struct. 6554 */ 6555 if (copy_to_user(user_vmx_nested_state->vmcs12, vmcs12, VMCS12_SIZE)) 6556 return -EFAULT; 6557 6558 if (nested_cpu_has_shadow_vmcs(vmcs12) && 6559 vmcs12->vmcs_link_pointer != INVALID_GPA) { 6560 if (copy_to_user(user_vmx_nested_state->shadow_vmcs12, 6561 get_shadow_vmcs12(vcpu), VMCS12_SIZE)) 6562 return -EFAULT; 6563 } 6564 out: 6565 return kvm_state.size; 6566 } 6567 6568 void vmx_leave_nested(struct kvm_vcpu *vcpu) 6569 { 6570 if (is_guest_mode(vcpu)) { 6571 to_vmx(vcpu)->nested.nested_run_pending = 0; 6572 nested_vmx_vmexit(vcpu, -1, 0, 0); 6573 } 6574 free_nested(vcpu); 6575 } 6576 6577 static int vmx_set_nested_state(struct kvm_vcpu *vcpu, 6578 struct kvm_nested_state __user *user_kvm_nested_state, 6579 struct kvm_nested_state *kvm_state) 6580 { 6581 struct vcpu_vmx *vmx = to_vmx(vcpu); 6582 struct vmcs12 *vmcs12; 6583 enum vm_entry_failure_code ignored; 6584 struct kvm_vmx_nested_state_data __user *user_vmx_nested_state = 6585 &user_kvm_nested_state->data.vmx[0]; 6586 int ret; 6587 6588 if (kvm_state->format != KVM_STATE_NESTED_FORMAT_VMX) 6589 return -EINVAL; 6590 6591 if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA) { 6592 if (kvm_state->hdr.vmx.smm.flags) 6593 return -EINVAL; 6594 6595 if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA) 6596 return -EINVAL; 6597 6598 /* 6599 * KVM_STATE_NESTED_EVMCS used to signal that KVM should 6600 * enable eVMCS capability on vCPU. However, since then 6601 * code was changed such that flag signals vmcs12 should 6602 * be copied into eVMCS in guest memory. 6603 * 6604 * To preserve backwards compatibility, allow user 6605 * to set this flag even when there is no VMXON region. 
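 * In that case the flag is effectively ignored; any other flag is still rejected below.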
6606 */ 6607 if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS) 6608 return -EINVAL; 6609 } else { 6610 if (!guest_can_use(vcpu, X86_FEATURE_VMX)) 6611 return -EINVAL; 6612 6613 if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa)) 6614 return -EINVAL; 6615 } 6616 6617 if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && 6618 (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) 6619 return -EINVAL; 6620 6621 if (kvm_state->hdr.vmx.smm.flags & 6622 ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON)) 6623 return -EINVAL; 6624 6625 if (kvm_state->hdr.vmx.flags & ~KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) 6626 return -EINVAL; 6627 6628 /* 6629 * SMM temporarily disables VMX, so we cannot be in guest mode, 6630 * nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags 6631 * must be zero. 6632 */ 6633 if (is_smm(vcpu) ? 6634 (kvm_state->flags & 6635 (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING)) 6636 : kvm_state->hdr.vmx.smm.flags) 6637 return -EINVAL; 6638 6639 if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && 6640 !(kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON)) 6641 return -EINVAL; 6642 6643 if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) && 6644 (!guest_can_use(vcpu, X86_FEATURE_VMX) || 6645 !vmx->nested.enlightened_vmcs_enabled)) 6646 return -EINVAL; 6647 6648 vmx_leave_nested(vcpu); 6649 6650 if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA) 6651 return 0; 6652 6653 vmx->nested.vmxon_ptr = kvm_state->hdr.vmx.vmxon_pa; 6654 ret = enter_vmx_operation(vcpu); 6655 if (ret) 6656 return ret; 6657 6658 /* Empty 'VMXON' state is permitted if no VMCS loaded */ 6659 if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12)) { 6660 /* See vmx_has_valid_vmcs12. */ 6661 if ((kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE) || 6662 (kvm_state->flags & KVM_STATE_NESTED_EVMCS) || 6663 (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA)) 6664 return -EINVAL; 6665 else 6666 return 0; 6667 } 6668 6669 if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA) { 6670 if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa || 6671 !page_address_valid(vcpu, kvm_state->hdr.vmx.vmcs12_pa)) 6672 return -EINVAL; 6673 6674 set_current_vmptr(vmx, kvm_state->hdr.vmx.vmcs12_pa); 6675 #ifdef CONFIG_KVM_HYPERV 6676 } else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) { 6677 /* 6678 * nested_vmx_handle_enlightened_vmptrld() cannot be called 6679 * directly from here as HV_X64_MSR_VP_ASSIST_PAGE may not be 6680 * restored yet. EVMCS will be mapped from 6681 * nested_get_vmcs12_pages(). 
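 * Deferring the map via KVM_REQ_GET_NESTED_STATE_PAGES gives userspace a chance to restore the VP assist page MSR before the vCPU next runs.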
		 */
		vmx->nested.hv_evmcs_vmptr = EVMPTR_MAP_PENDING;
		kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
#endif
	} else {
		return -EINVAL;
	}

	if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) {
		vmx->nested.smm.vmxon = true;
		vmx->nested.vmxon = false;

		if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE)
			vmx->nested.smm.guest_mode = true;
	}

	vmcs12 = get_vmcs12(vcpu);
	if (copy_from_user(vmcs12, user_vmx_nested_state->vmcs12, sizeof(*vmcs12)))
		return -EFAULT;

	if (vmcs12->hdr.revision_id != VMCS12_REVISION)
		return -EINVAL;

	if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
		return 0;

	vmx->nested.nested_run_pending =
		!!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);

	vmx->nested.mtf_pending =
		!!(kvm_state->flags & KVM_STATE_NESTED_MTF_PENDING);

	ret = -EINVAL;
	if (nested_cpu_has_shadow_vmcs(vmcs12) &&
	    vmcs12->vmcs_link_pointer != INVALID_GPA) {
		struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu);

		if (kvm_state->size <
		    sizeof(*kvm_state) +
		    sizeof(user_vmx_nested_state->vmcs12) + sizeof(*shadow_vmcs12))
			goto error_guest_mode;

		if (copy_from_user(shadow_vmcs12,
				   user_vmx_nested_state->shadow_vmcs12,
				   sizeof(*shadow_vmcs12))) {
			ret = -EFAULT;
			goto error_guest_mode;
		}

		if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION ||
		    !shadow_vmcs12->hdr.shadow_vmcs)
			goto error_guest_mode;
	}

	vmx->nested.has_preemption_timer_deadline = false;
	if (kvm_state->hdr.vmx.flags & KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) {
		vmx->nested.has_preemption_timer_deadline = true;
		vmx->nested.preemption_timer_deadline =
			kvm_state->hdr.vmx.preemption_timer_deadline;
	}

	if (nested_vmx_check_controls(vcpu, vmcs12) ||
	    nested_vmx_check_host_state(vcpu, vmcs12) ||
	    nested_vmx_check_guest_state(vcpu, vmcs12, &ignored))
		goto error_guest_mode;

	vmx->nested.dirty_vmcs12 = true;
	vmx->nested.force_msr_bitmap_recalc = true;
	ret = nested_vmx_enter_non_root_mode(vcpu, false);
	if (ret)
		goto error_guest_mode;

	if (vmx->nested.mtf_pending)
		kvm_make_request(KVM_REQ_EVENT, vcpu);

	return 0;

error_guest_mode:
	vmx->nested.nested_run_pending = 0;
	return ret;
}

void nested_vmx_set_vmcs_shadowing_bitmap(void)
{
	if (enable_shadow_vmcs) {
		vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
		vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
	}
}

/*
 * Indexing into the vmcs12 uses the VMCS encoding rotated left by 6.  Undo
 * that madness to get the encoding for comparison.
 */
#define VMCS12_IDX_TO_ENC(idx) ((u16)(((u16)(idx) >> 6) | ((u16)(idx) << 10)))
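
/*
 * Worked example (for illustration): GUEST_CR0 (encoding 0x6800) lives at
 * vmcs12 table index 0x001a, i.e. 0x6800 rotated left by 6 within 16 bits,
 * and VMCS12_IDX_TO_ENC(0x001a) rotates right by 6 to recover 0x6800.
 */

/*
 * Compute the value advertised in IA32_VMX_VMCS_ENUM, whose index field
 * reports the highest VMCS field index supported by the (emulated) CPU.
 */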
static u64 nested_vmx_calc_vmcs_enum_msr(void)
{
	/*
	 * Note these are the so called "index" of the VMCS field encoding, not
	 * the index into vmcs12.
	 */
	unsigned int max_idx, idx;
	int i;

	/*
	 * For better or worse, KVM allows VMREAD/VMWRITE to all fields in
	 * vmcs12, regardless of whether or not the associated feature is
	 * exposed to L1.  Simply find the field with the highest index.
	 */
	max_idx = 0;
	for (i = 0; i < nr_vmcs12_fields; i++) {
		/* The vmcs12 table is very, very sparsely populated. */
		if (!vmcs12_field_offsets[i])
			continue;

		idx = vmcs_field_index(VMCS12_IDX_TO_ENC(i));
		if (idx > max_idx)
			max_idx = idx;
	}

	return (u64)max_idx << VMCS_FIELD_INDEX_SHIFT;
}
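
/*
 * Each nested_vmx_setup_*_ctls() helper below starts from the host's
 * effective control value in vmcs_conf, masks it down to the controls KVM
 * knows how to virtualize for L1, and then ORs in controls KVM can emulate
 * even without hardware support; see the comment in
 * nested_vmx_setup_ctls_msrs() for the general rules.
 */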
static void nested_vmx_setup_pinbased_ctls(struct vmcs_config *vmcs_conf,
					   struct nested_vmx_msrs *msrs)
{
	msrs->pinbased_ctls_low =
		PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;

	msrs->pinbased_ctls_high = vmcs_conf->pin_based_exec_ctrl;
	msrs->pinbased_ctls_high &=
		PIN_BASED_EXT_INTR_MASK |
		PIN_BASED_NMI_EXITING |
		PIN_BASED_VIRTUAL_NMIS |
		(enable_apicv ? PIN_BASED_POSTED_INTR : 0);
	msrs->pinbased_ctls_high |=
		PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
		PIN_BASED_VMX_PREEMPTION_TIMER;
}

static void nested_vmx_setup_exit_ctls(struct vmcs_config *vmcs_conf,
				       struct nested_vmx_msrs *msrs)
{
	msrs->exit_ctls_low =
		VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;

	msrs->exit_ctls_high = vmcs_conf->vmexit_ctrl;
	msrs->exit_ctls_high &=
#ifdef CONFIG_X86_64
		VM_EXIT_HOST_ADDR_SPACE_SIZE |
#endif
		VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
		VM_EXIT_CLEAR_BNDCFGS;
	msrs->exit_ctls_high |=
		VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
		VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
		VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT |
		VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;

	/* We support free control of debug control saving. */
	msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;
}

static void nested_vmx_setup_entry_ctls(struct vmcs_config *vmcs_conf,
					struct nested_vmx_msrs *msrs)
{
	msrs->entry_ctls_low =
		VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;

	msrs->entry_ctls_high = vmcs_conf->vmentry_ctrl;
	msrs->entry_ctls_high &=
#ifdef CONFIG_X86_64
		VM_ENTRY_IA32E_MODE |
#endif
		VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS;
	msrs->entry_ctls_high |=
		(VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER |
		 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL);

	/* We support free control of debug control loading. */
	msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
}

static void nested_vmx_setup_cpubased_ctls(struct vmcs_config *vmcs_conf,
					   struct nested_vmx_msrs *msrs)
{
	msrs->procbased_ctls_low =
		CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;

	msrs->procbased_ctls_high = vmcs_conf->cpu_based_exec_ctrl;
	msrs->procbased_ctls_high &=
		CPU_BASED_INTR_WINDOW_EXITING |
		CPU_BASED_NMI_WINDOW_EXITING | CPU_BASED_USE_TSC_OFFSETTING |
		CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
		CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
		CPU_BASED_CR3_STORE_EXITING |
#ifdef CONFIG_X86_64
		CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |
#endif
		CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
		CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG |
		CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING |
		CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING |
		CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
	/*
	 * We can allow some features even when not supported by the
	 * hardware. For example, L1 can specify an MSR bitmap - and we
	 * can use it to avoid exits to L1 - even when L0 runs L2
	 * without MSR bitmaps.
	 */
	msrs->procbased_ctls_high |=
		CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
		CPU_BASED_USE_MSR_BITMAPS;

	/* We support free control of CR3 access interception. */
	msrs->procbased_ctls_low &=
		~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
}

static void nested_vmx_setup_secondary_ctls(u32 ept_caps,
					    struct vmcs_config *vmcs_conf,
					    struct nested_vmx_msrs *msrs)
{
	msrs->secondary_ctls_low = 0;

	msrs->secondary_ctls_high = vmcs_conf->cpu_based_2nd_exec_ctrl;
	msrs->secondary_ctls_high &=
		SECONDARY_EXEC_DESC |
		SECONDARY_EXEC_ENABLE_RDTSCP |
		SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
		SECONDARY_EXEC_WBINVD_EXITING |
		SECONDARY_EXEC_APIC_REGISTER_VIRT |
		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
		SECONDARY_EXEC_RDRAND_EXITING |
		SECONDARY_EXEC_ENABLE_INVPCID |
		SECONDARY_EXEC_ENABLE_VMFUNC |
		SECONDARY_EXEC_RDSEED_EXITING |
		SECONDARY_EXEC_ENABLE_XSAVES |
		SECONDARY_EXEC_TSC_SCALING |
		SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;

	/*
	 * We can emulate "VMCS shadowing," even if the hardware
	 * doesn't support it.
	 */
	msrs->secondary_ctls_high |=
		SECONDARY_EXEC_SHADOW_VMCS;

	if (enable_ept) {
		/* nested EPT: emulate EPT also to L1 */
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_ENABLE_EPT;
		msrs->ept_caps =
			VMX_EPT_PAGE_WALK_4_BIT |
			VMX_EPT_PAGE_WALK_5_BIT |
			VMX_EPTP_WB_BIT |
			VMX_EPT_INVEPT_BIT |
			VMX_EPT_EXECUTE_ONLY_BIT;

		msrs->ept_caps &= ept_caps;
		msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
			VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT |
			VMX_EPT_1GB_PAGE_BIT;
		if (enable_ept_ad_bits) {
			msrs->secondary_ctls_high |=
				SECONDARY_EXEC_ENABLE_PML;
			msrs->ept_caps |= VMX_EPT_AD_BIT;
		}

		/*
		 * Advertise EPTP switching irrespective of hardware support;
		 * KVM emulates it in software so long as VMFUNC is supported.
		 */
		if (cpu_has_vmx_vmfunc())
			msrs->vmfunc_controls = VMX_VMFUNC_EPTP_SWITCHING;
	}

	/*
	 * Old versions of KVM use the single-context version without
	 * checking for support, so declare that it is supported even
	 * though it is treated as global context.  The alternative is
	 * not failing the single-context invvpid, and it is worse.
	 */
	if (enable_vpid) {
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_ENABLE_VPID;
		msrs->vpid_caps = VMX_VPID_INVVPID_BIT |
			VMX_VPID_EXTENT_SUPPORTED_MASK;
	}

	if (enable_unrestricted_guest)
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_UNRESTRICTED_GUEST;

	if (flexpriority_enabled)
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;

	if (enable_sgx)
		msrs->secondary_ctls_high |= SECONDARY_EXEC_ENCLS_EXITING;
}

static void nested_vmx_setup_misc_data(struct vmcs_config *vmcs_conf,
				       struct nested_vmx_msrs *msrs)
{
	msrs->misc_low = (u32)vmcs_conf->misc & VMX_MISC_SAVE_EFER_LMA;
	msrs->misc_low |=
		MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
		VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
		VMX_MISC_ACTIVITY_HLT |
		VMX_MISC_ACTIVITY_WAIT_SIPI;
	msrs->misc_high = 0;
}
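
/*
 * Layout of the advertised IA32_VMX_BASIC value (the VMX_BASIC_* constants
 * used below encode this layout): bits 30:0 carry the VMCS revision
 * identifier (VMCS12_REVISION), bits 44:32 the VMCS region size
 * (VMCS12_SIZE), bits 53:50 the VMCS memory type (write-back), bit 54
 * reporting of INS/OUTS information on VM-exits, and bit 55 support for
 * the "true" capability MSRs.
 */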
static void nested_vmx_setup_basic(struct nested_vmx_msrs *msrs)
{
	/*
	 * This MSR reports some information about VMX support. We
	 * should return information about the VMX we emulate for the
	 * guest, and the VMCS structure we give it - not about the
	 * VMX support of the underlying hardware.
	 */
	msrs->basic =
		VMCS12_REVISION |
		VMX_BASIC_TRUE_CTLS |
		((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
		(VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);

	if (cpu_has_vmx_basic_inout())
		msrs->basic |= VMX_BASIC_INOUT;
}

static void nested_vmx_setup_cr_fixed(struct nested_vmx_msrs *msrs)
{
	/*
	 * These MSRs specify bits which the guest must keep fixed on
	 * while L1 is in VMXON mode (in L1's root mode, or running an L2).
	 * We picked the standard core2 setting.
	 */
#define VMXON_CR0_ALWAYSON	(X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
#define VMXON_CR4_ALWAYSON	X86_CR4_VMXE
	msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON;
	msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON;

	/* These MSRs specify bits which the guest must keep fixed off. */
	rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1);
	rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1);

	if (vmx_umip_emulated())
		msrs->cr4_fixed1 |= X86_CR4_UMIP;
}

/*
 * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
 * returned for the various VMX controls MSRs when nested VMX is enabled.
 * The same values should also be used to verify that vmcs12 control fields
 * are valid during nested entry from L1 to L2.
 * Each of these control MSRs has a low and a high 32-bit half: a bit in the
 * low half is on if the corresponding bit in the (32-bit) control field must
 * be on, and a bit in the high half is on if the corresponding bit in the
 * control field may be on. See also vmx_control_verify().
 */
void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps)
{
	struct nested_vmx_msrs *msrs = &vmcs_conf->nested;

	/*
	 * Note that as a general rule, the high half of the MSRs (bits in
	 * the control fields which may be 1) should be initialized by the
	 * intersection of the underlying hardware's MSR (i.e., features which
	 * can be supported) and the list of features we want to expose -
	 * because they are known to be properly supported in our code.
	 * Also, usually, the low half of the MSRs (bits which must be 1) can
	 * be set to 0, meaning that L1 may turn off any of these bits. The
	 * reason is that if one of these bits is necessary for L0, it will
	 * already be set in vmcs01, and prepare_vmcs02(), which bitwise-ORs
	 * the control fields of vmcs01 and vmcs12, will keep it set in
	 * vmcs02 - and nested_vmx_l1_wants_exit() will not pass the related
	 * exits on to L1.
	 * These rules have exceptions below.
	 */
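	/*
	 * Equivalently: a vmcs12 control value "ctl" is accepted iff
	 * (ctl & low) == low and (ctl & ~high) == 0, which is what the
	 * vmx_control_verify() check boils down to at nested VM-Entry.
	 */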
	nested_vmx_setup_pinbased_ctls(vmcs_conf, msrs);

	nested_vmx_setup_exit_ctls(vmcs_conf, msrs);

	nested_vmx_setup_entry_ctls(vmcs_conf, msrs);

	nested_vmx_setup_cpubased_ctls(vmcs_conf, msrs);

	nested_vmx_setup_secondary_ctls(ept_caps, vmcs_conf, msrs);

	nested_vmx_setup_misc_data(vmcs_conf, msrs);

	nested_vmx_setup_basic(msrs);

	nested_vmx_setup_cr_fixed(msrs);

	msrs->vmcs_enum = nested_vmx_calc_vmcs_enum_msr();
}

void nested_vmx_hardware_unsetup(void)
{
	int i;

	if (enable_shadow_vmcs) {
		for (i = 0; i < VMX_BITMAP_NR; i++)
			free_page((unsigned long)vmx_bitmap[i]);
	}
}

__init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
{
	int i;

	if (!cpu_has_vmx_shadow_vmcs())
		enable_shadow_vmcs = 0;
	if (enable_shadow_vmcs) {
		for (i = 0; i < VMX_BITMAP_NR; i++) {
			/*
			 * The vmx_bitmap is not tied to a VM and so should
			 * not be charged to a memcg.
			 */
			vmx_bitmap[i] = (unsigned long *)
				__get_free_page(GFP_KERNEL);
			if (!vmx_bitmap[i]) {
				nested_vmx_hardware_unsetup();
				return -ENOMEM;
			}
		}

		init_vmcs_shadow_fields();
	}

	exit_handlers[EXIT_REASON_VMCLEAR] = handle_vmclear;
	exit_handlers[EXIT_REASON_VMLAUNCH] = handle_vmlaunch;
	exit_handlers[EXIT_REASON_VMPTRLD] = handle_vmptrld;
	exit_handlers[EXIT_REASON_VMPTRST] = handle_vmptrst;
	exit_handlers[EXIT_REASON_VMREAD] = handle_vmread;
	exit_handlers[EXIT_REASON_VMRESUME] = handle_vmresume;
	exit_handlers[EXIT_REASON_VMWRITE] = handle_vmwrite;
	exit_handlers[EXIT_REASON_VMOFF] = handle_vmxoff;
	exit_handlers[EXIT_REASON_VMON] = handle_vmxon;
	exit_handlers[EXIT_REASON_INVEPT] = handle_invept;
	exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid;
	exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc;

	return 0;
}

struct kvm_x86_nested_ops vmx_nested_ops = {
	.leave_nested = vmx_leave_nested,
	.is_exception_vmexit = nested_vmx_is_exception_vmexit,
	.check_events = vmx_check_nested_events,
	.has_events = vmx_has_nested_events,
	.triple_fault = nested_vmx_triple_fault,
	.get_state = vmx_get_nested_state,
	.set_state = vmx_set_nested_state,
	.get_nested_state_pages = vmx_get_nested_state_pages,
	.write_log_dirty = nested_vmx_write_pml_buffer,
#ifdef CONFIG_KVM_HYPERV
	.enable_evmcs = nested_enable_evmcs,
	.get_evmcs_version = nested_get_evmcs_version,
	.hv_inject_synthetic_vmexit_post_tlb_flush = vmx_hv_inject_synthetic_vmexit_post_tlb_flush,
#endif
};