1 // SPDX-License-Identifier: GPL-2.0 2 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 3 4 #include <linux/objtool.h> 5 #include <linux/percpu.h> 6 7 #include <asm/debugreg.h> 8 #include <asm/mmu_context.h> 9 10 #include "cpuid.h" 11 #include "hyperv.h" 12 #include "mmu.h" 13 #include "nested.h" 14 #include "pmu.h" 15 #include "sgx.h" 16 #include "trace.h" 17 #include "vmx.h" 18 #include "x86.h" 19 #include "smm.h" 20 21 static bool __read_mostly enable_shadow_vmcs = 1; 22 module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO); 23 24 static bool __read_mostly nested_early_check = 0; 25 module_param(nested_early_check, bool, S_IRUGO); 26 27 #define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK 28 29 /* 30 * Hyper-V requires all of these, so mark them as supported even though 31 * they are just treated the same as all-context. 32 */ 33 #define VMX_VPID_EXTENT_SUPPORTED_MASK \ 34 (VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT | \ 35 VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT | \ 36 VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT | \ 37 VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT) 38 39 #define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5 40 41 enum { 42 VMX_VMREAD_BITMAP, 43 VMX_VMWRITE_BITMAP, 44 VMX_BITMAP_NR 45 }; 46 static unsigned long *vmx_bitmap[VMX_BITMAP_NR]; 47 48 #define vmx_vmread_bitmap (vmx_bitmap[VMX_VMREAD_BITMAP]) 49 #define vmx_vmwrite_bitmap (vmx_bitmap[VMX_VMWRITE_BITMAP]) 50 51 struct shadow_vmcs_field { 52 u16 encoding; 53 u16 offset; 54 }; 55 static struct shadow_vmcs_field shadow_read_only_fields[] = { 56 #define SHADOW_FIELD_RO(x, y) { x, offsetof(struct vmcs12, y) }, 57 #include "vmcs_shadow_fields.h" 58 }; 59 static int max_shadow_read_only_fields = 60 ARRAY_SIZE(shadow_read_only_fields); 61 62 static struct shadow_vmcs_field shadow_read_write_fields[] = { 63 #define SHADOW_FIELD_RW(x, y) { x, offsetof(struct vmcs12, y) }, 64 #include "vmcs_shadow_fields.h" 65 }; 66 static int max_shadow_read_write_fields = 67 ARRAY_SIZE(shadow_read_write_fields); 68 69 static void init_vmcs_shadow_fields(void) 70 { 71 int i, j; 72 73 memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); 74 memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); 75 76 for (i = j = 0; i < max_shadow_read_only_fields; i++) { 77 struct shadow_vmcs_field entry = shadow_read_only_fields[i]; 78 u16 field = entry.encoding; 79 80 if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 && 81 (i + 1 == max_shadow_read_only_fields || 82 shadow_read_only_fields[i + 1].encoding != field + 1)) 83 pr_err("Missing field from shadow_read_only_field %x\n", 84 field + 1); 85 86 clear_bit(field, vmx_vmread_bitmap); 87 if (field & 1) 88 #ifdef CONFIG_X86_64 89 continue; 90 #else 91 entry.offset += sizeof(u32); 92 #endif 93 shadow_read_only_fields[j++] = entry; 94 } 95 max_shadow_read_only_fields = j; 96 97 for (i = j = 0; i < max_shadow_read_write_fields; i++) { 98 struct shadow_vmcs_field entry = shadow_read_write_fields[i]; 99 u16 field = entry.encoding; 100 101 if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 && 102 (i + 1 == max_shadow_read_write_fields || 103 shadow_read_write_fields[i + 1].encoding != field + 1)) 104 pr_err("Missing field from shadow_read_write_field %x\n", 105 field + 1); 106 107 WARN_ONCE(field >= GUEST_ES_AR_BYTES && 108 field <= GUEST_TR_AR_BYTES, 109 "Update vmcs12_write_any() to drop reserved bits from AR_BYTES"); 110 111 /* 112 * PML and the preemption timer can be emulated, but the 113 * processor cannot vmwrite to fields that don't exist 114 * on bare metal. 
115 */ 116 switch (field) { 117 case GUEST_PML_INDEX: 118 if (!cpu_has_vmx_pml()) 119 continue; 120 break; 121 case VMX_PREEMPTION_TIMER_VALUE: 122 if (!cpu_has_vmx_preemption_timer()) 123 continue; 124 break; 125 case GUEST_INTR_STATUS: 126 if (!cpu_has_vmx_apicv()) 127 continue; 128 break; 129 default: 130 break; 131 } 132 133 clear_bit(field, vmx_vmwrite_bitmap); 134 clear_bit(field, vmx_vmread_bitmap); 135 if (field & 1) 136 #ifdef CONFIG_X86_64 137 continue; 138 #else 139 entry.offset += sizeof(u32); 140 #endif 141 shadow_read_write_fields[j++] = entry; 142 } 143 max_shadow_read_write_fields = j; 144 } 145 146 /* 147 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(), 148 * set the success or error code of an emulated VMX instruction (as specified 149 * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated 150 * instruction. 151 */ 152 static int nested_vmx_succeed(struct kvm_vcpu *vcpu) 153 { 154 vmx_set_rflags(vcpu, vmx_get_rflags(vcpu) 155 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | 156 X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF)); 157 return kvm_skip_emulated_instruction(vcpu); 158 } 159 160 static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu) 161 { 162 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) 163 & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF | 164 X86_EFLAGS_SF | X86_EFLAGS_OF)) 165 | X86_EFLAGS_CF); 166 return kvm_skip_emulated_instruction(vcpu); 167 } 168 169 static int nested_vmx_failValid(struct kvm_vcpu *vcpu, 170 u32 vm_instruction_error) 171 { 172 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) 173 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | 174 X86_EFLAGS_SF | X86_EFLAGS_OF)) 175 | X86_EFLAGS_ZF); 176 get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error; 177 /* 178 * We don't need to force sync to shadow VMCS because 179 * VM_INSTRUCTION_ERROR is not shadowed. Enlightened VMCS 'shadows' all 180 * fields and thus must be synced. 181 */ 182 if (nested_vmx_is_evmptr12_set(to_vmx(vcpu))) 183 to_vmx(vcpu)->nested.need_vmcs12_to_shadow_sync = true; 184 185 return kvm_skip_emulated_instruction(vcpu); 186 } 187 188 static int nested_vmx_fail(struct kvm_vcpu *vcpu, u32 vm_instruction_error) 189 { 190 struct vcpu_vmx *vmx = to_vmx(vcpu); 191 192 /* 193 * failValid writes the error number to the current VMCS, which 194 * can't be done if there isn't a current VMCS. 195 */ 196 if (vmx->nested.current_vmptr == INVALID_GPA && 197 !nested_vmx_is_evmptr12_valid(vmx)) 198 return nested_vmx_failInvalid(vcpu); 199 200 return nested_vmx_failValid(vcpu, vm_instruction_error); 201 } 202 203 static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator) 204 { 205 /* TODO: not to reset guest simply here. 
*/ 206 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 207 pr_debug_ratelimited("nested vmx abort, indicator %d\n", indicator); 208 } 209 210 static inline bool vmx_control_verify(u32 control, u32 low, u32 high) 211 { 212 return fixed_bits_valid(control, low, high); 213 } 214 215 static inline u64 vmx_control_msr(u32 low, u32 high) 216 { 217 return low | ((u64)high << 32); 218 } 219 220 static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx) 221 { 222 secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS); 223 vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); 224 vmx->nested.need_vmcs12_to_shadow_sync = false; 225 } 226 227 static inline void nested_release_evmcs(struct kvm_vcpu *vcpu) 228 { 229 #ifdef CONFIG_KVM_HYPERV 230 struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); 231 struct vcpu_vmx *vmx = to_vmx(vcpu); 232 233 if (nested_vmx_is_evmptr12_valid(vmx)) { 234 kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true); 235 vmx->nested.hv_evmcs = NULL; 236 } 237 238 vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID; 239 240 if (hv_vcpu) { 241 hv_vcpu->nested.pa_page_gpa = INVALID_GPA; 242 hv_vcpu->nested.vm_id = 0; 243 hv_vcpu->nested.vp_id = 0; 244 } 245 #endif 246 } 247 248 static bool nested_evmcs_handle_vmclear(struct kvm_vcpu *vcpu, gpa_t vmptr) 249 { 250 #ifdef CONFIG_KVM_HYPERV 251 struct vcpu_vmx *vmx = to_vmx(vcpu); 252 /* 253 * When Enlightened VMEntry is enabled on the calling CPU we treat 254 * memory area pointer by vmptr as Enlightened VMCS (as there's no good 255 * way to distinguish it from VMCS12) and we must not corrupt it by 256 * writing to the non-existent 'launch_state' field. The area doesn't 257 * have to be the currently active EVMCS on the calling CPU and there's 258 * nothing KVM has to do to transition it from 'active' to 'non-active' 259 * state. It is possible that the area will stay mapped as 260 * vmx->nested.hv_evmcs but this shouldn't be a problem. 261 */ 262 if (!guest_cpuid_has_evmcs(vcpu) || 263 !evmptr_is_valid(nested_get_evmptr(vcpu))) 264 return false; 265 266 if (nested_vmx_evmcs(vmx) && vmptr == vmx->nested.hv_evmcs_vmptr) 267 nested_release_evmcs(vcpu); 268 269 return true; 270 #else 271 return false; 272 #endif 273 } 274 275 static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx, 276 struct loaded_vmcs *prev) 277 { 278 struct vmcs_host_state *dest, *src; 279 280 if (unlikely(!vmx->guest_state_loaded)) 281 return; 282 283 src = &prev->host_state; 284 dest = &vmx->loaded_vmcs->host_state; 285 286 vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base); 287 dest->ldt_sel = src->ldt_sel; 288 #ifdef CONFIG_X86_64 289 dest->ds_sel = src->ds_sel; 290 dest->es_sel = src->es_sel; 291 #endif 292 } 293 294 static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) 295 { 296 struct vcpu_vmx *vmx = to_vmx(vcpu); 297 struct loaded_vmcs *prev; 298 int cpu; 299 300 if (WARN_ON_ONCE(vmx->loaded_vmcs == vmcs)) 301 return; 302 303 cpu = get_cpu(); 304 prev = vmx->loaded_vmcs; 305 vmx->loaded_vmcs = vmcs; 306 vmx_vcpu_load_vmcs(vcpu, cpu, prev); 307 vmx_sync_vmcs_host_state(vmx, prev); 308 put_cpu(); 309 310 vcpu->arch.regs_avail = ~VMX_REGS_LAZY_LOAD_SET; 311 312 /* 313 * All lazily updated registers will be reloaded from VMCS12 on both 314 * vmentry and vmexit. 315 */ 316 vcpu->arch.regs_dirty = 0; 317 } 318 319 /* 320 * Free whatever needs to be freed from vmx->nested when L1 goes down, or 321 * just stops using VMX. 
322 */ 323 static void free_nested(struct kvm_vcpu *vcpu) 324 { 325 struct vcpu_vmx *vmx = to_vmx(vcpu); 326 327 if (WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01)) 328 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 329 330 if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon) 331 return; 332 333 kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); 334 335 vmx->nested.vmxon = false; 336 vmx->nested.smm.vmxon = false; 337 vmx->nested.vmxon_ptr = INVALID_GPA; 338 free_vpid(vmx->nested.vpid02); 339 vmx->nested.posted_intr_nv = -1; 340 vmx->nested.current_vmptr = INVALID_GPA; 341 if (enable_shadow_vmcs) { 342 vmx_disable_shadow_vmcs(vmx); 343 vmcs_clear(vmx->vmcs01.shadow_vmcs); 344 free_vmcs(vmx->vmcs01.shadow_vmcs); 345 vmx->vmcs01.shadow_vmcs = NULL; 346 } 347 kfree(vmx->nested.cached_vmcs12); 348 vmx->nested.cached_vmcs12 = NULL; 349 kfree(vmx->nested.cached_shadow_vmcs12); 350 vmx->nested.cached_shadow_vmcs12 = NULL; 351 /* 352 * Unpin physical memory we referred to in the vmcs02. The APIC access 353 * page's backing page (yeah, confusing) shouldn't actually be accessed, 354 * and if it is written, the contents are irrelevant. 355 */ 356 kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map, false); 357 kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); 358 kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true); 359 vmx->nested.pi_desc = NULL; 360 361 kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); 362 363 nested_release_evmcs(vcpu); 364 365 free_loaded_vmcs(&vmx->nested.vmcs02); 366 } 367 368 /* 369 * Ensure that the current vmcs of the logical processor is the 370 * vmcs01 of the vcpu before calling free_nested(). 371 */ 372 void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu) 373 { 374 vcpu_load(vcpu); 375 vmx_leave_nested(vcpu); 376 vcpu_put(vcpu); 377 } 378 379 #define EPTP_PA_MASK GENMASK_ULL(51, 12) 380 381 static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp) 382 { 383 return VALID_PAGE(root_hpa) && 384 ((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK)); 385 } 386 387 static void nested_ept_invalidate_addr(struct kvm_vcpu *vcpu, gpa_t eptp, 388 gpa_t addr) 389 { 390 unsigned long roots = 0; 391 uint i; 392 struct kvm_mmu_root_info *cached_root; 393 394 WARN_ON_ONCE(!mmu_is_nested(vcpu)); 395 396 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { 397 cached_root = &vcpu->arch.mmu->prev_roots[i]; 398 399 if (nested_ept_root_matches(cached_root->hpa, cached_root->pgd, 400 eptp)) 401 roots |= KVM_MMU_ROOT_PREVIOUS(i); 402 } 403 if (roots) 404 kvm_mmu_invalidate_addr(vcpu, vcpu->arch.mmu, addr, roots); 405 } 406 407 static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, 408 struct x86_exception *fault) 409 { 410 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 411 struct vcpu_vmx *vmx = to_vmx(vcpu); 412 unsigned long exit_qualification; 413 u32 vm_exit_reason; 414 415 if (vmx->nested.pml_full) { 416 vm_exit_reason = EXIT_REASON_PML_FULL; 417 vmx->nested.pml_full = false; 418 419 /* 420 * It should be impossible to trigger a nested PML Full VM-Exit 421 * for anything other than an EPT Violation from L2. KVM *can* 422 * trigger nEPT page fault injection in response to an EPT 423 * Misconfig, e.g. if the MMIO SPTE was stale and L1's EPT 424 * tables also changed, but KVM should not treat EPT Misconfig 425 * VM-Exits as writes. 426 */ 427 WARN_ON_ONCE(vmx->exit_reason.basic != EXIT_REASON_EPT_VIOLATION); 428 429 /* 430 * PML Full and EPT Violation VM-Exits both use bit 12 to report 431 * "NMI unblocking due to IRET", i.e. 
the bit can be propagated 432 * as-is from the original EXIT_QUALIFICATION. 433 */ 434 exit_qualification = vmx_get_exit_qual(vcpu) & INTR_INFO_UNBLOCK_NMI; 435 } else { 436 if (fault->error_code & PFERR_RSVD_MASK) { 437 vm_exit_reason = EXIT_REASON_EPT_MISCONFIG; 438 exit_qualification = 0; 439 } else { 440 exit_qualification = fault->exit_qualification; 441 exit_qualification |= vmx_get_exit_qual(vcpu) & 442 (EPT_VIOLATION_GVA_IS_VALID | 443 EPT_VIOLATION_GVA_TRANSLATED); 444 vm_exit_reason = EXIT_REASON_EPT_VIOLATION; 445 } 446 447 /* 448 * Although the caller (kvm_inject_emulated_page_fault) would 449 * have already synced the faulting address in the shadow EPT 450 * tables for the current EPTP12, we also need to sync it for 451 * any other cached EPTP02s based on the same EP4TA, since the 452 * TLB associates mappings to the EP4TA rather than the full EPTP. 453 */ 454 nested_ept_invalidate_addr(vcpu, vmcs12->ept_pointer, 455 fault->address); 456 } 457 458 nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification); 459 vmcs12->guest_physical_address = fault->address; 460 } 461 462 static void nested_ept_new_eptp(struct kvm_vcpu *vcpu) 463 { 464 struct vcpu_vmx *vmx = to_vmx(vcpu); 465 bool execonly = vmx->nested.msrs.ept_caps & VMX_EPT_EXECUTE_ONLY_BIT; 466 int ept_lpage_level = ept_caps_to_lpage_level(vmx->nested.msrs.ept_caps); 467 468 kvm_init_shadow_ept_mmu(vcpu, execonly, ept_lpage_level, 469 nested_ept_ad_enabled(vcpu), 470 nested_ept_get_eptp(vcpu)); 471 } 472 473 static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) 474 { 475 WARN_ON(mmu_is_nested(vcpu)); 476 477 vcpu->arch.mmu = &vcpu->arch.guest_mmu; 478 nested_ept_new_eptp(vcpu); 479 vcpu->arch.mmu->get_guest_pgd = nested_ept_get_eptp; 480 vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault; 481 vcpu->arch.mmu->get_pdptr = kvm_pdptr_read; 482 483 vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; 484 } 485 486 static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu) 487 { 488 vcpu->arch.mmu = &vcpu->arch.root_mmu; 489 vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; 490 } 491 492 static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12, 493 u16 error_code) 494 { 495 bool inequality, bit; 496 497 bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0; 498 inequality = 499 (error_code & vmcs12->page_fault_error_code_mask) != 500 vmcs12->page_fault_error_code_match; 501 return inequality ^ bit; 502 } 503 504 static bool nested_vmx_is_exception_vmexit(struct kvm_vcpu *vcpu, u8 vector, 505 u32 error_code) 506 { 507 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 508 509 /* 510 * Drop bits 31:16 of the error code when performing the #PF mask+match 511 * check. All VMCS fields involved are 32 bits, but Intel CPUs never 512 * set bits 31:16 and VMX disallows setting bits 31:16 in the injected 513 * error code. Including the to-be-dropped bits in the check might 514 * result in an "impossible" or missed exit from L1's perspective. 
515 */ 516 if (vector == PF_VECTOR) 517 return nested_vmx_is_page_fault_vmexit(vmcs12, (u16)error_code); 518 519 return (vmcs12->exception_bitmap & (1u << vector)); 520 } 521 522 static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu, 523 struct vmcs12 *vmcs12) 524 { 525 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) 526 return 0; 527 528 if (CC(!page_address_valid(vcpu, vmcs12->io_bitmap_a)) || 529 CC(!page_address_valid(vcpu, vmcs12->io_bitmap_b))) 530 return -EINVAL; 531 532 return 0; 533 } 534 535 static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu, 536 struct vmcs12 *vmcs12) 537 { 538 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) 539 return 0; 540 541 if (CC(!page_address_valid(vcpu, vmcs12->msr_bitmap))) 542 return -EINVAL; 543 544 return 0; 545 } 546 547 static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu, 548 struct vmcs12 *vmcs12) 549 { 550 if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) 551 return 0; 552 553 if (CC(!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr))) 554 return -EINVAL; 555 556 return 0; 557 } 558 559 /* 560 * For x2APIC MSRs, ignore the vmcs01 bitmap. L1 can enable x2APIC without L1 561 * itself utilizing x2APIC. All MSRs were previously set to be intercepted, 562 * only the "disable intercept" case needs to be handled. 563 */ 564 static void nested_vmx_disable_intercept_for_x2apic_msr(unsigned long *msr_bitmap_l1, 565 unsigned long *msr_bitmap_l0, 566 u32 msr, int type) 567 { 568 if (type & MSR_TYPE_R && !vmx_test_msr_bitmap_read(msr_bitmap_l1, msr)) 569 vmx_clear_msr_bitmap_read(msr_bitmap_l0, msr); 570 571 if (type & MSR_TYPE_W && !vmx_test_msr_bitmap_write(msr_bitmap_l1, msr)) 572 vmx_clear_msr_bitmap_write(msr_bitmap_l0, msr); 573 } 574 575 static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap) 576 { 577 int msr; 578 579 for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { 580 unsigned word = msr / BITS_PER_LONG; 581 582 msr_bitmap[word] = ~0; 583 msr_bitmap[word + (0x800 / sizeof(long))] = ~0; 584 } 585 } 586 587 #define BUILD_NVMX_MSR_INTERCEPT_HELPER(rw) \ 588 static inline \ 589 void nested_vmx_set_msr_##rw##_intercept(struct vcpu_vmx *vmx, \ 590 unsigned long *msr_bitmap_l1, \ 591 unsigned long *msr_bitmap_l0, u32 msr) \ 592 { \ 593 if (vmx_test_msr_bitmap_##rw(vmx->vmcs01.msr_bitmap, msr) || \ 594 vmx_test_msr_bitmap_##rw(msr_bitmap_l1, msr)) \ 595 vmx_set_msr_bitmap_##rw(msr_bitmap_l0, msr); \ 596 else \ 597 vmx_clear_msr_bitmap_##rw(msr_bitmap_l0, msr); \ 598 } 599 BUILD_NVMX_MSR_INTERCEPT_HELPER(read) 600 BUILD_NVMX_MSR_INTERCEPT_HELPER(write) 601 602 static inline void nested_vmx_set_intercept_for_msr(struct vcpu_vmx *vmx, 603 unsigned long *msr_bitmap_l1, 604 unsigned long *msr_bitmap_l0, 605 u32 msr, int types) 606 { 607 if (types & MSR_TYPE_R) 608 nested_vmx_set_msr_read_intercept(vmx, msr_bitmap_l1, 609 msr_bitmap_l0, msr); 610 if (types & MSR_TYPE_W) 611 nested_vmx_set_msr_write_intercept(vmx, msr_bitmap_l1, 612 msr_bitmap_l0, msr); 613 } 614 615 /* 616 * Merge L0's and L1's MSR bitmap, return false to indicate that 617 * we do not use the hardware. 618 */ 619 static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, 620 struct vmcs12 *vmcs12) 621 { 622 struct vcpu_vmx *vmx = to_vmx(vcpu); 623 int msr; 624 unsigned long *msr_bitmap_l1; 625 unsigned long *msr_bitmap_l0 = vmx->nested.vmcs02.msr_bitmap; 626 struct kvm_host_map *map = &vmx->nested.msr_bitmap_map; 627 628 /* Nothing to do if the MSR bitmap is not in use. 
*/ 629 if (!cpu_has_vmx_msr_bitmap() || 630 !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) 631 return false; 632 633 /* 634 * MSR bitmap update can be skipped when: 635 * - MSR bitmap for L1 hasn't changed. 636 * - Nested hypervisor (L1) is attempting to launch the same L2 as 637 * before. 638 * - Nested hypervisor (L1) has enabled 'Enlightened MSR Bitmap' feature 639 * and tells KVM (L0) there were no changes in MSR bitmap for L2. 640 */ 641 if (!vmx->nested.force_msr_bitmap_recalc) { 642 struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); 643 644 if (evmcs && evmcs->hv_enlightenments_control.msr_bitmap && 645 evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP) 646 return true; 647 } 648 649 if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map)) 650 return false; 651 652 msr_bitmap_l1 = (unsigned long *)map->hva; 653 654 /* 655 * To keep the control flow simple, pay eight 8-byte writes (sixteen 656 * 4-byte writes on 32-bit systems) up front to enable intercepts for 657 * the x2APIC MSR range and selectively toggle those relevant to L2. 658 */ 659 enable_x2apic_msr_intercepts(msr_bitmap_l0); 660 661 if (nested_cpu_has_virt_x2apic_mode(vmcs12)) { 662 if (nested_cpu_has_apic_reg_virt(vmcs12)) { 663 /* 664 * L0 need not intercept reads for MSRs between 0x800 665 * and 0x8ff, it just lets the processor take the value 666 * from the virtual-APIC page; take those 256 bits 667 * directly from the L1 bitmap. 668 */ 669 for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { 670 unsigned word = msr / BITS_PER_LONG; 671 672 msr_bitmap_l0[word] = msr_bitmap_l1[word]; 673 } 674 } 675 676 nested_vmx_disable_intercept_for_x2apic_msr( 677 msr_bitmap_l1, msr_bitmap_l0, 678 X2APIC_MSR(APIC_TASKPRI), 679 MSR_TYPE_R | MSR_TYPE_W); 680 681 if (nested_cpu_has_vid(vmcs12)) { 682 nested_vmx_disable_intercept_for_x2apic_msr( 683 msr_bitmap_l1, msr_bitmap_l0, 684 X2APIC_MSR(APIC_EOI), 685 MSR_TYPE_W); 686 nested_vmx_disable_intercept_for_x2apic_msr( 687 msr_bitmap_l1, msr_bitmap_l0, 688 X2APIC_MSR(APIC_SELF_IPI), 689 MSR_TYPE_W); 690 } 691 } 692 693 /* 694 * Always check vmcs01's bitmap to honor userspace MSR filters and any 695 * other runtime changes to vmcs01's bitmap, e.g. dynamic pass-through. 
696 */ 697 #ifdef CONFIG_X86_64 698 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 699 MSR_FS_BASE, MSR_TYPE_RW); 700 701 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 702 MSR_GS_BASE, MSR_TYPE_RW); 703 704 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 705 MSR_KERNEL_GS_BASE, MSR_TYPE_RW); 706 #endif 707 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 708 MSR_IA32_SPEC_CTRL, MSR_TYPE_RW); 709 710 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 711 MSR_IA32_PRED_CMD, MSR_TYPE_W); 712 713 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 714 MSR_IA32_FLUSH_CMD, MSR_TYPE_W); 715 716 kvm_vcpu_unmap(vcpu, &vmx->nested.msr_bitmap_map, false); 717 718 vmx->nested.force_msr_bitmap_recalc = false; 719 720 return true; 721 } 722 723 static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu, 724 struct vmcs12 *vmcs12) 725 { 726 struct vcpu_vmx *vmx = to_vmx(vcpu); 727 struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache; 728 729 if (!nested_cpu_has_shadow_vmcs(vmcs12) || 730 vmcs12->vmcs_link_pointer == INVALID_GPA) 731 return; 732 733 if (ghc->gpa != vmcs12->vmcs_link_pointer && 734 kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, 735 vmcs12->vmcs_link_pointer, VMCS12_SIZE)) 736 return; 737 738 kvm_read_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu), 739 VMCS12_SIZE); 740 } 741 742 static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu, 743 struct vmcs12 *vmcs12) 744 { 745 struct vcpu_vmx *vmx = to_vmx(vcpu); 746 struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache; 747 748 if (!nested_cpu_has_shadow_vmcs(vmcs12) || 749 vmcs12->vmcs_link_pointer == INVALID_GPA) 750 return; 751 752 if (ghc->gpa != vmcs12->vmcs_link_pointer && 753 kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, 754 vmcs12->vmcs_link_pointer, VMCS12_SIZE)) 755 return; 756 757 kvm_write_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu), 758 VMCS12_SIZE); 759 } 760 761 /* 762 * In nested virtualization, check if L1 has set 763 * VM_EXIT_ACK_INTR_ON_EXIT 764 */ 765 static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu) 766 { 767 return get_vmcs12(vcpu)->vm_exit_controls & 768 VM_EXIT_ACK_INTR_ON_EXIT; 769 } 770 771 static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu, 772 struct vmcs12 *vmcs12) 773 { 774 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) && 775 CC(!page_address_valid(vcpu, vmcs12->apic_access_addr))) 776 return -EINVAL; 777 else 778 return 0; 779 } 780 781 static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, 782 struct vmcs12 *vmcs12) 783 { 784 if (!nested_cpu_has_virt_x2apic_mode(vmcs12) && 785 !nested_cpu_has_apic_reg_virt(vmcs12) && 786 !nested_cpu_has_vid(vmcs12) && 787 !nested_cpu_has_posted_intr(vmcs12)) 788 return 0; 789 790 /* 791 * If virtualize x2apic mode is enabled, 792 * virtualize apic access must be disabled. 793 */ 794 if (CC(nested_cpu_has_virt_x2apic_mode(vmcs12) && 795 nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))) 796 return -EINVAL; 797 798 /* 799 * If virtual interrupt delivery is enabled, 800 * we must exit on external interrupts. 801 */ 802 if (CC(nested_cpu_has_vid(vmcs12) && !nested_exit_on_intr(vcpu))) 803 return -EINVAL; 804 805 /* 806 * bits 15:8 should be zero in posted_intr_nv, 807 * the descriptor address has been already checked 808 * in nested_get_vmcs12_pages. 809 * 810 * bits 5:0 of posted_intr_desc_addr should be zero. 
811 */ 812 if (nested_cpu_has_posted_intr(vmcs12) && 813 (CC(!nested_cpu_has_vid(vmcs12)) || 814 CC(!nested_exit_intr_ack_set(vcpu)) || 815 CC((vmcs12->posted_intr_nv & 0xff00)) || 816 CC(!kvm_vcpu_is_legal_aligned_gpa(vcpu, vmcs12->posted_intr_desc_addr, 64)))) 817 return -EINVAL; 818 819 /* tpr shadow is needed by all apicv features. */ 820 if (CC(!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))) 821 return -EINVAL; 822 823 return 0; 824 } 825 826 static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, 827 u32 count, u64 addr) 828 { 829 if (count == 0) 830 return 0; 831 832 if (!kvm_vcpu_is_legal_aligned_gpa(vcpu, addr, 16) || 833 !kvm_vcpu_is_legal_gpa(vcpu, (addr + count * sizeof(struct vmx_msr_entry) - 1))) 834 return -EINVAL; 835 836 return 0; 837 } 838 839 static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu, 840 struct vmcs12 *vmcs12) 841 { 842 if (CC(nested_vmx_check_msr_switch(vcpu, 843 vmcs12->vm_exit_msr_load_count, 844 vmcs12->vm_exit_msr_load_addr)) || 845 CC(nested_vmx_check_msr_switch(vcpu, 846 vmcs12->vm_exit_msr_store_count, 847 vmcs12->vm_exit_msr_store_addr))) 848 return -EINVAL; 849 850 return 0; 851 } 852 853 static int nested_vmx_check_entry_msr_switch_controls(struct kvm_vcpu *vcpu, 854 struct vmcs12 *vmcs12) 855 { 856 if (CC(nested_vmx_check_msr_switch(vcpu, 857 vmcs12->vm_entry_msr_load_count, 858 vmcs12->vm_entry_msr_load_addr))) 859 return -EINVAL; 860 861 return 0; 862 } 863 864 static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu, 865 struct vmcs12 *vmcs12) 866 { 867 if (!nested_cpu_has_pml(vmcs12)) 868 return 0; 869 870 if (CC(!nested_cpu_has_ept(vmcs12)) || 871 CC(!page_address_valid(vcpu, vmcs12->pml_address))) 872 return -EINVAL; 873 874 return 0; 875 } 876 877 static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu, 878 struct vmcs12 *vmcs12) 879 { 880 if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) && 881 !nested_cpu_has_ept(vmcs12))) 882 return -EINVAL; 883 return 0; 884 } 885 886 static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu, 887 struct vmcs12 *vmcs12) 888 { 889 if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) && 890 !nested_cpu_has_ept(vmcs12))) 891 return -EINVAL; 892 return 0; 893 } 894 895 static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu, 896 struct vmcs12 *vmcs12) 897 { 898 if (!nested_cpu_has_shadow_vmcs(vmcs12)) 899 return 0; 900 901 if (CC(!page_address_valid(vcpu, vmcs12->vmread_bitmap)) || 902 CC(!page_address_valid(vcpu, vmcs12->vmwrite_bitmap))) 903 return -EINVAL; 904 905 return 0; 906 } 907 908 static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu, 909 struct vmx_msr_entry *e) 910 { 911 /* x2APIC MSR accesses are not allowed */ 912 if (CC(vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8)) 913 return -EINVAL; 914 if (CC(e->index == MSR_IA32_UCODE_WRITE) || /* SDM Table 35-2 */ 915 CC(e->index == MSR_IA32_UCODE_REV)) 916 return -EINVAL; 917 if (CC(e->reserved != 0)) 918 return -EINVAL; 919 return 0; 920 } 921 922 static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu, 923 struct vmx_msr_entry *e) 924 { 925 if (CC(e->index == MSR_FS_BASE) || 926 CC(e->index == MSR_GS_BASE) || 927 CC(e->index == MSR_IA32_SMM_MONITOR_CTL) || /* SMM is not supported */ 928 nested_vmx_msr_check_common(vcpu, e)) 929 return -EINVAL; 930 return 0; 931 } 932 933 static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu, 934 struct vmx_msr_entry *e) 935 { 936 if (CC(e->index == MSR_IA32_SMBASE) 
|| /* SMM is not supported */ 937 nested_vmx_msr_check_common(vcpu, e)) 938 return -EINVAL; 939 return 0; 940 } 941 942 static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu) 943 { 944 struct vcpu_vmx *vmx = to_vmx(vcpu); 945 u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low, 946 vmx->nested.msrs.misc_high); 947 948 return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER; 949 } 950 951 /* 952 * Load guest's/host's msr at nested entry/exit. 953 * return 0 for success, entry index for failure. 954 * 955 * One of the failure modes for MSR load/store is when a list exceeds the 956 * virtual hardware's capacity. To maintain compatibility with hardware inasmuch 957 * as possible, process all valid entries before failing rather than precheck 958 * for a capacity violation. 959 */ 960 static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) 961 { 962 u32 i; 963 struct vmx_msr_entry e; 964 u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu); 965 966 for (i = 0; i < count; i++) { 967 if (unlikely(i >= max_msr_list_size)) 968 goto fail; 969 970 if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e), 971 &e, sizeof(e))) { 972 pr_debug_ratelimited( 973 "%s cannot read MSR entry (%u, 0x%08llx)\n", 974 __func__, i, gpa + i * sizeof(e)); 975 goto fail; 976 } 977 if (nested_vmx_load_msr_check(vcpu, &e)) { 978 pr_debug_ratelimited( 979 "%s check failed (%u, 0x%x, 0x%x)\n", 980 __func__, i, e.index, e.reserved); 981 goto fail; 982 } 983 if (kvm_set_msr(vcpu, e.index, e.value)) { 984 pr_debug_ratelimited( 985 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", 986 __func__, i, e.index, e.value); 987 goto fail; 988 } 989 } 990 return 0; 991 fail: 992 /* Note, max_msr_list_size is at most 4096, i.e. this can't wrap. */ 993 return i + 1; 994 } 995 996 static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu, 997 u32 msr_index, 998 u64 *data) 999 { 1000 struct vcpu_vmx *vmx = to_vmx(vcpu); 1001 1002 /* 1003 * If the L0 hypervisor stored a more accurate value for the TSC that 1004 * does not include the time taken for emulation of the L2->L1 1005 * VM-exit in L0, use the more accurate value. 
1006 */ 1007 if (msr_index == MSR_IA32_TSC) { 1008 int i = vmx_find_loadstore_msr_slot(&vmx->msr_autostore.guest, 1009 MSR_IA32_TSC); 1010 1011 if (i >= 0) { 1012 u64 val = vmx->msr_autostore.guest.val[i].value; 1013 1014 *data = kvm_read_l1_tsc(vcpu, val); 1015 return true; 1016 } 1017 } 1018 1019 if (kvm_get_msr(vcpu, msr_index, data)) { 1020 pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__, 1021 msr_index); 1022 return false; 1023 } 1024 return true; 1025 } 1026 1027 static bool read_and_check_msr_entry(struct kvm_vcpu *vcpu, u64 gpa, int i, 1028 struct vmx_msr_entry *e) 1029 { 1030 if (kvm_vcpu_read_guest(vcpu, 1031 gpa + i * sizeof(*e), 1032 e, 2 * sizeof(u32))) { 1033 pr_debug_ratelimited( 1034 "%s cannot read MSR entry (%u, 0x%08llx)\n", 1035 __func__, i, gpa + i * sizeof(*e)); 1036 return false; 1037 } 1038 if (nested_vmx_store_msr_check(vcpu, e)) { 1039 pr_debug_ratelimited( 1040 "%s check failed (%u, 0x%x, 0x%x)\n", 1041 __func__, i, e->index, e->reserved); 1042 return false; 1043 } 1044 return true; 1045 } 1046 1047 static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) 1048 { 1049 u64 data; 1050 u32 i; 1051 struct vmx_msr_entry e; 1052 u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu); 1053 1054 for (i = 0; i < count; i++) { 1055 if (unlikely(i >= max_msr_list_size)) 1056 return -EINVAL; 1057 1058 if (!read_and_check_msr_entry(vcpu, gpa, i, &e)) 1059 return -EINVAL; 1060 1061 if (!nested_vmx_get_vmexit_msr_value(vcpu, e.index, &data)) 1062 return -EINVAL; 1063 1064 if (kvm_vcpu_write_guest(vcpu, 1065 gpa + i * sizeof(e) + 1066 offsetof(struct vmx_msr_entry, value), 1067 &data, sizeof(data))) { 1068 pr_debug_ratelimited( 1069 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", 1070 __func__, i, e.index, data); 1071 return -EINVAL; 1072 } 1073 } 1074 return 0; 1075 } 1076 1077 static bool nested_msr_store_list_has_msr(struct kvm_vcpu *vcpu, u32 msr_index) 1078 { 1079 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 1080 u32 count = vmcs12->vm_exit_msr_store_count; 1081 u64 gpa = vmcs12->vm_exit_msr_store_addr; 1082 struct vmx_msr_entry e; 1083 u32 i; 1084 1085 for (i = 0; i < count; i++) { 1086 if (!read_and_check_msr_entry(vcpu, gpa, i, &e)) 1087 return false; 1088 1089 if (e.index == msr_index) 1090 return true; 1091 } 1092 return false; 1093 } 1094 1095 static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu, 1096 u32 msr_index) 1097 { 1098 struct vcpu_vmx *vmx = to_vmx(vcpu); 1099 struct vmx_msrs *autostore = &vmx->msr_autostore.guest; 1100 bool in_vmcs12_store_list; 1101 int msr_autostore_slot; 1102 bool in_autostore_list; 1103 int last; 1104 1105 msr_autostore_slot = vmx_find_loadstore_msr_slot(autostore, msr_index); 1106 in_autostore_list = msr_autostore_slot >= 0; 1107 in_vmcs12_store_list = nested_msr_store_list_has_msr(vcpu, msr_index); 1108 1109 if (in_vmcs12_store_list && !in_autostore_list) { 1110 if (autostore->nr == MAX_NR_LOADSTORE_MSRS) { 1111 /* 1112 * Emulated VMEntry does not fail here. Instead a less 1113 * accurate value will be returned by 1114 * nested_vmx_get_vmexit_msr_value() using kvm_get_msr() 1115 * instead of reading the value from the vmcs02 VMExit 1116 * MSR-store area. 1117 */ 1118 pr_warn_ratelimited( 1119 "Not enough msr entries in msr_autostore. 
Can't add msr %x\n", 1120 msr_index); 1121 return; 1122 } 1123 last = autostore->nr++; 1124 autostore->val[last].index = msr_index; 1125 } else if (!in_vmcs12_store_list && in_autostore_list) { 1126 last = --autostore->nr; 1127 autostore->val[msr_autostore_slot] = autostore->val[last]; 1128 } 1129 } 1130 1131 /* 1132 * Load guest's/host's cr3 at nested entry/exit. @nested_ept is true if we are 1133 * emulating VM-Entry into a guest with EPT enabled. On failure, the expected 1134 * Exit Qualification (for a VM-Entry consistency check VM-Exit) is assigned to 1135 * @entry_failure_code. 1136 */ 1137 static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, 1138 bool nested_ept, bool reload_pdptrs, 1139 enum vm_entry_failure_code *entry_failure_code) 1140 { 1141 if (CC(!kvm_vcpu_is_legal_cr3(vcpu, cr3))) { 1142 *entry_failure_code = ENTRY_FAIL_DEFAULT; 1143 return -EINVAL; 1144 } 1145 1146 /* 1147 * If PAE paging and EPT are both on, CR3 is not used by the CPU and 1148 * must not be dereferenced. 1149 */ 1150 if (reload_pdptrs && !nested_ept && is_pae_paging(vcpu) && 1151 CC(!load_pdptrs(vcpu, cr3))) { 1152 *entry_failure_code = ENTRY_FAIL_PDPTE; 1153 return -EINVAL; 1154 } 1155 1156 vcpu->arch.cr3 = cr3; 1157 kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3); 1158 1159 /* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */ 1160 kvm_init_mmu(vcpu); 1161 1162 if (!nested_ept) 1163 kvm_mmu_new_pgd(vcpu, cr3); 1164 1165 return 0; 1166 } 1167 1168 /* 1169 * Returns if KVM is able to config CPU to tag TLB entries 1170 * populated by L2 differently than TLB entries populated 1171 * by L1. 1172 * 1173 * If L0 uses EPT, L1 and L2 run with different EPTP because 1174 * guest_mode is part of kvm_mmu_page_role. Thus, TLB entries 1175 * are tagged with different EPTP. 1176 * 1177 * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged 1178 * with different VPID (L1 entries are tagged with vmx->vpid 1179 * while L2 entries are tagged with vmx->nested.vpid02). 1180 */ 1181 static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu) 1182 { 1183 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 1184 1185 return enable_ept || 1186 (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02); 1187 } 1188 1189 static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu, 1190 struct vmcs12 *vmcs12, 1191 bool is_vmenter) 1192 { 1193 struct vcpu_vmx *vmx = to_vmx(vcpu); 1194 1195 /* Handle pending Hyper-V TLB flush requests */ 1196 kvm_hv_nested_transtion_tlb_flush(vcpu, enable_ept); 1197 1198 /* 1199 * If vmcs12 doesn't use VPID, L1 expects linear and combined mappings 1200 * for *all* contexts to be flushed on VM-Enter/VM-Exit, i.e. it's a 1201 * full TLB flush from the guest's perspective. This is required even 1202 * if VPID is disabled in the host as KVM may need to synchronize the 1203 * MMU in response to the guest TLB flush. 1204 * 1205 * Note, using TLB_FLUSH_GUEST is correct even if nested EPT is in use. 1206 * EPT is a special snowflake, as guest-physical mappings aren't 1207 * flushed on VPID invalidations, including VM-Enter or VM-Exit with 1208 * VPID disabled. As a result, KVM _never_ needs to sync nEPT 1209 * entries on VM-Enter because L1 can't rely on VM-Enter to flush 1210 * those mappings. 1211 */ 1212 if (!nested_cpu_has_vpid(vmcs12)) { 1213 kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); 1214 return; 1215 } 1216 1217 /* L2 should never have a VPID if VPID is disabled. */ 1218 WARN_ON(!enable_vpid); 1219 1220 /* 1221 * VPID is enabled and in use by vmcs12. 
If vpid12 is changing, then 1222 * emulate a guest TLB flush as KVM does not track vpid12 history nor 1223 * is the VPID incorporated into the MMU context. I.e. KVM must assume 1224 * that the new vpid12 has never been used and thus represents a new 1225 * guest ASID that cannot have entries in the TLB. 1226 */ 1227 if (is_vmenter && vmcs12->virtual_processor_id != vmx->nested.last_vpid) { 1228 vmx->nested.last_vpid = vmcs12->virtual_processor_id; 1229 kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); 1230 return; 1231 } 1232 1233 /* 1234 * If VPID is enabled, used by vmc12, and vpid12 is not changing but 1235 * does not have a unique TLB tag (ASID), i.e. EPT is disabled and 1236 * KVM was unable to allocate a VPID for L2, flush the current context 1237 * as the effective ASID is common to both L1 and L2. 1238 */ 1239 if (!nested_has_guest_tlb_tag(vcpu)) 1240 kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); 1241 } 1242 1243 static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask) 1244 { 1245 superset &= mask; 1246 subset &= mask; 1247 1248 return (superset | subset) == superset; 1249 } 1250 1251 static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data) 1252 { 1253 const u64 feature_and_reserved = 1254 /* feature (except bit 48; see below) */ 1255 BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) | 1256 /* reserved */ 1257 BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56); 1258 u64 vmx_basic = vmcs_config.nested.basic; 1259 1260 if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved)) 1261 return -EINVAL; 1262 1263 /* 1264 * KVM does not emulate a version of VMX that constrains physical 1265 * addresses of VMX structures (e.g. VMCS) to 32-bits. 1266 */ 1267 if (data & BIT_ULL(48)) 1268 return -EINVAL; 1269 1270 if (vmx_basic_vmcs_revision_id(vmx_basic) != 1271 vmx_basic_vmcs_revision_id(data)) 1272 return -EINVAL; 1273 1274 if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data)) 1275 return -EINVAL; 1276 1277 vmx->nested.msrs.basic = data; 1278 return 0; 1279 } 1280 1281 static void vmx_get_control_msr(struct nested_vmx_msrs *msrs, u32 msr_index, 1282 u32 **low, u32 **high) 1283 { 1284 switch (msr_index) { 1285 case MSR_IA32_VMX_TRUE_PINBASED_CTLS: 1286 *low = &msrs->pinbased_ctls_low; 1287 *high = &msrs->pinbased_ctls_high; 1288 break; 1289 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: 1290 *low = &msrs->procbased_ctls_low; 1291 *high = &msrs->procbased_ctls_high; 1292 break; 1293 case MSR_IA32_VMX_TRUE_EXIT_CTLS: 1294 *low = &msrs->exit_ctls_low; 1295 *high = &msrs->exit_ctls_high; 1296 break; 1297 case MSR_IA32_VMX_TRUE_ENTRY_CTLS: 1298 *low = &msrs->entry_ctls_low; 1299 *high = &msrs->entry_ctls_high; 1300 break; 1301 case MSR_IA32_VMX_PROCBASED_CTLS2: 1302 *low = &msrs->secondary_ctls_low; 1303 *high = &msrs->secondary_ctls_high; 1304 break; 1305 default: 1306 BUG(); 1307 } 1308 } 1309 1310 static int 1311 vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) 1312 { 1313 u32 *lowp, *highp; 1314 u64 supported; 1315 1316 vmx_get_control_msr(&vmcs_config.nested, msr_index, &lowp, &highp); 1317 1318 supported = vmx_control_msr(*lowp, *highp); 1319 1320 /* Check must-be-1 bits are still 1. */ 1321 if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0))) 1322 return -EINVAL; 1323 1324 /* Check must-be-0 bits are still 0. 
*/ 1325 if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32))) 1326 return -EINVAL; 1327 1328 vmx_get_control_msr(&vmx->nested.msrs, msr_index, &lowp, &highp); 1329 *lowp = data; 1330 *highp = data >> 32; 1331 return 0; 1332 } 1333 1334 static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data) 1335 { 1336 const u64 feature_and_reserved_bits = 1337 /* feature */ 1338 BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) | 1339 BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) | 1340 /* reserved */ 1341 GENMASK_ULL(13, 9) | BIT_ULL(31); 1342 u64 vmx_misc = vmx_control_msr(vmcs_config.nested.misc_low, 1343 vmcs_config.nested.misc_high); 1344 1345 if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits)) 1346 return -EINVAL; 1347 1348 if ((vmx->nested.msrs.pinbased_ctls_high & 1349 PIN_BASED_VMX_PREEMPTION_TIMER) && 1350 vmx_misc_preemption_timer_rate(data) != 1351 vmx_misc_preemption_timer_rate(vmx_misc)) 1352 return -EINVAL; 1353 1354 if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc)) 1355 return -EINVAL; 1356 1357 if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc)) 1358 return -EINVAL; 1359 1360 if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc)) 1361 return -EINVAL; 1362 1363 vmx->nested.msrs.misc_low = data; 1364 vmx->nested.msrs.misc_high = data >> 32; 1365 1366 return 0; 1367 } 1368 1369 static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data) 1370 { 1371 u64 vmx_ept_vpid_cap = vmx_control_msr(vmcs_config.nested.ept_caps, 1372 vmcs_config.nested.vpid_caps); 1373 1374 /* Every bit is either reserved or a feature bit. */ 1375 if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL)) 1376 return -EINVAL; 1377 1378 vmx->nested.msrs.ept_caps = data; 1379 vmx->nested.msrs.vpid_caps = data >> 32; 1380 return 0; 1381 } 1382 1383 static u64 *vmx_get_fixed0_msr(struct nested_vmx_msrs *msrs, u32 msr_index) 1384 { 1385 switch (msr_index) { 1386 case MSR_IA32_VMX_CR0_FIXED0: 1387 return &msrs->cr0_fixed0; 1388 case MSR_IA32_VMX_CR4_FIXED0: 1389 return &msrs->cr4_fixed0; 1390 default: 1391 BUG(); 1392 } 1393 } 1394 1395 static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) 1396 { 1397 const u64 *msr = vmx_get_fixed0_msr(&vmcs_config.nested, msr_index); 1398 1399 /* 1400 * 1 bits (which indicates bits which "must-be-1" during VMX operation) 1401 * must be 1 in the restored value. 1402 */ 1403 if (!is_bitwise_subset(data, *msr, -1ULL)) 1404 return -EINVAL; 1405 1406 *vmx_get_fixed0_msr(&vmx->nested.msrs, msr_index) = data; 1407 return 0; 1408 } 1409 1410 /* 1411 * Called when userspace is restoring VMX MSRs. 1412 * 1413 * Returns 0 on success, non-0 otherwise. 1414 */ 1415 int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) 1416 { 1417 struct vcpu_vmx *vmx = to_vmx(vcpu); 1418 1419 /* 1420 * Don't allow changes to the VMX capability MSRs while the vCPU 1421 * is in VMX operation. 1422 */ 1423 if (vmx->nested.vmxon) 1424 return -EBUSY; 1425 1426 switch (msr_index) { 1427 case MSR_IA32_VMX_BASIC: 1428 return vmx_restore_vmx_basic(vmx, data); 1429 case MSR_IA32_VMX_PINBASED_CTLS: 1430 case MSR_IA32_VMX_PROCBASED_CTLS: 1431 case MSR_IA32_VMX_EXIT_CTLS: 1432 case MSR_IA32_VMX_ENTRY_CTLS: 1433 /* 1434 * The "non-true" VMX capability MSRs are generated from the 1435 * "true" MSRs, so we do not support restoring them directly. 1436 * 1437 * If userspace wants to emulate VMX_BASIC[55]=0, userspace 1438 * should restore the "true" MSRs with the must-be-1 bits 1439 * set according to the SDM Vol 3. 
A.2 "RESERVED CONTROLS AND 1440 * DEFAULT SETTINGS". 1441 */ 1442 return -EINVAL; 1443 case MSR_IA32_VMX_TRUE_PINBASED_CTLS: 1444 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: 1445 case MSR_IA32_VMX_TRUE_EXIT_CTLS: 1446 case MSR_IA32_VMX_TRUE_ENTRY_CTLS: 1447 case MSR_IA32_VMX_PROCBASED_CTLS2: 1448 return vmx_restore_control_msr(vmx, msr_index, data); 1449 case MSR_IA32_VMX_MISC: 1450 return vmx_restore_vmx_misc(vmx, data); 1451 case MSR_IA32_VMX_CR0_FIXED0: 1452 case MSR_IA32_VMX_CR4_FIXED0: 1453 return vmx_restore_fixed0_msr(vmx, msr_index, data); 1454 case MSR_IA32_VMX_CR0_FIXED1: 1455 case MSR_IA32_VMX_CR4_FIXED1: 1456 /* 1457 * These MSRs are generated based on the vCPU's CPUID, so we 1458 * do not support restoring them directly. 1459 */ 1460 return -EINVAL; 1461 case MSR_IA32_VMX_EPT_VPID_CAP: 1462 return vmx_restore_vmx_ept_vpid_cap(vmx, data); 1463 case MSR_IA32_VMX_VMCS_ENUM: 1464 vmx->nested.msrs.vmcs_enum = data; 1465 return 0; 1466 case MSR_IA32_VMX_VMFUNC: 1467 if (data & ~vmcs_config.nested.vmfunc_controls) 1468 return -EINVAL; 1469 vmx->nested.msrs.vmfunc_controls = data; 1470 return 0; 1471 default: 1472 /* 1473 * The rest of the VMX capability MSRs do not support restore. 1474 */ 1475 return -EINVAL; 1476 } 1477 } 1478 1479 /* Returns 0 on success, non-0 otherwise. */ 1480 int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata) 1481 { 1482 switch (msr_index) { 1483 case MSR_IA32_VMX_BASIC: 1484 *pdata = msrs->basic; 1485 break; 1486 case MSR_IA32_VMX_TRUE_PINBASED_CTLS: 1487 case MSR_IA32_VMX_PINBASED_CTLS: 1488 *pdata = vmx_control_msr( 1489 msrs->pinbased_ctls_low, 1490 msrs->pinbased_ctls_high); 1491 if (msr_index == MSR_IA32_VMX_PINBASED_CTLS) 1492 *pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 1493 break; 1494 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: 1495 case MSR_IA32_VMX_PROCBASED_CTLS: 1496 *pdata = vmx_control_msr( 1497 msrs->procbased_ctls_low, 1498 msrs->procbased_ctls_high); 1499 if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS) 1500 *pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 1501 break; 1502 case MSR_IA32_VMX_TRUE_EXIT_CTLS: 1503 case MSR_IA32_VMX_EXIT_CTLS: 1504 *pdata = vmx_control_msr( 1505 msrs->exit_ctls_low, 1506 msrs->exit_ctls_high); 1507 if (msr_index == MSR_IA32_VMX_EXIT_CTLS) 1508 *pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; 1509 break; 1510 case MSR_IA32_VMX_TRUE_ENTRY_CTLS: 1511 case MSR_IA32_VMX_ENTRY_CTLS: 1512 *pdata = vmx_control_msr( 1513 msrs->entry_ctls_low, 1514 msrs->entry_ctls_high); 1515 if (msr_index == MSR_IA32_VMX_ENTRY_CTLS) 1516 *pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; 1517 break; 1518 case MSR_IA32_VMX_MISC: 1519 *pdata = vmx_control_msr( 1520 msrs->misc_low, 1521 msrs->misc_high); 1522 break; 1523 case MSR_IA32_VMX_CR0_FIXED0: 1524 *pdata = msrs->cr0_fixed0; 1525 break; 1526 case MSR_IA32_VMX_CR0_FIXED1: 1527 *pdata = msrs->cr0_fixed1; 1528 break; 1529 case MSR_IA32_VMX_CR4_FIXED0: 1530 *pdata = msrs->cr4_fixed0; 1531 break; 1532 case MSR_IA32_VMX_CR4_FIXED1: 1533 *pdata = msrs->cr4_fixed1; 1534 break; 1535 case MSR_IA32_VMX_VMCS_ENUM: 1536 *pdata = msrs->vmcs_enum; 1537 break; 1538 case MSR_IA32_VMX_PROCBASED_CTLS2: 1539 *pdata = vmx_control_msr( 1540 msrs->secondary_ctls_low, 1541 msrs->secondary_ctls_high); 1542 break; 1543 case MSR_IA32_VMX_EPT_VPID_CAP: 1544 *pdata = msrs->ept_caps | 1545 ((u64)msrs->vpid_caps << 32); 1546 break; 1547 case MSR_IA32_VMX_VMFUNC: 1548 *pdata = msrs->vmfunc_controls; 1549 break; 1550 default: 1551 return 1; 1552 } 1553 1554 return 0; 1555 } 1556 1557 /* 1558 * Copy the writable 
VMCS shadow fields back to the VMCS12, in case they have 1559 * been modified by the L1 guest. Note, "writable" in this context means 1560 * "writable by the guest", i.e. tagged SHADOW_FIELD_RW; the set of 1561 * fields tagged SHADOW_FIELD_RO may or may not align with the "read-only" 1562 * VM-exit information fields (which are actually writable if the vCPU is 1563 * configured to support "VMWRITE to any supported field in the VMCS"). 1564 */ 1565 static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) 1566 { 1567 struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; 1568 struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu); 1569 struct shadow_vmcs_field field; 1570 unsigned long val; 1571 int i; 1572 1573 if (WARN_ON(!shadow_vmcs)) 1574 return; 1575 1576 preempt_disable(); 1577 1578 vmcs_load(shadow_vmcs); 1579 1580 for (i = 0; i < max_shadow_read_write_fields; i++) { 1581 field = shadow_read_write_fields[i]; 1582 val = __vmcs_readl(field.encoding); 1583 vmcs12_write_any(vmcs12, field.encoding, field.offset, val); 1584 } 1585 1586 vmcs_clear(shadow_vmcs); 1587 vmcs_load(vmx->loaded_vmcs->vmcs); 1588 1589 preempt_enable(); 1590 } 1591 1592 static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) 1593 { 1594 const struct shadow_vmcs_field *fields[] = { 1595 shadow_read_write_fields, 1596 shadow_read_only_fields 1597 }; 1598 const int max_fields[] = { 1599 max_shadow_read_write_fields, 1600 max_shadow_read_only_fields 1601 }; 1602 struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; 1603 struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu); 1604 struct shadow_vmcs_field field; 1605 unsigned long val; 1606 int i, q; 1607 1608 if (WARN_ON(!shadow_vmcs)) 1609 return; 1610 1611 vmcs_load(shadow_vmcs); 1612 1613 for (q = 0; q < ARRAY_SIZE(fields); q++) { 1614 for (i = 0; i < max_fields[q]; i++) { 1615 field = fields[q][i]; 1616 val = vmcs12_read_any(vmcs12, field.encoding, 1617 field.offset); 1618 __vmcs_writel(field.encoding, val); 1619 } 1620 } 1621 1622 vmcs_clear(shadow_vmcs); 1623 vmcs_load(vmx->loaded_vmcs->vmcs); 1624 } 1625 1626 static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields) 1627 { 1628 #ifdef CONFIG_KVM_HYPERV 1629 struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; 1630 struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); 1631 struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(&vmx->vcpu); 1632 1633 /* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */ 1634 vmcs12->tpr_threshold = evmcs->tpr_threshold; 1635 vmcs12->guest_rip = evmcs->guest_rip; 1636 1637 if (unlikely(!(hv_clean_fields & 1638 HV_VMX_ENLIGHTENED_CLEAN_FIELD_ENLIGHTENMENTSCONTROL))) { 1639 hv_vcpu->nested.pa_page_gpa = evmcs->partition_assist_page; 1640 hv_vcpu->nested.vm_id = evmcs->hv_vm_id; 1641 hv_vcpu->nested.vp_id = evmcs->hv_vp_id; 1642 } 1643 1644 if (unlikely(!(hv_clean_fields & 1645 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) { 1646 vmcs12->guest_rsp = evmcs->guest_rsp; 1647 vmcs12->guest_rflags = evmcs->guest_rflags; 1648 vmcs12->guest_interruptibility_info = 1649 evmcs->guest_interruptibility_info; 1650 /* 1651 * Not present in struct vmcs12: 1652 * vmcs12->guest_ssp = evmcs->guest_ssp; 1653 */ 1654 } 1655 1656 if (unlikely(!(hv_clean_fields & 1657 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) { 1658 vmcs12->cpu_based_vm_exec_control = 1659 evmcs->cpu_based_vm_exec_control; 1660 } 1661 1662 if (unlikely(!(hv_clean_fields & 1663 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN))) { 1664 vmcs12->exception_bitmap = evmcs->exception_bitmap; 1665 } 1666 1667 if (unlikely(!(hv_clean_fields & 1668 
HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) { 1669 vmcs12->vm_entry_controls = evmcs->vm_entry_controls; 1670 } 1671 1672 if (unlikely(!(hv_clean_fields & 1673 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) { 1674 vmcs12->vm_entry_intr_info_field = 1675 evmcs->vm_entry_intr_info_field; 1676 vmcs12->vm_entry_exception_error_code = 1677 evmcs->vm_entry_exception_error_code; 1678 vmcs12->vm_entry_instruction_len = 1679 evmcs->vm_entry_instruction_len; 1680 } 1681 1682 if (unlikely(!(hv_clean_fields & 1683 HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) { 1684 vmcs12->host_ia32_pat = evmcs->host_ia32_pat; 1685 vmcs12->host_ia32_efer = evmcs->host_ia32_efer; 1686 vmcs12->host_cr0 = evmcs->host_cr0; 1687 vmcs12->host_cr3 = evmcs->host_cr3; 1688 vmcs12->host_cr4 = evmcs->host_cr4; 1689 vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp; 1690 vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip; 1691 vmcs12->host_rip = evmcs->host_rip; 1692 vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs; 1693 vmcs12->host_es_selector = evmcs->host_es_selector; 1694 vmcs12->host_cs_selector = evmcs->host_cs_selector; 1695 vmcs12->host_ss_selector = evmcs->host_ss_selector; 1696 vmcs12->host_ds_selector = evmcs->host_ds_selector; 1697 vmcs12->host_fs_selector = evmcs->host_fs_selector; 1698 vmcs12->host_gs_selector = evmcs->host_gs_selector; 1699 vmcs12->host_tr_selector = evmcs->host_tr_selector; 1700 vmcs12->host_ia32_perf_global_ctrl = evmcs->host_ia32_perf_global_ctrl; 1701 /* 1702 * Not present in struct vmcs12: 1703 * vmcs12->host_ia32_s_cet = evmcs->host_ia32_s_cet; 1704 * vmcs12->host_ssp = evmcs->host_ssp; 1705 * vmcs12->host_ia32_int_ssp_table_addr = evmcs->host_ia32_int_ssp_table_addr; 1706 */ 1707 } 1708 1709 if (unlikely(!(hv_clean_fields & 1710 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1))) { 1711 vmcs12->pin_based_vm_exec_control = 1712 evmcs->pin_based_vm_exec_control; 1713 vmcs12->vm_exit_controls = evmcs->vm_exit_controls; 1714 vmcs12->secondary_vm_exec_control = 1715 evmcs->secondary_vm_exec_control; 1716 } 1717 1718 if (unlikely(!(hv_clean_fields & 1719 HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) { 1720 vmcs12->io_bitmap_a = evmcs->io_bitmap_a; 1721 vmcs12->io_bitmap_b = evmcs->io_bitmap_b; 1722 } 1723 1724 if (unlikely(!(hv_clean_fields & 1725 HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) { 1726 vmcs12->msr_bitmap = evmcs->msr_bitmap; 1727 } 1728 1729 if (unlikely(!(hv_clean_fields & 1730 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) { 1731 vmcs12->guest_es_base = evmcs->guest_es_base; 1732 vmcs12->guest_cs_base = evmcs->guest_cs_base; 1733 vmcs12->guest_ss_base = evmcs->guest_ss_base; 1734 vmcs12->guest_ds_base = evmcs->guest_ds_base; 1735 vmcs12->guest_fs_base = evmcs->guest_fs_base; 1736 vmcs12->guest_gs_base = evmcs->guest_gs_base; 1737 vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base; 1738 vmcs12->guest_tr_base = evmcs->guest_tr_base; 1739 vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base; 1740 vmcs12->guest_idtr_base = evmcs->guest_idtr_base; 1741 vmcs12->guest_es_limit = evmcs->guest_es_limit; 1742 vmcs12->guest_cs_limit = evmcs->guest_cs_limit; 1743 vmcs12->guest_ss_limit = evmcs->guest_ss_limit; 1744 vmcs12->guest_ds_limit = evmcs->guest_ds_limit; 1745 vmcs12->guest_fs_limit = evmcs->guest_fs_limit; 1746 vmcs12->guest_gs_limit = evmcs->guest_gs_limit; 1747 vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit; 1748 vmcs12->guest_tr_limit = evmcs->guest_tr_limit; 1749 vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit; 1750 vmcs12->guest_idtr_limit = 
evmcs->guest_idtr_limit; 1751 vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes; 1752 vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes; 1753 vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes; 1754 vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes; 1755 vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes; 1756 vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes; 1757 vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes; 1758 vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes; 1759 vmcs12->guest_es_selector = evmcs->guest_es_selector; 1760 vmcs12->guest_cs_selector = evmcs->guest_cs_selector; 1761 vmcs12->guest_ss_selector = evmcs->guest_ss_selector; 1762 vmcs12->guest_ds_selector = evmcs->guest_ds_selector; 1763 vmcs12->guest_fs_selector = evmcs->guest_fs_selector; 1764 vmcs12->guest_gs_selector = evmcs->guest_gs_selector; 1765 vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector; 1766 vmcs12->guest_tr_selector = evmcs->guest_tr_selector; 1767 } 1768 1769 if (unlikely(!(hv_clean_fields & 1770 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) { 1771 vmcs12->tsc_offset = evmcs->tsc_offset; 1772 vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr; 1773 vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap; 1774 vmcs12->encls_exiting_bitmap = evmcs->encls_exiting_bitmap; 1775 vmcs12->tsc_multiplier = evmcs->tsc_multiplier; 1776 } 1777 1778 if (unlikely(!(hv_clean_fields & 1779 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) { 1780 vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask; 1781 vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask; 1782 vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow; 1783 vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow; 1784 vmcs12->guest_cr0 = evmcs->guest_cr0; 1785 vmcs12->guest_cr3 = evmcs->guest_cr3; 1786 vmcs12->guest_cr4 = evmcs->guest_cr4; 1787 vmcs12->guest_dr7 = evmcs->guest_dr7; 1788 } 1789 1790 if (unlikely(!(hv_clean_fields & 1791 HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) { 1792 vmcs12->host_fs_base = evmcs->host_fs_base; 1793 vmcs12->host_gs_base = evmcs->host_gs_base; 1794 vmcs12->host_tr_base = evmcs->host_tr_base; 1795 vmcs12->host_gdtr_base = evmcs->host_gdtr_base; 1796 vmcs12->host_idtr_base = evmcs->host_idtr_base; 1797 vmcs12->host_rsp = evmcs->host_rsp; 1798 } 1799 1800 if (unlikely(!(hv_clean_fields & 1801 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) { 1802 vmcs12->ept_pointer = evmcs->ept_pointer; 1803 vmcs12->virtual_processor_id = evmcs->virtual_processor_id; 1804 } 1805 1806 if (unlikely(!(hv_clean_fields & 1807 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) { 1808 vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer; 1809 vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl; 1810 vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat; 1811 vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer; 1812 vmcs12->guest_pdptr0 = evmcs->guest_pdptr0; 1813 vmcs12->guest_pdptr1 = evmcs->guest_pdptr1; 1814 vmcs12->guest_pdptr2 = evmcs->guest_pdptr2; 1815 vmcs12->guest_pdptr3 = evmcs->guest_pdptr3; 1816 vmcs12->guest_pending_dbg_exceptions = 1817 evmcs->guest_pending_dbg_exceptions; 1818 vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp; 1819 vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip; 1820 vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs; 1821 vmcs12->guest_activity_state = evmcs->guest_activity_state; 1822 vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs; 1823 vmcs12->guest_ia32_perf_global_ctrl = evmcs->guest_ia32_perf_global_ctrl; 1824 /* 1825 * Not present in struct vmcs12: 1826 * vmcs12->guest_ia32_s_cet = 
evmcs->guest_ia32_s_cet; 1827 * vmcs12->guest_ia32_lbr_ctl = evmcs->guest_ia32_lbr_ctl; 1828 * vmcs12->guest_ia32_int_ssp_table_addr = evmcs->guest_ia32_int_ssp_table_addr; 1829 */ 1830 } 1831 1832 /* 1833 * Not used? 1834 * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr; 1835 * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr; 1836 * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr; 1837 * vmcs12->page_fault_error_code_mask = 1838 * evmcs->page_fault_error_code_mask; 1839 * vmcs12->page_fault_error_code_match = 1840 * evmcs->page_fault_error_code_match; 1841 * vmcs12->cr3_target_count = evmcs->cr3_target_count; 1842 * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count; 1843 * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count; 1844 * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count; 1845 */ 1846 1847 /* 1848 * Read only fields: 1849 * vmcs12->guest_physical_address = evmcs->guest_physical_address; 1850 * vmcs12->vm_instruction_error = evmcs->vm_instruction_error; 1851 * vmcs12->vm_exit_reason = evmcs->vm_exit_reason; 1852 * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info; 1853 * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code; 1854 * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field; 1855 * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code; 1856 * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len; 1857 * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info; 1858 * vmcs12->exit_qualification = evmcs->exit_qualification; 1859 * vmcs12->guest_linear_address = evmcs->guest_linear_address; 1860 * 1861 * Not present in struct vmcs12: 1862 * vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx; 1863 * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi; 1864 * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi; 1865 * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip; 1866 */ 1867 1868 return; 1869 #else /* CONFIG_KVM_HYPERV */ 1870 KVM_BUG_ON(1, vmx->vcpu.kvm); 1871 #endif /* CONFIG_KVM_HYPERV */ 1872 } 1873 1874 static void copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx) 1875 { 1876 #ifdef CONFIG_KVM_HYPERV 1877 struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; 1878 struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); 1879 1880 /* 1881 * Should not be changed by KVM: 1882 * 1883 * evmcs->host_es_selector = vmcs12->host_es_selector; 1884 * evmcs->host_cs_selector = vmcs12->host_cs_selector; 1885 * evmcs->host_ss_selector = vmcs12->host_ss_selector; 1886 * evmcs->host_ds_selector = vmcs12->host_ds_selector; 1887 * evmcs->host_fs_selector = vmcs12->host_fs_selector; 1888 * evmcs->host_gs_selector = vmcs12->host_gs_selector; 1889 * evmcs->host_tr_selector = vmcs12->host_tr_selector; 1890 * evmcs->host_ia32_pat = vmcs12->host_ia32_pat; 1891 * evmcs->host_ia32_efer = vmcs12->host_ia32_efer; 1892 * evmcs->host_cr0 = vmcs12->host_cr0; 1893 * evmcs->host_cr3 = vmcs12->host_cr3; 1894 * evmcs->host_cr4 = vmcs12->host_cr4; 1895 * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp; 1896 * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip; 1897 * evmcs->host_rip = vmcs12->host_rip; 1898 * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs; 1899 * evmcs->host_fs_base = vmcs12->host_fs_base; 1900 * evmcs->host_gs_base = vmcs12->host_gs_base; 1901 * evmcs->host_tr_base = vmcs12->host_tr_base; 1902 * evmcs->host_gdtr_base = vmcs12->host_gdtr_base; 1903 * 
evmcs->host_idtr_base = vmcs12->host_idtr_base; 1904 * evmcs->host_rsp = vmcs12->host_rsp; 1905 * sync_vmcs02_to_vmcs12() doesn't read these: 1906 * evmcs->io_bitmap_a = vmcs12->io_bitmap_a; 1907 * evmcs->io_bitmap_b = vmcs12->io_bitmap_b; 1908 * evmcs->msr_bitmap = vmcs12->msr_bitmap; 1909 * evmcs->ept_pointer = vmcs12->ept_pointer; 1910 * evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap; 1911 * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr; 1912 * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr; 1913 * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr; 1914 * evmcs->tpr_threshold = vmcs12->tpr_threshold; 1915 * evmcs->virtual_processor_id = vmcs12->virtual_processor_id; 1916 * evmcs->exception_bitmap = vmcs12->exception_bitmap; 1917 * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer; 1918 * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control; 1919 * evmcs->vm_exit_controls = vmcs12->vm_exit_controls; 1920 * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control; 1921 * evmcs->page_fault_error_code_mask = 1922 * vmcs12->page_fault_error_code_mask; 1923 * evmcs->page_fault_error_code_match = 1924 * vmcs12->page_fault_error_code_match; 1925 * evmcs->cr3_target_count = vmcs12->cr3_target_count; 1926 * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr; 1927 * evmcs->tsc_offset = vmcs12->tsc_offset; 1928 * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl; 1929 * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask; 1930 * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask; 1931 * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow; 1932 * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow; 1933 * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count; 1934 * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count; 1935 * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count; 1936 * evmcs->guest_ia32_perf_global_ctrl = vmcs12->guest_ia32_perf_global_ctrl; 1937 * evmcs->host_ia32_perf_global_ctrl = vmcs12->host_ia32_perf_global_ctrl; 1938 * evmcs->encls_exiting_bitmap = vmcs12->encls_exiting_bitmap; 1939 * evmcs->tsc_multiplier = vmcs12->tsc_multiplier; 1940 * 1941 * Not present in struct vmcs12: 1942 * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx; 1943 * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi; 1944 * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi; 1945 * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip; 1946 * evmcs->host_ia32_s_cet = vmcs12->host_ia32_s_cet; 1947 * evmcs->host_ssp = vmcs12->host_ssp; 1948 * evmcs->host_ia32_int_ssp_table_addr = vmcs12->host_ia32_int_ssp_table_addr; 1949 * evmcs->guest_ia32_s_cet = vmcs12->guest_ia32_s_cet; 1950 * evmcs->guest_ia32_lbr_ctl = vmcs12->guest_ia32_lbr_ctl; 1951 * evmcs->guest_ia32_int_ssp_table_addr = vmcs12->guest_ia32_int_ssp_table_addr; 1952 * evmcs->guest_ssp = vmcs12->guest_ssp; 1953 */ 1954 1955 evmcs->guest_es_selector = vmcs12->guest_es_selector; 1956 evmcs->guest_cs_selector = vmcs12->guest_cs_selector; 1957 evmcs->guest_ss_selector = vmcs12->guest_ss_selector; 1958 evmcs->guest_ds_selector = vmcs12->guest_ds_selector; 1959 evmcs->guest_fs_selector = vmcs12->guest_fs_selector; 1960 evmcs->guest_gs_selector = vmcs12->guest_gs_selector; 1961 evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector; 1962 evmcs->guest_tr_selector = vmcs12->guest_tr_selector; 1963 1964 evmcs->guest_es_limit = vmcs12->guest_es_limit; 1965 
evmcs->guest_cs_limit = vmcs12->guest_cs_limit; 1966 evmcs->guest_ss_limit = vmcs12->guest_ss_limit; 1967 evmcs->guest_ds_limit = vmcs12->guest_ds_limit; 1968 evmcs->guest_fs_limit = vmcs12->guest_fs_limit; 1969 evmcs->guest_gs_limit = vmcs12->guest_gs_limit; 1970 evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit; 1971 evmcs->guest_tr_limit = vmcs12->guest_tr_limit; 1972 evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit; 1973 evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit; 1974 1975 evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes; 1976 evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes; 1977 evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes; 1978 evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes; 1979 evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes; 1980 evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes; 1981 evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes; 1982 evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes; 1983 1984 evmcs->guest_es_base = vmcs12->guest_es_base; 1985 evmcs->guest_cs_base = vmcs12->guest_cs_base; 1986 evmcs->guest_ss_base = vmcs12->guest_ss_base; 1987 evmcs->guest_ds_base = vmcs12->guest_ds_base; 1988 evmcs->guest_fs_base = vmcs12->guest_fs_base; 1989 evmcs->guest_gs_base = vmcs12->guest_gs_base; 1990 evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base; 1991 evmcs->guest_tr_base = vmcs12->guest_tr_base; 1992 evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base; 1993 evmcs->guest_idtr_base = vmcs12->guest_idtr_base; 1994 1995 evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat; 1996 evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer; 1997 1998 evmcs->guest_pdptr0 = vmcs12->guest_pdptr0; 1999 evmcs->guest_pdptr1 = vmcs12->guest_pdptr1; 2000 evmcs->guest_pdptr2 = vmcs12->guest_pdptr2; 2001 evmcs->guest_pdptr3 = vmcs12->guest_pdptr3; 2002 2003 evmcs->guest_pending_dbg_exceptions = 2004 vmcs12->guest_pending_dbg_exceptions; 2005 evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp; 2006 evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip; 2007 2008 evmcs->guest_activity_state = vmcs12->guest_activity_state; 2009 evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs; 2010 2011 evmcs->guest_cr0 = vmcs12->guest_cr0; 2012 evmcs->guest_cr3 = vmcs12->guest_cr3; 2013 evmcs->guest_cr4 = vmcs12->guest_cr4; 2014 evmcs->guest_dr7 = vmcs12->guest_dr7; 2015 2016 evmcs->guest_physical_address = vmcs12->guest_physical_address; 2017 2018 evmcs->vm_instruction_error = vmcs12->vm_instruction_error; 2019 evmcs->vm_exit_reason = vmcs12->vm_exit_reason; 2020 evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info; 2021 evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code; 2022 evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field; 2023 evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code; 2024 evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len; 2025 evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info; 2026 2027 evmcs->exit_qualification = vmcs12->exit_qualification; 2028 2029 evmcs->guest_linear_address = vmcs12->guest_linear_address; 2030 evmcs->guest_rsp = vmcs12->guest_rsp; 2031 evmcs->guest_rflags = vmcs12->guest_rflags; 2032 2033 evmcs->guest_interruptibility_info = 2034 vmcs12->guest_interruptibility_info; 2035 evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control; 2036 evmcs->vm_entry_controls = vmcs12->vm_entry_controls; 2037 evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field; 2038 evmcs->vm_entry_exception_error_code = 2039 vmcs12->vm_entry_exception_error_code; 2040 
evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len;
2041
2042 evmcs->guest_rip = vmcs12->guest_rip;
2043
2044 evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs;
2045
2046 return;
2047 #else /* CONFIG_KVM_HYPERV */
2048 KVM_BUG_ON(1, vmx->vcpu.kvm);
2049 #endif /* CONFIG_KVM_HYPERV */
2050 }
2051
2052 /*
2053 * This is the equivalent of the nested hypervisor executing the vmptrld
2054 * instruction.
2055 */
2056 static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
2057 struct kvm_vcpu *vcpu, bool from_launch)
2058 {
2059 #ifdef CONFIG_KVM_HYPERV
2060 struct vcpu_vmx *vmx = to_vmx(vcpu);
2061 bool evmcs_gpa_changed = false;
2062 u64 evmcs_gpa;
2063
2064 if (likely(!guest_cpuid_has_evmcs(vcpu)))
2065 return EVMPTRLD_DISABLED;
2066
2067 evmcs_gpa = nested_get_evmptr(vcpu);
2068 if (!evmptr_is_valid(evmcs_gpa)) {
2069 nested_release_evmcs(vcpu);
2070 return EVMPTRLD_DISABLED;
2071 }
2072
2073 if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) {
2074 vmx->nested.current_vmptr = INVALID_GPA;
2075
2076 nested_release_evmcs(vcpu);
2077
2078 if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa),
2079 &vmx->nested.hv_evmcs_map))
2080 return EVMPTRLD_ERROR;
2081
2082 vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva;
2083
2084 /*
2085 * Currently, KVM only supports eVMCS version 1
2086 * (== KVM_EVMCS_VERSION) and thus we expect the guest to set this
2087 * value in the first u32 field of the eVMCS, which specifies the
2088 * eVMCS VersionNumber.
2089 *
2090 * The guest should discover the eVMCS versions supported by the
2091 * host by examining CPUID.0x4000000A.EAX[0:15]. The host userspace
2092 * VMM is expected to set this CPUID leaf according to the value
2093 * returned in vmcs_version from nested_enable_evmcs().
2094 *
2095 * However, it turns out that Microsoft Hyper-V fails to comply
2096 * with its own invented interface: when Hyper-V uses eVMCS, it
2097 * just sets the first u32 field of the eVMCS to the revision_id
2098 * specified in MSR_IA32_VMX_BASIC, instead of an eVMCS version
2099 * number from the supported versions specified in
2100 * CPUID.0x4000000A.EAX[0:15].
2101 *
2102 * To work around this Hyper-V bug, accept either a supported
2103 * eVMCS version or the VMCS12 revision_id as valid values for the
2104 * first u32 field of the eVMCS.
2105 */
2106 if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) &&
2107 (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) {
2108 nested_release_evmcs(vcpu);
2109 return EVMPTRLD_VMFAIL;
2110 }
2111
2112 vmx->nested.hv_evmcs_vmptr = evmcs_gpa;
2113
2114 evmcs_gpa_changed = true;
2115 /*
2116 * Unlike a normal vmcs12, an enlightened vmcs12 is not fully
2117 * reloaded from the guest's memory (read-only fields, fields not
2118 * present in struct hv_enlightened_vmcs, ...). Make sure there
2119 * are no leftovers.
2120 */
2121 if (from_launch) {
2122 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
2123 memset(vmcs12, 0, sizeof(*vmcs12));
2124 vmcs12->hdr.revision_id = VMCS12_REVISION;
2125 }
2126
2127 }
2128
2129 /*
2130 * Clean fields data can't be used on VMLAUNCH or when switching
2131 * between different L2 guests, as KVM keeps a single VMCS12 per L1.
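*
* E.g. on a repeated VMRESUME to the same L2 with an unchanged eVMCS
* GPA, the clean bits L1 left set let the enlightened-to-vmcs12 copy
* skip the corresponding field groups; after VMLAUNCH or an eVMCS GPA
* change those bits describe stale state, so clear them all below to
* force a full copy.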
2132 */ 2133 if (from_launch || evmcs_gpa_changed) { 2134 vmx->nested.hv_evmcs->hv_clean_fields &= 2135 ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; 2136 2137 vmx->nested.force_msr_bitmap_recalc = true; 2138 } 2139 2140 return EVMPTRLD_SUCCEEDED; 2141 #else 2142 return EVMPTRLD_DISABLED; 2143 #endif 2144 } 2145 2146 void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu) 2147 { 2148 struct vcpu_vmx *vmx = to_vmx(vcpu); 2149 2150 if (nested_vmx_is_evmptr12_valid(vmx)) 2151 copy_vmcs12_to_enlightened(vmx); 2152 else 2153 copy_vmcs12_to_shadow(vmx); 2154 2155 vmx->nested.need_vmcs12_to_shadow_sync = false; 2156 } 2157 2158 static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) 2159 { 2160 struct vcpu_vmx *vmx = 2161 container_of(timer, struct vcpu_vmx, nested.preemption_timer); 2162 2163 vmx->nested.preemption_timer_expired = true; 2164 kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu); 2165 kvm_vcpu_kick(&vmx->vcpu); 2166 2167 return HRTIMER_NORESTART; 2168 } 2169 2170 static u64 vmx_calc_preemption_timer_value(struct kvm_vcpu *vcpu) 2171 { 2172 struct vcpu_vmx *vmx = to_vmx(vcpu); 2173 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 2174 2175 u64 l1_scaled_tsc = kvm_read_l1_tsc(vcpu, rdtsc()) >> 2176 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 2177 2178 if (!vmx->nested.has_preemption_timer_deadline) { 2179 vmx->nested.preemption_timer_deadline = 2180 vmcs12->vmx_preemption_timer_value + l1_scaled_tsc; 2181 vmx->nested.has_preemption_timer_deadline = true; 2182 } 2183 return vmx->nested.preemption_timer_deadline - l1_scaled_tsc; 2184 } 2185 2186 static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu, 2187 u64 preemption_timeout) 2188 { 2189 struct vcpu_vmx *vmx = to_vmx(vcpu); 2190 2191 /* 2192 * A timer value of zero is architecturally guaranteed to cause 2193 * a VMExit prior to executing any instructions in the guest. 2194 */ 2195 if (preemption_timeout == 0) { 2196 vmx_preemption_timer_fn(&vmx->nested.preemption_timer); 2197 return; 2198 } 2199 2200 if (vcpu->arch.virtual_tsc_khz == 0) 2201 return; 2202 2203 preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 2204 preemption_timeout *= 1000000; 2205 do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz); 2206 hrtimer_start(&vmx->nested.preemption_timer, 2207 ktime_add_ns(ktime_get(), preemption_timeout), 2208 HRTIMER_MODE_ABS_PINNED); 2209 } 2210 2211 static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) 2212 { 2213 if (vmx->nested.nested_run_pending && 2214 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) 2215 return vmcs12->guest_ia32_efer; 2216 else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) 2217 return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME); 2218 else 2219 return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME); 2220 } 2221 2222 static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) 2223 { 2224 struct kvm *kvm = vmx->vcpu.kvm; 2225 2226 /* 2227 * If vmcs02 hasn't been initialized, set the constant vmcs02 state 2228 * according to L0's settings (vmcs12 is irrelevant here). Host 2229 * fields that come from L0 and are not constant, e.g. HOST_CR3, 2230 * will be set as needed prior to VMLAUNCH/VMRESUME. 2231 */ 2232 if (vmx->nested.vmcs02_initialized) 2233 return; 2234 vmx->nested.vmcs02_initialized = true; 2235 2236 /* 2237 * We don't care what the EPTP value is we just need to guarantee 2238 * it's valid so we don't get a false positive when doing early 2239 * consistency checks. 
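*
* A dummy but well-formed EPTP (root HPA 0 with a 4-level walk, as
* construct_eptp() builds below) is enough for that purpose; the real
* EPTP used to run L2 is installed later, once the nested MMU root is
* loaded.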
2240 */ 2241 if (enable_ept && nested_early_check) 2242 vmcs_write64(EPT_POINTER, 2243 construct_eptp(&vmx->vcpu, 0, PT64_ROOT_4LEVEL)); 2244 2245 if (vmx->ve_info) 2246 vmcs_write64(VE_INFORMATION_ADDRESS, __pa(vmx->ve_info)); 2247 2248 /* All VMFUNCs are currently emulated through L0 vmexits. */ 2249 if (cpu_has_vmx_vmfunc()) 2250 vmcs_write64(VM_FUNCTION_CONTROL, 0); 2251 2252 if (cpu_has_vmx_posted_intr()) 2253 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR); 2254 2255 if (cpu_has_vmx_msr_bitmap()) 2256 vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap)); 2257 2258 /* 2259 * PML is emulated for L2, but never enabled in hardware as the MMU 2260 * handles A/D emulation. Disabling PML for L2 also avoids having to 2261 * deal with filtering out L2 GPAs from the buffer. 2262 */ 2263 if (enable_pml) { 2264 vmcs_write64(PML_ADDRESS, 0); 2265 vmcs_write16(GUEST_PML_INDEX, -1); 2266 } 2267 2268 if (cpu_has_vmx_encls_vmexit()) 2269 vmcs_write64(ENCLS_EXITING_BITMAP, INVALID_GPA); 2270 2271 if (kvm_notify_vmexit_enabled(kvm)) 2272 vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window); 2273 2274 /* 2275 * Set the MSR load/store lists to match L0's settings. Only the 2276 * addresses are constant (for vmcs02), the counts can change based 2277 * on L2's behavior, e.g. switching to/from long mode. 2278 */ 2279 vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.guest.val)); 2280 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); 2281 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); 2282 2283 vmx_set_constant_host_state(vmx); 2284 } 2285 2286 static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx, 2287 struct vmcs12 *vmcs12) 2288 { 2289 prepare_vmcs02_constant_state(vmx); 2290 2291 vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); 2292 2293 if (enable_vpid) { 2294 if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) 2295 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02); 2296 else 2297 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); 2298 } 2299 } 2300 2301 static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs01, 2302 struct vmcs12 *vmcs12) 2303 { 2304 u32 exec_control; 2305 u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12); 2306 2307 if (vmx->nested.dirty_vmcs12 || nested_vmx_is_evmptr12_valid(vmx)) 2308 prepare_vmcs02_early_rare(vmx, vmcs12); 2309 2310 /* 2311 * PIN CONTROLS 2312 */ 2313 exec_control = __pin_controls_get(vmcs01); 2314 exec_control |= (vmcs12->pin_based_vm_exec_control & 2315 ~PIN_BASED_VMX_PREEMPTION_TIMER); 2316 2317 /* Posted interrupts setting is only taken from vmcs12. 
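*
* i.e. whether posted interrupts are used at all follows vmcs12, and
* only L1's notification vector is recorded (in posted_intr_nv);
* vmcs02 itself keeps using POSTED_INTR_NESTED_VECTOR, written in
* prepare_vmcs02_constant_state().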
*/ 2318 vmx->nested.pi_pending = false; 2319 if (nested_cpu_has_posted_intr(vmcs12)) 2320 vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; 2321 else 2322 exec_control &= ~PIN_BASED_POSTED_INTR; 2323 pin_controls_set(vmx, exec_control); 2324 2325 /* 2326 * EXEC CONTROLS 2327 */ 2328 exec_control = __exec_controls_get(vmcs01); /* L0's desires */ 2329 exec_control &= ~CPU_BASED_INTR_WINDOW_EXITING; 2330 exec_control &= ~CPU_BASED_NMI_WINDOW_EXITING; 2331 exec_control &= ~CPU_BASED_TPR_SHADOW; 2332 exec_control |= vmcs12->cpu_based_vm_exec_control; 2333 2334 vmx->nested.l1_tpr_threshold = -1; 2335 if (exec_control & CPU_BASED_TPR_SHADOW) 2336 vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); 2337 #ifdef CONFIG_X86_64 2338 else 2339 exec_control |= CPU_BASED_CR8_LOAD_EXITING | 2340 CPU_BASED_CR8_STORE_EXITING; 2341 #endif 2342 2343 /* 2344 * A vmexit (to either L1 hypervisor or L0 userspace) is always needed 2345 * for I/O port accesses. 2346 */ 2347 exec_control |= CPU_BASED_UNCOND_IO_EXITING; 2348 exec_control &= ~CPU_BASED_USE_IO_BITMAPS; 2349 2350 /* 2351 * This bit will be computed in nested_get_vmcs12_pages, because 2352 * we do not have access to L1's MSR bitmap yet. For now, keep 2353 * the same bit as before, hoping to avoid multiple VMWRITEs that 2354 * only set/clear this bit. 2355 */ 2356 exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; 2357 exec_control |= exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS; 2358 2359 exec_controls_set(vmx, exec_control); 2360 2361 /* 2362 * SECONDARY EXEC CONTROLS 2363 */ 2364 if (cpu_has_secondary_exec_ctrls()) { 2365 exec_control = __secondary_exec_controls_get(vmcs01); 2366 2367 /* Take the following fields only from vmcs12 */ 2368 exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 2369 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 2370 SECONDARY_EXEC_ENABLE_INVPCID | 2371 SECONDARY_EXEC_ENABLE_RDTSCP | 2372 SECONDARY_EXEC_ENABLE_XSAVES | 2373 SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | 2374 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 2375 SECONDARY_EXEC_APIC_REGISTER_VIRT | 2376 SECONDARY_EXEC_ENABLE_VMFUNC | 2377 SECONDARY_EXEC_DESC); 2378 2379 if (nested_cpu_has(vmcs12, 2380 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) 2381 exec_control |= vmcs12->secondary_vm_exec_control; 2382 2383 /* PML is emulated and never enabled in hardware for L2. */ 2384 exec_control &= ~SECONDARY_EXEC_ENABLE_PML; 2385 2386 /* VMCS shadowing for L2 is emulated for now */ 2387 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; 2388 2389 /* 2390 * Preset *DT exiting when emulating UMIP, so that vmx_set_cr4() 2391 * will not have to rewrite the controls just for this bit. 2392 */ 2393 if (vmx_umip_emulated() && (vmcs12->guest_cr4 & X86_CR4_UMIP)) 2394 exec_control |= SECONDARY_EXEC_DESC; 2395 2396 if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) 2397 vmcs_write16(GUEST_INTR_STATUS, 2398 vmcs12->guest_intr_status); 2399 2400 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) 2401 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; 2402 2403 if (exec_control & SECONDARY_EXEC_ENCLS_EXITING) 2404 vmx_write_encls_bitmap(&vmx->vcpu, vmcs12); 2405 2406 secondary_exec_controls_set(vmx, exec_control); 2407 } 2408 2409 /* 2410 * ENTRY CONTROLS 2411 * 2412 * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE 2413 * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate 2414 * on the related bits (if supported by the CPU) in the hope that 2415 * we can avoid VMWrites during vmx_set_efer(). 
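*
* For example, when the CPU has the EFER load controls and the computed
* guest EFER has LMA set, VM_ENTRY_IA32E_MODE is set here speculatively
* so that vmx_set_efer() usually finds the control already correct and
* can skip the VMWRITE.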
2416 * 2417 * Similarly, take vmcs01's PERF_GLOBAL_CTRL in the hope that if KVM is 2418 * loading PERF_GLOBAL_CTRL via the VMCS for L1, then KVM will want to 2419 * do the same for L2. 2420 */ 2421 exec_control = __vm_entry_controls_get(vmcs01); 2422 exec_control |= (vmcs12->vm_entry_controls & 2423 ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL); 2424 exec_control &= ~(VM_ENTRY_IA32E_MODE | VM_ENTRY_LOAD_IA32_EFER); 2425 if (cpu_has_load_ia32_efer()) { 2426 if (guest_efer & EFER_LMA) 2427 exec_control |= VM_ENTRY_IA32E_MODE; 2428 if (guest_efer != host_efer) 2429 exec_control |= VM_ENTRY_LOAD_IA32_EFER; 2430 } 2431 vm_entry_controls_set(vmx, exec_control); 2432 2433 /* 2434 * EXIT CONTROLS 2435 * 2436 * L2->L1 exit controls are emulated - the hardware exit is to L0 so 2437 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER 2438 * bits may be modified by vmx_set_efer() in prepare_vmcs02(). 2439 */ 2440 exec_control = __vm_exit_controls_get(vmcs01); 2441 if (cpu_has_load_ia32_efer() && guest_efer != host_efer) 2442 exec_control |= VM_EXIT_LOAD_IA32_EFER; 2443 else 2444 exec_control &= ~VM_EXIT_LOAD_IA32_EFER; 2445 vm_exit_controls_set(vmx, exec_control); 2446 2447 /* 2448 * Interrupt/Exception Fields 2449 */ 2450 if (vmx->nested.nested_run_pending) { 2451 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 2452 vmcs12->vm_entry_intr_info_field); 2453 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 2454 vmcs12->vm_entry_exception_error_code); 2455 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 2456 vmcs12->vm_entry_instruction_len); 2457 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 2458 vmcs12->guest_interruptibility_info); 2459 vmx->loaded_vmcs->nmi_known_unmasked = 2460 !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI); 2461 } else { 2462 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); 2463 } 2464 } 2465 2466 static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) 2467 { 2468 struct hv_enlightened_vmcs *hv_evmcs = nested_vmx_evmcs(vmx); 2469 2470 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & 2471 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) { 2472 vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); 2473 vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); 2474 vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector); 2475 vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector); 2476 vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector); 2477 vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector); 2478 vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector); 2479 vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector); 2480 vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit); 2481 vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit); 2482 vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit); 2483 vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit); 2484 vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit); 2485 vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit); 2486 vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit); 2487 vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit); 2488 vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit); 2489 vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit); 2490 vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes); 2491 vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes); 2492 vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes); 2493 vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes); 2494 vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes); 2495 
vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes); 2496 vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes); 2497 vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes); 2498 vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base); 2499 vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base); 2500 vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base); 2501 vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base); 2502 vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base); 2503 vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base); 2504 vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base); 2505 vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base); 2506 vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); 2507 vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); 2508 2509 vmx->segment_cache.bitmask = 0; 2510 } 2511 2512 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & 2513 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) { 2514 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); 2515 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 2516 vmcs12->guest_pending_dbg_exceptions); 2517 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); 2518 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip); 2519 2520 /* 2521 * L1 may access the L2's PDPTR, so save them to construct 2522 * vmcs12 2523 */ 2524 if (enable_ept) { 2525 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); 2526 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); 2527 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); 2528 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); 2529 } 2530 2531 if (kvm_mpx_supported() && vmx->nested.nested_run_pending && 2532 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) 2533 vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); 2534 } 2535 2536 if (nested_cpu_has_xsaves(vmcs12)) 2537 vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap); 2538 2539 /* 2540 * Whether page-faults are trapped is determined by a combination of 2541 * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. If L0 2542 * doesn't care about page faults then we should set all of these to 2543 * L1's desires. However, if L0 does care about (some) page faults, it 2544 * is not easy (if at all possible?) to merge L0 and L1's desires, we 2545 * simply ask to exit on each and every L2 page fault. This is done by 2546 * setting MASK=MATCH=0 and (see below) EB.PF=1. 2547 * Note that below we don't need special code to set EB.PF beyond the 2548 * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept, 2549 * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when 2550 * !enable_ept, EB.PF is 1, so the "or" will always be 1. 2551 */ 2552 if (vmx_need_pf_intercept(&vmx->vcpu)) { 2553 /* 2554 * TODO: if both L0 and L1 need the same MASK and MATCH, 2555 * go ahead and use it? 2556 */ 2557 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); 2558 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); 2559 } else { 2560 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, vmcs12->page_fault_error_code_mask); 2561 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, vmcs12->page_fault_error_code_match); 2562 } 2563 2564 if (cpu_has_vmx_apicv()) { 2565 vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0); 2566 vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1); 2567 vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2); 2568 vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3); 2569 } 2570 2571 /* 2572 * Make sure the msr_autostore list is up to date before we set the 2573 * count in the vmcs02. 
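*
* prepare_vmx_msr_autostore_list() below decides, e.g., whether
* MSR_IA32_TSC needs to be captured on VM-Exit (roughly, it is kept in
* the list only when L1's VM-exit MSR-store list asks for it). Only the
* counts vary from entry to entry; the list addresses were written once
* in prepare_vmcs02_constant_state().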
2574 */
2575 prepare_vmx_msr_autostore_list(&vmx->vcpu, MSR_IA32_TSC);
2576
2577 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.guest.nr);
2578 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
2579 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
2580
2581 set_cr4_guest_host_mask(vmx);
2582 }
2583
2584 /*
2585 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
2586 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
2587 * with L0's requirements for its guest (a.k.a. vmcs01), so that the L2
2588 * guest can run in a way that satisfies both L1's requests and L0's
2589 * needs. In addition to modifying the active vmcs (which is vmcs02), this
2590 * function also has necessary side effects, such as setting various
2591 * vcpu->arch fields.
2592 * Returns 0 on success, -EINVAL on failure; the invalid-state exit
2593 * qualification code is assigned to *entry_failure_code on failure.
2594 */
2595 static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
2596 bool from_vmentry,
2597 enum vm_entry_failure_code *entry_failure_code)
2598 {
2599 struct vcpu_vmx *vmx = to_vmx(vcpu);
2600 struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx);
2601 bool load_guest_pdptrs_vmcs12 = false;
2602
2603 if (vmx->nested.dirty_vmcs12 || nested_vmx_is_evmptr12_valid(vmx)) {
2604 prepare_vmcs02_rare(vmx, vmcs12);
2605 vmx->nested.dirty_vmcs12 = false;
2606
2607 load_guest_pdptrs_vmcs12 = !nested_vmx_is_evmptr12_valid(vmx) ||
2608 !(evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
2609 }
2610
2611 if (vmx->nested.nested_run_pending &&
2612 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
2613 kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
2614 vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
2615 } else {
2616 kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
2617 vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.pre_vmenter_debugctl);
2618 }
2619 if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
2620 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
2621 vmcs_write64(GUEST_BNDCFGS, vmx->nested.pre_vmenter_bndcfgs);
2622 vmx_set_rflags(vcpu, vmcs12->guest_rflags);
2623
2624 /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
2625 * bitwise-or of what L1 wants to trap for L2, and what we want to
2626 * trap. Note that CR0.TS also needs updating - we do this later.
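*
* Concretely, a CR0 bit stays guest-owned for L2 only if both L0 and
* L1 leave it guest-owned: clearing vmcs12->cr0_guest_host_mask bits
* from vcpu->arch.cr0_guest_owned_bits below implements the OR of the
* two host masks.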
2627 */
2628 vmx_update_exception_bitmap(vcpu);
2629 vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
2630 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
2631
2632 if (vmx->nested.nested_run_pending &&
2633 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) {
2634 vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
2635 vcpu->arch.pat = vmcs12->guest_ia32_pat;
2636 } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
2637 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
2638 }
2639
2640 vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(
2641 vcpu->arch.l1_tsc_offset,
2642 vmx_get_l2_tsc_offset(vcpu),
2643 vmx_get_l2_tsc_multiplier(vcpu));
2644
2645 vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier(
2646 vcpu->arch.l1_tsc_scaling_ratio,
2647 vmx_get_l2_tsc_multiplier(vcpu));
2648
2649 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
2650 if (kvm_caps.has_tsc_control)
2651 vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);
2652
2653 nested_vmx_transition_tlb_flush(vcpu, vmcs12, true);
2654
2655 if (nested_cpu_has_ept(vmcs12))
2656 nested_ept_init_mmu_context(vcpu);
2657
2658 /*
2659 * Override the CR0/CR4 read shadows after setting the effective guest
2660 * CR0/CR4. The common helpers also set the shadows, but they don't
2661 * account for vmcs12's cr0/4_guest_host_mask.
2662 */
2663 vmx_set_cr0(vcpu, vmcs12->guest_cr0);
2664 vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
2665
2666 vmx_set_cr4(vcpu, vmcs12->guest_cr4);
2667 vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));
2668
2669 vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12);
2670 /* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
2671 vmx_set_efer(vcpu, vcpu->arch.efer);
2672
2673 /*
2674 * If guest state is invalid and unrestricted guest is disabled,
2675 * then L1 attempted a VMEntry to L2 with invalid state: fail the
2676 * VMEntry.
2677 *
2678 * However, when force-loading the guest state (SMM exit or loading
2679 * nested state after migration), it is possible to have invalid
2680 * guest state at this point; it will be fixed up later when the L2
2681 * register state is restored.
2682 */
2683 if (CC(from_vmentry && !vmx_guest_state_valid(vcpu))) {
2684 *entry_failure_code = ENTRY_FAIL_DEFAULT;
2685 return -EINVAL;
2686 }
2687
2688 /* Load the L2 guest's CR3, with either nested EPT or shadow page tables. */
2689 if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12),
2690 from_vmentry, entry_failure_code))
2691 return -EINVAL;
2692
2693 /*
2694 * Immediately write vmcs02.GUEST_CR3. It will be propagated to vmcs12
2695 * on nested VM-Exit, which can occur without actually running L2 and
2696 * thus without hitting vmx_load_mmu_pgd(), e.g. if L1 is entering L2 with
2697 * vmcs12.GUEST_ACTIVITY_STATE=HLT, in which case KVM will intercept the
2698 * transition to HLT instead of running L2.
2699 */
2700 if (enable_ept)
2701 vmcs_writel(GUEST_CR3, vmcs12->guest_cr3);
2702
2703 /* Late preparation of GUEST_PDPTRs now that EFER and CRs are set.
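*
* These vmcs12 PDPTRs matter only when L2 uses PAE paging under EPT;
* in that case the CPU loads the PDPTEs from the VMCS guest-PDPTR
* fields on VM-Entry rather than from guest memory.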
*/ 2704 if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) && 2705 is_pae_paging(vcpu)) { 2706 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); 2707 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); 2708 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); 2709 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); 2710 } 2711 2712 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && 2713 kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)) && 2714 WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, 2715 vmcs12->guest_ia32_perf_global_ctrl))) { 2716 *entry_failure_code = ENTRY_FAIL_DEFAULT; 2717 return -EINVAL; 2718 } 2719 2720 kvm_rsp_write(vcpu, vmcs12->guest_rsp); 2721 kvm_rip_write(vcpu, vmcs12->guest_rip); 2722 2723 /* 2724 * It was observed that genuine Hyper-V running in L1 doesn't reset 2725 * 'hv_clean_fields' by itself, it only sets the corresponding dirty 2726 * bits when it changes a field in eVMCS. Mark all fields as clean 2727 * here. 2728 */ 2729 if (nested_vmx_is_evmptr12_valid(vmx)) 2730 evmcs->hv_clean_fields |= HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; 2731 2732 return 0; 2733 } 2734 2735 static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12) 2736 { 2737 if (CC(!nested_cpu_has_nmi_exiting(vmcs12) && 2738 nested_cpu_has_virtual_nmis(vmcs12))) 2739 return -EINVAL; 2740 2741 if (CC(!nested_cpu_has_virtual_nmis(vmcs12) && 2742 nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING))) 2743 return -EINVAL; 2744 2745 return 0; 2746 } 2747 2748 static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp) 2749 { 2750 struct vcpu_vmx *vmx = to_vmx(vcpu); 2751 2752 /* Check for memory type validity */ 2753 switch (new_eptp & VMX_EPTP_MT_MASK) { 2754 case VMX_EPTP_MT_UC: 2755 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT))) 2756 return false; 2757 break; 2758 case VMX_EPTP_MT_WB: 2759 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT))) 2760 return false; 2761 break; 2762 default: 2763 return false; 2764 } 2765 2766 /* Page-walk levels validity. 
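*
* EPTP bits 5:3 encode the page-walk length minus one, so the PWL_4 and
* PWL_5 cases below correspond to 4-level and 5-level EPT; support for
* each is checked against the EPT capabilities KVM exposes to L1
* (vmx->nested.msrs.ept_caps).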
*/ 2767 switch (new_eptp & VMX_EPTP_PWL_MASK) { 2768 case VMX_EPTP_PWL_5: 2769 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_5_BIT))) 2770 return false; 2771 break; 2772 case VMX_EPTP_PWL_4: 2773 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_4_BIT))) 2774 return false; 2775 break; 2776 default: 2777 return false; 2778 } 2779 2780 /* Reserved bits should not be set */ 2781 if (CC(!kvm_vcpu_is_legal_gpa(vcpu, new_eptp) || ((new_eptp >> 7) & 0x1f))) 2782 return false; 2783 2784 /* AD, if set, should be supported */ 2785 if (new_eptp & VMX_EPTP_AD_ENABLE_BIT) { 2786 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT))) 2787 return false; 2788 } 2789 2790 return true; 2791 } 2792 2793 /* 2794 * Checks related to VM-Execution Control Fields 2795 */ 2796 static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu, 2797 struct vmcs12 *vmcs12) 2798 { 2799 struct vcpu_vmx *vmx = to_vmx(vcpu); 2800 2801 if (CC(!vmx_control_verify(vmcs12->pin_based_vm_exec_control, 2802 vmx->nested.msrs.pinbased_ctls_low, 2803 vmx->nested.msrs.pinbased_ctls_high)) || 2804 CC(!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, 2805 vmx->nested.msrs.procbased_ctls_low, 2806 vmx->nested.msrs.procbased_ctls_high))) 2807 return -EINVAL; 2808 2809 if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && 2810 CC(!vmx_control_verify(vmcs12->secondary_vm_exec_control, 2811 vmx->nested.msrs.secondary_ctls_low, 2812 vmx->nested.msrs.secondary_ctls_high))) 2813 return -EINVAL; 2814 2815 if (CC(vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) || 2816 nested_vmx_check_io_bitmap_controls(vcpu, vmcs12) || 2817 nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12) || 2818 nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12) || 2819 nested_vmx_check_apic_access_controls(vcpu, vmcs12) || 2820 nested_vmx_check_apicv_controls(vcpu, vmcs12) || 2821 nested_vmx_check_nmi_controls(vmcs12) || 2822 nested_vmx_check_pml_controls(vcpu, vmcs12) || 2823 nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12) || 2824 nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12) || 2825 nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12) || 2826 CC(nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id)) 2827 return -EINVAL; 2828 2829 if (!nested_cpu_has_preemption_timer(vmcs12) && 2830 nested_cpu_has_save_preemption_timer(vmcs12)) 2831 return -EINVAL; 2832 2833 if (nested_cpu_has_ept(vmcs12) && 2834 CC(!nested_vmx_check_eptp(vcpu, vmcs12->ept_pointer))) 2835 return -EINVAL; 2836 2837 if (nested_cpu_has_vmfunc(vmcs12)) { 2838 if (CC(vmcs12->vm_function_control & 2839 ~vmx->nested.msrs.vmfunc_controls)) 2840 return -EINVAL; 2841 2842 if (nested_cpu_has_eptp_switching(vmcs12)) { 2843 if (CC(!nested_cpu_has_ept(vmcs12)) || 2844 CC(!page_address_valid(vcpu, vmcs12->eptp_list_address))) 2845 return -EINVAL; 2846 } 2847 } 2848 2849 return 0; 2850 } 2851 2852 /* 2853 * Checks related to VM-Exit Control Fields 2854 */ 2855 static int nested_check_vm_exit_controls(struct kvm_vcpu *vcpu, 2856 struct vmcs12 *vmcs12) 2857 { 2858 struct vcpu_vmx *vmx = to_vmx(vcpu); 2859 2860 if (CC(!vmx_control_verify(vmcs12->vm_exit_controls, 2861 vmx->nested.msrs.exit_ctls_low, 2862 vmx->nested.msrs.exit_ctls_high)) || 2863 CC(nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12))) 2864 return -EINVAL; 2865 2866 return 0; 2867 } 2868 2869 /* 2870 * Checks related to VM-Entry Control Fields 2871 */ 2872 static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu, 2873 struct vmcs12 *vmcs12) 2874 { 2875 struct 
vcpu_vmx *vmx = to_vmx(vcpu); 2876 2877 if (CC(!vmx_control_verify(vmcs12->vm_entry_controls, 2878 vmx->nested.msrs.entry_ctls_low, 2879 vmx->nested.msrs.entry_ctls_high))) 2880 return -EINVAL; 2881 2882 /* 2883 * From the Intel SDM, volume 3: 2884 * Fields relevant to VM-entry event injection must be set properly. 2885 * These fields are the VM-entry interruption-information field, the 2886 * VM-entry exception error code, and the VM-entry instruction length. 2887 */ 2888 if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) { 2889 u32 intr_info = vmcs12->vm_entry_intr_info_field; 2890 u8 vector = intr_info & INTR_INFO_VECTOR_MASK; 2891 u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK; 2892 bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK; 2893 bool should_have_error_code; 2894 bool urg = nested_cpu_has2(vmcs12, 2895 SECONDARY_EXEC_UNRESTRICTED_GUEST); 2896 bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE; 2897 2898 /* VM-entry interruption-info field: interruption type */ 2899 if (CC(intr_type == INTR_TYPE_RESERVED) || 2900 CC(intr_type == INTR_TYPE_OTHER_EVENT && 2901 !nested_cpu_supports_monitor_trap_flag(vcpu))) 2902 return -EINVAL; 2903 2904 /* VM-entry interruption-info field: vector */ 2905 if (CC(intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) || 2906 CC(intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) || 2907 CC(intr_type == INTR_TYPE_OTHER_EVENT && vector != 0)) 2908 return -EINVAL; 2909 2910 /* VM-entry interruption-info field: deliver error code */ 2911 should_have_error_code = 2912 intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode && 2913 x86_exception_has_error_code(vector); 2914 if (CC(has_error_code != should_have_error_code)) 2915 return -EINVAL; 2916 2917 /* VM-entry exception error code */ 2918 if (CC(has_error_code && 2919 vmcs12->vm_entry_exception_error_code & GENMASK(31, 16))) 2920 return -EINVAL; 2921 2922 /* VM-entry interruption-info field: reserved bits */ 2923 if (CC(intr_info & INTR_INFO_RESVD_BITS_MASK)) 2924 return -EINVAL; 2925 2926 /* VM-entry instruction length */ 2927 switch (intr_type) { 2928 case INTR_TYPE_SOFT_EXCEPTION: 2929 case INTR_TYPE_SOFT_INTR: 2930 case INTR_TYPE_PRIV_SW_EXCEPTION: 2931 if (CC(vmcs12->vm_entry_instruction_len > 15) || 2932 CC(vmcs12->vm_entry_instruction_len == 0 && 2933 CC(!nested_cpu_has_zero_length_injection(vcpu)))) 2934 return -EINVAL; 2935 } 2936 } 2937 2938 if (nested_vmx_check_entry_msr_switch_controls(vcpu, vmcs12)) 2939 return -EINVAL; 2940 2941 return 0; 2942 } 2943 2944 static int nested_vmx_check_controls(struct kvm_vcpu *vcpu, 2945 struct vmcs12 *vmcs12) 2946 { 2947 if (nested_check_vm_execution_controls(vcpu, vmcs12) || 2948 nested_check_vm_exit_controls(vcpu, vmcs12) || 2949 nested_check_vm_entry_controls(vcpu, vmcs12)) 2950 return -EINVAL; 2951 2952 #ifdef CONFIG_KVM_HYPERV 2953 if (guest_cpuid_has_evmcs(vcpu)) 2954 return nested_evmcs_check_controls(vmcs12); 2955 #endif 2956 2957 return 0; 2958 } 2959 2960 static int nested_vmx_check_address_space_size(struct kvm_vcpu *vcpu, 2961 struct vmcs12 *vmcs12) 2962 { 2963 #ifdef CONFIG_X86_64 2964 if (CC(!!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) != 2965 !!(vcpu->arch.efer & EFER_LMA))) 2966 return -EINVAL; 2967 #endif 2968 return 0; 2969 } 2970 2971 static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, 2972 struct vmcs12 *vmcs12) 2973 { 2974 bool ia32e = !!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE); 2975 2976 if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) || 2977 
CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) || 2978 CC(!kvm_vcpu_is_legal_cr3(vcpu, vmcs12->host_cr3))) 2979 return -EINVAL; 2980 2981 if (CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu)) || 2982 CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu))) 2983 return -EINVAL; 2984 2985 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) && 2986 CC(!kvm_pat_valid(vmcs12->host_ia32_pat))) 2987 return -EINVAL; 2988 2989 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) && 2990 CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu), 2991 vmcs12->host_ia32_perf_global_ctrl))) 2992 return -EINVAL; 2993 2994 if (ia32e) { 2995 if (CC(!(vmcs12->host_cr4 & X86_CR4_PAE))) 2996 return -EINVAL; 2997 } else { 2998 if (CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) || 2999 CC(vmcs12->host_cr4 & X86_CR4_PCIDE) || 3000 CC((vmcs12->host_rip) >> 32)) 3001 return -EINVAL; 3002 } 3003 3004 if (CC(vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3005 CC(vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3006 CC(vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3007 CC(vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3008 CC(vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3009 CC(vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3010 CC(vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3011 CC(vmcs12->host_cs_selector == 0) || 3012 CC(vmcs12->host_tr_selector == 0) || 3013 CC(vmcs12->host_ss_selector == 0 && !ia32e)) 3014 return -EINVAL; 3015 3016 if (CC(is_noncanonical_address(vmcs12->host_fs_base, vcpu)) || 3017 CC(is_noncanonical_address(vmcs12->host_gs_base, vcpu)) || 3018 CC(is_noncanonical_address(vmcs12->host_gdtr_base, vcpu)) || 3019 CC(is_noncanonical_address(vmcs12->host_idtr_base, vcpu)) || 3020 CC(is_noncanonical_address(vmcs12->host_tr_base, vcpu)) || 3021 CC(is_noncanonical_address(vmcs12->host_rip, vcpu))) 3022 return -EINVAL; 3023 3024 /* 3025 * If the load IA32_EFER VM-exit control is 1, bits reserved in the 3026 * IA32_EFER MSR must be 0 in the field for that register. In addition, 3027 * the values of the LMA and LME bits in the field must each be that of 3028 * the host address-space size VM-exit control. 
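*
* That is what the checks below enforce: ia32e, derived above from
* VM_EXIT_HOST_ADDR_SPACE_SIZE, must match both EFER.LMA and EFER.LME
* in vmcs12->host_ia32_efer.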
3029 */ 3030 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) { 3031 if (CC(!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer)) || 3032 CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA)) || 3033 CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))) 3034 return -EINVAL; 3035 } 3036 3037 return 0; 3038 } 3039 3040 static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, 3041 struct vmcs12 *vmcs12) 3042 { 3043 struct vcpu_vmx *vmx = to_vmx(vcpu); 3044 struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache; 3045 struct vmcs_hdr hdr; 3046 3047 if (vmcs12->vmcs_link_pointer == INVALID_GPA) 3048 return 0; 3049 3050 if (CC(!page_address_valid(vcpu, vmcs12->vmcs_link_pointer))) 3051 return -EINVAL; 3052 3053 if (ghc->gpa != vmcs12->vmcs_link_pointer && 3054 CC(kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, 3055 vmcs12->vmcs_link_pointer, VMCS12_SIZE))) 3056 return -EINVAL; 3057 3058 if (CC(kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr, 3059 offsetof(struct vmcs12, hdr), 3060 sizeof(hdr)))) 3061 return -EINVAL; 3062 3063 if (CC(hdr.revision_id != VMCS12_REVISION) || 3064 CC(hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12))) 3065 return -EINVAL; 3066 3067 return 0; 3068 } 3069 3070 /* 3071 * Checks related to Guest Non-register State 3072 */ 3073 static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12) 3074 { 3075 if (CC(vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && 3076 vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT && 3077 vmcs12->guest_activity_state != GUEST_ACTIVITY_WAIT_SIPI)) 3078 return -EINVAL; 3079 3080 return 0; 3081 } 3082 3083 static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, 3084 struct vmcs12 *vmcs12, 3085 enum vm_entry_failure_code *entry_failure_code) 3086 { 3087 bool ia32e = !!(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE); 3088 3089 *entry_failure_code = ENTRY_FAIL_DEFAULT; 3090 3091 if (CC(!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0)) || 3092 CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))) 3093 return -EINVAL; 3094 3095 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) && 3096 CC(!kvm_dr7_valid(vmcs12->guest_dr7))) 3097 return -EINVAL; 3098 3099 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) && 3100 CC(!kvm_pat_valid(vmcs12->guest_ia32_pat))) 3101 return -EINVAL; 3102 3103 if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) { 3104 *entry_failure_code = ENTRY_FAIL_VMCS_LINK_PTR; 3105 return -EINVAL; 3106 } 3107 3108 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && 3109 CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu), 3110 vmcs12->guest_ia32_perf_global_ctrl))) 3111 return -EINVAL; 3112 3113 if (CC((vmcs12->guest_cr0 & (X86_CR0_PG | X86_CR0_PE)) == X86_CR0_PG)) 3114 return -EINVAL; 3115 3116 if (CC(ia32e && !(vmcs12->guest_cr4 & X86_CR4_PAE)) || 3117 CC(ia32e && !(vmcs12->guest_cr0 & X86_CR0_PG))) 3118 return -EINVAL; 3119 3120 /* 3121 * If the load IA32_EFER VM-entry control is 1, the following checks 3122 * are performed on the field for the IA32_EFER MSR: 3123 * - Bits reserved in the IA32_EFER MSR must be 0. 3124 * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of 3125 * the IA-32e mode guest VM-exit control. It must also be identical 3126 * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to 3127 * CR0.PG) is 1. 
3128 */ 3129 if (to_vmx(vcpu)->nested.nested_run_pending && 3130 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) { 3131 if (CC(!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer)) || 3132 CC(ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA)) || 3133 CC(((vmcs12->guest_cr0 & X86_CR0_PG) && 3134 ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME)))) 3135 return -EINVAL; 3136 } 3137 3138 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) && 3139 (CC(is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) || 3140 CC((vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD)))) 3141 return -EINVAL; 3142 3143 if (nested_check_guest_non_reg_state(vmcs12)) 3144 return -EINVAL; 3145 3146 return 0; 3147 } 3148 3149 static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) 3150 { 3151 struct vcpu_vmx *vmx = to_vmx(vcpu); 3152 unsigned long cr3, cr4; 3153 bool vm_fail; 3154 3155 if (!nested_early_check) 3156 return 0; 3157 3158 if (vmx->msr_autoload.host.nr) 3159 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); 3160 if (vmx->msr_autoload.guest.nr) 3161 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); 3162 3163 preempt_disable(); 3164 3165 vmx_prepare_switch_to_guest(vcpu); 3166 3167 /* 3168 * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS, 3169 * which is reserved to '1' by hardware. GUEST_RFLAGS is guaranteed to 3170 * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e. 3171 * there is no need to preserve other bits or save/restore the field. 3172 */ 3173 vmcs_writel(GUEST_RFLAGS, 0); 3174 3175 cr3 = __get_current_cr3_fast(); 3176 if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { 3177 vmcs_writel(HOST_CR3, cr3); 3178 vmx->loaded_vmcs->host_state.cr3 = cr3; 3179 } 3180 3181 cr4 = cr4_read_shadow(); 3182 if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { 3183 vmcs_writel(HOST_CR4, cr4); 3184 vmx->loaded_vmcs->host_state.cr4 = cr4; 3185 } 3186 3187 vm_fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs, 3188 __vmx_vcpu_run_flags(vmx)); 3189 3190 if (vmx->msr_autoload.host.nr) 3191 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 3192 if (vmx->msr_autoload.guest.nr) 3193 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 3194 3195 if (vm_fail) { 3196 u32 error = vmcs_read32(VM_INSTRUCTION_ERROR); 3197 3198 preempt_enable(); 3199 3200 trace_kvm_nested_vmenter_failed( 3201 "early hardware check VM-instruction error: ", error); 3202 WARN_ON_ONCE(error != VMXERR_ENTRY_INVALID_CONTROL_FIELD); 3203 return 1; 3204 } 3205 3206 /* 3207 * VMExit clears RFLAGS.IF and DR7, even on a consistency check. 3208 */ 3209 if (hw_breakpoint_active()) 3210 set_debugreg(__this_cpu_read(cpu_dr7), 7); 3211 local_irq_enable(); 3212 preempt_enable(); 3213 3214 /* 3215 * A non-failing VMEntry means we somehow entered guest mode with 3216 * an illegal RIP, and that's just the tip of the iceberg. There 3217 * is no telling what memory has been modified or what state has 3218 * been exposed to unknown code. Hitting this all but guarantees 3219 * a (very critical) hardware issue. 3220 */ 3221 WARN_ON(!(vmcs_read32(VM_EXIT_REASON) & 3222 VMX_EXIT_REASONS_FAILED_VMENTRY)); 3223 3224 return 0; 3225 } 3226 3227 #ifdef CONFIG_KVM_HYPERV 3228 static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu) 3229 { 3230 struct vcpu_vmx *vmx = to_vmx(vcpu); 3231 3232 /* 3233 * hv_evmcs may end up being not mapped after migration (when 3234 * L2 was running), map it here to make sure vmcs12 changes are 3235 * properly reflected. 
3236 */ 3237 if (guest_cpuid_has_evmcs(vcpu) && 3238 vmx->nested.hv_evmcs_vmptr == EVMPTR_MAP_PENDING) { 3239 enum nested_evmptrld_status evmptrld_status = 3240 nested_vmx_handle_enlightened_vmptrld(vcpu, false); 3241 3242 if (evmptrld_status == EVMPTRLD_VMFAIL || 3243 evmptrld_status == EVMPTRLD_ERROR) 3244 return false; 3245 3246 /* 3247 * Post migration VMCS12 always provides the most actual 3248 * information, copy it to eVMCS upon entry. 3249 */ 3250 vmx->nested.need_vmcs12_to_shadow_sync = true; 3251 } 3252 3253 return true; 3254 } 3255 #endif 3256 3257 static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) 3258 { 3259 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3260 struct vcpu_vmx *vmx = to_vmx(vcpu); 3261 struct kvm_host_map *map; 3262 3263 if (!vcpu->arch.pdptrs_from_userspace && 3264 !nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { 3265 /* 3266 * Reload the guest's PDPTRs since after a migration 3267 * the guest CR3 might be restored prior to setting the nested 3268 * state which can lead to a load of wrong PDPTRs. 3269 */ 3270 if (CC(!load_pdptrs(vcpu, vcpu->arch.cr3))) 3271 return false; 3272 } 3273 3274 3275 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { 3276 map = &vmx->nested.apic_access_page_map; 3277 3278 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->apic_access_addr), map)) { 3279 vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(map->pfn)); 3280 } else { 3281 pr_debug_ratelimited("%s: no backing for APIC-access address in vmcs12\n", 3282 __func__); 3283 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3284 vcpu->run->internal.suberror = 3285 KVM_INTERNAL_ERROR_EMULATION; 3286 vcpu->run->internal.ndata = 0; 3287 return false; 3288 } 3289 } 3290 3291 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { 3292 map = &vmx->nested.virtual_apic_map; 3293 3294 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) { 3295 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn)); 3296 } else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) && 3297 nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) && 3298 !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { 3299 /* 3300 * The processor will never use the TPR shadow, simply 3301 * clear the bit from the execution control. Such a 3302 * configuration is useless, but it happens in tests. 3303 * For any other configuration, failing the vm entry is 3304 * _not_ what the processor does but it's basically the 3305 * only possibility we have. 3306 */ 3307 exec_controls_clearbit(vmx, CPU_BASED_TPR_SHADOW); 3308 } else { 3309 /* 3310 * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to 3311 * force VM-Entry to fail. 3312 */ 3313 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, INVALID_GPA); 3314 } 3315 } 3316 3317 if (nested_cpu_has_posted_intr(vmcs12)) { 3318 map = &vmx->nested.pi_desc_map; 3319 3320 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) { 3321 vmx->nested.pi_desc = 3322 (struct pi_desc *)(((void *)map->hva) + 3323 offset_in_page(vmcs12->posted_intr_desc_addr)); 3324 vmcs_write64(POSTED_INTR_DESC_ADDR, 3325 pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr)); 3326 } else { 3327 /* 3328 * Defer the KVM_INTERNAL_EXIT until KVM tries to 3329 * access the contents of the VMCS12 posted interrupt 3330 * descriptor. (Note that KVM may do this when it 3331 * should not, per the architectural specification.) 
3332 */ 3333 vmx->nested.pi_desc = NULL; 3334 pin_controls_clearbit(vmx, PIN_BASED_POSTED_INTR); 3335 } 3336 } 3337 if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12)) 3338 exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS); 3339 else 3340 exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS); 3341 3342 return true; 3343 } 3344 3345 static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu) 3346 { 3347 #ifdef CONFIG_KVM_HYPERV 3348 /* 3349 * Note: nested_get_evmcs_page() also updates 'vp_assist_page' copy 3350 * in 'struct kvm_vcpu_hv' in case eVMCS is in use, this is mandatory 3351 * to make nested_evmcs_l2_tlb_flush_enabled() work correctly post 3352 * migration. 3353 */ 3354 if (!nested_get_evmcs_page(vcpu)) { 3355 pr_debug_ratelimited("%s: enlightened vmptrld failed\n", 3356 __func__); 3357 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3358 vcpu->run->internal.suberror = 3359 KVM_INTERNAL_ERROR_EMULATION; 3360 vcpu->run->internal.ndata = 0; 3361 3362 return false; 3363 } 3364 #endif 3365 3366 if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu)) 3367 return false; 3368 3369 return true; 3370 } 3371 3372 static int nested_vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa) 3373 { 3374 struct vmcs12 *vmcs12; 3375 struct vcpu_vmx *vmx = to_vmx(vcpu); 3376 gpa_t dst; 3377 3378 if (WARN_ON_ONCE(!is_guest_mode(vcpu))) 3379 return 0; 3380 3381 if (WARN_ON_ONCE(vmx->nested.pml_full)) 3382 return 1; 3383 3384 /* 3385 * Check if PML is enabled for the nested guest. Whether eptp bit 6 is 3386 * set is already checked as part of A/D emulation. 3387 */ 3388 vmcs12 = get_vmcs12(vcpu); 3389 if (!nested_cpu_has_pml(vmcs12)) 3390 return 0; 3391 3392 if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) { 3393 vmx->nested.pml_full = true; 3394 return 1; 3395 } 3396 3397 gpa &= ~0xFFFull; 3398 dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index; 3399 3400 if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa, 3401 offset_in_page(dst), sizeof(gpa))) 3402 return 0; 3403 3404 vmcs12->guest_pml_index--; 3405 3406 return 0; 3407 } 3408 3409 /* 3410 * Intel's VMX Instruction Reference specifies a common set of prerequisites 3411 * for running VMX instructions (except VMXON, whose prerequisites are 3412 * slightly different). It also specifies what exception to inject otherwise. 3413 * Note that many of these exceptions have priority over VM exits, so they 3414 * don't have to be checked again here. 3415 */ 3416 static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) 3417 { 3418 if (!to_vmx(vcpu)->nested.vmxon) { 3419 kvm_queue_exception(vcpu, UD_VECTOR); 3420 return 0; 3421 } 3422 3423 if (vmx_get_cpl(vcpu)) { 3424 kvm_inject_gp(vcpu, 0); 3425 return 0; 3426 } 3427 3428 return 1; 3429 } 3430 3431 static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu) 3432 { 3433 u8 rvi = vmx_get_rvi(); 3434 u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI); 3435 3436 return ((rvi & 0xf0) > (vppr & 0xf0)); 3437 } 3438 3439 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, 3440 struct vmcs12 *vmcs12); 3441 3442 /* 3443 * If from_vmentry is false, this is being called from state restore (either RSM 3444 * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume. 
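* (The from_vmentry == true path is reached via the VMLAUNCH/VMRESUME
* emulation in nested_vmx_run().)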
3445 * 3446 * Returns: 3447 * NVMX_VMENTRY_SUCCESS: Entered VMX non-root mode 3448 * NVMX_VMENTRY_VMFAIL: Consistency check VMFail 3449 * NVMX_VMENTRY_VMEXIT: Consistency check VMExit 3450 * NVMX_VMENTRY_KVM_INTERNAL_ERROR: KVM internal error 3451 */ 3452 enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, 3453 bool from_vmentry) 3454 { 3455 struct vcpu_vmx *vmx = to_vmx(vcpu); 3456 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3457 enum vm_entry_failure_code entry_failure_code; 3458 bool evaluate_pending_interrupts; 3459 union vmx_exit_reason exit_reason = { 3460 .basic = EXIT_REASON_INVALID_STATE, 3461 .failed_vmentry = 1, 3462 }; 3463 u32 failed_index; 3464 3465 trace_kvm_nested_vmenter(kvm_rip_read(vcpu), 3466 vmx->nested.current_vmptr, 3467 vmcs12->guest_rip, 3468 vmcs12->guest_intr_status, 3469 vmcs12->vm_entry_intr_info_field, 3470 vmcs12->secondary_vm_exec_control & SECONDARY_EXEC_ENABLE_EPT, 3471 vmcs12->ept_pointer, 3472 vmcs12->guest_cr3, 3473 KVM_ISA_VMX); 3474 3475 kvm_service_local_tlb_flush_requests(vcpu); 3476 3477 evaluate_pending_interrupts = exec_controls_get(vmx) & 3478 (CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING); 3479 if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu)) 3480 evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu); 3481 if (!evaluate_pending_interrupts) 3482 evaluate_pending_interrupts |= kvm_apic_has_pending_init_or_sipi(vcpu); 3483 3484 if (!vmx->nested.nested_run_pending || 3485 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) 3486 vmx->nested.pre_vmenter_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); 3487 if (kvm_mpx_supported() && 3488 (!vmx->nested.nested_run_pending || 3489 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) 3490 vmx->nested.pre_vmenter_bndcfgs = vmcs_read64(GUEST_BNDCFGS); 3491 3492 /* 3493 * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and* 3494 * nested early checks are disabled. In the event of a "late" VM-Fail, 3495 * i.e. a VM-Fail detected by hardware but not KVM, KVM must unwind its 3496 * software model to the pre-VMEntry host state. When EPT is disabled, 3497 * GUEST_CR3 holds KVM's shadow CR3, not L1's "real" CR3, which causes 3498 * nested_vmx_restore_host_state() to corrupt vcpu->arch.cr3. Stuffing 3499 * vmcs01.GUEST_CR3 results in the unwind naturally setting arch.cr3 to 3500 * the correct value. Smashing vmcs01.GUEST_CR3 is safe because nested 3501 * VM-Exits, and the unwind, reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is 3502 * guaranteed to be overwritten with a shadow CR3 prior to re-entering 3503 * L1. Don't stuff vmcs01.GUEST_CR3 when using nested early checks as 3504 * KVM modifies vcpu->arch.cr3 if and only if the early hardware checks 3505 * pass, and early VM-Fails do not reset KVM's MMU, i.e. the VM-Fail 3506 * path would need to manually save/restore vmcs01.GUEST_CR3. 
3507 */ 3508 if (!enable_ept && !nested_early_check) 3509 vmcs_writel(GUEST_CR3, vcpu->arch.cr3); 3510 3511 vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02); 3512 3513 prepare_vmcs02_early(vmx, &vmx->vmcs01, vmcs12); 3514 3515 if (from_vmentry) { 3516 if (unlikely(!nested_get_vmcs12_pages(vcpu))) { 3517 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 3518 return NVMX_VMENTRY_KVM_INTERNAL_ERROR; 3519 } 3520 3521 if (nested_vmx_check_vmentry_hw(vcpu)) { 3522 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 3523 return NVMX_VMENTRY_VMFAIL; 3524 } 3525 3526 if (nested_vmx_check_guest_state(vcpu, vmcs12, 3527 &entry_failure_code)) { 3528 exit_reason.basic = EXIT_REASON_INVALID_STATE; 3529 vmcs12->exit_qualification = entry_failure_code; 3530 goto vmentry_fail_vmexit; 3531 } 3532 } 3533 3534 enter_guest_mode(vcpu); 3535 3536 if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &entry_failure_code)) { 3537 exit_reason.basic = EXIT_REASON_INVALID_STATE; 3538 vmcs12->exit_qualification = entry_failure_code; 3539 goto vmentry_fail_vmexit_guest_mode; 3540 } 3541 3542 if (from_vmentry) { 3543 failed_index = nested_vmx_load_msr(vcpu, 3544 vmcs12->vm_entry_msr_load_addr, 3545 vmcs12->vm_entry_msr_load_count); 3546 if (failed_index) { 3547 exit_reason.basic = EXIT_REASON_MSR_LOAD_FAIL; 3548 vmcs12->exit_qualification = failed_index; 3549 goto vmentry_fail_vmexit_guest_mode; 3550 } 3551 } else { 3552 /* 3553 * The MMU is not initialized to point at the right entities yet and 3554 * "get pages" would need to read data from the guest (i.e. we will 3555 * need to perform gpa to hpa translation). Request a call 3556 * to nested_get_vmcs12_pages before the next VM-entry. The MSRs 3557 * have already been set at vmentry time and should not be reset. 3558 */ 3559 kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); 3560 } 3561 3562 /* 3563 * Re-evaluate pending events if L1 had a pending IRQ/NMI/INIT/SIPI 3564 * when it executed VMLAUNCH/VMRESUME, as entering non-root mode can 3565 * effectively unblock various events, e.g. INIT/SIPI cause VM-Exit 3566 * unconditionally. 3567 */ 3568 if (unlikely(evaluate_pending_interrupts)) 3569 kvm_make_request(KVM_REQ_EVENT, vcpu); 3570 3571 /* 3572 * Do not start the preemption timer hrtimer until after we know 3573 * we are successful, so that only nested_vmx_vmexit needs to cancel 3574 * the timer. 3575 */ 3576 vmx->nested.preemption_timer_expired = false; 3577 if (nested_cpu_has_preemption_timer(vmcs12)) { 3578 u64 timer_value = vmx_calc_preemption_timer_value(vcpu); 3579 vmx_start_preemption_timer(vcpu, timer_value); 3580 } 3581 3582 /* 3583 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point 3584 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet 3585 * returned as far as L1 is concerned. It will only return (and set 3586 * the success flag) when L2 exits (see nested_vmx_vmexit()). 3587 */ 3588 return NVMX_VMENTRY_SUCCESS; 3589 3590 /* 3591 * A failed consistency check that leads to a VMExit during L1's 3592 * VMEnter to L2 is a variation of a normal VMexit, as explained in 3593 * 26.7 "VM-entry failures during or after loading guest state". 
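* (The section number refers to the Intel SDM, Volume 3, chapter on VM
* entries; the exact numbering may differ between SDM revisions.)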
3594 */ 3595 vmentry_fail_vmexit_guest_mode: 3596 if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING) 3597 vcpu->arch.tsc_offset -= vmcs12->tsc_offset; 3598 leave_guest_mode(vcpu); 3599 3600 vmentry_fail_vmexit: 3601 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 3602 3603 if (!from_vmentry) 3604 return NVMX_VMENTRY_VMEXIT; 3605 3606 load_vmcs12_host_state(vcpu, vmcs12); 3607 vmcs12->vm_exit_reason = exit_reason.full; 3608 if (enable_shadow_vmcs || nested_vmx_is_evmptr12_valid(vmx)) 3609 vmx->nested.need_vmcs12_to_shadow_sync = true; 3610 return NVMX_VMENTRY_VMEXIT; 3611 } 3612 3613 /* 3614 * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1 3615 * for running an L2 nested guest. 3616 */ 3617 static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) 3618 { 3619 struct vmcs12 *vmcs12; 3620 enum nvmx_vmentry_status status; 3621 struct vcpu_vmx *vmx = to_vmx(vcpu); 3622 u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu); 3623 enum nested_evmptrld_status evmptrld_status; 3624 3625 if (!nested_vmx_check_permission(vcpu)) 3626 return 1; 3627 3628 evmptrld_status = nested_vmx_handle_enlightened_vmptrld(vcpu, launch); 3629 if (evmptrld_status == EVMPTRLD_ERROR) { 3630 kvm_queue_exception(vcpu, UD_VECTOR); 3631 return 1; 3632 } 3633 3634 kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED); 3635 3636 if (CC(evmptrld_status == EVMPTRLD_VMFAIL)) 3637 return nested_vmx_failInvalid(vcpu); 3638 3639 if (CC(!nested_vmx_is_evmptr12_valid(vmx) && 3640 vmx->nested.current_vmptr == INVALID_GPA)) 3641 return nested_vmx_failInvalid(vcpu); 3642 3643 vmcs12 = get_vmcs12(vcpu); 3644 3645 /* 3646 * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact 3647 * that there *is* a valid VMCS pointer, RFLAGS.CF is set 3648 * rather than RFLAGS.ZF, and no error number is stored to the 3649 * VM-instruction error field. 3650 */ 3651 if (CC(vmcs12->hdr.shadow_vmcs)) 3652 return nested_vmx_failInvalid(vcpu); 3653 3654 if (nested_vmx_is_evmptr12_valid(vmx)) { 3655 struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); 3656 3657 copy_enlightened_to_vmcs12(vmx, evmcs->hv_clean_fields); 3658 /* Enlightened VMCS doesn't have launch state */ 3659 vmcs12->launch_state = !launch; 3660 } else if (enable_shadow_vmcs) { 3661 copy_shadow_to_vmcs12(vmx); 3662 } 3663 3664 /* 3665 * The nested entry process starts with enforcing various prerequisites 3666 * on vmcs12 as required by the Intel SDM, and act appropriately when 3667 * they fail: As the SDM explains, some conditions should cause the 3668 * instruction to fail, while others will cause the instruction to seem 3669 * to succeed, but return an EXIT_REASON_INVALID_STATE. 3670 * To speed up the normal (success) code path, we should avoid checking 3671 * for misconfigurations which will anyway be caught by the processor 3672 * when using the merged vmcs02. 3673 */ 3674 if (CC(interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS)) 3675 return nested_vmx_fail(vcpu, VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS); 3676 3677 if (CC(vmcs12->launch_state == launch)) 3678 return nested_vmx_fail(vcpu, 3679 launch ? 
VMXERR_VMLAUNCH_NONCLEAR_VMCS 3680 : VMXERR_VMRESUME_NONLAUNCHED_VMCS); 3681 3682 if (nested_vmx_check_controls(vcpu, vmcs12)) 3683 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 3684 3685 if (nested_vmx_check_address_space_size(vcpu, vmcs12)) 3686 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); 3687 3688 if (nested_vmx_check_host_state(vcpu, vmcs12)) 3689 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); 3690 3691 /* 3692 * We're finally done with prerequisite checking, and can start with 3693 * the nested entry. 3694 */ 3695 vmx->nested.nested_run_pending = 1; 3696 vmx->nested.has_preemption_timer_deadline = false; 3697 status = nested_vmx_enter_non_root_mode(vcpu, true); 3698 if (unlikely(status != NVMX_VMENTRY_SUCCESS)) 3699 goto vmentry_failed; 3700 3701 /* Emulate processing of posted interrupts on VM-Enter. */ 3702 if (nested_cpu_has_posted_intr(vmcs12) && 3703 kvm_apic_has_interrupt(vcpu) == vmx->nested.posted_intr_nv) { 3704 vmx->nested.pi_pending = true; 3705 kvm_make_request(KVM_REQ_EVENT, vcpu); 3706 kvm_apic_clear_irr(vcpu, vmx->nested.posted_intr_nv); 3707 } 3708 3709 /* Hide L1D cache contents from the nested guest. */ 3710 vmx->vcpu.arch.l1tf_flush_l1d = true; 3711 3712 /* 3713 * Must happen outside of nested_vmx_enter_non_root_mode() as it will 3714 * also be used as part of restoring nVMX state for 3715 * snapshot restore (migration). 3716 * 3717 * In this flow, it is assumed that vmcs12 cache was 3718 * transferred as part of captured nVMX state and should 3719 * therefore not be read from guest memory (which may not 3720 * exist on destination host yet). 3721 */ 3722 nested_cache_shadow_vmcs12(vcpu, vmcs12); 3723 3724 switch (vmcs12->guest_activity_state) { 3725 case GUEST_ACTIVITY_HLT: 3726 /* 3727 * If we're entering a halted L2 vcpu and the L2 vcpu won't be 3728 * awakened by event injection or by an NMI-window VM-exit or 3729 * by an interrupt-window VM-exit, halt the vcpu. 3730 */ 3731 if (!(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) && 3732 !nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING) && 3733 !(nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING) && 3734 (vmcs12->guest_rflags & X86_EFLAGS_IF))) { 3735 vmx->nested.nested_run_pending = 0; 3736 return kvm_emulate_halt_noskip(vcpu); 3737 } 3738 break; 3739 case GUEST_ACTIVITY_WAIT_SIPI: 3740 vmx->nested.nested_run_pending = 0; 3741 vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; 3742 break; 3743 default: 3744 break; 3745 } 3746 3747 return 1; 3748 3749 vmentry_failed: 3750 vmx->nested.nested_run_pending = 0; 3751 if (status == NVMX_VMENTRY_KVM_INTERNAL_ERROR) 3752 return 0; 3753 if (status == NVMX_VMENTRY_VMEXIT) 3754 return 1; 3755 WARN_ON_ONCE(status != NVMX_VMENTRY_VMFAIL); 3756 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 3757 } 3758 3759 /* 3760 * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date 3761 * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK). 3762 * This function returns the new value we should put in vmcs12.guest_cr0. 3763 * It's not enough to just return the vmcs02 GUEST_CR0. Rather, 3764 * 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now 3765 * available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0 3766 * didn't trap the bit, because if L1 did, so would L0). 3767 * 2. Bits that L1 asked to trap (and therefore L0 also did) could not have 3768 * been modified by L2, and L1 knows it. 
So just leave the old value of 3769 * the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0 3770 * isn't relevant, because if L0 traps this bit it can set it to anything. 3771 * 3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have 3772 * changed these bits, and therefore they need to be updated, but L0 3773 * didn't necessarily allow them to be changed in GUEST_CR0 - and rather 3774 * put them in vmcs02 CR0_READ_SHADOW. So take these bits from there. 3775 */ 3776 static inline unsigned long 3777 vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 3778 { 3779 return 3780 /*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) | 3781 /*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) | 3782 /*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask | 3783 vcpu->arch.cr0_guest_owned_bits)); 3784 } 3785 3786 static inline unsigned long 3787 vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 3788 { 3789 return 3790 /*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) | 3791 /*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) | 3792 /*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask | 3793 vcpu->arch.cr4_guest_owned_bits)); 3794 } 3795 3796 static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, 3797 struct vmcs12 *vmcs12, 3798 u32 vm_exit_reason, u32 exit_intr_info) 3799 { 3800 u32 idt_vectoring; 3801 unsigned int nr; 3802 3803 /* 3804 * Per the SDM, VM-Exits due to double and triple faults are never 3805 * considered to occur during event delivery, even if the double/triple 3806 * fault is the result of an escalating vectoring issue. 3807 * 3808 * Note, the SDM qualifies the double fault behavior with "The original 3809 * event results in a double-fault exception". It's unclear why the 3810 * qualification exists since exits due to double fault can occur only 3811 * while vectoring a different exception (injected events are never 3812 * subject to interception), i.e. there's _always_ an original event. 3813 * 3814 * The SDM also uses NMI as a confusing example for the "original event 3815 * causes the VM exit directly" clause. NMI isn't special in any way, 3816 * the same rule applies to all events that cause an exit directly. 3817 * NMI is an odd choice for the example because NMIs can only occur on 3818 * instruction boundaries, i.e. they _can't_ occur during vectoring. 
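* Accordingly, the triple-fault and double-fault checks below leave
* idt_vectoring_info_field zeroed rather than reporting an "original
* event" that architecturally does not exist.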
3819 */ 3820 if ((u16)vm_exit_reason == EXIT_REASON_TRIPLE_FAULT || 3821 ((u16)vm_exit_reason == EXIT_REASON_EXCEPTION_NMI && 3822 is_double_fault(exit_intr_info))) { 3823 vmcs12->idt_vectoring_info_field = 0; 3824 } else if (vcpu->arch.exception.injected) { 3825 nr = vcpu->arch.exception.vector; 3826 idt_vectoring = nr | VECTORING_INFO_VALID_MASK; 3827 3828 if (kvm_exception_is_soft(nr)) { 3829 vmcs12->vm_exit_instruction_len = 3830 vcpu->arch.event_exit_inst_len; 3831 idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION; 3832 } else 3833 idt_vectoring |= INTR_TYPE_HARD_EXCEPTION; 3834 3835 if (vcpu->arch.exception.has_error_code) { 3836 idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK; 3837 vmcs12->idt_vectoring_error_code = 3838 vcpu->arch.exception.error_code; 3839 } 3840 3841 vmcs12->idt_vectoring_info_field = idt_vectoring; 3842 } else if (vcpu->arch.nmi_injected) { 3843 vmcs12->idt_vectoring_info_field = 3844 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR; 3845 } else if (vcpu->arch.interrupt.injected) { 3846 nr = vcpu->arch.interrupt.nr; 3847 idt_vectoring = nr | VECTORING_INFO_VALID_MASK; 3848 3849 if (vcpu->arch.interrupt.soft) { 3850 idt_vectoring |= INTR_TYPE_SOFT_INTR; 3851 vmcs12->vm_entry_instruction_len = 3852 vcpu->arch.event_exit_inst_len; 3853 } else 3854 idt_vectoring |= INTR_TYPE_EXT_INTR; 3855 3856 vmcs12->idt_vectoring_info_field = idt_vectoring; 3857 } else { 3858 vmcs12->idt_vectoring_info_field = 0; 3859 } 3860 } 3861 3862 3863 void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu) 3864 { 3865 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3866 gfn_t gfn; 3867 3868 /* 3869 * Don't need to mark the APIC access page dirty; it is never 3870 * written to by the CPU during APIC virtualization. 3871 */ 3872 3873 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { 3874 gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT; 3875 kvm_vcpu_mark_page_dirty(vcpu, gfn); 3876 } 3877 3878 if (nested_cpu_has_posted_intr(vmcs12)) { 3879 gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT; 3880 kvm_vcpu_mark_page_dirty(vcpu, gfn); 3881 } 3882 } 3883 3884 static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) 3885 { 3886 struct vcpu_vmx *vmx = to_vmx(vcpu); 3887 int max_irr; 3888 void *vapic_page; 3889 u16 status; 3890 3891 if (!vmx->nested.pi_pending) 3892 return 0; 3893 3894 if (!vmx->nested.pi_desc) 3895 goto mmio_needed; 3896 3897 vmx->nested.pi_pending = false; 3898 3899 if (!pi_test_and_clear_on(vmx->nested.pi_desc)) 3900 return 0; 3901 3902 max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256); 3903 if (max_irr != 256) { 3904 vapic_page = vmx->nested.virtual_apic_map.hva; 3905 if (!vapic_page) 3906 goto mmio_needed; 3907 3908 __kvm_apic_update_irr(vmx->nested.pi_desc->pir, 3909 vapic_page, &max_irr); 3910 status = vmcs_read16(GUEST_INTR_STATUS); 3911 if ((u8)max_irr > ((u8)status & 0xff)) { 3912 status &= ~0xff; 3913 status |= (u8)max_irr; 3914 vmcs_write16(GUEST_INTR_STATUS, status); 3915 } 3916 } 3917 3918 nested_mark_vmcs12_pages_dirty(vcpu); 3919 return 0; 3920 3921 mmio_needed: 3922 kvm_handle_memory_failure(vcpu, X86EMUL_IO_NEEDED, NULL); 3923 return -ENXIO; 3924 } 3925 3926 static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu) 3927 { 3928 struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit; 3929 u32 intr_info = ex->vector | INTR_INFO_VALID_MASK; 3930 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3931 unsigned long exit_qual; 3932 3933 if (ex->has_payload) { 3934 exit_qual = ex->payload; 3935 } else if (ex->vector == 
PF_VECTOR) { 3936 exit_qual = vcpu->arch.cr2; 3937 } else if (ex->vector == DB_VECTOR) { 3938 exit_qual = vcpu->arch.dr6; 3939 exit_qual &= ~DR6_BT; 3940 exit_qual ^= DR6_ACTIVE_LOW; 3941 } else { 3942 exit_qual = 0; 3943 } 3944 3945 /* 3946 * Unlike AMD's Paged Real Mode, which reports an error code on #PF 3947 * VM-Exits even if the CPU is in Real Mode, Intel VMX never sets the 3948 * "has error code" flags on VM-Exit if the CPU is in Real Mode. 3949 */ 3950 if (ex->has_error_code && is_protmode(vcpu)) { 3951 /* 3952 * Intel CPUs do not generate error codes with bits 31:16 set, 3953 * and more importantly VMX disallows setting bits 31:16 in the 3954 * injected error code for VM-Entry. Drop the bits to mimic 3955 * hardware and avoid inducing failure on nested VM-Entry if L1 3956 * chooses to inject the exception back to L2. AMD CPUs _do_ 3957 * generate "full" 32-bit error codes, so KVM allows userspace 3958 * to inject exception error codes with bits 31:16 set. 3959 */ 3960 vmcs12->vm_exit_intr_error_code = (u16)ex->error_code; 3961 intr_info |= INTR_INFO_DELIVER_CODE_MASK; 3962 } 3963 3964 if (kvm_exception_is_soft(ex->vector)) 3965 intr_info |= INTR_TYPE_SOFT_EXCEPTION; 3966 else 3967 intr_info |= INTR_TYPE_HARD_EXCEPTION; 3968 3969 if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) && 3970 vmx_get_nmi_mask(vcpu)) 3971 intr_info |= INTR_INFO_UNBLOCK_NMI; 3972 3973 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual); 3974 } 3975 3976 /* 3977 * Returns true if a debug trap is (likely) pending delivery. Infer the class 3978 * of a #DB (trap-like vs. fault-like) from the exception payload (to-be-DR6). 3979 * Using the payload is flawed because code breakpoints (fault-like) and data 3980 * breakpoints (trap-like) set the same bits in DR6 (breakpoint detected), i.e. 3981 * this will return false positives if a to-be-injected code breakpoint #DB is 3982 * pending (from KVM's perspective, but not "pending" across an instruction 3983 * boundary). ICEBP, a.k.a. INT1, is also not reflected here even though it 3984 * too is trap-like. 3985 * 3986 * KVM "works" despite these flaws as ICEBP isn't currently supported by the 3987 * emulator, Monitor Trap Flag is not marked pending on intercepted #DBs (the 3988 * #DB has already happened), and MTF isn't marked pending on code breakpoints 3989 * from the emulator (because such #DBs are fault-like and thus don't trigger 3990 * actions that fire on instruction retire). 3991 */ 3992 static unsigned long vmx_get_pending_dbg_trap(struct kvm_queued_exception *ex) 3993 { 3994 if (!ex->pending || ex->vector != DB_VECTOR) 3995 return 0; 3996 3997 /* General Detect #DBs are always fault-like. */ 3998 return ex->payload & ~DR6_BD; 3999 } 4000 4001 /* 4002 * Returns true if there's a pending #DB exception that is lower priority than 4003 * a pending Monitor Trap Flag VM-Exit. TSS T-flag #DBs are not emulated by 4004 * KVM, but could theoretically be injected by userspace. Note, this code is 4005 * imperfect, see above. 4006 */ 4007 static bool vmx_is_low_priority_db_trap(struct kvm_queued_exception *ex) 4008 { 4009 return vmx_get_pending_dbg_trap(ex) & ~DR6_BT; 4010 } 4011 4012 /* 4013 * Certain VM-exits set the 'pending debug exceptions' field to indicate a 4014 * recognized #DB (data or single-step) that has yet to be delivered. 
Since KVM 4015 * represents these debug traps with a payload that is said to be compatible 4016 * with the 'pending debug exceptions' field, write the payload to the VMCS 4017 * field if a VM-exit is delivered before the debug trap. 4018 */ 4019 static void nested_vmx_update_pending_dbg(struct kvm_vcpu *vcpu) 4020 { 4021 unsigned long pending_dbg; 4022 4023 pending_dbg = vmx_get_pending_dbg_trap(&vcpu->arch.exception); 4024 if (pending_dbg) 4025 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, pending_dbg); 4026 } 4027 4028 static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu) 4029 { 4030 return nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) && 4031 to_vmx(vcpu)->nested.preemption_timer_expired; 4032 } 4033 4034 static bool vmx_has_nested_events(struct kvm_vcpu *vcpu) 4035 { 4036 return nested_vmx_preemption_timer_pending(vcpu) || 4037 to_vmx(vcpu)->nested.mtf_pending; 4038 } 4039 4040 /* 4041 * Per the Intel SDM's table "Priority Among Concurrent Events", with minor 4042 * edits to fill in missing examples, e.g. #DB due to split-lock accesses, 4043 * and less minor edits to splice in the priority of VMX Non-Root specific 4044 * events, e.g. MTF and NMI/INTR-window exiting. 4045 * 4046 * 1 Hardware Reset and Machine Checks 4047 * - RESET 4048 * - Machine Check 4049 * 4050 * 2 Trap on Task Switch 4051 * - T flag in TSS is set (on task switch) 4052 * 4053 * 3 External Hardware Interventions 4054 * - FLUSH 4055 * - STOPCLK 4056 * - SMI 4057 * - INIT 4058 * 4059 * 3.5 Monitor Trap Flag (MTF) VM-exit[1] 4060 * 4061 * 4 Traps on Previous Instruction 4062 * - Breakpoints 4063 * - Trap-class Debug Exceptions (#DB due to TF flag set, data/I-O 4064 * breakpoint, or #DB due to a split-lock access) 4065 * 4066 * 4.3 VMX-preemption timer expired VM-exit 4067 * 4068 * 4.6 NMI-window exiting VM-exit[2] 4069 * 4070 * 5 Nonmaskable Interrupts (NMI) 4071 * 4072 * 5.5 Interrupt-window exiting VM-exit and Virtual-interrupt delivery 4073 * 4074 * 6 Maskable Hardware Interrupts 4075 * 4076 * 7 Code Breakpoint Fault 4077 * 4078 * 8 Faults from Fetching Next Instruction 4079 * - Code-Segment Limit Violation 4080 * - Code Page Fault 4081 * - Control protection exception (missing ENDBRANCH at target of indirect 4082 * call or jump) 4083 * 4084 * 9 Faults from Decoding Next Instruction 4085 * - Instruction length > 15 bytes 4086 * - Invalid Opcode 4087 * - Coprocessor Not Available 4088 * 4089 *10 Faults on Executing Instruction 4090 * - Overflow 4091 * - Bound error 4092 * - Invalid TSS 4093 * - Segment Not Present 4094 * - Stack fault 4095 * - General Protection 4096 * - Data Page Fault 4097 * - Alignment Check 4098 * - x86 FPU Floating-point exception 4099 * - SIMD floating-point exception 4100 * - Virtualization exception 4101 * - Control protection exception 4102 * 4103 * [1] Per the "Monitor Trap Flag" section: System-management interrupts (SMIs), 4104 * INIT signals, and higher priority events take priority over MTF VM exits. 4105 * MTF VM exits take priority over debug-trap exceptions and lower priority 4106 * events. 4107 * 4108 * [2] Debug-trap exceptions and higher priority events take priority over VM exits 4109 * caused by the VMX-preemption timer. VM exits caused by the VMX-preemption 4110 * timer take priority over VM exits caused by the "NMI-window exiting" 4111 * VM-execution control and lower priority events. 4112 * 4113 * [3] Debug-trap exceptions and higher priority events take priority over VM exits 4114 * caused by "NMI-window exiting". 
VM exits caused by this control take 4115 * priority over non-maskable interrupts (NMIs) and lower priority events. 4116 * 4117 * [4] Virtual-interrupt delivery has the same priority as that of VM exits due to 4118 * the 1-setting of the "interrupt-window exiting" VM-execution control. Thus, 4119 * non-maskable interrupts (NMIs) and higher priority events take priority over 4120 * delivery of a virtual interrupt; delivery of a virtual interrupt takes 4121 * priority over external interrupts and lower priority events. 4122 */ 4123 static int vmx_check_nested_events(struct kvm_vcpu *vcpu) 4124 { 4125 struct kvm_lapic *apic = vcpu->arch.apic; 4126 struct vcpu_vmx *vmx = to_vmx(vcpu); 4127 /* 4128 * Only a pending nested run blocks a pending exception. If there is a 4129 * previously injected event, the pending exception occurred while said 4130 * event was being delivered and thus needs to be handled. 4131 */ 4132 bool block_nested_exceptions = vmx->nested.nested_run_pending; 4133 /* 4134 * New events (not exceptions) are only recognized at instruction 4135 * boundaries. If an event needs reinjection, then KVM is handling a 4136 * VM-Exit that occurred _during_ instruction execution; new events are 4137 * blocked until the instruction completes. 4138 */ 4139 bool block_nested_events = block_nested_exceptions || 4140 kvm_event_needs_reinjection(vcpu); 4141 4142 if (lapic_in_kernel(vcpu) && 4143 test_bit(KVM_APIC_INIT, &apic->pending_events)) { 4144 if (block_nested_events) 4145 return -EBUSY; 4146 nested_vmx_update_pending_dbg(vcpu); 4147 clear_bit(KVM_APIC_INIT, &apic->pending_events); 4148 if (vcpu->arch.mp_state != KVM_MP_STATE_INIT_RECEIVED) 4149 nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0); 4150 4151 /* MTF is discarded if the vCPU is in WFS. */ 4152 vmx->nested.mtf_pending = false; 4153 return 0; 4154 } 4155 4156 if (lapic_in_kernel(vcpu) && 4157 test_bit(KVM_APIC_SIPI, &apic->pending_events)) { 4158 if (block_nested_events) 4159 return -EBUSY; 4160 4161 clear_bit(KVM_APIC_SIPI, &apic->pending_events); 4162 if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { 4163 nested_vmx_vmexit(vcpu, EXIT_REASON_SIPI_SIGNAL, 0, 4164 apic->sipi_vector & 0xFFUL); 4165 return 0; 4166 } 4167 /* Fallthrough, the SIPI is completely ignored. */ 4168 } 4169 4170 /* 4171 * Process exceptions that are higher priority than Monitor Trap Flag: 4172 * fault-like exceptions, TSS T flag #DB (not emulated by KVM, but 4173 * could theoretically come in from userspace), and ICEBP (INT1). 4174 * 4175 * TODO: SMIs have higher priority than MTF and trap-like #DBs (except 4176 * for TSS T flag #DBs). KVM also doesn't save/restore pending MTF 4177 * across SMI/RSM as it should; that needs to be addressed in order to 4178 * prioritize SMI over MTF and trap-like #DBs. 
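* The two pairs of exception checks below implement this split: the
* first pair handles only exceptions that outrank MTF (i.e. everything
* except low-priority #DB traps), while the second pair, placed after
* the MTF check, picks up the remaining #DB traps.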
4179 */ 4180 if (vcpu->arch.exception_vmexit.pending && 4181 !vmx_is_low_priority_db_trap(&vcpu->arch.exception_vmexit)) { 4182 if (block_nested_exceptions) 4183 return -EBUSY; 4184 4185 nested_vmx_inject_exception_vmexit(vcpu); 4186 return 0; 4187 } 4188 4189 if (vcpu->arch.exception.pending && 4190 !vmx_is_low_priority_db_trap(&vcpu->arch.exception)) { 4191 if (block_nested_exceptions) 4192 return -EBUSY; 4193 goto no_vmexit; 4194 } 4195 4196 if (vmx->nested.mtf_pending) { 4197 if (block_nested_events) 4198 return -EBUSY; 4199 nested_vmx_update_pending_dbg(vcpu); 4200 nested_vmx_vmexit(vcpu, EXIT_REASON_MONITOR_TRAP_FLAG, 0, 0); 4201 return 0; 4202 } 4203 4204 if (vcpu->arch.exception_vmexit.pending) { 4205 if (block_nested_exceptions) 4206 return -EBUSY; 4207 4208 nested_vmx_inject_exception_vmexit(vcpu); 4209 return 0; 4210 } 4211 4212 if (vcpu->arch.exception.pending) { 4213 if (block_nested_exceptions) 4214 return -EBUSY; 4215 goto no_vmexit; 4216 } 4217 4218 if (nested_vmx_preemption_timer_pending(vcpu)) { 4219 if (block_nested_events) 4220 return -EBUSY; 4221 nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0); 4222 return 0; 4223 } 4224 4225 if (vcpu->arch.smi_pending && !is_smm(vcpu)) { 4226 if (block_nested_events) 4227 return -EBUSY; 4228 goto no_vmexit; 4229 } 4230 4231 if (vcpu->arch.nmi_pending && !vmx_nmi_blocked(vcpu)) { 4232 if (block_nested_events) 4233 return -EBUSY; 4234 if (!nested_exit_on_nmi(vcpu)) 4235 goto no_vmexit; 4236 4237 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, 4238 NMI_VECTOR | INTR_TYPE_NMI_INTR | 4239 INTR_INFO_VALID_MASK, 0); 4240 /* 4241 * The NMI-triggered VM exit counts as injection: 4242 * clear this one and block further NMIs. 4243 */ 4244 vcpu->arch.nmi_pending = 0; 4245 vmx_set_nmi_mask(vcpu, true); 4246 return 0; 4247 } 4248 4249 if (kvm_cpu_has_interrupt(vcpu) && !vmx_interrupt_blocked(vcpu)) { 4250 if (block_nested_events) 4251 return -EBUSY; 4252 if (!nested_exit_on_intr(vcpu)) 4253 goto no_vmexit; 4254 nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); 4255 return 0; 4256 } 4257 4258 no_vmexit: 4259 return vmx_complete_nested_posted_interrupt(vcpu); 4260 } 4261 4262 static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) 4263 { 4264 ktime_t remaining = 4265 hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer); 4266 u64 value; 4267 4268 if (ktime_to_ns(remaining) <= 0) 4269 return 0; 4270 4271 value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz; 4272 do_div(value, 1000000); 4273 return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 4274 } 4275 4276 static bool is_vmcs12_ext_field(unsigned long field) 4277 { 4278 switch (field) { 4279 case GUEST_ES_SELECTOR: 4280 case GUEST_CS_SELECTOR: 4281 case GUEST_SS_SELECTOR: 4282 case GUEST_DS_SELECTOR: 4283 case GUEST_FS_SELECTOR: 4284 case GUEST_GS_SELECTOR: 4285 case GUEST_LDTR_SELECTOR: 4286 case GUEST_TR_SELECTOR: 4287 case GUEST_ES_LIMIT: 4288 case GUEST_CS_LIMIT: 4289 case GUEST_SS_LIMIT: 4290 case GUEST_DS_LIMIT: 4291 case GUEST_FS_LIMIT: 4292 case GUEST_GS_LIMIT: 4293 case GUEST_LDTR_LIMIT: 4294 case GUEST_TR_LIMIT: 4295 case GUEST_GDTR_LIMIT: 4296 case GUEST_IDTR_LIMIT: 4297 case GUEST_ES_AR_BYTES: 4298 case GUEST_DS_AR_BYTES: 4299 case GUEST_FS_AR_BYTES: 4300 case GUEST_GS_AR_BYTES: 4301 case GUEST_LDTR_AR_BYTES: 4302 case GUEST_TR_AR_BYTES: 4303 case GUEST_ES_BASE: 4304 case GUEST_CS_BASE: 4305 case GUEST_SS_BASE: 4306 case GUEST_DS_BASE: 4307 case GUEST_FS_BASE: 4308 case GUEST_GS_BASE: 4309 case GUEST_LDTR_BASE: 4310 case 
GUEST_TR_BASE: 4311 case GUEST_GDTR_BASE: 4312 case GUEST_IDTR_BASE: 4313 case GUEST_PENDING_DBG_EXCEPTIONS: 4314 case GUEST_BNDCFGS: 4315 return true; 4316 default: 4317 break; 4318 } 4319 4320 return false; 4321 } 4322 4323 static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, 4324 struct vmcs12 *vmcs12) 4325 { 4326 struct vcpu_vmx *vmx = to_vmx(vcpu); 4327 4328 vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR); 4329 vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR); 4330 vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR); 4331 vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR); 4332 vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR); 4333 vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR); 4334 vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR); 4335 vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR); 4336 vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT); 4337 vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT); 4338 vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT); 4339 vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT); 4340 vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT); 4341 vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT); 4342 vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT); 4343 vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT); 4344 vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT); 4345 vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT); 4346 vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES); 4347 vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES); 4348 vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES); 4349 vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES); 4350 vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES); 4351 vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES); 4352 vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE); 4353 vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE); 4354 vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE); 4355 vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE); 4356 vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE); 4357 vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE); 4358 vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE); 4359 vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE); 4360 vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE); 4361 vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE); 4362 vmcs12->guest_pending_dbg_exceptions = 4363 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); 4364 4365 vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false; 4366 } 4367 4368 static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, 4369 struct vmcs12 *vmcs12) 4370 { 4371 struct vcpu_vmx *vmx = to_vmx(vcpu); 4372 int cpu; 4373 4374 if (!vmx->nested.need_sync_vmcs02_to_vmcs12_rare) 4375 return; 4376 4377 4378 WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01); 4379 4380 cpu = get_cpu(); 4381 vmx->loaded_vmcs = &vmx->nested.vmcs02; 4382 vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->vmcs01); 4383 4384 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 4385 4386 vmx->loaded_vmcs = &vmx->vmcs01; 4387 vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->nested.vmcs02); 4388 put_cpu(); 4389 } 4390 4391 /* 4392 * Update the guest state fields of vmcs12 to reflect changes that 4393 * occurred while L2 was running. (The "IA-32e mode guest" bit of the 4394 * VM-entry controls is also updated, since this is really a guest 4395 * state bit.) 
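* Rarely-accessed fields are not read back here; they are synced lazily
* via need_sync_vmcs02_to_vmcs12_rare and copy_vmcs02_to_vmcs12_rare(),
* except when an enlightened VMCS is in use, in which case they are
* copied immediately by sync_vmcs02_to_vmcs12_rare().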
4396 */ 4397 static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 4398 { 4399 struct vcpu_vmx *vmx = to_vmx(vcpu); 4400 4401 if (nested_vmx_is_evmptr12_valid(vmx)) 4402 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 4403 4404 vmx->nested.need_sync_vmcs02_to_vmcs12_rare = 4405 !nested_vmx_is_evmptr12_valid(vmx); 4406 4407 vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); 4408 vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); 4409 4410 vmcs12->guest_rsp = kvm_rsp_read(vcpu); 4411 vmcs12->guest_rip = kvm_rip_read(vcpu); 4412 vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS); 4413 4414 vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES); 4415 vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES); 4416 4417 vmcs12->guest_interruptibility_info = 4418 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); 4419 4420 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) 4421 vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT; 4422 else if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) 4423 vmcs12->guest_activity_state = GUEST_ACTIVITY_WAIT_SIPI; 4424 else 4425 vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE; 4426 4427 if (nested_cpu_has_preemption_timer(vmcs12) && 4428 vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER && 4429 !vmx->nested.nested_run_pending) 4430 vmcs12->vmx_preemption_timer_value = 4431 vmx_get_preemption_timer_value(vcpu); 4432 4433 /* 4434 * In some cases (usually, nested EPT), L2 is allowed to change its 4435 * own CR3 without exiting. If it has changed it, we must keep it. 4436 * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined 4437 * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12. 4438 * 4439 * Additionally, restore L2's PDPTR to vmcs12. 4440 */ 4441 if (enable_ept) { 4442 vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3); 4443 if (nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { 4444 vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0); 4445 vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1); 4446 vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2); 4447 vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); 4448 } 4449 } 4450 4451 vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS); 4452 4453 if (nested_cpu_has_vid(vmcs12)) 4454 vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS); 4455 4456 vmcs12->vm_entry_controls = 4457 (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | 4458 (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); 4459 4460 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) 4461 vmcs12->guest_dr7 = vcpu->arch.dr7; 4462 4463 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) 4464 vmcs12->guest_ia32_efer = vcpu->arch.efer; 4465 } 4466 4467 /* 4468 * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits 4469 * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), 4470 * and this function updates it to reflect the changes to the guest state while 4471 * L2 was running (and perhaps made some exits which were handled directly by L0 4472 * without going back to L1), and to reflect the exit reason. 4473 * Note that we do not have to copy here all VMCS fields, just those that 4474 * could have changed by the L2 guest or the exit - i.e., the guest-state and 4475 * exit-information fields only. Other fields are modified by L1 with VMWRITE, 4476 * which already writes to vmcs12 directly. 
4477 */ 4478 static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, 4479 u32 vm_exit_reason, u32 exit_intr_info, 4480 unsigned long exit_qualification) 4481 { 4482 /* update exit information fields: */ 4483 vmcs12->vm_exit_reason = vm_exit_reason; 4484 if (to_vmx(vcpu)->exit_reason.enclave_mode) 4485 vmcs12->vm_exit_reason |= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE; 4486 vmcs12->exit_qualification = exit_qualification; 4487 4488 /* 4489 * On VM-Exit due to a failed VM-Entry, the VMCS isn't marked launched 4490 * and only EXIT_REASON and EXIT_QUALIFICATION are updated, all other 4491 * exit info fields are unmodified. 4492 */ 4493 if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) { 4494 vmcs12->launch_state = 1; 4495 4496 /* vm_entry_intr_info_field is cleared on exit. Emulate this 4497 * instead of reading the real value. */ 4498 vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK; 4499 4500 /* 4501 * Transfer the event that L0 or L1 may wanted to inject into 4502 * L2 to IDT_VECTORING_INFO_FIELD. 4503 */ 4504 vmcs12_save_pending_event(vcpu, vmcs12, 4505 vm_exit_reason, exit_intr_info); 4506 4507 vmcs12->vm_exit_intr_info = exit_intr_info; 4508 vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 4509 vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 4510 4511 /* 4512 * According to spec, there's no need to store the guest's 4513 * MSRs if the exit is due to a VM-entry failure that occurs 4514 * during or after loading the guest state. Since this exit 4515 * does not fall in that category, we need to save the MSRs. 4516 */ 4517 if (nested_vmx_store_msr(vcpu, 4518 vmcs12->vm_exit_msr_store_addr, 4519 vmcs12->vm_exit_msr_store_count)) 4520 nested_vmx_abort(vcpu, 4521 VMX_ABORT_SAVE_GUEST_MSR_FAIL); 4522 } 4523 } 4524 4525 /* 4526 * A part of what we need to when the nested L2 guest exits and we want to 4527 * run its L1 parent, is to reset L1's guest state to the host state specified 4528 * in vmcs12. 4529 * This function is to be called not only on normal nested exit, but also on 4530 * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry 4531 * Failures During or After Loading Guest State"). 4532 * This function should be called when the active VMCS is L1's (vmcs01). 4533 */ 4534 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, 4535 struct vmcs12 *vmcs12) 4536 { 4537 enum vm_entry_failure_code ignored; 4538 struct kvm_segment seg; 4539 4540 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) 4541 vcpu->arch.efer = vmcs12->host_ia32_efer; 4542 else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) 4543 vcpu->arch.efer |= (EFER_LMA | EFER_LME); 4544 else 4545 vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); 4546 vmx_set_efer(vcpu, vcpu->arch.efer); 4547 4548 kvm_rsp_write(vcpu, vmcs12->host_rsp); 4549 kvm_rip_write(vcpu, vmcs12->host_rip); 4550 vmx_set_rflags(vcpu, X86_EFLAGS_FIXED); 4551 vmx_set_interrupt_shadow(vcpu, 0); 4552 4553 /* 4554 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't 4555 * actually changed, because vmx_set_cr0 refers to efer set above. 4556 * 4557 * CR0_GUEST_HOST_MASK is already set in the original vmcs01 4558 * (KVM doesn't change it); 4559 */ 4560 vcpu->arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits(); 4561 vmx_set_cr0(vcpu, vmcs12->host_cr0); 4562 4563 /* Same as above - no reason to call set_cr4_guest_host_mask(). 
*/ 4564 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); 4565 vmx_set_cr4(vcpu, vmcs12->host_cr4); 4566 4567 nested_ept_uninit_mmu_context(vcpu); 4568 4569 /* 4570 * Only PDPTE load can fail as the value of cr3 was checked on entry and 4571 * couldn't have changed. 4572 */ 4573 if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, true, &ignored)) 4574 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL); 4575 4576 nested_vmx_transition_tlb_flush(vcpu, vmcs12, false); 4577 4578 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs); 4579 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp); 4580 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); 4581 vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); 4582 vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); 4583 vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF); 4584 vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF); 4585 4586 /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */ 4587 if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS) 4588 vmcs_write64(GUEST_BNDCFGS, 0); 4589 4590 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) { 4591 vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); 4592 vcpu->arch.pat = vmcs12->host_ia32_pat; 4593 } 4594 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) && 4595 kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu))) 4596 WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, 4597 vmcs12->host_ia32_perf_global_ctrl)); 4598 4599 /* Set L1 segment info according to Intel SDM 4600 27.5.2 Loading Host Segment and Descriptor-Table Registers */ 4601 seg = (struct kvm_segment) { 4602 .base = 0, 4603 .limit = 0xFFFFFFFF, 4604 .selector = vmcs12->host_cs_selector, 4605 .type = 11, 4606 .present = 1, 4607 .s = 1, 4608 .g = 1 4609 }; 4610 if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) 4611 seg.l = 1; 4612 else 4613 seg.db = 1; 4614 __vmx_set_segment(vcpu, &seg, VCPU_SREG_CS); 4615 seg = (struct kvm_segment) { 4616 .base = 0, 4617 .limit = 0xFFFFFFFF, 4618 .type = 3, 4619 .present = 1, 4620 .s = 1, 4621 .db = 1, 4622 .g = 1 4623 }; 4624 seg.selector = vmcs12->host_ds_selector; 4625 __vmx_set_segment(vcpu, &seg, VCPU_SREG_DS); 4626 seg.selector = vmcs12->host_es_selector; 4627 __vmx_set_segment(vcpu, &seg, VCPU_SREG_ES); 4628 seg.selector = vmcs12->host_ss_selector; 4629 __vmx_set_segment(vcpu, &seg, VCPU_SREG_SS); 4630 seg.selector = vmcs12->host_fs_selector; 4631 seg.base = vmcs12->host_fs_base; 4632 __vmx_set_segment(vcpu, &seg, VCPU_SREG_FS); 4633 seg.selector = vmcs12->host_gs_selector; 4634 seg.base = vmcs12->host_gs_base; 4635 __vmx_set_segment(vcpu, &seg, VCPU_SREG_GS); 4636 seg = (struct kvm_segment) { 4637 .base = vmcs12->host_tr_base, 4638 .limit = 0x67, 4639 .selector = vmcs12->host_tr_selector, 4640 .type = 11, 4641 .present = 1 4642 }; 4643 __vmx_set_segment(vcpu, &seg, VCPU_SREG_TR); 4644 4645 memset(&seg, 0, sizeof(seg)); 4646 seg.unusable = 1; 4647 __vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR); 4648 4649 kvm_set_dr(vcpu, 7, 0x400); 4650 vmcs_write64(GUEST_IA32_DEBUGCTL, 0); 4651 4652 if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, 4653 vmcs12->vm_exit_msr_load_count)) 4654 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); 4655 4656 to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu); 4657 } 4658 4659 static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx) 4660 { 4661 struct vmx_uret_msr *efer_msr; 4662 unsigned int i; 4663 4664 if (vm_entry_controls_get(vmx) & 
VM_ENTRY_LOAD_IA32_EFER) 4665 return vmcs_read64(GUEST_IA32_EFER); 4666 4667 if (cpu_has_load_ia32_efer()) 4668 return host_efer; 4669 4670 for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) { 4671 if (vmx->msr_autoload.guest.val[i].index == MSR_EFER) 4672 return vmx->msr_autoload.guest.val[i].value; 4673 } 4674 4675 efer_msr = vmx_find_uret_msr(vmx, MSR_EFER); 4676 if (efer_msr) 4677 return efer_msr->data; 4678 4679 return host_efer; 4680 } 4681 4682 static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) 4683 { 4684 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 4685 struct vcpu_vmx *vmx = to_vmx(vcpu); 4686 struct vmx_msr_entry g, h; 4687 gpa_t gpa; 4688 u32 i, j; 4689 4690 vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT); 4691 4692 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) { 4693 /* 4694 * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set 4695 * as vmcs01.GUEST_DR7 contains a userspace defined value 4696 * and vcpu->arch.dr7 is not squirreled away before the 4697 * nested VMENTER (not worth adding a variable in nested_vmx). 4698 */ 4699 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) 4700 kvm_set_dr(vcpu, 7, DR7_FIXED_1); 4701 else 4702 WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7))); 4703 } 4704 4705 /* 4706 * Note that calling vmx_set_{efer,cr0,cr4} is important as they 4707 * handle a variety of side effects to KVM's software model. 4708 */ 4709 vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx)); 4710 4711 vcpu->arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits(); 4712 vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW)); 4713 4714 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); 4715 vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW)); 4716 4717 nested_ept_uninit_mmu_context(vcpu); 4718 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); 4719 kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); 4720 4721 /* 4722 * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs 4723 * from vmcs01 (if necessary). The PDPTRs are not loaded on 4724 * VMFail, like everything else we just need to ensure our 4725 * software model is up-to-date. 4726 */ 4727 if (enable_ept && is_pae_paging(vcpu)) 4728 ept_save_pdptrs(vcpu); 4729 4730 kvm_mmu_reset_context(vcpu); 4731 4732 /* 4733 * This nasty bit of open coding is a compromise between blindly 4734 * loading L1's MSRs using the exit load lists (incorrect emulation 4735 * of VMFail), leaving the nested VM's MSRs in the software model 4736 * (incorrect behavior) and snapshotting the modified MSRs (too 4737 * expensive since the lists are unbound by hardware). For each 4738 * MSR that was (prematurely) loaded from the nested VMEntry load 4739 * list, reload it from the exit load list if it exists and differs 4740 * from the guest value. The intent is to stuff host state as 4741 * silently as possible, not to fully process the exit load list. 
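* Concretely: for each entry 'g' in the VM-entry MSR-load list, scan the
* VM-exit MSR-load list for an entry 'h' with a matching index; if the
* values differ, the exit-list value is written back via kvm_set_msr().
* A failed guest read, a malformed entry or a rejected write escalates
* to a VMX abort (VMX_ABORT_LOAD_HOST_MSR_FAIL).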
4742 */ 4743 for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) { 4744 gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g)); 4745 if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) { 4746 pr_debug_ratelimited( 4747 "%s read MSR index failed (%u, 0x%08llx)\n", 4748 __func__, i, gpa); 4749 goto vmabort; 4750 } 4751 4752 for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) { 4753 gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h)); 4754 if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) { 4755 pr_debug_ratelimited( 4756 "%s read MSR failed (%u, 0x%08llx)\n", 4757 __func__, j, gpa); 4758 goto vmabort; 4759 } 4760 if (h.index != g.index) 4761 continue; 4762 if (h.value == g.value) 4763 break; 4764 4765 if (nested_vmx_load_msr_check(vcpu, &h)) { 4766 pr_debug_ratelimited( 4767 "%s check failed (%u, 0x%x, 0x%x)\n", 4768 __func__, j, h.index, h.reserved); 4769 goto vmabort; 4770 } 4771 4772 if (kvm_set_msr(vcpu, h.index, h.value)) { 4773 pr_debug_ratelimited( 4774 "%s WRMSR failed (%u, 0x%x, 0x%llx)\n", 4775 __func__, j, h.index, h.value); 4776 goto vmabort; 4777 } 4778 } 4779 } 4780 4781 return; 4782 4783 vmabort: 4784 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); 4785 } 4786 4787 /* 4788 * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1 4789 * and modify vmcs12 to make it see what it would expect to see there if 4790 * L2 was its real guest. Must only be called when in L2 (is_guest_mode()) 4791 */ 4792 void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, 4793 u32 exit_intr_info, unsigned long exit_qualification) 4794 { 4795 struct vcpu_vmx *vmx = to_vmx(vcpu); 4796 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 4797 4798 /* Pending MTF traps are discarded on VM-Exit. */ 4799 vmx->nested.mtf_pending = false; 4800 4801 /* trying to cancel vmlaunch/vmresume is a bug */ 4802 WARN_ON_ONCE(vmx->nested.nested_run_pending); 4803 4804 #ifdef CONFIG_KVM_HYPERV 4805 if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) { 4806 /* 4807 * KVM_REQ_GET_NESTED_STATE_PAGES is also used to map 4808 * Enlightened VMCS after migration and we still need to 4809 * do that when something is forcing L2->L1 exit prior to 4810 * the first L2 run. 4811 */ 4812 (void)nested_get_evmcs_page(vcpu); 4813 } 4814 #endif 4815 4816 /* Service pending TLB flush requests for L2 before switching to L1. */ 4817 kvm_service_local_tlb_flush_requests(vcpu); 4818 4819 /* 4820 * VCPU_EXREG_PDPTR will be clobbered in arch/x86/kvm/vmx/vmx.h between 4821 * now and the new vmentry. Ensure that the VMCS02 PDPTR fields are 4822 * up-to-date before switching to L1. 4823 */ 4824 if (enable_ept && is_pae_paging(vcpu)) 4825 vmx_ept_load_pdptrs(vcpu); 4826 4827 leave_guest_mode(vcpu); 4828 4829 if (nested_cpu_has_preemption_timer(vmcs12)) 4830 hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); 4831 4832 if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING)) { 4833 vcpu->arch.tsc_offset = vcpu->arch.l1_tsc_offset; 4834 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING)) 4835 vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio; 4836 } 4837 4838 if (likely(!vmx->fail)) { 4839 sync_vmcs02_to_vmcs12(vcpu, vmcs12); 4840 4841 if (vm_exit_reason != -1) 4842 prepare_vmcs12(vcpu, vmcs12, vm_exit_reason, 4843 exit_intr_info, exit_qualification); 4844 4845 /* 4846 * Must happen outside of sync_vmcs02_to_vmcs12() as it will 4847 * also be used to capture vmcs12 cache as part of 4848 * capturing nVMX state for snapshot (migration). 
4849 * 4850 * Otherwise, this flush will dirty guest memory at a 4851 * point it is already assumed by user-space to be 4852 * immutable. 4853 */ 4854 nested_flush_cached_shadow_vmcs12(vcpu, vmcs12); 4855 } else { 4856 /* 4857 * The only expected VM-instruction error is "VM entry with 4858 * invalid control field(s)." Anything else indicates a 4859 * problem with L0. And we should never get here with a 4860 * VMFail of any type if early consistency checks are enabled. 4861 */ 4862 WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) != 4863 VMXERR_ENTRY_INVALID_CONTROL_FIELD); 4864 WARN_ON_ONCE(nested_early_check); 4865 } 4866 4867 /* 4868 * Drop events/exceptions that were queued for re-injection to L2 4869 * (picked up via vmx_complete_interrupts()), as well as exceptions 4870 * that were pending for L2. Note, this must NOT be hoisted above 4871 * prepare_vmcs12(), events/exceptions queued for re-injection need to 4872 * be captured in vmcs12 (see vmcs12_save_pending_event()). 4873 */ 4874 vcpu->arch.nmi_injected = false; 4875 kvm_clear_exception_queue(vcpu); 4876 kvm_clear_interrupt_queue(vcpu); 4877 4878 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 4879 4880 /* 4881 * If IBRS is advertised to the vCPU, KVM must flush the indirect 4882 * branch predictors when transitioning from L2 to L1, as L1 expects 4883 * hardware (KVM in this case) to provide separate predictor modes. 4884 * Bare metal isolates VMX root (host) from VMX non-root (guest), but 4885 * doesn't isolate different VMCSs, i.e. in this case, doesn't provide 4886 * separate modes for L2 vs L1. 4887 */ 4888 if (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) 4889 indirect_branch_prediction_barrier(); 4890 4891 /* Update any VMCS fields that might have changed while L2 ran */ 4892 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 4893 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 4894 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); 4895 if (kvm_caps.has_tsc_control) 4896 vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); 4897 4898 if (vmx->nested.l1_tpr_threshold != -1) 4899 vmcs_write32(TPR_THRESHOLD, vmx->nested.l1_tpr_threshold); 4900 4901 if (vmx->nested.change_vmcs01_virtual_apic_mode) { 4902 vmx->nested.change_vmcs01_virtual_apic_mode = false; 4903 vmx_set_virtual_apic_mode(vcpu); 4904 } 4905 4906 if (vmx->nested.update_vmcs01_cpu_dirty_logging) { 4907 vmx->nested.update_vmcs01_cpu_dirty_logging = false; 4908 vmx_update_cpu_dirty_logging(vcpu); 4909 } 4910 4911 /* Unpin physical memory we referred to in vmcs02 */ 4912 kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map, false); 4913 kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); 4914 kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true); 4915 vmx->nested.pi_desc = NULL; 4916 4917 if (vmx->nested.reload_vmcs01_apic_access_page) { 4918 vmx->nested.reload_vmcs01_apic_access_page = false; 4919 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); 4920 } 4921 4922 if (vmx->nested.update_vmcs01_apicv_status) { 4923 vmx->nested.update_vmcs01_apicv_status = false; 4924 kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu); 4925 } 4926 4927 if ((vm_exit_reason != -1) && 4928 (enable_shadow_vmcs || nested_vmx_is_evmptr12_valid(vmx))) 4929 vmx->nested.need_vmcs12_to_shadow_sync = true; 4930 4931 /* in case we halted in L2 */ 4932 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 4933 4934 if (likely(!vmx->fail)) { 4935 if ((u16)vm_exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT && 4936 nested_exit_intr_ack_set(vcpu)) { 4937 int irq = 
kvm_cpu_get_interrupt(vcpu); 4938 WARN_ON(irq < 0); 4939 vmcs12->vm_exit_intr_info = irq | 4940 INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR; 4941 } 4942 4943 if (vm_exit_reason != -1) 4944 trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, 4945 vmcs12->exit_qualification, 4946 vmcs12->idt_vectoring_info_field, 4947 vmcs12->vm_exit_intr_info, 4948 vmcs12->vm_exit_intr_error_code, 4949 KVM_ISA_VMX); 4950 4951 load_vmcs12_host_state(vcpu, vmcs12); 4952 4953 return; 4954 } 4955 4956 /* 4957 * After an early L2 VM-entry failure, we're now back 4958 * in L1 which thinks it just finished a VMLAUNCH or 4959 * VMRESUME instruction, so we need to set the failure 4960 * flag and the VM-instruction error field of the VMCS 4961 * accordingly, and skip the emulated instruction. 4962 */ 4963 (void)nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 4964 4965 /* 4966 * Restore L1's host state to KVM's software model. We're here 4967 * because a consistency check was caught by hardware, which 4968 * means some amount of guest state has been propagated to KVM's 4969 * model and needs to be unwound to the host's state. 4970 */ 4971 nested_vmx_restore_host_state(vcpu); 4972 4973 vmx->fail = 0; 4974 } 4975 4976 static void nested_vmx_triple_fault(struct kvm_vcpu *vcpu) 4977 { 4978 kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu); 4979 nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0); 4980 } 4981 4982 /* 4983 * Decode the memory-address operand of a vmx instruction, as recorded on an 4984 * exit caused by such an instruction (run by a guest hypervisor). 4985 * On success, returns 0. When the operand is invalid, returns 1 and throws 4986 * #UD, #GP, or #SS. 4987 */ 4988 int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, 4989 u32 vmx_instruction_info, bool wr, int len, gva_t *ret) 4990 { 4991 gva_t off; 4992 bool exn; 4993 struct kvm_segment s; 4994 4995 /* 4996 * According to Vol. 3B, "Information for VM Exits Due to Instruction 4997 * Execution", on an exit, vmx_instruction_info holds most of the 4998 * addressing components of the operand. Only the displacement part 4999 * is put in exit_qualification (see 3B, "Basic VM-Exit Information"). 5000 * For how an actual address is calculated from all these components, 5001 * refer to Vol. 1, "Operand Addressing". 5002 */ 5003 int scaling = vmx_instruction_info & 3; 5004 int addr_size = (vmx_instruction_info >> 7) & 7; 5005 bool is_reg = vmx_instruction_info & (1u << 10); 5006 int seg_reg = (vmx_instruction_info >> 15) & 7; 5007 int index_reg = (vmx_instruction_info >> 18) & 0xf; 5008 bool index_is_valid = !(vmx_instruction_info & (1u << 22)); 5009 int base_reg = (vmx_instruction_info >> 23) & 0xf; 5010 bool base_is_valid = !(vmx_instruction_info & (1u << 27)); 5011 5012 if (is_reg) { 5013 kvm_queue_exception(vcpu, UD_VECTOR); 5014 return 1; 5015 } 5016 5017 /* Addr = segment_base + offset */ 5018 /* offset = base + [index * scale] + displacement */ 5019 off = exit_qualification; /* holds the displacement */ 5020 if (addr_size == 1) 5021 off = (gva_t)sign_extend64(off, 31); 5022 else if (addr_size == 0) 5023 off = (gva_t)sign_extend64(off, 15); 5024 if (base_is_valid) 5025 off += kvm_register_read(vcpu, base_reg); 5026 if (index_is_valid) 5027 off += kvm_register_read(vcpu, index_reg) << scaling; 5028 vmx_get_segment(vcpu, &s, seg_reg); 5029 5030 /* 5031 * The effective address, i.e. @off, of a memory operand is truncated 5032 * based on the address size of the instruction. 
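 * E.g. with a 32-bit address size, the sum of base, scaled index and
 * displacement is masked down to bits 31:0 even on a 64-bit vCPU.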
Note that this is 5033 * the *effective address*, i.e. the address prior to accounting for 5034 * the segment's base. 5035 */ 5036 if (addr_size == 1) /* 32 bit */ 5037 off &= 0xffffffff; 5038 else if (addr_size == 0) /* 16 bit */ 5039 off &= 0xffff; 5040 5041 /* Checks for #GP/#SS exceptions. */ 5042 exn = false; 5043 if (is_long_mode(vcpu)) { 5044 /* 5045 * The virtual/linear address is never truncated in 64-bit 5046 * mode, e.g. a 32-bit address size can yield a 64-bit virtual 5047 * address when using FS/GS with a non-zero base. 5048 */ 5049 if (seg_reg == VCPU_SREG_FS || seg_reg == VCPU_SREG_GS) 5050 *ret = s.base + off; 5051 else 5052 *ret = off; 5053 5054 *ret = vmx_get_untagged_addr(vcpu, *ret, 0); 5055 /* Long mode: #GP(0)/#SS(0) if the memory address is in a 5056 * non-canonical form. This is the only check on the memory 5057 * destination for long mode! 5058 */ 5059 exn = is_noncanonical_address(*ret, vcpu); 5060 } else { 5061 /* 5062 * When not in long mode, the virtual/linear address is 5063 * unconditionally truncated to 32 bits regardless of the 5064 * address size. 5065 */ 5066 *ret = (s.base + off) & 0xffffffff; 5067 5068 /* Protected mode: apply checks for segment validity in the 5069 * following order: 5070 * - segment type check (#GP(0) may be thrown) 5071 * - usability check (#GP(0)/#SS(0)) 5072 * - limit check (#GP(0)/#SS(0)) 5073 */ 5074 if (wr) 5075 /* #GP(0) if the destination operand is located in a 5076 * read-only data segment or any code segment. 5077 */ 5078 exn = ((s.type & 0xa) == 0 || (s.type & 8)); 5079 else 5080 /* #GP(0) if the source operand is located in an 5081 * execute-only code segment 5082 */ 5083 exn = ((s.type & 0xa) == 8); 5084 if (exn) { 5085 kvm_queue_exception_e(vcpu, GP_VECTOR, 0); 5086 return 1; 5087 } 5088 /* Protected mode: #GP(0)/#SS(0) if the segment is unusable. 5089 */ 5090 exn = (s.unusable != 0); 5091 5092 /* 5093 * Protected mode: #GP(0)/#SS(0) if the memory operand is 5094 * outside the segment limit. All CPUs that support VMX ignore 5095 * limit checks for flat segments, i.e. segments with base==0, 5096 * limit==0xffffffff and of type expand-up data or code. 5097 */ 5098 if (!(s.base == 0 && s.limit == 0xffffffff && 5099 ((s.type & 8) || !(s.type & 4)))) 5100 exn = exn || ((u64)off + len - 1 > s.limit); 5101 } 5102 if (exn) { 5103 kvm_queue_exception_e(vcpu, 5104 seg_reg == VCPU_SREG_SS ? 5105 SS_VECTOR : GP_VECTOR, 5106 0); 5107 return 1; 5108 } 5109 5110 return 0; 5111 } 5112 5113 static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer, 5114 int *ret) 5115 { 5116 gva_t gva; 5117 struct x86_exception e; 5118 int r; 5119 5120 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 5121 vmcs_read32(VMX_INSTRUCTION_INFO), false, 5122 sizeof(*vmpointer), &gva)) { 5123 *ret = 1; 5124 return -EINVAL; 5125 } 5126 5127 r = kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e); 5128 if (r != X86EMUL_CONTINUE) { 5129 *ret = kvm_handle_memory_failure(vcpu, r, &e); 5130 return -EINVAL; 5131 } 5132 5133 return 0; 5134 } 5135 5136 /* 5137 * Allocate a shadow VMCS and associate it with the currently loaded 5138 * VMCS, unless such a shadow VMCS already exists. The newly allocated 5139 * VMCS is also VMCLEARed, so that it is ready for use. 
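 * (VMCLEAR initializes the region and leaves its launch state clear, which
 * is what hardware expects before the shadow VMCS is first used)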
5140 */ 5141 static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu) 5142 { 5143 struct vcpu_vmx *vmx = to_vmx(vcpu); 5144 struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs; 5145 5146 /* 5147 * KVM allocates a shadow VMCS only when L1 executes VMXON and frees it 5148 * when L1 executes VMXOFF or the vCPU is forced out of nested 5149 * operation. VMXON faults if the CPU is already post-VMXON, so it 5150 * should be impossible to already have an allocated shadow VMCS. KVM 5151 * doesn't support virtualization of VMCS shadowing, so vmcs01 should 5152 * always be the loaded VMCS. 5153 */ 5154 if (WARN_ON(loaded_vmcs != &vmx->vmcs01 || loaded_vmcs->shadow_vmcs)) 5155 return loaded_vmcs->shadow_vmcs; 5156 5157 loaded_vmcs->shadow_vmcs = alloc_vmcs(true); 5158 if (loaded_vmcs->shadow_vmcs) 5159 vmcs_clear(loaded_vmcs->shadow_vmcs); 5160 5161 return loaded_vmcs->shadow_vmcs; 5162 } 5163 5164 static int enter_vmx_operation(struct kvm_vcpu *vcpu) 5165 { 5166 struct vcpu_vmx *vmx = to_vmx(vcpu); 5167 int r; 5168 5169 r = alloc_loaded_vmcs(&vmx->nested.vmcs02); 5170 if (r < 0) 5171 goto out_vmcs02; 5172 5173 vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); 5174 if (!vmx->nested.cached_vmcs12) 5175 goto out_cached_vmcs12; 5176 5177 vmx->nested.shadow_vmcs12_cache.gpa = INVALID_GPA; 5178 vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); 5179 if (!vmx->nested.cached_shadow_vmcs12) 5180 goto out_cached_shadow_vmcs12; 5181 5182 if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu)) 5183 goto out_shadow_vmcs; 5184 5185 hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, 5186 HRTIMER_MODE_ABS_PINNED); 5187 vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; 5188 5189 vmx->nested.vpid02 = allocate_vpid(); 5190 5191 vmx->nested.vmcs02_initialized = false; 5192 vmx->nested.vmxon = true; 5193 5194 if (vmx_pt_mode_is_host_guest()) { 5195 vmx->pt_desc.guest.ctl = 0; 5196 pt_update_intercept_for_msr(vcpu); 5197 } 5198 5199 return 0; 5200 5201 out_shadow_vmcs: 5202 kfree(vmx->nested.cached_shadow_vmcs12); 5203 5204 out_cached_shadow_vmcs12: 5205 kfree(vmx->nested.cached_vmcs12); 5206 5207 out_cached_vmcs12: 5208 free_loaded_vmcs(&vmx->nested.vmcs02); 5209 5210 out_vmcs02: 5211 return -ENOMEM; 5212 } 5213 5214 /* Emulate the VMXON instruction. */ 5215 static int handle_vmxon(struct kvm_vcpu *vcpu) 5216 { 5217 int ret; 5218 gpa_t vmptr; 5219 uint32_t revision; 5220 struct vcpu_vmx *vmx = to_vmx(vcpu); 5221 const u64 VMXON_NEEDED_FEATURES = FEAT_CTL_LOCKED 5222 | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; 5223 5224 /* 5225 * Manually check CR4.VMXE checks, KVM must force CR4.VMXE=1 to enter 5226 * the guest and so cannot rely on hardware to perform the check, 5227 * which has higher priority than VM-Exit (see Intel SDM's pseudocode 5228 * for VMXON). 5229 * 5230 * Rely on hardware for the other pre-VM-Exit checks, CR0.PE=1, !VM86 5231 * and !COMPATIBILITY modes. For an unrestricted guest, KVM doesn't 5232 * force any of the relevant guest state. For a restricted guest, KVM 5233 * does force CR0.PE=1, but only to also force VM86 in order to emulate 5234 * Real Mode, and so there's no need to check CR0.PE manually. 5235 */ 5236 if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_VMXE)) { 5237 kvm_queue_exception(vcpu, UD_VECTOR); 5238 return 1; 5239 } 5240 5241 /* 5242 * The CPL is checked for "not in VMX operation" and for "in VMX root", 5243 * and has higher priority than the VM-Fail due to being post-VMXON, 5244 * i.e. VMXON #GPs outside of VMX non-root if CPL!=0. 
In VMX non-root, 5245 * VMXON causes VM-Exit and KVM unconditionally forwards VMXON VM-Exits 5246 * from L2 to L1, i.e. there's no need to check for the vCPU being in 5247 * VMX non-root. 5248 * 5249 * Forwarding the VM-Exit unconditionally, i.e. without performing the 5250 * #UD checks (see above), is functionally ok because KVM doesn't allow 5251 * L1 to run L2 without CR4.VMXE=0, and because KVM never modifies L2's 5252 * CR0 or CR4, i.e. it's L2's responsibility to emulate #UDs that are 5253 * missed by hardware due to shadowing CR0 and/or CR4. 5254 */ 5255 if (vmx_get_cpl(vcpu)) { 5256 kvm_inject_gp(vcpu, 0); 5257 return 1; 5258 } 5259 5260 if (vmx->nested.vmxon) 5261 return nested_vmx_fail(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); 5262 5263 /* 5264 * Invalid CR0/CR4 generates #GP. These checks are performed if and 5265 * only if the vCPU isn't already in VMX operation, i.e. effectively 5266 * have lower priority than the VM-Fail above. 5267 */ 5268 if (!nested_host_cr0_valid(vcpu, kvm_read_cr0(vcpu)) || 5269 !nested_host_cr4_valid(vcpu, kvm_read_cr4(vcpu))) { 5270 kvm_inject_gp(vcpu, 0); 5271 return 1; 5272 } 5273 5274 if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES) 5275 != VMXON_NEEDED_FEATURES) { 5276 kvm_inject_gp(vcpu, 0); 5277 return 1; 5278 } 5279 5280 if (nested_vmx_get_vmptr(vcpu, &vmptr, &ret)) 5281 return ret; 5282 5283 /* 5284 * SDM 3: 24.11.5 5285 * The first 4 bytes of VMXON region contain the supported 5286 * VMCS revision identifier 5287 * 5288 * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case; 5289 * which replaces physical address width with 32 5290 */ 5291 if (!page_address_valid(vcpu, vmptr)) 5292 return nested_vmx_failInvalid(vcpu); 5293 5294 if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) || 5295 revision != VMCS12_REVISION) 5296 return nested_vmx_failInvalid(vcpu); 5297 5298 vmx->nested.vmxon_ptr = vmptr; 5299 ret = enter_vmx_operation(vcpu); 5300 if (ret) 5301 return ret; 5302 5303 return nested_vmx_succeed(vcpu); 5304 } 5305 5306 static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu) 5307 { 5308 struct vcpu_vmx *vmx = to_vmx(vcpu); 5309 5310 if (vmx->nested.current_vmptr == INVALID_GPA) 5311 return; 5312 5313 copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu)); 5314 5315 if (enable_shadow_vmcs) { 5316 /* copy to memory all shadowed fields in case 5317 they were modified */ 5318 copy_shadow_to_vmcs12(vmx); 5319 vmx_disable_shadow_vmcs(vmx); 5320 } 5321 vmx->nested.posted_intr_nv = -1; 5322 5323 /* Flush VMCS12 to guest memory */ 5324 kvm_vcpu_write_guest_page(vcpu, 5325 vmx->nested.current_vmptr >> PAGE_SHIFT, 5326 vmx->nested.cached_vmcs12, 0, VMCS12_SIZE); 5327 5328 kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); 5329 5330 vmx->nested.current_vmptr = INVALID_GPA; 5331 } 5332 5333 /* Emulate the VMXOFF instruction */ 5334 static int handle_vmxoff(struct kvm_vcpu *vcpu) 5335 { 5336 if (!nested_vmx_check_permission(vcpu)) 5337 return 1; 5338 5339 free_nested(vcpu); 5340 5341 if (kvm_apic_has_pending_init_or_sipi(vcpu)) 5342 kvm_make_request(KVM_REQ_EVENT, vcpu); 5343 5344 return nested_vmx_succeed(vcpu); 5345 } 5346 5347 /* Emulate the VMCLEAR instruction */ 5348 static int handle_vmclear(struct kvm_vcpu *vcpu) 5349 { 5350 struct vcpu_vmx *vmx = to_vmx(vcpu); 5351 u32 zero = 0; 5352 gpa_t vmptr; 5353 int r; 5354 5355 if (!nested_vmx_check_permission(vcpu)) 5356 return 1; 5357 5358 if (nested_vmx_get_vmptr(vcpu, &vmptr, &r)) 5359 return r; 5360 5361 if (!page_address_valid(vcpu, vmptr)) 
5362 return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS); 5363 5364 if (vmptr == vmx->nested.vmxon_ptr) 5365 return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_VMXON_POINTER); 5366 5367 if (likely(!nested_evmcs_handle_vmclear(vcpu, vmptr))) { 5368 if (vmptr == vmx->nested.current_vmptr) 5369 nested_release_vmcs12(vcpu); 5370 5371 /* 5372 * Silently ignore memory errors on VMCLEAR, Intel's pseudocode 5373 * for VMCLEAR includes an "ensure that data for VMCS referenced 5374 * by the operand is in memory" clause that guards writes to 5375 * memory, i.e. doing nothing for I/O is architecturally valid. 5376 * 5377 * FIXME: Suppress failures if and only if no memslot is found, 5378 * i.e. exit to userspace if __copy_to_user() fails. 5379 */ 5380 (void)kvm_vcpu_write_guest(vcpu, 5381 vmptr + offsetof(struct vmcs12, 5382 launch_state), 5383 &zero, sizeof(zero)); 5384 } 5385 5386 return nested_vmx_succeed(vcpu); 5387 } 5388 5389 /* Emulate the VMLAUNCH instruction */ 5390 static int handle_vmlaunch(struct kvm_vcpu *vcpu) 5391 { 5392 return nested_vmx_run(vcpu, true); 5393 } 5394 5395 /* Emulate the VMRESUME instruction */ 5396 static int handle_vmresume(struct kvm_vcpu *vcpu) 5397 { 5398 5399 return nested_vmx_run(vcpu, false); 5400 } 5401 5402 static int handle_vmread(struct kvm_vcpu *vcpu) 5403 { 5404 struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) 5405 : get_vmcs12(vcpu); 5406 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5407 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5408 struct vcpu_vmx *vmx = to_vmx(vcpu); 5409 struct x86_exception e; 5410 unsigned long field; 5411 u64 value; 5412 gva_t gva = 0; 5413 short offset; 5414 int len, r; 5415 5416 if (!nested_vmx_check_permission(vcpu)) 5417 return 1; 5418 5419 /* Decode instruction info and find the field to read */ 5420 field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf)); 5421 5422 if (!nested_vmx_is_evmptr12_valid(vmx)) { 5423 /* 5424 * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA, 5425 * any VMREAD sets the ALU flags for VMfailInvalid. 5426 */ 5427 if (vmx->nested.current_vmptr == INVALID_GPA || 5428 (is_guest_mode(vcpu) && 5429 get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA)) 5430 return nested_vmx_failInvalid(vcpu); 5431 5432 offset = get_vmcs12_field_offset(field); 5433 if (offset < 0) 5434 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 5435 5436 if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field)) 5437 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 5438 5439 /* Read the field, zero-extended to a u64 value */ 5440 value = vmcs12_read_any(vmcs12, field, offset); 5441 } else { 5442 /* 5443 * Hyper-V TLFS (as of 6.0b) explicitly states that while an 5444 * enlightened VMCS is active, VMREAD/VMWRITE instructions are 5445 * unsupported. Unfortunately, certain versions of Windows 11 5446 * don't comply with this requirement, which is not enforced in 5447 * genuine Hyper-V. Allow VMREAD from an enlightened VMCS as a 5448 * workaround, as misbehaving guests will panic on VM-Fail. 5449 * Note, enlightened VMCS is incompatible with shadow VMCS so 5450 * all VMREADs from L2 should go to L1.
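 * (hence the WARN_ON_ONCE(is_guest_mode()) check just below)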
5451 */ 5452 if (WARN_ON_ONCE(is_guest_mode(vcpu))) 5453 return nested_vmx_failInvalid(vcpu); 5454 5455 offset = evmcs_field_offset(field, NULL); 5456 if (offset < 0) 5457 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 5458 5459 /* Read the field, zero-extended to a u64 value */ 5460 value = evmcs_read_any(nested_vmx_evmcs(vmx), field, offset); 5461 } 5462 5463 /* 5464 * Now copy part of this value to register or memory, as requested. 5465 * Note that the number of bits actually copied is 32 or 64 depending 5466 * on the guest's mode (32 or 64 bit), not on the given field's length. 5467 */ 5468 if (instr_info & BIT(10)) { 5469 kvm_register_write(vcpu, (((instr_info) >> 3) & 0xf), value); 5470 } else { 5471 len = is_64_bit_mode(vcpu) ? 8 : 4; 5472 if (get_vmx_mem_address(vcpu, exit_qualification, 5473 instr_info, true, len, &gva)) 5474 return 1; 5475 /* _system ok, nested_vmx_check_permission has verified cpl=0 */ 5476 r = kvm_write_guest_virt_system(vcpu, gva, &value, len, &e); 5477 if (r != X86EMUL_CONTINUE) 5478 return kvm_handle_memory_failure(vcpu, r, &e); 5479 } 5480 5481 return nested_vmx_succeed(vcpu); 5482 } 5483 5484 static bool is_shadow_field_rw(unsigned long field) 5485 { 5486 switch (field) { 5487 #define SHADOW_FIELD_RW(x, y) case x: 5488 #include "vmcs_shadow_fields.h" 5489 return true; 5490 default: 5491 break; 5492 } 5493 return false; 5494 } 5495 5496 static bool is_shadow_field_ro(unsigned long field) 5497 { 5498 switch (field) { 5499 #define SHADOW_FIELD_RO(x, y) case x: 5500 #include "vmcs_shadow_fields.h" 5501 return true; 5502 default: 5503 break; 5504 } 5505 return false; 5506 } 5507 5508 static int handle_vmwrite(struct kvm_vcpu *vcpu) 5509 { 5510 struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) 5511 : get_vmcs12(vcpu); 5512 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5513 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5514 struct vcpu_vmx *vmx = to_vmx(vcpu); 5515 struct x86_exception e; 5516 unsigned long field; 5517 short offset; 5518 gva_t gva; 5519 int len, r; 5520 5521 /* 5522 * The value to write might be 32 or 64 bits, depending on L1's long 5523 * mode, and eventually we need to write that into a field of several 5524 * possible lengths. The code below first zero-extends the value to 64 5525 * bit (value), and then copies only the appropriate number of 5526 * bits into the vmcs12 field. 5527 */ 5528 u64 value = 0; 5529 5530 if (!nested_vmx_check_permission(vcpu)) 5531 return 1; 5532 5533 /* 5534 * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA, 5535 * any VMWRITE sets the ALU flags for VMfailInvalid. 5536 */ 5537 if (vmx->nested.current_vmptr == INVALID_GPA || 5538 (is_guest_mode(vcpu) && 5539 get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA)) 5540 return nested_vmx_failInvalid(vcpu); 5541 5542 if (instr_info & BIT(10)) 5543 value = kvm_register_read(vcpu, (((instr_info) >> 3) & 0xf)); 5544 else { 5545 len = is_64_bit_mode(vcpu) ? 
8 : 4; 5546 if (get_vmx_mem_address(vcpu, exit_qualification, 5547 instr_info, false, len, &gva)) 5548 return 1; 5549 r = kvm_read_guest_virt(vcpu, gva, &value, len, &e); 5550 if (r != X86EMUL_CONTINUE) 5551 return kvm_handle_memory_failure(vcpu, r, &e); 5552 } 5553 5554 field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf)); 5555 5556 offset = get_vmcs12_field_offset(field); 5557 if (offset < 0) 5558 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 5559 5560 /* 5561 * If the vCPU supports "VMWRITE to any supported field in the 5562 * VMCS," then the "read-only" fields are actually read/write. 5563 */ 5564 if (vmcs_field_readonly(field) && 5565 !nested_cpu_has_vmwrite_any_field(vcpu)) 5566 return nested_vmx_fail(vcpu, VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); 5567 5568 /* 5569 * Ensure vmcs12 is up-to-date before any VMWRITE that dirties 5570 * vmcs12, else we may crush a field or consume a stale value. 5571 */ 5572 if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) 5573 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 5574 5575 /* 5576 * Some Intel CPUs intentionally drop the reserved bits of the AR byte 5577 * fields on VMWRITE. Emulate this behavior to ensure consistent KVM 5578 * behavior regardless of the underlying hardware, e.g. if an AR_BYTE 5579 * field is intercepted for VMWRITE but not VMREAD (in L1), then VMREAD 5580 * from L1 will return a different value than VMREAD from L2 (L1 sees 5581 * the stripped down value, L2 sees the full value as stored by KVM). 5582 */ 5583 if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES) 5584 value &= 0x1f0ff; 5585 5586 vmcs12_write_any(vmcs12, field, offset, value); 5587 5588 /* 5589 * Do not track vmcs12 dirty-state if in guest-mode as we actually 5590 * dirty shadow vmcs12 instead of vmcs12. Fields that can be updated 5591 * by L1 without a vmexit are always updated in the vmcs02, i.e. don't 5592 * "dirty" vmcs12, all others go down the prepare_vmcs02() slow path. 5593 */ 5594 if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) { 5595 /* 5596 * L1 can read these fields without exiting, ensure the 5597 * shadow VMCS is up-to-date. 
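 * This is done below by temporarily loading vmcs01's shadow VMCS, with
 * preemption disabled so the current VMCS cannot be switched out from
 * under us, writing the field via __vmcs_writel(), then VMCLEARing the
 * shadow VMCS and re-loading the original VMCS.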
5598 */ 5599 if (enable_shadow_vmcs && is_shadow_field_ro(field)) { 5600 preempt_disable(); 5601 vmcs_load(vmx->vmcs01.shadow_vmcs); 5602 5603 __vmcs_writel(field, value); 5604 5605 vmcs_clear(vmx->vmcs01.shadow_vmcs); 5606 vmcs_load(vmx->loaded_vmcs->vmcs); 5607 preempt_enable(); 5608 } 5609 vmx->nested.dirty_vmcs12 = true; 5610 } 5611 5612 return nested_vmx_succeed(vcpu); 5613 } 5614 5615 static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr) 5616 { 5617 vmx->nested.current_vmptr = vmptr; 5618 if (enable_shadow_vmcs) { 5619 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_SHADOW_VMCS); 5620 vmcs_write64(VMCS_LINK_POINTER, 5621 __pa(vmx->vmcs01.shadow_vmcs)); 5622 vmx->nested.need_vmcs12_to_shadow_sync = true; 5623 } 5624 vmx->nested.dirty_vmcs12 = true; 5625 vmx->nested.force_msr_bitmap_recalc = true; 5626 } 5627 5628 /* Emulate the VMPTRLD instruction */ 5629 static int handle_vmptrld(struct kvm_vcpu *vcpu) 5630 { 5631 struct vcpu_vmx *vmx = to_vmx(vcpu); 5632 gpa_t vmptr; 5633 int r; 5634 5635 if (!nested_vmx_check_permission(vcpu)) 5636 return 1; 5637 5638 if (nested_vmx_get_vmptr(vcpu, &vmptr, &r)) 5639 return r; 5640 5641 if (!page_address_valid(vcpu, vmptr)) 5642 return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS); 5643 5644 if (vmptr == vmx->nested.vmxon_ptr) 5645 return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_VMXON_POINTER); 5646 5647 /* Forbid normal VMPTRLD if Enlightened version was used */ 5648 if (nested_vmx_is_evmptr12_valid(vmx)) 5649 return 1; 5650 5651 if (vmx->nested.current_vmptr != vmptr) { 5652 struct gfn_to_hva_cache *ghc = &vmx->nested.vmcs12_cache; 5653 struct vmcs_hdr hdr; 5654 5655 if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, vmptr, VMCS12_SIZE)) { 5656 /* 5657 * Reads from an unbacked page return all 1s, 5658 * which means that the 32 bits located at the 5659 * given physical address won't match the required 5660 * VMCS12_REVISION identifier. 5661 */ 5662 return nested_vmx_fail(vcpu, 5663 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5664 } 5665 5666 if (kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr, 5667 offsetof(struct vmcs12, hdr), 5668 sizeof(hdr))) { 5669 return nested_vmx_fail(vcpu, 5670 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5671 } 5672 5673 if (hdr.revision_id != VMCS12_REVISION || 5674 (hdr.shadow_vmcs && 5675 !nested_cpu_has_vmx_shadow_vmcs(vcpu))) { 5676 return nested_vmx_fail(vcpu, 5677 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5678 } 5679 5680 nested_release_vmcs12(vcpu); 5681 5682 /* 5683 * Load VMCS12 from guest memory since it is not already 5684 * cached. 
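 * (any previously cached VMCS12 was already flushed back to guest memory
 * by nested_release_vmcs12() above)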
5685 */ 5686 if (kvm_read_guest_cached(vcpu->kvm, ghc, vmx->nested.cached_vmcs12, 5687 VMCS12_SIZE)) { 5688 return nested_vmx_fail(vcpu, 5689 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5690 } 5691 5692 set_current_vmptr(vmx, vmptr); 5693 } 5694 5695 return nested_vmx_succeed(vcpu); 5696 } 5697 5698 /* Emulate the VMPTRST instruction */ 5699 static int handle_vmptrst(struct kvm_vcpu *vcpu) 5700 { 5701 unsigned long exit_qual = vmx_get_exit_qual(vcpu); 5702 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5703 gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr; 5704 struct x86_exception e; 5705 gva_t gva; 5706 int r; 5707 5708 if (!nested_vmx_check_permission(vcpu)) 5709 return 1; 5710 5711 if (unlikely(nested_vmx_is_evmptr12_valid(to_vmx(vcpu)))) 5712 return 1; 5713 5714 if (get_vmx_mem_address(vcpu, exit_qual, instr_info, 5715 true, sizeof(gpa_t), &gva)) 5716 return 1; 5717 /* *_system ok, nested_vmx_check_permission has verified cpl=0 */ 5718 r = kvm_write_guest_virt_system(vcpu, gva, (void *)¤t_vmptr, 5719 sizeof(gpa_t), &e); 5720 if (r != X86EMUL_CONTINUE) 5721 return kvm_handle_memory_failure(vcpu, r, &e); 5722 5723 return nested_vmx_succeed(vcpu); 5724 } 5725 5726 /* Emulate the INVEPT instruction */ 5727 static int handle_invept(struct kvm_vcpu *vcpu) 5728 { 5729 struct vcpu_vmx *vmx = to_vmx(vcpu); 5730 u32 vmx_instruction_info, types; 5731 unsigned long type, roots_to_free; 5732 struct kvm_mmu *mmu; 5733 gva_t gva; 5734 struct x86_exception e; 5735 struct { 5736 u64 eptp, gpa; 5737 } operand; 5738 int i, r, gpr_index; 5739 5740 if (!(vmx->nested.msrs.secondary_ctls_high & 5741 SECONDARY_EXEC_ENABLE_EPT) || 5742 !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) { 5743 kvm_queue_exception(vcpu, UD_VECTOR); 5744 return 1; 5745 } 5746 5747 if (!nested_vmx_check_permission(vcpu)) 5748 return 1; 5749 5750 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5751 gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); 5752 type = kvm_register_read(vcpu, gpr_index); 5753 5754 types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; 5755 5756 if (type >= 32 || !(types & (1 << type))) 5757 return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5758 5759 /* According to the Intel VMX instruction reference, the memory 5760 * operand is read even if it isn't needed (e.g., for type==global) 5761 */ 5762 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 5763 vmx_instruction_info, false, sizeof(operand), &gva)) 5764 return 1; 5765 r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); 5766 if (r != X86EMUL_CONTINUE) 5767 return kvm_handle_memory_failure(vcpu, r, &e); 5768 5769 /* 5770 * Nested EPT roots are always held through guest_mmu, 5771 * not root_mmu. 
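 * (contrast with handle_invvpid(), which, when EPT is disabled, frees
 * guest-mode roots on root_mmu because L1 and L2 then share an MMU)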
5772 */ 5773 mmu = &vcpu->arch.guest_mmu; 5774 5775 switch (type) { 5776 case VMX_EPT_EXTENT_CONTEXT: 5777 if (!nested_vmx_check_eptp(vcpu, operand.eptp)) 5778 return nested_vmx_fail(vcpu, 5779 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5780 5781 roots_to_free = 0; 5782 if (nested_ept_root_matches(mmu->root.hpa, mmu->root.pgd, 5783 operand.eptp)) 5784 roots_to_free |= KVM_MMU_ROOT_CURRENT; 5785 5786 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { 5787 if (nested_ept_root_matches(mmu->prev_roots[i].hpa, 5788 mmu->prev_roots[i].pgd, 5789 operand.eptp)) 5790 roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); 5791 } 5792 break; 5793 case VMX_EPT_EXTENT_GLOBAL: 5794 roots_to_free = KVM_MMU_ROOTS_ALL; 5795 break; 5796 default: 5797 BUG(); 5798 break; 5799 } 5800 5801 if (roots_to_free) 5802 kvm_mmu_free_roots(vcpu->kvm, mmu, roots_to_free); 5803 5804 return nested_vmx_succeed(vcpu); 5805 } 5806 5807 static int handle_invvpid(struct kvm_vcpu *vcpu) 5808 { 5809 struct vcpu_vmx *vmx = to_vmx(vcpu); 5810 u32 vmx_instruction_info; 5811 unsigned long type, types; 5812 gva_t gva; 5813 struct x86_exception e; 5814 struct { 5815 u64 vpid; 5816 u64 gla; 5817 } operand; 5818 u16 vpid02; 5819 int r, gpr_index; 5820 5821 if (!(vmx->nested.msrs.secondary_ctls_high & 5822 SECONDARY_EXEC_ENABLE_VPID) || 5823 !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) { 5824 kvm_queue_exception(vcpu, UD_VECTOR); 5825 return 1; 5826 } 5827 5828 if (!nested_vmx_check_permission(vcpu)) 5829 return 1; 5830 5831 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5832 gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); 5833 type = kvm_register_read(vcpu, gpr_index); 5834 5835 types = (vmx->nested.msrs.vpid_caps & 5836 VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8; 5837 5838 if (type >= 32 || !(types & (1 << type))) 5839 return nested_vmx_fail(vcpu, 5840 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5841 5842 /* according to the intel vmx instruction reference, the memory 5843 * operand is read even if it isn't needed (e.g., for type==global) 5844 */ 5845 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 5846 vmx_instruction_info, false, sizeof(operand), &gva)) 5847 return 1; 5848 r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); 5849 if (r != X86EMUL_CONTINUE) 5850 return kvm_handle_memory_failure(vcpu, r, &e); 5851 5852 if (operand.vpid >> 16) 5853 return nested_vmx_fail(vcpu, 5854 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5855 5856 vpid02 = nested_get_vpid02(vcpu); 5857 switch (type) { 5858 case VMX_VPID_EXTENT_INDIVIDUAL_ADDR: 5859 /* 5860 * LAM doesn't apply to addresses that are inputs to TLB 5861 * invalidation. 5862 */ 5863 if (!operand.vpid || 5864 is_noncanonical_address(operand.gla, vcpu)) 5865 return nested_vmx_fail(vcpu, 5866 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5867 vpid_sync_vcpu_addr(vpid02, operand.gla); 5868 break; 5869 case VMX_VPID_EXTENT_SINGLE_CONTEXT: 5870 case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL: 5871 if (!operand.vpid) 5872 return nested_vmx_fail(vcpu, 5873 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5874 vpid_sync_context(vpid02); 5875 break; 5876 case VMX_VPID_EXTENT_ALL_CONTEXT: 5877 vpid_sync_context(vpid02); 5878 break; 5879 default: 5880 WARN_ON_ONCE(1); 5881 return kvm_skip_emulated_instruction(vcpu); 5882 } 5883 5884 /* 5885 * Sync the shadow page tables if EPT is disabled, L1 is invalidating 5886 * linear mappings for L2 (tagged with L2's VPID). Free all guest 5887 * roots as VPIDs are not tracked in the MMU role. 
5888 * 5889 * Note, this operates on root_mmu, not guest_mmu, as L1 and L2 share 5890 * an MMU when EPT is disabled. 5891 * 5892 * TODO: sync only the affected SPTEs for INVDIVIDUAL_ADDR. 5893 */ 5894 if (!enable_ept) 5895 kvm_mmu_free_guest_mode_roots(vcpu->kvm, &vcpu->arch.root_mmu); 5896 5897 return nested_vmx_succeed(vcpu); 5898 } 5899 5900 static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu, 5901 struct vmcs12 *vmcs12) 5902 { 5903 u32 index = kvm_rcx_read(vcpu); 5904 u64 new_eptp; 5905 5906 if (WARN_ON_ONCE(!nested_cpu_has_ept(vmcs12))) 5907 return 1; 5908 if (index >= VMFUNC_EPTP_ENTRIES) 5909 return 1; 5910 5911 if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT, 5912 &new_eptp, index * 8, 8)) 5913 return 1; 5914 5915 /* 5916 * If the (L2) guest does a vmfunc to the currently 5917 * active ept pointer, we don't have to do anything else 5918 */ 5919 if (vmcs12->ept_pointer != new_eptp) { 5920 if (!nested_vmx_check_eptp(vcpu, new_eptp)) 5921 return 1; 5922 5923 vmcs12->ept_pointer = new_eptp; 5924 nested_ept_new_eptp(vcpu); 5925 5926 if (!nested_cpu_has_vpid(vmcs12)) 5927 kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); 5928 } 5929 5930 return 0; 5931 } 5932 5933 static int handle_vmfunc(struct kvm_vcpu *vcpu) 5934 { 5935 struct vcpu_vmx *vmx = to_vmx(vcpu); 5936 struct vmcs12 *vmcs12; 5937 u32 function = kvm_rax_read(vcpu); 5938 5939 /* 5940 * VMFUNC should never execute cleanly while L1 is active; KVM supports 5941 * VMFUNC for nested VMs, but not for L1. 5942 */ 5943 if (WARN_ON_ONCE(!is_guest_mode(vcpu))) { 5944 kvm_queue_exception(vcpu, UD_VECTOR); 5945 return 1; 5946 } 5947 5948 vmcs12 = get_vmcs12(vcpu); 5949 5950 /* 5951 * #UD on out-of-bounds function has priority over VM-Exit, and VMFUNC 5952 * is enabled in vmcs02 if and only if it's enabled in vmcs12. 5953 */ 5954 if (WARN_ON_ONCE((function > 63) || !nested_cpu_has_vmfunc(vmcs12))) { 5955 kvm_queue_exception(vcpu, UD_VECTOR); 5956 return 1; 5957 } 5958 5959 if (!(vmcs12->vm_function_control & BIT_ULL(function))) 5960 goto fail; 5961 5962 switch (function) { 5963 case 0: 5964 if (nested_vmx_eptp_switching(vcpu, vmcs12)) 5965 goto fail; 5966 break; 5967 default: 5968 goto fail; 5969 } 5970 return kvm_skip_emulated_instruction(vcpu); 5971 5972 fail: 5973 /* 5974 * This is effectively a reflected VM-Exit, as opposed to a synthesized 5975 * nested VM-Exit. Pass the original exit reason, i.e. don't hardcode 5976 * EXIT_REASON_VMFUNC as the exit reason. 5977 */ 5978 nested_vmx_vmexit(vcpu, vmx->exit_reason.full, 5979 vmx_get_intr_info(vcpu), 5980 vmx_get_exit_qual(vcpu)); 5981 return 1; 5982 } 5983 5984 /* 5985 * Return true if an IO instruction with the specified port and size should cause 5986 * a VM-exit into L1. 
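 * Ports 0x0000-0x7fff are covered by io_bitmap_a and ports 0x8000-0xffff
 * by io_bitmap_b, one bit per port. E.g. port 0x3f8 corresponds to bit 0
 * of byte 0x7f in io_bitmap_a (0x3f8 / 8 = 0x7f, 0x3f8 & 7 = 0).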
5987 */ 5988 bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port, 5989 int size) 5990 { 5991 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 5992 gpa_t bitmap, last_bitmap; 5993 u8 b; 5994 5995 last_bitmap = INVALID_GPA; 5996 b = -1; 5997 5998 while (size > 0) { 5999 if (port < 0x8000) 6000 bitmap = vmcs12->io_bitmap_a; 6001 else if (port < 0x10000) 6002 bitmap = vmcs12->io_bitmap_b; 6003 else 6004 return true; 6005 bitmap += (port & 0x7fff) / 8; 6006 6007 if (last_bitmap != bitmap) 6008 if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1)) 6009 return true; 6010 if (b & (1 << (port & 7))) 6011 return true; 6012 6013 port++; 6014 size--; 6015 last_bitmap = bitmap; 6016 } 6017 6018 return false; 6019 } 6020 6021 static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, 6022 struct vmcs12 *vmcs12) 6023 { 6024 unsigned long exit_qualification; 6025 unsigned short port; 6026 int size; 6027 6028 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) 6029 return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); 6030 6031 exit_qualification = vmx_get_exit_qual(vcpu); 6032 6033 port = exit_qualification >> 16; 6034 size = (exit_qualification & 7) + 1; 6035 6036 return nested_vmx_check_io_bitmaps(vcpu, port, size); 6037 } 6038 6039 /* 6040 * Return 1 if we should exit from L2 to L1 to handle an MSR access, 6041 * rather than handle it ourselves in L0. I.e., check whether L1 expressed 6042 * disinterest in the current event (read or write a specific MSR) by using an 6043 * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps. 6044 */ 6045 static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, 6046 struct vmcs12 *vmcs12, 6047 union vmx_exit_reason exit_reason) 6048 { 6049 u32 msr_index = kvm_rcx_read(vcpu); 6050 gpa_t bitmap; 6051 6052 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) 6053 return true; 6054 6055 /* 6056 * The MSR_BITMAP page is divided into four 1024-byte bitmaps, 6057 * for the four combinations of read/write and low/high MSR numbers. 6058 * First we need to figure out which of the four to use: 6059 */ 6060 bitmap = vmcs12->msr_bitmap; 6061 if (exit_reason.basic == EXIT_REASON_MSR_WRITE) 6062 bitmap += 2048; 6063 if (msr_index >= 0xc0000000) { 6064 msr_index -= 0xc0000000; 6065 bitmap += 1024; 6066 } 6067 6068 /* Then read the msr_index'th bit from this bitmap: */ 6069 if (msr_index < 1024*8) { 6070 unsigned char b; 6071 if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1)) 6072 return true; 6073 return 1 & (b >> (msr_index & 7)); 6074 } else 6075 return true; /* let L1 handle the wrong parameter */ 6076 } 6077 6078 /* 6079 * Return 1 if we should exit from L2 to L1 to handle a CR access exit, 6080 * rather than handle it ourselves in L0. I.e., check if L1 wanted to 6081 * intercept (via guest_host_mask etc.) the current event. 
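 * E.g. for MOV to CR0, the access is forwarded to L1 only if it would
 * change a bit that L1 owns, i.e. if cr0_guest_host_mask & (new value ^
 * cr0_read_shadow) is non-zero.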
6082 */ 6083 static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, 6084 struct vmcs12 *vmcs12) 6085 { 6086 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 6087 int cr = exit_qualification & 15; 6088 int reg; 6089 unsigned long val; 6090 6091 switch ((exit_qualification >> 4) & 3) { 6092 case 0: /* mov to cr */ 6093 reg = (exit_qualification >> 8) & 15; 6094 val = kvm_register_read(vcpu, reg); 6095 switch (cr) { 6096 case 0: 6097 if (vmcs12->cr0_guest_host_mask & 6098 (val ^ vmcs12->cr0_read_shadow)) 6099 return true; 6100 break; 6101 case 3: 6102 if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING)) 6103 return true; 6104 break; 6105 case 4: 6106 if (vmcs12->cr4_guest_host_mask & 6107 (vmcs12->cr4_read_shadow ^ val)) 6108 return true; 6109 break; 6110 case 8: 6111 if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING)) 6112 return true; 6113 break; 6114 } 6115 break; 6116 case 2: /* clts */ 6117 if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) && 6118 (vmcs12->cr0_read_shadow & X86_CR0_TS)) 6119 return true; 6120 break; 6121 case 1: /* mov from cr */ 6122 switch (cr) { 6123 case 3: 6124 if (vmcs12->cpu_based_vm_exec_control & 6125 CPU_BASED_CR3_STORE_EXITING) 6126 return true; 6127 break; 6128 case 8: 6129 if (vmcs12->cpu_based_vm_exec_control & 6130 CPU_BASED_CR8_STORE_EXITING) 6131 return true; 6132 break; 6133 } 6134 break; 6135 case 3: /* lmsw */ 6136 /* 6137 * lmsw can change bits 1..3 of cr0, and only set bit 0 of 6138 * cr0. Other attempted changes are ignored, with no exit. 6139 */ 6140 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; 6141 if (vmcs12->cr0_guest_host_mask & 0xe & 6142 (val ^ vmcs12->cr0_read_shadow)) 6143 return true; 6144 if ((vmcs12->cr0_guest_host_mask & 0x1) && 6145 !(vmcs12->cr0_read_shadow & 0x1) && 6146 (val & 0x1)) 6147 return true; 6148 break; 6149 } 6150 return false; 6151 } 6152 6153 static bool nested_vmx_exit_handled_encls(struct kvm_vcpu *vcpu, 6154 struct vmcs12 *vmcs12) 6155 { 6156 u32 encls_leaf; 6157 6158 if (!guest_cpuid_has(vcpu, X86_FEATURE_SGX) || 6159 !nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING)) 6160 return false; 6161 6162 encls_leaf = kvm_rax_read(vcpu); 6163 if (encls_leaf > 62) 6164 encls_leaf = 63; 6165 return vmcs12->encls_exiting_bitmap & BIT_ULL(encls_leaf); 6166 } 6167 6168 static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu, 6169 struct vmcs12 *vmcs12, gpa_t bitmap) 6170 { 6171 u32 vmx_instruction_info; 6172 unsigned long field; 6173 u8 b; 6174 6175 if (!nested_cpu_has_shadow_vmcs(vmcs12)) 6176 return true; 6177 6178 /* Decode instruction info and find the field to access */ 6179 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 6180 field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); 6181 6182 /* Out-of-range fields always cause a VM exit from L2 to L1 */ 6183 if (field >> 15) 6184 return true; 6185 6186 if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1)) 6187 return true; 6188 6189 return 1 & (b >> (field & 7)); 6190 } 6191 6192 static bool nested_vmx_exit_handled_mtf(struct vmcs12 *vmcs12) 6193 { 6194 u32 entry_intr_info = vmcs12->vm_entry_intr_info_field; 6195 6196 if (nested_cpu_has_mtf(vmcs12)) 6197 return true; 6198 6199 /* 6200 * An MTF VM-exit may be injected into the guest by setting the 6201 * interruption-type to 7 (other event) and the vector field to 0. Such 6202 * is the case regardless of the 'monitor trap flag' VM-execution 6203 * control. 
6204 */ 6205 return entry_intr_info == (INTR_INFO_VALID_MASK 6206 | INTR_TYPE_OTHER_EVENT); 6207 } 6208 6209 /* 6210 * Return true if L0 wants to handle an exit from L2 regardless of whether or not 6211 * L1 wants the exit. Only call this when in is_guest_mode (L2). 6212 */ 6213 static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, 6214 union vmx_exit_reason exit_reason) 6215 { 6216 u32 intr_info; 6217 6218 switch ((u16)exit_reason.basic) { 6219 case EXIT_REASON_EXCEPTION_NMI: 6220 intr_info = vmx_get_intr_info(vcpu); 6221 if (is_nmi(intr_info)) 6222 return true; 6223 else if (is_page_fault(intr_info)) 6224 return vcpu->arch.apf.host_apf_flags || 6225 vmx_need_pf_intercept(vcpu); 6226 else if (is_debug(intr_info) && 6227 vcpu->guest_debug & 6228 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) 6229 return true; 6230 else if (is_breakpoint(intr_info) && 6231 vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) 6232 return true; 6233 else if (is_alignment_check(intr_info) && 6234 !vmx_guest_inject_ac(vcpu)) 6235 return true; 6236 else if (is_ve_fault(intr_info)) 6237 return true; 6238 return false; 6239 case EXIT_REASON_EXTERNAL_INTERRUPT: 6240 return true; 6241 case EXIT_REASON_MCE_DURING_VMENTRY: 6242 return true; 6243 case EXIT_REASON_EPT_VIOLATION: 6244 /* 6245 * L0 always deals with the EPT violation. If nested EPT is 6246 * used, and the nested mmu code discovers that the address is 6247 * missing in the guest EPT table (EPT12), the EPT violation 6248 * will be injected with nested_ept_inject_page_fault() 6249 */ 6250 return true; 6251 case EXIT_REASON_EPT_MISCONFIG: 6252 /* 6253 * L2 never uses directly L1's EPT, but rather L0's own EPT 6254 * table (shadow on EPT) or a merged EPT table that L0 built 6255 * (EPT on EPT). So any problems with the structure of the 6256 * table is L0's fault. 6257 */ 6258 return true; 6259 case EXIT_REASON_PREEMPTION_TIMER: 6260 return true; 6261 case EXIT_REASON_PML_FULL: 6262 /* 6263 * PML is emulated for an L1 VMM and should never be enabled in 6264 * vmcs02, always "handle" PML_FULL by exiting to userspace. 6265 */ 6266 return true; 6267 case EXIT_REASON_VMFUNC: 6268 /* VM functions are emulated through L2->L0 vmexits. */ 6269 return true; 6270 case EXIT_REASON_BUS_LOCK: 6271 /* 6272 * At present, bus lock VM exit is never exposed to L1. 6273 * Handle L2's bus locks in L0 directly. 6274 */ 6275 return true; 6276 #ifdef CONFIG_KVM_HYPERV 6277 case EXIT_REASON_VMCALL: 6278 /* Hyper-V L2 TLB flush hypercall is handled by L0 */ 6279 return guest_hv_cpuid_has_l2_tlb_flush(vcpu) && 6280 nested_evmcs_l2_tlb_flush_enabled(vcpu) && 6281 kvm_hv_is_tlb_flush_hcall(vcpu); 6282 #endif 6283 default: 6284 break; 6285 } 6286 return false; 6287 } 6288 6289 /* 6290 * Return 1 if L1 wants to intercept an exit from L2. Only call this when in 6291 * is_guest_mode (L2). 
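 * Note, nested_vmx_reflect_vmexit() consults this only after
 * nested_vmx_l0_wants_exit() has declined the exit, i.e. L0's wishes
 * trump L1's.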
6292 */ 6293 static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, 6294 union vmx_exit_reason exit_reason) 6295 { 6296 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 6297 u32 intr_info; 6298 6299 switch ((u16)exit_reason.basic) { 6300 case EXIT_REASON_EXCEPTION_NMI: 6301 intr_info = vmx_get_intr_info(vcpu); 6302 if (is_nmi(intr_info)) 6303 return true; 6304 else if (is_page_fault(intr_info)) 6305 return true; 6306 return vmcs12->exception_bitmap & 6307 (1u << (intr_info & INTR_INFO_VECTOR_MASK)); 6308 case EXIT_REASON_EXTERNAL_INTERRUPT: 6309 return nested_exit_on_intr(vcpu); 6310 case EXIT_REASON_TRIPLE_FAULT: 6311 return true; 6312 case EXIT_REASON_INTERRUPT_WINDOW: 6313 return nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING); 6314 case EXIT_REASON_NMI_WINDOW: 6315 return nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING); 6316 case EXIT_REASON_TASK_SWITCH: 6317 return true; 6318 case EXIT_REASON_CPUID: 6319 return true; 6320 case EXIT_REASON_HLT: 6321 return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING); 6322 case EXIT_REASON_INVD: 6323 return true; 6324 case EXIT_REASON_INVLPG: 6325 return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); 6326 case EXIT_REASON_RDPMC: 6327 return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING); 6328 case EXIT_REASON_RDRAND: 6329 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING); 6330 case EXIT_REASON_RDSEED: 6331 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING); 6332 case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP: 6333 return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING); 6334 case EXIT_REASON_VMREAD: 6335 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, 6336 vmcs12->vmread_bitmap); 6337 case EXIT_REASON_VMWRITE: 6338 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, 6339 vmcs12->vmwrite_bitmap); 6340 case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: 6341 case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD: 6342 case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME: 6343 case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: 6344 case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID: 6345 /* 6346 * VMX instructions trap unconditionally. This allows L1 to 6347 * emulate them for its L2 guest, i.e., allows 3-level nesting! 
6348 */ 6349 return true; 6350 case EXIT_REASON_CR_ACCESS: 6351 return nested_vmx_exit_handled_cr(vcpu, vmcs12); 6352 case EXIT_REASON_DR_ACCESS: 6353 return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING); 6354 case EXIT_REASON_IO_INSTRUCTION: 6355 return nested_vmx_exit_handled_io(vcpu, vmcs12); 6356 case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR: 6357 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC); 6358 case EXIT_REASON_MSR_READ: 6359 case EXIT_REASON_MSR_WRITE: 6360 return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason); 6361 case EXIT_REASON_INVALID_STATE: 6362 return true; 6363 case EXIT_REASON_MWAIT_INSTRUCTION: 6364 return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING); 6365 case EXIT_REASON_MONITOR_TRAP_FLAG: 6366 return nested_vmx_exit_handled_mtf(vmcs12); 6367 case EXIT_REASON_MONITOR_INSTRUCTION: 6368 return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING); 6369 case EXIT_REASON_PAUSE_INSTRUCTION: 6370 return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) || 6371 nested_cpu_has2(vmcs12, 6372 SECONDARY_EXEC_PAUSE_LOOP_EXITING); 6373 case EXIT_REASON_MCE_DURING_VMENTRY: 6374 return true; 6375 case EXIT_REASON_TPR_BELOW_THRESHOLD: 6376 return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW); 6377 case EXIT_REASON_APIC_ACCESS: 6378 case EXIT_REASON_APIC_WRITE: 6379 case EXIT_REASON_EOI_INDUCED: 6380 /* 6381 * The controls for "virtualize APIC accesses," "APIC- 6382 * register virtualization," and "virtual-interrupt 6383 * delivery" only come from vmcs12. 6384 */ 6385 return true; 6386 case EXIT_REASON_INVPCID: 6387 return 6388 nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) && 6389 nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); 6390 case EXIT_REASON_WBINVD: 6391 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); 6392 case EXIT_REASON_XSETBV: 6393 return true; 6394 case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS: 6395 /* 6396 * This should never happen, since it is not possible to 6397 * set XSS to a non-zero value---neither in L1 nor in L2. 6398 * If it were, XSS would have to be checked against 6399 * the XSS exit bitmap in vmcs12. 6400 */ 6401 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_XSAVES); 6402 case EXIT_REASON_UMWAIT: 6403 case EXIT_REASON_TPAUSE: 6404 return nested_cpu_has2(vmcs12, 6405 SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE); 6406 case EXIT_REASON_ENCLS: 6407 return nested_vmx_exit_handled_encls(vcpu, vmcs12); 6408 case EXIT_REASON_NOTIFY: 6409 /* Notify VM exit is not exposed to L1 */ 6410 return false; 6411 default: 6412 return true; 6413 } 6414 } 6415 6416 /* 6417 * Conditionally reflect a VM-Exit into L1. Returns %true if the VM-Exit was 6418 * reflected into L1. 6419 */ 6420 bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu) 6421 { 6422 struct vcpu_vmx *vmx = to_vmx(vcpu); 6423 union vmx_exit_reason exit_reason = vmx->exit_reason; 6424 unsigned long exit_qual; 6425 u32 exit_intr_info; 6426 6427 WARN_ON_ONCE(vmx->nested.nested_run_pending); 6428 6429 /* 6430 * Late nested VM-Fail shares the same flow as nested VM-Exit since KVM 6431 * has already loaded L2's state. 6432 */ 6433 if (unlikely(vmx->fail)) { 6434 trace_kvm_nested_vmenter_failed( 6435 "hardware VM-instruction error: ", 6436 vmcs_read32(VM_INSTRUCTION_ERROR)); 6437 exit_intr_info = 0; 6438 exit_qual = 0; 6439 goto reflect_vmexit; 6440 } 6441 6442 trace_kvm_nested_vmexit(vcpu, KVM_ISA_VMX); 6443 6444 /* If L0 (KVM) wants the exit, it trumps L1's desires.
*/ 6445 if (nested_vmx_l0_wants_exit(vcpu, exit_reason)) 6446 return false; 6447 6448 /* If L1 doesn't want the exit, handle it in L0. */ 6449 if (!nested_vmx_l1_wants_exit(vcpu, exit_reason)) 6450 return false; 6451 6452 /* 6453 * vmcs.VM_EXIT_INTR_INFO is only valid for EXCEPTION_NMI exits. For 6454 * EXTERNAL_INTERRUPT, the value for vmcs12->vm_exit_intr_info would 6455 * need to be synthesized by querying the in-kernel LAPIC, but external 6456 * interrupts are never reflected to L1 so it's a non-issue. 6457 */ 6458 exit_intr_info = vmx_get_intr_info(vcpu); 6459 if (is_exception_with_error_code(exit_intr_info)) { 6460 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 6461 6462 vmcs12->vm_exit_intr_error_code = 6463 vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 6464 } 6465 exit_qual = vmx_get_exit_qual(vcpu); 6466 6467 reflect_vmexit: 6468 nested_vmx_vmexit(vcpu, exit_reason.full, exit_intr_info, exit_qual); 6469 return true; 6470 } 6471 6472 static int vmx_get_nested_state(struct kvm_vcpu *vcpu, 6473 struct kvm_nested_state __user *user_kvm_nested_state, 6474 u32 user_data_size) 6475 { 6476 struct vcpu_vmx *vmx; 6477 struct vmcs12 *vmcs12; 6478 struct kvm_nested_state kvm_state = { 6479 .flags = 0, 6480 .format = KVM_STATE_NESTED_FORMAT_VMX, 6481 .size = sizeof(kvm_state), 6482 .hdr.vmx.flags = 0, 6483 .hdr.vmx.vmxon_pa = INVALID_GPA, 6484 .hdr.vmx.vmcs12_pa = INVALID_GPA, 6485 .hdr.vmx.preemption_timer_deadline = 0, 6486 }; 6487 struct kvm_vmx_nested_state_data __user *user_vmx_nested_state = 6488 &user_kvm_nested_state->data.vmx[0]; 6489 6490 if (!vcpu) 6491 return kvm_state.size + sizeof(*user_vmx_nested_state); 6492 6493 vmx = to_vmx(vcpu); 6494 vmcs12 = get_vmcs12(vcpu); 6495 6496 if (guest_can_use(vcpu, X86_FEATURE_VMX) && 6497 (vmx->nested.vmxon || vmx->nested.smm.vmxon)) { 6498 kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr; 6499 kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr; 6500 6501 if (vmx_has_valid_vmcs12(vcpu)) { 6502 kvm_state.size += sizeof(user_vmx_nested_state->vmcs12); 6503 6504 /* 'hv_evmcs_vmptr' can also be EVMPTR_MAP_PENDING here */ 6505 if (nested_vmx_is_evmptr12_set(vmx)) 6506 kvm_state.flags |= KVM_STATE_NESTED_EVMCS; 6507 6508 if (is_guest_mode(vcpu) && 6509 nested_cpu_has_shadow_vmcs(vmcs12) && 6510 vmcs12->vmcs_link_pointer != INVALID_GPA) 6511 kvm_state.size += sizeof(user_vmx_nested_state->shadow_vmcs12); 6512 } 6513 6514 if (vmx->nested.smm.vmxon) 6515 kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON; 6516 6517 if (vmx->nested.smm.guest_mode) 6518 kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE; 6519 6520 if (is_guest_mode(vcpu)) { 6521 kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE; 6522 6523 if (vmx->nested.nested_run_pending) 6524 kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING; 6525 6526 if (vmx->nested.mtf_pending) 6527 kvm_state.flags |= KVM_STATE_NESTED_MTF_PENDING; 6528 6529 if (nested_cpu_has_preemption_timer(vmcs12) && 6530 vmx->nested.has_preemption_timer_deadline) { 6531 kvm_state.hdr.vmx.flags |= 6532 KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE; 6533 kvm_state.hdr.vmx.preemption_timer_deadline = 6534 vmx->nested.preemption_timer_deadline; 6535 } 6536 } 6537 } 6538 6539 if (user_data_size < kvm_state.size) 6540 goto out; 6541 6542 if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state))) 6543 return -EFAULT; 6544 6545 if (!vmx_has_valid_vmcs12(vcpu)) 6546 goto out; 6547 6548 /* 6549 * When running L2, the authoritative vmcs12 state is in the 6550 * vmcs02. 
When running L1, the authoritative vmcs12 state is 6551 * in the shadow or enlightened vmcs linked to vmcs01, unless 6552 * need_vmcs12_to_shadow_sync is set, in which case, the authoritative 6553 * vmcs12 state is in the vmcs12 already. 6554 */ 6555 if (is_guest_mode(vcpu)) { 6556 sync_vmcs02_to_vmcs12(vcpu, vmcs12); 6557 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 6558 } else { 6559 copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu)); 6560 if (!vmx->nested.need_vmcs12_to_shadow_sync) { 6561 if (nested_vmx_is_evmptr12_valid(vmx)) 6562 /* 6563 * L1 hypervisor is not obliged to keep eVMCS 6564 * clean fields data always up-to-date while 6565 * not in guest mode, 'hv_clean_fields' is only 6566 * supposed to be actual upon vmentry so we need 6567 * to ignore it here and do full copy. 6568 */ 6569 copy_enlightened_to_vmcs12(vmx, 0); 6570 else if (enable_shadow_vmcs) 6571 copy_shadow_to_vmcs12(vmx); 6572 } 6573 } 6574 6575 BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE); 6576 BUILD_BUG_ON(sizeof(user_vmx_nested_state->shadow_vmcs12) < VMCS12_SIZE); 6577 6578 /* 6579 * Copy over the full allocated size of vmcs12 rather than just the size 6580 * of the struct. 6581 */ 6582 if (copy_to_user(user_vmx_nested_state->vmcs12, vmcs12, VMCS12_SIZE)) 6583 return -EFAULT; 6584 6585 if (nested_cpu_has_shadow_vmcs(vmcs12) && 6586 vmcs12->vmcs_link_pointer != INVALID_GPA) { 6587 if (copy_to_user(user_vmx_nested_state->shadow_vmcs12, 6588 get_shadow_vmcs12(vcpu), VMCS12_SIZE)) 6589 return -EFAULT; 6590 } 6591 out: 6592 return kvm_state.size; 6593 } 6594 6595 void vmx_leave_nested(struct kvm_vcpu *vcpu) 6596 { 6597 if (is_guest_mode(vcpu)) { 6598 to_vmx(vcpu)->nested.nested_run_pending = 0; 6599 nested_vmx_vmexit(vcpu, -1, 0, 0); 6600 } 6601 free_nested(vcpu); 6602 } 6603 6604 static int vmx_set_nested_state(struct kvm_vcpu *vcpu, 6605 struct kvm_nested_state __user *user_kvm_nested_state, 6606 struct kvm_nested_state *kvm_state) 6607 { 6608 struct vcpu_vmx *vmx = to_vmx(vcpu); 6609 struct vmcs12 *vmcs12; 6610 enum vm_entry_failure_code ignored; 6611 struct kvm_vmx_nested_state_data __user *user_vmx_nested_state = 6612 &user_kvm_nested_state->data.vmx[0]; 6613 int ret; 6614 6615 if (kvm_state->format != KVM_STATE_NESTED_FORMAT_VMX) 6616 return -EINVAL; 6617 6618 if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA) { 6619 if (kvm_state->hdr.vmx.smm.flags) 6620 return -EINVAL; 6621 6622 if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA) 6623 return -EINVAL; 6624 6625 /* 6626 * KVM_STATE_NESTED_EVMCS used to signal that KVM should 6627 * enable eVMCS capability on vCPU. However, since then 6628 * code was changed such that flag signals vmcs12 should 6629 * be copied into eVMCS in guest memory. 6630 * 6631 * To preserve backwards compatibility, allow user 6632 * to set this flag even when there is no VMXON region. 
6633 */ 6634 if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS) 6635 return -EINVAL; 6636 } else { 6637 if (!guest_can_use(vcpu, X86_FEATURE_VMX)) 6638 return -EINVAL; 6639 6640 if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa)) 6641 return -EINVAL; 6642 } 6643 6644 if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && 6645 (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) 6646 return -EINVAL; 6647 6648 if (kvm_state->hdr.vmx.smm.flags & 6649 ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON)) 6650 return -EINVAL; 6651 6652 if (kvm_state->hdr.vmx.flags & ~KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) 6653 return -EINVAL; 6654 6655 /* 6656 * SMM temporarily disables VMX, so we cannot be in guest mode, 6657 * nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags 6658 * must be zero. 6659 */ 6660 if (is_smm(vcpu) ? 6661 (kvm_state->flags & 6662 (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING)) 6663 : kvm_state->hdr.vmx.smm.flags) 6664 return -EINVAL; 6665 6666 if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && 6667 !(kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON)) 6668 return -EINVAL; 6669 6670 if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) && 6671 (!guest_can_use(vcpu, X86_FEATURE_VMX) || 6672 !vmx->nested.enlightened_vmcs_enabled)) 6673 return -EINVAL; 6674 6675 vmx_leave_nested(vcpu); 6676 6677 if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA) 6678 return 0; 6679 6680 vmx->nested.vmxon_ptr = kvm_state->hdr.vmx.vmxon_pa; 6681 ret = enter_vmx_operation(vcpu); 6682 if (ret) 6683 return ret; 6684 6685 /* Empty 'VMXON' state is permitted if no VMCS loaded */ 6686 if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12)) { 6687 /* See vmx_has_valid_vmcs12. */ 6688 if ((kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE) || 6689 (kvm_state->flags & KVM_STATE_NESTED_EVMCS) || 6690 (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA)) 6691 return -EINVAL; 6692 else 6693 return 0; 6694 } 6695 6696 if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA) { 6697 if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa || 6698 !page_address_valid(vcpu, kvm_state->hdr.vmx.vmcs12_pa)) 6699 return -EINVAL; 6700 6701 set_current_vmptr(vmx, kvm_state->hdr.vmx.vmcs12_pa); 6702 #ifdef CONFIG_KVM_HYPERV 6703 } else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) { 6704 /* 6705 * nested_vmx_handle_enlightened_vmptrld() cannot be called 6706 * directly from here as HV_X64_MSR_VP_ASSIST_PAGE may not be 6707 * restored yet. EVMCS will be mapped from 6708 * nested_get_vmcs12_pages(). 
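 * (triggered by the KVM_REQ_GET_NESTED_STATE_PAGES request made just
 * below)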
		 */
		vmx->nested.hv_evmcs_vmptr = EVMPTR_MAP_PENDING;
		kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
#endif
	} else {
		return -EINVAL;
	}

	if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) {
		vmx->nested.smm.vmxon = true;
		vmx->nested.vmxon = false;

		if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE)
			vmx->nested.smm.guest_mode = true;
	}

	vmcs12 = get_vmcs12(vcpu);
	if (copy_from_user(vmcs12, user_vmx_nested_state->vmcs12, sizeof(*vmcs12)))
		return -EFAULT;

	if (vmcs12->hdr.revision_id != VMCS12_REVISION)
		return -EINVAL;

	if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
		return 0;

	vmx->nested.nested_run_pending =
		!!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);

	vmx->nested.mtf_pending =
		!!(kvm_state->flags & KVM_STATE_NESTED_MTF_PENDING);

	ret = -EINVAL;
	if (nested_cpu_has_shadow_vmcs(vmcs12) &&
	    vmcs12->vmcs_link_pointer != INVALID_GPA) {
		struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu);

		if (kvm_state->size <
		    sizeof(*kvm_state) +
		    sizeof(user_vmx_nested_state->vmcs12) + sizeof(*shadow_vmcs12))
			goto error_guest_mode;

		if (copy_from_user(shadow_vmcs12,
				   user_vmx_nested_state->shadow_vmcs12,
				   sizeof(*shadow_vmcs12))) {
			ret = -EFAULT;
			goto error_guest_mode;
		}

		if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION ||
		    !shadow_vmcs12->hdr.shadow_vmcs)
			goto error_guest_mode;
	}

	vmx->nested.has_preemption_timer_deadline = false;
	if (kvm_state->hdr.vmx.flags & KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) {
		vmx->nested.has_preemption_timer_deadline = true;
		vmx->nested.preemption_timer_deadline =
			kvm_state->hdr.vmx.preemption_timer_deadline;
	}

	if (nested_vmx_check_controls(vcpu, vmcs12) ||
	    nested_vmx_check_host_state(vcpu, vmcs12) ||
	    nested_vmx_check_guest_state(vcpu, vmcs12, &ignored))
		goto error_guest_mode;

	vmx->nested.dirty_vmcs12 = true;
	vmx->nested.force_msr_bitmap_recalc = true;
	ret = nested_vmx_enter_non_root_mode(vcpu, false);
	if (ret)
		goto error_guest_mode;

	if (vmx->nested.mtf_pending)
		kvm_make_request(KVM_REQ_EVENT, vcpu);

	return 0;

error_guest_mode:
	vmx->nested.nested_run_pending = 0;
	return ret;
}

void nested_vmx_set_vmcs_shadowing_bitmap(void)
{
	if (enable_shadow_vmcs) {
		vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
		vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
	}
}

/*
 * Indexing into the vmcs12 uses the VMCS encoding rotated left by 6.  Undo
 * that madness to get the encoding for comparison.
 */
#define VMCS12_IDX_TO_ENC(idx) ((u16)(((u16)(idx) >> 6) | ((u16)(idx) << 10)))

static u64 nested_vmx_calc_vmcs_enum_msr(void)
{
	/*
	 * Note these are the so called "index" of the VMCS field encoding, not
	 * the index into vmcs12.
	 */
	unsigned int max_idx, idx;
	int i;

	/*
	 * For better or worse, KVM allows VMREAD/VMWRITE to all fields in
	 * vmcs12, regardless of whether or not the associated feature is
	 * exposed to L1.  Simply find the field with the highest index.
	 */
	max_idx = 0;
	for (i = 0; i < nr_vmcs12_fields; i++) {
		/* The vmcs12 table is very, very sparsely populated. */
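		/*
		 * Worked example (illustration only, not from the original
		 * source): a field with encoding 0x0800 (e.g. the guest ES
		 * selector) lives at table index ROL16(0x0800, 6) == 0x2,
		 * and VMCS12_IDX_TO_ENC(0x2) == ((0x2 >> 6) | (0x2 << 10))
		 * == 0x0800 recovers the encoding that is handed to
		 * vmcs_field_index() below.
		 */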
		if (!vmcs12_field_offsets[i])
			continue;

		idx = vmcs_field_index(VMCS12_IDX_TO_ENC(i));
		if (idx > max_idx)
			max_idx = idx;
	}

	return (u64)max_idx << VMCS_FIELD_INDEX_SHIFT;
}

static void nested_vmx_setup_pinbased_ctls(struct vmcs_config *vmcs_conf,
					   struct nested_vmx_msrs *msrs)
{
	msrs->pinbased_ctls_low =
		PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;

	msrs->pinbased_ctls_high = vmcs_conf->pin_based_exec_ctrl;
	msrs->pinbased_ctls_high &=
		PIN_BASED_EXT_INTR_MASK |
		PIN_BASED_NMI_EXITING |
		PIN_BASED_VIRTUAL_NMIS |
		(enable_apicv ? PIN_BASED_POSTED_INTR : 0);
	msrs->pinbased_ctls_high |=
		PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
		PIN_BASED_VMX_PREEMPTION_TIMER;
}

static void nested_vmx_setup_exit_ctls(struct vmcs_config *vmcs_conf,
				       struct nested_vmx_msrs *msrs)
{
	msrs->exit_ctls_low =
		VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;

	msrs->exit_ctls_high = vmcs_conf->vmexit_ctrl;
	msrs->exit_ctls_high &=
#ifdef CONFIG_X86_64
		VM_EXIT_HOST_ADDR_SPACE_SIZE |
#endif
		VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
		VM_EXIT_CLEAR_BNDCFGS;
	msrs->exit_ctls_high |=
		VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
		VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
		VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT |
		VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;

	/* We support free control of debug control saving. */
	msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;
}

static void nested_vmx_setup_entry_ctls(struct vmcs_config *vmcs_conf,
					struct nested_vmx_msrs *msrs)
{
	msrs->entry_ctls_low =
		VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;

	msrs->entry_ctls_high = vmcs_conf->vmentry_ctrl;
	msrs->entry_ctls_high &=
#ifdef CONFIG_X86_64
		VM_ENTRY_IA32E_MODE |
#endif
		VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS;
	msrs->entry_ctls_high |=
		(VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER |
		 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL);

	/* We support free control of debug control loading. */
	msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
}

static void nested_vmx_setup_cpubased_ctls(struct vmcs_config *vmcs_conf,
					   struct nested_vmx_msrs *msrs)
{
	msrs->procbased_ctls_low =
		CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;

	msrs->procbased_ctls_high = vmcs_conf->cpu_based_exec_ctrl;
	msrs->procbased_ctls_high &=
		CPU_BASED_INTR_WINDOW_EXITING |
		CPU_BASED_NMI_WINDOW_EXITING | CPU_BASED_USE_TSC_OFFSETTING |
		CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
		CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
		CPU_BASED_CR3_STORE_EXITING |
#ifdef CONFIG_X86_64
		CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |
#endif
		CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
		CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG |
		CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING |
		CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING |
		CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
	/*
	 * We can allow some features even when not supported by the
	 * hardware. For example, L1 can specify an MSR bitmap - and we
	 * can use it to avoid exits to L1 - even when L0 runs L2
	 * without MSR bitmaps.
	 */
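	/*
	 * (Editor's illustrative note, not in the original source: the merge
	 * of L0's and L1's bitmaps is assumed to be done in software when
	 * vmcs02 is built, by the MSR-bitmap helper in this file, which is
	 * why CPU_BASED_USE_MSR_BITMAPS can be advertised below regardless
	 * of what vmcs_conf reports.)
	 */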
	msrs->procbased_ctls_high |=
		CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
		CPU_BASED_USE_MSR_BITMAPS;

	/* We support free control of CR3 access interception. */
	msrs->procbased_ctls_low &=
		~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
}

static void nested_vmx_setup_secondary_ctls(u32 ept_caps,
					    struct vmcs_config *vmcs_conf,
					    struct nested_vmx_msrs *msrs)
{
	msrs->secondary_ctls_low = 0;

	msrs->secondary_ctls_high = vmcs_conf->cpu_based_2nd_exec_ctrl;
	msrs->secondary_ctls_high &=
		SECONDARY_EXEC_DESC |
		SECONDARY_EXEC_ENABLE_RDTSCP |
		SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
		SECONDARY_EXEC_WBINVD_EXITING |
		SECONDARY_EXEC_APIC_REGISTER_VIRT |
		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
		SECONDARY_EXEC_RDRAND_EXITING |
		SECONDARY_EXEC_ENABLE_INVPCID |
		SECONDARY_EXEC_ENABLE_VMFUNC |
		SECONDARY_EXEC_RDSEED_EXITING |
		SECONDARY_EXEC_ENABLE_XSAVES |
		SECONDARY_EXEC_TSC_SCALING |
		SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;

	/*
	 * We can emulate "VMCS shadowing," even if the hardware
	 * doesn't support it.
	 */
	msrs->secondary_ctls_high |=
		SECONDARY_EXEC_SHADOW_VMCS;

	if (enable_ept) {
		/* nested EPT: emulate EPT also to L1 */
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_ENABLE_EPT;
		msrs->ept_caps =
			VMX_EPT_PAGE_WALK_4_BIT |
			VMX_EPT_PAGE_WALK_5_BIT |
			VMX_EPTP_WB_BIT |
			VMX_EPT_INVEPT_BIT |
			VMX_EPT_EXECUTE_ONLY_BIT;

		msrs->ept_caps &= ept_caps;
		msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
			VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT |
			VMX_EPT_1GB_PAGE_BIT;
		if (enable_ept_ad_bits) {
			msrs->secondary_ctls_high |=
				SECONDARY_EXEC_ENABLE_PML;
			msrs->ept_caps |= VMX_EPT_AD_BIT;
		}

		/*
		 * Advertise EPTP switching irrespective of hardware support,
		 * KVM emulates it in software so long as VMFUNC is supported.
		 */
		if (cpu_has_vmx_vmfunc())
			msrs->vmfunc_controls = VMX_VMFUNC_EPTP_SWITCHING;
	}

	/*
	 * Old versions of KVM use the single-context version without
	 * checking for support, so declare that it is supported even
	 * though it is treated as global context.  The alternative is
	 * not failing the single-context invvpid, and it is worse.
	 */
	if (enable_vpid) {
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_ENABLE_VPID;
		msrs->vpid_caps = VMX_VPID_INVVPID_BIT |
			VMX_VPID_EXTENT_SUPPORTED_MASK;
	}

	if (enable_unrestricted_guest)
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_UNRESTRICTED_GUEST;

	if (flexpriority_enabled)
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;

	if (enable_sgx)
		msrs->secondary_ctls_high |= SECONDARY_EXEC_ENCLS_EXITING;
}

static void nested_vmx_setup_misc_data(struct vmcs_config *vmcs_conf,
				       struct nested_vmx_msrs *msrs)
{
	msrs->misc_low = (u32)vmcs_conf->misc & VMX_MISC_SAVE_EFER_LMA;
	msrs->misc_low |=
		MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
		VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
		VMX_MISC_ACTIVITY_HLT |
		VMX_MISC_ACTIVITY_WAIT_SIPI;
	msrs->misc_high = 0;
}
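
/*
 * Layout sketch of the IA32_VMX_BASIC value built below (per the SDM; added
 * by the editor for illustration only): bits 30:0 hold the VMCS revision id,
 * bits 44:32 the VMCS region size, bits 53:50 the memory type (WB == 6),
 * bit 54 the INS/OUTS reporting capability and bit 55 the "true controls"
 * capability.
 */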

static void nested_vmx_setup_basic(struct nested_vmx_msrs *msrs)
{
	/*
	 * This MSR reports some information about VMX support. We
	 * should return information about the VMX we emulate for the
	 * guest, and the VMCS structure we give it - not about the
	 * VMX support of the underlying hardware.
	 */
	msrs->basic =
		VMCS12_REVISION |
		VMX_BASIC_TRUE_CTLS |
		((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
		(VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);

	if (cpu_has_vmx_basic_inout())
		msrs->basic |= VMX_BASIC_INOUT;
}

static void nested_vmx_setup_cr_fixed(struct nested_vmx_msrs *msrs)
{
	/*
	 * These MSRs specify bits which the guest must keep fixed on
	 * while L1 is in VMXON mode (in L1's root mode, or running an L2).
	 * We picked the standard core2 setting.
	 */
#define VMXON_CR0_ALWAYSON	(X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
#define VMXON_CR4_ALWAYSON	X86_CR4_VMXE
	msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON;
	msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON;

	/* These MSRs specify bits which the guest must keep fixed off. */
	rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1);
	rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1);

	if (vmx_umip_emulated())
		msrs->cr4_fixed1 |= X86_CR4_UMIP;
}

/*
 * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
 * returned for the various VMX controls MSRs when nested VMX is enabled.
 * The same values should also be used to verify that vmcs12 control fields are
 * valid during nested entry from L1 to L2.
 * Each of these control msrs has a low and high 32-bit half: a low bit is on
 * if the corresponding bit in the (32-bit) control field *must* be on, and a
 * bit in the high half is on if the corresponding bit in the control field
 * may be on.  See also vmx_control_verify().
 */
void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps)
{
	struct nested_vmx_msrs *msrs = &vmcs_conf->nested;

	/*
	 * Note that as a general rule, the high half of the MSRs (bits in
	 * the control fields which may be 1) should be initialized by the
	 * intersection of the underlying hardware's MSR (i.e., features which
	 * can be supported) and the list of features we want to expose -
	 * because they are known to be properly supported in our code.
	 * Also, usually, the low half of the MSRs (bits which must be 1) can
	 * be set to 0, meaning that L1 may turn off any of these bits. The
	 * reason is that if one of these bits is necessary, it will appear
	 * in vmcs01, and prepare_vmcs02(), which bitwise-ORs the control
	 * fields of vmcs01 and vmcs12, will keep it set in vmcs02 - and
	 * nested_vmx_l1_wants_exit() will not pass the related exits to L1.
	 * These rules have exceptions below.
	 */
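
	/*
	 * Worked check (illustration only, not from the original source):
	 * a vmcs12 control value "ctl" is valid against a low/high pair when
	 *
	 *	(ctl & low) == low && (ctl & ~high) == 0
	 *
	 * i.e. every must-be-1 bit is set and nothing outside the may-be-1
	 * mask is set; this is what vmx_control_verify() enforces during
	 * nested VM-Entry.
	 */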
	nested_vmx_setup_pinbased_ctls(vmcs_conf, msrs);

	nested_vmx_setup_exit_ctls(vmcs_conf, msrs);

	nested_vmx_setup_entry_ctls(vmcs_conf, msrs);

	nested_vmx_setup_cpubased_ctls(vmcs_conf, msrs);

	nested_vmx_setup_secondary_ctls(ept_caps, vmcs_conf, msrs);

	nested_vmx_setup_misc_data(vmcs_conf, msrs);

	nested_vmx_setup_basic(msrs);

	nested_vmx_setup_cr_fixed(msrs);

	msrs->vmcs_enum = nested_vmx_calc_vmcs_enum_msr();
}

void nested_vmx_hardware_unsetup(void)
{
	int i;

	if (enable_shadow_vmcs) {
		for (i = 0; i < VMX_BITMAP_NR; i++)
			free_page((unsigned long)vmx_bitmap[i]);
	}
}

__init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
{
	int i;

	if (!cpu_has_vmx_shadow_vmcs())
		enable_shadow_vmcs = 0;
	if (enable_shadow_vmcs) {
		for (i = 0; i < VMX_BITMAP_NR; i++) {
			/*
			 * The vmx_bitmap is not tied to a VM and so should
			 * not be charged to a memcg.
			 */
			vmx_bitmap[i] = (unsigned long *)
				__get_free_page(GFP_KERNEL);
			if (!vmx_bitmap[i]) {
				nested_vmx_hardware_unsetup();
				return -ENOMEM;
			}
		}

		init_vmcs_shadow_fields();
	}

	exit_handlers[EXIT_REASON_VMCLEAR] = handle_vmclear;
	exit_handlers[EXIT_REASON_VMLAUNCH] = handle_vmlaunch;
	exit_handlers[EXIT_REASON_VMPTRLD] = handle_vmptrld;
	exit_handlers[EXIT_REASON_VMPTRST] = handle_vmptrst;
	exit_handlers[EXIT_REASON_VMREAD] = handle_vmread;
	exit_handlers[EXIT_REASON_VMRESUME] = handle_vmresume;
	exit_handlers[EXIT_REASON_VMWRITE] = handle_vmwrite;
	exit_handlers[EXIT_REASON_VMOFF] = handle_vmxoff;
	exit_handlers[EXIT_REASON_VMON] = handle_vmxon;
	exit_handlers[EXIT_REASON_INVEPT] = handle_invept;
	exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid;
	exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc;

	return 0;
}

struct kvm_x86_nested_ops vmx_nested_ops = {
	.leave_nested = vmx_leave_nested,
	.is_exception_vmexit = nested_vmx_is_exception_vmexit,
	.check_events = vmx_check_nested_events,
	.has_events = vmx_has_nested_events,
	.triple_fault = nested_vmx_triple_fault,
	.get_state = vmx_get_nested_state,
	.set_state = vmx_set_nested_state,
	.get_nested_state_pages = vmx_get_nested_state_pages,
	.write_log_dirty = nested_vmx_write_pml_buffer,
#ifdef CONFIG_KVM_HYPERV
	.enable_evmcs = nested_enable_evmcs,
	.get_evmcs_version = nested_get_evmcs_version,
	.hv_inject_synthetic_vmexit_post_tlb_flush = vmx_hv_inject_synthetic_vmexit_post_tlb_flush,
#endif
};