1 // SPDX-License-Identifier: GPL-2.0 2 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 3 4 #include <linux/objtool.h> 5 #include <linux/percpu.h> 6 7 #include <asm/debugreg.h> 8 #include <asm/mmu_context.h> 9 10 #include "cpuid.h" 11 #include "hyperv.h" 12 #include "mmu.h" 13 #include "nested.h" 14 #include "pmu.h" 15 #include "sgx.h" 16 #include "trace.h" 17 #include "vmx.h" 18 #include "x86.h" 19 #include "smm.h" 20 21 static bool __read_mostly enable_shadow_vmcs = 1; 22 module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO); 23 24 static bool __read_mostly nested_early_check = 0; 25 module_param(nested_early_check, bool, S_IRUGO); 26 27 #define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK 28 29 /* 30 * Hyper-V requires all of these, so mark them as supported even though 31 * they are just treated the same as all-context. 32 */ 33 #define VMX_VPID_EXTENT_SUPPORTED_MASK \ 34 (VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT | \ 35 VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT | \ 36 VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT | \ 37 VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT) 38 39 #define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5 40 41 enum { 42 VMX_VMREAD_BITMAP, 43 VMX_VMWRITE_BITMAP, 44 VMX_BITMAP_NR 45 }; 46 static unsigned long *vmx_bitmap[VMX_BITMAP_NR]; 47 48 #define vmx_vmread_bitmap (vmx_bitmap[VMX_VMREAD_BITMAP]) 49 #define vmx_vmwrite_bitmap (vmx_bitmap[VMX_VMWRITE_BITMAP]) 50 51 struct shadow_vmcs_field { 52 u16 encoding; 53 u16 offset; 54 }; 55 static struct shadow_vmcs_field shadow_read_only_fields[] = { 56 #define SHADOW_FIELD_RO(x, y) { x, offsetof(struct vmcs12, y) }, 57 #include "vmcs_shadow_fields.h" 58 }; 59 static int max_shadow_read_only_fields = 60 ARRAY_SIZE(shadow_read_only_fields); 61 62 static struct shadow_vmcs_field shadow_read_write_fields[] = { 63 #define SHADOW_FIELD_RW(x, y) { x, offsetof(struct vmcs12, y) }, 64 #include "vmcs_shadow_fields.h" 65 }; 66 static int max_shadow_read_write_fields = 67 ARRAY_SIZE(shadow_read_write_fields); 68 69 static void init_vmcs_shadow_fields(void) 70 { 71 int i, j; 72 73 memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); 74 memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); 75 76 for (i = j = 0; i < max_shadow_read_only_fields; i++) { 77 struct shadow_vmcs_field entry = shadow_read_only_fields[i]; 78 u16 field = entry.encoding; 79 80 if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 && 81 (i + 1 == max_shadow_read_only_fields || 82 shadow_read_only_fields[i + 1].encoding != field + 1)) 83 pr_err("Missing field from shadow_read_only_field %x\n", 84 field + 1); 85 86 clear_bit(field, vmx_vmread_bitmap); 87 if (field & 1) 88 #ifdef CONFIG_X86_64 89 continue; 90 #else 91 entry.offset += sizeof(u32); 92 #endif 93 shadow_read_only_fields[j++] = entry; 94 } 95 max_shadow_read_only_fields = j; 96 97 for (i = j = 0; i < max_shadow_read_write_fields; i++) { 98 struct shadow_vmcs_field entry = shadow_read_write_fields[i]; 99 u16 field = entry.encoding; 100 101 if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 && 102 (i + 1 == max_shadow_read_write_fields || 103 shadow_read_write_fields[i + 1].encoding != field + 1)) 104 pr_err("Missing field from shadow_read_write_field %x\n", 105 field + 1); 106 107 WARN_ONCE(field >= GUEST_ES_AR_BYTES && 108 field <= GUEST_TR_AR_BYTES, 109 "Update vmcs12_write_any() to drop reserved bits from AR_BYTES"); 110 111 /* 112 * PML and the preemption timer can be emulated, but the 113 * processor cannot vmwrite to fields that don't exist 114 * on bare metal. 115 */ 116 switch (field) { 117 case GUEST_PML_INDEX: 118 if (!cpu_has_vmx_pml()) 119 continue; 120 break; 121 case VMX_PREEMPTION_TIMER_VALUE: 122 if (!cpu_has_vmx_preemption_timer()) 123 continue; 124 break; 125 case GUEST_INTR_STATUS: 126 if (!cpu_has_vmx_apicv()) 127 continue; 128 break; 129 default: 130 break; 131 } 132 133 clear_bit(field, vmx_vmwrite_bitmap); 134 clear_bit(field, vmx_vmread_bitmap); 135 if (field & 1) 136 #ifdef CONFIG_X86_64 137 continue; 138 #else 139 entry.offset += sizeof(u32); 140 #endif 141 shadow_read_write_fields[j++] = entry; 142 } 143 max_shadow_read_write_fields = j; 144 } 145 146 /* 147 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(), 148 * set the success or error code of an emulated VMX instruction (as specified 149 * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated 150 * instruction. 151 */ 152 static int nested_vmx_succeed(struct kvm_vcpu *vcpu) 153 { 154 vmx_set_rflags(vcpu, vmx_get_rflags(vcpu) 155 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | 156 X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF)); 157 return kvm_skip_emulated_instruction(vcpu); 158 } 159 160 static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu) 161 { 162 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) 163 & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF | 164 X86_EFLAGS_SF | X86_EFLAGS_OF)) 165 | X86_EFLAGS_CF); 166 return kvm_skip_emulated_instruction(vcpu); 167 } 168 169 static int nested_vmx_failValid(struct kvm_vcpu *vcpu, 170 u32 vm_instruction_error) 171 { 172 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) 173 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | 174 X86_EFLAGS_SF | X86_EFLAGS_OF)) 175 | X86_EFLAGS_ZF); 176 get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error; 177 /* 178 * We don't need to force sync to shadow VMCS because 179 * VM_INSTRUCTION_ERROR is not shadowed. Enlightened VMCS 'shadows' all 180 * fields and thus must be synced. 181 */ 182 if (nested_vmx_is_evmptr12_set(to_vmx(vcpu))) 183 to_vmx(vcpu)->nested.need_vmcs12_to_shadow_sync = true; 184 185 return kvm_skip_emulated_instruction(vcpu); 186 } 187 188 static int nested_vmx_fail(struct kvm_vcpu *vcpu, u32 vm_instruction_error) 189 { 190 struct vcpu_vmx *vmx = to_vmx(vcpu); 191 192 /* 193 * failValid writes the error number to the current VMCS, which 194 * can't be done if there isn't a current VMCS. 195 */ 196 if (vmx->nested.current_vmptr == INVALID_GPA && 197 !nested_vmx_is_evmptr12_valid(vmx)) 198 return nested_vmx_failInvalid(vcpu); 199 200 return nested_vmx_failValid(vcpu, vm_instruction_error); 201 } 202 203 static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator) 204 { 205 /* TODO: not to reset guest simply here. */ 206 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 207 pr_debug_ratelimited("nested vmx abort, indicator %d\n", indicator); 208 } 209 210 static inline bool vmx_control_verify(u32 control, u32 low, u32 high) 211 { 212 return fixed_bits_valid(control, low, high); 213 } 214 215 static inline u64 vmx_control_msr(u32 low, u32 high) 216 { 217 return low | ((u64)high << 32); 218 } 219 220 static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx) 221 { 222 secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS); 223 vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); 224 vmx->nested.need_vmcs12_to_shadow_sync = false; 225 } 226 227 static inline void nested_release_evmcs(struct kvm_vcpu *vcpu) 228 { 229 #ifdef CONFIG_KVM_HYPERV 230 struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); 231 struct vcpu_vmx *vmx = to_vmx(vcpu); 232 233 if (nested_vmx_is_evmptr12_valid(vmx)) { 234 kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true); 235 vmx->nested.hv_evmcs = NULL; 236 } 237 238 vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID; 239 240 if (hv_vcpu) { 241 hv_vcpu->nested.pa_page_gpa = INVALID_GPA; 242 hv_vcpu->nested.vm_id = 0; 243 hv_vcpu->nested.vp_id = 0; 244 } 245 #endif 246 } 247 248 static bool nested_evmcs_handle_vmclear(struct kvm_vcpu *vcpu, gpa_t vmptr) 249 { 250 #ifdef CONFIG_KVM_HYPERV 251 struct vcpu_vmx *vmx = to_vmx(vcpu); 252 /* 253 * When Enlightened VMEntry is enabled on the calling CPU we treat 254 * memory area pointer by vmptr as Enlightened VMCS (as there's no good 255 * way to distinguish it from VMCS12) and we must not corrupt it by 256 * writing to the non-existent 'launch_state' field. The area doesn't 257 * have to be the currently active EVMCS on the calling CPU and there's 258 * nothing KVM has to do to transition it from 'active' to 'non-active' 259 * state. It is possible that the area will stay mapped as 260 * vmx->nested.hv_evmcs but this shouldn't be a problem. 261 */ 262 if (!guest_cpuid_has_evmcs(vcpu) || 263 !evmptr_is_valid(nested_get_evmptr(vcpu))) 264 return false; 265 266 if (nested_vmx_evmcs(vmx) && vmptr == vmx->nested.hv_evmcs_vmptr) 267 nested_release_evmcs(vcpu); 268 269 return true; 270 #else 271 return false; 272 #endif 273 } 274 275 static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx, 276 struct loaded_vmcs *prev) 277 { 278 struct vmcs_host_state *dest, *src; 279 280 if (unlikely(!vmx->guest_state_loaded)) 281 return; 282 283 src = &prev->host_state; 284 dest = &vmx->loaded_vmcs->host_state; 285 286 vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base); 287 dest->ldt_sel = src->ldt_sel; 288 #ifdef CONFIG_X86_64 289 dest->ds_sel = src->ds_sel; 290 dest->es_sel = src->es_sel; 291 #endif 292 } 293 294 static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) 295 { 296 struct vcpu_vmx *vmx = to_vmx(vcpu); 297 struct loaded_vmcs *prev; 298 int cpu; 299 300 if (WARN_ON_ONCE(vmx->loaded_vmcs == vmcs)) 301 return; 302 303 cpu = get_cpu(); 304 prev = vmx->loaded_vmcs; 305 vmx->loaded_vmcs = vmcs; 306 vmx_vcpu_load_vmcs(vcpu, cpu, prev); 307 vmx_sync_vmcs_host_state(vmx, prev); 308 put_cpu(); 309 310 vcpu->arch.regs_avail = ~VMX_REGS_LAZY_LOAD_SET; 311 312 /* 313 * All lazily updated registers will be reloaded from VMCS12 on both 314 * vmentry and vmexit. 315 */ 316 vcpu->arch.regs_dirty = 0; 317 } 318 319 /* 320 * Free whatever needs to be freed from vmx->nested when L1 goes down, or 321 * just stops using VMX. 322 */ 323 static void free_nested(struct kvm_vcpu *vcpu) 324 { 325 struct vcpu_vmx *vmx = to_vmx(vcpu); 326 327 if (WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01)) 328 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 329 330 if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon) 331 return; 332 333 kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); 334 335 vmx->nested.vmxon = false; 336 vmx->nested.smm.vmxon = false; 337 vmx->nested.vmxon_ptr = INVALID_GPA; 338 free_vpid(vmx->nested.vpid02); 339 vmx->nested.posted_intr_nv = -1; 340 vmx->nested.current_vmptr = INVALID_GPA; 341 if (enable_shadow_vmcs) { 342 vmx_disable_shadow_vmcs(vmx); 343 vmcs_clear(vmx->vmcs01.shadow_vmcs); 344 free_vmcs(vmx->vmcs01.shadow_vmcs); 345 vmx->vmcs01.shadow_vmcs = NULL; 346 } 347 kfree(vmx->nested.cached_vmcs12); 348 vmx->nested.cached_vmcs12 = NULL; 349 kfree(vmx->nested.cached_shadow_vmcs12); 350 vmx->nested.cached_shadow_vmcs12 = NULL; 351 /* 352 * Unpin physical memory we referred to in the vmcs02. The APIC access 353 * page's backing page (yeah, confusing) shouldn't actually be accessed, 354 * and if it is written, the contents are irrelevant. 355 */ 356 kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map, false); 357 kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); 358 kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true); 359 vmx->nested.pi_desc = NULL; 360 361 kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); 362 363 nested_release_evmcs(vcpu); 364 365 free_loaded_vmcs(&vmx->nested.vmcs02); 366 } 367 368 /* 369 * Ensure that the current vmcs of the logical processor is the 370 * vmcs01 of the vcpu before calling free_nested(). 371 */ 372 void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu) 373 { 374 vcpu_load(vcpu); 375 vmx_leave_nested(vcpu); 376 vcpu_put(vcpu); 377 } 378 379 #define EPTP_PA_MASK GENMASK_ULL(51, 12) 380 381 static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp) 382 { 383 return VALID_PAGE(root_hpa) && 384 ((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK)); 385 } 386 387 static void nested_ept_invalidate_addr(struct kvm_vcpu *vcpu, gpa_t eptp, 388 gpa_t addr) 389 { 390 unsigned long roots = 0; 391 uint i; 392 struct kvm_mmu_root_info *cached_root; 393 394 WARN_ON_ONCE(!mmu_is_nested(vcpu)); 395 396 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { 397 cached_root = &vcpu->arch.mmu->prev_roots[i]; 398 399 if (nested_ept_root_matches(cached_root->hpa, cached_root->pgd, 400 eptp)) 401 roots |= KVM_MMU_ROOT_PREVIOUS(i); 402 } 403 if (roots) 404 kvm_mmu_invalidate_addr(vcpu, vcpu->arch.mmu, addr, roots); 405 } 406 407 static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, 408 struct x86_exception *fault) 409 { 410 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 411 struct vcpu_vmx *vmx = to_vmx(vcpu); 412 unsigned long exit_qualification; 413 u32 vm_exit_reason; 414 415 if (vmx->nested.pml_full) { 416 vm_exit_reason = EXIT_REASON_PML_FULL; 417 vmx->nested.pml_full = false; 418 419 /* 420 * It should be impossible to trigger a nested PML Full VM-Exit 421 * for anything other than an EPT Violation from L2. KVM *can* 422 * trigger nEPT page fault injection in response to an EPT 423 * Misconfig, e.g. if the MMIO SPTE was stale and L1's EPT 424 * tables also changed, but KVM should not treat EPT Misconfig 425 * VM-Exits as writes. 426 */ 427 WARN_ON_ONCE(vmx->exit_reason.basic != EXIT_REASON_EPT_VIOLATION); 428 429 /* 430 * PML Full and EPT Violation VM-Exits both use bit 12 to report 431 * "NMI unblocking due to IRET", i.e. the bit can be propagated 432 * as-is from the original EXIT_QUALIFICATION. 433 */ 434 exit_qualification = vmx_get_exit_qual(vcpu) & INTR_INFO_UNBLOCK_NMI; 435 } else { 436 if (fault->error_code & PFERR_RSVD_MASK) { 437 vm_exit_reason = EXIT_REASON_EPT_MISCONFIG; 438 exit_qualification = 0; 439 } else { 440 exit_qualification = fault->exit_qualification; 441 exit_qualification |= vmx_get_exit_qual(vcpu) & 442 (EPT_VIOLATION_GVA_IS_VALID | 443 EPT_VIOLATION_GVA_TRANSLATED); 444 vm_exit_reason = EXIT_REASON_EPT_VIOLATION; 445 } 446 447 /* 448 * Although the caller (kvm_inject_emulated_page_fault) would 449 * have already synced the faulting address in the shadow EPT 450 * tables for the current EPTP12, we also need to sync it for 451 * any other cached EPTP02s based on the same EP4TA, since the 452 * TLB associates mappings to the EP4TA rather than the full EPTP. 453 */ 454 nested_ept_invalidate_addr(vcpu, vmcs12->ept_pointer, 455 fault->address); 456 } 457 458 nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification); 459 vmcs12->guest_physical_address = fault->address; 460 } 461 462 static void nested_ept_new_eptp(struct kvm_vcpu *vcpu) 463 { 464 struct vcpu_vmx *vmx = to_vmx(vcpu); 465 bool execonly = vmx->nested.msrs.ept_caps & VMX_EPT_EXECUTE_ONLY_BIT; 466 int ept_lpage_level = ept_caps_to_lpage_level(vmx->nested.msrs.ept_caps); 467 468 kvm_init_shadow_ept_mmu(vcpu, execonly, ept_lpage_level, 469 nested_ept_ad_enabled(vcpu), 470 nested_ept_get_eptp(vcpu)); 471 } 472 473 static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) 474 { 475 WARN_ON(mmu_is_nested(vcpu)); 476 477 vcpu->arch.mmu = &vcpu->arch.guest_mmu; 478 nested_ept_new_eptp(vcpu); 479 vcpu->arch.mmu->get_guest_pgd = nested_ept_get_eptp; 480 vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault; 481 vcpu->arch.mmu->get_pdptr = kvm_pdptr_read; 482 483 vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; 484 } 485 486 static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu) 487 { 488 vcpu->arch.mmu = &vcpu->arch.root_mmu; 489 vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; 490 } 491 492 static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12, 493 u16 error_code) 494 { 495 bool inequality, bit; 496 497 bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0; 498 inequality = 499 (error_code & vmcs12->page_fault_error_code_mask) != 500 vmcs12->page_fault_error_code_match; 501 return inequality ^ bit; 502 } 503 504 static bool nested_vmx_is_exception_vmexit(struct kvm_vcpu *vcpu, u8 vector, 505 u32 error_code) 506 { 507 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 508 509 /* 510 * Drop bits 31:16 of the error code when performing the #PF mask+match 511 * check. All VMCS fields involved are 32 bits, but Intel CPUs never 512 * set bits 31:16 and VMX disallows setting bits 31:16 in the injected 513 * error code. Including the to-be-dropped bits in the check might 514 * result in an "impossible" or missed exit from L1's perspective. 515 */ 516 if (vector == PF_VECTOR) 517 return nested_vmx_is_page_fault_vmexit(vmcs12, (u16)error_code); 518 519 return (vmcs12->exception_bitmap & (1u << vector)); 520 } 521 522 static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu, 523 struct vmcs12 *vmcs12) 524 { 525 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) 526 return 0; 527 528 if (CC(!page_address_valid(vcpu, vmcs12->io_bitmap_a)) || 529 CC(!page_address_valid(vcpu, vmcs12->io_bitmap_b))) 530 return -EINVAL; 531 532 return 0; 533 } 534 535 static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu, 536 struct vmcs12 *vmcs12) 537 { 538 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) 539 return 0; 540 541 if (CC(!page_address_valid(vcpu, vmcs12->msr_bitmap))) 542 return -EINVAL; 543 544 return 0; 545 } 546 547 static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu, 548 struct vmcs12 *vmcs12) 549 { 550 if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) 551 return 0; 552 553 if (CC(!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr))) 554 return -EINVAL; 555 556 return 0; 557 } 558 559 /* 560 * For x2APIC MSRs, ignore the vmcs01 bitmap. L1 can enable x2APIC without L1 561 * itself utilizing x2APIC. All MSRs were previously set to be intercepted, 562 * only the "disable intercept" case needs to be handled. 563 */ 564 static void nested_vmx_disable_intercept_for_x2apic_msr(unsigned long *msr_bitmap_l1, 565 unsigned long *msr_bitmap_l0, 566 u32 msr, int type) 567 { 568 if (type & MSR_TYPE_R && !vmx_test_msr_bitmap_read(msr_bitmap_l1, msr)) 569 vmx_clear_msr_bitmap_read(msr_bitmap_l0, msr); 570 571 if (type & MSR_TYPE_W && !vmx_test_msr_bitmap_write(msr_bitmap_l1, msr)) 572 vmx_clear_msr_bitmap_write(msr_bitmap_l0, msr); 573 } 574 575 static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap) 576 { 577 int msr; 578 579 for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { 580 unsigned word = msr / BITS_PER_LONG; 581 582 msr_bitmap[word] = ~0; 583 msr_bitmap[word + (0x800 / sizeof(long))] = ~0; 584 } 585 } 586 587 #define BUILD_NVMX_MSR_INTERCEPT_HELPER(rw) \ 588 static inline \ 589 void nested_vmx_set_msr_##rw##_intercept(struct vcpu_vmx *vmx, \ 590 unsigned long *msr_bitmap_l1, \ 591 unsigned long *msr_bitmap_l0, u32 msr) \ 592 { \ 593 if (vmx_test_msr_bitmap_##rw(vmx->vmcs01.msr_bitmap, msr) || \ 594 vmx_test_msr_bitmap_##rw(msr_bitmap_l1, msr)) \ 595 vmx_set_msr_bitmap_##rw(msr_bitmap_l0, msr); \ 596 else \ 597 vmx_clear_msr_bitmap_##rw(msr_bitmap_l0, msr); \ 598 } 599 BUILD_NVMX_MSR_INTERCEPT_HELPER(read) 600 BUILD_NVMX_MSR_INTERCEPT_HELPER(write) 601 602 static inline void nested_vmx_set_intercept_for_msr(struct vcpu_vmx *vmx, 603 unsigned long *msr_bitmap_l1, 604 unsigned long *msr_bitmap_l0, 605 u32 msr, int types) 606 { 607 if (types & MSR_TYPE_R) 608 nested_vmx_set_msr_read_intercept(vmx, msr_bitmap_l1, 609 msr_bitmap_l0, msr); 610 if (types & MSR_TYPE_W) 611 nested_vmx_set_msr_write_intercept(vmx, msr_bitmap_l1, 612 msr_bitmap_l0, msr); 613 } 614 615 /* 616 * Merge L0's and L1's MSR bitmap, return false to indicate that 617 * we do not use the hardware. 618 */ 619 static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, 620 struct vmcs12 *vmcs12) 621 { 622 struct vcpu_vmx *vmx = to_vmx(vcpu); 623 int msr; 624 unsigned long *msr_bitmap_l1; 625 unsigned long *msr_bitmap_l0 = vmx->nested.vmcs02.msr_bitmap; 626 struct kvm_host_map *map = &vmx->nested.msr_bitmap_map; 627 628 /* Nothing to do if the MSR bitmap is not in use. */ 629 if (!cpu_has_vmx_msr_bitmap() || 630 !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) 631 return false; 632 633 /* 634 * MSR bitmap update can be skipped when: 635 * - MSR bitmap for L1 hasn't changed. 636 * - Nested hypervisor (L1) is attempting to launch the same L2 as 637 * before. 638 * - Nested hypervisor (L1) has enabled 'Enlightened MSR Bitmap' feature 639 * and tells KVM (L0) there were no changes in MSR bitmap for L2. 640 */ 641 if (!vmx->nested.force_msr_bitmap_recalc) { 642 struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); 643 644 if (evmcs && evmcs->hv_enlightenments_control.msr_bitmap && 645 evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP) 646 return true; 647 } 648 649 if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map)) 650 return false; 651 652 msr_bitmap_l1 = (unsigned long *)map->hva; 653 654 /* 655 * To keep the control flow simple, pay eight 8-byte writes (sixteen 656 * 4-byte writes on 32-bit systems) up front to enable intercepts for 657 * the x2APIC MSR range and selectively toggle those relevant to L2. 658 */ 659 enable_x2apic_msr_intercepts(msr_bitmap_l0); 660 661 if (nested_cpu_has_virt_x2apic_mode(vmcs12)) { 662 if (nested_cpu_has_apic_reg_virt(vmcs12)) { 663 /* 664 * L0 need not intercept reads for MSRs between 0x800 665 * and 0x8ff, it just lets the processor take the value 666 * from the virtual-APIC page; take those 256 bits 667 * directly from the L1 bitmap. 668 */ 669 for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { 670 unsigned word = msr / BITS_PER_LONG; 671 672 msr_bitmap_l0[word] = msr_bitmap_l1[word]; 673 } 674 } 675 676 nested_vmx_disable_intercept_for_x2apic_msr( 677 msr_bitmap_l1, msr_bitmap_l0, 678 X2APIC_MSR(APIC_TASKPRI), 679 MSR_TYPE_R | MSR_TYPE_W); 680 681 if (nested_cpu_has_vid(vmcs12)) { 682 nested_vmx_disable_intercept_for_x2apic_msr( 683 msr_bitmap_l1, msr_bitmap_l0, 684 X2APIC_MSR(APIC_EOI), 685 MSR_TYPE_W); 686 nested_vmx_disable_intercept_for_x2apic_msr( 687 msr_bitmap_l1, msr_bitmap_l0, 688 X2APIC_MSR(APIC_SELF_IPI), 689 MSR_TYPE_W); 690 } 691 } 692 693 /* 694 * Always check vmcs01's bitmap to honor userspace MSR filters and any 695 * other runtime changes to vmcs01's bitmap, e.g. dynamic pass-through. 696 */ 697 #ifdef CONFIG_X86_64 698 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 699 MSR_FS_BASE, MSR_TYPE_RW); 700 701 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 702 MSR_GS_BASE, MSR_TYPE_RW); 703 704 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 705 MSR_KERNEL_GS_BASE, MSR_TYPE_RW); 706 #endif 707 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 708 MSR_IA32_SPEC_CTRL, MSR_TYPE_RW); 709 710 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 711 MSR_IA32_PRED_CMD, MSR_TYPE_W); 712 713 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 714 MSR_IA32_FLUSH_CMD, MSR_TYPE_W); 715 716 kvm_vcpu_unmap(vcpu, &vmx->nested.msr_bitmap_map, false); 717 718 vmx->nested.force_msr_bitmap_recalc = false; 719 720 return true; 721 } 722 723 static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu, 724 struct vmcs12 *vmcs12) 725 { 726 struct vcpu_vmx *vmx = to_vmx(vcpu); 727 struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache; 728 729 if (!nested_cpu_has_shadow_vmcs(vmcs12) || 730 vmcs12->vmcs_link_pointer == INVALID_GPA) 731 return; 732 733 if (ghc->gpa != vmcs12->vmcs_link_pointer && 734 kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, 735 vmcs12->vmcs_link_pointer, VMCS12_SIZE)) 736 return; 737 738 kvm_read_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu), 739 VMCS12_SIZE); 740 } 741 742 static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu, 743 struct vmcs12 *vmcs12) 744 { 745 struct vcpu_vmx *vmx = to_vmx(vcpu); 746 struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache; 747 748 if (!nested_cpu_has_shadow_vmcs(vmcs12) || 749 vmcs12->vmcs_link_pointer == INVALID_GPA) 750 return; 751 752 if (ghc->gpa != vmcs12->vmcs_link_pointer && 753 kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, 754 vmcs12->vmcs_link_pointer, VMCS12_SIZE)) 755 return; 756 757 kvm_write_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu), 758 VMCS12_SIZE); 759 } 760 761 /* 762 * In nested virtualization, check if L1 has set 763 * VM_EXIT_ACK_INTR_ON_EXIT 764 */ 765 static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu) 766 { 767 return get_vmcs12(vcpu)->vm_exit_controls & 768 VM_EXIT_ACK_INTR_ON_EXIT; 769 } 770 771 static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu, 772 struct vmcs12 *vmcs12) 773 { 774 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) && 775 CC(!page_address_valid(vcpu, vmcs12->apic_access_addr))) 776 return -EINVAL; 777 else 778 return 0; 779 } 780 781 static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, 782 struct vmcs12 *vmcs12) 783 { 784 if (!nested_cpu_has_virt_x2apic_mode(vmcs12) && 785 !nested_cpu_has_apic_reg_virt(vmcs12) && 786 !nested_cpu_has_vid(vmcs12) && 787 !nested_cpu_has_posted_intr(vmcs12)) 788 return 0; 789 790 /* 791 * If virtualize x2apic mode is enabled, 792 * virtualize apic access must be disabled. 793 */ 794 if (CC(nested_cpu_has_virt_x2apic_mode(vmcs12) && 795 nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))) 796 return -EINVAL; 797 798 /* 799 * If virtual interrupt delivery is enabled, 800 * we must exit on external interrupts. 801 */ 802 if (CC(nested_cpu_has_vid(vmcs12) && !nested_exit_on_intr(vcpu))) 803 return -EINVAL; 804 805 /* 806 * bits 15:8 should be zero in posted_intr_nv, 807 * the descriptor address has been already checked 808 * in nested_get_vmcs12_pages. 809 * 810 * bits 5:0 of posted_intr_desc_addr should be zero. 811 */ 812 if (nested_cpu_has_posted_intr(vmcs12) && 813 (CC(!nested_cpu_has_vid(vmcs12)) || 814 CC(!nested_exit_intr_ack_set(vcpu)) || 815 CC((vmcs12->posted_intr_nv & 0xff00)) || 816 CC(!kvm_vcpu_is_legal_aligned_gpa(vcpu, vmcs12->posted_intr_desc_addr, 64)))) 817 return -EINVAL; 818 819 /* tpr shadow is needed by all apicv features. */ 820 if (CC(!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))) 821 return -EINVAL; 822 823 return 0; 824 } 825 826 static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, 827 u32 count, u64 addr) 828 { 829 if (count == 0) 830 return 0; 831 832 if (!kvm_vcpu_is_legal_aligned_gpa(vcpu, addr, 16) || 833 !kvm_vcpu_is_legal_gpa(vcpu, (addr + count * sizeof(struct vmx_msr_entry) - 1))) 834 return -EINVAL; 835 836 return 0; 837 } 838 839 static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu, 840 struct vmcs12 *vmcs12) 841 { 842 if (CC(nested_vmx_check_msr_switch(vcpu, 843 vmcs12->vm_exit_msr_load_count, 844 vmcs12->vm_exit_msr_load_addr)) || 845 CC(nested_vmx_check_msr_switch(vcpu, 846 vmcs12->vm_exit_msr_store_count, 847 vmcs12->vm_exit_msr_store_addr))) 848 return -EINVAL; 849 850 return 0; 851 } 852 853 static int nested_vmx_check_entry_msr_switch_controls(struct kvm_vcpu *vcpu, 854 struct vmcs12 *vmcs12) 855 { 856 if (CC(nested_vmx_check_msr_switch(vcpu, 857 vmcs12->vm_entry_msr_load_count, 858 vmcs12->vm_entry_msr_load_addr))) 859 return -EINVAL; 860 861 return 0; 862 } 863 864 static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu, 865 struct vmcs12 *vmcs12) 866 { 867 if (!nested_cpu_has_pml(vmcs12)) 868 return 0; 869 870 if (CC(!nested_cpu_has_ept(vmcs12)) || 871 CC(!page_address_valid(vcpu, vmcs12->pml_address))) 872 return -EINVAL; 873 874 return 0; 875 } 876 877 static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu, 878 struct vmcs12 *vmcs12) 879 { 880 if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) && 881 !nested_cpu_has_ept(vmcs12))) 882 return -EINVAL; 883 return 0; 884 } 885 886 static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu, 887 struct vmcs12 *vmcs12) 888 { 889 if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) && 890 !nested_cpu_has_ept(vmcs12))) 891 return -EINVAL; 892 return 0; 893 } 894 895 static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu, 896 struct vmcs12 *vmcs12) 897 { 898 if (!nested_cpu_has_shadow_vmcs(vmcs12)) 899 return 0; 900 901 if (CC(!page_address_valid(vcpu, vmcs12->vmread_bitmap)) || 902 CC(!page_address_valid(vcpu, vmcs12->vmwrite_bitmap))) 903 return -EINVAL; 904 905 return 0; 906 } 907 908 static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu, 909 struct vmx_msr_entry *e) 910 { 911 /* x2APIC MSR accesses are not allowed */ 912 if (CC(vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8)) 913 return -EINVAL; 914 if (CC(e->index == MSR_IA32_UCODE_WRITE) || /* SDM Table 35-2 */ 915 CC(e->index == MSR_IA32_UCODE_REV)) 916 return -EINVAL; 917 if (CC(e->reserved != 0)) 918 return -EINVAL; 919 return 0; 920 } 921 922 static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu, 923 struct vmx_msr_entry *e) 924 { 925 if (CC(e->index == MSR_FS_BASE) || 926 CC(e->index == MSR_GS_BASE) || 927 CC(e->index == MSR_IA32_SMM_MONITOR_CTL) || /* SMM is not supported */ 928 nested_vmx_msr_check_common(vcpu, e)) 929 return -EINVAL; 930 return 0; 931 } 932 933 static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu, 934 struct vmx_msr_entry *e) 935 { 936 if (CC(e->index == MSR_IA32_SMBASE) || /* SMM is not supported */ 937 nested_vmx_msr_check_common(vcpu, e)) 938 return -EINVAL; 939 return 0; 940 } 941 942 static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu) 943 { 944 struct vcpu_vmx *vmx = to_vmx(vcpu); 945 u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low, 946 vmx->nested.msrs.misc_high); 947 948 return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER; 949 } 950 951 /* 952 * Load guest's/host's msr at nested entry/exit. 953 * return 0 for success, entry index for failure. 954 * 955 * One of the failure modes for MSR load/store is when a list exceeds the 956 * virtual hardware's capacity. To maintain compatibility with hardware inasmuch 957 * as possible, process all valid entries before failing rather than precheck 958 * for a capacity violation. 959 */ 960 static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) 961 { 962 u32 i; 963 struct vmx_msr_entry e; 964 u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu); 965 966 for (i = 0; i < count; i++) { 967 if (unlikely(i >= max_msr_list_size)) 968 goto fail; 969 970 if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e), 971 &e, sizeof(e))) { 972 pr_debug_ratelimited( 973 "%s cannot read MSR entry (%u, 0x%08llx)\n", 974 __func__, i, gpa + i * sizeof(e)); 975 goto fail; 976 } 977 if (nested_vmx_load_msr_check(vcpu, &e)) { 978 pr_debug_ratelimited( 979 "%s check failed (%u, 0x%x, 0x%x)\n", 980 __func__, i, e.index, e.reserved); 981 goto fail; 982 } 983 if (kvm_set_msr(vcpu, e.index, e.value)) { 984 pr_debug_ratelimited( 985 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", 986 __func__, i, e.index, e.value); 987 goto fail; 988 } 989 } 990 return 0; 991 fail: 992 /* Note, max_msr_list_size is at most 4096, i.e. this can't wrap. */ 993 return i + 1; 994 } 995 996 static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu, 997 u32 msr_index, 998 u64 *data) 999 { 1000 struct vcpu_vmx *vmx = to_vmx(vcpu); 1001 1002 /* 1003 * If the L0 hypervisor stored a more accurate value for the TSC that 1004 * does not include the time taken for emulation of the L2->L1 1005 * VM-exit in L0, use the more accurate value. 1006 */ 1007 if (msr_index == MSR_IA32_TSC) { 1008 int i = vmx_find_loadstore_msr_slot(&vmx->msr_autostore.guest, 1009 MSR_IA32_TSC); 1010 1011 if (i >= 0) { 1012 u64 val = vmx->msr_autostore.guest.val[i].value; 1013 1014 *data = kvm_read_l1_tsc(vcpu, val); 1015 return true; 1016 } 1017 } 1018 1019 if (kvm_get_msr(vcpu, msr_index, data)) { 1020 pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__, 1021 msr_index); 1022 return false; 1023 } 1024 return true; 1025 } 1026 1027 static bool read_and_check_msr_entry(struct kvm_vcpu *vcpu, u64 gpa, int i, 1028 struct vmx_msr_entry *e) 1029 { 1030 if (kvm_vcpu_read_guest(vcpu, 1031 gpa + i * sizeof(*e), 1032 e, 2 * sizeof(u32))) { 1033 pr_debug_ratelimited( 1034 "%s cannot read MSR entry (%u, 0x%08llx)\n", 1035 __func__, i, gpa + i * sizeof(*e)); 1036 return false; 1037 } 1038 if (nested_vmx_store_msr_check(vcpu, e)) { 1039 pr_debug_ratelimited( 1040 "%s check failed (%u, 0x%x, 0x%x)\n", 1041 __func__, i, e->index, e->reserved); 1042 return false; 1043 } 1044 return true; 1045 } 1046 1047 static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) 1048 { 1049 u64 data; 1050 u32 i; 1051 struct vmx_msr_entry e; 1052 u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu); 1053 1054 for (i = 0; i < count; i++) { 1055 if (unlikely(i >= max_msr_list_size)) 1056 return -EINVAL; 1057 1058 if (!read_and_check_msr_entry(vcpu, gpa, i, &e)) 1059 return -EINVAL; 1060 1061 if (!nested_vmx_get_vmexit_msr_value(vcpu, e.index, &data)) 1062 return -EINVAL; 1063 1064 if (kvm_vcpu_write_guest(vcpu, 1065 gpa + i * sizeof(e) + 1066 offsetof(struct vmx_msr_entry, value), 1067 &data, sizeof(data))) { 1068 pr_debug_ratelimited( 1069 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", 1070 __func__, i, e.index, data); 1071 return -EINVAL; 1072 } 1073 } 1074 return 0; 1075 } 1076 1077 static bool nested_msr_store_list_has_msr(struct kvm_vcpu *vcpu, u32 msr_index) 1078 { 1079 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 1080 u32 count = vmcs12->vm_exit_msr_store_count; 1081 u64 gpa = vmcs12->vm_exit_msr_store_addr; 1082 struct vmx_msr_entry e; 1083 u32 i; 1084 1085 for (i = 0; i < count; i++) { 1086 if (!read_and_check_msr_entry(vcpu, gpa, i, &e)) 1087 return false; 1088 1089 if (e.index == msr_index) 1090 return true; 1091 } 1092 return false; 1093 } 1094 1095 static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu, 1096 u32 msr_index) 1097 { 1098 struct vcpu_vmx *vmx = to_vmx(vcpu); 1099 struct vmx_msrs *autostore = &vmx->msr_autostore.guest; 1100 bool in_vmcs12_store_list; 1101 int msr_autostore_slot; 1102 bool in_autostore_list; 1103 int last; 1104 1105 msr_autostore_slot = vmx_find_loadstore_msr_slot(autostore, msr_index); 1106 in_autostore_list = msr_autostore_slot >= 0; 1107 in_vmcs12_store_list = nested_msr_store_list_has_msr(vcpu, msr_index); 1108 1109 if (in_vmcs12_store_list && !in_autostore_list) { 1110 if (autostore->nr == MAX_NR_LOADSTORE_MSRS) { 1111 /* 1112 * Emulated VMEntry does not fail here. Instead a less 1113 * accurate value will be returned by 1114 * nested_vmx_get_vmexit_msr_value() using kvm_get_msr() 1115 * instead of reading the value from the vmcs02 VMExit 1116 * MSR-store area. 1117 */ 1118 pr_warn_ratelimited( 1119 "Not enough msr entries in msr_autostore. Can't add msr %x\n", 1120 msr_index); 1121 return; 1122 } 1123 last = autostore->nr++; 1124 autostore->val[last].index = msr_index; 1125 } else if (!in_vmcs12_store_list && in_autostore_list) { 1126 last = --autostore->nr; 1127 autostore->val[msr_autostore_slot] = autostore->val[last]; 1128 } 1129 } 1130 1131 /* 1132 * Load guest's/host's cr3 at nested entry/exit. @nested_ept is true if we are 1133 * emulating VM-Entry into a guest with EPT enabled. On failure, the expected 1134 * Exit Qualification (for a VM-Entry consistency check VM-Exit) is assigned to 1135 * @entry_failure_code. 1136 */ 1137 static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, 1138 bool nested_ept, bool reload_pdptrs, 1139 enum vm_entry_failure_code *entry_failure_code) 1140 { 1141 if (CC(!kvm_vcpu_is_legal_cr3(vcpu, cr3))) { 1142 *entry_failure_code = ENTRY_FAIL_DEFAULT; 1143 return -EINVAL; 1144 } 1145 1146 /* 1147 * If PAE paging and EPT are both on, CR3 is not used by the CPU and 1148 * must not be dereferenced. 1149 */ 1150 if (reload_pdptrs && !nested_ept && is_pae_paging(vcpu) && 1151 CC(!load_pdptrs(vcpu, cr3))) { 1152 *entry_failure_code = ENTRY_FAIL_PDPTE; 1153 return -EINVAL; 1154 } 1155 1156 vcpu->arch.cr3 = cr3; 1157 kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3); 1158 1159 /* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */ 1160 kvm_init_mmu(vcpu); 1161 1162 if (!nested_ept) 1163 kvm_mmu_new_pgd(vcpu, cr3); 1164 1165 return 0; 1166 } 1167 1168 /* 1169 * Returns if KVM is able to config CPU to tag TLB entries 1170 * populated by L2 differently than TLB entries populated 1171 * by L1. 1172 * 1173 * If L0 uses EPT, L1 and L2 run with different EPTP because 1174 * guest_mode is part of kvm_mmu_page_role. Thus, TLB entries 1175 * are tagged with different EPTP. 1176 * 1177 * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged 1178 * with different VPID (L1 entries are tagged with vmx->vpid 1179 * while L2 entries are tagged with vmx->nested.vpid02). 1180 */ 1181 static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu) 1182 { 1183 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 1184 1185 return enable_ept || 1186 (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02); 1187 } 1188 1189 static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu, 1190 struct vmcs12 *vmcs12, 1191 bool is_vmenter) 1192 { 1193 struct vcpu_vmx *vmx = to_vmx(vcpu); 1194 1195 /* Handle pending Hyper-V TLB flush requests */ 1196 kvm_hv_nested_transtion_tlb_flush(vcpu, enable_ept); 1197 1198 /* 1199 * If vmcs12 doesn't use VPID, L1 expects linear and combined mappings 1200 * for *all* contexts to be flushed on VM-Enter/VM-Exit, i.e. it's a 1201 * full TLB flush from the guest's perspective. This is required even 1202 * if VPID is disabled in the host as KVM may need to synchronize the 1203 * MMU in response to the guest TLB flush. 1204 * 1205 * Note, using TLB_FLUSH_GUEST is correct even if nested EPT is in use. 1206 * EPT is a special snowflake, as guest-physical mappings aren't 1207 * flushed on VPID invalidations, including VM-Enter or VM-Exit with 1208 * VPID disabled. As a result, KVM _never_ needs to sync nEPT 1209 * entries on VM-Enter because L1 can't rely on VM-Enter to flush 1210 * those mappings. 1211 */ 1212 if (!nested_cpu_has_vpid(vmcs12)) { 1213 kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); 1214 return; 1215 } 1216 1217 /* L2 should never have a VPID if VPID is disabled. */ 1218 WARN_ON(!enable_vpid); 1219 1220 /* 1221 * VPID is enabled and in use by vmcs12. If vpid12 is changing, then 1222 * emulate a guest TLB flush as KVM does not track vpid12 history nor 1223 * is the VPID incorporated into the MMU context. I.e. KVM must assume 1224 * that the new vpid12 has never been used and thus represents a new 1225 * guest ASID that cannot have entries in the TLB. 1226 */ 1227 if (is_vmenter && vmcs12->virtual_processor_id != vmx->nested.last_vpid) { 1228 vmx->nested.last_vpid = vmcs12->virtual_processor_id; 1229 kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); 1230 return; 1231 } 1232 1233 /* 1234 * If VPID is enabled, used by vmc12, and vpid12 is not changing but 1235 * does not have a unique TLB tag (ASID), i.e. EPT is disabled and 1236 * KVM was unable to allocate a VPID for L2, flush the current context 1237 * as the effective ASID is common to both L1 and L2. 1238 */ 1239 if (!nested_has_guest_tlb_tag(vcpu)) 1240 kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); 1241 } 1242 1243 static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask) 1244 { 1245 superset &= mask; 1246 subset &= mask; 1247 1248 return (superset | subset) == superset; 1249 } 1250 1251 static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data) 1252 { 1253 const u64 feature_and_reserved = 1254 /* feature (except bit 48; see below) */ 1255 BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) | 1256 /* reserved */ 1257 BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56); 1258 u64 vmx_basic = vmcs_config.nested.basic; 1259 1260 if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved)) 1261 return -EINVAL; 1262 1263 /* 1264 * KVM does not emulate a version of VMX that constrains physical 1265 * addresses of VMX structures (e.g. VMCS) to 32-bits. 1266 */ 1267 if (data & BIT_ULL(48)) 1268 return -EINVAL; 1269 1270 if (vmx_basic_vmcs_revision_id(vmx_basic) != 1271 vmx_basic_vmcs_revision_id(data)) 1272 return -EINVAL; 1273 1274 if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data)) 1275 return -EINVAL; 1276 1277 vmx->nested.msrs.basic = data; 1278 return 0; 1279 } 1280 1281 static void vmx_get_control_msr(struct nested_vmx_msrs *msrs, u32 msr_index, 1282 u32 **low, u32 **high) 1283 { 1284 switch (msr_index) { 1285 case MSR_IA32_VMX_TRUE_PINBASED_CTLS: 1286 *low = &msrs->pinbased_ctls_low; 1287 *high = &msrs->pinbased_ctls_high; 1288 break; 1289 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: 1290 *low = &msrs->procbased_ctls_low; 1291 *high = &msrs->procbased_ctls_high; 1292 break; 1293 case MSR_IA32_VMX_TRUE_EXIT_CTLS: 1294 *low = &msrs->exit_ctls_low; 1295 *high = &msrs->exit_ctls_high; 1296 break; 1297 case MSR_IA32_VMX_TRUE_ENTRY_CTLS: 1298 *low = &msrs->entry_ctls_low; 1299 *high = &msrs->entry_ctls_high; 1300 break; 1301 case MSR_IA32_VMX_PROCBASED_CTLS2: 1302 *low = &msrs->secondary_ctls_low; 1303 *high = &msrs->secondary_ctls_high; 1304 break; 1305 default: 1306 BUG(); 1307 } 1308 } 1309 1310 static int 1311 vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) 1312 { 1313 u32 *lowp, *highp; 1314 u64 supported; 1315 1316 vmx_get_control_msr(&vmcs_config.nested, msr_index, &lowp, &highp); 1317 1318 supported = vmx_control_msr(*lowp, *highp); 1319 1320 /* Check must-be-1 bits are still 1. */ 1321 if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0))) 1322 return -EINVAL; 1323 1324 /* Check must-be-0 bits are still 0. */ 1325 if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32))) 1326 return -EINVAL; 1327 1328 vmx_get_control_msr(&vmx->nested.msrs, msr_index, &lowp, &highp); 1329 *lowp = data; 1330 *highp = data >> 32; 1331 return 0; 1332 } 1333 1334 static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data) 1335 { 1336 const u64 feature_and_reserved_bits = 1337 /* feature */ 1338 BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) | 1339 BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) | 1340 /* reserved */ 1341 GENMASK_ULL(13, 9) | BIT_ULL(31); 1342 u64 vmx_misc = vmx_control_msr(vmcs_config.nested.misc_low, 1343 vmcs_config.nested.misc_high); 1344 1345 if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits)) 1346 return -EINVAL; 1347 1348 if ((vmx->nested.msrs.pinbased_ctls_high & 1349 PIN_BASED_VMX_PREEMPTION_TIMER) && 1350 vmx_misc_preemption_timer_rate(data) != 1351 vmx_misc_preemption_timer_rate(vmx_misc)) 1352 return -EINVAL; 1353 1354 if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc)) 1355 return -EINVAL; 1356 1357 if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc)) 1358 return -EINVAL; 1359 1360 if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc)) 1361 return -EINVAL; 1362 1363 vmx->nested.msrs.misc_low = data; 1364 vmx->nested.msrs.misc_high = data >> 32; 1365 1366 return 0; 1367 } 1368 1369 static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data) 1370 { 1371 u64 vmx_ept_vpid_cap = vmx_control_msr(vmcs_config.nested.ept_caps, 1372 vmcs_config.nested.vpid_caps); 1373 1374 /* Every bit is either reserved or a feature bit. */ 1375 if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL)) 1376 return -EINVAL; 1377 1378 vmx->nested.msrs.ept_caps = data; 1379 vmx->nested.msrs.vpid_caps = data >> 32; 1380 return 0; 1381 } 1382 1383 static u64 *vmx_get_fixed0_msr(struct nested_vmx_msrs *msrs, u32 msr_index) 1384 { 1385 switch (msr_index) { 1386 case MSR_IA32_VMX_CR0_FIXED0: 1387 return &msrs->cr0_fixed0; 1388 case MSR_IA32_VMX_CR4_FIXED0: 1389 return &msrs->cr4_fixed0; 1390 default: 1391 BUG(); 1392 } 1393 } 1394 1395 static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) 1396 { 1397 const u64 *msr = vmx_get_fixed0_msr(&vmcs_config.nested, msr_index); 1398 1399 /* 1400 * 1 bits (which indicates bits which "must-be-1" during VMX operation) 1401 * must be 1 in the restored value. 1402 */ 1403 if (!is_bitwise_subset(data, *msr, -1ULL)) 1404 return -EINVAL; 1405 1406 *vmx_get_fixed0_msr(&vmx->nested.msrs, msr_index) = data; 1407 return 0; 1408 } 1409 1410 /* 1411 * Called when userspace is restoring VMX MSRs. 1412 * 1413 * Returns 0 on success, non-0 otherwise. 1414 */ 1415 int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) 1416 { 1417 struct vcpu_vmx *vmx = to_vmx(vcpu); 1418 1419 /* 1420 * Don't allow changes to the VMX capability MSRs while the vCPU 1421 * is in VMX operation. 1422 */ 1423 if (vmx->nested.vmxon) 1424 return -EBUSY; 1425 1426 switch (msr_index) { 1427 case MSR_IA32_VMX_BASIC: 1428 return vmx_restore_vmx_basic(vmx, data); 1429 case MSR_IA32_VMX_PINBASED_CTLS: 1430 case MSR_IA32_VMX_PROCBASED_CTLS: 1431 case MSR_IA32_VMX_EXIT_CTLS: 1432 case MSR_IA32_VMX_ENTRY_CTLS: 1433 /* 1434 * The "non-true" VMX capability MSRs are generated from the 1435 * "true" MSRs, so we do not support restoring them directly. 1436 * 1437 * If userspace wants to emulate VMX_BASIC[55]=0, userspace 1438 * should restore the "true" MSRs with the must-be-1 bits 1439 * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND 1440 * DEFAULT SETTINGS". 1441 */ 1442 return -EINVAL; 1443 case MSR_IA32_VMX_TRUE_PINBASED_CTLS: 1444 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: 1445 case MSR_IA32_VMX_TRUE_EXIT_CTLS: 1446 case MSR_IA32_VMX_TRUE_ENTRY_CTLS: 1447 case MSR_IA32_VMX_PROCBASED_CTLS2: 1448 return vmx_restore_control_msr(vmx, msr_index, data); 1449 case MSR_IA32_VMX_MISC: 1450 return vmx_restore_vmx_misc(vmx, data); 1451 case MSR_IA32_VMX_CR0_FIXED0: 1452 case MSR_IA32_VMX_CR4_FIXED0: 1453 return vmx_restore_fixed0_msr(vmx, msr_index, data); 1454 case MSR_IA32_VMX_CR0_FIXED1: 1455 case MSR_IA32_VMX_CR4_FIXED1: 1456 /* 1457 * These MSRs are generated based on the vCPU's CPUID, so we 1458 * do not support restoring them directly. 1459 */ 1460 return -EINVAL; 1461 case MSR_IA32_VMX_EPT_VPID_CAP: 1462 return vmx_restore_vmx_ept_vpid_cap(vmx, data); 1463 case MSR_IA32_VMX_VMCS_ENUM: 1464 vmx->nested.msrs.vmcs_enum = data; 1465 return 0; 1466 case MSR_IA32_VMX_VMFUNC: 1467 if (data & ~vmcs_config.nested.vmfunc_controls) 1468 return -EINVAL; 1469 vmx->nested.msrs.vmfunc_controls = data; 1470 return 0; 1471 default: 1472 /* 1473 * The rest of the VMX capability MSRs do not support restore. 1474 */ 1475 return -EINVAL; 1476 } 1477 } 1478 1479 /* Returns 0 on success, non-0 otherwise. */ 1480 int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata) 1481 { 1482 switch (msr_index) { 1483 case MSR_IA32_VMX_BASIC: 1484 *pdata = msrs->basic; 1485 break; 1486 case MSR_IA32_VMX_TRUE_PINBASED_CTLS: 1487 case MSR_IA32_VMX_PINBASED_CTLS: 1488 *pdata = vmx_control_msr( 1489 msrs->pinbased_ctls_low, 1490 msrs->pinbased_ctls_high); 1491 if (msr_index == MSR_IA32_VMX_PINBASED_CTLS) 1492 *pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 1493 break; 1494 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: 1495 case MSR_IA32_VMX_PROCBASED_CTLS: 1496 *pdata = vmx_control_msr( 1497 msrs->procbased_ctls_low, 1498 msrs->procbased_ctls_high); 1499 if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS) 1500 *pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 1501 break; 1502 case MSR_IA32_VMX_TRUE_EXIT_CTLS: 1503 case MSR_IA32_VMX_EXIT_CTLS: 1504 *pdata = vmx_control_msr( 1505 msrs->exit_ctls_low, 1506 msrs->exit_ctls_high); 1507 if (msr_index == MSR_IA32_VMX_EXIT_CTLS) 1508 *pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; 1509 break; 1510 case MSR_IA32_VMX_TRUE_ENTRY_CTLS: 1511 case MSR_IA32_VMX_ENTRY_CTLS: 1512 *pdata = vmx_control_msr( 1513 msrs->entry_ctls_low, 1514 msrs->entry_ctls_high); 1515 if (msr_index == MSR_IA32_VMX_ENTRY_CTLS) 1516 *pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; 1517 break; 1518 case MSR_IA32_VMX_MISC: 1519 *pdata = vmx_control_msr( 1520 msrs->misc_low, 1521 msrs->misc_high); 1522 break; 1523 case MSR_IA32_VMX_CR0_FIXED0: 1524 *pdata = msrs->cr0_fixed0; 1525 break; 1526 case MSR_IA32_VMX_CR0_FIXED1: 1527 *pdata = msrs->cr0_fixed1; 1528 break; 1529 case MSR_IA32_VMX_CR4_FIXED0: 1530 *pdata = msrs->cr4_fixed0; 1531 break; 1532 case MSR_IA32_VMX_CR4_FIXED1: 1533 *pdata = msrs->cr4_fixed1; 1534 break; 1535 case MSR_IA32_VMX_VMCS_ENUM: 1536 *pdata = msrs->vmcs_enum; 1537 break; 1538 case MSR_IA32_VMX_PROCBASED_CTLS2: 1539 *pdata = vmx_control_msr( 1540 msrs->secondary_ctls_low, 1541 msrs->secondary_ctls_high); 1542 break; 1543 case MSR_IA32_VMX_EPT_VPID_CAP: 1544 *pdata = msrs->ept_caps | 1545 ((u64)msrs->vpid_caps << 32); 1546 break; 1547 case MSR_IA32_VMX_VMFUNC: 1548 *pdata = msrs->vmfunc_controls; 1549 break; 1550 default: 1551 return 1; 1552 } 1553 1554 return 0; 1555 } 1556 1557 /* 1558 * Copy the writable VMCS shadow fields back to the VMCS12, in case they have 1559 * been modified by the L1 guest. Note, "writable" in this context means 1560 * "writable by the guest", i.e. tagged SHADOW_FIELD_RW; the set of 1561 * fields tagged SHADOW_FIELD_RO may or may not align with the "read-only" 1562 * VM-exit information fields (which are actually writable if the vCPU is 1563 * configured to support "VMWRITE to any supported field in the VMCS"). 1564 */ 1565 static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) 1566 { 1567 struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; 1568 struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu); 1569 struct shadow_vmcs_field field; 1570 unsigned long val; 1571 int i; 1572 1573 if (WARN_ON(!shadow_vmcs)) 1574 return; 1575 1576 preempt_disable(); 1577 1578 vmcs_load(shadow_vmcs); 1579 1580 for (i = 0; i < max_shadow_read_write_fields; i++) { 1581 field = shadow_read_write_fields[i]; 1582 val = __vmcs_readl(field.encoding); 1583 vmcs12_write_any(vmcs12, field.encoding, field.offset, val); 1584 } 1585 1586 vmcs_clear(shadow_vmcs); 1587 vmcs_load(vmx->loaded_vmcs->vmcs); 1588 1589 preempt_enable(); 1590 } 1591 1592 static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) 1593 { 1594 const struct shadow_vmcs_field *fields[] = { 1595 shadow_read_write_fields, 1596 shadow_read_only_fields 1597 }; 1598 const int max_fields[] = { 1599 max_shadow_read_write_fields, 1600 max_shadow_read_only_fields 1601 }; 1602 struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; 1603 struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu); 1604 struct shadow_vmcs_field field; 1605 unsigned long val; 1606 int i, q; 1607 1608 if (WARN_ON(!shadow_vmcs)) 1609 return; 1610 1611 vmcs_load(shadow_vmcs); 1612 1613 for (q = 0; q < ARRAY_SIZE(fields); q++) { 1614 for (i = 0; i < max_fields[q]; i++) { 1615 field = fields[q][i]; 1616 val = vmcs12_read_any(vmcs12, field.encoding, 1617 field.offset); 1618 __vmcs_writel(field.encoding, val); 1619 } 1620 } 1621 1622 vmcs_clear(shadow_vmcs); 1623 vmcs_load(vmx->loaded_vmcs->vmcs); 1624 } 1625 1626 static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields) 1627 { 1628 #ifdef CONFIG_KVM_HYPERV 1629 struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; 1630 struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); 1631 struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(&vmx->vcpu); 1632 1633 /* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */ 1634 vmcs12->tpr_threshold = evmcs->tpr_threshold; 1635 vmcs12->guest_rip = evmcs->guest_rip; 1636 1637 if (unlikely(!(hv_clean_fields & 1638 HV_VMX_ENLIGHTENED_CLEAN_FIELD_ENLIGHTENMENTSCONTROL))) { 1639 hv_vcpu->nested.pa_page_gpa = evmcs->partition_assist_page; 1640 hv_vcpu->nested.vm_id = evmcs->hv_vm_id; 1641 hv_vcpu->nested.vp_id = evmcs->hv_vp_id; 1642 } 1643 1644 if (unlikely(!(hv_clean_fields & 1645 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) { 1646 vmcs12->guest_rsp = evmcs->guest_rsp; 1647 vmcs12->guest_rflags = evmcs->guest_rflags; 1648 vmcs12->guest_interruptibility_info = 1649 evmcs->guest_interruptibility_info; 1650 /* 1651 * Not present in struct vmcs12: 1652 * vmcs12->guest_ssp = evmcs->guest_ssp; 1653 */ 1654 } 1655 1656 if (unlikely(!(hv_clean_fields & 1657 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) { 1658 vmcs12->cpu_based_vm_exec_control = 1659 evmcs->cpu_based_vm_exec_control; 1660 } 1661 1662 if (unlikely(!(hv_clean_fields & 1663 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN))) { 1664 vmcs12->exception_bitmap = evmcs->exception_bitmap; 1665 } 1666 1667 if (unlikely(!(hv_clean_fields & 1668 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) { 1669 vmcs12->vm_entry_controls = evmcs->vm_entry_controls; 1670 } 1671 1672 if (unlikely(!(hv_clean_fields & 1673 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) { 1674 vmcs12->vm_entry_intr_info_field = 1675 evmcs->vm_entry_intr_info_field; 1676 vmcs12->vm_entry_exception_error_code = 1677 evmcs->vm_entry_exception_error_code; 1678 vmcs12->vm_entry_instruction_len = 1679 evmcs->vm_entry_instruction_len; 1680 } 1681 1682 if (unlikely(!(hv_clean_fields & 1683 HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) { 1684 vmcs12->host_ia32_pat = evmcs->host_ia32_pat; 1685 vmcs12->host_ia32_efer = evmcs->host_ia32_efer; 1686 vmcs12->host_cr0 = evmcs->host_cr0; 1687 vmcs12->host_cr3 = evmcs->host_cr3; 1688 vmcs12->host_cr4 = evmcs->host_cr4; 1689 vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp; 1690 vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip; 1691 vmcs12->host_rip = evmcs->host_rip; 1692 vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs; 1693 vmcs12->host_es_selector = evmcs->host_es_selector; 1694 vmcs12->host_cs_selector = evmcs->host_cs_selector; 1695 vmcs12->host_ss_selector = evmcs->host_ss_selector; 1696 vmcs12->host_ds_selector = evmcs->host_ds_selector; 1697 vmcs12->host_fs_selector = evmcs->host_fs_selector; 1698 vmcs12->host_gs_selector = evmcs->host_gs_selector; 1699 vmcs12->host_tr_selector = evmcs->host_tr_selector; 1700 vmcs12->host_ia32_perf_global_ctrl = evmcs->host_ia32_perf_global_ctrl; 1701 /* 1702 * Not present in struct vmcs12: 1703 * vmcs12->host_ia32_s_cet = evmcs->host_ia32_s_cet; 1704 * vmcs12->host_ssp = evmcs->host_ssp; 1705 * vmcs12->host_ia32_int_ssp_table_addr = evmcs->host_ia32_int_ssp_table_addr; 1706 */ 1707 } 1708 1709 if (unlikely(!(hv_clean_fields & 1710 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1))) { 1711 vmcs12->pin_based_vm_exec_control = 1712 evmcs->pin_based_vm_exec_control; 1713 vmcs12->vm_exit_controls = evmcs->vm_exit_controls; 1714 vmcs12->secondary_vm_exec_control = 1715 evmcs->secondary_vm_exec_control; 1716 } 1717 1718 if (unlikely(!(hv_clean_fields & 1719 HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) { 1720 vmcs12->io_bitmap_a = evmcs->io_bitmap_a; 1721 vmcs12->io_bitmap_b = evmcs->io_bitmap_b; 1722 } 1723 1724 if (unlikely(!(hv_clean_fields & 1725 HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) { 1726 vmcs12->msr_bitmap = evmcs->msr_bitmap; 1727 } 1728 1729 if (unlikely(!(hv_clean_fields & 1730 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) { 1731 vmcs12->guest_es_base = evmcs->guest_es_base; 1732 vmcs12->guest_cs_base = evmcs->guest_cs_base; 1733 vmcs12->guest_ss_base = evmcs->guest_ss_base; 1734 vmcs12->guest_ds_base = evmcs->guest_ds_base; 1735 vmcs12->guest_fs_base = evmcs->guest_fs_base; 1736 vmcs12->guest_gs_base = evmcs->guest_gs_base; 1737 vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base; 1738 vmcs12->guest_tr_base = evmcs->guest_tr_base; 1739 vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base; 1740 vmcs12->guest_idtr_base = evmcs->guest_idtr_base; 1741 vmcs12->guest_es_limit = evmcs->guest_es_limit; 1742 vmcs12->guest_cs_limit = evmcs->guest_cs_limit; 1743 vmcs12->guest_ss_limit = evmcs->guest_ss_limit; 1744 vmcs12->guest_ds_limit = evmcs->guest_ds_limit; 1745 vmcs12->guest_fs_limit = evmcs->guest_fs_limit; 1746 vmcs12->guest_gs_limit = evmcs->guest_gs_limit; 1747 vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit; 1748 vmcs12->guest_tr_limit = evmcs->guest_tr_limit; 1749 vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit; 1750 vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit; 1751 vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes; 1752 vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes; 1753 vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes; 1754 vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes; 1755 vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes; 1756 vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes; 1757 vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes; 1758 vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes; 1759 vmcs12->guest_es_selector = evmcs->guest_es_selector; 1760 vmcs12->guest_cs_selector = evmcs->guest_cs_selector; 1761 vmcs12->guest_ss_selector = evmcs->guest_ss_selector; 1762 vmcs12->guest_ds_selector = evmcs->guest_ds_selector; 1763 vmcs12->guest_fs_selector = evmcs->guest_fs_selector; 1764 vmcs12->guest_gs_selector = evmcs->guest_gs_selector; 1765 vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector; 1766 vmcs12->guest_tr_selector = evmcs->guest_tr_selector; 1767 } 1768 1769 if (unlikely(!(hv_clean_fields & 1770 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) { 1771 vmcs12->tsc_offset = evmcs->tsc_offset; 1772 vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr; 1773 vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap; 1774 vmcs12->encls_exiting_bitmap = evmcs->encls_exiting_bitmap; 1775 vmcs12->tsc_multiplier = evmcs->tsc_multiplier; 1776 } 1777 1778 if (unlikely(!(hv_clean_fields & 1779 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) { 1780 vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask; 1781 vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask; 1782 vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow; 1783 vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow; 1784 vmcs12->guest_cr0 = evmcs->guest_cr0; 1785 vmcs12->guest_cr3 = evmcs->guest_cr3; 1786 vmcs12->guest_cr4 = evmcs->guest_cr4; 1787 vmcs12->guest_dr7 = evmcs->guest_dr7; 1788 } 1789 1790 if (unlikely(!(hv_clean_fields & 1791 HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) { 1792 vmcs12->host_fs_base = evmcs->host_fs_base; 1793 vmcs12->host_gs_base = evmcs->host_gs_base; 1794 vmcs12->host_tr_base = evmcs->host_tr_base; 1795 vmcs12->host_gdtr_base = evmcs->host_gdtr_base; 1796 vmcs12->host_idtr_base = evmcs->host_idtr_base; 1797 vmcs12->host_rsp = evmcs->host_rsp; 1798 } 1799 1800 if (unlikely(!(hv_clean_fields & 1801 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) { 1802 vmcs12->ept_pointer = evmcs->ept_pointer; 1803 vmcs12->virtual_processor_id = evmcs->virtual_processor_id; 1804 } 1805 1806 if (unlikely(!(hv_clean_fields & 1807 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) { 1808 vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer; 1809 vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl; 1810 vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat; 1811 vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer; 1812 vmcs12->guest_pdptr0 = evmcs->guest_pdptr0; 1813 vmcs12->guest_pdptr1 = evmcs->guest_pdptr1; 1814 vmcs12->guest_pdptr2 = evmcs->guest_pdptr2; 1815 vmcs12->guest_pdptr3 = evmcs->guest_pdptr3; 1816 vmcs12->guest_pending_dbg_exceptions = 1817 evmcs->guest_pending_dbg_exceptions; 1818 vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp; 1819 vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip; 1820 vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs; 1821 vmcs12->guest_activity_state = evmcs->guest_activity_state; 1822 vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs; 1823 vmcs12->guest_ia32_perf_global_ctrl = evmcs->guest_ia32_perf_global_ctrl; 1824 /* 1825 * Not present in struct vmcs12: 1826 * vmcs12->guest_ia32_s_cet = evmcs->guest_ia32_s_cet; 1827 * vmcs12->guest_ia32_lbr_ctl = evmcs->guest_ia32_lbr_ctl; 1828 * vmcs12->guest_ia32_int_ssp_table_addr = evmcs->guest_ia32_int_ssp_table_addr; 1829 */ 1830 } 1831 1832 /* 1833 * Not used? 1834 * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr; 1835 * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr; 1836 * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr; 1837 * vmcs12->page_fault_error_code_mask = 1838 * evmcs->page_fault_error_code_mask; 1839 * vmcs12->page_fault_error_code_match = 1840 * evmcs->page_fault_error_code_match; 1841 * vmcs12->cr3_target_count = evmcs->cr3_target_count; 1842 * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count; 1843 * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count; 1844 * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count; 1845 */ 1846 1847 /* 1848 * Read only fields: 1849 * vmcs12->guest_physical_address = evmcs->guest_physical_address; 1850 * vmcs12->vm_instruction_error = evmcs->vm_instruction_error; 1851 * vmcs12->vm_exit_reason = evmcs->vm_exit_reason; 1852 * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info; 1853 * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code; 1854 * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field; 1855 * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code; 1856 * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len; 1857 * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info; 1858 * vmcs12->exit_qualification = evmcs->exit_qualification; 1859 * vmcs12->guest_linear_address = evmcs->guest_linear_address; 1860 * 1861 * Not present in struct vmcs12: 1862 * vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx; 1863 * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi; 1864 * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi; 1865 * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip; 1866 */ 1867 1868 return; 1869 #else /* CONFIG_KVM_HYPERV */ 1870 KVM_BUG_ON(1, vmx->vcpu.kvm); 1871 #endif /* CONFIG_KVM_HYPERV */ 1872 } 1873 1874 static void copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx) 1875 { 1876 #ifdef CONFIG_KVM_HYPERV 1877 struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; 1878 struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); 1879 1880 /* 1881 * Should not be changed by KVM: 1882 * 1883 * evmcs->host_es_selector = vmcs12->host_es_selector; 1884 * evmcs->host_cs_selector = vmcs12->host_cs_selector; 1885 * evmcs->host_ss_selector = vmcs12->host_ss_selector; 1886 * evmcs->host_ds_selector = vmcs12->host_ds_selector; 1887 * evmcs->host_fs_selector = vmcs12->host_fs_selector; 1888 * evmcs->host_gs_selector = vmcs12->host_gs_selector; 1889 * evmcs->host_tr_selector = vmcs12->host_tr_selector; 1890 * evmcs->host_ia32_pat = vmcs12->host_ia32_pat; 1891 * evmcs->host_ia32_efer = vmcs12->host_ia32_efer; 1892 * evmcs->host_cr0 = vmcs12->host_cr0; 1893 * evmcs->host_cr3 = vmcs12->host_cr3; 1894 * evmcs->host_cr4 = vmcs12->host_cr4; 1895 * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp; 1896 * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip; 1897 * evmcs->host_rip = vmcs12->host_rip; 1898 * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs; 1899 * evmcs->host_fs_base = vmcs12->host_fs_base; 1900 * evmcs->host_gs_base = vmcs12->host_gs_base; 1901 * evmcs->host_tr_base = vmcs12->host_tr_base; 1902 * evmcs->host_gdtr_base = vmcs12->host_gdtr_base; 1903 * evmcs->host_idtr_base = vmcs12->host_idtr_base; 1904 * evmcs->host_rsp = vmcs12->host_rsp; 1905 * sync_vmcs02_to_vmcs12() doesn't read these: 1906 * evmcs->io_bitmap_a = vmcs12->io_bitmap_a; 1907 * evmcs->io_bitmap_b = vmcs12->io_bitmap_b; 1908 * evmcs->msr_bitmap = vmcs12->msr_bitmap; 1909 * evmcs->ept_pointer = vmcs12->ept_pointer; 1910 * evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap; 1911 * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr; 1912 * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr; 1913 * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr; 1914 * evmcs->tpr_threshold = vmcs12->tpr_threshold; 1915 * evmcs->virtual_processor_id = vmcs12->virtual_processor_id; 1916 * evmcs->exception_bitmap = vmcs12->exception_bitmap; 1917 * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer; 1918 * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control; 1919 * evmcs->vm_exit_controls = vmcs12->vm_exit_controls; 1920 * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control; 1921 * evmcs->page_fault_error_code_mask = 1922 * vmcs12->page_fault_error_code_mask; 1923 * evmcs->page_fault_error_code_match = 1924 * vmcs12->page_fault_error_code_match; 1925 * evmcs->cr3_target_count = vmcs12->cr3_target_count; 1926 * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr; 1927 * evmcs->tsc_offset = vmcs12->tsc_offset; 1928 * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl; 1929 * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask; 1930 * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask; 1931 * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow; 1932 * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow; 1933 * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count; 1934 * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count; 1935 * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count; 1936 * evmcs->guest_ia32_perf_global_ctrl = vmcs12->guest_ia32_perf_global_ctrl; 1937 * evmcs->host_ia32_perf_global_ctrl = vmcs12->host_ia32_perf_global_ctrl; 1938 * evmcs->encls_exiting_bitmap = vmcs12->encls_exiting_bitmap; 1939 * evmcs->tsc_multiplier = vmcs12->tsc_multiplier; 1940 * 1941 * Not present in struct vmcs12: 1942 * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx; 1943 * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi; 1944 * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi; 1945 * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip; 1946 * evmcs->host_ia32_s_cet = vmcs12->host_ia32_s_cet; 1947 * evmcs->host_ssp = vmcs12->host_ssp; 1948 * evmcs->host_ia32_int_ssp_table_addr = vmcs12->host_ia32_int_ssp_table_addr; 1949 * evmcs->guest_ia32_s_cet = vmcs12->guest_ia32_s_cet; 1950 * evmcs->guest_ia32_lbr_ctl = vmcs12->guest_ia32_lbr_ctl; 1951 * evmcs->guest_ia32_int_ssp_table_addr = vmcs12->guest_ia32_int_ssp_table_addr; 1952 * evmcs->guest_ssp = vmcs12->guest_ssp; 1953 */ 1954 1955 evmcs->guest_es_selector = vmcs12->guest_es_selector; 1956 evmcs->guest_cs_selector = vmcs12->guest_cs_selector; 1957 evmcs->guest_ss_selector = vmcs12->guest_ss_selector; 1958 evmcs->guest_ds_selector = vmcs12->guest_ds_selector; 1959 evmcs->guest_fs_selector = vmcs12->guest_fs_selector; 1960 evmcs->guest_gs_selector = vmcs12->guest_gs_selector; 1961 evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector; 1962 evmcs->guest_tr_selector = vmcs12->guest_tr_selector; 1963 1964 evmcs->guest_es_limit = vmcs12->guest_es_limit; 1965 evmcs->guest_cs_limit = vmcs12->guest_cs_limit; 1966 evmcs->guest_ss_limit = vmcs12->guest_ss_limit; 1967 evmcs->guest_ds_limit = vmcs12->guest_ds_limit; 1968 evmcs->guest_fs_limit = vmcs12->guest_fs_limit; 1969 evmcs->guest_gs_limit = vmcs12->guest_gs_limit; 1970 evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit; 1971 evmcs->guest_tr_limit = vmcs12->guest_tr_limit; 1972 evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit; 1973 evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit; 1974 1975 evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes; 1976 evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes; 1977 evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes; 1978 evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes; 1979 evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes; 1980 evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes; 1981 evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes; 1982 evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes; 1983 1984 evmcs->guest_es_base = vmcs12->guest_es_base; 1985 evmcs->guest_cs_base = vmcs12->guest_cs_base; 1986 evmcs->guest_ss_base = vmcs12->guest_ss_base; 1987 evmcs->guest_ds_base = vmcs12->guest_ds_base; 1988 evmcs->guest_fs_base = vmcs12->guest_fs_base; 1989 evmcs->guest_gs_base = vmcs12->guest_gs_base; 1990 evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base; 1991 evmcs->guest_tr_base = vmcs12->guest_tr_base; 1992 evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base; 1993 evmcs->guest_idtr_base = vmcs12->guest_idtr_base; 1994 1995 evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat; 1996 evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer; 1997 1998 evmcs->guest_pdptr0 = vmcs12->guest_pdptr0; 1999 evmcs->guest_pdptr1 = vmcs12->guest_pdptr1; 2000 evmcs->guest_pdptr2 = vmcs12->guest_pdptr2; 2001 evmcs->guest_pdptr3 = vmcs12->guest_pdptr3; 2002 2003 evmcs->guest_pending_dbg_exceptions = 2004 vmcs12->guest_pending_dbg_exceptions; 2005 evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp; 2006 evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip; 2007 2008 evmcs->guest_activity_state = vmcs12->guest_activity_state; 2009 evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs; 2010 2011 evmcs->guest_cr0 = vmcs12->guest_cr0; 2012 evmcs->guest_cr3 = vmcs12->guest_cr3; 2013 evmcs->guest_cr4 = vmcs12->guest_cr4; 2014 evmcs->guest_dr7 = vmcs12->guest_dr7; 2015 2016 evmcs->guest_physical_address = vmcs12->guest_physical_address; 2017 2018 evmcs->vm_instruction_error = vmcs12->vm_instruction_error; 2019 evmcs->vm_exit_reason = vmcs12->vm_exit_reason; 2020 evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info; 2021 evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code; 2022 evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field; 2023 evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code; 2024 evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len; 2025 evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info; 2026 2027 evmcs->exit_qualification = vmcs12->exit_qualification; 2028 2029 evmcs->guest_linear_address = vmcs12->guest_linear_address; 2030 evmcs->guest_rsp = vmcs12->guest_rsp; 2031 evmcs->guest_rflags = vmcs12->guest_rflags; 2032 2033 evmcs->guest_interruptibility_info = 2034 vmcs12->guest_interruptibility_info; 2035 evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control; 2036 evmcs->vm_entry_controls = vmcs12->vm_entry_controls; 2037 evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field; 2038 evmcs->vm_entry_exception_error_code = 2039 vmcs12->vm_entry_exception_error_code; 2040 evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len; 2041 2042 evmcs->guest_rip = vmcs12->guest_rip; 2043 2044 evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs; 2045 2046 return; 2047 #else /* CONFIG_KVM_HYPERV */ 2048 KVM_BUG_ON(1, vmx->vcpu.kvm); 2049 #endif /* CONFIG_KVM_HYPERV */ 2050 } 2051 2052 /* 2053 * This is an equivalent of the nested hypervisor executing the vmptrld 2054 * instruction. 2055 */ 2056 static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld( 2057 struct kvm_vcpu *vcpu, bool from_launch) 2058 { 2059 #ifdef CONFIG_KVM_HYPERV 2060 struct vcpu_vmx *vmx = to_vmx(vcpu); 2061 bool evmcs_gpa_changed = false; 2062 u64 evmcs_gpa; 2063 2064 if (likely(!guest_cpuid_has_evmcs(vcpu))) 2065 return EVMPTRLD_DISABLED; 2066 2067 evmcs_gpa = nested_get_evmptr(vcpu); 2068 if (!evmptr_is_valid(evmcs_gpa)) { 2069 nested_release_evmcs(vcpu); 2070 return EVMPTRLD_DISABLED; 2071 } 2072 2073 if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) { 2074 vmx->nested.current_vmptr = INVALID_GPA; 2075 2076 nested_release_evmcs(vcpu); 2077 2078 if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa), 2079 &vmx->nested.hv_evmcs_map)) 2080 return EVMPTRLD_ERROR; 2081 2082 vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva; 2083 2084 /* 2085 * Currently, KVM only supports eVMCS version 1 2086 * (== KVM_EVMCS_VERSION) and thus we expect guest to set this 2087 * value to first u32 field of eVMCS which should specify eVMCS 2088 * VersionNumber. 2089 * 2090 * Guest should be aware of supported eVMCS versions by host by 2091 * examining CPUID.0x4000000A.EAX[0:15]. Host userspace VMM is 2092 * expected to set this CPUID leaf according to the value 2093 * returned in vmcs_version from nested_enable_evmcs(). 2094 * 2095 * However, it turns out that Microsoft Hyper-V fails to comply 2096 * to their own invented interface: When Hyper-V use eVMCS, it 2097 * just sets first u32 field of eVMCS to revision_id specified 2098 * in MSR_IA32_VMX_BASIC. Instead of used eVMCS version number 2099 * which is one of the supported versions specified in 2100 * CPUID.0x4000000A.EAX[0:15]. 2101 * 2102 * To overcome Hyper-V bug, we accept here either a supported 2103 * eVMCS version or VMCS12 revision_id as valid values for first 2104 * u32 field of eVMCS. 2105 */ 2106 if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) && 2107 (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) { 2108 nested_release_evmcs(vcpu); 2109 return EVMPTRLD_VMFAIL; 2110 } 2111 2112 vmx->nested.hv_evmcs_vmptr = evmcs_gpa; 2113 2114 evmcs_gpa_changed = true; 2115 /* 2116 * Unlike normal vmcs12, enlightened vmcs12 is not fully 2117 * reloaded from guest's memory (read only fields, fields not 2118 * present in struct hv_enlightened_vmcs, ...). Make sure there 2119 * are no leftovers. 2120 */ 2121 if (from_launch) { 2122 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 2123 memset(vmcs12, 0, sizeof(*vmcs12)); 2124 vmcs12->hdr.revision_id = VMCS12_REVISION; 2125 } 2126 2127 } 2128 2129 /* 2130 * Clean fields data can't be used on VMLAUNCH and when we switch 2131 * between different L2 guests as KVM keeps a single VMCS12 per L1. 2132 */ 2133 if (from_launch || evmcs_gpa_changed) { 2134 vmx->nested.hv_evmcs->hv_clean_fields &= 2135 ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; 2136 2137 vmx->nested.force_msr_bitmap_recalc = true; 2138 } 2139 2140 return EVMPTRLD_SUCCEEDED; 2141 #else 2142 return EVMPTRLD_DISABLED; 2143 #endif 2144 } 2145 2146 void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu) 2147 { 2148 struct vcpu_vmx *vmx = to_vmx(vcpu); 2149 2150 if (nested_vmx_is_evmptr12_valid(vmx)) 2151 copy_vmcs12_to_enlightened(vmx); 2152 else 2153 copy_vmcs12_to_shadow(vmx); 2154 2155 vmx->nested.need_vmcs12_to_shadow_sync = false; 2156 } 2157 2158 static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) 2159 { 2160 struct vcpu_vmx *vmx = 2161 container_of(timer, struct vcpu_vmx, nested.preemption_timer); 2162 2163 vmx->nested.preemption_timer_expired = true; 2164 kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu); 2165 kvm_vcpu_kick(&vmx->vcpu); 2166 2167 return HRTIMER_NORESTART; 2168 } 2169 2170 static u64 vmx_calc_preemption_timer_value(struct kvm_vcpu *vcpu) 2171 { 2172 struct vcpu_vmx *vmx = to_vmx(vcpu); 2173 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 2174 2175 u64 l1_scaled_tsc = kvm_read_l1_tsc(vcpu, rdtsc()) >> 2176 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 2177 2178 if (!vmx->nested.has_preemption_timer_deadline) { 2179 vmx->nested.preemption_timer_deadline = 2180 vmcs12->vmx_preemption_timer_value + l1_scaled_tsc; 2181 vmx->nested.has_preemption_timer_deadline = true; 2182 } 2183 return vmx->nested.preemption_timer_deadline - l1_scaled_tsc; 2184 } 2185 2186 static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu, 2187 u64 preemption_timeout) 2188 { 2189 struct vcpu_vmx *vmx = to_vmx(vcpu); 2190 2191 /* 2192 * A timer value of zero is architecturally guaranteed to cause 2193 * a VMExit prior to executing any instructions in the guest. 2194 */ 2195 if (preemption_timeout == 0) { 2196 vmx_preemption_timer_fn(&vmx->nested.preemption_timer); 2197 return; 2198 } 2199 2200 if (vcpu->arch.virtual_tsc_khz == 0) 2201 return; 2202 2203 preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 2204 preemption_timeout *= 1000000; 2205 do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz); 2206 hrtimer_start(&vmx->nested.preemption_timer, 2207 ktime_add_ns(ktime_get(), preemption_timeout), 2208 HRTIMER_MODE_ABS_PINNED); 2209 } 2210 2211 static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) 2212 { 2213 if (vmx->nested.nested_run_pending && 2214 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) 2215 return vmcs12->guest_ia32_efer; 2216 else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) 2217 return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME); 2218 else 2219 return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME); 2220 } 2221 2222 static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) 2223 { 2224 struct kvm *kvm = vmx->vcpu.kvm; 2225 2226 /* 2227 * If vmcs02 hasn't been initialized, set the constant vmcs02 state 2228 * according to L0's settings (vmcs12 is irrelevant here). Host 2229 * fields that come from L0 and are not constant, e.g. HOST_CR3, 2230 * will be set as needed prior to VMLAUNCH/VMRESUME. 2231 */ 2232 if (vmx->nested.vmcs02_initialized) 2233 return; 2234 vmx->nested.vmcs02_initialized = true; 2235 2236 /* 2237 * We don't care what the EPTP value is we just need to guarantee 2238 * it's valid so we don't get a false positive when doing early 2239 * consistency checks. 2240 */ 2241 if (enable_ept && nested_early_check) 2242 vmcs_write64(EPT_POINTER, 2243 construct_eptp(&vmx->vcpu, 0, PT64_ROOT_4LEVEL)); 2244 2245 /* All VMFUNCs are currently emulated through L0 vmexits. */ 2246 if (cpu_has_vmx_vmfunc()) 2247 vmcs_write64(VM_FUNCTION_CONTROL, 0); 2248 2249 if (cpu_has_vmx_posted_intr()) 2250 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR); 2251 2252 if (cpu_has_vmx_msr_bitmap()) 2253 vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap)); 2254 2255 /* 2256 * PML is emulated for L2, but never enabled in hardware as the MMU 2257 * handles A/D emulation. Disabling PML for L2 also avoids having to 2258 * deal with filtering out L2 GPAs from the buffer. 2259 */ 2260 if (enable_pml) { 2261 vmcs_write64(PML_ADDRESS, 0); 2262 vmcs_write16(GUEST_PML_INDEX, -1); 2263 } 2264 2265 if (cpu_has_vmx_encls_vmexit()) 2266 vmcs_write64(ENCLS_EXITING_BITMAP, INVALID_GPA); 2267 2268 if (kvm_notify_vmexit_enabled(kvm)) 2269 vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window); 2270 2271 /* 2272 * Set the MSR load/store lists to match L0's settings. Only the 2273 * addresses are constant (for vmcs02), the counts can change based 2274 * on L2's behavior, e.g. switching to/from long mode. 2275 */ 2276 vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.guest.val)); 2277 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); 2278 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); 2279 2280 vmx_set_constant_host_state(vmx); 2281 } 2282 2283 static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx, 2284 struct vmcs12 *vmcs12) 2285 { 2286 prepare_vmcs02_constant_state(vmx); 2287 2288 vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); 2289 2290 if (enable_vpid) { 2291 if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) 2292 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02); 2293 else 2294 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); 2295 } 2296 } 2297 2298 static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs01, 2299 struct vmcs12 *vmcs12) 2300 { 2301 u32 exec_control; 2302 u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12); 2303 2304 if (vmx->nested.dirty_vmcs12 || nested_vmx_is_evmptr12_valid(vmx)) 2305 prepare_vmcs02_early_rare(vmx, vmcs12); 2306 2307 /* 2308 * PIN CONTROLS 2309 */ 2310 exec_control = __pin_controls_get(vmcs01); 2311 exec_control |= (vmcs12->pin_based_vm_exec_control & 2312 ~PIN_BASED_VMX_PREEMPTION_TIMER); 2313 2314 /* Posted interrupts setting is only taken from vmcs12. */ 2315 vmx->nested.pi_pending = false; 2316 if (nested_cpu_has_posted_intr(vmcs12)) 2317 vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; 2318 else 2319 exec_control &= ~PIN_BASED_POSTED_INTR; 2320 pin_controls_set(vmx, exec_control); 2321 2322 /* 2323 * EXEC CONTROLS 2324 */ 2325 exec_control = __exec_controls_get(vmcs01); /* L0's desires */ 2326 exec_control &= ~CPU_BASED_INTR_WINDOW_EXITING; 2327 exec_control &= ~CPU_BASED_NMI_WINDOW_EXITING; 2328 exec_control &= ~CPU_BASED_TPR_SHADOW; 2329 exec_control |= vmcs12->cpu_based_vm_exec_control; 2330 2331 vmx->nested.l1_tpr_threshold = -1; 2332 if (exec_control & CPU_BASED_TPR_SHADOW) 2333 vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); 2334 #ifdef CONFIG_X86_64 2335 else 2336 exec_control |= CPU_BASED_CR8_LOAD_EXITING | 2337 CPU_BASED_CR8_STORE_EXITING; 2338 #endif 2339 2340 /* 2341 * A vmexit (to either L1 hypervisor or L0 userspace) is always needed 2342 * for I/O port accesses. 2343 */ 2344 exec_control |= CPU_BASED_UNCOND_IO_EXITING; 2345 exec_control &= ~CPU_BASED_USE_IO_BITMAPS; 2346 2347 /* 2348 * This bit will be computed in nested_get_vmcs12_pages, because 2349 * we do not have access to L1's MSR bitmap yet. For now, keep 2350 * the same bit as before, hoping to avoid multiple VMWRITEs that 2351 * only set/clear this bit. 2352 */ 2353 exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; 2354 exec_control |= exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS; 2355 2356 exec_controls_set(vmx, exec_control); 2357 2358 /* 2359 * SECONDARY EXEC CONTROLS 2360 */ 2361 if (cpu_has_secondary_exec_ctrls()) { 2362 exec_control = __secondary_exec_controls_get(vmcs01); 2363 2364 /* Take the following fields only from vmcs12 */ 2365 exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 2366 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 2367 SECONDARY_EXEC_ENABLE_INVPCID | 2368 SECONDARY_EXEC_ENABLE_RDTSCP | 2369 SECONDARY_EXEC_ENABLE_XSAVES | 2370 SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | 2371 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 2372 SECONDARY_EXEC_APIC_REGISTER_VIRT | 2373 SECONDARY_EXEC_ENABLE_VMFUNC | 2374 SECONDARY_EXEC_DESC); 2375 2376 if (nested_cpu_has(vmcs12, 2377 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) 2378 exec_control |= vmcs12->secondary_vm_exec_control; 2379 2380 /* PML is emulated and never enabled in hardware for L2. */ 2381 exec_control &= ~SECONDARY_EXEC_ENABLE_PML; 2382 2383 /* VMCS shadowing for L2 is emulated for now */ 2384 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; 2385 2386 /* 2387 * Preset *DT exiting when emulating UMIP, so that vmx_set_cr4() 2388 * will not have to rewrite the controls just for this bit. 2389 */ 2390 if (vmx_umip_emulated() && (vmcs12->guest_cr4 & X86_CR4_UMIP)) 2391 exec_control |= SECONDARY_EXEC_DESC; 2392 2393 if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) 2394 vmcs_write16(GUEST_INTR_STATUS, 2395 vmcs12->guest_intr_status); 2396 2397 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) 2398 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; 2399 2400 if (exec_control & SECONDARY_EXEC_ENCLS_EXITING) 2401 vmx_write_encls_bitmap(&vmx->vcpu, vmcs12); 2402 2403 secondary_exec_controls_set(vmx, exec_control); 2404 } 2405 2406 /* 2407 * ENTRY CONTROLS 2408 * 2409 * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE 2410 * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate 2411 * on the related bits (if supported by the CPU) in the hope that 2412 * we can avoid VMWrites during vmx_set_efer(). 2413 * 2414 * Similarly, take vmcs01's PERF_GLOBAL_CTRL in the hope that if KVM is 2415 * loading PERF_GLOBAL_CTRL via the VMCS for L1, then KVM will want to 2416 * do the same for L2. 2417 */ 2418 exec_control = __vm_entry_controls_get(vmcs01); 2419 exec_control |= (vmcs12->vm_entry_controls & 2420 ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL); 2421 exec_control &= ~(VM_ENTRY_IA32E_MODE | VM_ENTRY_LOAD_IA32_EFER); 2422 if (cpu_has_load_ia32_efer()) { 2423 if (guest_efer & EFER_LMA) 2424 exec_control |= VM_ENTRY_IA32E_MODE; 2425 if (guest_efer != host_efer) 2426 exec_control |= VM_ENTRY_LOAD_IA32_EFER; 2427 } 2428 vm_entry_controls_set(vmx, exec_control); 2429 2430 /* 2431 * EXIT CONTROLS 2432 * 2433 * L2->L1 exit controls are emulated - the hardware exit is to L0 so 2434 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER 2435 * bits may be modified by vmx_set_efer() in prepare_vmcs02(). 2436 */ 2437 exec_control = __vm_exit_controls_get(vmcs01); 2438 if (cpu_has_load_ia32_efer() && guest_efer != host_efer) 2439 exec_control |= VM_EXIT_LOAD_IA32_EFER; 2440 else 2441 exec_control &= ~VM_EXIT_LOAD_IA32_EFER; 2442 vm_exit_controls_set(vmx, exec_control); 2443 2444 /* 2445 * Interrupt/Exception Fields 2446 */ 2447 if (vmx->nested.nested_run_pending) { 2448 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 2449 vmcs12->vm_entry_intr_info_field); 2450 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 2451 vmcs12->vm_entry_exception_error_code); 2452 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 2453 vmcs12->vm_entry_instruction_len); 2454 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 2455 vmcs12->guest_interruptibility_info); 2456 vmx->loaded_vmcs->nmi_known_unmasked = 2457 !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI); 2458 } else { 2459 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); 2460 } 2461 } 2462 2463 static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) 2464 { 2465 struct hv_enlightened_vmcs *hv_evmcs = nested_vmx_evmcs(vmx); 2466 2467 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & 2468 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) { 2469 vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); 2470 vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); 2471 vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector); 2472 vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector); 2473 vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector); 2474 vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector); 2475 vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector); 2476 vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector); 2477 vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit); 2478 vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit); 2479 vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit); 2480 vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit); 2481 vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit); 2482 vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit); 2483 vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit); 2484 vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit); 2485 vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit); 2486 vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit); 2487 vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes); 2488 vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes); 2489 vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes); 2490 vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes); 2491 vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes); 2492 vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes); 2493 vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes); 2494 vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes); 2495 vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base); 2496 vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base); 2497 vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base); 2498 vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base); 2499 vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base); 2500 vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base); 2501 vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base); 2502 vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base); 2503 vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); 2504 vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); 2505 2506 vmx->segment_cache.bitmask = 0; 2507 } 2508 2509 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & 2510 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) { 2511 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); 2512 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 2513 vmcs12->guest_pending_dbg_exceptions); 2514 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); 2515 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip); 2516 2517 /* 2518 * L1 may access the L2's PDPTR, so save them to construct 2519 * vmcs12 2520 */ 2521 if (enable_ept) { 2522 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); 2523 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); 2524 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); 2525 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); 2526 } 2527 2528 if (kvm_mpx_supported() && vmx->nested.nested_run_pending && 2529 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) 2530 vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); 2531 } 2532 2533 if (nested_cpu_has_xsaves(vmcs12)) 2534 vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap); 2535 2536 /* 2537 * Whether page-faults are trapped is determined by a combination of 2538 * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. If L0 2539 * doesn't care about page faults then we should set all of these to 2540 * L1's desires. However, if L0 does care about (some) page faults, it 2541 * is not easy (if at all possible?) to merge L0 and L1's desires, we 2542 * simply ask to exit on each and every L2 page fault. This is done by 2543 * setting MASK=MATCH=0 and (see below) EB.PF=1. 2544 * Note that below we don't need special code to set EB.PF beyond the 2545 * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept, 2546 * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when 2547 * !enable_ept, EB.PF is 1, so the "or" will always be 1. 2548 */ 2549 if (vmx_need_pf_intercept(&vmx->vcpu)) { 2550 /* 2551 * TODO: if both L0 and L1 need the same MASK and MATCH, 2552 * go ahead and use it? 2553 */ 2554 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); 2555 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); 2556 } else { 2557 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, vmcs12->page_fault_error_code_mask); 2558 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, vmcs12->page_fault_error_code_match); 2559 } 2560 2561 if (cpu_has_vmx_apicv()) { 2562 vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0); 2563 vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1); 2564 vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2); 2565 vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3); 2566 } 2567 2568 /* 2569 * Make sure the msr_autostore list is up to date before we set the 2570 * count in the vmcs02. 2571 */ 2572 prepare_vmx_msr_autostore_list(&vmx->vcpu, MSR_IA32_TSC); 2573 2574 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.guest.nr); 2575 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 2576 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 2577 2578 set_cr4_guest_host_mask(vmx); 2579 } 2580 2581 /* 2582 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested 2583 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it 2584 * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2 2585 * guest in a way that will both be appropriate to L1's requests, and our 2586 * needs. In addition to modifying the active vmcs (which is vmcs02), this 2587 * function also has additional necessary side-effects, like setting various 2588 * vcpu->arch fields. 2589 * Returns 0 on success, 1 on failure. Invalid state exit qualification code 2590 * is assigned to entry_failure_code on failure. 2591 */ 2592 static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, 2593 bool from_vmentry, 2594 enum vm_entry_failure_code *entry_failure_code) 2595 { 2596 struct vcpu_vmx *vmx = to_vmx(vcpu); 2597 struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); 2598 bool load_guest_pdptrs_vmcs12 = false; 2599 2600 if (vmx->nested.dirty_vmcs12 || nested_vmx_is_evmptr12_valid(vmx)) { 2601 prepare_vmcs02_rare(vmx, vmcs12); 2602 vmx->nested.dirty_vmcs12 = false; 2603 2604 load_guest_pdptrs_vmcs12 = !nested_vmx_is_evmptr12_valid(vmx) || 2605 !(evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1); 2606 } 2607 2608 if (vmx->nested.nested_run_pending && 2609 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) { 2610 kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); 2611 vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl); 2612 } else { 2613 kvm_set_dr(vcpu, 7, vcpu->arch.dr7); 2614 vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.pre_vmenter_debugctl); 2615 } 2616 if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending || 2617 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) 2618 vmcs_write64(GUEST_BNDCFGS, vmx->nested.pre_vmenter_bndcfgs); 2619 vmx_set_rflags(vcpu, vmcs12->guest_rflags); 2620 2621 /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the 2622 * bitwise-or of what L1 wants to trap for L2, and what we want to 2623 * trap. Note that CR0.TS also needs updating - we do this later. 2624 */ 2625 vmx_update_exception_bitmap(vcpu); 2626 vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask; 2627 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); 2628 2629 if (vmx->nested.nested_run_pending && 2630 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) { 2631 vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat); 2632 vcpu->arch.pat = vmcs12->guest_ia32_pat; 2633 } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { 2634 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); 2635 } 2636 2637 vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset( 2638 vcpu->arch.l1_tsc_offset, 2639 vmx_get_l2_tsc_offset(vcpu), 2640 vmx_get_l2_tsc_multiplier(vcpu)); 2641 2642 vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier( 2643 vcpu->arch.l1_tsc_scaling_ratio, 2644 vmx_get_l2_tsc_multiplier(vcpu)); 2645 2646 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); 2647 if (kvm_caps.has_tsc_control) 2648 vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); 2649 2650 nested_vmx_transition_tlb_flush(vcpu, vmcs12, true); 2651 2652 if (nested_cpu_has_ept(vmcs12)) 2653 nested_ept_init_mmu_context(vcpu); 2654 2655 /* 2656 * Override the CR0/CR4 read shadows after setting the effective guest 2657 * CR0/CR4. The common helpers also set the shadows, but they don't 2658 * account for vmcs12's cr0/4_guest_host_mask. 2659 */ 2660 vmx_set_cr0(vcpu, vmcs12->guest_cr0); 2661 vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12)); 2662 2663 vmx_set_cr4(vcpu, vmcs12->guest_cr4); 2664 vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12)); 2665 2666 vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12); 2667 /* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */ 2668 vmx_set_efer(vcpu, vcpu->arch.efer); 2669 2670 /* 2671 * Guest state is invalid and unrestricted guest is disabled, 2672 * which means L1 attempted VMEntry to L2 with invalid state. 2673 * Fail the VMEntry. 2674 * 2675 * However when force loading the guest state (SMM exit or 2676 * loading nested state after migration, it is possible to 2677 * have invalid guest state now, which will be later fixed by 2678 * restoring L2 register state 2679 */ 2680 if (CC(from_vmentry && !vmx_guest_state_valid(vcpu))) { 2681 *entry_failure_code = ENTRY_FAIL_DEFAULT; 2682 return -EINVAL; 2683 } 2684 2685 /* Shadow page tables on either EPT or shadow page tables. */ 2686 if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12), 2687 from_vmentry, entry_failure_code)) 2688 return -EINVAL; 2689 2690 /* 2691 * Immediately write vmcs02.GUEST_CR3. It will be propagated to vmcs12 2692 * on nested VM-Exit, which can occur without actually running L2 and 2693 * thus without hitting vmx_load_mmu_pgd(), e.g. if L1 is entering L2 with 2694 * vmcs12.GUEST_ACTIVITYSTATE=HLT, in which case KVM will intercept the 2695 * transition to HLT instead of running L2. 2696 */ 2697 if (enable_ept) 2698 vmcs_writel(GUEST_CR3, vmcs12->guest_cr3); 2699 2700 /* Late preparation of GUEST_PDPTRs now that EFER and CRs are set. */ 2701 if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) && 2702 is_pae_paging(vcpu)) { 2703 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); 2704 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); 2705 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); 2706 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); 2707 } 2708 2709 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && 2710 kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)) && 2711 WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, 2712 vmcs12->guest_ia32_perf_global_ctrl))) { 2713 *entry_failure_code = ENTRY_FAIL_DEFAULT; 2714 return -EINVAL; 2715 } 2716 2717 kvm_rsp_write(vcpu, vmcs12->guest_rsp); 2718 kvm_rip_write(vcpu, vmcs12->guest_rip); 2719 2720 /* 2721 * It was observed that genuine Hyper-V running in L1 doesn't reset 2722 * 'hv_clean_fields' by itself, it only sets the corresponding dirty 2723 * bits when it changes a field in eVMCS. Mark all fields as clean 2724 * here. 2725 */ 2726 if (nested_vmx_is_evmptr12_valid(vmx)) 2727 evmcs->hv_clean_fields |= HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; 2728 2729 return 0; 2730 } 2731 2732 static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12) 2733 { 2734 if (CC(!nested_cpu_has_nmi_exiting(vmcs12) && 2735 nested_cpu_has_virtual_nmis(vmcs12))) 2736 return -EINVAL; 2737 2738 if (CC(!nested_cpu_has_virtual_nmis(vmcs12) && 2739 nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING))) 2740 return -EINVAL; 2741 2742 return 0; 2743 } 2744 2745 static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp) 2746 { 2747 struct vcpu_vmx *vmx = to_vmx(vcpu); 2748 2749 /* Check for memory type validity */ 2750 switch (new_eptp & VMX_EPTP_MT_MASK) { 2751 case VMX_EPTP_MT_UC: 2752 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT))) 2753 return false; 2754 break; 2755 case VMX_EPTP_MT_WB: 2756 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT))) 2757 return false; 2758 break; 2759 default: 2760 return false; 2761 } 2762 2763 /* Page-walk levels validity. */ 2764 switch (new_eptp & VMX_EPTP_PWL_MASK) { 2765 case VMX_EPTP_PWL_5: 2766 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_5_BIT))) 2767 return false; 2768 break; 2769 case VMX_EPTP_PWL_4: 2770 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_4_BIT))) 2771 return false; 2772 break; 2773 default: 2774 return false; 2775 } 2776 2777 /* Reserved bits should not be set */ 2778 if (CC(!kvm_vcpu_is_legal_gpa(vcpu, new_eptp) || ((new_eptp >> 7) & 0x1f))) 2779 return false; 2780 2781 /* AD, if set, should be supported */ 2782 if (new_eptp & VMX_EPTP_AD_ENABLE_BIT) { 2783 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT))) 2784 return false; 2785 } 2786 2787 return true; 2788 } 2789 2790 /* 2791 * Checks related to VM-Execution Control Fields 2792 */ 2793 static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu, 2794 struct vmcs12 *vmcs12) 2795 { 2796 struct vcpu_vmx *vmx = to_vmx(vcpu); 2797 2798 if (CC(!vmx_control_verify(vmcs12->pin_based_vm_exec_control, 2799 vmx->nested.msrs.pinbased_ctls_low, 2800 vmx->nested.msrs.pinbased_ctls_high)) || 2801 CC(!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, 2802 vmx->nested.msrs.procbased_ctls_low, 2803 vmx->nested.msrs.procbased_ctls_high))) 2804 return -EINVAL; 2805 2806 if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && 2807 CC(!vmx_control_verify(vmcs12->secondary_vm_exec_control, 2808 vmx->nested.msrs.secondary_ctls_low, 2809 vmx->nested.msrs.secondary_ctls_high))) 2810 return -EINVAL; 2811 2812 if (CC(vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) || 2813 nested_vmx_check_io_bitmap_controls(vcpu, vmcs12) || 2814 nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12) || 2815 nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12) || 2816 nested_vmx_check_apic_access_controls(vcpu, vmcs12) || 2817 nested_vmx_check_apicv_controls(vcpu, vmcs12) || 2818 nested_vmx_check_nmi_controls(vmcs12) || 2819 nested_vmx_check_pml_controls(vcpu, vmcs12) || 2820 nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12) || 2821 nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12) || 2822 nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12) || 2823 CC(nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id)) 2824 return -EINVAL; 2825 2826 if (!nested_cpu_has_preemption_timer(vmcs12) && 2827 nested_cpu_has_save_preemption_timer(vmcs12)) 2828 return -EINVAL; 2829 2830 if (nested_cpu_has_ept(vmcs12) && 2831 CC(!nested_vmx_check_eptp(vcpu, vmcs12->ept_pointer))) 2832 return -EINVAL; 2833 2834 if (nested_cpu_has_vmfunc(vmcs12)) { 2835 if (CC(vmcs12->vm_function_control & 2836 ~vmx->nested.msrs.vmfunc_controls)) 2837 return -EINVAL; 2838 2839 if (nested_cpu_has_eptp_switching(vmcs12)) { 2840 if (CC(!nested_cpu_has_ept(vmcs12)) || 2841 CC(!page_address_valid(vcpu, vmcs12->eptp_list_address))) 2842 return -EINVAL; 2843 } 2844 } 2845 2846 return 0; 2847 } 2848 2849 /* 2850 * Checks related to VM-Exit Control Fields 2851 */ 2852 static int nested_check_vm_exit_controls(struct kvm_vcpu *vcpu, 2853 struct vmcs12 *vmcs12) 2854 { 2855 struct vcpu_vmx *vmx = to_vmx(vcpu); 2856 2857 if (CC(!vmx_control_verify(vmcs12->vm_exit_controls, 2858 vmx->nested.msrs.exit_ctls_low, 2859 vmx->nested.msrs.exit_ctls_high)) || 2860 CC(nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12))) 2861 return -EINVAL; 2862 2863 return 0; 2864 } 2865 2866 /* 2867 * Checks related to VM-Entry Control Fields 2868 */ 2869 static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu, 2870 struct vmcs12 *vmcs12) 2871 { 2872 struct vcpu_vmx *vmx = to_vmx(vcpu); 2873 2874 if (CC(!vmx_control_verify(vmcs12->vm_entry_controls, 2875 vmx->nested.msrs.entry_ctls_low, 2876 vmx->nested.msrs.entry_ctls_high))) 2877 return -EINVAL; 2878 2879 /* 2880 * From the Intel SDM, volume 3: 2881 * Fields relevant to VM-entry event injection must be set properly. 2882 * These fields are the VM-entry interruption-information field, the 2883 * VM-entry exception error code, and the VM-entry instruction length. 2884 */ 2885 if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) { 2886 u32 intr_info = vmcs12->vm_entry_intr_info_field; 2887 u8 vector = intr_info & INTR_INFO_VECTOR_MASK; 2888 u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK; 2889 bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK; 2890 bool should_have_error_code; 2891 bool urg = nested_cpu_has2(vmcs12, 2892 SECONDARY_EXEC_UNRESTRICTED_GUEST); 2893 bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE; 2894 2895 /* VM-entry interruption-info field: interruption type */ 2896 if (CC(intr_type == INTR_TYPE_RESERVED) || 2897 CC(intr_type == INTR_TYPE_OTHER_EVENT && 2898 !nested_cpu_supports_monitor_trap_flag(vcpu))) 2899 return -EINVAL; 2900 2901 /* VM-entry interruption-info field: vector */ 2902 if (CC(intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) || 2903 CC(intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) || 2904 CC(intr_type == INTR_TYPE_OTHER_EVENT && vector != 0)) 2905 return -EINVAL; 2906 2907 /* VM-entry interruption-info field: deliver error code */ 2908 should_have_error_code = 2909 intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode && 2910 x86_exception_has_error_code(vector); 2911 if (CC(has_error_code != should_have_error_code)) 2912 return -EINVAL; 2913 2914 /* VM-entry exception error code */ 2915 if (CC(has_error_code && 2916 vmcs12->vm_entry_exception_error_code & GENMASK(31, 16))) 2917 return -EINVAL; 2918 2919 /* VM-entry interruption-info field: reserved bits */ 2920 if (CC(intr_info & INTR_INFO_RESVD_BITS_MASK)) 2921 return -EINVAL; 2922 2923 /* VM-entry instruction length */ 2924 switch (intr_type) { 2925 case INTR_TYPE_SOFT_EXCEPTION: 2926 case INTR_TYPE_SOFT_INTR: 2927 case INTR_TYPE_PRIV_SW_EXCEPTION: 2928 if (CC(vmcs12->vm_entry_instruction_len > 15) || 2929 CC(vmcs12->vm_entry_instruction_len == 0 && 2930 CC(!nested_cpu_has_zero_length_injection(vcpu)))) 2931 return -EINVAL; 2932 } 2933 } 2934 2935 if (nested_vmx_check_entry_msr_switch_controls(vcpu, vmcs12)) 2936 return -EINVAL; 2937 2938 return 0; 2939 } 2940 2941 static int nested_vmx_check_controls(struct kvm_vcpu *vcpu, 2942 struct vmcs12 *vmcs12) 2943 { 2944 if (nested_check_vm_execution_controls(vcpu, vmcs12) || 2945 nested_check_vm_exit_controls(vcpu, vmcs12) || 2946 nested_check_vm_entry_controls(vcpu, vmcs12)) 2947 return -EINVAL; 2948 2949 #ifdef CONFIG_KVM_HYPERV 2950 if (guest_cpuid_has_evmcs(vcpu)) 2951 return nested_evmcs_check_controls(vmcs12); 2952 #endif 2953 2954 return 0; 2955 } 2956 2957 static int nested_vmx_check_address_space_size(struct kvm_vcpu *vcpu, 2958 struct vmcs12 *vmcs12) 2959 { 2960 #ifdef CONFIG_X86_64 2961 if (CC(!!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) != 2962 !!(vcpu->arch.efer & EFER_LMA))) 2963 return -EINVAL; 2964 #endif 2965 return 0; 2966 } 2967 2968 static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, 2969 struct vmcs12 *vmcs12) 2970 { 2971 bool ia32e = !!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE); 2972 2973 if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) || 2974 CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) || 2975 CC(!kvm_vcpu_is_legal_cr3(vcpu, vmcs12->host_cr3))) 2976 return -EINVAL; 2977 2978 if (CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu)) || 2979 CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu))) 2980 return -EINVAL; 2981 2982 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) && 2983 CC(!kvm_pat_valid(vmcs12->host_ia32_pat))) 2984 return -EINVAL; 2985 2986 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) && 2987 CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu), 2988 vmcs12->host_ia32_perf_global_ctrl))) 2989 return -EINVAL; 2990 2991 if (ia32e) { 2992 if (CC(!(vmcs12->host_cr4 & X86_CR4_PAE))) 2993 return -EINVAL; 2994 } else { 2995 if (CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) || 2996 CC(vmcs12->host_cr4 & X86_CR4_PCIDE) || 2997 CC((vmcs12->host_rip) >> 32)) 2998 return -EINVAL; 2999 } 3000 3001 if (CC(vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3002 CC(vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3003 CC(vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3004 CC(vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3005 CC(vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3006 CC(vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3007 CC(vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3008 CC(vmcs12->host_cs_selector == 0) || 3009 CC(vmcs12->host_tr_selector == 0) || 3010 CC(vmcs12->host_ss_selector == 0 && !ia32e)) 3011 return -EINVAL; 3012 3013 if (CC(is_noncanonical_address(vmcs12->host_fs_base, vcpu)) || 3014 CC(is_noncanonical_address(vmcs12->host_gs_base, vcpu)) || 3015 CC(is_noncanonical_address(vmcs12->host_gdtr_base, vcpu)) || 3016 CC(is_noncanonical_address(vmcs12->host_idtr_base, vcpu)) || 3017 CC(is_noncanonical_address(vmcs12->host_tr_base, vcpu)) || 3018 CC(is_noncanonical_address(vmcs12->host_rip, vcpu))) 3019 return -EINVAL; 3020 3021 /* 3022 * If the load IA32_EFER VM-exit control is 1, bits reserved in the 3023 * IA32_EFER MSR must be 0 in the field for that register. In addition, 3024 * the values of the LMA and LME bits in the field must each be that of 3025 * the host address-space size VM-exit control. 3026 */ 3027 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) { 3028 if (CC(!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer)) || 3029 CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA)) || 3030 CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))) 3031 return -EINVAL; 3032 } 3033 3034 return 0; 3035 } 3036 3037 static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, 3038 struct vmcs12 *vmcs12) 3039 { 3040 struct vcpu_vmx *vmx = to_vmx(vcpu); 3041 struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache; 3042 struct vmcs_hdr hdr; 3043 3044 if (vmcs12->vmcs_link_pointer == INVALID_GPA) 3045 return 0; 3046 3047 if (CC(!page_address_valid(vcpu, vmcs12->vmcs_link_pointer))) 3048 return -EINVAL; 3049 3050 if (ghc->gpa != vmcs12->vmcs_link_pointer && 3051 CC(kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, 3052 vmcs12->vmcs_link_pointer, VMCS12_SIZE))) 3053 return -EINVAL; 3054 3055 if (CC(kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr, 3056 offsetof(struct vmcs12, hdr), 3057 sizeof(hdr)))) 3058 return -EINVAL; 3059 3060 if (CC(hdr.revision_id != VMCS12_REVISION) || 3061 CC(hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12))) 3062 return -EINVAL; 3063 3064 return 0; 3065 } 3066 3067 /* 3068 * Checks related to Guest Non-register State 3069 */ 3070 static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12) 3071 { 3072 if (CC(vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && 3073 vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT && 3074 vmcs12->guest_activity_state != GUEST_ACTIVITY_WAIT_SIPI)) 3075 return -EINVAL; 3076 3077 return 0; 3078 } 3079 3080 static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, 3081 struct vmcs12 *vmcs12, 3082 enum vm_entry_failure_code *entry_failure_code) 3083 { 3084 bool ia32e = !!(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE); 3085 3086 *entry_failure_code = ENTRY_FAIL_DEFAULT; 3087 3088 if (CC(!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0)) || 3089 CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))) 3090 return -EINVAL; 3091 3092 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) && 3093 CC(!kvm_dr7_valid(vmcs12->guest_dr7))) 3094 return -EINVAL; 3095 3096 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) && 3097 CC(!kvm_pat_valid(vmcs12->guest_ia32_pat))) 3098 return -EINVAL; 3099 3100 if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) { 3101 *entry_failure_code = ENTRY_FAIL_VMCS_LINK_PTR; 3102 return -EINVAL; 3103 } 3104 3105 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && 3106 CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu), 3107 vmcs12->guest_ia32_perf_global_ctrl))) 3108 return -EINVAL; 3109 3110 if (CC((vmcs12->guest_cr0 & (X86_CR0_PG | X86_CR0_PE)) == X86_CR0_PG)) 3111 return -EINVAL; 3112 3113 if (CC(ia32e && !(vmcs12->guest_cr4 & X86_CR4_PAE)) || 3114 CC(ia32e && !(vmcs12->guest_cr0 & X86_CR0_PG))) 3115 return -EINVAL; 3116 3117 /* 3118 * If the load IA32_EFER VM-entry control is 1, the following checks 3119 * are performed on the field for the IA32_EFER MSR: 3120 * - Bits reserved in the IA32_EFER MSR must be 0. 3121 * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of 3122 * the IA-32e mode guest VM-exit control. It must also be identical 3123 * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to 3124 * CR0.PG) is 1. 3125 */ 3126 if (to_vmx(vcpu)->nested.nested_run_pending && 3127 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) { 3128 if (CC(!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer)) || 3129 CC(ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA)) || 3130 CC(((vmcs12->guest_cr0 & X86_CR0_PG) && 3131 ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME)))) 3132 return -EINVAL; 3133 } 3134 3135 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) && 3136 (CC(is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) || 3137 CC((vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD)))) 3138 return -EINVAL; 3139 3140 if (nested_check_guest_non_reg_state(vmcs12)) 3141 return -EINVAL; 3142 3143 return 0; 3144 } 3145 3146 static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) 3147 { 3148 struct vcpu_vmx *vmx = to_vmx(vcpu); 3149 unsigned long cr3, cr4; 3150 bool vm_fail; 3151 3152 if (!nested_early_check) 3153 return 0; 3154 3155 if (vmx->msr_autoload.host.nr) 3156 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); 3157 if (vmx->msr_autoload.guest.nr) 3158 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); 3159 3160 preempt_disable(); 3161 3162 vmx_prepare_switch_to_guest(vcpu); 3163 3164 /* 3165 * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS, 3166 * which is reserved to '1' by hardware. GUEST_RFLAGS is guaranteed to 3167 * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e. 3168 * there is no need to preserve other bits or save/restore the field. 3169 */ 3170 vmcs_writel(GUEST_RFLAGS, 0); 3171 3172 cr3 = __get_current_cr3_fast(); 3173 if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { 3174 vmcs_writel(HOST_CR3, cr3); 3175 vmx->loaded_vmcs->host_state.cr3 = cr3; 3176 } 3177 3178 cr4 = cr4_read_shadow(); 3179 if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { 3180 vmcs_writel(HOST_CR4, cr4); 3181 vmx->loaded_vmcs->host_state.cr4 = cr4; 3182 } 3183 3184 vm_fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs, 3185 __vmx_vcpu_run_flags(vmx)); 3186 3187 if (vmx->msr_autoload.host.nr) 3188 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 3189 if (vmx->msr_autoload.guest.nr) 3190 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 3191 3192 if (vm_fail) { 3193 u32 error = vmcs_read32(VM_INSTRUCTION_ERROR); 3194 3195 preempt_enable(); 3196 3197 trace_kvm_nested_vmenter_failed( 3198 "early hardware check VM-instruction error: ", error); 3199 WARN_ON_ONCE(error != VMXERR_ENTRY_INVALID_CONTROL_FIELD); 3200 return 1; 3201 } 3202 3203 /* 3204 * VMExit clears RFLAGS.IF and DR7, even on a consistency check. 3205 */ 3206 if (hw_breakpoint_active()) 3207 set_debugreg(__this_cpu_read(cpu_dr7), 7); 3208 local_irq_enable(); 3209 preempt_enable(); 3210 3211 /* 3212 * A non-failing VMEntry means we somehow entered guest mode with 3213 * an illegal RIP, and that's just the tip of the iceberg. There 3214 * is no telling what memory has been modified or what state has 3215 * been exposed to unknown code. Hitting this all but guarantees 3216 * a (very critical) hardware issue. 3217 */ 3218 WARN_ON(!(vmcs_read32(VM_EXIT_REASON) & 3219 VMX_EXIT_REASONS_FAILED_VMENTRY)); 3220 3221 return 0; 3222 } 3223 3224 #ifdef CONFIG_KVM_HYPERV 3225 static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu) 3226 { 3227 struct vcpu_vmx *vmx = to_vmx(vcpu); 3228 3229 /* 3230 * hv_evmcs may end up being not mapped after migration (when 3231 * L2 was running), map it here to make sure vmcs12 changes are 3232 * properly reflected. 3233 */ 3234 if (guest_cpuid_has_evmcs(vcpu) && 3235 vmx->nested.hv_evmcs_vmptr == EVMPTR_MAP_PENDING) { 3236 enum nested_evmptrld_status evmptrld_status = 3237 nested_vmx_handle_enlightened_vmptrld(vcpu, false); 3238 3239 if (evmptrld_status == EVMPTRLD_VMFAIL || 3240 evmptrld_status == EVMPTRLD_ERROR) 3241 return false; 3242 3243 /* 3244 * Post migration VMCS12 always provides the most actual 3245 * information, copy it to eVMCS upon entry. 3246 */ 3247 vmx->nested.need_vmcs12_to_shadow_sync = true; 3248 } 3249 3250 return true; 3251 } 3252 #endif 3253 3254 static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) 3255 { 3256 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3257 struct vcpu_vmx *vmx = to_vmx(vcpu); 3258 struct kvm_host_map *map; 3259 3260 if (!vcpu->arch.pdptrs_from_userspace && 3261 !nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { 3262 /* 3263 * Reload the guest's PDPTRs since after a migration 3264 * the guest CR3 might be restored prior to setting the nested 3265 * state which can lead to a load of wrong PDPTRs. 3266 */ 3267 if (CC(!load_pdptrs(vcpu, vcpu->arch.cr3))) 3268 return false; 3269 } 3270 3271 3272 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { 3273 map = &vmx->nested.apic_access_page_map; 3274 3275 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->apic_access_addr), map)) { 3276 vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(map->pfn)); 3277 } else { 3278 pr_debug_ratelimited("%s: no backing for APIC-access address in vmcs12\n", 3279 __func__); 3280 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3281 vcpu->run->internal.suberror = 3282 KVM_INTERNAL_ERROR_EMULATION; 3283 vcpu->run->internal.ndata = 0; 3284 return false; 3285 } 3286 } 3287 3288 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { 3289 map = &vmx->nested.virtual_apic_map; 3290 3291 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) { 3292 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn)); 3293 } else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) && 3294 nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) && 3295 !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { 3296 /* 3297 * The processor will never use the TPR shadow, simply 3298 * clear the bit from the execution control. Such a 3299 * configuration is useless, but it happens in tests. 3300 * For any other configuration, failing the vm entry is 3301 * _not_ what the processor does but it's basically the 3302 * only possibility we have. 3303 */ 3304 exec_controls_clearbit(vmx, CPU_BASED_TPR_SHADOW); 3305 } else { 3306 /* 3307 * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to 3308 * force VM-Entry to fail. 3309 */ 3310 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, INVALID_GPA); 3311 } 3312 } 3313 3314 if (nested_cpu_has_posted_intr(vmcs12)) { 3315 map = &vmx->nested.pi_desc_map; 3316 3317 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) { 3318 vmx->nested.pi_desc = 3319 (struct pi_desc *)(((void *)map->hva) + 3320 offset_in_page(vmcs12->posted_intr_desc_addr)); 3321 vmcs_write64(POSTED_INTR_DESC_ADDR, 3322 pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr)); 3323 } else { 3324 /* 3325 * Defer the KVM_INTERNAL_EXIT until KVM tries to 3326 * access the contents of the VMCS12 posted interrupt 3327 * descriptor. (Note that KVM may do this when it 3328 * should not, per the architectural specification.) 3329 */ 3330 vmx->nested.pi_desc = NULL; 3331 pin_controls_clearbit(vmx, PIN_BASED_POSTED_INTR); 3332 } 3333 } 3334 if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12)) 3335 exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS); 3336 else 3337 exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS); 3338 3339 return true; 3340 } 3341 3342 static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu) 3343 { 3344 #ifdef CONFIG_KVM_HYPERV 3345 /* 3346 * Note: nested_get_evmcs_page() also updates 'vp_assist_page' copy 3347 * in 'struct kvm_vcpu_hv' in case eVMCS is in use, this is mandatory 3348 * to make nested_evmcs_l2_tlb_flush_enabled() work correctly post 3349 * migration. 3350 */ 3351 if (!nested_get_evmcs_page(vcpu)) { 3352 pr_debug_ratelimited("%s: enlightened vmptrld failed\n", 3353 __func__); 3354 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3355 vcpu->run->internal.suberror = 3356 KVM_INTERNAL_ERROR_EMULATION; 3357 vcpu->run->internal.ndata = 0; 3358 3359 return false; 3360 } 3361 #endif 3362 3363 if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu)) 3364 return false; 3365 3366 return true; 3367 } 3368 3369 static int nested_vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa) 3370 { 3371 struct vmcs12 *vmcs12; 3372 struct vcpu_vmx *vmx = to_vmx(vcpu); 3373 gpa_t dst; 3374 3375 if (WARN_ON_ONCE(!is_guest_mode(vcpu))) 3376 return 0; 3377 3378 if (WARN_ON_ONCE(vmx->nested.pml_full)) 3379 return 1; 3380 3381 /* 3382 * Check if PML is enabled for the nested guest. Whether eptp bit 6 is 3383 * set is already checked as part of A/D emulation. 3384 */ 3385 vmcs12 = get_vmcs12(vcpu); 3386 if (!nested_cpu_has_pml(vmcs12)) 3387 return 0; 3388 3389 if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) { 3390 vmx->nested.pml_full = true; 3391 return 1; 3392 } 3393 3394 gpa &= ~0xFFFull; 3395 dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index; 3396 3397 if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa, 3398 offset_in_page(dst), sizeof(gpa))) 3399 return 0; 3400 3401 vmcs12->guest_pml_index--; 3402 3403 return 0; 3404 } 3405 3406 /* 3407 * Intel's VMX Instruction Reference specifies a common set of prerequisites 3408 * for running VMX instructions (except VMXON, whose prerequisites are 3409 * slightly different). It also specifies what exception to inject otherwise. 3410 * Note that many of these exceptions have priority over VM exits, so they 3411 * don't have to be checked again here. 3412 */ 3413 static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) 3414 { 3415 if (!to_vmx(vcpu)->nested.vmxon) { 3416 kvm_queue_exception(vcpu, UD_VECTOR); 3417 return 0; 3418 } 3419 3420 if (vmx_get_cpl(vcpu)) { 3421 kvm_inject_gp(vcpu, 0); 3422 return 0; 3423 } 3424 3425 return 1; 3426 } 3427 3428 static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu) 3429 { 3430 u8 rvi = vmx_get_rvi(); 3431 u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI); 3432 3433 return ((rvi & 0xf0) > (vppr & 0xf0)); 3434 } 3435 3436 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, 3437 struct vmcs12 *vmcs12); 3438 3439 /* 3440 * If from_vmentry is false, this is being called from state restore (either RSM 3441 * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume. 3442 * 3443 * Returns: 3444 * NVMX_VMENTRY_SUCCESS: Entered VMX non-root mode 3445 * NVMX_VMENTRY_VMFAIL: Consistency check VMFail 3446 * NVMX_VMENTRY_VMEXIT: Consistency check VMExit 3447 * NVMX_VMENTRY_KVM_INTERNAL_ERROR: KVM internal error 3448 */ 3449 enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, 3450 bool from_vmentry) 3451 { 3452 struct vcpu_vmx *vmx = to_vmx(vcpu); 3453 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3454 enum vm_entry_failure_code entry_failure_code; 3455 bool evaluate_pending_interrupts; 3456 union vmx_exit_reason exit_reason = { 3457 .basic = EXIT_REASON_INVALID_STATE, 3458 .failed_vmentry = 1, 3459 }; 3460 u32 failed_index; 3461 3462 trace_kvm_nested_vmenter(kvm_rip_read(vcpu), 3463 vmx->nested.current_vmptr, 3464 vmcs12->guest_rip, 3465 vmcs12->guest_intr_status, 3466 vmcs12->vm_entry_intr_info_field, 3467 vmcs12->secondary_vm_exec_control & SECONDARY_EXEC_ENABLE_EPT, 3468 vmcs12->ept_pointer, 3469 vmcs12->guest_cr3, 3470 KVM_ISA_VMX); 3471 3472 kvm_service_local_tlb_flush_requests(vcpu); 3473 3474 evaluate_pending_interrupts = exec_controls_get(vmx) & 3475 (CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING); 3476 if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu)) 3477 evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu); 3478 if (!evaluate_pending_interrupts) 3479 evaluate_pending_interrupts |= kvm_apic_has_pending_init_or_sipi(vcpu); 3480 3481 if (!vmx->nested.nested_run_pending || 3482 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) 3483 vmx->nested.pre_vmenter_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); 3484 if (kvm_mpx_supported() && 3485 (!vmx->nested.nested_run_pending || 3486 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) 3487 vmx->nested.pre_vmenter_bndcfgs = vmcs_read64(GUEST_BNDCFGS); 3488 3489 /* 3490 * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and* 3491 * nested early checks are disabled. In the event of a "late" VM-Fail, 3492 * i.e. a VM-Fail detected by hardware but not KVM, KVM must unwind its 3493 * software model to the pre-VMEntry host state. When EPT is disabled, 3494 * GUEST_CR3 holds KVM's shadow CR3, not L1's "real" CR3, which causes 3495 * nested_vmx_restore_host_state() to corrupt vcpu->arch.cr3. Stuffing 3496 * vmcs01.GUEST_CR3 results in the unwind naturally setting arch.cr3 to 3497 * the correct value. Smashing vmcs01.GUEST_CR3 is safe because nested 3498 * VM-Exits, and the unwind, reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is 3499 * guaranteed to be overwritten with a shadow CR3 prior to re-entering 3500 * L1. Don't stuff vmcs01.GUEST_CR3 when using nested early checks as 3501 * KVM modifies vcpu->arch.cr3 if and only if the early hardware checks 3502 * pass, and early VM-Fails do not reset KVM's MMU, i.e. the VM-Fail 3503 * path would need to manually save/restore vmcs01.GUEST_CR3. 3504 */ 3505 if (!enable_ept && !nested_early_check) 3506 vmcs_writel(GUEST_CR3, vcpu->arch.cr3); 3507 3508 vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02); 3509 3510 prepare_vmcs02_early(vmx, &vmx->vmcs01, vmcs12); 3511 3512 if (from_vmentry) { 3513 if (unlikely(!nested_get_vmcs12_pages(vcpu))) { 3514 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 3515 return NVMX_VMENTRY_KVM_INTERNAL_ERROR; 3516 } 3517 3518 if (nested_vmx_check_vmentry_hw(vcpu)) { 3519 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 3520 return NVMX_VMENTRY_VMFAIL; 3521 } 3522 3523 if (nested_vmx_check_guest_state(vcpu, vmcs12, 3524 &entry_failure_code)) { 3525 exit_reason.basic = EXIT_REASON_INVALID_STATE; 3526 vmcs12->exit_qualification = entry_failure_code; 3527 goto vmentry_fail_vmexit; 3528 } 3529 } 3530 3531 enter_guest_mode(vcpu); 3532 3533 if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &entry_failure_code)) { 3534 exit_reason.basic = EXIT_REASON_INVALID_STATE; 3535 vmcs12->exit_qualification = entry_failure_code; 3536 goto vmentry_fail_vmexit_guest_mode; 3537 } 3538 3539 if (from_vmentry) { 3540 failed_index = nested_vmx_load_msr(vcpu, 3541 vmcs12->vm_entry_msr_load_addr, 3542 vmcs12->vm_entry_msr_load_count); 3543 if (failed_index) { 3544 exit_reason.basic = EXIT_REASON_MSR_LOAD_FAIL; 3545 vmcs12->exit_qualification = failed_index; 3546 goto vmentry_fail_vmexit_guest_mode; 3547 } 3548 } else { 3549 /* 3550 * The MMU is not initialized to point at the right entities yet and 3551 * "get pages" would need to read data from the guest (i.e. we will 3552 * need to perform gpa to hpa translation). Request a call 3553 * to nested_get_vmcs12_pages before the next VM-entry. The MSRs 3554 * have already been set at vmentry time and should not be reset. 3555 */ 3556 kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); 3557 } 3558 3559 /* 3560 * Re-evaluate pending events if L1 had a pending IRQ/NMI/INIT/SIPI 3561 * when it executed VMLAUNCH/VMRESUME, as entering non-root mode can 3562 * effectively unblock various events, e.g. INIT/SIPI cause VM-Exit 3563 * unconditionally. 3564 */ 3565 if (unlikely(evaluate_pending_interrupts)) 3566 kvm_make_request(KVM_REQ_EVENT, vcpu); 3567 3568 /* 3569 * Do not start the preemption timer hrtimer until after we know 3570 * we are successful, so that only nested_vmx_vmexit needs to cancel 3571 * the timer. 3572 */ 3573 vmx->nested.preemption_timer_expired = false; 3574 if (nested_cpu_has_preemption_timer(vmcs12)) { 3575 u64 timer_value = vmx_calc_preemption_timer_value(vcpu); 3576 vmx_start_preemption_timer(vcpu, timer_value); 3577 } 3578 3579 /* 3580 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point 3581 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet 3582 * returned as far as L1 is concerned. It will only return (and set 3583 * the success flag) when L2 exits (see nested_vmx_vmexit()). 3584 */ 3585 return NVMX_VMENTRY_SUCCESS; 3586 3587 /* 3588 * A failed consistency check that leads to a VMExit during L1's 3589 * VMEnter to L2 is a variation of a normal VMexit, as explained in 3590 * 26.7 "VM-entry failures during or after loading guest state". 3591 */ 3592 vmentry_fail_vmexit_guest_mode: 3593 if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING) 3594 vcpu->arch.tsc_offset -= vmcs12->tsc_offset; 3595 leave_guest_mode(vcpu); 3596 3597 vmentry_fail_vmexit: 3598 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 3599 3600 if (!from_vmentry) 3601 return NVMX_VMENTRY_VMEXIT; 3602 3603 load_vmcs12_host_state(vcpu, vmcs12); 3604 vmcs12->vm_exit_reason = exit_reason.full; 3605 if (enable_shadow_vmcs || nested_vmx_is_evmptr12_valid(vmx)) 3606 vmx->nested.need_vmcs12_to_shadow_sync = true; 3607 return NVMX_VMENTRY_VMEXIT; 3608 } 3609 3610 /* 3611 * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1 3612 * for running an L2 nested guest. 3613 */ 3614 static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) 3615 { 3616 struct vmcs12 *vmcs12; 3617 enum nvmx_vmentry_status status; 3618 struct vcpu_vmx *vmx = to_vmx(vcpu); 3619 u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu); 3620 enum nested_evmptrld_status evmptrld_status; 3621 3622 if (!nested_vmx_check_permission(vcpu)) 3623 return 1; 3624 3625 evmptrld_status = nested_vmx_handle_enlightened_vmptrld(vcpu, launch); 3626 if (evmptrld_status == EVMPTRLD_ERROR) { 3627 kvm_queue_exception(vcpu, UD_VECTOR); 3628 return 1; 3629 } 3630 3631 kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED); 3632 3633 if (CC(evmptrld_status == EVMPTRLD_VMFAIL)) 3634 return nested_vmx_failInvalid(vcpu); 3635 3636 if (CC(!nested_vmx_is_evmptr12_valid(vmx) && 3637 vmx->nested.current_vmptr == INVALID_GPA)) 3638 return nested_vmx_failInvalid(vcpu); 3639 3640 vmcs12 = get_vmcs12(vcpu); 3641 3642 /* 3643 * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact 3644 * that there *is* a valid VMCS pointer, RFLAGS.CF is set 3645 * rather than RFLAGS.ZF, and no error number is stored to the 3646 * VM-instruction error field. 3647 */ 3648 if (CC(vmcs12->hdr.shadow_vmcs)) 3649 return nested_vmx_failInvalid(vcpu); 3650 3651 if (nested_vmx_is_evmptr12_valid(vmx)) { 3652 struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); 3653 3654 copy_enlightened_to_vmcs12(vmx, evmcs->hv_clean_fields); 3655 /* Enlightened VMCS doesn't have launch state */ 3656 vmcs12->launch_state = !launch; 3657 } else if (enable_shadow_vmcs) { 3658 copy_shadow_to_vmcs12(vmx); 3659 } 3660 3661 /* 3662 * The nested entry process starts with enforcing various prerequisites 3663 * on vmcs12 as required by the Intel SDM, and act appropriately when 3664 * they fail: As the SDM explains, some conditions should cause the 3665 * instruction to fail, while others will cause the instruction to seem 3666 * to succeed, but return an EXIT_REASON_INVALID_STATE. 3667 * To speed up the normal (success) code path, we should avoid checking 3668 * for misconfigurations which will anyway be caught by the processor 3669 * when using the merged vmcs02. 3670 */ 3671 if (CC(interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS)) 3672 return nested_vmx_fail(vcpu, VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS); 3673 3674 if (CC(vmcs12->launch_state == launch)) 3675 return nested_vmx_fail(vcpu, 3676 launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS 3677 : VMXERR_VMRESUME_NONLAUNCHED_VMCS); 3678 3679 if (nested_vmx_check_controls(vcpu, vmcs12)) 3680 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 3681 3682 if (nested_vmx_check_address_space_size(vcpu, vmcs12)) 3683 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); 3684 3685 if (nested_vmx_check_host_state(vcpu, vmcs12)) 3686 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); 3687 3688 /* 3689 * We're finally done with prerequisite checking, and can start with 3690 * the nested entry. 3691 */ 3692 vmx->nested.nested_run_pending = 1; 3693 vmx->nested.has_preemption_timer_deadline = false; 3694 status = nested_vmx_enter_non_root_mode(vcpu, true); 3695 if (unlikely(status != NVMX_VMENTRY_SUCCESS)) 3696 goto vmentry_failed; 3697 3698 /* Emulate processing of posted interrupts on VM-Enter. */ 3699 if (nested_cpu_has_posted_intr(vmcs12) && 3700 kvm_apic_has_interrupt(vcpu) == vmx->nested.posted_intr_nv) { 3701 vmx->nested.pi_pending = true; 3702 kvm_make_request(KVM_REQ_EVENT, vcpu); 3703 kvm_apic_clear_irr(vcpu, vmx->nested.posted_intr_nv); 3704 } 3705 3706 /* Hide L1D cache contents from the nested guest. */ 3707 vmx->vcpu.arch.l1tf_flush_l1d = true; 3708 3709 /* 3710 * Must happen outside of nested_vmx_enter_non_root_mode() as it will 3711 * also be used as part of restoring nVMX state for 3712 * snapshot restore (migration). 3713 * 3714 * In this flow, it is assumed that vmcs12 cache was 3715 * transferred as part of captured nVMX state and should 3716 * therefore not be read from guest memory (which may not 3717 * exist on destination host yet). 3718 */ 3719 nested_cache_shadow_vmcs12(vcpu, vmcs12); 3720 3721 switch (vmcs12->guest_activity_state) { 3722 case GUEST_ACTIVITY_HLT: 3723 /* 3724 * If we're entering a halted L2 vcpu and the L2 vcpu won't be 3725 * awakened by event injection or by an NMI-window VM-exit or 3726 * by an interrupt-window VM-exit, halt the vcpu. 3727 */ 3728 if (!(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) && 3729 !nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING) && 3730 !(nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING) && 3731 (vmcs12->guest_rflags & X86_EFLAGS_IF))) { 3732 vmx->nested.nested_run_pending = 0; 3733 return kvm_emulate_halt_noskip(vcpu); 3734 } 3735 break; 3736 case GUEST_ACTIVITY_WAIT_SIPI: 3737 vmx->nested.nested_run_pending = 0; 3738 vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; 3739 break; 3740 default: 3741 break; 3742 } 3743 3744 return 1; 3745 3746 vmentry_failed: 3747 vmx->nested.nested_run_pending = 0; 3748 if (status == NVMX_VMENTRY_KVM_INTERNAL_ERROR) 3749 return 0; 3750 if (status == NVMX_VMENTRY_VMEXIT) 3751 return 1; 3752 WARN_ON_ONCE(status != NVMX_VMENTRY_VMFAIL); 3753 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 3754 } 3755 3756 /* 3757 * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date 3758 * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK). 3759 * This function returns the new value we should put in vmcs12.guest_cr0. 3760 * It's not enough to just return the vmcs02 GUEST_CR0. Rather, 3761 * 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now 3762 * available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0 3763 * didn't trap the bit, because if L1 did, so would L0). 3764 * 2. Bits that L1 asked to trap (and therefore L0 also did) could not have 3765 * been modified by L2, and L1 knows it. So just leave the old value of 3766 * the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0 3767 * isn't relevant, because if L0 traps this bit it can set it to anything. 3768 * 3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have 3769 * changed these bits, and therefore they need to be updated, but L0 3770 * didn't necessarily allow them to be changed in GUEST_CR0 - and rather 3771 * put them in vmcs02 CR0_READ_SHADOW. So take these bits from there. 3772 */ 3773 static inline unsigned long 3774 vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 3775 { 3776 return 3777 /*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) | 3778 /*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) | 3779 /*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask | 3780 vcpu->arch.cr0_guest_owned_bits)); 3781 } 3782 3783 static inline unsigned long 3784 vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 3785 { 3786 return 3787 /*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) | 3788 /*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) | 3789 /*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask | 3790 vcpu->arch.cr4_guest_owned_bits)); 3791 } 3792 3793 static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, 3794 struct vmcs12 *vmcs12, 3795 u32 vm_exit_reason, u32 exit_intr_info) 3796 { 3797 u32 idt_vectoring; 3798 unsigned int nr; 3799 3800 /* 3801 * Per the SDM, VM-Exits due to double and triple faults are never 3802 * considered to occur during event delivery, even if the double/triple 3803 * fault is the result of an escalating vectoring issue. 3804 * 3805 * Note, the SDM qualifies the double fault behavior with "The original 3806 * event results in a double-fault exception". It's unclear why the 3807 * qualification exists since exits due to double fault can occur only 3808 * while vectoring a different exception (injected events are never 3809 * subject to interception), i.e. there's _always_ an original event. 3810 * 3811 * The SDM also uses NMI as a confusing example for the "original event 3812 * causes the VM exit directly" clause. NMI isn't special in any way, 3813 * the same rule applies to all events that cause an exit directly. 3814 * NMI is an odd choice for the example because NMIs can only occur on 3815 * instruction boundaries, i.e. they _can't_ occur during vectoring. 3816 */ 3817 if ((u16)vm_exit_reason == EXIT_REASON_TRIPLE_FAULT || 3818 ((u16)vm_exit_reason == EXIT_REASON_EXCEPTION_NMI && 3819 is_double_fault(exit_intr_info))) { 3820 vmcs12->idt_vectoring_info_field = 0; 3821 } else if (vcpu->arch.exception.injected) { 3822 nr = vcpu->arch.exception.vector; 3823 idt_vectoring = nr | VECTORING_INFO_VALID_MASK; 3824 3825 if (kvm_exception_is_soft(nr)) { 3826 vmcs12->vm_exit_instruction_len = 3827 vcpu->arch.event_exit_inst_len; 3828 idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION; 3829 } else 3830 idt_vectoring |= INTR_TYPE_HARD_EXCEPTION; 3831 3832 if (vcpu->arch.exception.has_error_code) { 3833 idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK; 3834 vmcs12->idt_vectoring_error_code = 3835 vcpu->arch.exception.error_code; 3836 } 3837 3838 vmcs12->idt_vectoring_info_field = idt_vectoring; 3839 } else if (vcpu->arch.nmi_injected) { 3840 vmcs12->idt_vectoring_info_field = 3841 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR; 3842 } else if (vcpu->arch.interrupt.injected) { 3843 nr = vcpu->arch.interrupt.nr; 3844 idt_vectoring = nr | VECTORING_INFO_VALID_MASK; 3845 3846 if (vcpu->arch.interrupt.soft) { 3847 idt_vectoring |= INTR_TYPE_SOFT_INTR; 3848 vmcs12->vm_entry_instruction_len = 3849 vcpu->arch.event_exit_inst_len; 3850 } else 3851 idt_vectoring |= INTR_TYPE_EXT_INTR; 3852 3853 vmcs12->idt_vectoring_info_field = idt_vectoring; 3854 } else { 3855 vmcs12->idt_vectoring_info_field = 0; 3856 } 3857 } 3858 3859 3860 void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu) 3861 { 3862 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3863 gfn_t gfn; 3864 3865 /* 3866 * Don't need to mark the APIC access page dirty; it is never 3867 * written to by the CPU during APIC virtualization. 3868 */ 3869 3870 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { 3871 gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT; 3872 kvm_vcpu_mark_page_dirty(vcpu, gfn); 3873 } 3874 3875 if (nested_cpu_has_posted_intr(vmcs12)) { 3876 gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT; 3877 kvm_vcpu_mark_page_dirty(vcpu, gfn); 3878 } 3879 } 3880 3881 static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) 3882 { 3883 struct vcpu_vmx *vmx = to_vmx(vcpu); 3884 int max_irr; 3885 void *vapic_page; 3886 u16 status; 3887 3888 if (!vmx->nested.pi_pending) 3889 return 0; 3890 3891 if (!vmx->nested.pi_desc) 3892 goto mmio_needed; 3893 3894 vmx->nested.pi_pending = false; 3895 3896 if (!pi_test_and_clear_on(vmx->nested.pi_desc)) 3897 return 0; 3898 3899 max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256); 3900 if (max_irr != 256) { 3901 vapic_page = vmx->nested.virtual_apic_map.hva; 3902 if (!vapic_page) 3903 goto mmio_needed; 3904 3905 __kvm_apic_update_irr(vmx->nested.pi_desc->pir, 3906 vapic_page, &max_irr); 3907 status = vmcs_read16(GUEST_INTR_STATUS); 3908 if ((u8)max_irr > ((u8)status & 0xff)) { 3909 status &= ~0xff; 3910 status |= (u8)max_irr; 3911 vmcs_write16(GUEST_INTR_STATUS, status); 3912 } 3913 } 3914 3915 nested_mark_vmcs12_pages_dirty(vcpu); 3916 return 0; 3917 3918 mmio_needed: 3919 kvm_handle_memory_failure(vcpu, X86EMUL_IO_NEEDED, NULL); 3920 return -ENXIO; 3921 } 3922 3923 static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu) 3924 { 3925 struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit; 3926 u32 intr_info = ex->vector | INTR_INFO_VALID_MASK; 3927 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3928 unsigned long exit_qual; 3929 3930 if (ex->has_payload) { 3931 exit_qual = ex->payload; 3932 } else if (ex->vector == PF_VECTOR) { 3933 exit_qual = vcpu->arch.cr2; 3934 } else if (ex->vector == DB_VECTOR) { 3935 exit_qual = vcpu->arch.dr6; 3936 exit_qual &= ~DR6_BT; 3937 exit_qual ^= DR6_ACTIVE_LOW; 3938 } else { 3939 exit_qual = 0; 3940 } 3941 3942 /* 3943 * Unlike AMD's Paged Real Mode, which reports an error code on #PF 3944 * VM-Exits even if the CPU is in Real Mode, Intel VMX never sets the 3945 * "has error code" flags on VM-Exit if the CPU is in Real Mode. 3946 */ 3947 if (ex->has_error_code && is_protmode(vcpu)) { 3948 /* 3949 * Intel CPUs do not generate error codes with bits 31:16 set, 3950 * and more importantly VMX disallows setting bits 31:16 in the 3951 * injected error code for VM-Entry. Drop the bits to mimic 3952 * hardware and avoid inducing failure on nested VM-Entry if L1 3953 * chooses to inject the exception back to L2. AMD CPUs _do_ 3954 * generate "full" 32-bit error codes, so KVM allows userspace 3955 * to inject exception error codes with bits 31:16 set. 3956 */ 3957 vmcs12->vm_exit_intr_error_code = (u16)ex->error_code; 3958 intr_info |= INTR_INFO_DELIVER_CODE_MASK; 3959 } 3960 3961 if (kvm_exception_is_soft(ex->vector)) 3962 intr_info |= INTR_TYPE_SOFT_EXCEPTION; 3963 else 3964 intr_info |= INTR_TYPE_HARD_EXCEPTION; 3965 3966 if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) && 3967 vmx_get_nmi_mask(vcpu)) 3968 intr_info |= INTR_INFO_UNBLOCK_NMI; 3969 3970 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual); 3971 } 3972 3973 /* 3974 * Returns true if a debug trap is (likely) pending delivery. Infer the class 3975 * of a #DB (trap-like vs. fault-like) from the exception payload (to-be-DR6). 3976 * Using the payload is flawed because code breakpoints (fault-like) and data 3977 * breakpoints (trap-like) set the same bits in DR6 (breakpoint detected), i.e. 3978 * this will return false positives if a to-be-injected code breakpoint #DB is 3979 * pending (from KVM's perspective, but not "pending" across an instruction 3980 * boundary). ICEBP, a.k.a. INT1, is also not reflected here even though it 3981 * too is trap-like. 3982 * 3983 * KVM "works" despite these flaws as ICEBP isn't currently supported by the 3984 * emulator, Monitor Trap Flag is not marked pending on intercepted #DBs (the 3985 * #DB has already happened), and MTF isn't marked pending on code breakpoints 3986 * from the emulator (because such #DBs are fault-like and thus don't trigger 3987 * actions that fire on instruction retire). 3988 */ 3989 static unsigned long vmx_get_pending_dbg_trap(struct kvm_queued_exception *ex) 3990 { 3991 if (!ex->pending || ex->vector != DB_VECTOR) 3992 return 0; 3993 3994 /* General Detect #DBs are always fault-like. */ 3995 return ex->payload & ~DR6_BD; 3996 } 3997 3998 /* 3999 * Returns true if there's a pending #DB exception that is lower priority than 4000 * a pending Monitor Trap Flag VM-Exit. TSS T-flag #DBs are not emulated by 4001 * KVM, but could theoretically be injected by userspace. Note, this code is 4002 * imperfect, see above. 4003 */ 4004 static bool vmx_is_low_priority_db_trap(struct kvm_queued_exception *ex) 4005 { 4006 return vmx_get_pending_dbg_trap(ex) & ~DR6_BT; 4007 } 4008 4009 /* 4010 * Certain VM-exits set the 'pending debug exceptions' field to indicate a 4011 * recognized #DB (data or single-step) that has yet to be delivered. Since KVM 4012 * represents these debug traps with a payload that is said to be compatible 4013 * with the 'pending debug exceptions' field, write the payload to the VMCS 4014 * field if a VM-exit is delivered before the debug trap. 4015 */ 4016 static void nested_vmx_update_pending_dbg(struct kvm_vcpu *vcpu) 4017 { 4018 unsigned long pending_dbg; 4019 4020 pending_dbg = vmx_get_pending_dbg_trap(&vcpu->arch.exception); 4021 if (pending_dbg) 4022 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, pending_dbg); 4023 } 4024 4025 static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu) 4026 { 4027 return nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) && 4028 to_vmx(vcpu)->nested.preemption_timer_expired; 4029 } 4030 4031 static bool vmx_has_nested_events(struct kvm_vcpu *vcpu) 4032 { 4033 return nested_vmx_preemption_timer_pending(vcpu) || 4034 to_vmx(vcpu)->nested.mtf_pending; 4035 } 4036 4037 /* 4038 * Per the Intel SDM's table "Priority Among Concurrent Events", with minor 4039 * edits to fill in missing examples, e.g. #DB due to split-lock accesses, 4040 * and less minor edits to splice in the priority of VMX Non-Root specific 4041 * events, e.g. MTF and NMI/INTR-window exiting. 4042 * 4043 * 1 Hardware Reset and Machine Checks 4044 * - RESET 4045 * - Machine Check 4046 * 4047 * 2 Trap on Task Switch 4048 * - T flag in TSS is set (on task switch) 4049 * 4050 * 3 External Hardware Interventions 4051 * - FLUSH 4052 * - STOPCLK 4053 * - SMI 4054 * - INIT 4055 * 4056 * 3.5 Monitor Trap Flag (MTF) VM-exit[1] 4057 * 4058 * 4 Traps on Previous Instruction 4059 * - Breakpoints 4060 * - Trap-class Debug Exceptions (#DB due to TF flag set, data/I-O 4061 * breakpoint, or #DB due to a split-lock access) 4062 * 4063 * 4.3 VMX-preemption timer expired VM-exit 4064 * 4065 * 4.6 NMI-window exiting VM-exit[2] 4066 * 4067 * 5 Nonmaskable Interrupts (NMI) 4068 * 4069 * 5.5 Interrupt-window exiting VM-exit and Virtual-interrupt delivery 4070 * 4071 * 6 Maskable Hardware Interrupts 4072 * 4073 * 7 Code Breakpoint Fault 4074 * 4075 * 8 Faults from Fetching Next Instruction 4076 * - Code-Segment Limit Violation 4077 * - Code Page Fault 4078 * - Control protection exception (missing ENDBRANCH at target of indirect 4079 * call or jump) 4080 * 4081 * 9 Faults from Decoding Next Instruction 4082 * - Instruction length > 15 bytes 4083 * - Invalid Opcode 4084 * - Coprocessor Not Available 4085 * 4086 *10 Faults on Executing Instruction 4087 * - Overflow 4088 * - Bound error 4089 * - Invalid TSS 4090 * - Segment Not Present 4091 * - Stack fault 4092 * - General Protection 4093 * - Data Page Fault 4094 * - Alignment Check 4095 * - x86 FPU Floating-point exception 4096 * - SIMD floating-point exception 4097 * - Virtualization exception 4098 * - Control protection exception 4099 * 4100 * [1] Per the "Monitor Trap Flag" section: System-management interrupts (SMIs), 4101 * INIT signals, and higher priority events take priority over MTF VM exits. 4102 * MTF VM exits take priority over debug-trap exceptions and lower priority 4103 * events. 4104 * 4105 * [2] Debug-trap exceptions and higher priority events take priority over VM exits 4106 * caused by the VMX-preemption timer. VM exits caused by the VMX-preemption 4107 * timer take priority over VM exits caused by the "NMI-window exiting" 4108 * VM-execution control and lower priority events. 4109 * 4110 * [3] Debug-trap exceptions and higher priority events take priority over VM exits 4111 * caused by "NMI-window exiting". VM exits caused by this control take 4112 * priority over non-maskable interrupts (NMIs) and lower priority events. 4113 * 4114 * [4] Virtual-interrupt delivery has the same priority as that of VM exits due to 4115 * the 1-setting of the "interrupt-window exiting" VM-execution control. Thus, 4116 * non-maskable interrupts (NMIs) and higher priority events take priority over 4117 * delivery of a virtual interrupt; delivery of a virtual interrupt takes 4118 * priority over external interrupts and lower priority events. 4119 */ 4120 static int vmx_check_nested_events(struct kvm_vcpu *vcpu) 4121 { 4122 struct kvm_lapic *apic = vcpu->arch.apic; 4123 struct vcpu_vmx *vmx = to_vmx(vcpu); 4124 /* 4125 * Only a pending nested run blocks a pending exception. If there is a 4126 * previously injected event, the pending exception occurred while said 4127 * event was being delivered and thus needs to be handled. 4128 */ 4129 bool block_nested_exceptions = vmx->nested.nested_run_pending; 4130 /* 4131 * New events (not exceptions) are only recognized at instruction 4132 * boundaries. If an event needs reinjection, then KVM is handling a 4133 * VM-Exit that occurred _during_ instruction execution; new events are 4134 * blocked until the instruction completes. 4135 */ 4136 bool block_nested_events = block_nested_exceptions || 4137 kvm_event_needs_reinjection(vcpu); 4138 4139 if (lapic_in_kernel(vcpu) && 4140 test_bit(KVM_APIC_INIT, &apic->pending_events)) { 4141 if (block_nested_events) 4142 return -EBUSY; 4143 nested_vmx_update_pending_dbg(vcpu); 4144 clear_bit(KVM_APIC_INIT, &apic->pending_events); 4145 if (vcpu->arch.mp_state != KVM_MP_STATE_INIT_RECEIVED) 4146 nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0); 4147 4148 /* MTF is discarded if the vCPU is in WFS. */ 4149 vmx->nested.mtf_pending = false; 4150 return 0; 4151 } 4152 4153 if (lapic_in_kernel(vcpu) && 4154 test_bit(KVM_APIC_SIPI, &apic->pending_events)) { 4155 if (block_nested_events) 4156 return -EBUSY; 4157 4158 clear_bit(KVM_APIC_SIPI, &apic->pending_events); 4159 if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { 4160 nested_vmx_vmexit(vcpu, EXIT_REASON_SIPI_SIGNAL, 0, 4161 apic->sipi_vector & 0xFFUL); 4162 return 0; 4163 } 4164 /* Fallthrough, the SIPI is completely ignored. */ 4165 } 4166 4167 /* 4168 * Process exceptions that are higher priority than Monitor Trap Flag: 4169 * fault-like exceptions, TSS T flag #DB (not emulated by KVM, but 4170 * could theoretically come in from userspace), and ICEBP (INT1). 4171 * 4172 * TODO: SMIs have higher priority than MTF and trap-like #DBs (except 4173 * for TSS T flag #DBs). KVM also doesn't save/restore pending MTF 4174 * across SMI/RSM as it should; that needs to be addressed in order to 4175 * prioritize SMI over MTF and trap-like #DBs. 4176 */ 4177 if (vcpu->arch.exception_vmexit.pending && 4178 !vmx_is_low_priority_db_trap(&vcpu->arch.exception_vmexit)) { 4179 if (block_nested_exceptions) 4180 return -EBUSY; 4181 4182 nested_vmx_inject_exception_vmexit(vcpu); 4183 return 0; 4184 } 4185 4186 if (vcpu->arch.exception.pending && 4187 !vmx_is_low_priority_db_trap(&vcpu->arch.exception)) { 4188 if (block_nested_exceptions) 4189 return -EBUSY; 4190 goto no_vmexit; 4191 } 4192 4193 if (vmx->nested.mtf_pending) { 4194 if (block_nested_events) 4195 return -EBUSY; 4196 nested_vmx_update_pending_dbg(vcpu); 4197 nested_vmx_vmexit(vcpu, EXIT_REASON_MONITOR_TRAP_FLAG, 0, 0); 4198 return 0; 4199 } 4200 4201 if (vcpu->arch.exception_vmexit.pending) { 4202 if (block_nested_exceptions) 4203 return -EBUSY; 4204 4205 nested_vmx_inject_exception_vmexit(vcpu); 4206 return 0; 4207 } 4208 4209 if (vcpu->arch.exception.pending) { 4210 if (block_nested_exceptions) 4211 return -EBUSY; 4212 goto no_vmexit; 4213 } 4214 4215 if (nested_vmx_preemption_timer_pending(vcpu)) { 4216 if (block_nested_events) 4217 return -EBUSY; 4218 nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0); 4219 return 0; 4220 } 4221 4222 if (vcpu->arch.smi_pending && !is_smm(vcpu)) { 4223 if (block_nested_events) 4224 return -EBUSY; 4225 goto no_vmexit; 4226 } 4227 4228 if (vcpu->arch.nmi_pending && !vmx_nmi_blocked(vcpu)) { 4229 if (block_nested_events) 4230 return -EBUSY; 4231 if (!nested_exit_on_nmi(vcpu)) 4232 goto no_vmexit; 4233 4234 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, 4235 NMI_VECTOR | INTR_TYPE_NMI_INTR | 4236 INTR_INFO_VALID_MASK, 0); 4237 /* 4238 * The NMI-triggered VM exit counts as injection: 4239 * clear this one and block further NMIs. 4240 */ 4241 vcpu->arch.nmi_pending = 0; 4242 vmx_set_nmi_mask(vcpu, true); 4243 return 0; 4244 } 4245 4246 if (kvm_cpu_has_interrupt(vcpu) && !vmx_interrupt_blocked(vcpu)) { 4247 if (block_nested_events) 4248 return -EBUSY; 4249 if (!nested_exit_on_intr(vcpu)) 4250 goto no_vmexit; 4251 nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); 4252 return 0; 4253 } 4254 4255 no_vmexit: 4256 return vmx_complete_nested_posted_interrupt(vcpu); 4257 } 4258 4259 static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) 4260 { 4261 ktime_t remaining = 4262 hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer); 4263 u64 value; 4264 4265 if (ktime_to_ns(remaining) <= 0) 4266 return 0; 4267 4268 value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz; 4269 do_div(value, 1000000); 4270 return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 4271 } 4272 4273 static bool is_vmcs12_ext_field(unsigned long field) 4274 { 4275 switch (field) { 4276 case GUEST_ES_SELECTOR: 4277 case GUEST_CS_SELECTOR: 4278 case GUEST_SS_SELECTOR: 4279 case GUEST_DS_SELECTOR: 4280 case GUEST_FS_SELECTOR: 4281 case GUEST_GS_SELECTOR: 4282 case GUEST_LDTR_SELECTOR: 4283 case GUEST_TR_SELECTOR: 4284 case GUEST_ES_LIMIT: 4285 case GUEST_CS_LIMIT: 4286 case GUEST_SS_LIMIT: 4287 case GUEST_DS_LIMIT: 4288 case GUEST_FS_LIMIT: 4289 case GUEST_GS_LIMIT: 4290 case GUEST_LDTR_LIMIT: 4291 case GUEST_TR_LIMIT: 4292 case GUEST_GDTR_LIMIT: 4293 case GUEST_IDTR_LIMIT: 4294 case GUEST_ES_AR_BYTES: 4295 case GUEST_DS_AR_BYTES: 4296 case GUEST_FS_AR_BYTES: 4297 case GUEST_GS_AR_BYTES: 4298 case GUEST_LDTR_AR_BYTES: 4299 case GUEST_TR_AR_BYTES: 4300 case GUEST_ES_BASE: 4301 case GUEST_CS_BASE: 4302 case GUEST_SS_BASE: 4303 case GUEST_DS_BASE: 4304 case GUEST_FS_BASE: 4305 case GUEST_GS_BASE: 4306 case GUEST_LDTR_BASE: 4307 case GUEST_TR_BASE: 4308 case GUEST_GDTR_BASE: 4309 case GUEST_IDTR_BASE: 4310 case GUEST_PENDING_DBG_EXCEPTIONS: 4311 case GUEST_BNDCFGS: 4312 return true; 4313 default: 4314 break; 4315 } 4316 4317 return false; 4318 } 4319 4320 static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, 4321 struct vmcs12 *vmcs12) 4322 { 4323 struct vcpu_vmx *vmx = to_vmx(vcpu); 4324 4325 vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR); 4326 vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR); 4327 vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR); 4328 vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR); 4329 vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR); 4330 vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR); 4331 vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR); 4332 vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR); 4333 vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT); 4334 vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT); 4335 vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT); 4336 vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT); 4337 vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT); 4338 vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT); 4339 vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT); 4340 vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT); 4341 vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT); 4342 vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT); 4343 vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES); 4344 vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES); 4345 vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES); 4346 vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES); 4347 vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES); 4348 vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES); 4349 vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE); 4350 vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE); 4351 vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE); 4352 vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE); 4353 vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE); 4354 vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE); 4355 vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE); 4356 vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE); 4357 vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE); 4358 vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE); 4359 vmcs12->guest_pending_dbg_exceptions = 4360 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); 4361 4362 vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false; 4363 } 4364 4365 static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, 4366 struct vmcs12 *vmcs12) 4367 { 4368 struct vcpu_vmx *vmx = to_vmx(vcpu); 4369 int cpu; 4370 4371 if (!vmx->nested.need_sync_vmcs02_to_vmcs12_rare) 4372 return; 4373 4374 4375 WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01); 4376 4377 cpu = get_cpu(); 4378 vmx->loaded_vmcs = &vmx->nested.vmcs02; 4379 vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->vmcs01); 4380 4381 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 4382 4383 vmx->loaded_vmcs = &vmx->vmcs01; 4384 vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->nested.vmcs02); 4385 put_cpu(); 4386 } 4387 4388 /* 4389 * Update the guest state fields of vmcs12 to reflect changes that 4390 * occurred while L2 was running. (The "IA-32e mode guest" bit of the 4391 * VM-entry controls is also updated, since this is really a guest 4392 * state bit.) 4393 */ 4394 static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 4395 { 4396 struct vcpu_vmx *vmx = to_vmx(vcpu); 4397 4398 if (nested_vmx_is_evmptr12_valid(vmx)) 4399 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 4400 4401 vmx->nested.need_sync_vmcs02_to_vmcs12_rare = 4402 !nested_vmx_is_evmptr12_valid(vmx); 4403 4404 vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); 4405 vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); 4406 4407 vmcs12->guest_rsp = kvm_rsp_read(vcpu); 4408 vmcs12->guest_rip = kvm_rip_read(vcpu); 4409 vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS); 4410 4411 vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES); 4412 vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES); 4413 4414 vmcs12->guest_interruptibility_info = 4415 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); 4416 4417 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) 4418 vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT; 4419 else if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) 4420 vmcs12->guest_activity_state = GUEST_ACTIVITY_WAIT_SIPI; 4421 else 4422 vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE; 4423 4424 if (nested_cpu_has_preemption_timer(vmcs12) && 4425 vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER && 4426 !vmx->nested.nested_run_pending) 4427 vmcs12->vmx_preemption_timer_value = 4428 vmx_get_preemption_timer_value(vcpu); 4429 4430 /* 4431 * In some cases (usually, nested EPT), L2 is allowed to change its 4432 * own CR3 without exiting. If it has changed it, we must keep it. 4433 * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined 4434 * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12. 4435 * 4436 * Additionally, restore L2's PDPTR to vmcs12. 4437 */ 4438 if (enable_ept) { 4439 vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3); 4440 if (nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { 4441 vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0); 4442 vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1); 4443 vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2); 4444 vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); 4445 } 4446 } 4447 4448 vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS); 4449 4450 if (nested_cpu_has_vid(vmcs12)) 4451 vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS); 4452 4453 vmcs12->vm_entry_controls = 4454 (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | 4455 (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); 4456 4457 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) 4458 vmcs12->guest_dr7 = vcpu->arch.dr7; 4459 4460 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) 4461 vmcs12->guest_ia32_efer = vcpu->arch.efer; 4462 } 4463 4464 /* 4465 * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits 4466 * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), 4467 * and this function updates it to reflect the changes to the guest state while 4468 * L2 was running (and perhaps made some exits which were handled directly by L0 4469 * without going back to L1), and to reflect the exit reason. 4470 * Note that we do not have to copy here all VMCS fields, just those that 4471 * could have changed by the L2 guest or the exit - i.e., the guest-state and 4472 * exit-information fields only. Other fields are modified by L1 with VMWRITE, 4473 * which already writes to vmcs12 directly. 4474 */ 4475 static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, 4476 u32 vm_exit_reason, u32 exit_intr_info, 4477 unsigned long exit_qualification) 4478 { 4479 /* update exit information fields: */ 4480 vmcs12->vm_exit_reason = vm_exit_reason; 4481 if (to_vmx(vcpu)->exit_reason.enclave_mode) 4482 vmcs12->vm_exit_reason |= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE; 4483 vmcs12->exit_qualification = exit_qualification; 4484 4485 /* 4486 * On VM-Exit due to a failed VM-Entry, the VMCS isn't marked launched 4487 * and only EXIT_REASON and EXIT_QUALIFICATION are updated, all other 4488 * exit info fields are unmodified. 4489 */ 4490 if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) { 4491 vmcs12->launch_state = 1; 4492 4493 /* vm_entry_intr_info_field is cleared on exit. Emulate this 4494 * instead of reading the real value. */ 4495 vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK; 4496 4497 /* 4498 * Transfer the event that L0 or L1 may wanted to inject into 4499 * L2 to IDT_VECTORING_INFO_FIELD. 4500 */ 4501 vmcs12_save_pending_event(vcpu, vmcs12, 4502 vm_exit_reason, exit_intr_info); 4503 4504 vmcs12->vm_exit_intr_info = exit_intr_info; 4505 vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 4506 vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 4507 4508 /* 4509 * According to spec, there's no need to store the guest's 4510 * MSRs if the exit is due to a VM-entry failure that occurs 4511 * during or after loading the guest state. Since this exit 4512 * does not fall in that category, we need to save the MSRs. 4513 */ 4514 if (nested_vmx_store_msr(vcpu, 4515 vmcs12->vm_exit_msr_store_addr, 4516 vmcs12->vm_exit_msr_store_count)) 4517 nested_vmx_abort(vcpu, 4518 VMX_ABORT_SAVE_GUEST_MSR_FAIL); 4519 } 4520 } 4521 4522 /* 4523 * A part of what we need to when the nested L2 guest exits and we want to 4524 * run its L1 parent, is to reset L1's guest state to the host state specified 4525 * in vmcs12. 4526 * This function is to be called not only on normal nested exit, but also on 4527 * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry 4528 * Failures During or After Loading Guest State"). 4529 * This function should be called when the active VMCS is L1's (vmcs01). 4530 */ 4531 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, 4532 struct vmcs12 *vmcs12) 4533 { 4534 enum vm_entry_failure_code ignored; 4535 struct kvm_segment seg; 4536 4537 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) 4538 vcpu->arch.efer = vmcs12->host_ia32_efer; 4539 else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) 4540 vcpu->arch.efer |= (EFER_LMA | EFER_LME); 4541 else 4542 vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); 4543 vmx_set_efer(vcpu, vcpu->arch.efer); 4544 4545 kvm_rsp_write(vcpu, vmcs12->host_rsp); 4546 kvm_rip_write(vcpu, vmcs12->host_rip); 4547 vmx_set_rflags(vcpu, X86_EFLAGS_FIXED); 4548 vmx_set_interrupt_shadow(vcpu, 0); 4549 4550 /* 4551 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't 4552 * actually changed, because vmx_set_cr0 refers to efer set above. 4553 * 4554 * CR0_GUEST_HOST_MASK is already set in the original vmcs01 4555 * (KVM doesn't change it); 4556 */ 4557 vcpu->arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits(); 4558 vmx_set_cr0(vcpu, vmcs12->host_cr0); 4559 4560 /* Same as above - no reason to call set_cr4_guest_host_mask(). */ 4561 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); 4562 vmx_set_cr4(vcpu, vmcs12->host_cr4); 4563 4564 nested_ept_uninit_mmu_context(vcpu); 4565 4566 /* 4567 * Only PDPTE load can fail as the value of cr3 was checked on entry and 4568 * couldn't have changed. 4569 */ 4570 if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, true, &ignored)) 4571 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL); 4572 4573 nested_vmx_transition_tlb_flush(vcpu, vmcs12, false); 4574 4575 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs); 4576 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp); 4577 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); 4578 vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); 4579 vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); 4580 vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF); 4581 vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF); 4582 4583 /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */ 4584 if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS) 4585 vmcs_write64(GUEST_BNDCFGS, 0); 4586 4587 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) { 4588 vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); 4589 vcpu->arch.pat = vmcs12->host_ia32_pat; 4590 } 4591 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) && 4592 kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu))) 4593 WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, 4594 vmcs12->host_ia32_perf_global_ctrl)); 4595 4596 /* Set L1 segment info according to Intel SDM 4597 27.5.2 Loading Host Segment and Descriptor-Table Registers */ 4598 seg = (struct kvm_segment) { 4599 .base = 0, 4600 .limit = 0xFFFFFFFF, 4601 .selector = vmcs12->host_cs_selector, 4602 .type = 11, 4603 .present = 1, 4604 .s = 1, 4605 .g = 1 4606 }; 4607 if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) 4608 seg.l = 1; 4609 else 4610 seg.db = 1; 4611 __vmx_set_segment(vcpu, &seg, VCPU_SREG_CS); 4612 seg = (struct kvm_segment) { 4613 .base = 0, 4614 .limit = 0xFFFFFFFF, 4615 .type = 3, 4616 .present = 1, 4617 .s = 1, 4618 .db = 1, 4619 .g = 1 4620 }; 4621 seg.selector = vmcs12->host_ds_selector; 4622 __vmx_set_segment(vcpu, &seg, VCPU_SREG_DS); 4623 seg.selector = vmcs12->host_es_selector; 4624 __vmx_set_segment(vcpu, &seg, VCPU_SREG_ES); 4625 seg.selector = vmcs12->host_ss_selector; 4626 __vmx_set_segment(vcpu, &seg, VCPU_SREG_SS); 4627 seg.selector = vmcs12->host_fs_selector; 4628 seg.base = vmcs12->host_fs_base; 4629 __vmx_set_segment(vcpu, &seg, VCPU_SREG_FS); 4630 seg.selector = vmcs12->host_gs_selector; 4631 seg.base = vmcs12->host_gs_base; 4632 __vmx_set_segment(vcpu, &seg, VCPU_SREG_GS); 4633 seg = (struct kvm_segment) { 4634 .base = vmcs12->host_tr_base, 4635 .limit = 0x67, 4636 .selector = vmcs12->host_tr_selector, 4637 .type = 11, 4638 .present = 1 4639 }; 4640 __vmx_set_segment(vcpu, &seg, VCPU_SREG_TR); 4641 4642 memset(&seg, 0, sizeof(seg)); 4643 seg.unusable = 1; 4644 __vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR); 4645 4646 kvm_set_dr(vcpu, 7, 0x400); 4647 vmcs_write64(GUEST_IA32_DEBUGCTL, 0); 4648 4649 if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, 4650 vmcs12->vm_exit_msr_load_count)) 4651 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); 4652 4653 to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu); 4654 } 4655 4656 static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx) 4657 { 4658 struct vmx_uret_msr *efer_msr; 4659 unsigned int i; 4660 4661 if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER) 4662 return vmcs_read64(GUEST_IA32_EFER); 4663 4664 if (cpu_has_load_ia32_efer()) 4665 return host_efer; 4666 4667 for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) { 4668 if (vmx->msr_autoload.guest.val[i].index == MSR_EFER) 4669 return vmx->msr_autoload.guest.val[i].value; 4670 } 4671 4672 efer_msr = vmx_find_uret_msr(vmx, MSR_EFER); 4673 if (efer_msr) 4674 return efer_msr->data; 4675 4676 return host_efer; 4677 } 4678 4679 static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) 4680 { 4681 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 4682 struct vcpu_vmx *vmx = to_vmx(vcpu); 4683 struct vmx_msr_entry g, h; 4684 gpa_t gpa; 4685 u32 i, j; 4686 4687 vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT); 4688 4689 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) { 4690 /* 4691 * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set 4692 * as vmcs01.GUEST_DR7 contains a userspace defined value 4693 * and vcpu->arch.dr7 is not squirreled away before the 4694 * nested VMENTER (not worth adding a variable in nested_vmx). 4695 */ 4696 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) 4697 kvm_set_dr(vcpu, 7, DR7_FIXED_1); 4698 else 4699 WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7))); 4700 } 4701 4702 /* 4703 * Note that calling vmx_set_{efer,cr0,cr4} is important as they 4704 * handle a variety of side effects to KVM's software model. 4705 */ 4706 vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx)); 4707 4708 vcpu->arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits(); 4709 vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW)); 4710 4711 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); 4712 vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW)); 4713 4714 nested_ept_uninit_mmu_context(vcpu); 4715 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); 4716 kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); 4717 4718 /* 4719 * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs 4720 * from vmcs01 (if necessary). The PDPTRs are not loaded on 4721 * VMFail, like everything else we just need to ensure our 4722 * software model is up-to-date. 4723 */ 4724 if (enable_ept && is_pae_paging(vcpu)) 4725 ept_save_pdptrs(vcpu); 4726 4727 kvm_mmu_reset_context(vcpu); 4728 4729 /* 4730 * This nasty bit of open coding is a compromise between blindly 4731 * loading L1's MSRs using the exit load lists (incorrect emulation 4732 * of VMFail), leaving the nested VM's MSRs in the software model 4733 * (incorrect behavior) and snapshotting the modified MSRs (too 4734 * expensive since the lists are unbound by hardware). For each 4735 * MSR that was (prematurely) loaded from the nested VMEntry load 4736 * list, reload it from the exit load list if it exists and differs 4737 * from the guest value. The intent is to stuff host state as 4738 * silently as possible, not to fully process the exit load list. 4739 */ 4740 for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) { 4741 gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g)); 4742 if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) { 4743 pr_debug_ratelimited( 4744 "%s read MSR index failed (%u, 0x%08llx)\n", 4745 __func__, i, gpa); 4746 goto vmabort; 4747 } 4748 4749 for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) { 4750 gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h)); 4751 if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) { 4752 pr_debug_ratelimited( 4753 "%s read MSR failed (%u, 0x%08llx)\n", 4754 __func__, j, gpa); 4755 goto vmabort; 4756 } 4757 if (h.index != g.index) 4758 continue; 4759 if (h.value == g.value) 4760 break; 4761 4762 if (nested_vmx_load_msr_check(vcpu, &h)) { 4763 pr_debug_ratelimited( 4764 "%s check failed (%u, 0x%x, 0x%x)\n", 4765 __func__, j, h.index, h.reserved); 4766 goto vmabort; 4767 } 4768 4769 if (kvm_set_msr(vcpu, h.index, h.value)) { 4770 pr_debug_ratelimited( 4771 "%s WRMSR failed (%u, 0x%x, 0x%llx)\n", 4772 __func__, j, h.index, h.value); 4773 goto vmabort; 4774 } 4775 } 4776 } 4777 4778 return; 4779 4780 vmabort: 4781 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); 4782 } 4783 4784 /* 4785 * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1 4786 * and modify vmcs12 to make it see what it would expect to see there if 4787 * L2 was its real guest. Must only be called when in L2 (is_guest_mode()) 4788 */ 4789 void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, 4790 u32 exit_intr_info, unsigned long exit_qualification) 4791 { 4792 struct vcpu_vmx *vmx = to_vmx(vcpu); 4793 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 4794 4795 /* Pending MTF traps are discarded on VM-Exit. */ 4796 vmx->nested.mtf_pending = false; 4797 4798 /* trying to cancel vmlaunch/vmresume is a bug */ 4799 WARN_ON_ONCE(vmx->nested.nested_run_pending); 4800 4801 #ifdef CONFIG_KVM_HYPERV 4802 if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) { 4803 /* 4804 * KVM_REQ_GET_NESTED_STATE_PAGES is also used to map 4805 * Enlightened VMCS after migration and we still need to 4806 * do that when something is forcing L2->L1 exit prior to 4807 * the first L2 run. 4808 */ 4809 (void)nested_get_evmcs_page(vcpu); 4810 } 4811 #endif 4812 4813 /* Service pending TLB flush requests for L2 before switching to L1. */ 4814 kvm_service_local_tlb_flush_requests(vcpu); 4815 4816 /* 4817 * VCPU_EXREG_PDPTR will be clobbered in arch/x86/kvm/vmx/vmx.h between 4818 * now and the new vmentry. Ensure that the VMCS02 PDPTR fields are 4819 * up-to-date before switching to L1. 4820 */ 4821 if (enable_ept && is_pae_paging(vcpu)) 4822 vmx_ept_load_pdptrs(vcpu); 4823 4824 leave_guest_mode(vcpu); 4825 4826 if (nested_cpu_has_preemption_timer(vmcs12)) 4827 hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); 4828 4829 if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING)) { 4830 vcpu->arch.tsc_offset = vcpu->arch.l1_tsc_offset; 4831 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING)) 4832 vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio; 4833 } 4834 4835 if (likely(!vmx->fail)) { 4836 sync_vmcs02_to_vmcs12(vcpu, vmcs12); 4837 4838 if (vm_exit_reason != -1) 4839 prepare_vmcs12(vcpu, vmcs12, vm_exit_reason, 4840 exit_intr_info, exit_qualification); 4841 4842 /* 4843 * Must happen outside of sync_vmcs02_to_vmcs12() as it will 4844 * also be used to capture vmcs12 cache as part of 4845 * capturing nVMX state for snapshot (migration). 4846 * 4847 * Otherwise, this flush will dirty guest memory at a 4848 * point it is already assumed by user-space to be 4849 * immutable. 4850 */ 4851 nested_flush_cached_shadow_vmcs12(vcpu, vmcs12); 4852 } else { 4853 /* 4854 * The only expected VM-instruction error is "VM entry with 4855 * invalid control field(s)." Anything else indicates a 4856 * problem with L0. And we should never get here with a 4857 * VMFail of any type if early consistency checks are enabled. 4858 */ 4859 WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) != 4860 VMXERR_ENTRY_INVALID_CONTROL_FIELD); 4861 WARN_ON_ONCE(nested_early_check); 4862 } 4863 4864 /* 4865 * Drop events/exceptions that were queued for re-injection to L2 4866 * (picked up via vmx_complete_interrupts()), as well as exceptions 4867 * that were pending for L2. Note, this must NOT be hoisted above 4868 * prepare_vmcs12(), events/exceptions queued for re-injection need to 4869 * be captured in vmcs12 (see vmcs12_save_pending_event()). 4870 */ 4871 vcpu->arch.nmi_injected = false; 4872 kvm_clear_exception_queue(vcpu); 4873 kvm_clear_interrupt_queue(vcpu); 4874 4875 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 4876 4877 /* 4878 * If IBRS is advertised to the vCPU, KVM must flush the indirect 4879 * branch predictors when transitioning from L2 to L1, as L1 expects 4880 * hardware (KVM in this case) to provide separate predictor modes. 4881 * Bare metal isolates VMX root (host) from VMX non-root (guest), but 4882 * doesn't isolate different VMCSs, i.e. in this case, doesn't provide 4883 * separate modes for L2 vs L1. 4884 */ 4885 if (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) 4886 indirect_branch_prediction_barrier(); 4887 4888 /* Update any VMCS fields that might have changed while L2 ran */ 4889 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 4890 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 4891 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); 4892 if (kvm_caps.has_tsc_control) 4893 vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); 4894 4895 if (vmx->nested.l1_tpr_threshold != -1) 4896 vmcs_write32(TPR_THRESHOLD, vmx->nested.l1_tpr_threshold); 4897 4898 if (vmx->nested.change_vmcs01_virtual_apic_mode) { 4899 vmx->nested.change_vmcs01_virtual_apic_mode = false; 4900 vmx_set_virtual_apic_mode(vcpu); 4901 } 4902 4903 if (vmx->nested.update_vmcs01_cpu_dirty_logging) { 4904 vmx->nested.update_vmcs01_cpu_dirty_logging = false; 4905 vmx_update_cpu_dirty_logging(vcpu); 4906 } 4907 4908 /* Unpin physical memory we referred to in vmcs02 */ 4909 kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map, false); 4910 kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); 4911 kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true); 4912 vmx->nested.pi_desc = NULL; 4913 4914 if (vmx->nested.reload_vmcs01_apic_access_page) { 4915 vmx->nested.reload_vmcs01_apic_access_page = false; 4916 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); 4917 } 4918 4919 if (vmx->nested.update_vmcs01_apicv_status) { 4920 vmx->nested.update_vmcs01_apicv_status = false; 4921 kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu); 4922 } 4923 4924 if ((vm_exit_reason != -1) && 4925 (enable_shadow_vmcs || nested_vmx_is_evmptr12_valid(vmx))) 4926 vmx->nested.need_vmcs12_to_shadow_sync = true; 4927 4928 /* in case we halted in L2 */ 4929 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 4930 4931 if (likely(!vmx->fail)) { 4932 if ((u16)vm_exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT && 4933 nested_exit_intr_ack_set(vcpu)) { 4934 int irq = kvm_cpu_get_interrupt(vcpu); 4935 WARN_ON(irq < 0); 4936 vmcs12->vm_exit_intr_info = irq | 4937 INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR; 4938 } 4939 4940 if (vm_exit_reason != -1) 4941 trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, 4942 vmcs12->exit_qualification, 4943 vmcs12->idt_vectoring_info_field, 4944 vmcs12->vm_exit_intr_info, 4945 vmcs12->vm_exit_intr_error_code, 4946 KVM_ISA_VMX); 4947 4948 load_vmcs12_host_state(vcpu, vmcs12); 4949 4950 return; 4951 } 4952 4953 /* 4954 * After an early L2 VM-entry failure, we're now back 4955 * in L1 which thinks it just finished a VMLAUNCH or 4956 * VMRESUME instruction, so we need to set the failure 4957 * flag and the VM-instruction error field of the VMCS 4958 * accordingly, and skip the emulated instruction. 4959 */ 4960 (void)nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 4961 4962 /* 4963 * Restore L1's host state to KVM's software model. We're here 4964 * because a consistency check was caught by hardware, which 4965 * means some amount of guest state has been propagated to KVM's 4966 * model and needs to be unwound to the host's state. 4967 */ 4968 nested_vmx_restore_host_state(vcpu); 4969 4970 vmx->fail = 0; 4971 } 4972 4973 static void nested_vmx_triple_fault(struct kvm_vcpu *vcpu) 4974 { 4975 kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu); 4976 nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0); 4977 } 4978 4979 /* 4980 * Decode the memory-address operand of a vmx instruction, as recorded on an 4981 * exit caused by such an instruction (run by a guest hypervisor). 4982 * On success, returns 0. When the operand is invalid, returns 1 and throws 4983 * #UD, #GP, or #SS. 4984 */ 4985 int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, 4986 u32 vmx_instruction_info, bool wr, int len, gva_t *ret) 4987 { 4988 gva_t off; 4989 bool exn; 4990 struct kvm_segment s; 4991 4992 /* 4993 * According to Vol. 3B, "Information for VM Exits Due to Instruction 4994 * Execution", on an exit, vmx_instruction_info holds most of the 4995 * addressing components of the operand. Only the displacement part 4996 * is put in exit_qualification (see 3B, "Basic VM-Exit Information"). 4997 * For how an actual address is calculated from all these components, 4998 * refer to Vol. 1, "Operand Addressing". 4999 */ 5000 int scaling = vmx_instruction_info & 3; 5001 int addr_size = (vmx_instruction_info >> 7) & 7; 5002 bool is_reg = vmx_instruction_info & (1u << 10); 5003 int seg_reg = (vmx_instruction_info >> 15) & 7; 5004 int index_reg = (vmx_instruction_info >> 18) & 0xf; 5005 bool index_is_valid = !(vmx_instruction_info & (1u << 22)); 5006 int base_reg = (vmx_instruction_info >> 23) & 0xf; 5007 bool base_is_valid = !(vmx_instruction_info & (1u << 27)); 5008 5009 if (is_reg) { 5010 kvm_queue_exception(vcpu, UD_VECTOR); 5011 return 1; 5012 } 5013 5014 /* Addr = segment_base + offset */ 5015 /* offset = base + [index * scale] + displacement */ 5016 off = exit_qualification; /* holds the displacement */ 5017 if (addr_size == 1) 5018 off = (gva_t)sign_extend64(off, 31); 5019 else if (addr_size == 0) 5020 off = (gva_t)sign_extend64(off, 15); 5021 if (base_is_valid) 5022 off += kvm_register_read(vcpu, base_reg); 5023 if (index_is_valid) 5024 off += kvm_register_read(vcpu, index_reg) << scaling; 5025 vmx_get_segment(vcpu, &s, seg_reg); 5026 5027 /* 5028 * The effective address, i.e. @off, of a memory operand is truncated 5029 * based on the address size of the instruction. Note that this is 5030 * the *effective address*, i.e. the address prior to accounting for 5031 * the segment's base. 5032 */ 5033 if (addr_size == 1) /* 32 bit */ 5034 off &= 0xffffffff; 5035 else if (addr_size == 0) /* 16 bit */ 5036 off &= 0xffff; 5037 5038 /* Checks for #GP/#SS exceptions. */ 5039 exn = false; 5040 if (is_long_mode(vcpu)) { 5041 /* 5042 * The virtual/linear address is never truncated in 64-bit 5043 * mode, e.g. a 32-bit address size can yield a 64-bit virtual 5044 * address when using FS/GS with a non-zero base. 5045 */ 5046 if (seg_reg == VCPU_SREG_FS || seg_reg == VCPU_SREG_GS) 5047 *ret = s.base + off; 5048 else 5049 *ret = off; 5050 5051 *ret = vmx_get_untagged_addr(vcpu, *ret, 0); 5052 /* Long mode: #GP(0)/#SS(0) if the memory address is in a 5053 * non-canonical form. This is the only check on the memory 5054 * destination for long mode! 5055 */ 5056 exn = is_noncanonical_address(*ret, vcpu); 5057 } else { 5058 /* 5059 * When not in long mode, the virtual/linear address is 5060 * unconditionally truncated to 32 bits regardless of the 5061 * address size. 5062 */ 5063 *ret = (s.base + off) & 0xffffffff; 5064 5065 /* Protected mode: apply checks for segment validity in the 5066 * following order: 5067 * - segment type check (#GP(0) may be thrown) 5068 * - usability check (#GP(0)/#SS(0)) 5069 * - limit check (#GP(0)/#SS(0)) 5070 */ 5071 if (wr) 5072 /* #GP(0) if the destination operand is located in a 5073 * read-only data segment or any code segment. 5074 */ 5075 exn = ((s.type & 0xa) == 0 || (s.type & 8)); 5076 else 5077 /* #GP(0) if the source operand is located in an 5078 * execute-only code segment 5079 */ 5080 exn = ((s.type & 0xa) == 8); 5081 if (exn) { 5082 kvm_queue_exception_e(vcpu, GP_VECTOR, 0); 5083 return 1; 5084 } 5085 /* Protected mode: #GP(0)/#SS(0) if the segment is unusable. 5086 */ 5087 exn = (s.unusable != 0); 5088 5089 /* 5090 * Protected mode: #GP(0)/#SS(0) if the memory operand is 5091 * outside the segment limit. All CPUs that support VMX ignore 5092 * limit checks for flat segments, i.e. segments with base==0, 5093 * limit==0xffffffff and of type expand-up data or code. 5094 */ 5095 if (!(s.base == 0 && s.limit == 0xffffffff && 5096 ((s.type & 8) || !(s.type & 4)))) 5097 exn = exn || ((u64)off + len - 1 > s.limit); 5098 } 5099 if (exn) { 5100 kvm_queue_exception_e(vcpu, 5101 seg_reg == VCPU_SREG_SS ? 5102 SS_VECTOR : GP_VECTOR, 5103 0); 5104 return 1; 5105 } 5106 5107 return 0; 5108 } 5109 5110 static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer, 5111 int *ret) 5112 { 5113 gva_t gva; 5114 struct x86_exception e; 5115 int r; 5116 5117 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 5118 vmcs_read32(VMX_INSTRUCTION_INFO), false, 5119 sizeof(*vmpointer), &gva)) { 5120 *ret = 1; 5121 return -EINVAL; 5122 } 5123 5124 r = kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e); 5125 if (r != X86EMUL_CONTINUE) { 5126 *ret = kvm_handle_memory_failure(vcpu, r, &e); 5127 return -EINVAL; 5128 } 5129 5130 return 0; 5131 } 5132 5133 /* 5134 * Allocate a shadow VMCS and associate it with the currently loaded 5135 * VMCS, unless such a shadow VMCS already exists. The newly allocated 5136 * VMCS is also VMCLEARed, so that it is ready for use. 5137 */ 5138 static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu) 5139 { 5140 struct vcpu_vmx *vmx = to_vmx(vcpu); 5141 struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs; 5142 5143 /* 5144 * KVM allocates a shadow VMCS only when L1 executes VMXON and frees it 5145 * when L1 executes VMXOFF or the vCPU is forced out of nested 5146 * operation. VMXON faults if the CPU is already post-VMXON, so it 5147 * should be impossible to already have an allocated shadow VMCS. KVM 5148 * doesn't support virtualization of VMCS shadowing, so vmcs01 should 5149 * always be the loaded VMCS. 5150 */ 5151 if (WARN_ON(loaded_vmcs != &vmx->vmcs01 || loaded_vmcs->shadow_vmcs)) 5152 return loaded_vmcs->shadow_vmcs; 5153 5154 loaded_vmcs->shadow_vmcs = alloc_vmcs(true); 5155 if (loaded_vmcs->shadow_vmcs) 5156 vmcs_clear(loaded_vmcs->shadow_vmcs); 5157 5158 return loaded_vmcs->shadow_vmcs; 5159 } 5160 5161 static int enter_vmx_operation(struct kvm_vcpu *vcpu) 5162 { 5163 struct vcpu_vmx *vmx = to_vmx(vcpu); 5164 int r; 5165 5166 r = alloc_loaded_vmcs(&vmx->nested.vmcs02); 5167 if (r < 0) 5168 goto out_vmcs02; 5169 5170 vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); 5171 if (!vmx->nested.cached_vmcs12) 5172 goto out_cached_vmcs12; 5173 5174 vmx->nested.shadow_vmcs12_cache.gpa = INVALID_GPA; 5175 vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); 5176 if (!vmx->nested.cached_shadow_vmcs12) 5177 goto out_cached_shadow_vmcs12; 5178 5179 if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu)) 5180 goto out_shadow_vmcs; 5181 5182 hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, 5183 HRTIMER_MODE_ABS_PINNED); 5184 vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; 5185 5186 vmx->nested.vpid02 = allocate_vpid(); 5187 5188 vmx->nested.vmcs02_initialized = false; 5189 vmx->nested.vmxon = true; 5190 5191 if (vmx_pt_mode_is_host_guest()) { 5192 vmx->pt_desc.guest.ctl = 0; 5193 pt_update_intercept_for_msr(vcpu); 5194 } 5195 5196 return 0; 5197 5198 out_shadow_vmcs: 5199 kfree(vmx->nested.cached_shadow_vmcs12); 5200 5201 out_cached_shadow_vmcs12: 5202 kfree(vmx->nested.cached_vmcs12); 5203 5204 out_cached_vmcs12: 5205 free_loaded_vmcs(&vmx->nested.vmcs02); 5206 5207 out_vmcs02: 5208 return -ENOMEM; 5209 } 5210 5211 /* Emulate the VMXON instruction. */ 5212 static int handle_vmxon(struct kvm_vcpu *vcpu) 5213 { 5214 int ret; 5215 gpa_t vmptr; 5216 uint32_t revision; 5217 struct vcpu_vmx *vmx = to_vmx(vcpu); 5218 const u64 VMXON_NEEDED_FEATURES = FEAT_CTL_LOCKED 5219 | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; 5220 5221 /* 5222 * Manually check CR4.VMXE checks, KVM must force CR4.VMXE=1 to enter 5223 * the guest and so cannot rely on hardware to perform the check, 5224 * which has higher priority than VM-Exit (see Intel SDM's pseudocode 5225 * for VMXON). 5226 * 5227 * Rely on hardware for the other pre-VM-Exit checks, CR0.PE=1, !VM86 5228 * and !COMPATIBILITY modes. For an unrestricted guest, KVM doesn't 5229 * force any of the relevant guest state. For a restricted guest, KVM 5230 * does force CR0.PE=1, but only to also force VM86 in order to emulate 5231 * Real Mode, and so there's no need to check CR0.PE manually. 5232 */ 5233 if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_VMXE)) { 5234 kvm_queue_exception(vcpu, UD_VECTOR); 5235 return 1; 5236 } 5237 5238 /* 5239 * The CPL is checked for "not in VMX operation" and for "in VMX root", 5240 * and has higher priority than the VM-Fail due to being post-VMXON, 5241 * i.e. VMXON #GPs outside of VMX non-root if CPL!=0. In VMX non-root, 5242 * VMXON causes VM-Exit and KVM unconditionally forwards VMXON VM-Exits 5243 * from L2 to L1, i.e. there's no need to check for the vCPU being in 5244 * VMX non-root. 5245 * 5246 * Forwarding the VM-Exit unconditionally, i.e. without performing the 5247 * #UD checks (see above), is functionally ok because KVM doesn't allow 5248 * L1 to run L2 without CR4.VMXE=0, and because KVM never modifies L2's 5249 * CR0 or CR4, i.e. it's L2's responsibility to emulate #UDs that are 5250 * missed by hardware due to shadowing CR0 and/or CR4. 5251 */ 5252 if (vmx_get_cpl(vcpu)) { 5253 kvm_inject_gp(vcpu, 0); 5254 return 1; 5255 } 5256 5257 if (vmx->nested.vmxon) 5258 return nested_vmx_fail(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); 5259 5260 /* 5261 * Invalid CR0/CR4 generates #GP. These checks are performed if and 5262 * only if the vCPU isn't already in VMX operation, i.e. effectively 5263 * have lower priority than the VM-Fail above. 5264 */ 5265 if (!nested_host_cr0_valid(vcpu, kvm_read_cr0(vcpu)) || 5266 !nested_host_cr4_valid(vcpu, kvm_read_cr4(vcpu))) { 5267 kvm_inject_gp(vcpu, 0); 5268 return 1; 5269 } 5270 5271 if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES) 5272 != VMXON_NEEDED_FEATURES) { 5273 kvm_inject_gp(vcpu, 0); 5274 return 1; 5275 } 5276 5277 if (nested_vmx_get_vmptr(vcpu, &vmptr, &ret)) 5278 return ret; 5279 5280 /* 5281 * SDM 3: 24.11.5 5282 * The first 4 bytes of VMXON region contain the supported 5283 * VMCS revision identifier 5284 * 5285 * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case; 5286 * which replaces physical address width with 32 5287 */ 5288 if (!page_address_valid(vcpu, vmptr)) 5289 return nested_vmx_failInvalid(vcpu); 5290 5291 if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) || 5292 revision != VMCS12_REVISION) 5293 return nested_vmx_failInvalid(vcpu); 5294 5295 vmx->nested.vmxon_ptr = vmptr; 5296 ret = enter_vmx_operation(vcpu); 5297 if (ret) 5298 return ret; 5299 5300 return nested_vmx_succeed(vcpu); 5301 } 5302 5303 static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu) 5304 { 5305 struct vcpu_vmx *vmx = to_vmx(vcpu); 5306 5307 if (vmx->nested.current_vmptr == INVALID_GPA) 5308 return; 5309 5310 copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu)); 5311 5312 if (enable_shadow_vmcs) { 5313 /* copy to memory all shadowed fields in case 5314 they were modified */ 5315 copy_shadow_to_vmcs12(vmx); 5316 vmx_disable_shadow_vmcs(vmx); 5317 } 5318 vmx->nested.posted_intr_nv = -1; 5319 5320 /* Flush VMCS12 to guest memory */ 5321 kvm_vcpu_write_guest_page(vcpu, 5322 vmx->nested.current_vmptr >> PAGE_SHIFT, 5323 vmx->nested.cached_vmcs12, 0, VMCS12_SIZE); 5324 5325 kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); 5326 5327 vmx->nested.current_vmptr = INVALID_GPA; 5328 } 5329 5330 /* Emulate the VMXOFF instruction */ 5331 static int handle_vmxoff(struct kvm_vcpu *vcpu) 5332 { 5333 if (!nested_vmx_check_permission(vcpu)) 5334 return 1; 5335 5336 free_nested(vcpu); 5337 5338 if (kvm_apic_has_pending_init_or_sipi(vcpu)) 5339 kvm_make_request(KVM_REQ_EVENT, vcpu); 5340 5341 return nested_vmx_succeed(vcpu); 5342 } 5343 5344 /* Emulate the VMCLEAR instruction */ 5345 static int handle_vmclear(struct kvm_vcpu *vcpu) 5346 { 5347 struct vcpu_vmx *vmx = to_vmx(vcpu); 5348 u32 zero = 0; 5349 gpa_t vmptr; 5350 int r; 5351 5352 if (!nested_vmx_check_permission(vcpu)) 5353 return 1; 5354 5355 if (nested_vmx_get_vmptr(vcpu, &vmptr, &r)) 5356 return r; 5357 5358 if (!page_address_valid(vcpu, vmptr)) 5359 return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS); 5360 5361 if (vmptr == vmx->nested.vmxon_ptr) 5362 return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_VMXON_POINTER); 5363 5364 if (likely(!nested_evmcs_handle_vmclear(vcpu, vmptr))) { 5365 if (vmptr == vmx->nested.current_vmptr) 5366 nested_release_vmcs12(vcpu); 5367 5368 /* 5369 * Silently ignore memory errors on VMCLEAR, Intel's pseudocode 5370 * for VMCLEAR includes a "ensure that data for VMCS referenced 5371 * by the operand is in memory" clause that guards writes to 5372 * memory, i.e. doing nothing for I/O is architecturally valid. 5373 * 5374 * FIXME: Suppress failures if and only if no memslot is found, 5375 * i.e. exit to userspace if __copy_to_user() fails. 5376 */ 5377 (void)kvm_vcpu_write_guest(vcpu, 5378 vmptr + offsetof(struct vmcs12, 5379 launch_state), 5380 &zero, sizeof(zero)); 5381 } 5382 5383 return nested_vmx_succeed(vcpu); 5384 } 5385 5386 /* Emulate the VMLAUNCH instruction */ 5387 static int handle_vmlaunch(struct kvm_vcpu *vcpu) 5388 { 5389 return nested_vmx_run(vcpu, true); 5390 } 5391 5392 /* Emulate the VMRESUME instruction */ 5393 static int handle_vmresume(struct kvm_vcpu *vcpu) 5394 { 5395 5396 return nested_vmx_run(vcpu, false); 5397 } 5398 5399 static int handle_vmread(struct kvm_vcpu *vcpu) 5400 { 5401 struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) 5402 : get_vmcs12(vcpu); 5403 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5404 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5405 struct vcpu_vmx *vmx = to_vmx(vcpu); 5406 struct x86_exception e; 5407 unsigned long field; 5408 u64 value; 5409 gva_t gva = 0; 5410 short offset; 5411 int len, r; 5412 5413 if (!nested_vmx_check_permission(vcpu)) 5414 return 1; 5415 5416 /* Decode instruction info and find the field to read */ 5417 field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf)); 5418 5419 if (!nested_vmx_is_evmptr12_valid(vmx)) { 5420 /* 5421 * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA, 5422 * any VMREAD sets the ALU flags for VMfailInvalid. 5423 */ 5424 if (vmx->nested.current_vmptr == INVALID_GPA || 5425 (is_guest_mode(vcpu) && 5426 get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA)) 5427 return nested_vmx_failInvalid(vcpu); 5428 5429 offset = get_vmcs12_field_offset(field); 5430 if (offset < 0) 5431 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 5432 5433 if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field)) 5434 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 5435 5436 /* Read the field, zero-extended to a u64 value */ 5437 value = vmcs12_read_any(vmcs12, field, offset); 5438 } else { 5439 /* 5440 * Hyper-V TLFS (as of 6.0b) explicitly states, that while an 5441 * enlightened VMCS is active VMREAD/VMWRITE instructions are 5442 * unsupported. Unfortunately, certain versions of Windows 11 5443 * don't comply with this requirement which is not enforced in 5444 * genuine Hyper-V. Allow VMREAD from an enlightened VMCS as a 5445 * workaround, as misbehaving guests will panic on VM-Fail. 5446 * Note, enlightened VMCS is incompatible with shadow VMCS so 5447 * all VMREADs from L2 should go to L1. 5448 */ 5449 if (WARN_ON_ONCE(is_guest_mode(vcpu))) 5450 return nested_vmx_failInvalid(vcpu); 5451 5452 offset = evmcs_field_offset(field, NULL); 5453 if (offset < 0) 5454 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 5455 5456 /* Read the field, zero-extended to a u64 value */ 5457 value = evmcs_read_any(nested_vmx_evmcs(vmx), field, offset); 5458 } 5459 5460 /* 5461 * Now copy part of this value to register or memory, as requested. 5462 * Note that the number of bits actually copied is 32 or 64 depending 5463 * on the guest's mode (32 or 64 bit), not on the given field's length. 5464 */ 5465 if (instr_info & BIT(10)) { 5466 kvm_register_write(vcpu, (((instr_info) >> 3) & 0xf), value); 5467 } else { 5468 len = is_64_bit_mode(vcpu) ? 8 : 4; 5469 if (get_vmx_mem_address(vcpu, exit_qualification, 5470 instr_info, true, len, &gva)) 5471 return 1; 5472 /* _system ok, nested_vmx_check_permission has verified cpl=0 */ 5473 r = kvm_write_guest_virt_system(vcpu, gva, &value, len, &e); 5474 if (r != X86EMUL_CONTINUE) 5475 return kvm_handle_memory_failure(vcpu, r, &e); 5476 } 5477 5478 return nested_vmx_succeed(vcpu); 5479 } 5480 5481 static bool is_shadow_field_rw(unsigned long field) 5482 { 5483 switch (field) { 5484 #define SHADOW_FIELD_RW(x, y) case x: 5485 #include "vmcs_shadow_fields.h" 5486 return true; 5487 default: 5488 break; 5489 } 5490 return false; 5491 } 5492 5493 static bool is_shadow_field_ro(unsigned long field) 5494 { 5495 switch (field) { 5496 #define SHADOW_FIELD_RO(x, y) case x: 5497 #include "vmcs_shadow_fields.h" 5498 return true; 5499 default: 5500 break; 5501 } 5502 return false; 5503 } 5504 5505 static int handle_vmwrite(struct kvm_vcpu *vcpu) 5506 { 5507 struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) 5508 : get_vmcs12(vcpu); 5509 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5510 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5511 struct vcpu_vmx *vmx = to_vmx(vcpu); 5512 struct x86_exception e; 5513 unsigned long field; 5514 short offset; 5515 gva_t gva; 5516 int len, r; 5517 5518 /* 5519 * The value to write might be 32 or 64 bits, depending on L1's long 5520 * mode, and eventually we need to write that into a field of several 5521 * possible lengths. The code below first zero-extends the value to 64 5522 * bit (value), and then copies only the appropriate number of 5523 * bits into the vmcs12 field. 5524 */ 5525 u64 value = 0; 5526 5527 if (!nested_vmx_check_permission(vcpu)) 5528 return 1; 5529 5530 /* 5531 * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA, 5532 * any VMWRITE sets the ALU flags for VMfailInvalid. 5533 */ 5534 if (vmx->nested.current_vmptr == INVALID_GPA || 5535 (is_guest_mode(vcpu) && 5536 get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA)) 5537 return nested_vmx_failInvalid(vcpu); 5538 5539 if (instr_info & BIT(10)) 5540 value = kvm_register_read(vcpu, (((instr_info) >> 3) & 0xf)); 5541 else { 5542 len = is_64_bit_mode(vcpu) ? 8 : 4; 5543 if (get_vmx_mem_address(vcpu, exit_qualification, 5544 instr_info, false, len, &gva)) 5545 return 1; 5546 r = kvm_read_guest_virt(vcpu, gva, &value, len, &e); 5547 if (r != X86EMUL_CONTINUE) 5548 return kvm_handle_memory_failure(vcpu, r, &e); 5549 } 5550 5551 field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf)); 5552 5553 offset = get_vmcs12_field_offset(field); 5554 if (offset < 0) 5555 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 5556 5557 /* 5558 * If the vCPU supports "VMWRITE to any supported field in the 5559 * VMCS," then the "read-only" fields are actually read/write. 5560 */ 5561 if (vmcs_field_readonly(field) && 5562 !nested_cpu_has_vmwrite_any_field(vcpu)) 5563 return nested_vmx_fail(vcpu, VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); 5564 5565 /* 5566 * Ensure vmcs12 is up-to-date before any VMWRITE that dirties 5567 * vmcs12, else we may crush a field or consume a stale value. 5568 */ 5569 if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) 5570 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 5571 5572 /* 5573 * Some Intel CPUs intentionally drop the reserved bits of the AR byte 5574 * fields on VMWRITE. Emulate this behavior to ensure consistent KVM 5575 * behavior regardless of the underlying hardware, e.g. if an AR_BYTE 5576 * field is intercepted for VMWRITE but not VMREAD (in L1), then VMREAD 5577 * from L1 will return a different value than VMREAD from L2 (L1 sees 5578 * the stripped down value, L2 sees the full value as stored by KVM). 5579 */ 5580 if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES) 5581 value &= 0x1f0ff; 5582 5583 vmcs12_write_any(vmcs12, field, offset, value); 5584 5585 /* 5586 * Do not track vmcs12 dirty-state if in guest-mode as we actually 5587 * dirty shadow vmcs12 instead of vmcs12. Fields that can be updated 5588 * by L1 without a vmexit are always updated in the vmcs02, i.e. don't 5589 * "dirty" vmcs12, all others go down the prepare_vmcs02() slow path. 5590 */ 5591 if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) { 5592 /* 5593 * L1 can read these fields without exiting, ensure the 5594 * shadow VMCS is up-to-date. 5595 */ 5596 if (enable_shadow_vmcs && is_shadow_field_ro(field)) { 5597 preempt_disable(); 5598 vmcs_load(vmx->vmcs01.shadow_vmcs); 5599 5600 __vmcs_writel(field, value); 5601 5602 vmcs_clear(vmx->vmcs01.shadow_vmcs); 5603 vmcs_load(vmx->loaded_vmcs->vmcs); 5604 preempt_enable(); 5605 } 5606 vmx->nested.dirty_vmcs12 = true; 5607 } 5608 5609 return nested_vmx_succeed(vcpu); 5610 } 5611 5612 static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr) 5613 { 5614 vmx->nested.current_vmptr = vmptr; 5615 if (enable_shadow_vmcs) { 5616 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_SHADOW_VMCS); 5617 vmcs_write64(VMCS_LINK_POINTER, 5618 __pa(vmx->vmcs01.shadow_vmcs)); 5619 vmx->nested.need_vmcs12_to_shadow_sync = true; 5620 } 5621 vmx->nested.dirty_vmcs12 = true; 5622 vmx->nested.force_msr_bitmap_recalc = true; 5623 } 5624 5625 /* Emulate the VMPTRLD instruction */ 5626 static int handle_vmptrld(struct kvm_vcpu *vcpu) 5627 { 5628 struct vcpu_vmx *vmx = to_vmx(vcpu); 5629 gpa_t vmptr; 5630 int r; 5631 5632 if (!nested_vmx_check_permission(vcpu)) 5633 return 1; 5634 5635 if (nested_vmx_get_vmptr(vcpu, &vmptr, &r)) 5636 return r; 5637 5638 if (!page_address_valid(vcpu, vmptr)) 5639 return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS); 5640 5641 if (vmptr == vmx->nested.vmxon_ptr) 5642 return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_VMXON_POINTER); 5643 5644 /* Forbid normal VMPTRLD if Enlightened version was used */ 5645 if (nested_vmx_is_evmptr12_valid(vmx)) 5646 return 1; 5647 5648 if (vmx->nested.current_vmptr != vmptr) { 5649 struct gfn_to_hva_cache *ghc = &vmx->nested.vmcs12_cache; 5650 struct vmcs_hdr hdr; 5651 5652 if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, vmptr, VMCS12_SIZE)) { 5653 /* 5654 * Reads from an unbacked page return all 1s, 5655 * which means that the 32 bits located at the 5656 * given physical address won't match the required 5657 * VMCS12_REVISION identifier. 5658 */ 5659 return nested_vmx_fail(vcpu, 5660 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5661 } 5662 5663 if (kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr, 5664 offsetof(struct vmcs12, hdr), 5665 sizeof(hdr))) { 5666 return nested_vmx_fail(vcpu, 5667 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5668 } 5669 5670 if (hdr.revision_id != VMCS12_REVISION || 5671 (hdr.shadow_vmcs && 5672 !nested_cpu_has_vmx_shadow_vmcs(vcpu))) { 5673 return nested_vmx_fail(vcpu, 5674 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5675 } 5676 5677 nested_release_vmcs12(vcpu); 5678 5679 /* 5680 * Load VMCS12 from guest memory since it is not already 5681 * cached. 5682 */ 5683 if (kvm_read_guest_cached(vcpu->kvm, ghc, vmx->nested.cached_vmcs12, 5684 VMCS12_SIZE)) { 5685 return nested_vmx_fail(vcpu, 5686 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5687 } 5688 5689 set_current_vmptr(vmx, vmptr); 5690 } 5691 5692 return nested_vmx_succeed(vcpu); 5693 } 5694 5695 /* Emulate the VMPTRST instruction */ 5696 static int handle_vmptrst(struct kvm_vcpu *vcpu) 5697 { 5698 unsigned long exit_qual = vmx_get_exit_qual(vcpu); 5699 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5700 gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr; 5701 struct x86_exception e; 5702 gva_t gva; 5703 int r; 5704 5705 if (!nested_vmx_check_permission(vcpu)) 5706 return 1; 5707 5708 if (unlikely(nested_vmx_is_evmptr12_valid(to_vmx(vcpu)))) 5709 return 1; 5710 5711 if (get_vmx_mem_address(vcpu, exit_qual, instr_info, 5712 true, sizeof(gpa_t), &gva)) 5713 return 1; 5714 /* *_system ok, nested_vmx_check_permission has verified cpl=0 */ 5715 r = kvm_write_guest_virt_system(vcpu, gva, (void *)¤t_vmptr, 5716 sizeof(gpa_t), &e); 5717 if (r != X86EMUL_CONTINUE) 5718 return kvm_handle_memory_failure(vcpu, r, &e); 5719 5720 return nested_vmx_succeed(vcpu); 5721 } 5722 5723 /* Emulate the INVEPT instruction */ 5724 static int handle_invept(struct kvm_vcpu *vcpu) 5725 { 5726 struct vcpu_vmx *vmx = to_vmx(vcpu); 5727 u32 vmx_instruction_info, types; 5728 unsigned long type, roots_to_free; 5729 struct kvm_mmu *mmu; 5730 gva_t gva; 5731 struct x86_exception e; 5732 struct { 5733 u64 eptp, gpa; 5734 } operand; 5735 int i, r, gpr_index; 5736 5737 if (!(vmx->nested.msrs.secondary_ctls_high & 5738 SECONDARY_EXEC_ENABLE_EPT) || 5739 !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) { 5740 kvm_queue_exception(vcpu, UD_VECTOR); 5741 return 1; 5742 } 5743 5744 if (!nested_vmx_check_permission(vcpu)) 5745 return 1; 5746 5747 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5748 gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); 5749 type = kvm_register_read(vcpu, gpr_index); 5750 5751 types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; 5752 5753 if (type >= 32 || !(types & (1 << type))) 5754 return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5755 5756 /* According to the Intel VMX instruction reference, the memory 5757 * operand is read even if it isn't needed (e.g., for type==global) 5758 */ 5759 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 5760 vmx_instruction_info, false, sizeof(operand), &gva)) 5761 return 1; 5762 r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); 5763 if (r != X86EMUL_CONTINUE) 5764 return kvm_handle_memory_failure(vcpu, r, &e); 5765 5766 /* 5767 * Nested EPT roots are always held through guest_mmu, 5768 * not root_mmu. 5769 */ 5770 mmu = &vcpu->arch.guest_mmu; 5771 5772 switch (type) { 5773 case VMX_EPT_EXTENT_CONTEXT: 5774 if (!nested_vmx_check_eptp(vcpu, operand.eptp)) 5775 return nested_vmx_fail(vcpu, 5776 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5777 5778 roots_to_free = 0; 5779 if (nested_ept_root_matches(mmu->root.hpa, mmu->root.pgd, 5780 operand.eptp)) 5781 roots_to_free |= KVM_MMU_ROOT_CURRENT; 5782 5783 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { 5784 if (nested_ept_root_matches(mmu->prev_roots[i].hpa, 5785 mmu->prev_roots[i].pgd, 5786 operand.eptp)) 5787 roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); 5788 } 5789 break; 5790 case VMX_EPT_EXTENT_GLOBAL: 5791 roots_to_free = KVM_MMU_ROOTS_ALL; 5792 break; 5793 default: 5794 BUG(); 5795 break; 5796 } 5797 5798 if (roots_to_free) 5799 kvm_mmu_free_roots(vcpu->kvm, mmu, roots_to_free); 5800 5801 return nested_vmx_succeed(vcpu); 5802 } 5803 5804 static int handle_invvpid(struct kvm_vcpu *vcpu) 5805 { 5806 struct vcpu_vmx *vmx = to_vmx(vcpu); 5807 u32 vmx_instruction_info; 5808 unsigned long type, types; 5809 gva_t gva; 5810 struct x86_exception e; 5811 struct { 5812 u64 vpid; 5813 u64 gla; 5814 } operand; 5815 u16 vpid02; 5816 int r, gpr_index; 5817 5818 if (!(vmx->nested.msrs.secondary_ctls_high & 5819 SECONDARY_EXEC_ENABLE_VPID) || 5820 !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) { 5821 kvm_queue_exception(vcpu, UD_VECTOR); 5822 return 1; 5823 } 5824 5825 if (!nested_vmx_check_permission(vcpu)) 5826 return 1; 5827 5828 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5829 gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); 5830 type = kvm_register_read(vcpu, gpr_index); 5831 5832 types = (vmx->nested.msrs.vpid_caps & 5833 VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8; 5834 5835 if (type >= 32 || !(types & (1 << type))) 5836 return nested_vmx_fail(vcpu, 5837 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5838 5839 /* according to the intel vmx instruction reference, the memory 5840 * operand is read even if it isn't needed (e.g., for type==global) 5841 */ 5842 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 5843 vmx_instruction_info, false, sizeof(operand), &gva)) 5844 return 1; 5845 r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); 5846 if (r != X86EMUL_CONTINUE) 5847 return kvm_handle_memory_failure(vcpu, r, &e); 5848 5849 if (operand.vpid >> 16) 5850 return nested_vmx_fail(vcpu, 5851 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5852 5853 vpid02 = nested_get_vpid02(vcpu); 5854 switch (type) { 5855 case VMX_VPID_EXTENT_INDIVIDUAL_ADDR: 5856 /* 5857 * LAM doesn't apply to addresses that are inputs to TLB 5858 * invalidation. 5859 */ 5860 if (!operand.vpid || 5861 is_noncanonical_address(operand.gla, vcpu)) 5862 return nested_vmx_fail(vcpu, 5863 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5864 vpid_sync_vcpu_addr(vpid02, operand.gla); 5865 break; 5866 case VMX_VPID_EXTENT_SINGLE_CONTEXT: 5867 case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL: 5868 if (!operand.vpid) 5869 return nested_vmx_fail(vcpu, 5870 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5871 vpid_sync_context(vpid02); 5872 break; 5873 case VMX_VPID_EXTENT_ALL_CONTEXT: 5874 vpid_sync_context(vpid02); 5875 break; 5876 default: 5877 WARN_ON_ONCE(1); 5878 return kvm_skip_emulated_instruction(vcpu); 5879 } 5880 5881 /* 5882 * Sync the shadow page tables if EPT is disabled, L1 is invalidating 5883 * linear mappings for L2 (tagged with L2's VPID). Free all guest 5884 * roots as VPIDs are not tracked in the MMU role. 5885 * 5886 * Note, this operates on root_mmu, not guest_mmu, as L1 and L2 share 5887 * an MMU when EPT is disabled. 5888 * 5889 * TODO: sync only the affected SPTEs for INVDIVIDUAL_ADDR. 5890 */ 5891 if (!enable_ept) 5892 kvm_mmu_free_guest_mode_roots(vcpu->kvm, &vcpu->arch.root_mmu); 5893 5894 return nested_vmx_succeed(vcpu); 5895 } 5896 5897 static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu, 5898 struct vmcs12 *vmcs12) 5899 { 5900 u32 index = kvm_rcx_read(vcpu); 5901 u64 new_eptp; 5902 5903 if (WARN_ON_ONCE(!nested_cpu_has_ept(vmcs12))) 5904 return 1; 5905 if (index >= VMFUNC_EPTP_ENTRIES) 5906 return 1; 5907 5908 if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT, 5909 &new_eptp, index * 8, 8)) 5910 return 1; 5911 5912 /* 5913 * If the (L2) guest does a vmfunc to the currently 5914 * active ept pointer, we don't have to do anything else 5915 */ 5916 if (vmcs12->ept_pointer != new_eptp) { 5917 if (!nested_vmx_check_eptp(vcpu, new_eptp)) 5918 return 1; 5919 5920 vmcs12->ept_pointer = new_eptp; 5921 nested_ept_new_eptp(vcpu); 5922 5923 if (!nested_cpu_has_vpid(vmcs12)) 5924 kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); 5925 } 5926 5927 return 0; 5928 } 5929 5930 static int handle_vmfunc(struct kvm_vcpu *vcpu) 5931 { 5932 struct vcpu_vmx *vmx = to_vmx(vcpu); 5933 struct vmcs12 *vmcs12; 5934 u32 function = kvm_rax_read(vcpu); 5935 5936 /* 5937 * VMFUNC should never execute cleanly while L1 is active; KVM supports 5938 * VMFUNC for nested VMs, but not for L1. 5939 */ 5940 if (WARN_ON_ONCE(!is_guest_mode(vcpu))) { 5941 kvm_queue_exception(vcpu, UD_VECTOR); 5942 return 1; 5943 } 5944 5945 vmcs12 = get_vmcs12(vcpu); 5946 5947 /* 5948 * #UD on out-of-bounds function has priority over VM-Exit, and VMFUNC 5949 * is enabled in vmcs02 if and only if it's enabled in vmcs12. 5950 */ 5951 if (WARN_ON_ONCE((function > 63) || !nested_cpu_has_vmfunc(vmcs12))) { 5952 kvm_queue_exception(vcpu, UD_VECTOR); 5953 return 1; 5954 } 5955 5956 if (!(vmcs12->vm_function_control & BIT_ULL(function))) 5957 goto fail; 5958 5959 switch (function) { 5960 case 0: 5961 if (nested_vmx_eptp_switching(vcpu, vmcs12)) 5962 goto fail; 5963 break; 5964 default: 5965 goto fail; 5966 } 5967 return kvm_skip_emulated_instruction(vcpu); 5968 5969 fail: 5970 /* 5971 * This is effectively a reflected VM-Exit, as opposed to a synthesized 5972 * nested VM-Exit. Pass the original exit reason, i.e. don't hardcode 5973 * EXIT_REASON_VMFUNC as the exit reason. 5974 */ 5975 nested_vmx_vmexit(vcpu, vmx->exit_reason.full, 5976 vmx_get_intr_info(vcpu), 5977 vmx_get_exit_qual(vcpu)); 5978 return 1; 5979 } 5980 5981 /* 5982 * Return true if an IO instruction with the specified port and size should cause 5983 * a VM-exit into L1. 5984 */ 5985 bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port, 5986 int size) 5987 { 5988 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 5989 gpa_t bitmap, last_bitmap; 5990 u8 b; 5991 5992 last_bitmap = INVALID_GPA; 5993 b = -1; 5994 5995 while (size > 0) { 5996 if (port < 0x8000) 5997 bitmap = vmcs12->io_bitmap_a; 5998 else if (port < 0x10000) 5999 bitmap = vmcs12->io_bitmap_b; 6000 else 6001 return true; 6002 bitmap += (port & 0x7fff) / 8; 6003 6004 if (last_bitmap != bitmap) 6005 if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1)) 6006 return true; 6007 if (b & (1 << (port & 7))) 6008 return true; 6009 6010 port++; 6011 size--; 6012 last_bitmap = bitmap; 6013 } 6014 6015 return false; 6016 } 6017 6018 static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, 6019 struct vmcs12 *vmcs12) 6020 { 6021 unsigned long exit_qualification; 6022 unsigned short port; 6023 int size; 6024 6025 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) 6026 return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); 6027 6028 exit_qualification = vmx_get_exit_qual(vcpu); 6029 6030 port = exit_qualification >> 16; 6031 size = (exit_qualification & 7) + 1; 6032 6033 return nested_vmx_check_io_bitmaps(vcpu, port, size); 6034 } 6035 6036 /* 6037 * Return 1 if we should exit from L2 to L1 to handle an MSR access, 6038 * rather than handle it ourselves in L0. I.e., check whether L1 expressed 6039 * disinterest in the current event (read or write a specific MSR) by using an 6040 * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps. 6041 */ 6042 static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, 6043 struct vmcs12 *vmcs12, 6044 union vmx_exit_reason exit_reason) 6045 { 6046 u32 msr_index = kvm_rcx_read(vcpu); 6047 gpa_t bitmap; 6048 6049 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) 6050 return true; 6051 6052 /* 6053 * The MSR_BITMAP page is divided into four 1024-byte bitmaps, 6054 * for the four combinations of read/write and low/high MSR numbers. 6055 * First we need to figure out which of the four to use: 6056 */ 6057 bitmap = vmcs12->msr_bitmap; 6058 if (exit_reason.basic == EXIT_REASON_MSR_WRITE) 6059 bitmap += 2048; 6060 if (msr_index >= 0xc0000000) { 6061 msr_index -= 0xc0000000; 6062 bitmap += 1024; 6063 } 6064 6065 /* Then read the msr_index'th bit from this bitmap: */ 6066 if (msr_index < 1024*8) { 6067 unsigned char b; 6068 if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1)) 6069 return true; 6070 return 1 & (b >> (msr_index & 7)); 6071 } else 6072 return true; /* let L1 handle the wrong parameter */ 6073 } 6074 6075 /* 6076 * Return 1 if we should exit from L2 to L1 to handle a CR access exit, 6077 * rather than handle it ourselves in L0. I.e., check if L1 wanted to 6078 * intercept (via guest_host_mask etc.) the current event. 6079 */ 6080 static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, 6081 struct vmcs12 *vmcs12) 6082 { 6083 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 6084 int cr = exit_qualification & 15; 6085 int reg; 6086 unsigned long val; 6087 6088 switch ((exit_qualification >> 4) & 3) { 6089 case 0: /* mov to cr */ 6090 reg = (exit_qualification >> 8) & 15; 6091 val = kvm_register_read(vcpu, reg); 6092 switch (cr) { 6093 case 0: 6094 if (vmcs12->cr0_guest_host_mask & 6095 (val ^ vmcs12->cr0_read_shadow)) 6096 return true; 6097 break; 6098 case 3: 6099 if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING)) 6100 return true; 6101 break; 6102 case 4: 6103 if (vmcs12->cr4_guest_host_mask & 6104 (vmcs12->cr4_read_shadow ^ val)) 6105 return true; 6106 break; 6107 case 8: 6108 if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING)) 6109 return true; 6110 break; 6111 } 6112 break; 6113 case 2: /* clts */ 6114 if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) && 6115 (vmcs12->cr0_read_shadow & X86_CR0_TS)) 6116 return true; 6117 break; 6118 case 1: /* mov from cr */ 6119 switch (cr) { 6120 case 3: 6121 if (vmcs12->cpu_based_vm_exec_control & 6122 CPU_BASED_CR3_STORE_EXITING) 6123 return true; 6124 break; 6125 case 8: 6126 if (vmcs12->cpu_based_vm_exec_control & 6127 CPU_BASED_CR8_STORE_EXITING) 6128 return true; 6129 break; 6130 } 6131 break; 6132 case 3: /* lmsw */ 6133 /* 6134 * lmsw can change bits 1..3 of cr0, and only set bit 0 of 6135 * cr0. Other attempted changes are ignored, with no exit. 6136 */ 6137 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; 6138 if (vmcs12->cr0_guest_host_mask & 0xe & 6139 (val ^ vmcs12->cr0_read_shadow)) 6140 return true; 6141 if ((vmcs12->cr0_guest_host_mask & 0x1) && 6142 !(vmcs12->cr0_read_shadow & 0x1) && 6143 (val & 0x1)) 6144 return true; 6145 break; 6146 } 6147 return false; 6148 } 6149 6150 static bool nested_vmx_exit_handled_encls(struct kvm_vcpu *vcpu, 6151 struct vmcs12 *vmcs12) 6152 { 6153 u32 encls_leaf; 6154 6155 if (!guest_cpuid_has(vcpu, X86_FEATURE_SGX) || 6156 !nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING)) 6157 return false; 6158 6159 encls_leaf = kvm_rax_read(vcpu); 6160 if (encls_leaf > 62) 6161 encls_leaf = 63; 6162 return vmcs12->encls_exiting_bitmap & BIT_ULL(encls_leaf); 6163 } 6164 6165 static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu, 6166 struct vmcs12 *vmcs12, gpa_t bitmap) 6167 { 6168 u32 vmx_instruction_info; 6169 unsigned long field; 6170 u8 b; 6171 6172 if (!nested_cpu_has_shadow_vmcs(vmcs12)) 6173 return true; 6174 6175 /* Decode instruction info and find the field to access */ 6176 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 6177 field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); 6178 6179 /* Out-of-range fields always cause a VM exit from L2 to L1 */ 6180 if (field >> 15) 6181 return true; 6182 6183 if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1)) 6184 return true; 6185 6186 return 1 & (b >> (field & 7)); 6187 } 6188 6189 static bool nested_vmx_exit_handled_mtf(struct vmcs12 *vmcs12) 6190 { 6191 u32 entry_intr_info = vmcs12->vm_entry_intr_info_field; 6192 6193 if (nested_cpu_has_mtf(vmcs12)) 6194 return true; 6195 6196 /* 6197 * An MTF VM-exit may be injected into the guest by setting the 6198 * interruption-type to 7 (other event) and the vector field to 0. Such 6199 * is the case regardless of the 'monitor trap flag' VM-execution 6200 * control. 6201 */ 6202 return entry_intr_info == (INTR_INFO_VALID_MASK 6203 | INTR_TYPE_OTHER_EVENT); 6204 } 6205 6206 /* 6207 * Return true if L0 wants to handle an exit from L2 regardless of whether or not 6208 * L1 wants the exit. Only call this when in is_guest_mode (L2). 6209 */ 6210 static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, 6211 union vmx_exit_reason exit_reason) 6212 { 6213 u32 intr_info; 6214 6215 switch ((u16)exit_reason.basic) { 6216 case EXIT_REASON_EXCEPTION_NMI: 6217 intr_info = vmx_get_intr_info(vcpu); 6218 if (is_nmi(intr_info)) 6219 return true; 6220 else if (is_page_fault(intr_info)) 6221 return vcpu->arch.apf.host_apf_flags || 6222 vmx_need_pf_intercept(vcpu); 6223 else if (is_debug(intr_info) && 6224 vcpu->guest_debug & 6225 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) 6226 return true; 6227 else if (is_breakpoint(intr_info) && 6228 vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) 6229 return true; 6230 else if (is_alignment_check(intr_info) && 6231 !vmx_guest_inject_ac(vcpu)) 6232 return true; 6233 return false; 6234 case EXIT_REASON_EXTERNAL_INTERRUPT: 6235 return true; 6236 case EXIT_REASON_MCE_DURING_VMENTRY: 6237 return true; 6238 case EXIT_REASON_EPT_VIOLATION: 6239 /* 6240 * L0 always deals with the EPT violation. If nested EPT is 6241 * used, and the nested mmu code discovers that the address is 6242 * missing in the guest EPT table (EPT12), the EPT violation 6243 * will be injected with nested_ept_inject_page_fault() 6244 */ 6245 return true; 6246 case EXIT_REASON_EPT_MISCONFIG: 6247 /* 6248 * L2 never uses directly L1's EPT, but rather L0's own EPT 6249 * table (shadow on EPT) or a merged EPT table that L0 built 6250 * (EPT on EPT). So any problems with the structure of the 6251 * table is L0's fault. 6252 */ 6253 return true; 6254 case EXIT_REASON_PREEMPTION_TIMER: 6255 return true; 6256 case EXIT_REASON_PML_FULL: 6257 /* 6258 * PML is emulated for an L1 VMM and should never be enabled in 6259 * vmcs02, always "handle" PML_FULL by exiting to userspace. 6260 */ 6261 return true; 6262 case EXIT_REASON_VMFUNC: 6263 /* VM functions are emulated through L2->L0 vmexits. */ 6264 return true; 6265 case EXIT_REASON_BUS_LOCK: 6266 /* 6267 * At present, bus lock VM exit is never exposed to L1. 6268 * Handle L2's bus locks in L0 directly. 6269 */ 6270 return true; 6271 #ifdef CONFIG_KVM_HYPERV 6272 case EXIT_REASON_VMCALL: 6273 /* Hyper-V L2 TLB flush hypercall is handled by L0 */ 6274 return guest_hv_cpuid_has_l2_tlb_flush(vcpu) && 6275 nested_evmcs_l2_tlb_flush_enabled(vcpu) && 6276 kvm_hv_is_tlb_flush_hcall(vcpu); 6277 #endif 6278 default: 6279 break; 6280 } 6281 return false; 6282 } 6283 6284 /* 6285 * Return 1 if L1 wants to intercept an exit from L2. Only call this when in 6286 * is_guest_mode (L2). 6287 */ 6288 static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, 6289 union vmx_exit_reason exit_reason) 6290 { 6291 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 6292 u32 intr_info; 6293 6294 switch ((u16)exit_reason.basic) { 6295 case EXIT_REASON_EXCEPTION_NMI: 6296 intr_info = vmx_get_intr_info(vcpu); 6297 if (is_nmi(intr_info)) 6298 return true; 6299 else if (is_page_fault(intr_info)) 6300 return true; 6301 return vmcs12->exception_bitmap & 6302 (1u << (intr_info & INTR_INFO_VECTOR_MASK)); 6303 case EXIT_REASON_EXTERNAL_INTERRUPT: 6304 return nested_exit_on_intr(vcpu); 6305 case EXIT_REASON_TRIPLE_FAULT: 6306 return true; 6307 case EXIT_REASON_INTERRUPT_WINDOW: 6308 return nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING); 6309 case EXIT_REASON_NMI_WINDOW: 6310 return nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING); 6311 case EXIT_REASON_TASK_SWITCH: 6312 return true; 6313 case EXIT_REASON_CPUID: 6314 return true; 6315 case EXIT_REASON_HLT: 6316 return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING); 6317 case EXIT_REASON_INVD: 6318 return true; 6319 case EXIT_REASON_INVLPG: 6320 return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); 6321 case EXIT_REASON_RDPMC: 6322 return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING); 6323 case EXIT_REASON_RDRAND: 6324 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING); 6325 case EXIT_REASON_RDSEED: 6326 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING); 6327 case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP: 6328 return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING); 6329 case EXIT_REASON_VMREAD: 6330 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, 6331 vmcs12->vmread_bitmap); 6332 case EXIT_REASON_VMWRITE: 6333 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, 6334 vmcs12->vmwrite_bitmap); 6335 case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: 6336 case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD: 6337 case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME: 6338 case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: 6339 case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID: 6340 /* 6341 * VMX instructions trap unconditionally. This allows L1 to 6342 * emulate them for its L2 guest, i.e., allows 3-level nesting! 6343 */ 6344 return true; 6345 case EXIT_REASON_CR_ACCESS: 6346 return nested_vmx_exit_handled_cr(vcpu, vmcs12); 6347 case EXIT_REASON_DR_ACCESS: 6348 return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING); 6349 case EXIT_REASON_IO_INSTRUCTION: 6350 return nested_vmx_exit_handled_io(vcpu, vmcs12); 6351 case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR: 6352 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC); 6353 case EXIT_REASON_MSR_READ: 6354 case EXIT_REASON_MSR_WRITE: 6355 return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason); 6356 case EXIT_REASON_INVALID_STATE: 6357 return true; 6358 case EXIT_REASON_MWAIT_INSTRUCTION: 6359 return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING); 6360 case EXIT_REASON_MONITOR_TRAP_FLAG: 6361 return nested_vmx_exit_handled_mtf(vmcs12); 6362 case EXIT_REASON_MONITOR_INSTRUCTION: 6363 return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING); 6364 case EXIT_REASON_PAUSE_INSTRUCTION: 6365 return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) || 6366 nested_cpu_has2(vmcs12, 6367 SECONDARY_EXEC_PAUSE_LOOP_EXITING); 6368 case EXIT_REASON_MCE_DURING_VMENTRY: 6369 return true; 6370 case EXIT_REASON_TPR_BELOW_THRESHOLD: 6371 return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW); 6372 case EXIT_REASON_APIC_ACCESS: 6373 case EXIT_REASON_APIC_WRITE: 6374 case EXIT_REASON_EOI_INDUCED: 6375 /* 6376 * The controls for "virtualize APIC accesses," "APIC- 6377 * register virtualization," and "virtual-interrupt 6378 * delivery" only come from vmcs12. 6379 */ 6380 return true; 6381 case EXIT_REASON_INVPCID: 6382 return 6383 nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) && 6384 nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); 6385 case EXIT_REASON_WBINVD: 6386 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); 6387 case EXIT_REASON_XSETBV: 6388 return true; 6389 case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS: 6390 /* 6391 * This should never happen, since it is not possible to 6392 * set XSS to a non-zero value---neither in L1 nor in L2. 6393 * If if it were, XSS would have to be checked against 6394 * the XSS exit bitmap in vmcs12. 6395 */ 6396 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_XSAVES); 6397 case EXIT_REASON_UMWAIT: 6398 case EXIT_REASON_TPAUSE: 6399 return nested_cpu_has2(vmcs12, 6400 SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE); 6401 case EXIT_REASON_ENCLS: 6402 return nested_vmx_exit_handled_encls(vcpu, vmcs12); 6403 case EXIT_REASON_NOTIFY: 6404 /* Notify VM exit is not exposed to L1 */ 6405 return false; 6406 default: 6407 return true; 6408 } 6409 } 6410 6411 /* 6412 * Conditionally reflect a VM-Exit into L1. Returns %true if the VM-Exit was 6413 * reflected into L1. 6414 */ 6415 bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu) 6416 { 6417 struct vcpu_vmx *vmx = to_vmx(vcpu); 6418 union vmx_exit_reason exit_reason = vmx->exit_reason; 6419 unsigned long exit_qual; 6420 u32 exit_intr_info; 6421 6422 WARN_ON_ONCE(vmx->nested.nested_run_pending); 6423 6424 /* 6425 * Late nested VM-Fail shares the same flow as nested VM-Exit since KVM 6426 * has already loaded L2's state. 6427 */ 6428 if (unlikely(vmx->fail)) { 6429 trace_kvm_nested_vmenter_failed( 6430 "hardware VM-instruction error: ", 6431 vmcs_read32(VM_INSTRUCTION_ERROR)); 6432 exit_intr_info = 0; 6433 exit_qual = 0; 6434 goto reflect_vmexit; 6435 } 6436 6437 trace_kvm_nested_vmexit(vcpu, KVM_ISA_VMX); 6438 6439 /* If L0 (KVM) wants the exit, it trumps L1's desires. */ 6440 if (nested_vmx_l0_wants_exit(vcpu, exit_reason)) 6441 return false; 6442 6443 /* If L1 doesn't want the exit, handle it in L0. */ 6444 if (!nested_vmx_l1_wants_exit(vcpu, exit_reason)) 6445 return false; 6446 6447 /* 6448 * vmcs.VM_EXIT_INTR_INFO is only valid for EXCEPTION_NMI exits. For 6449 * EXTERNAL_INTERRUPT, the value for vmcs12->vm_exit_intr_info would 6450 * need to be synthesized by querying the in-kernel LAPIC, but external 6451 * interrupts are never reflected to L1 so it's a non-issue. 6452 */ 6453 exit_intr_info = vmx_get_intr_info(vcpu); 6454 if (is_exception_with_error_code(exit_intr_info)) { 6455 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 6456 6457 vmcs12->vm_exit_intr_error_code = 6458 vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 6459 } 6460 exit_qual = vmx_get_exit_qual(vcpu); 6461 6462 reflect_vmexit: 6463 nested_vmx_vmexit(vcpu, exit_reason.full, exit_intr_info, exit_qual); 6464 return true; 6465 } 6466 6467 static int vmx_get_nested_state(struct kvm_vcpu *vcpu, 6468 struct kvm_nested_state __user *user_kvm_nested_state, 6469 u32 user_data_size) 6470 { 6471 struct vcpu_vmx *vmx; 6472 struct vmcs12 *vmcs12; 6473 struct kvm_nested_state kvm_state = { 6474 .flags = 0, 6475 .format = KVM_STATE_NESTED_FORMAT_VMX, 6476 .size = sizeof(kvm_state), 6477 .hdr.vmx.flags = 0, 6478 .hdr.vmx.vmxon_pa = INVALID_GPA, 6479 .hdr.vmx.vmcs12_pa = INVALID_GPA, 6480 .hdr.vmx.preemption_timer_deadline = 0, 6481 }; 6482 struct kvm_vmx_nested_state_data __user *user_vmx_nested_state = 6483 &user_kvm_nested_state->data.vmx[0]; 6484 6485 if (!vcpu) 6486 return kvm_state.size + sizeof(*user_vmx_nested_state); 6487 6488 vmx = to_vmx(vcpu); 6489 vmcs12 = get_vmcs12(vcpu); 6490 6491 if (guest_can_use(vcpu, X86_FEATURE_VMX) && 6492 (vmx->nested.vmxon || vmx->nested.smm.vmxon)) { 6493 kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr; 6494 kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr; 6495 6496 if (vmx_has_valid_vmcs12(vcpu)) { 6497 kvm_state.size += sizeof(user_vmx_nested_state->vmcs12); 6498 6499 /* 'hv_evmcs_vmptr' can also be EVMPTR_MAP_PENDING here */ 6500 if (nested_vmx_is_evmptr12_set(vmx)) 6501 kvm_state.flags |= KVM_STATE_NESTED_EVMCS; 6502 6503 if (is_guest_mode(vcpu) && 6504 nested_cpu_has_shadow_vmcs(vmcs12) && 6505 vmcs12->vmcs_link_pointer != INVALID_GPA) 6506 kvm_state.size += sizeof(user_vmx_nested_state->shadow_vmcs12); 6507 } 6508 6509 if (vmx->nested.smm.vmxon) 6510 kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON; 6511 6512 if (vmx->nested.smm.guest_mode) 6513 kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE; 6514 6515 if (is_guest_mode(vcpu)) { 6516 kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE; 6517 6518 if (vmx->nested.nested_run_pending) 6519 kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING; 6520 6521 if (vmx->nested.mtf_pending) 6522 kvm_state.flags |= KVM_STATE_NESTED_MTF_PENDING; 6523 6524 if (nested_cpu_has_preemption_timer(vmcs12) && 6525 vmx->nested.has_preemption_timer_deadline) { 6526 kvm_state.hdr.vmx.flags |= 6527 KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE; 6528 kvm_state.hdr.vmx.preemption_timer_deadline = 6529 vmx->nested.preemption_timer_deadline; 6530 } 6531 } 6532 } 6533 6534 if (user_data_size < kvm_state.size) 6535 goto out; 6536 6537 if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state))) 6538 return -EFAULT; 6539 6540 if (!vmx_has_valid_vmcs12(vcpu)) 6541 goto out; 6542 6543 /* 6544 * When running L2, the authoritative vmcs12 state is in the 6545 * vmcs02. When running L1, the authoritative vmcs12 state is 6546 * in the shadow or enlightened vmcs linked to vmcs01, unless 6547 * need_vmcs12_to_shadow_sync is set, in which case, the authoritative 6548 * vmcs12 state is in the vmcs12 already. 6549 */ 6550 if (is_guest_mode(vcpu)) { 6551 sync_vmcs02_to_vmcs12(vcpu, vmcs12); 6552 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 6553 } else { 6554 copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu)); 6555 if (!vmx->nested.need_vmcs12_to_shadow_sync) { 6556 if (nested_vmx_is_evmptr12_valid(vmx)) 6557 /* 6558 * L1 hypervisor is not obliged to keep eVMCS 6559 * clean fields data always up-to-date while 6560 * not in guest mode, 'hv_clean_fields' is only 6561 * supposed to be actual upon vmentry so we need 6562 * to ignore it here and do full copy. 6563 */ 6564 copy_enlightened_to_vmcs12(vmx, 0); 6565 else if (enable_shadow_vmcs) 6566 copy_shadow_to_vmcs12(vmx); 6567 } 6568 } 6569 6570 BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE); 6571 BUILD_BUG_ON(sizeof(user_vmx_nested_state->shadow_vmcs12) < VMCS12_SIZE); 6572 6573 /* 6574 * Copy over the full allocated size of vmcs12 rather than just the size 6575 * of the struct. 6576 */ 6577 if (copy_to_user(user_vmx_nested_state->vmcs12, vmcs12, VMCS12_SIZE)) 6578 return -EFAULT; 6579 6580 if (nested_cpu_has_shadow_vmcs(vmcs12) && 6581 vmcs12->vmcs_link_pointer != INVALID_GPA) { 6582 if (copy_to_user(user_vmx_nested_state->shadow_vmcs12, 6583 get_shadow_vmcs12(vcpu), VMCS12_SIZE)) 6584 return -EFAULT; 6585 } 6586 out: 6587 return kvm_state.size; 6588 } 6589 6590 void vmx_leave_nested(struct kvm_vcpu *vcpu) 6591 { 6592 if (is_guest_mode(vcpu)) { 6593 to_vmx(vcpu)->nested.nested_run_pending = 0; 6594 nested_vmx_vmexit(vcpu, -1, 0, 0); 6595 } 6596 free_nested(vcpu); 6597 } 6598 6599 static int vmx_set_nested_state(struct kvm_vcpu *vcpu, 6600 struct kvm_nested_state __user *user_kvm_nested_state, 6601 struct kvm_nested_state *kvm_state) 6602 { 6603 struct vcpu_vmx *vmx = to_vmx(vcpu); 6604 struct vmcs12 *vmcs12; 6605 enum vm_entry_failure_code ignored; 6606 struct kvm_vmx_nested_state_data __user *user_vmx_nested_state = 6607 &user_kvm_nested_state->data.vmx[0]; 6608 int ret; 6609 6610 if (kvm_state->format != KVM_STATE_NESTED_FORMAT_VMX) 6611 return -EINVAL; 6612 6613 if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA) { 6614 if (kvm_state->hdr.vmx.smm.flags) 6615 return -EINVAL; 6616 6617 if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA) 6618 return -EINVAL; 6619 6620 /* 6621 * KVM_STATE_NESTED_EVMCS used to signal that KVM should 6622 * enable eVMCS capability on vCPU. However, since then 6623 * code was changed such that flag signals vmcs12 should 6624 * be copied into eVMCS in guest memory. 6625 * 6626 * To preserve backwards compatibility, allow user 6627 * to set this flag even when there is no VMXON region. 6628 */ 6629 if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS) 6630 return -EINVAL; 6631 } else { 6632 if (!guest_can_use(vcpu, X86_FEATURE_VMX)) 6633 return -EINVAL; 6634 6635 if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa)) 6636 return -EINVAL; 6637 } 6638 6639 if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && 6640 (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) 6641 return -EINVAL; 6642 6643 if (kvm_state->hdr.vmx.smm.flags & 6644 ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON)) 6645 return -EINVAL; 6646 6647 if (kvm_state->hdr.vmx.flags & ~KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) 6648 return -EINVAL; 6649 6650 /* 6651 * SMM temporarily disables VMX, so we cannot be in guest mode, 6652 * nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags 6653 * must be zero. 6654 */ 6655 if (is_smm(vcpu) ? 6656 (kvm_state->flags & 6657 (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING)) 6658 : kvm_state->hdr.vmx.smm.flags) 6659 return -EINVAL; 6660 6661 if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && 6662 !(kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON)) 6663 return -EINVAL; 6664 6665 if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) && 6666 (!guest_can_use(vcpu, X86_FEATURE_VMX) || 6667 !vmx->nested.enlightened_vmcs_enabled)) 6668 return -EINVAL; 6669 6670 vmx_leave_nested(vcpu); 6671 6672 if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA) 6673 return 0; 6674 6675 vmx->nested.vmxon_ptr = kvm_state->hdr.vmx.vmxon_pa; 6676 ret = enter_vmx_operation(vcpu); 6677 if (ret) 6678 return ret; 6679 6680 /* Empty 'VMXON' state is permitted if no VMCS loaded */ 6681 if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12)) { 6682 /* See vmx_has_valid_vmcs12. */ 6683 if ((kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE) || 6684 (kvm_state->flags & KVM_STATE_NESTED_EVMCS) || 6685 (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA)) 6686 return -EINVAL; 6687 else 6688 return 0; 6689 } 6690 6691 if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA) { 6692 if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa || 6693 !page_address_valid(vcpu, kvm_state->hdr.vmx.vmcs12_pa)) 6694 return -EINVAL; 6695 6696 set_current_vmptr(vmx, kvm_state->hdr.vmx.vmcs12_pa); 6697 #ifdef CONFIG_KVM_HYPERV 6698 } else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) { 6699 /* 6700 * nested_vmx_handle_enlightened_vmptrld() cannot be called 6701 * directly from here as HV_X64_MSR_VP_ASSIST_PAGE may not be 6702 * restored yet. EVMCS will be mapped from 6703 * nested_get_vmcs12_pages(). 6704 */ 6705 vmx->nested.hv_evmcs_vmptr = EVMPTR_MAP_PENDING; 6706 kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); 6707 #endif 6708 } else { 6709 return -EINVAL; 6710 } 6711 6712 if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) { 6713 vmx->nested.smm.vmxon = true; 6714 vmx->nested.vmxon = false; 6715 6716 if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) 6717 vmx->nested.smm.guest_mode = true; 6718 } 6719 6720 vmcs12 = get_vmcs12(vcpu); 6721 if (copy_from_user(vmcs12, user_vmx_nested_state->vmcs12, sizeof(*vmcs12))) 6722 return -EFAULT; 6723 6724 if (vmcs12->hdr.revision_id != VMCS12_REVISION) 6725 return -EINVAL; 6726 6727 if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) 6728 return 0; 6729 6730 vmx->nested.nested_run_pending = 6731 !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING); 6732 6733 vmx->nested.mtf_pending = 6734 !!(kvm_state->flags & KVM_STATE_NESTED_MTF_PENDING); 6735 6736 ret = -EINVAL; 6737 if (nested_cpu_has_shadow_vmcs(vmcs12) && 6738 vmcs12->vmcs_link_pointer != INVALID_GPA) { 6739 struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu); 6740 6741 if (kvm_state->size < 6742 sizeof(*kvm_state) + 6743 sizeof(user_vmx_nested_state->vmcs12) + sizeof(*shadow_vmcs12)) 6744 goto error_guest_mode; 6745 6746 if (copy_from_user(shadow_vmcs12, 6747 user_vmx_nested_state->shadow_vmcs12, 6748 sizeof(*shadow_vmcs12))) { 6749 ret = -EFAULT; 6750 goto error_guest_mode; 6751 } 6752 6753 if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION || 6754 !shadow_vmcs12->hdr.shadow_vmcs) 6755 goto error_guest_mode; 6756 } 6757 6758 vmx->nested.has_preemption_timer_deadline = false; 6759 if (kvm_state->hdr.vmx.flags & KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) { 6760 vmx->nested.has_preemption_timer_deadline = true; 6761 vmx->nested.preemption_timer_deadline = 6762 kvm_state->hdr.vmx.preemption_timer_deadline; 6763 } 6764 6765 if (nested_vmx_check_controls(vcpu, vmcs12) || 6766 nested_vmx_check_host_state(vcpu, vmcs12) || 6767 nested_vmx_check_guest_state(vcpu, vmcs12, &ignored)) 6768 goto error_guest_mode; 6769 6770 vmx->nested.dirty_vmcs12 = true; 6771 vmx->nested.force_msr_bitmap_recalc = true; 6772 ret = nested_vmx_enter_non_root_mode(vcpu, false); 6773 if (ret) 6774 goto error_guest_mode; 6775 6776 if (vmx->nested.mtf_pending) 6777 kvm_make_request(KVM_REQ_EVENT, vcpu); 6778 6779 return 0; 6780 6781 error_guest_mode: 6782 vmx->nested.nested_run_pending = 0; 6783 return ret; 6784 } 6785 6786 void nested_vmx_set_vmcs_shadowing_bitmap(void) 6787 { 6788 if (enable_shadow_vmcs) { 6789 vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap)); 6790 vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap)); 6791 } 6792 } 6793 6794 /* 6795 * Indexing into the vmcs12 uses the VMCS encoding rotated left by 6. Undo 6796 * that madness to get the encoding for comparison. 6797 */ 6798 #define VMCS12_IDX_TO_ENC(idx) ((u16)(((u16)(idx) >> 6) | ((u16)(idx) << 10))) 6799 6800 static u64 nested_vmx_calc_vmcs_enum_msr(void) 6801 { 6802 /* 6803 * Note these are the so called "index" of the VMCS field encoding, not 6804 * the index into vmcs12. 6805 */ 6806 unsigned int max_idx, idx; 6807 int i; 6808 6809 /* 6810 * For better or worse, KVM allows VMREAD/VMWRITE to all fields in 6811 * vmcs12, regardless of whether or not the associated feature is 6812 * exposed to L1. Simply find the field with the highest index. 6813 */ 6814 max_idx = 0; 6815 for (i = 0; i < nr_vmcs12_fields; i++) { 6816 /* The vmcs12 table is very, very sparsely populated. */ 6817 if (!vmcs12_field_offsets[i]) 6818 continue; 6819 6820 idx = vmcs_field_index(VMCS12_IDX_TO_ENC(i)); 6821 if (idx > max_idx) 6822 max_idx = idx; 6823 } 6824 6825 return (u64)max_idx << VMCS_FIELD_INDEX_SHIFT; 6826 } 6827 6828 static void nested_vmx_setup_pinbased_ctls(struct vmcs_config *vmcs_conf, 6829 struct nested_vmx_msrs *msrs) 6830 { 6831 msrs->pinbased_ctls_low = 6832 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 6833 6834 msrs->pinbased_ctls_high = vmcs_conf->pin_based_exec_ctrl; 6835 msrs->pinbased_ctls_high &= 6836 PIN_BASED_EXT_INTR_MASK | 6837 PIN_BASED_NMI_EXITING | 6838 PIN_BASED_VIRTUAL_NMIS | 6839 (enable_apicv ? PIN_BASED_POSTED_INTR : 0); 6840 msrs->pinbased_ctls_high |= 6841 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | 6842 PIN_BASED_VMX_PREEMPTION_TIMER; 6843 } 6844 6845 static void nested_vmx_setup_exit_ctls(struct vmcs_config *vmcs_conf, 6846 struct nested_vmx_msrs *msrs) 6847 { 6848 msrs->exit_ctls_low = 6849 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; 6850 6851 msrs->exit_ctls_high = vmcs_conf->vmexit_ctrl; 6852 msrs->exit_ctls_high &= 6853 #ifdef CONFIG_X86_64 6854 VM_EXIT_HOST_ADDR_SPACE_SIZE | 6855 #endif 6856 VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT | 6857 VM_EXIT_CLEAR_BNDCFGS; 6858 msrs->exit_ctls_high |= 6859 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | 6860 VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | 6861 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT | 6862 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; 6863 6864 /* We support free control of debug control saving. */ 6865 msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS; 6866 } 6867 6868 static void nested_vmx_setup_entry_ctls(struct vmcs_config *vmcs_conf, 6869 struct nested_vmx_msrs *msrs) 6870 { 6871 msrs->entry_ctls_low = 6872 VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; 6873 6874 msrs->entry_ctls_high = vmcs_conf->vmentry_ctrl; 6875 msrs->entry_ctls_high &= 6876 #ifdef CONFIG_X86_64 6877 VM_ENTRY_IA32E_MODE | 6878 #endif 6879 VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS; 6880 msrs->entry_ctls_high |= 6881 (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER | 6882 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL); 6883 6884 /* We support free control of debug control loading. */ 6885 msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS; 6886 } 6887 6888 static void nested_vmx_setup_cpubased_ctls(struct vmcs_config *vmcs_conf, 6889 struct nested_vmx_msrs *msrs) 6890 { 6891 msrs->procbased_ctls_low = 6892 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 6893 6894 msrs->procbased_ctls_high = vmcs_conf->cpu_based_exec_ctrl; 6895 msrs->procbased_ctls_high &= 6896 CPU_BASED_INTR_WINDOW_EXITING | 6897 CPU_BASED_NMI_WINDOW_EXITING | CPU_BASED_USE_TSC_OFFSETTING | 6898 CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING | 6899 CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING | 6900 CPU_BASED_CR3_STORE_EXITING | 6901 #ifdef CONFIG_X86_64 6902 CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING | 6903 #endif 6904 CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING | 6905 CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG | 6906 CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING | 6907 CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING | 6908 CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; 6909 /* 6910 * We can allow some features even when not supported by the 6911 * hardware. For example, L1 can specify an MSR bitmap - and we 6912 * can use it to avoid exits to L1 - even when L0 runs L2 6913 * without MSR bitmaps. 6914 */ 6915 msrs->procbased_ctls_high |= 6916 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | 6917 CPU_BASED_USE_MSR_BITMAPS; 6918 6919 /* We support free control of CR3 access interception. */ 6920 msrs->procbased_ctls_low &= 6921 ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING); 6922 } 6923 6924 static void nested_vmx_setup_secondary_ctls(u32 ept_caps, 6925 struct vmcs_config *vmcs_conf, 6926 struct nested_vmx_msrs *msrs) 6927 { 6928 msrs->secondary_ctls_low = 0; 6929 6930 msrs->secondary_ctls_high = vmcs_conf->cpu_based_2nd_exec_ctrl; 6931 msrs->secondary_ctls_high &= 6932 SECONDARY_EXEC_DESC | 6933 SECONDARY_EXEC_ENABLE_RDTSCP | 6934 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 6935 SECONDARY_EXEC_WBINVD_EXITING | 6936 SECONDARY_EXEC_APIC_REGISTER_VIRT | 6937 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 6938 SECONDARY_EXEC_RDRAND_EXITING | 6939 SECONDARY_EXEC_ENABLE_INVPCID | 6940 SECONDARY_EXEC_ENABLE_VMFUNC | 6941 SECONDARY_EXEC_RDSEED_EXITING | 6942 SECONDARY_EXEC_ENABLE_XSAVES | 6943 SECONDARY_EXEC_TSC_SCALING | 6944 SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE; 6945 6946 /* 6947 * We can emulate "VMCS shadowing," even if the hardware 6948 * doesn't support it. 6949 */ 6950 msrs->secondary_ctls_high |= 6951 SECONDARY_EXEC_SHADOW_VMCS; 6952 6953 if (enable_ept) { 6954 /* nested EPT: emulate EPT also to L1 */ 6955 msrs->secondary_ctls_high |= 6956 SECONDARY_EXEC_ENABLE_EPT; 6957 msrs->ept_caps = 6958 VMX_EPT_PAGE_WALK_4_BIT | 6959 VMX_EPT_PAGE_WALK_5_BIT | 6960 VMX_EPTP_WB_BIT | 6961 VMX_EPT_INVEPT_BIT | 6962 VMX_EPT_EXECUTE_ONLY_BIT; 6963 6964 msrs->ept_caps &= ept_caps; 6965 msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT | 6966 VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT | 6967 VMX_EPT_1GB_PAGE_BIT; 6968 if (enable_ept_ad_bits) { 6969 msrs->secondary_ctls_high |= 6970 SECONDARY_EXEC_ENABLE_PML; 6971 msrs->ept_caps |= VMX_EPT_AD_BIT; 6972 } 6973 6974 /* 6975 * Advertise EPTP switching irrespective of hardware support, 6976 * KVM emulates it in software so long as VMFUNC is supported. 6977 */ 6978 if (cpu_has_vmx_vmfunc()) 6979 msrs->vmfunc_controls = VMX_VMFUNC_EPTP_SWITCHING; 6980 } 6981 6982 /* 6983 * Old versions of KVM use the single-context version without 6984 * checking for support, so declare that it is supported even 6985 * though it is treated as global context. The alternative is 6986 * not failing the single-context invvpid, and it is worse. 6987 */ 6988 if (enable_vpid) { 6989 msrs->secondary_ctls_high |= 6990 SECONDARY_EXEC_ENABLE_VPID; 6991 msrs->vpid_caps = VMX_VPID_INVVPID_BIT | 6992 VMX_VPID_EXTENT_SUPPORTED_MASK; 6993 } 6994 6995 if (enable_unrestricted_guest) 6996 msrs->secondary_ctls_high |= 6997 SECONDARY_EXEC_UNRESTRICTED_GUEST; 6998 6999 if (flexpriority_enabled) 7000 msrs->secondary_ctls_high |= 7001 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 7002 7003 if (enable_sgx) 7004 msrs->secondary_ctls_high |= SECONDARY_EXEC_ENCLS_EXITING; 7005 } 7006 7007 static void nested_vmx_setup_misc_data(struct vmcs_config *vmcs_conf, 7008 struct nested_vmx_msrs *msrs) 7009 { 7010 msrs->misc_low = (u32)vmcs_conf->misc & VMX_MISC_SAVE_EFER_LMA; 7011 msrs->misc_low |= 7012 MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS | 7013 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | 7014 VMX_MISC_ACTIVITY_HLT | 7015 VMX_MISC_ACTIVITY_WAIT_SIPI; 7016 msrs->misc_high = 0; 7017 } 7018 7019 static void nested_vmx_setup_basic(struct nested_vmx_msrs *msrs) 7020 { 7021 /* 7022 * This MSR reports some information about VMX support. We 7023 * should return information about the VMX we emulate for the 7024 * guest, and the VMCS structure we give it - not about the 7025 * VMX support of the underlying hardware. 7026 */ 7027 msrs->basic = 7028 VMCS12_REVISION | 7029 VMX_BASIC_TRUE_CTLS | 7030 ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) | 7031 (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT); 7032 7033 if (cpu_has_vmx_basic_inout()) 7034 msrs->basic |= VMX_BASIC_INOUT; 7035 } 7036 7037 static void nested_vmx_setup_cr_fixed(struct nested_vmx_msrs *msrs) 7038 { 7039 /* 7040 * These MSRs specify bits which the guest must keep fixed on 7041 * while L1 is in VMXON mode (in L1's root mode, or running an L2). 7042 * We picked the standard core2 setting. 7043 */ 7044 #define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE) 7045 #define VMXON_CR4_ALWAYSON X86_CR4_VMXE 7046 msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON; 7047 msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON; 7048 7049 /* These MSRs specify bits which the guest must keep fixed off. */ 7050 rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1); 7051 rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1); 7052 7053 if (vmx_umip_emulated()) 7054 msrs->cr4_fixed1 |= X86_CR4_UMIP; 7055 } 7056 7057 /* 7058 * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be 7059 * returned for the various VMX controls MSRs when nested VMX is enabled. 7060 * The same values should also be used to verify that vmcs12 control fields are 7061 * valid during nested entry from L1 to L2. 7062 * Each of these control msrs has a low and high 32-bit half: A low bit is on 7063 * if the corresponding bit in the (32-bit) control field *must* be on, and a 7064 * bit in the high half is on if the corresponding bit in the control field 7065 * may be on. See also vmx_control_verify(). 7066 */ 7067 void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps) 7068 { 7069 struct nested_vmx_msrs *msrs = &vmcs_conf->nested; 7070 7071 /* 7072 * Note that as a general rule, the high half of the MSRs (bits in 7073 * the control fields which may be 1) should be initialized by the 7074 * intersection of the underlying hardware's MSR (i.e., features which 7075 * can be supported) and the list of features we want to expose - 7076 * because they are known to be properly supported in our code. 7077 * Also, usually, the low half of the MSRs (bits which must be 1) can 7078 * be set to 0, meaning that L1 may turn off any of these bits. The 7079 * reason is that if one of these bits is necessary, it will appear 7080 * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control 7081 * fields of vmcs01 and vmcs02, will turn these bits off - and 7082 * nested_vmx_l1_wants_exit() will not pass related exits to L1. 7083 * These rules have exceptions below. 7084 */ 7085 nested_vmx_setup_pinbased_ctls(vmcs_conf, msrs); 7086 7087 nested_vmx_setup_exit_ctls(vmcs_conf, msrs); 7088 7089 nested_vmx_setup_entry_ctls(vmcs_conf, msrs); 7090 7091 nested_vmx_setup_cpubased_ctls(vmcs_conf, msrs); 7092 7093 nested_vmx_setup_secondary_ctls(ept_caps, vmcs_conf, msrs); 7094 7095 nested_vmx_setup_misc_data(vmcs_conf, msrs); 7096 7097 nested_vmx_setup_basic(msrs); 7098 7099 nested_vmx_setup_cr_fixed(msrs); 7100 7101 msrs->vmcs_enum = nested_vmx_calc_vmcs_enum_msr(); 7102 } 7103 7104 void nested_vmx_hardware_unsetup(void) 7105 { 7106 int i; 7107 7108 if (enable_shadow_vmcs) { 7109 for (i = 0; i < VMX_BITMAP_NR; i++) 7110 free_page((unsigned long)vmx_bitmap[i]); 7111 } 7112 } 7113 7114 __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)) 7115 { 7116 int i; 7117 7118 if (!cpu_has_vmx_shadow_vmcs()) 7119 enable_shadow_vmcs = 0; 7120 if (enable_shadow_vmcs) { 7121 for (i = 0; i < VMX_BITMAP_NR; i++) { 7122 /* 7123 * The vmx_bitmap is not tied to a VM and so should 7124 * not be charged to a memcg. 7125 */ 7126 vmx_bitmap[i] = (unsigned long *) 7127 __get_free_page(GFP_KERNEL); 7128 if (!vmx_bitmap[i]) { 7129 nested_vmx_hardware_unsetup(); 7130 return -ENOMEM; 7131 } 7132 } 7133 7134 init_vmcs_shadow_fields(); 7135 } 7136 7137 exit_handlers[EXIT_REASON_VMCLEAR] = handle_vmclear; 7138 exit_handlers[EXIT_REASON_VMLAUNCH] = handle_vmlaunch; 7139 exit_handlers[EXIT_REASON_VMPTRLD] = handle_vmptrld; 7140 exit_handlers[EXIT_REASON_VMPTRST] = handle_vmptrst; 7141 exit_handlers[EXIT_REASON_VMREAD] = handle_vmread; 7142 exit_handlers[EXIT_REASON_VMRESUME] = handle_vmresume; 7143 exit_handlers[EXIT_REASON_VMWRITE] = handle_vmwrite; 7144 exit_handlers[EXIT_REASON_VMOFF] = handle_vmxoff; 7145 exit_handlers[EXIT_REASON_VMON] = handle_vmxon; 7146 exit_handlers[EXIT_REASON_INVEPT] = handle_invept; 7147 exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid; 7148 exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc; 7149 7150 return 0; 7151 } 7152 7153 struct kvm_x86_nested_ops vmx_nested_ops = { 7154 .leave_nested = vmx_leave_nested, 7155 .is_exception_vmexit = nested_vmx_is_exception_vmexit, 7156 .check_events = vmx_check_nested_events, 7157 .has_events = vmx_has_nested_events, 7158 .triple_fault = nested_vmx_triple_fault, 7159 .get_state = vmx_get_nested_state, 7160 .set_state = vmx_set_nested_state, 7161 .get_nested_state_pages = vmx_get_nested_state_pages, 7162 .write_log_dirty = nested_vmx_write_pml_buffer, 7163 #ifdef CONFIG_KVM_HYPERV 7164 .enable_evmcs = nested_enable_evmcs, 7165 .get_evmcs_version = nested_get_evmcs_version, 7166 .hv_inject_synthetic_vmexit_post_tlb_flush = vmx_hv_inject_synthetic_vmexit_post_tlb_flush, 7167 #endif 7168 }; 7169