1 // SPDX-License-Identifier: GPL-2.0 2 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 3 4 #include <linux/objtool.h> 5 #include <linux/percpu.h> 6 7 #include <asm/debugreg.h> 8 #include <asm/mmu_context.h> 9 #include <asm/msr.h> 10 11 #include "x86.h" 12 #include "cpuid.h" 13 #include "hyperv.h" 14 #include "mmu.h" 15 #include "nested.h" 16 #include "pmu.h" 17 #include "posted_intr.h" 18 #include "sgx.h" 19 #include "trace.h" 20 #include "vmx.h" 21 #include "smm.h" 22 #include "x86_ops.h" 23 24 static bool __read_mostly enable_shadow_vmcs = 1; 25 module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO); 26 27 static bool __ro_after_init warn_on_missed_cc; 28 module_param(warn_on_missed_cc, bool, 0444); 29 30 #define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK 31 32 /* 33 * Hyper-V requires all of these, so mark them as supported even though 34 * they are just treated the same as all-context. 35 */ 36 #define VMX_VPID_EXTENT_SUPPORTED_MASK \ 37 (VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT | \ 38 VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT | \ 39 VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT | \ 40 VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT) 41 42 #define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5 43 44 enum { 45 VMX_VMREAD_BITMAP, 46 VMX_VMWRITE_BITMAP, 47 VMX_BITMAP_NR 48 }; 49 static unsigned long *vmx_bitmap[VMX_BITMAP_NR]; 50 51 #define vmx_vmread_bitmap (vmx_bitmap[VMX_VMREAD_BITMAP]) 52 #define vmx_vmwrite_bitmap (vmx_bitmap[VMX_VMWRITE_BITMAP]) 53 54 struct shadow_vmcs_field { 55 u16 encoding; 56 u16 offset; 57 }; 58 static struct shadow_vmcs_field shadow_read_only_fields[] = { 59 #define SHADOW_FIELD_RO(x, y) { x, offsetof(struct vmcs12, y) }, 60 #include "vmcs_shadow_fields.h" 61 }; 62 static int max_shadow_read_only_fields = 63 ARRAY_SIZE(shadow_read_only_fields); 64 65 static struct shadow_vmcs_field shadow_read_write_fields[] = { 66 #define SHADOW_FIELD_RW(x, y) { x, offsetof(struct vmcs12, y) }, 67 #include "vmcs_shadow_fields.h" 68 }; 69 static int 
max_shadow_read_write_fields = 70 ARRAY_SIZE(shadow_read_write_fields); 71 72 static void init_vmcs_shadow_fields(void) 73 { 74 int i, j; 75 76 memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); 77 memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); 78 79 for (i = j = 0; i < max_shadow_read_only_fields; i++) { 80 struct shadow_vmcs_field entry = shadow_read_only_fields[i]; 81 u16 field = entry.encoding; 82 83 if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 && 84 (i + 1 == max_shadow_read_only_fields || 85 shadow_read_only_fields[i + 1].encoding != field + 1)) 86 pr_err("Missing field from shadow_read_only_field %x\n", 87 field + 1); 88 89 if (get_vmcs12_field_offset(field) < 0) 90 continue; 91 92 clear_bit(field, vmx_vmread_bitmap); 93 if (field & 1) 94 #ifdef CONFIG_X86_64 95 continue; 96 #else 97 entry.offset += sizeof(u32); 98 #endif 99 shadow_read_only_fields[j++] = entry; 100 } 101 max_shadow_read_only_fields = j; 102 103 for (i = j = 0; i < max_shadow_read_write_fields; i++) { 104 struct shadow_vmcs_field entry = shadow_read_write_fields[i]; 105 u16 field = entry.encoding; 106 107 if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 && 108 (i + 1 == max_shadow_read_write_fields || 109 shadow_read_write_fields[i + 1].encoding != field + 1)) 110 pr_err("Missing field from shadow_read_write_field %x\n", 111 field + 1); 112 113 WARN_ONCE(field >= GUEST_ES_AR_BYTES && 114 field <= GUEST_TR_AR_BYTES, 115 "Update vmcs12_write_any() to drop reserved bits from AR_BYTES"); 116 117 if (get_vmcs12_field_offset(field) < 0) 118 continue; 119 120 /* 121 * KVM emulates PML and the VMX preemption timer irrespective 122 * of hardware support, but shadowing their related VMCS fields 123 * requires hardware support as the CPU will reject VMWRITEs to 124 * fields that don't exist. 
125 */ 126 switch (field) { 127 case GUEST_PML_INDEX: 128 if (!cpu_has_vmx_pml()) 129 continue; 130 break; 131 case VMX_PREEMPTION_TIMER_VALUE: 132 if (!cpu_has_vmx_preemption_timer()) 133 continue; 134 break; 135 default: 136 break; 137 } 138 139 clear_bit(field, vmx_vmwrite_bitmap); 140 clear_bit(field, vmx_vmread_bitmap); 141 if (field & 1) 142 #ifdef CONFIG_X86_64 143 continue; 144 #else 145 entry.offset += sizeof(u32); 146 #endif 147 shadow_read_write_fields[j++] = entry; 148 } 149 max_shadow_read_write_fields = j; 150 } 151 152 /* 153 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(), 154 * set the success or error code of an emulated VMX instruction (as specified 155 * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated 156 * instruction. 157 */ 158 static int nested_vmx_succeed(struct kvm_vcpu *vcpu) 159 { 160 vmx_set_rflags(vcpu, vmx_get_rflags(vcpu) 161 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | 162 X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF)); 163 return kvm_skip_emulated_instruction(vcpu); 164 } 165 166 static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu) 167 { 168 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) 169 & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF | 170 X86_EFLAGS_SF | X86_EFLAGS_OF)) 171 | X86_EFLAGS_CF); 172 return kvm_skip_emulated_instruction(vcpu); 173 } 174 175 static int nested_vmx_failValid(struct kvm_vcpu *vcpu, 176 u32 vm_instruction_error) 177 { 178 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) 179 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | 180 X86_EFLAGS_SF | X86_EFLAGS_OF)) 181 | X86_EFLAGS_ZF); 182 get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error; 183 /* 184 * We don't need to force sync to shadow VMCS because 185 * VM_INSTRUCTION_ERROR is not shadowed. Enlightened VMCS 'shadows' all 186 * fields and thus must be synced. 
187 */ 188 if (nested_vmx_is_evmptr12_set(to_vmx(vcpu))) 189 to_vmx(vcpu)->nested.need_vmcs12_to_shadow_sync = true; 190 191 return kvm_skip_emulated_instruction(vcpu); 192 } 193 194 static int nested_vmx_fail(struct kvm_vcpu *vcpu, u32 vm_instruction_error) 195 { 196 struct vcpu_vmx *vmx = to_vmx(vcpu); 197 198 /* 199 * failValid writes the error number to the current VMCS, which 200 * can't be done if there isn't a current VMCS. 201 */ 202 if (vmx->nested.current_vmptr == INVALID_GPA && 203 !nested_vmx_is_evmptr12_valid(vmx)) 204 return nested_vmx_failInvalid(vcpu); 205 206 return nested_vmx_failValid(vcpu, vm_instruction_error); 207 } 208 209 static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator) 210 { 211 /* TODO: not to reset guest simply here. */ 212 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 213 pr_debug_ratelimited("nested vmx abort, indicator %d\n", indicator); 214 } 215 216 static inline bool vmx_control_verify(u32 control, u32 low, u32 high) 217 { 218 return fixed_bits_valid(control, low, high); 219 } 220 221 static inline u64 vmx_control_msr(u32 low, u32 high) 222 { 223 return low | ((u64)high << 32); 224 } 225 226 static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx) 227 { 228 secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS); 229 vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); 230 vmx->nested.need_vmcs12_to_shadow_sync = false; 231 } 232 233 static inline void nested_release_evmcs(struct kvm_vcpu *vcpu) 234 { 235 #ifdef CONFIG_KVM_HYPERV 236 struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); 237 struct vcpu_vmx *vmx = to_vmx(vcpu); 238 239 kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map); 240 vmx->nested.hv_evmcs = NULL; 241 vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID; 242 243 if (hv_vcpu) { 244 hv_vcpu->nested.pa_page_gpa = INVALID_GPA; 245 hv_vcpu->nested.vm_id = 0; 246 hv_vcpu->nested.vp_id = 0; 247 } 248 #endif 249 } 250 251 static bool nested_evmcs_handle_vmclear(struct kvm_vcpu *vcpu, gpa_t vmptr) 252 { 253 #ifdef 
CONFIG_KVM_HYPERV 254 struct vcpu_vmx *vmx = to_vmx(vcpu); 255 /* 256 * When Enlightened VMEntry is enabled on the calling CPU we treat 257 * memory area pointer by vmptr as Enlightened VMCS (as there's no good 258 * way to distinguish it from VMCS12) and we must not corrupt it by 259 * writing to the non-existent 'launch_state' field. The area doesn't 260 * have to be the currently active EVMCS on the calling CPU and there's 261 * nothing KVM has to do to transition it from 'active' to 'non-active' 262 * state. It is possible that the area will stay mapped as 263 * vmx->nested.hv_evmcs but this shouldn't be a problem. 264 */ 265 if (!guest_cpu_cap_has_evmcs(vcpu) || 266 !evmptr_is_valid(nested_get_evmptr(vcpu))) 267 return false; 268 269 if (nested_vmx_evmcs(vmx) && vmptr == vmx->nested.hv_evmcs_vmptr) 270 nested_release_evmcs(vcpu); 271 272 return true; 273 #else 274 return false; 275 #endif 276 } 277 278 static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx, 279 struct loaded_vmcs *prev) 280 { 281 struct vmcs_host_state *dest, *src; 282 283 if (unlikely(!vmx->vt.guest_state_loaded)) 284 return; 285 286 src = &prev->host_state; 287 dest = &vmx->loaded_vmcs->host_state; 288 289 vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base); 290 dest->ldt_sel = src->ldt_sel; 291 #ifdef CONFIG_X86_64 292 dest->ds_sel = src->ds_sel; 293 dest->es_sel = src->es_sel; 294 #endif 295 } 296 297 static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) 298 { 299 struct vcpu_vmx *vmx = to_vmx(vcpu); 300 struct loaded_vmcs *prev; 301 int cpu; 302 303 if (WARN_ON_ONCE(vmx->loaded_vmcs == vmcs)) 304 return; 305 306 cpu = get_cpu(); 307 prev = vmx->loaded_vmcs; 308 vmx->loaded_vmcs = vmcs; 309 vmx_vcpu_load_vmcs(vcpu, cpu); 310 vmx_sync_vmcs_host_state(vmx, prev); 311 put_cpu(); 312 313 kvm_clear_available_registers(vcpu, VMX_REGS_LAZY_LOAD_SET); 314 315 /* 316 * All lazily updated registers will be reloaded from VMCS12 on both 317 * 
vmentry and vmexit. 318 */ 319 kvm_reset_dirty_registers(vcpu); 320 } 321 322 static void nested_put_vmcs12_pages(struct kvm_vcpu *vcpu) 323 { 324 struct vcpu_vmx *vmx = to_vmx(vcpu); 325 326 kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map); 327 kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map); 328 kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map); 329 vmx->nested.pi_desc = NULL; 330 } 331 332 /* 333 * Free whatever needs to be freed from vmx->nested when L1 goes down, or 334 * just stops using VMX. 335 */ 336 static void free_nested(struct kvm_vcpu *vcpu) 337 { 338 struct vcpu_vmx *vmx = to_vmx(vcpu); 339 340 if (WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01)) 341 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 342 343 if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon) 344 return; 345 346 kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); 347 348 vmx->nested.vmxon = false; 349 vmx->nested.smm.vmxon = false; 350 vmx->nested.vmxon_ptr = INVALID_GPA; 351 free_vpid(vmx->nested.vpid02); 352 vmx->nested.posted_intr_nv = -1; 353 vmx->nested.current_vmptr = INVALID_GPA; 354 if (enable_shadow_vmcs) { 355 vmx_disable_shadow_vmcs(vmx); 356 vmcs_clear(vmx->vmcs01.shadow_vmcs); 357 free_vmcs(vmx->vmcs01.shadow_vmcs); 358 vmx->vmcs01.shadow_vmcs = NULL; 359 } 360 kfree(vmx->nested.cached_vmcs12); 361 vmx->nested.cached_vmcs12 = NULL; 362 kfree(vmx->nested.cached_shadow_vmcs12); 363 vmx->nested.cached_shadow_vmcs12 = NULL; 364 365 nested_put_vmcs12_pages(vcpu); 366 367 kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); 368 369 nested_release_evmcs(vcpu); 370 371 free_loaded_vmcs(&vmx->nested.vmcs02); 372 } 373 374 /* 375 * Ensure that the current vmcs of the logical processor is the 376 * vmcs01 of the vcpu before calling free_nested(). 
377 */ 378 void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu) 379 { 380 vcpu_load(vcpu); 381 vmx_leave_nested(vcpu); 382 vcpu_put(vcpu); 383 } 384 385 #define EPTP_PA_MASK GENMASK_ULL(51, 12) 386 387 static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp) 388 { 389 return VALID_PAGE(root_hpa) && 390 ((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK)); 391 } 392 393 static void nested_ept_invalidate_addr(struct kvm_vcpu *vcpu, gpa_t eptp, 394 gpa_t addr) 395 { 396 unsigned long roots = 0; 397 uint i; 398 struct kvm_mmu_root_info *cached_root; 399 400 WARN_ON_ONCE(!mmu_is_nested(vcpu)); 401 402 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { 403 cached_root = &vcpu->arch.mmu->prev_roots[i]; 404 405 if (nested_ept_root_matches(cached_root->hpa, cached_root->pgd, 406 eptp)) 407 roots |= KVM_MMU_ROOT_PREVIOUS(i); 408 } 409 if (roots) 410 kvm_mmu_invalidate_addr(vcpu, vcpu->arch.mmu, addr, roots); 411 } 412 413 static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, 414 struct x86_exception *fault, 415 bool from_hardware) 416 { 417 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 418 struct vcpu_vmx *vmx = to_vmx(vcpu); 419 unsigned long exit_qualification; 420 u32 vm_exit_reason; 421 422 if (vmx->nested.pml_full) { 423 vm_exit_reason = EXIT_REASON_PML_FULL; 424 vmx->nested.pml_full = false; 425 426 /* 427 * It should be impossible to trigger a nested PML Full VM-Exit 428 * for anything other than an EPT Violation from L2. KVM *can* 429 * trigger nEPT page fault injection in response to an EPT 430 * Misconfig, e.g. if the MMIO SPTE was stale and L1's EPT 431 * tables also changed, but KVM should not treat EPT Misconfig 432 * VM-Exits as writes. 433 */ 434 WARN_ON_ONCE(vmx->vt.exit_reason.basic != EXIT_REASON_EPT_VIOLATION); 435 436 /* 437 * PML Full and EPT Violation VM-Exits both use bit 12 to report 438 * "NMI unblocking due to IRET", i.e. the bit can be propagated 439 * as-is from the original EXIT_QUALIFICATION. 
440 */ 441 exit_qualification = vmx_get_exit_qual(vcpu) & INTR_INFO_UNBLOCK_NMI; 442 } else { 443 if (fault->error_code & PFERR_RSVD_MASK) { 444 vm_exit_reason = EXIT_REASON_EPT_MISCONFIG; 445 exit_qualification = 0; 446 } else { 447 u64 mask = EPT_VIOLATION_GVA_IS_VALID | 448 EPT_VIOLATION_GVA_TRANSLATED; 449 450 if (vmx->nested.msrs.ept_caps & VMX_EPT_ADVANCED_VMEXIT_INFO_BIT) 451 mask |= EPT_VIOLATION_GVA_USER | 452 EPT_VIOLATION_GVA_WRITABLE | 453 EPT_VIOLATION_GVA_NX; 454 455 exit_qualification = fault->exit_qualification & ~mask; 456 457 /* 458 * Use the EXIT_QUALIFICATION from the VMCS if and only 459 * if the hardware VM-Exit from L2 was an EPT Violation. 460 * If the fault is synthesized, then EXIT_QUALIFICATION 461 * is stale and/or holds entirely different data. And 462 * conversely, KVM _must_ rely on EXIT_QUALIFICATION if 463 * the fault came from hardware, because KVM only sees 464 * and walks the faulting GPA. 465 */ 466 if (from_hardware) 467 exit_qualification |= vmx_get_exit_qual(vcpu) & mask; 468 else 469 exit_qualification |= fault->exit_qualification & mask; 470 471 vm_exit_reason = EXIT_REASON_EPT_VIOLATION; 472 } 473 474 /* 475 * Although the caller (kvm_inject_emulated_page_fault) would 476 * have already synced the faulting address in the shadow EPT 477 * tables for the current EPTP12, we also need to sync it for 478 * any other cached EPTP02s based on the same EP4TA, since the 479 * TLB associates mappings to the EP4TA rather than the full EPTP. 
480 */ 481 nested_ept_invalidate_addr(vcpu, vmcs12->ept_pointer, 482 fault->address); 483 } 484 485 nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification); 486 vmcs12->guest_physical_address = fault->address; 487 } 488 489 static inline bool nested_ept_mbec_enabled(struct kvm_vcpu *vcpu) 490 { 491 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 492 493 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC); 494 } 495 496 static void nested_ept_new_eptp(struct kvm_vcpu *vcpu) 497 { 498 struct vcpu_vmx *vmx = to_vmx(vcpu); 499 bool execonly = vmx->nested.msrs.ept_caps & VMX_EPT_EXECUTE_ONLY_BIT; 500 int ept_lpage_level = ept_caps_to_lpage_level(vmx->nested.msrs.ept_caps); 501 502 kvm_init_shadow_ept_mmu(vcpu, execonly, ept_lpage_level, 503 nested_ept_ad_enabled(vcpu), 504 nested_ept_mbec_enabled(vcpu), 505 nested_ept_get_eptp(vcpu)); 506 } 507 508 static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) 509 { 510 WARN_ON(mmu_is_nested(vcpu)); 511 512 vcpu->arch.mmu = &vcpu->arch.guest_mmu; 513 nested_ept_new_eptp(vcpu); 514 vcpu->arch.mmu->get_guest_pgd = nested_ept_get_eptp; 515 vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault; 516 vcpu->arch.mmu->get_pdptr = kvm_pdptr_read; 517 518 vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; 519 } 520 521 static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu) 522 { 523 vcpu->arch.mmu = &vcpu->arch.root_mmu; 524 vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; 525 } 526 527 static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12, 528 u16 error_code) 529 { 530 bool inequality, bit; 531 532 bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0; 533 inequality = 534 (error_code & vmcs12->page_fault_error_code_mask) != 535 vmcs12->page_fault_error_code_match; 536 return inequality ^ bit; 537 } 538 539 static bool nested_vmx_is_exception_vmexit(struct kvm_vcpu *vcpu, u8 vector, 540 u32 error_code) 541 { 542 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 543 544 /* 545 * Drop bits 31:16 
of the error code when performing the #PF mask+match 546 * check. All VMCS fields involved are 32 bits, but Intel CPUs never 547 * set bits 31:16 and VMX disallows setting bits 31:16 in the injected 548 * error code. Including the to-be-dropped bits in the check might 549 * result in an "impossible" or missed exit from L1's perspective. 550 */ 551 if (vector == PF_VECTOR) 552 return nested_vmx_is_page_fault_vmexit(vmcs12, (u16)error_code); 553 554 return (vmcs12->exception_bitmap & (1u << vector)); 555 } 556 557 static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu, 558 struct vmcs12 *vmcs12) 559 { 560 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) 561 return 0; 562 563 if (CC(!page_address_valid(vcpu, vmcs12->io_bitmap_a)) || 564 CC(!page_address_valid(vcpu, vmcs12->io_bitmap_b))) 565 return -EINVAL; 566 567 return 0; 568 } 569 570 static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu, 571 struct vmcs12 *vmcs12) 572 { 573 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) 574 return 0; 575 576 if (CC(!page_address_valid(vcpu, vmcs12->msr_bitmap))) 577 return -EINVAL; 578 579 return 0; 580 } 581 582 static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu, 583 struct vmcs12 *vmcs12) 584 { 585 if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) 586 return 0; 587 588 if (CC(!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr))) 589 return -EINVAL; 590 591 if (CC(!nested_cpu_has_vid(vmcs12) && vmcs12->tpr_threshold >> 4)) 592 return -EINVAL; 593 594 return 0; 595 } 596 597 /* 598 * For x2APIC MSRs, ignore the vmcs01 bitmap. L1 can enable x2APIC without L1 599 * itself utilizing x2APIC. All MSRs were previously set to be intercepted, 600 * only the "disable intercept" case needs to be handled. 
601 */ 602 static void nested_vmx_disable_intercept_for_x2apic_msr(unsigned long *msr_bitmap_l1, 603 unsigned long *msr_bitmap_l0, 604 u32 msr, int type) 605 { 606 if (type & MSR_TYPE_R && !vmx_test_msr_bitmap_read(msr_bitmap_l1, msr)) 607 vmx_clear_msr_bitmap_read(msr_bitmap_l0, msr); 608 609 if (type & MSR_TYPE_W && !vmx_test_msr_bitmap_write(msr_bitmap_l1, msr)) 610 vmx_clear_msr_bitmap_write(msr_bitmap_l0, msr); 611 } 612 613 static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap) 614 { 615 int msr; 616 617 for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { 618 unsigned word = msr / BITS_PER_LONG; 619 620 msr_bitmap[word] = ~0; 621 msr_bitmap[word + (0x800 / sizeof(long))] = ~0; 622 } 623 } 624 625 #define BUILD_NVMX_MSR_INTERCEPT_HELPER(rw) \ 626 static inline \ 627 void nested_vmx_set_msr_##rw##_intercept(struct vcpu_vmx *vmx, \ 628 unsigned long *msr_bitmap_l1, \ 629 unsigned long *msr_bitmap_l0, u32 msr) \ 630 { \ 631 if (vmx_test_msr_bitmap_##rw(vmx->vmcs01.msr_bitmap, msr) || \ 632 vmx_test_msr_bitmap_##rw(msr_bitmap_l1, msr)) \ 633 vmx_set_msr_bitmap_##rw(msr_bitmap_l0, msr); \ 634 else \ 635 vmx_clear_msr_bitmap_##rw(msr_bitmap_l0, msr); \ 636 } 637 BUILD_NVMX_MSR_INTERCEPT_HELPER(read) 638 BUILD_NVMX_MSR_INTERCEPT_HELPER(write) 639 640 static inline void nested_vmx_set_intercept_for_msr(struct vcpu_vmx *vmx, 641 unsigned long *msr_bitmap_l1, 642 unsigned long *msr_bitmap_l0, 643 u32 msr, int types) 644 { 645 if (types & MSR_TYPE_R) 646 nested_vmx_set_msr_read_intercept(vmx, msr_bitmap_l1, 647 msr_bitmap_l0, msr); 648 if (types & MSR_TYPE_W) 649 nested_vmx_set_msr_write_intercept(vmx, msr_bitmap_l1, 650 msr_bitmap_l0, msr); 651 } 652 653 #define nested_vmx_merge_msr_bitmaps(msr, type) \ 654 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, \ 655 msr_bitmap_l0, msr, type) 656 657 #define nested_vmx_merge_msr_bitmaps_read(msr) \ 658 nested_vmx_merge_msr_bitmaps(msr, MSR_TYPE_R) 659 660 #define 
nested_vmx_merge_msr_bitmaps_write(msr) \ 661 nested_vmx_merge_msr_bitmaps(msr, MSR_TYPE_W) 662 663 #define nested_vmx_merge_msr_bitmaps_rw(msr) \ 664 nested_vmx_merge_msr_bitmaps(msr, MSR_TYPE_RW) 665 666 static void nested_vmx_merge_pmu_msr_bitmaps(struct kvm_vcpu *vcpu, 667 unsigned long *msr_bitmap_l1, 668 unsigned long *msr_bitmap_l0) 669 { 670 struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); 671 struct vcpu_vmx *vmx = to_vmx(vcpu); 672 int i; 673 674 /* 675 * Skip the merges if the vCPU doesn't have a mediated PMU MSR, i.e. if 676 * none of the MSRs can possibly be passed through to L1. 677 */ 678 if (!kvm_vcpu_has_mediated_pmu(vcpu)) 679 return; 680 681 for (i = 0; i < pmu->nr_arch_gp_counters; i++) { 682 nested_vmx_merge_msr_bitmaps_rw(MSR_IA32_PERFCTR0 + i); 683 nested_vmx_merge_msr_bitmaps_rw(MSR_IA32_PMC0 + i); 684 } 685 686 for (i = 0; i < pmu->nr_arch_fixed_counters; i++) 687 nested_vmx_merge_msr_bitmaps_rw(MSR_CORE_PERF_FIXED_CTR0 + i); 688 689 nested_vmx_merge_msr_bitmaps_rw(MSR_CORE_PERF_GLOBAL_CTRL); 690 nested_vmx_merge_msr_bitmaps_read(MSR_CORE_PERF_GLOBAL_STATUS); 691 nested_vmx_merge_msr_bitmaps_write(MSR_CORE_PERF_GLOBAL_OVF_CTRL); 692 } 693 694 /* 695 * Merge L0's and L1's MSR bitmap, return false to indicate that 696 * we do not use the hardware. 697 */ 698 static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, 699 struct vmcs12 *vmcs12) 700 { 701 struct vcpu_vmx *vmx = to_vmx(vcpu); 702 int msr; 703 unsigned long *msr_bitmap_l1; 704 unsigned long *msr_bitmap_l0 = vmx->nested.vmcs02.msr_bitmap; 705 struct kvm_host_map map; 706 707 /* Nothing to do if the MSR bitmap is not in use. */ 708 if (!cpu_has_vmx_msr_bitmap() || 709 !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) 710 return false; 711 712 /* 713 * MSR bitmap update can be skipped when: 714 * - MSR bitmap for L1 hasn't changed. 715 * - Nested hypervisor (L1) is attempting to launch the same L2 as 716 * before. 
717 * - Nested hypervisor (L1) has enabled 'Enlightened MSR Bitmap' feature 718 * and tells KVM (L0) there were no changes in MSR bitmap for L2. 719 */ 720 if (!vmx->nested.force_msr_bitmap_recalc) { 721 struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); 722 723 if (evmcs && evmcs->hv_enlightenments_control.msr_bitmap && 724 evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP) 725 return true; 726 } 727 728 if (kvm_vcpu_map_readonly(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), &map)) 729 return false; 730 731 msr_bitmap_l1 = (unsigned long *)map.hva; 732 733 /* 734 * To keep the control flow simple, pay eight 8-byte writes (sixteen 735 * 4-byte writes on 32-bit systems) up front to enable intercepts for 736 * the x2APIC MSR range and selectively toggle those relevant to L2. 737 */ 738 enable_x2apic_msr_intercepts(msr_bitmap_l0); 739 740 if (nested_cpu_has_virt_x2apic_mode(vmcs12)) { 741 if (nested_cpu_has_apic_reg_virt(vmcs12)) { 742 /* 743 * L0 need not intercept reads for MSRs between 0x800 744 * and 0x8ff, it just lets the processor take the value 745 * from the virtual-APIC page; take those 256 bits 746 * directly from the L1 bitmap. 747 */ 748 for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { 749 unsigned word = msr / BITS_PER_LONG; 750 751 msr_bitmap_l0[word] = msr_bitmap_l1[word]; 752 } 753 } 754 755 nested_vmx_disable_intercept_for_x2apic_msr( 756 msr_bitmap_l1, msr_bitmap_l0, 757 X2APIC_MSR(APIC_TASKPRI), 758 MSR_TYPE_R | MSR_TYPE_W); 759 760 if (nested_cpu_has_vid(vmcs12)) { 761 nested_vmx_disable_intercept_for_x2apic_msr( 762 msr_bitmap_l1, msr_bitmap_l0, 763 X2APIC_MSR(APIC_EOI), 764 MSR_TYPE_W); 765 nested_vmx_disable_intercept_for_x2apic_msr( 766 msr_bitmap_l1, msr_bitmap_l0, 767 X2APIC_MSR(APIC_SELF_IPI), 768 MSR_TYPE_W); 769 } 770 } 771 772 /* 773 * Always check vmcs01's bitmap to honor userspace MSR filters and any 774 * other runtime changes to vmcs01's bitmap, e.g. dynamic pass-through. 
775 */ 776 #ifdef CONFIG_X86_64 777 nested_vmx_merge_msr_bitmaps_rw(MSR_FS_BASE); 778 nested_vmx_merge_msr_bitmaps_rw(MSR_GS_BASE); 779 nested_vmx_merge_msr_bitmaps_rw(MSR_KERNEL_GS_BASE); 780 #endif 781 nested_vmx_merge_msr_bitmaps_rw(MSR_IA32_SPEC_CTRL); 782 nested_vmx_merge_msr_bitmaps_write(MSR_IA32_PRED_CMD); 783 nested_vmx_merge_msr_bitmaps_write(MSR_IA32_FLUSH_CMD); 784 785 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 786 MSR_IA32_APERF, MSR_TYPE_R); 787 788 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 789 MSR_IA32_MPERF, MSR_TYPE_R); 790 791 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 792 MSR_IA32_U_CET, MSR_TYPE_RW); 793 794 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 795 MSR_IA32_S_CET, MSR_TYPE_RW); 796 797 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 798 MSR_IA32_PL0_SSP, MSR_TYPE_RW); 799 800 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 801 MSR_IA32_PL1_SSP, MSR_TYPE_RW); 802 803 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 804 MSR_IA32_PL2_SSP, MSR_TYPE_RW); 805 806 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 807 MSR_IA32_PL3_SSP, MSR_TYPE_RW); 808 809 nested_vmx_merge_pmu_msr_bitmaps(vcpu, msr_bitmap_l1, msr_bitmap_l0); 810 811 kvm_vcpu_unmap(vcpu, &map); 812 813 vmx->nested.force_msr_bitmap_recalc = false; 814 815 return true; 816 } 817 818 static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu, 819 struct vmcs12 *vmcs12) 820 { 821 struct vcpu_vmx *vmx = to_vmx(vcpu); 822 struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache; 823 824 if (!nested_cpu_has_shadow_vmcs(vmcs12) || 825 vmcs12->vmcs_link_pointer == INVALID_GPA) 826 return; 827 828 if (ghc->gpa != vmcs12->vmcs_link_pointer && 829 kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, 830 vmcs12->vmcs_link_pointer, VMCS12_SIZE)) 831 return; 832 833 kvm_read_guest_cached(vcpu->kvm, ghc, 
get_shadow_vmcs12(vcpu), 834 VMCS12_SIZE); 835 } 836 837 static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu, 838 struct vmcs12 *vmcs12) 839 { 840 struct vcpu_vmx *vmx = to_vmx(vcpu); 841 struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache; 842 843 if (!nested_cpu_has_shadow_vmcs(vmcs12) || 844 vmcs12->vmcs_link_pointer == INVALID_GPA) 845 return; 846 847 if (ghc->gpa != vmcs12->vmcs_link_pointer && 848 kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, 849 vmcs12->vmcs_link_pointer, VMCS12_SIZE)) 850 return; 851 852 kvm_write_guest_cached(vcpu->kvm, ghc, get_shadow_vmcs12(vcpu), 853 VMCS12_SIZE); 854 } 855 856 /* 857 * In nested virtualization, check if L1 has set 858 * VM_EXIT_ACK_INTR_ON_EXIT 859 */ 860 static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu) 861 { 862 return get_vmcs12(vcpu)->vm_exit_controls & 863 VM_EXIT_ACK_INTR_ON_EXIT; 864 } 865 866 static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu, 867 struct vmcs12 *vmcs12) 868 { 869 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) && 870 CC(!page_address_valid(vcpu, vmcs12->apic_access_addr))) 871 return -EINVAL; 872 else 873 return 0; 874 } 875 876 static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, 877 struct vmcs12 *vmcs12) 878 { 879 if (!nested_cpu_has_virt_x2apic_mode(vmcs12) && 880 !nested_cpu_has_apic_reg_virt(vmcs12) && 881 !nested_cpu_has_vid(vmcs12) && 882 !nested_cpu_has_posted_intr(vmcs12)) 883 return 0; 884 885 /* 886 * If virtualize x2apic mode is enabled, 887 * virtualize apic access must be disabled. 888 */ 889 if (CC(nested_cpu_has_virt_x2apic_mode(vmcs12) && 890 nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))) 891 return -EINVAL; 892 893 /* 894 * If virtual interrupt delivery is enabled, 895 * we must exit on external interrupts. 
896 */ 897 if (CC(nested_cpu_has_vid(vmcs12) && !nested_exit_on_intr(vcpu))) 898 return -EINVAL; 899 900 /* 901 * bits 15:8 should be zero in posted_intr_nv, 902 * the descriptor address has been already checked 903 * in nested_get_vmcs12_pages. 904 * 905 * bits 5:0 of posted_intr_desc_addr should be zero. 906 */ 907 if (nested_cpu_has_posted_intr(vmcs12) && 908 (CC(!nested_cpu_has_vid(vmcs12)) || 909 CC(!nested_exit_intr_ack_set(vcpu)) || 910 CC((vmcs12->posted_intr_nv & 0xff00)) || 911 CC(!kvm_vcpu_is_legal_aligned_gpa(vcpu, vmcs12->posted_intr_desc_addr, 64)))) 912 return -EINVAL; 913 914 /* tpr shadow is needed by all apicv features. */ 915 if (CC(!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))) 916 return -EINVAL; 917 918 return 0; 919 } 920 921 static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu) 922 { 923 struct vcpu_vmx *vmx = to_vmx(vcpu); 924 u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low, 925 vmx->nested.msrs.misc_high); 926 927 return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER; 928 } 929 930 static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, 931 u32 count, u64 addr) 932 { 933 if (count == 0) 934 return 0; 935 936 /* 937 * Exceeding the limit results in architecturally _undefined_ behavior, 938 * i.e. KVM is allowed to do literally anything in response to a bad 939 * limit. Immediately generate a consistency check so that code that 940 * consumes the count doesn't need to worry about extreme edge cases. 
941 */ 942 if (count > nested_vmx_max_atomic_switch_msrs(vcpu)) 943 return -EINVAL; 944 945 if (!kvm_vcpu_is_legal_aligned_gpa(vcpu, addr, 16) || 946 !kvm_vcpu_is_legal_gpa(vcpu, (addr + count * sizeof(struct vmx_msr_entry) - 1))) 947 return -EINVAL; 948 949 return 0; 950 } 951 952 static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu, 953 struct vmcs12 *vmcs12) 954 { 955 if (CC(nested_vmx_check_msr_switch(vcpu, 956 vmcs12->vm_exit_msr_load_count, 957 vmcs12->vm_exit_msr_load_addr)) || 958 CC(nested_vmx_check_msr_switch(vcpu, 959 vmcs12->vm_exit_msr_store_count, 960 vmcs12->vm_exit_msr_store_addr))) 961 return -EINVAL; 962 963 return 0; 964 } 965 966 static int nested_vmx_check_entry_msr_switch_controls(struct kvm_vcpu *vcpu, 967 struct vmcs12 *vmcs12) 968 { 969 if (CC(nested_vmx_check_msr_switch(vcpu, 970 vmcs12->vm_entry_msr_load_count, 971 vmcs12->vm_entry_msr_load_addr))) 972 return -EINVAL; 973 974 return 0; 975 } 976 977 static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu, 978 struct vmcs12 *vmcs12) 979 { 980 if (!nested_cpu_has_pml(vmcs12)) 981 return 0; 982 983 if (CC(!nested_cpu_has_ept(vmcs12)) || 984 CC(!page_address_valid(vcpu, vmcs12->pml_address))) 985 return -EINVAL; 986 987 return 0; 988 } 989 990 static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu, 991 struct vmcs12 *vmcs12) 992 { 993 if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) && 994 !nested_cpu_has_ept(vmcs12))) 995 return -EINVAL; 996 return 0; 997 } 998 999 static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu, 1000 struct vmcs12 *vmcs12) 1001 { 1002 if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) && 1003 !nested_cpu_has_ept(vmcs12))) 1004 return -EINVAL; 1005 return 0; 1006 } 1007 1008 static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu, 1009 struct vmcs12 *vmcs12) 1010 { 1011 if (!nested_cpu_has_shadow_vmcs(vmcs12)) 1012 return 0; 1013 1014 if 
(CC(!page_address_valid(vcpu, vmcs12->vmread_bitmap)) || 1015 CC(!page_address_valid(vcpu, vmcs12->vmwrite_bitmap))) 1016 return -EINVAL; 1017 1018 return 0; 1019 } 1020 1021 static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu, 1022 struct vmx_msr_entry *e) 1023 { 1024 /* x2APIC MSR accesses are not allowed */ 1025 if (CC(vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8)) 1026 return -EINVAL; 1027 if (CC(e->index == MSR_IA32_UCODE_WRITE) || /* SDM Table 35-2 */ 1028 CC(e->index == MSR_IA32_UCODE_REV)) 1029 return -EINVAL; 1030 if (CC(e->reserved != 0)) 1031 return -EINVAL; 1032 return 0; 1033 } 1034 1035 static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu, 1036 struct vmx_msr_entry *e) 1037 { 1038 if (CC(e->index == MSR_FS_BASE) || 1039 CC(e->index == MSR_GS_BASE) || 1040 CC(e->index == MSR_IA32_SMM_MONITOR_CTL) || /* SMM is not supported */ 1041 nested_vmx_msr_check_common(vcpu, e)) 1042 return -EINVAL; 1043 return 0; 1044 } 1045 1046 static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu, 1047 struct vmx_msr_entry *e) 1048 { 1049 if (CC(e->index == MSR_IA32_SMBASE) || /* SMM is not supported */ 1050 nested_vmx_msr_check_common(vcpu, e)) 1051 return -EINVAL; 1052 return 0; 1053 } 1054 1055 /* 1056 * Load guest's/host's msr at nested entry/exit. 1057 * return 0 for success, entry index for failure. 1058 * 1059 * One of the failure modes for MSR load/store is when a list exceeds the 1060 * virtual hardware's capacity. To maintain compatibility with hardware inasmuch 1061 * as possible, process all valid entries before failing rather than precheck 1062 * for a capacity violation. 
1063 */ 1064 static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) 1065 { 1066 u32 i; 1067 struct vmx_msr_entry e; 1068 u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu); 1069 1070 for (i = 0; i < count; i++) { 1071 if (WARN_ON_ONCE(i >= max_msr_list_size)) 1072 goto fail; 1073 1074 if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e), 1075 &e, sizeof(e))) { 1076 pr_debug_ratelimited( 1077 "%s cannot read MSR entry (%u, 0x%08llx)\n", 1078 __func__, i, gpa + i * sizeof(e)); 1079 goto fail; 1080 } 1081 if (nested_vmx_load_msr_check(vcpu, &e)) { 1082 pr_debug_ratelimited( 1083 "%s check failed (%u, 0x%x, 0x%x)\n", 1084 __func__, i, e.index, e.reserved); 1085 goto fail; 1086 } 1087 if (kvm_emulate_msr_write(vcpu, e.index, e.value)) { 1088 pr_debug_ratelimited( 1089 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", 1090 __func__, i, e.index, e.value); 1091 goto fail; 1092 } 1093 } 1094 return 0; 1095 fail: 1096 /* Note, max_msr_list_size is at most 4096, i.e. this can't wrap. */ 1097 return i + 1; 1098 } 1099 1100 static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu, 1101 u32 msr_index, 1102 u64 *data) 1103 { 1104 struct vcpu_vmx *vmx = to_vmx(vcpu); 1105 1106 /* 1107 * If the L0 hypervisor stored a more accurate value for the TSC that 1108 * does not include the time taken for emulation of the L2->L1 1109 * VM-exit in L0, use the more accurate value. 
1110 */ 1111 if (msr_index == MSR_IA32_TSC && vmx->nested.tsc_autostore_slot >= 0) { 1112 int slot = vmx->nested.tsc_autostore_slot; 1113 u64 host_tsc = vmx->msr_autostore.val[slot].value; 1114 1115 *data = kvm_read_l1_tsc(vcpu, host_tsc); 1116 return true; 1117 } 1118 1119 if (kvm_emulate_msr_read(vcpu, msr_index, data)) { 1120 pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__, 1121 msr_index); 1122 return false; 1123 } 1124 return true; 1125 } 1126 1127 static bool read_and_check_msr_entry(struct kvm_vcpu *vcpu, u64 gpa, int i, 1128 struct vmx_msr_entry *e) 1129 { 1130 if (kvm_vcpu_read_guest(vcpu, 1131 gpa + i * sizeof(*e), 1132 e, 2 * sizeof(u32))) { 1133 pr_debug_ratelimited( 1134 "%s cannot read MSR entry (%u, 0x%08llx)\n", 1135 __func__, i, gpa + i * sizeof(*e)); 1136 return false; 1137 } 1138 if (nested_vmx_store_msr_check(vcpu, e)) { 1139 pr_debug_ratelimited( 1140 "%s check failed (%u, 0x%x, 0x%x)\n", 1141 __func__, i, e->index, e->reserved); 1142 return false; 1143 } 1144 return true; 1145 } 1146 1147 static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) 1148 { 1149 u64 data; 1150 u32 i; 1151 struct vmx_msr_entry e; 1152 u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu); 1153 1154 for (i = 0; i < count; i++) { 1155 if (WARN_ON_ONCE(i >= max_msr_list_size)) 1156 return -EINVAL; 1157 1158 if (!read_and_check_msr_entry(vcpu, gpa, i, &e)) 1159 return -EINVAL; 1160 1161 if (!nested_vmx_get_vmexit_msr_value(vcpu, e.index, &data)) 1162 return -EINVAL; 1163 1164 if (kvm_vcpu_write_guest(vcpu, 1165 gpa + i * sizeof(e) + 1166 offsetof(struct vmx_msr_entry, value), 1167 &data, sizeof(data))) { 1168 pr_debug_ratelimited( 1169 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", 1170 __func__, i, e.index, data); 1171 return -EINVAL; 1172 } 1173 } 1174 return 0; 1175 } 1176 1177 static bool nested_msr_store_list_has_msr(struct kvm_vcpu *vcpu, u32 msr_index) 1178 { 1179 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 1180 u32 count = 
vmcs12->vm_exit_msr_store_count; 1181 u64 gpa = vmcs12->vm_exit_msr_store_addr; 1182 struct vmx_msr_entry e; 1183 u32 i; 1184 1185 for (i = 0; i < count; i++) { 1186 if (!read_and_check_msr_entry(vcpu, gpa, i, &e)) 1187 return false; 1188 1189 if (e.index == msr_index) 1190 return true; 1191 } 1192 return false; 1193 } 1194 1195 /* 1196 * Load guest's/host's cr3 at nested entry/exit. @nested_ept is true if we are 1197 * emulating VM-Entry into a guest with EPT enabled. On failure, the expected 1198 * Exit Qualification (for a VM-Entry consistency check VM-Exit) is assigned to 1199 * @entry_failure_code. 1200 */ 1201 static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, 1202 bool nested_ept, bool reload_pdptrs, 1203 enum vm_entry_failure_code *entry_failure_code) 1204 { 1205 if (CC(!kvm_vcpu_is_legal_cr3(vcpu, cr3))) { 1206 *entry_failure_code = ENTRY_FAIL_DEFAULT; 1207 return -EINVAL; 1208 } 1209 1210 /* 1211 * If PAE paging and EPT are both on, CR3 is not used by the CPU and 1212 * must not be dereferenced. 1213 */ 1214 if (reload_pdptrs && !nested_ept && is_pae_paging(vcpu) && 1215 CC(!load_pdptrs(vcpu, cr3))) { 1216 *entry_failure_code = ENTRY_FAIL_PDPTE; 1217 return -EINVAL; 1218 } 1219 1220 vcpu->arch.cr3 = cr3; 1221 kvm_register_mark_dirty(vcpu, VCPU_REG_CR3); 1222 1223 /* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */ 1224 kvm_init_mmu(vcpu); 1225 1226 if (!nested_ept) 1227 kvm_mmu_new_pgd(vcpu, cr3); 1228 1229 return 0; 1230 } 1231 1232 /* 1233 * Returns if KVM is able to config CPU to tag TLB entries 1234 * populated by L2 differently than TLB entries populated 1235 * by L1. 1236 * 1237 * If L0 uses EPT, L1 and L2 run with different EPTP because 1238 * guest_mode is part of kvm_mmu_page_role. Thus, TLB entries 1239 * are tagged with different EPTP. 
1240 * 1241 * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged 1242 * with different VPID (L1 entries are tagged with vmx->vpid 1243 * while L2 entries are tagged with vmx->nested.vpid02). 1244 */ 1245 static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu) 1246 { 1247 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 1248 1249 return enable_ept || 1250 (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02); 1251 } 1252 1253 static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu, 1254 struct vmcs12 *vmcs12, 1255 bool is_vmenter) 1256 { 1257 struct vcpu_vmx *vmx = to_vmx(vcpu); 1258 1259 /* Handle pending Hyper-V TLB flush requests */ 1260 kvm_hv_nested_transtion_tlb_flush(vcpu, enable_ept); 1261 1262 /* 1263 * If VPID is disabled, then guest TLB accesses use VPID=0, i.e. the 1264 * same VPID as the host, and so architecturally, linear and combined 1265 * mappings for VPID=0 must be flushed at VM-Enter and VM-Exit. KVM 1266 * emulates L2 sharing L1's VPID=0 by using vpid01 while running L2, 1267 * and so KVM must also emulate TLB flush of VPID=0, i.e. vpid01. This 1268 * is required if VPID is disabled in KVM, as a TLB flush (there are no 1269 * VPIDs) still occurs from L1's perspective, and KVM may need to 1270 * synchronize the MMU in response to the guest TLB flush. 1271 * 1272 * Note, using TLB_FLUSH_GUEST is correct even if nested EPT is in use. 1273 * EPT is a special snowflake, as guest-physical mappings aren't 1274 * flushed on VPID invalidations, including VM-Enter or VM-Exit with 1275 * VPID disabled. As a result, KVM _never_ needs to sync nEPT 1276 * entries on VM-Enter because L1 can't rely on VM-Enter to flush 1277 * those mappings. 1278 */ 1279 if (!nested_cpu_has_vpid(vmcs12)) { 1280 kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); 1281 return; 1282 } 1283 1284 /* L2 should never have a VPID if VPID is disabled. */ 1285 WARN_ON(!enable_vpid); 1286 1287 /* 1288 * VPID is enabled and in use by vmcs12. 
If vpid12 is changing, then 1289 * emulate a guest TLB flush as KVM does not track vpid12 history nor 1290 * is the VPID incorporated into the MMU context. I.e. KVM must assume 1291 * that the new vpid12 has never been used and thus represents a new 1292 * guest ASID that cannot have entries in the TLB. 1293 */ 1294 if (is_vmenter && vmcs12->virtual_processor_id != vmx->nested.last_vpid) { 1295 vmx->nested.last_vpid = vmcs12->virtual_processor_id; 1296 kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); 1297 return; 1298 } 1299 1300 /* 1301 * If VPID is enabled, used by vmc12, and vpid12 is not changing but 1302 * does not have a unique TLB tag (ASID), i.e. EPT is disabled and 1303 * KVM was unable to allocate a VPID for L2, flush the current context 1304 * as the effective ASID is common to both L1 and L2. 1305 */ 1306 if (!nested_has_guest_tlb_tag(vcpu)) 1307 kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); 1308 } 1309 1310 static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask) 1311 { 1312 superset &= mask; 1313 subset &= mask; 1314 1315 return (superset | subset) == superset; 1316 } 1317 1318 static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data) 1319 { 1320 const u64 feature_bits = VMX_BASIC_DUAL_MONITOR_TREATMENT | 1321 VMX_BASIC_INOUT | 1322 VMX_BASIC_TRUE_CTLS | 1323 VMX_BASIC_NO_HW_ERROR_CODE_CC; 1324 1325 const u64 reserved_bits = GENMASK_ULL(63, 57) | 1326 GENMASK_ULL(47, 45) | 1327 BIT_ULL(31); 1328 1329 u64 vmx_basic = vmcs_config.nested.basic; 1330 1331 BUILD_BUG_ON(feature_bits & reserved_bits); 1332 1333 /* 1334 * Except for 32BIT_PHYS_ADDR_ONLY, which is an anti-feature bit (has 1335 * inverted polarity), the incoming value must not set feature bits or 1336 * reserved bits that aren't allowed/supported by KVM. Fields, i.e. 1337 * multi-bit values, are explicitly checked below. 
1338 */ 1339 if (!is_bitwise_subset(vmx_basic, data, feature_bits | reserved_bits)) 1340 return -EINVAL; 1341 1342 /* 1343 * KVM does not emulate a version of VMX that constrains physical 1344 * addresses of VMX structures (e.g. VMCS) to 32-bits. 1345 */ 1346 if (data & VMX_BASIC_32BIT_PHYS_ADDR_ONLY) 1347 return -EINVAL; 1348 1349 if (vmx_basic_vmcs_revision_id(vmx_basic) != 1350 vmx_basic_vmcs_revision_id(data)) 1351 return -EINVAL; 1352 1353 if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data)) 1354 return -EINVAL; 1355 1356 vmx->nested.msrs.basic = data; 1357 return 0; 1358 } 1359 1360 static void vmx_get_control_msr(struct nested_vmx_msrs *msrs, u32 msr_index, 1361 u32 **low, u32 **high) 1362 { 1363 switch (msr_index) { 1364 case MSR_IA32_VMX_TRUE_PINBASED_CTLS: 1365 *low = &msrs->pinbased_ctls_low; 1366 *high = &msrs->pinbased_ctls_high; 1367 break; 1368 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: 1369 *low = &msrs->procbased_ctls_low; 1370 *high = &msrs->procbased_ctls_high; 1371 break; 1372 case MSR_IA32_VMX_TRUE_EXIT_CTLS: 1373 *low = &msrs->exit_ctls_low; 1374 *high = &msrs->exit_ctls_high; 1375 break; 1376 case MSR_IA32_VMX_TRUE_ENTRY_CTLS: 1377 *low = &msrs->entry_ctls_low; 1378 *high = &msrs->entry_ctls_high; 1379 break; 1380 case MSR_IA32_VMX_PROCBASED_CTLS2: 1381 *low = &msrs->secondary_ctls_low; 1382 *high = &msrs->secondary_ctls_high; 1383 break; 1384 default: 1385 BUG(); 1386 } 1387 } 1388 1389 static int 1390 vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) 1391 { 1392 u32 *lowp, *highp; 1393 u64 supported; 1394 1395 vmx_get_control_msr(&vmcs_config.nested, msr_index, &lowp, &highp); 1396 1397 supported = vmx_control_msr(*lowp, *highp); 1398 1399 /* Check must-be-1 bits are still 1. */ 1400 if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0))) 1401 return -EINVAL; 1402 1403 /* Check must-be-0 bits are still 0. 
*/ 1404 if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32))) 1405 return -EINVAL; 1406 1407 vmx_get_control_msr(&vmx->nested.msrs, msr_index, &lowp, &highp); 1408 *lowp = data; 1409 *highp = data >> 32; 1410 return 0; 1411 } 1412 1413 static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data) 1414 { 1415 const u64 feature_bits = VMX_MISC_SAVE_EFER_LMA | 1416 VMX_MISC_ACTIVITY_HLT | 1417 VMX_MISC_ACTIVITY_SHUTDOWN | 1418 VMX_MISC_ACTIVITY_WAIT_SIPI | 1419 VMX_MISC_INTEL_PT | 1420 VMX_MISC_RDMSR_IN_SMM | 1421 VMX_MISC_VMWRITE_SHADOW_RO_FIELDS | 1422 VMX_MISC_VMXOFF_BLOCK_SMI | 1423 VMX_MISC_ZERO_LEN_INS; 1424 1425 const u64 reserved_bits = BIT_ULL(31) | GENMASK_ULL(13, 9); 1426 1427 u64 vmx_misc = vmx_control_msr(vmcs_config.nested.misc_low, 1428 vmcs_config.nested.misc_high); 1429 1430 BUILD_BUG_ON(feature_bits & reserved_bits); 1431 1432 /* 1433 * The incoming value must not set feature bits or reserved bits that 1434 * aren't allowed/supported by KVM. Fields, i.e. multi-bit values, are 1435 * explicitly checked below. 
1436 */ 1437 if (!is_bitwise_subset(vmx_misc, data, feature_bits | reserved_bits)) 1438 return -EINVAL; 1439 1440 if ((vmx->nested.msrs.pinbased_ctls_high & 1441 PIN_BASED_VMX_PREEMPTION_TIMER) && 1442 vmx_misc_preemption_timer_rate(data) != 1443 vmx_misc_preemption_timer_rate(vmx_misc)) 1444 return -EINVAL; 1445 1446 if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc)) 1447 return -EINVAL; 1448 1449 if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc)) 1450 return -EINVAL; 1451 1452 if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc)) 1453 return -EINVAL; 1454 1455 vmx->nested.msrs.misc_low = data; 1456 vmx->nested.msrs.misc_high = data >> 32; 1457 1458 return 0; 1459 } 1460 1461 static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data) 1462 { 1463 u64 vmx_ept_vpid_cap = vmx_control_msr(vmcs_config.nested.ept_caps, 1464 vmcs_config.nested.vpid_caps); 1465 1466 /* Every bit is either reserved or a feature bit. */ 1467 if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL)) 1468 return -EINVAL; 1469 1470 vmx->nested.msrs.ept_caps = data; 1471 vmx->nested.msrs.vpid_caps = data >> 32; 1472 return 0; 1473 } 1474 1475 static u64 *vmx_get_fixed0_msr(struct nested_vmx_msrs *msrs, u32 msr_index) 1476 { 1477 switch (msr_index) { 1478 case MSR_IA32_VMX_CR0_FIXED0: 1479 return &msrs->cr0_fixed0; 1480 case MSR_IA32_VMX_CR4_FIXED0: 1481 return &msrs->cr4_fixed0; 1482 default: 1483 BUG(); 1484 } 1485 } 1486 1487 static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) 1488 { 1489 const u64 *msr = vmx_get_fixed0_msr(&vmcs_config.nested, msr_index); 1490 1491 /* 1492 * 1 bits (which indicates bits which "must-be-1" during VMX operation) 1493 * must be 1 in the restored value. 1494 */ 1495 if (!is_bitwise_subset(data, *msr, -1ULL)) 1496 return -EINVAL; 1497 1498 *vmx_get_fixed0_msr(&vmx->nested.msrs, msr_index) = data; 1499 return 0; 1500 } 1501 1502 /* 1503 * Called when userspace is restoring VMX MSRs. 
1504 * 1505 * Returns 0 on success, non-0 otherwise. 1506 */ 1507 int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) 1508 { 1509 struct vcpu_vmx *vmx = to_vmx(vcpu); 1510 1511 /* 1512 * Don't allow changes to the VMX capability MSRs while the vCPU 1513 * is in VMX operation. 1514 */ 1515 if (vmx->nested.vmxon) 1516 return -EBUSY; 1517 1518 switch (msr_index) { 1519 case MSR_IA32_VMX_BASIC: 1520 return vmx_restore_vmx_basic(vmx, data); 1521 case MSR_IA32_VMX_PINBASED_CTLS: 1522 case MSR_IA32_VMX_PROCBASED_CTLS: 1523 case MSR_IA32_VMX_EXIT_CTLS: 1524 case MSR_IA32_VMX_ENTRY_CTLS: 1525 /* 1526 * The "non-true" VMX capability MSRs are generated from the 1527 * "true" MSRs, so we do not support restoring them directly. 1528 * 1529 * If userspace wants to emulate VMX_BASIC[55]=0, userspace 1530 * should restore the "true" MSRs with the must-be-1 bits 1531 * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND 1532 * DEFAULT SETTINGS". 1533 */ 1534 return -EINVAL; 1535 case MSR_IA32_VMX_TRUE_PINBASED_CTLS: 1536 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: 1537 case MSR_IA32_VMX_TRUE_EXIT_CTLS: 1538 case MSR_IA32_VMX_TRUE_ENTRY_CTLS: 1539 case MSR_IA32_VMX_PROCBASED_CTLS2: 1540 return vmx_restore_control_msr(vmx, msr_index, data); 1541 case MSR_IA32_VMX_MISC: 1542 return vmx_restore_vmx_misc(vmx, data); 1543 case MSR_IA32_VMX_CR0_FIXED0: 1544 case MSR_IA32_VMX_CR4_FIXED0: 1545 return vmx_restore_fixed0_msr(vmx, msr_index, data); 1546 case MSR_IA32_VMX_CR0_FIXED1: 1547 case MSR_IA32_VMX_CR4_FIXED1: 1548 /* 1549 * These MSRs are generated based on the vCPU's CPUID, so we 1550 * do not support restoring them directly. 
1551 */ 1552 return -EINVAL; 1553 case MSR_IA32_VMX_EPT_VPID_CAP: 1554 return vmx_restore_vmx_ept_vpid_cap(vmx, data); 1555 case MSR_IA32_VMX_VMCS_ENUM: 1556 vmx->nested.msrs.vmcs_enum = data; 1557 return 0; 1558 case MSR_IA32_VMX_VMFUNC: 1559 if (data & ~vmcs_config.nested.vmfunc_controls) 1560 return -EINVAL; 1561 vmx->nested.msrs.vmfunc_controls = data; 1562 return 0; 1563 default: 1564 /* 1565 * The rest of the VMX capability MSRs do not support restore. 1566 */ 1567 return -EINVAL; 1568 } 1569 } 1570 1571 /* Returns 0 on success, non-0 otherwise. */ 1572 int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata) 1573 { 1574 switch (msr_index) { 1575 case MSR_IA32_VMX_BASIC: 1576 *pdata = msrs->basic; 1577 break; 1578 case MSR_IA32_VMX_TRUE_PINBASED_CTLS: 1579 case MSR_IA32_VMX_PINBASED_CTLS: 1580 *pdata = vmx_control_msr( 1581 msrs->pinbased_ctls_low, 1582 msrs->pinbased_ctls_high); 1583 if (msr_index == MSR_IA32_VMX_PINBASED_CTLS) 1584 *pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 1585 break; 1586 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: 1587 case MSR_IA32_VMX_PROCBASED_CTLS: 1588 *pdata = vmx_control_msr( 1589 msrs->procbased_ctls_low, 1590 msrs->procbased_ctls_high); 1591 if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS) 1592 *pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 1593 break; 1594 case MSR_IA32_VMX_TRUE_EXIT_CTLS: 1595 case MSR_IA32_VMX_EXIT_CTLS: 1596 *pdata = vmx_control_msr( 1597 msrs->exit_ctls_low, 1598 msrs->exit_ctls_high); 1599 if (msr_index == MSR_IA32_VMX_EXIT_CTLS) 1600 *pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; 1601 break; 1602 case MSR_IA32_VMX_TRUE_ENTRY_CTLS: 1603 case MSR_IA32_VMX_ENTRY_CTLS: 1604 *pdata = vmx_control_msr( 1605 msrs->entry_ctls_low, 1606 msrs->entry_ctls_high); 1607 if (msr_index == MSR_IA32_VMX_ENTRY_CTLS) 1608 *pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; 1609 break; 1610 case MSR_IA32_VMX_MISC: 1611 *pdata = vmx_control_msr( 1612 msrs->misc_low, 1613 msrs->misc_high); 1614 break; 1615 case 
MSR_IA32_VMX_CR0_FIXED0: 1616 *pdata = msrs->cr0_fixed0; 1617 break; 1618 case MSR_IA32_VMX_CR0_FIXED1: 1619 *pdata = msrs->cr0_fixed1; 1620 break; 1621 case MSR_IA32_VMX_CR4_FIXED0: 1622 *pdata = msrs->cr4_fixed0; 1623 break; 1624 case MSR_IA32_VMX_CR4_FIXED1: 1625 *pdata = msrs->cr4_fixed1; 1626 break; 1627 case MSR_IA32_VMX_VMCS_ENUM: 1628 *pdata = msrs->vmcs_enum; 1629 break; 1630 case MSR_IA32_VMX_PROCBASED_CTLS2: 1631 *pdata = vmx_control_msr( 1632 msrs->secondary_ctls_low, 1633 msrs->secondary_ctls_high); 1634 break; 1635 case MSR_IA32_VMX_EPT_VPID_CAP: 1636 *pdata = msrs->ept_caps | 1637 ((u64)msrs->vpid_caps << 32); 1638 break; 1639 case MSR_IA32_VMX_VMFUNC: 1640 *pdata = msrs->vmfunc_controls; 1641 break; 1642 default: 1643 return 1; 1644 } 1645 1646 return 0; 1647 } 1648 1649 /* 1650 * Copy the writable VMCS shadow fields back to the VMCS12, in case they have 1651 * been modified by the L1 guest. Note, "writable" in this context means 1652 * "writable by the guest", i.e. tagged SHADOW_FIELD_RW; the set of 1653 * fields tagged SHADOW_FIELD_RO may or may not align with the "read-only" 1654 * VM-exit information fields (which are actually writable if the vCPU is 1655 * configured to support "VMWRITE to any supported field in the VMCS"). 
1656 */ 1657 static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) 1658 { 1659 struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; 1660 struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu); 1661 struct shadow_vmcs_field field; 1662 unsigned long val; 1663 int i; 1664 1665 if (WARN_ON(!shadow_vmcs)) 1666 return; 1667 1668 preempt_disable(); 1669 1670 vmcs_load(shadow_vmcs); 1671 1672 for (i = 0; i < max_shadow_read_write_fields; i++) { 1673 field = shadow_read_write_fields[i]; 1674 val = __vmcs_readl(field.encoding); 1675 vmcs12_write_any(vmcs12, field.encoding, field.offset, val); 1676 } 1677 1678 vmcs_clear(shadow_vmcs); 1679 vmcs_load(vmx->loaded_vmcs->vmcs); 1680 1681 preempt_enable(); 1682 } 1683 1684 static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) 1685 { 1686 const struct shadow_vmcs_field *fields[] = { 1687 shadow_read_write_fields, 1688 shadow_read_only_fields 1689 }; 1690 const int max_fields[] = { 1691 max_shadow_read_write_fields, 1692 max_shadow_read_only_fields 1693 }; 1694 struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; 1695 struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu); 1696 struct shadow_vmcs_field field; 1697 unsigned long val; 1698 int i, q; 1699 1700 if (WARN_ON(!shadow_vmcs)) 1701 return; 1702 1703 vmcs_load(shadow_vmcs); 1704 1705 for (q = 0; q < ARRAY_SIZE(fields); q++) { 1706 for (i = 0; i < max_fields[q]; i++) { 1707 field = fields[q][i]; 1708 val = vmcs12_read_any(vmcs12, field.encoding, 1709 field.offset); 1710 __vmcs_writel(field.encoding, val); 1711 } 1712 } 1713 1714 vmcs_clear(shadow_vmcs); 1715 vmcs_load(vmx->loaded_vmcs->vmcs); 1716 } 1717 1718 static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields) 1719 { 1720 #ifdef CONFIG_KVM_HYPERV 1721 struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; 1722 struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); 1723 struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(&vmx->vcpu); 1724 1725 /* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */ 1726 vmcs12->tpr_threshold = 
evmcs->tpr_threshold; 1727 vmcs12->guest_rip = evmcs->guest_rip; 1728 1729 if (unlikely(!(hv_clean_fields & 1730 HV_VMX_ENLIGHTENED_CLEAN_FIELD_ENLIGHTENMENTSCONTROL))) { 1731 hv_vcpu->nested.pa_page_gpa = evmcs->partition_assist_page; 1732 hv_vcpu->nested.vm_id = evmcs->hv_vm_id; 1733 hv_vcpu->nested.vp_id = evmcs->hv_vp_id; 1734 } 1735 1736 if (unlikely(!(hv_clean_fields & 1737 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) { 1738 vmcs12->guest_rsp = evmcs->guest_rsp; 1739 vmcs12->guest_rflags = evmcs->guest_rflags; 1740 vmcs12->guest_interruptibility_info = 1741 evmcs->guest_interruptibility_info; 1742 /* 1743 * Not present in struct vmcs12: 1744 * vmcs12->guest_ssp = evmcs->guest_ssp; 1745 */ 1746 } 1747 1748 if (unlikely(!(hv_clean_fields & 1749 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) { 1750 vmcs12->cpu_based_vm_exec_control = 1751 evmcs->cpu_based_vm_exec_control; 1752 } 1753 1754 if (unlikely(!(hv_clean_fields & 1755 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN))) { 1756 vmcs12->exception_bitmap = evmcs->exception_bitmap; 1757 } 1758 1759 if (unlikely(!(hv_clean_fields & 1760 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) { 1761 vmcs12->vm_entry_controls = evmcs->vm_entry_controls; 1762 } 1763 1764 if (unlikely(!(hv_clean_fields & 1765 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) { 1766 vmcs12->vm_entry_intr_info_field = 1767 evmcs->vm_entry_intr_info_field; 1768 vmcs12->vm_entry_exception_error_code = 1769 evmcs->vm_entry_exception_error_code; 1770 vmcs12->vm_entry_instruction_len = 1771 evmcs->vm_entry_instruction_len; 1772 } 1773 1774 if (unlikely(!(hv_clean_fields & 1775 HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) { 1776 vmcs12->host_ia32_pat = evmcs->host_ia32_pat; 1777 vmcs12->host_ia32_efer = evmcs->host_ia32_efer; 1778 vmcs12->host_cr0 = evmcs->host_cr0; 1779 vmcs12->host_cr3 = evmcs->host_cr3; 1780 vmcs12->host_cr4 = evmcs->host_cr4; 1781 vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp; 1782 vmcs12->host_ia32_sysenter_eip = 
evmcs->host_ia32_sysenter_eip; 1783 vmcs12->host_rip = evmcs->host_rip; 1784 vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs; 1785 vmcs12->host_es_selector = evmcs->host_es_selector; 1786 vmcs12->host_cs_selector = evmcs->host_cs_selector; 1787 vmcs12->host_ss_selector = evmcs->host_ss_selector; 1788 vmcs12->host_ds_selector = evmcs->host_ds_selector; 1789 vmcs12->host_fs_selector = evmcs->host_fs_selector; 1790 vmcs12->host_gs_selector = evmcs->host_gs_selector; 1791 vmcs12->host_tr_selector = evmcs->host_tr_selector; 1792 vmcs12->host_ia32_perf_global_ctrl = evmcs->host_ia32_perf_global_ctrl; 1793 /* 1794 * Not present in struct vmcs12: 1795 * vmcs12->host_ia32_s_cet = evmcs->host_ia32_s_cet; 1796 * vmcs12->host_ssp = evmcs->host_ssp; 1797 * vmcs12->host_ia32_int_ssp_table_addr = evmcs->host_ia32_int_ssp_table_addr; 1798 */ 1799 } 1800 1801 if (unlikely(!(hv_clean_fields & 1802 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1))) { 1803 vmcs12->pin_based_vm_exec_control = 1804 evmcs->pin_based_vm_exec_control; 1805 vmcs12->vm_exit_controls = evmcs->vm_exit_controls; 1806 vmcs12->secondary_vm_exec_control = 1807 evmcs->secondary_vm_exec_control; 1808 } 1809 1810 if (unlikely(!(hv_clean_fields & 1811 HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) { 1812 vmcs12->io_bitmap_a = evmcs->io_bitmap_a; 1813 vmcs12->io_bitmap_b = evmcs->io_bitmap_b; 1814 } 1815 1816 if (unlikely(!(hv_clean_fields & 1817 HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) { 1818 vmcs12->msr_bitmap = evmcs->msr_bitmap; 1819 } 1820 1821 if (unlikely(!(hv_clean_fields & 1822 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) { 1823 vmcs12->guest_es_base = evmcs->guest_es_base; 1824 vmcs12->guest_cs_base = evmcs->guest_cs_base; 1825 vmcs12->guest_ss_base = evmcs->guest_ss_base; 1826 vmcs12->guest_ds_base = evmcs->guest_ds_base; 1827 vmcs12->guest_fs_base = evmcs->guest_fs_base; 1828 vmcs12->guest_gs_base = evmcs->guest_gs_base; 1829 vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base; 1830 
vmcs12->guest_tr_base = evmcs->guest_tr_base; 1831 vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base; 1832 vmcs12->guest_idtr_base = evmcs->guest_idtr_base; 1833 vmcs12->guest_es_limit = evmcs->guest_es_limit; 1834 vmcs12->guest_cs_limit = evmcs->guest_cs_limit; 1835 vmcs12->guest_ss_limit = evmcs->guest_ss_limit; 1836 vmcs12->guest_ds_limit = evmcs->guest_ds_limit; 1837 vmcs12->guest_fs_limit = evmcs->guest_fs_limit; 1838 vmcs12->guest_gs_limit = evmcs->guest_gs_limit; 1839 vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit; 1840 vmcs12->guest_tr_limit = evmcs->guest_tr_limit; 1841 vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit; 1842 vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit; 1843 vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes; 1844 vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes; 1845 vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes; 1846 vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes; 1847 vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes; 1848 vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes; 1849 vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes; 1850 vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes; 1851 vmcs12->guest_es_selector = evmcs->guest_es_selector; 1852 vmcs12->guest_cs_selector = evmcs->guest_cs_selector; 1853 vmcs12->guest_ss_selector = evmcs->guest_ss_selector; 1854 vmcs12->guest_ds_selector = evmcs->guest_ds_selector; 1855 vmcs12->guest_fs_selector = evmcs->guest_fs_selector; 1856 vmcs12->guest_gs_selector = evmcs->guest_gs_selector; 1857 vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector; 1858 vmcs12->guest_tr_selector = evmcs->guest_tr_selector; 1859 } 1860 1861 if (unlikely(!(hv_clean_fields & 1862 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) { 1863 vmcs12->tsc_offset = evmcs->tsc_offset; 1864 vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr; 1865 vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap; 1866 vmcs12->encls_exiting_bitmap = evmcs->encls_exiting_bitmap; 1867 
vmcs12->tsc_multiplier = evmcs->tsc_multiplier; 1868 } 1869 1870 if (unlikely(!(hv_clean_fields & 1871 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) { 1872 vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask; 1873 vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask; 1874 vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow; 1875 vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow; 1876 vmcs12->guest_cr0 = evmcs->guest_cr0; 1877 vmcs12->guest_cr3 = evmcs->guest_cr3; 1878 vmcs12->guest_cr4 = evmcs->guest_cr4; 1879 vmcs12->guest_dr7 = evmcs->guest_dr7; 1880 } 1881 1882 if (unlikely(!(hv_clean_fields & 1883 HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) { 1884 vmcs12->host_fs_base = evmcs->host_fs_base; 1885 vmcs12->host_gs_base = evmcs->host_gs_base; 1886 vmcs12->host_tr_base = evmcs->host_tr_base; 1887 vmcs12->host_gdtr_base = evmcs->host_gdtr_base; 1888 vmcs12->host_idtr_base = evmcs->host_idtr_base; 1889 vmcs12->host_rsp = evmcs->host_rsp; 1890 } 1891 1892 if (unlikely(!(hv_clean_fields & 1893 HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) { 1894 vmcs12->ept_pointer = evmcs->ept_pointer; 1895 vmcs12->virtual_processor_id = evmcs->virtual_processor_id; 1896 } 1897 1898 if (unlikely(!(hv_clean_fields & 1899 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) { 1900 vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer; 1901 vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl; 1902 vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat; 1903 vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer; 1904 vmcs12->guest_pdptr0 = evmcs->guest_pdptr0; 1905 vmcs12->guest_pdptr1 = evmcs->guest_pdptr1; 1906 vmcs12->guest_pdptr2 = evmcs->guest_pdptr2; 1907 vmcs12->guest_pdptr3 = evmcs->guest_pdptr3; 1908 vmcs12->guest_pending_dbg_exceptions = 1909 evmcs->guest_pending_dbg_exceptions; 1910 vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp; 1911 vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip; 1912 vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs; 1913 vmcs12->guest_activity_state = 
evmcs->guest_activity_state; 1914 vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs; 1915 vmcs12->guest_ia32_perf_global_ctrl = evmcs->guest_ia32_perf_global_ctrl; 1916 /* 1917 * Not present in struct vmcs12: 1918 * vmcs12->guest_ia32_s_cet = evmcs->guest_ia32_s_cet; 1919 * vmcs12->guest_ia32_lbr_ctl = evmcs->guest_ia32_lbr_ctl; 1920 * vmcs12->guest_ia32_int_ssp_table_addr = evmcs->guest_ia32_int_ssp_table_addr; 1921 */ 1922 } 1923 1924 /* 1925 * Not used? 1926 * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr; 1927 * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr; 1928 * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr; 1929 * vmcs12->page_fault_error_code_mask = 1930 * evmcs->page_fault_error_code_mask; 1931 * vmcs12->page_fault_error_code_match = 1932 * evmcs->page_fault_error_code_match; 1933 * vmcs12->cr3_target_count = evmcs->cr3_target_count; 1934 * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count; 1935 * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count; 1936 * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count; 1937 */ 1938 1939 /* 1940 * Read only fields: 1941 * vmcs12->guest_physical_address = evmcs->guest_physical_address; 1942 * vmcs12->vm_instruction_error = evmcs->vm_instruction_error; 1943 * vmcs12->vm_exit_reason = evmcs->vm_exit_reason; 1944 * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info; 1945 * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code; 1946 * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field; 1947 * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code; 1948 * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len; 1949 * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info; 1950 * vmcs12->exit_qualification = evmcs->exit_qualification; 1951 * vmcs12->guest_linear_address = evmcs->guest_linear_address; 1952 * 1953 * Not present in struct vmcs12: 1954 * 
/*
 * Copy fields from the cached vmcs12 (vmx->nested.cached_vmcs12) into the
 * enlightened VMCS returned by nested_vmx_evmcs(), i.e. the reverse direction
 * of copy_enlightened_to_vmcs12().  Only the fields assigned below are
 * propagated; the fields listed in the leading comment are intentionally left
 * untouched.
 *
 * When CONFIG_KVM_HYPERV is not defined there is nothing to copy into, and
 * reaching this function triggers KVM_BUG_ON().
 */
static void copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx)
{
#ifdef CONFIG_KVM_HYPERV
	struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
	struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx);

	/*
	 * Should not be changed by KVM:
	 *
	 * evmcs->host_es_selector = vmcs12->host_es_selector;
	 * evmcs->host_cs_selector = vmcs12->host_cs_selector;
	 * evmcs->host_ss_selector = vmcs12->host_ss_selector;
	 * evmcs->host_ds_selector = vmcs12->host_ds_selector;
	 * evmcs->host_fs_selector = vmcs12->host_fs_selector;
	 * evmcs->host_gs_selector = vmcs12->host_gs_selector;
	 * evmcs->host_tr_selector = vmcs12->host_tr_selector;
	 * evmcs->host_ia32_pat = vmcs12->host_ia32_pat;
	 * evmcs->host_ia32_efer = vmcs12->host_ia32_efer;
	 * evmcs->host_cr0 = vmcs12->host_cr0;
	 * evmcs->host_cr3 = vmcs12->host_cr3;
	 * evmcs->host_cr4 = vmcs12->host_cr4;
	 * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp;
	 * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip;
	 * evmcs->host_rip = vmcs12->host_rip;
	 * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs;
	 * evmcs->host_fs_base = vmcs12->host_fs_base;
	 * evmcs->host_gs_base = vmcs12->host_gs_base;
	 * evmcs->host_tr_base = vmcs12->host_tr_base;
	 * evmcs->host_gdtr_base = vmcs12->host_gdtr_base;
	 * evmcs->host_idtr_base = vmcs12->host_idtr_base;
	 * evmcs->host_rsp = vmcs12->host_rsp;
	 *
	 * sync_vmcs02_to_vmcs12() doesn't read these:
	 *
	 * evmcs->io_bitmap_a = vmcs12->io_bitmap_a;
	 * evmcs->io_bitmap_b = vmcs12->io_bitmap_b;
	 * evmcs->msr_bitmap = vmcs12->msr_bitmap;
	 * evmcs->ept_pointer = vmcs12->ept_pointer;
	 * evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap;
	 * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr;
	 * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr;
	 * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr;
	 * evmcs->tpr_threshold = vmcs12->tpr_threshold;
	 * evmcs->virtual_processor_id = vmcs12->virtual_processor_id;
	 * evmcs->exception_bitmap = vmcs12->exception_bitmap;
	 * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer;
	 * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control;
	 * evmcs->vm_exit_controls = vmcs12->vm_exit_controls;
	 * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control;
	 * evmcs->page_fault_error_code_mask =
	 *		vmcs12->page_fault_error_code_mask;
	 * evmcs->page_fault_error_code_match =
	 *		vmcs12->page_fault_error_code_match;
	 * evmcs->cr3_target_count = vmcs12->cr3_target_count;
	 * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr;
	 * evmcs->tsc_offset = vmcs12->tsc_offset;
	 * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl;
	 * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask;
	 * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask;
	 * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow;
	 * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow;
	 * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count;
	 * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count;
	 * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count;
	 * evmcs->guest_ia32_perf_global_ctrl = vmcs12->guest_ia32_perf_global_ctrl;
	 * evmcs->host_ia32_perf_global_ctrl = vmcs12->host_ia32_perf_global_ctrl;
	 * evmcs->encls_exiting_bitmap = vmcs12->encls_exiting_bitmap;
	 * evmcs->tsc_multiplier = vmcs12->tsc_multiplier;
	 *
	 * Not present in struct vmcs12:
	 * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx;
	 * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi;
	 * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi;
	 * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip;
	 * evmcs->host_ia32_s_cet = vmcs12->host_ia32_s_cet;
	 * evmcs->host_ssp = vmcs12->host_ssp;
	 * evmcs->host_ia32_int_ssp_table_addr = vmcs12->host_ia32_int_ssp_table_addr;
	 * evmcs->guest_ia32_s_cet = vmcs12->guest_ia32_s_cet;
	 * evmcs->guest_ia32_lbr_ctl = vmcs12->guest_ia32_lbr_ctl;
	 * evmcs->guest_ia32_int_ssp_table_addr = vmcs12->guest_ia32_int_ssp_table_addr;
	 * evmcs->guest_ssp = vmcs12->guest_ssp;
	 */

	/* Guest segment selectors. */
	evmcs->guest_es_selector = vmcs12->guest_es_selector;
	evmcs->guest_cs_selector = vmcs12->guest_cs_selector;
	evmcs->guest_ss_selector = vmcs12->guest_ss_selector;
	evmcs->guest_ds_selector = vmcs12->guest_ds_selector;
	evmcs->guest_fs_selector = vmcs12->guest_fs_selector;
	evmcs->guest_gs_selector = vmcs12->guest_gs_selector;
	evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector;
	evmcs->guest_tr_selector = vmcs12->guest_tr_selector;

	/* Guest segment and descriptor-table limits. */
	evmcs->guest_es_limit = vmcs12->guest_es_limit;
	evmcs->guest_cs_limit = vmcs12->guest_cs_limit;
	evmcs->guest_ss_limit = vmcs12->guest_ss_limit;
	evmcs->guest_ds_limit = vmcs12->guest_ds_limit;
	evmcs->guest_fs_limit = vmcs12->guest_fs_limit;
	evmcs->guest_gs_limit = vmcs12->guest_gs_limit;
	evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit;
	evmcs->guest_tr_limit = vmcs12->guest_tr_limit;
	evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit;
	evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit;

	/* Guest segment access-rights bytes. */
	evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes;
	evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes;
	evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes;
	evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes;
	evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes;
	evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes;
	evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes;
	evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes;

	/* Guest segment and descriptor-table bases. */
	evmcs->guest_es_base = vmcs12->guest_es_base;
	evmcs->guest_cs_base = vmcs12->guest_cs_base;
	evmcs->guest_ss_base = vmcs12->guest_ss_base;
	evmcs->guest_ds_base = vmcs12->guest_ds_base;
	evmcs->guest_fs_base = vmcs12->guest_fs_base;
	evmcs->guest_gs_base = vmcs12->guest_gs_base;
	evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base;
	evmcs->guest_tr_base = vmcs12->guest_tr_base;
	evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base;
	evmcs->guest_idtr_base = vmcs12->guest_idtr_base;

	evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat;
	evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer;

	evmcs->guest_pdptr0 = vmcs12->guest_pdptr0;
	evmcs->guest_pdptr1 = vmcs12->guest_pdptr1;
	evmcs->guest_pdptr2 = vmcs12->guest_pdptr2;
	evmcs->guest_pdptr3 = vmcs12->guest_pdptr3;

	evmcs->guest_pending_dbg_exceptions =
		vmcs12->guest_pending_dbg_exceptions;
	evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp;
	evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip;

	evmcs->guest_activity_state = vmcs12->guest_activity_state;
	evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs;

	evmcs->guest_cr0 = vmcs12->guest_cr0;
	evmcs->guest_cr3 = vmcs12->guest_cr3;
	evmcs->guest_cr4 = vmcs12->guest_cr4;
	evmcs->guest_dr7 = vmcs12->guest_dr7;

	evmcs->guest_physical_address = vmcs12->guest_physical_address;

	/* VM-exit information fields. */
	evmcs->vm_instruction_error = vmcs12->vm_instruction_error;
	evmcs->vm_exit_reason = vmcs12->vm_exit_reason;
	evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info;
	evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code;
	evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field;
	evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code;
	evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len;
	evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info;

	evmcs->exit_qualification = vmcs12->exit_qualification;

	evmcs->guest_linear_address = vmcs12->guest_linear_address;
	evmcs->guest_rsp = vmcs12->guest_rsp;
	evmcs->guest_rflags = vmcs12->guest_rflags;

	evmcs->guest_interruptibility_info =
		vmcs12->guest_interruptibility_info;
	evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control;
	evmcs->vm_entry_controls = vmcs12->vm_entry_controls;
	evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field;
	evmcs->vm_entry_exception_error_code =
		vmcs12->vm_entry_exception_error_code;
	evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len;

	evmcs->guest_rip = vmcs12->guest_rip;

	evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs;

	return;
#else /* CONFIG_KVM_HYPERV */
	KVM_BUG_ON(1, vmx->vcpu.kvm);
#endif /* CONFIG_KVM_HYPERV */
}
2147 */ 2148 static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld( 2149 struct kvm_vcpu *vcpu, bool from_launch) 2150 { 2151 #ifdef CONFIG_KVM_HYPERV 2152 struct vcpu_vmx *vmx = to_vmx(vcpu); 2153 bool evmcs_gpa_changed = false; 2154 u64 evmcs_gpa; 2155 2156 if (likely(!guest_cpu_cap_has_evmcs(vcpu))) 2157 return EVMPTRLD_DISABLED; 2158 2159 evmcs_gpa = nested_get_evmptr(vcpu); 2160 if (!evmptr_is_valid(evmcs_gpa)) { 2161 nested_release_evmcs(vcpu); 2162 return EVMPTRLD_DISABLED; 2163 } 2164 2165 if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) { 2166 vmx->nested.current_vmptr = INVALID_GPA; 2167 2168 nested_release_evmcs(vcpu); 2169 2170 if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa), 2171 &vmx->nested.hv_evmcs_map)) 2172 return EVMPTRLD_ERROR; 2173 2174 vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva; 2175 2176 /* 2177 * Currently, KVM only supports eVMCS version 1 2178 * (== KVM_EVMCS_VERSION) and thus we expect guest to set this 2179 * value to first u32 field of eVMCS which should specify eVMCS 2180 * VersionNumber. 2181 * 2182 * Guest should be aware of supported eVMCS versions by host by 2183 * examining CPUID.0x4000000A.EAX[0:15]. Host userspace VMM is 2184 * expected to set this CPUID leaf according to the value 2185 * returned in vmcs_version from nested_enable_evmcs(). 2186 * 2187 * However, it turns out that Microsoft Hyper-V fails to comply 2188 * to their own invented interface: When Hyper-V use eVMCS, it 2189 * just sets first u32 field of eVMCS to revision_id specified 2190 * in MSR_IA32_VMX_BASIC. Instead of used eVMCS version number 2191 * which is one of the supported versions specified in 2192 * CPUID.0x4000000A.EAX[0:15]. 2193 * 2194 * To overcome Hyper-V bug, we accept here either a supported 2195 * eVMCS version or VMCS12 revision_id as valid values for first 2196 * u32 field of eVMCS. 
2197 */ 2198 if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) && 2199 (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) { 2200 nested_release_evmcs(vcpu); 2201 return EVMPTRLD_VMFAIL; 2202 } 2203 2204 vmx->nested.hv_evmcs_vmptr = evmcs_gpa; 2205 2206 evmcs_gpa_changed = true; 2207 /* 2208 * Unlike normal vmcs12, enlightened vmcs12 is not fully 2209 * reloaded from guest's memory (read only fields, fields not 2210 * present in struct hv_enlightened_vmcs, ...). Make sure there 2211 * are no leftovers. 2212 */ 2213 if (from_launch) { 2214 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 2215 memset(vmcs12, 0, sizeof(*vmcs12)); 2216 vmcs12->hdr.revision_id = VMCS12_REVISION; 2217 } 2218 2219 } 2220 2221 /* 2222 * Clean fields data can't be used on VMLAUNCH and when we switch 2223 * between different L2 guests as KVM keeps a single VMCS12 per L1. 2224 */ 2225 if (from_launch || evmcs_gpa_changed) { 2226 vmx->nested.hv_evmcs->hv_clean_fields &= 2227 ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; 2228 2229 vmx->nested.force_msr_bitmap_recalc = true; 2230 } 2231 2232 return EVMPTRLD_SUCCEEDED; 2233 #else 2234 return EVMPTRLD_DISABLED; 2235 #endif 2236 } 2237 2238 void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu) 2239 { 2240 struct vcpu_vmx *vmx = to_vmx(vcpu); 2241 2242 if (nested_vmx_is_evmptr12_valid(vmx)) 2243 copy_vmcs12_to_enlightened(vmx); 2244 else 2245 copy_vmcs12_to_shadow(vmx); 2246 2247 vmx->nested.need_vmcs12_to_shadow_sync = false; 2248 } 2249 2250 static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) 2251 { 2252 struct vcpu_vmx *vmx = 2253 container_of(timer, struct vcpu_vmx, nested.preemption_timer); 2254 2255 vmx->nested.preemption_timer_expired = true; 2256 kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu); 2257 kvm_vcpu_kick(&vmx->vcpu); 2258 2259 return HRTIMER_NORESTART; 2260 } 2261 2262 static u64 vmx_calc_preemption_timer_value(struct kvm_vcpu *vcpu) 2263 { 2264 struct vcpu_vmx *vmx = to_vmx(vcpu); 2265 struct vmcs12 *vmcs12 = 
get_vmcs12(vcpu); 2266 2267 u64 l1_scaled_tsc = kvm_read_l1_tsc(vcpu, rdtsc()) >> 2268 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 2269 2270 if (!vmx->nested.has_preemption_timer_deadline) { 2271 vmx->nested.preemption_timer_deadline = 2272 vmcs12->vmx_preemption_timer_value + l1_scaled_tsc; 2273 vmx->nested.has_preemption_timer_deadline = true; 2274 } 2275 return vmx->nested.preemption_timer_deadline - l1_scaled_tsc; 2276 } 2277 2278 static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu, 2279 u64 preemption_timeout) 2280 { 2281 struct vcpu_vmx *vmx = to_vmx(vcpu); 2282 2283 /* 2284 * A timer value of zero is architecturally guaranteed to cause 2285 * a VMExit prior to executing any instructions in the guest. 2286 */ 2287 if (preemption_timeout == 0) { 2288 vmx_preemption_timer_fn(&vmx->nested.preemption_timer); 2289 return; 2290 } 2291 2292 if (vcpu->arch.virtual_tsc_khz == 0) 2293 return; 2294 2295 preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 2296 preemption_timeout *= 1000000; 2297 do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz); 2298 hrtimer_start(&vmx->nested.preemption_timer, 2299 ktime_add_ns(ktime_get(), preemption_timeout), 2300 HRTIMER_MODE_ABS_PINNED); 2301 } 2302 2303 static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) 2304 { 2305 if (vmx->vcpu.arch.nested_run_pending && 2306 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) 2307 return vmcs12->guest_ia32_efer; 2308 else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) 2309 return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME); 2310 else 2311 return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME); 2312 } 2313 2314 static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) 2315 { 2316 struct kvm *kvm = vmx->vcpu.kvm; 2317 2318 /* 2319 * If vmcs02 hasn't been initialized, set the constant vmcs02 state 2320 * according to L0's settings (vmcs12 is irrelevant here). Host 2321 * fields that come from L0 and are not constant, e.g. 
/*
 * One-time initialization of the vmcs02 fields whose values do not depend on
 * vmcs12.  Guarded by vmx->nested.vmcs02_initialized, so every call after the
 * first returns immediately.
 */
static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
{
	struct kvm *kvm = vmx->vcpu.kvm;

	/*
	 * If vmcs02 hasn't been initialized, set the constant vmcs02 state
	 * according to L0's settings (vmcs12 is irrelevant here).  Host
	 * fields that come from L0 and are not constant, e.g. HOST_CR3,
	 * will be set as needed prior to VMLAUNCH/VMRESUME.
	 */
	if (vmx->nested.vmcs02_initialized)
		return;
	vmx->nested.vmcs02_initialized = true;

	if (vmx->ve_info)
		vmcs_write64(VE_INFORMATION_ADDRESS, __pa(vmx->ve_info));

	/* All VMFUNCs are currently emulated through L0 vmexits.  */
	if (cpu_has_vmx_vmfunc())
		vmcs_write64(VM_FUNCTION_CONTROL, 0);

	if (cpu_has_vmx_posted_intr())
		vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR);

	if (cpu_has_vmx_msr_bitmap())
		vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));

	/*
	 * PML is emulated for L2, but never enabled in hardware as the MMU
	 * handles A/D emulation.  Disabling PML for L2 also avoids having to
	 * deal with filtering out L2 GPAs from the buffer.
	 */
	if (enable_pml) {
		vmcs_write64(PML_ADDRESS, 0);
		vmcs_write16(GUEST_PML_INDEX, -1);
	}

	if (cpu_has_vmx_encls_vmexit())
		vmcs_write64(ENCLS_EXITING_BITMAP, INVALID_GPA);

	if (kvm_notify_vmexit_enabled(kvm))
		vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window);

	/*
	 * Set the MSR load/store lists to match L0's settings.  Only the
	 * addresses are constant (for vmcs02), the counts can change based
	 * on L2's behavior, e.g. switching to/from long mode.
	 */
	vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.val));
	vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
	vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));

	vmx_set_constant_host_state(vmx);
}

/*
 * Rarely-changing part of the early vmcs02 setup: ensure the constant state
 * has been written, invalidate the VMCS link pointer and select the VPID
 * that L2 runs with.
 */
static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx,
				      struct vmcs12 *vmcs12)
{
	prepare_vmcs02_constant_state(vmx);

	vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA);

	/*
	 * If VPID is disabled, then guest TLB accesses use VPID=0, i.e. the
	 * same VPID as the host.  Emulate this behavior by using vpid01 for L2
	 * if VPID is disabled in vmcs12.  Note, if VPID is disabled, VM-Enter
	 * and VM-Exit are architecturally required to flush VPID=0, but *only*
	 * VPID=0.  I.e. using vpid02 would be ok (so long as KVM emulates the
	 * required flushes), but doing so would cause KVM to over-flush.  E.g.
	 * if L1 runs L2 X with VPID12=1, then runs L2 Y with VPID12 disabled,
	 * and then runs L2 X again, then KVM can and should retain TLB entries
	 * for VPID12=1.
	 */
	if (enable_vpid) {
		if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02)
			vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02);
		else
			vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
	}
}
/*
 * Early vmcs02 setup: build the pin-based, primary and secondary execution,
 * VM-entry and VM-exit controls by starting from vmcs01's current values and
 * merging in vmcs12's, then program the event-injection fields.
 *
 * @vmx:    vCPU whose control fields are being written
 * @vmcs01: loaded VMCS providing L0's baseline control values
 * @vmcs12: L1's VMCS for L2
 */
static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs01,
				 struct vmcs12 *vmcs12)
{
	u32 exec_control;
	u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12);

	if (vmx->nested.dirty_vmcs12 || nested_vmx_is_evmptr12_valid(vmx))
		prepare_vmcs02_early_rare(vmx, vmcs12);

	/*
	 * PIN CONTROLS
	 */
	exec_control = __pin_controls_get(vmcs01);
	/* The preemption timer bit is never inherited from vmcs12 here. */
	exec_control |= (vmcs12->pin_based_vm_exec_control &
			 ~PIN_BASED_VMX_PREEMPTION_TIMER);

	/* Posted interrupts setting is only taken from vmcs12.  */
	vmx->nested.pi_pending = false;
	if (nested_cpu_has_posted_intr(vmcs12)) {
		vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
	} else {
		vmx->nested.posted_intr_nv = -1;
		exec_control &= ~PIN_BASED_POSTED_INTR;
	}
	pin_controls_set(vmx, exec_control);

	/*
	 * EXEC CONTROLS
	 */
	exec_control = __exec_controls_get(vmcs01); /* L0's desires */
	exec_control &= ~CPU_BASED_INTR_WINDOW_EXITING;
	exec_control &= ~CPU_BASED_NMI_WINDOW_EXITING;
	exec_control &= ~CPU_BASED_TPR_SHADOW;
	exec_control |= vmcs12->cpu_based_vm_exec_control;

	/*
	 * Note the #ifdef: the "else" arm below exists only on 64-bit builds;
	 * on 32-bit builds the "if" stands alone.
	 */
	if (exec_control & CPU_BASED_TPR_SHADOW)
		vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
#ifdef CONFIG_X86_64
	else
		exec_control |= CPU_BASED_CR8_LOAD_EXITING |
				CPU_BASED_CR8_STORE_EXITING;
#endif

	/*
	 * A vmexit (to either L1 hypervisor or L0 userspace) is always needed
	 * for I/O port accesses.
	 */
	exec_control |= CPU_BASED_UNCOND_IO_EXITING;
	exec_control &= ~CPU_BASED_USE_IO_BITMAPS;

	/*
	 * This bit will be computed in nested_get_vmcs12_pages, because
	 * we do not have access to L1's MSR bitmap yet.  For now, keep
	 * the same bit as before, hoping to avoid multiple VMWRITEs that
	 * only set/clear this bit.
	 */
	exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
	exec_control |= exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS;

	exec_controls_set(vmx, exec_control);

	/*
	 * SECONDARY EXEC CONTROLS
	 */
	if (cpu_has_secondary_exec_ctrls()) {
		exec_control = __secondary_exec_controls_get(vmcs01);

		/* Take the following fields only from vmcs12 */
		exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
				  SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
				  SECONDARY_EXEC_ENABLE_INVPCID |
				  SECONDARY_EXEC_ENABLE_RDTSCP |
				  SECONDARY_EXEC_ENABLE_XSAVES |
				  SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE |
				  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
				  SECONDARY_EXEC_APIC_REGISTER_VIRT |
				  SECONDARY_EXEC_ENABLE_VMFUNC |
				  SECONDARY_EXEC_MODE_BASED_EPT_EXEC |
				  SECONDARY_EXEC_DESC);

		if (nested_cpu_has(vmcs12,
				   CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
			exec_control |= vmcs12->secondary_vm_exec_control;

		/* PML is emulated and never enabled in hardware for L2. */
		exec_control &= ~SECONDARY_EXEC_ENABLE_PML;

		/* VMCS shadowing for L2 is emulated for now */
		exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;

		/*
		 * Preset *DT exiting when emulating UMIP, so that vmx_set_cr4()
		 * will not have to rewrite the controls just for this bit.
		 */
		if (vmx_umip_emulated() && (vmcs12->guest_cr4 & X86_CR4_UMIP))
			exec_control |= SECONDARY_EXEC_DESC;

		if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
			vmcs_write16(GUEST_INTR_STATUS,
				vmcs12->guest_intr_status);

		if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
			exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;

		if (exec_control & SECONDARY_EXEC_ENCLS_EXITING)
			vmx_write_encls_bitmap(&vmx->vcpu, vmcs12);

		secondary_exec_controls_set(vmx, exec_control);
	}

	/*
	 * ENTRY CONTROLS
	 *
	 * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE
	 * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate
	 * on the related bits (if supported by the CPU) in the hope that
	 * we can avoid VMWrites during vmx_set_efer().
	 *
	 * Similarly, take vmcs01's PERF_GLOBAL_CTRL in the hope that if KVM is
	 * loading PERF_GLOBAL_CTRL via the VMCS for L1, then KVM will want to
	 * do the same for L2.
	 */
	exec_control = __vm_entry_controls_get(vmcs01);
	exec_control |= (vmcs12->vm_entry_controls &
			 ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL);
	exec_control &= ~(VM_ENTRY_IA32E_MODE | VM_ENTRY_LOAD_IA32_EFER);
	if (cpu_has_load_ia32_efer()) {
		if (guest_efer & EFER_LMA)
			exec_control |= VM_ENTRY_IA32E_MODE;
		if (guest_efer != kvm_host.efer)
			exec_control |= VM_ENTRY_LOAD_IA32_EFER;
	}
	vm_entry_controls_set(vmx, exec_control);

	/*
	 * EXIT CONTROLS
	 *
	 * L2->L1 exit controls are emulated - the hardware exit is to L0 so
	 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
	 * bits may be modified by vmx_set_efer() in prepare_vmcs02().
	 */
	exec_control = __vm_exit_controls_get(vmcs01);
	if (cpu_has_load_ia32_efer() && guest_efer != kvm_host.efer)
		exec_control |= VM_EXIT_LOAD_IA32_EFER;
	else
		exec_control &= ~VM_EXIT_LOAD_IA32_EFER;
	vm_exit_controls_set(vmx, exec_control);

	/*
	 * Interrupt/Exception Fields
	 *
	 * vmcs12's event-injection and interruptibility fields are only
	 * consumed when a nested run is pending; otherwise any injection is
	 * cleared.
	 */
	if (vmx->vcpu.arch.nested_run_pending) {
		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
			     vmcs12->vm_entry_intr_info_field);
		vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
			     vmcs12->vm_entry_exception_error_code);
		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
			     vmcs12->vm_entry_instruction_len);
		vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
			     vmcs12->guest_interruptibility_info);
		vmx->loaded_vmcs->nmi_known_unmasked =
			!(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI);
	} else {
		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
	}
}
2535 */ 2536 exec_control = __vm_exit_controls_get(vmcs01); 2537 if (cpu_has_load_ia32_efer() && guest_efer != kvm_host.efer) 2538 exec_control |= VM_EXIT_LOAD_IA32_EFER; 2539 else 2540 exec_control &= ~VM_EXIT_LOAD_IA32_EFER; 2541 vm_exit_controls_set(vmx, exec_control); 2542 2543 /* 2544 * Interrupt/Exception Fields 2545 */ 2546 if (vmx->vcpu.arch.nested_run_pending) { 2547 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 2548 vmcs12->vm_entry_intr_info_field); 2549 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 2550 vmcs12->vm_entry_exception_error_code); 2551 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 2552 vmcs12->vm_entry_instruction_len); 2553 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 2554 vmcs12->guest_interruptibility_info); 2555 vmx->loaded_vmcs->nmi_known_unmasked = 2556 !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI); 2557 } else { 2558 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); 2559 } 2560 } 2561 2562 static void vmcs_read_cet_state(struct kvm_vcpu *vcpu, u64 *s_cet, 2563 u64 *ssp, u64 *ssp_tbl) 2564 { 2565 if (guest_cpu_cap_has(vcpu, X86_FEATURE_IBT) || 2566 guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) 2567 *s_cet = vmcs_readl(GUEST_S_CET); 2568 2569 if (guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) { 2570 *ssp = vmcs_readl(GUEST_SSP); 2571 *ssp_tbl = vmcs_readl(GUEST_INTR_SSP_TABLE); 2572 } 2573 } 2574 2575 static void vmcs_write_cet_state(struct kvm_vcpu *vcpu, u64 s_cet, 2576 u64 ssp, u64 ssp_tbl) 2577 { 2578 if (guest_cpu_cap_has(vcpu, X86_FEATURE_IBT) || 2579 guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) 2580 vmcs_writel(GUEST_S_CET, s_cet); 2581 2582 if (guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) { 2583 vmcs_writel(GUEST_SSP, ssp); 2584 vmcs_writel(GUEST_INTR_SSP_TABLE, ssp_tbl); 2585 } 2586 } 2587 2588 static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) 2589 { 2590 struct hv_enlightened_vmcs *hv_evmcs = nested_vmx_evmcs(vmx); 2591 2592 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & 2593 
/*
 * Write the vmcs02 fields that rarely change between nested entries: guest
 * segment state, SYSENTER/debug/BNDCFGS state, the XSS-exit and EOI-exit
 * bitmaps, page-fault error-code mask/match, MSR load/store counts and
 * (when vmcs12 requests it) CET state.
 *
 * When an enlightened VMCS is in use, the two guest-state groups are skipped
 * if the corresponding hv_clean_fields bit says L1 has not touched them.
 */
static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
{
	struct hv_enlightened_vmcs *hv_evmcs = nested_vmx_evmcs(vmx);

	if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
			   HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) {

		vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
		vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
		vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
		vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector);
		vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector);
		vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector);
		vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector);
		vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector);
		vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit);
		vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit);
		vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit);
		vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit);
		vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit);
		vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit);
		vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit);
		vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit);
		vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit);
		vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit);
		vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
		vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
		vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes);
		vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes);
		vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes);
		vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes);
		vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes);
		vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes);
		vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
		vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
		vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base);
		vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base);
		vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base);
		vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base);
		vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base);
		vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base);
		vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
		vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);

		/* Segment fields were just rewritten; drop any cached copies. */
		vmx_segment_cache_clear(vmx);
	}

	if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
			   HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) {
		vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
		vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
			    vmcs12->guest_pending_dbg_exceptions);
		vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
		vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);

		if (kvm_mpx_supported() && vmx->vcpu.arch.nested_run_pending &&
		    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
			vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
	}

	if (nested_cpu_has_xsaves(vmcs12))
		vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap);

	/*
	 * Whether page-faults are trapped is determined by a combination of
	 * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.  If L0
	 * doesn't care about page faults then we should set all of these to
	 * L1's desires. However, if L0 does care about (some) page faults, it
	 * is not easy (if at all possible?) to merge L0 and L1's desires, we
	 * simply ask to exit on each and every L2 page fault. This is done by
	 * setting MASK=MATCH=0 and (see below) EB.PF=1.
	 * Note that below we don't need special code to set EB.PF beyond the
	 * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
	 * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
	 * !enable_ept, EB.PF is 1, so the "or" will always be 1.
	 */
	if (vmx_need_pf_intercept(&vmx->vcpu)) {
		/*
		 * TODO: if both L0 and L1 need the same MASK and MATCH,
		 * go ahead and use it?
		 */
		vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
		vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
	} else {
		vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, vmcs12->page_fault_error_code_mask);
		vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, vmcs12->page_fault_error_code_match);
	}

	if (cpu_has_vmx_apicv()) {
		vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0);
		vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1);
		vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2);
		vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3);
	}

	/*
	 * If vmcs12 is configured to save TSC on exit via the auto-store list,
	 * append the MSR to vmcs02's auto-store list so that KVM effectively
	 * reads TSC at the time of VM-Exit from L2.  The saved value will be
	 * propagated to vmcs12's list on nested VM-Exit.
	 *
	 * Don't increment the number of MSRs in the vCPU structure, as saving
	 * TSC is specific to this particular incarnation of vmcs02, i.e. must
	 * not bleed into vmcs01.
	 */
	if (nested_msr_store_list_has_msr(&vmx->vcpu, MSR_IA32_TSC) &&
	    !WARN_ON_ONCE(vmx->msr_autostore.nr >= ARRAY_SIZE(vmx->msr_autostore.val))) {
		vmx->nested.tsc_autostore_slot = vmx->msr_autostore.nr;
		vmx->msr_autostore.val[vmx->msr_autostore.nr].index = MSR_IA32_TSC;

		vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.nr + 1);
	} else {
		/* No TSC slot in use (not requested, or the list is full). */
		vmx->nested.tsc_autostore_slot = -1;
		vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.nr);
	}
	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);

	if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE)
		vmcs_write_cet_state(&vmx->vcpu, vmcs12->guest_s_cet,
				     vmcs12->guest_ssp, vmcs12->guest_ssp_tbl);

	set_cr4_guest_host_mask(vmx);
}
/*
 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
 * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2
 * guest in a way that will both be appropriate to L1's requests, and our
 * needs. In addition to modifying the active vmcs (which is vmcs02), this
 * function also has additional necessary side-effects, like setting various
 * vcpu->arch fields.
 *
 * Returns 0 on success, -EINVAL on failure.  On failure the invalid-state
 * exit qualification code is stored in *entry_failure_code, either directly
 * by this function or by nested_vmx_load_cr3(), which is handed the same
 * pointer.
 */
static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
			  bool from_vmentry,
			  enum vm_entry_failure_code *entry_failure_code)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx);
	bool load_guest_pdptrs_vmcs12 = false;

	if (vmx->nested.dirty_vmcs12 || nested_vmx_is_evmptr12_valid(vmx)) {
		prepare_vmcs02_rare(vmx, vmcs12);
		vmx->nested.dirty_vmcs12 = false;

		/*
		 * With an eVMCS, the PDPTRs only need reloading if GUEST_GRP1
		 * is not marked clean; without one, always reload them.
		 */
		load_guest_pdptrs_vmcs12 = !nested_vmx_is_evmptr12_valid(vmx) ||
			!(evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
	}

	/*
	 * DR7/DEBUGCTL, CET state, BNDCFGS and PAT below follow one pattern:
	 * take vmcs12's value if a nested run is pending and the matching
	 * VM-entry "load" control is set, otherwise keep the pre-VM-Enter
	 * (or current vCPU) value.
	 */
	if (vcpu->arch.nested_run_pending &&
	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
		kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
		vmx_guest_debugctl_write(vcpu, vmcs12->guest_ia32_debugctl &
					       vmx_get_supported_debugctl(vcpu, false));
	} else {
		kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
		vmx_guest_debugctl_write(vcpu, vmx->nested.pre_vmenter_debugctl);
	}

	if (!vcpu->arch.nested_run_pending ||
	    !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE))
		vmcs_write_cet_state(vcpu, vmx->nested.pre_vmenter_s_cet,
				     vmx->nested.pre_vmenter_ssp,
				     vmx->nested.pre_vmenter_ssp_tbl);

	if (kvm_mpx_supported() && (!vcpu->arch.nested_run_pending ||
	    !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
		vmcs_write64(GUEST_BNDCFGS, vmx->nested.pre_vmenter_bndcfgs);
	vmx_set_rflags(vcpu, vmcs12->guest_rflags);

	/*
	 * EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
	 * bitwise-or of what L1 wants to trap for L2, and what we want to
	 * trap. Note that CR0.TS also needs updating - we do this later.
	 */
	vmx_update_exception_bitmap(vcpu);
	vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
	vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);

	if (vcpu->arch.nested_run_pending &&
	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) {
		vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
		vcpu->arch.pat = vmcs12->guest_ia32_pat;
	} else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
		vmcs_write64(GUEST_IA32_PAT, vcpu->arch.pat);
	}

	/* Combine L1's TSC offset/multiplier with the ones L1 set for L2. */
	vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(
			vcpu->arch.l1_tsc_offset,
			vmx_get_l2_tsc_offset(vcpu),
			vmx_get_l2_tsc_multiplier(vcpu));

	vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier(
			vcpu->arch.l1_tsc_scaling_ratio,
			vmx_get_l2_tsc_multiplier(vcpu));

	vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
	if (kvm_caps.has_tsc_control)
		vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio);

	nested_vmx_transition_tlb_flush(vcpu, vmcs12, true);

	if (nested_cpu_has_ept(vmcs12))
		nested_ept_init_mmu_context(vcpu);

	/*
	 * Override the CR0/CR4 read shadows after setting the effective guest
	 * CR0/CR4.  The common helpers also set the shadows, but they don't
	 * account for vmcs12's cr0/4_guest_host_mask.
	 */
	vmx_set_cr0(vcpu, vmcs12->guest_cr0);
	vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));

	vmx_set_cr4(vcpu, vmcs12->guest_cr4);
	vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));

	vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12);
	/* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
	vmx_set_efer(vcpu, vcpu->arch.efer);

	/*
	 * Guest state is invalid and unrestricted guest is disabled,
	 * which means L1 attempted VMEntry to L2 with invalid state.
	 * Fail the VMEntry.
	 *
	 * However, when force loading the guest state (SMM exit or
	 * loading nested state after migration), it is possible to
	 * have invalid guest state now, which will be later fixed by
	 * restoring L2 register state.
	 */
	if (CC(from_vmentry && !vmx_guest_state_valid(vcpu))) {
		*entry_failure_code = ENTRY_FAIL_DEFAULT;
		return -EINVAL;
	}

	/* Shadow page tables on either EPT or shadow page tables. */
	if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12),
				from_vmentry, entry_failure_code))
		return -EINVAL;

	/*
	 * Immediately write vmcs02.GUEST_CR3.  It will be propagated to vmcs12
	 * on nested VM-Exit, which can occur without actually running L2 and
	 * thus without hitting vmx_load_mmu_pgd(), e.g. if L1 is entering L2 with
	 * vmcs12.GUEST_ACTIVITYSTATE=HLT, in which case KVM will intercept the
	 * transition to HLT instead of running L2.
	 */
	if (enable_ept)
		vmcs_writel(GUEST_CR3, vmcs12->guest_cr3);

	/* Late preparation of GUEST_PDPTRs now that EFER and CRs are set. */
	if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) &&
	    is_pae_paging(vcpu)) {
		vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
		vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
		vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
		vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
	}

	/* A failed PERF_GLOBAL_CTRL write is unexpected, hence the WARN. */
	if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
	    kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)) &&
	    WARN_ON_ONCE(__kvm_emulate_msr_write(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
						 vmcs12->guest_ia32_perf_global_ctrl))) {
		*entry_failure_code = ENTRY_FAIL_DEFAULT;
		return -EINVAL;
	}

	kvm_rsp_write(vcpu, vmcs12->guest_rsp);
	kvm_rip_write(vcpu, vmcs12->guest_rip);

	/*
	 * It was observed that genuine Hyper-V running in L1 doesn't reset
	 * 'hv_clean_fields' by itself, it only sets the corresponding dirty
	 * bits when it changes a field in eVMCS. Mark all fields as clean
	 * here.
	 */
	if (nested_vmx_is_evmptr12_valid(vmx))
		evmcs->hv_clean_fields |= HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;

	return 0;
}
*/ 2841 if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) && 2842 is_pae_paging(vcpu)) { 2843 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); 2844 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); 2845 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); 2846 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); 2847 } 2848 2849 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && 2850 kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)) && 2851 WARN_ON_ONCE(__kvm_emulate_msr_write(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, 2852 vmcs12->guest_ia32_perf_global_ctrl))) { 2853 *entry_failure_code = ENTRY_FAIL_DEFAULT; 2854 return -EINVAL; 2855 } 2856 2857 kvm_rsp_write(vcpu, vmcs12->guest_rsp); 2858 kvm_rip_write(vcpu, vmcs12->guest_rip); 2859 2860 /* 2861 * It was observed that genuine Hyper-V running in L1 doesn't reset 2862 * 'hv_clean_fields' by itself, it only sets the corresponding dirty 2863 * bits when it changes a field in eVMCS. Mark all fields as clean 2864 * here. 2865 */ 2866 if (nested_vmx_is_evmptr12_valid(vmx)) 2867 evmcs->hv_clean_fields |= HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; 2868 2869 return 0; 2870 } 2871 2872 static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12) 2873 { 2874 if (CC(!nested_cpu_has_nmi_exiting(vmcs12) && 2875 nested_cpu_has_virtual_nmis(vmcs12))) 2876 return -EINVAL; 2877 2878 if (CC(!nested_cpu_has_virtual_nmis(vmcs12) && 2879 nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING))) 2880 return -EINVAL; 2881 2882 return 0; 2883 } 2884 2885 static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp) 2886 { 2887 struct vcpu_vmx *vmx = to_vmx(vcpu); 2888 2889 /* Check for memory type validity */ 2890 switch (new_eptp & VMX_EPTP_MT_MASK) { 2891 case VMX_EPTP_MT_UC: 2892 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT))) 2893 return false; 2894 break; 2895 case VMX_EPTP_MT_WB: 2896 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT))) 2897 return false; 2898 break; 2899 default: 2900 return false; 2901 
} 2902 2903 /* Page-walk levels validity. */ 2904 switch (new_eptp & VMX_EPTP_PWL_MASK) { 2905 case VMX_EPTP_PWL_5: 2906 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_5_BIT))) 2907 return false; 2908 break; 2909 case VMX_EPTP_PWL_4: 2910 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_4_BIT))) 2911 return false; 2912 break; 2913 default: 2914 return false; 2915 } 2916 2917 /* Reserved bits should not be set */ 2918 if (CC(!kvm_vcpu_is_legal_gpa(vcpu, new_eptp) || ((new_eptp >> 7) & 0x1f))) 2919 return false; 2920 2921 /* AD, if set, should be supported */ 2922 if (new_eptp & VMX_EPTP_AD_ENABLE_BIT) { 2923 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT))) 2924 return false; 2925 } 2926 2927 return true; 2928 } 2929 2930 /* 2931 * Checks related to VM-Execution Control Fields 2932 */ 2933 static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu, 2934 struct vmcs12 *vmcs12) 2935 { 2936 struct vcpu_vmx *vmx = to_vmx(vcpu); 2937 2938 if (CC(!vmx_control_verify(vmcs12->pin_based_vm_exec_control, 2939 vmx->nested.msrs.pinbased_ctls_low, 2940 vmx->nested.msrs.pinbased_ctls_high)) || 2941 CC(!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, 2942 vmx->nested.msrs.procbased_ctls_low, 2943 vmx->nested.msrs.procbased_ctls_high))) 2944 return -EINVAL; 2945 2946 if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && 2947 CC(!vmx_control_verify(vmcs12->secondary_vm_exec_control, 2948 vmx->nested.msrs.secondary_ctls_low, 2949 vmx->nested.msrs.secondary_ctls_high))) 2950 return -EINVAL; 2951 2952 if (CC(vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) || 2953 nested_vmx_check_io_bitmap_controls(vcpu, vmcs12) || 2954 nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12) || 2955 nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12) || 2956 nested_vmx_check_apic_access_controls(vcpu, vmcs12) || 2957 nested_vmx_check_apicv_controls(vcpu, vmcs12) || 2958 nested_vmx_check_nmi_controls(vmcs12) || 2959 
nested_vmx_check_pml_controls(vcpu, vmcs12) || 2960 nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12) || 2961 nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12) || 2962 nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12) || 2963 CC(nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id)) 2964 return -EINVAL; 2965 2966 if (!nested_cpu_has_preemption_timer(vmcs12) && 2967 nested_cpu_has_save_preemption_timer(vmcs12)) 2968 return -EINVAL; 2969 2970 if (nested_cpu_has_ept(vmcs12) && 2971 CC(!nested_vmx_check_eptp(vcpu, vmcs12->ept_pointer))) 2972 return -EINVAL; 2973 2974 if (nested_cpu_has_vmfunc(vmcs12)) { 2975 if (CC(vmcs12->vm_function_control & 2976 ~vmx->nested.msrs.vmfunc_controls)) 2977 return -EINVAL; 2978 2979 if (nested_cpu_has_eptp_switching(vmcs12)) { 2980 if (CC(!nested_cpu_has_ept(vmcs12)) || 2981 CC(!page_address_valid(vcpu, vmcs12->eptp_list_address))) 2982 return -EINVAL; 2983 } 2984 } 2985 2986 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING) && 2987 CC(!vmcs12->tsc_multiplier)) 2988 return -EINVAL; 2989 2990 return 0; 2991 } 2992 2993 /* 2994 * Checks related to VM-Exit Control Fields 2995 */ 2996 static int nested_check_vm_exit_controls(struct kvm_vcpu *vcpu, 2997 struct vmcs12 *vmcs12) 2998 { 2999 struct vcpu_vmx *vmx = to_vmx(vcpu); 3000 3001 if (CC(!vmx_control_verify(vmcs12->vm_exit_controls, 3002 vmx->nested.msrs.exit_ctls_low, 3003 vmx->nested.msrs.exit_ctls_high)) || 3004 CC(nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12))) 3005 return -EINVAL; 3006 3007 return 0; 3008 } 3009 3010 /* 3011 * Checks related to VM-Entry Control Fields 3012 */ 3013 static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu, 3014 struct vmcs12 *vmcs12) 3015 { 3016 struct vcpu_vmx *vmx = to_vmx(vcpu); 3017 3018 if (CC(!vmx_control_verify(vmcs12->vm_entry_controls, 3019 vmx->nested.msrs.entry_ctls_low, 3020 vmx->nested.msrs.entry_ctls_high))) 3021 return -EINVAL; 3022 3023 /* 3024 * From the Intel SDM, volume 3: 3025 * 
Fields relevant to VM-entry event injection must be set properly. 3026 * These fields are the VM-entry interruption-information field, the 3027 * VM-entry exception error code, and the VM-entry instruction length. 3028 */ 3029 if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) { 3030 u32 intr_info = vmcs12->vm_entry_intr_info_field; 3031 u8 vector = intr_info & INTR_INFO_VECTOR_MASK; 3032 u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK; 3033 bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK; 3034 bool urg = nested_cpu_has2(vmcs12, 3035 SECONDARY_EXEC_UNRESTRICTED_GUEST); 3036 bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE; 3037 3038 /* VM-entry interruption-info field: interruption type */ 3039 if (CC(intr_type == INTR_TYPE_RESERVED) || 3040 CC(intr_type == INTR_TYPE_OTHER_EVENT && 3041 !nested_cpu_supports_monitor_trap_flag(vcpu))) 3042 return -EINVAL; 3043 3044 /* VM-entry interruption-info field: vector */ 3045 if (CC(intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) || 3046 CC(intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) || 3047 CC(intr_type == INTR_TYPE_OTHER_EVENT && vector != 0)) 3048 return -EINVAL; 3049 3050 /* 3051 * Cannot deliver error code in real mode or if the interrupt 3052 * type is not hardware exception. For other cases, do the 3053 * consistency check only if the vCPU doesn't enumerate 3054 * VMX_BASIC_NO_HW_ERROR_CODE_CC. 
3055 */ 3056 if (!prot_mode || intr_type != INTR_TYPE_HARD_EXCEPTION) { 3057 if (CC(has_error_code)) 3058 return -EINVAL; 3059 } else if (!nested_cpu_has_no_hw_errcode_cc(vcpu)) { 3060 if (CC(has_error_code != x86_exception_has_error_code(vector))) 3061 return -EINVAL; 3062 } 3063 3064 /* VM-entry exception error code */ 3065 if (CC(has_error_code && 3066 vmcs12->vm_entry_exception_error_code & GENMASK(31, 16))) 3067 return -EINVAL; 3068 3069 /* VM-entry interruption-info field: reserved bits */ 3070 if (CC(intr_info & INTR_INFO_RESVD_BITS_MASK)) 3071 return -EINVAL; 3072 3073 /* VM-entry instruction length */ 3074 switch (intr_type) { 3075 case INTR_TYPE_SOFT_EXCEPTION: 3076 case INTR_TYPE_SOFT_INTR: 3077 case INTR_TYPE_PRIV_SW_EXCEPTION: 3078 if (CC(vmcs12->vm_entry_instruction_len > X86_MAX_INSTRUCTION_LENGTH) || 3079 CC(vmcs12->vm_entry_instruction_len == 0 && 3080 CC(!nested_cpu_has_zero_length_injection(vcpu)))) 3081 return -EINVAL; 3082 } 3083 } 3084 3085 if (nested_vmx_check_entry_msr_switch_controls(vcpu, vmcs12)) 3086 return -EINVAL; 3087 3088 return 0; 3089 } 3090 3091 static int nested_vmx_check_controls(struct kvm_vcpu *vcpu, 3092 struct vmcs12 *vmcs12) 3093 { 3094 if (nested_check_vm_execution_controls(vcpu, vmcs12) || 3095 nested_check_vm_exit_controls(vcpu, vmcs12) || 3096 nested_check_vm_entry_controls(vcpu, vmcs12)) 3097 return -EINVAL; 3098 3099 #ifdef CONFIG_KVM_HYPERV 3100 if (guest_cpu_cap_has_evmcs(vcpu)) 3101 return nested_evmcs_check_controls(vmcs12); 3102 #endif 3103 3104 return 0; 3105 } 3106 3107 static int nested_vmx_check_controls_late(struct kvm_vcpu *vcpu, 3108 struct vmcs12 *vmcs12) 3109 { 3110 void *vapic = to_vmx(vcpu)->nested.virtual_apic_map.hva; 3111 u32 vtpr = vapic ? (*(u32 *)(vapic + APIC_TASKPRI)) >> 4 : 0; 3112 3113 /* 3114 * Don't bother with the consistency checks if KVM isn't configured to 3115 * WARN on missed consistency checks, as KVM needs to rely on hardware 3116 * to fully detect an illegal vTPR vs. 
TRP Threshold combination due to 3117 * the vTPR being writable by L1 at all times (it's an in-memory value, 3118 * not a VMCS field). I.e. even if the check passes now, it might fail 3119 * at the actual VM-Enter. 3120 * 3121 * Keying off the module param also allows treating an invalid vAPIC 3122 * mapping as a consistency check failure without increasing the risk 3123 * of breaking a "real" VM. 3124 */ 3125 if (!warn_on_missed_cc) 3126 return 0; 3127 3128 if ((exec_controls_get(to_vmx(vcpu)) & CPU_BASED_TPR_SHADOW) && 3129 nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW) && 3130 !nested_cpu_has_vid(vmcs12) && 3131 !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) && 3132 (CC(!vapic) || 3133 CC((vmcs12->tpr_threshold & GENMASK(3, 0)) > (vtpr & GENMASK(3, 0))))) 3134 return -EINVAL; 3135 3136 return 0; 3137 } 3138 3139 static int nested_vmx_check_address_space_size(struct kvm_vcpu *vcpu, 3140 struct vmcs12 *vmcs12) 3141 { 3142 #ifdef CONFIG_X86_64 3143 if (CC(!!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) != 3144 !!(vcpu->arch.efer & EFER_LMA))) 3145 return -EINVAL; 3146 #endif 3147 return 0; 3148 } 3149 3150 static bool is_l1_noncanonical_address_on_vmexit(u64 la, struct vmcs12 *vmcs12) 3151 { 3152 /* 3153 * Check that the given linear address is canonical after a VM exit 3154 * from L2, based on HOST_CR4.LA57 value that will be loaded for L1. 3155 */ 3156 u8 l1_address_bits_on_exit = (vmcs12->host_cr4 & X86_CR4_LA57) ? 
57 : 48; 3157 3158 return !__is_canonical_address(la, l1_address_bits_on_exit); 3159 } 3160 3161 static int nested_vmx_check_cet_state_common(struct kvm_vcpu *vcpu, u64 s_cet, 3162 u64 ssp, u64 ssp_tbl) 3163 { 3164 if (CC(!kvm_is_valid_u_s_cet(vcpu, s_cet)) || CC(!IS_ALIGNED(ssp, 4)) || 3165 CC(is_noncanonical_msr_address(ssp_tbl, vcpu))) 3166 return -EINVAL; 3167 3168 return 0; 3169 } 3170 3171 static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, 3172 struct vmcs12 *vmcs12) 3173 { 3174 bool ia32e = !!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE); 3175 3176 if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) || 3177 CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) || 3178 CC(!kvm_vcpu_is_legal_cr3(vcpu, vmcs12->host_cr3))) 3179 return -EINVAL; 3180 3181 if (CC(vmcs12->host_cr4 & X86_CR4_CET && !(vmcs12->host_cr0 & X86_CR0_WP))) 3182 return -EINVAL; 3183 3184 if (CC(is_noncanonical_msr_address(vmcs12->host_ia32_sysenter_esp, vcpu)) || 3185 CC(is_noncanonical_msr_address(vmcs12->host_ia32_sysenter_eip, vcpu))) 3186 return -EINVAL; 3187 3188 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) && 3189 CC(!kvm_pat_valid(vmcs12->host_ia32_pat))) 3190 return -EINVAL; 3191 3192 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) && 3193 CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu), 3194 vmcs12->host_ia32_perf_global_ctrl))) 3195 return -EINVAL; 3196 3197 if (ia32e) { 3198 if (CC(!(vmcs12->host_cr4 & X86_CR4_PAE))) 3199 return -EINVAL; 3200 } else { 3201 if (CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) || 3202 CC(vmcs12->host_cr4 & X86_CR4_PCIDE) || 3203 CC((vmcs12->host_rip) >> 32)) 3204 return -EINVAL; 3205 } 3206 3207 if (CC(vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3208 CC(vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3209 CC(vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3210 CC(vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 
3211 CC(vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3212 CC(vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3213 CC(vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 3214 CC(vmcs12->host_cs_selector == 0) || 3215 CC(vmcs12->host_tr_selector == 0) || 3216 CC(vmcs12->host_ss_selector == 0 && !ia32e)) 3217 return -EINVAL; 3218 3219 if (CC(is_noncanonical_base_address(vmcs12->host_fs_base, vcpu)) || 3220 CC(is_noncanonical_base_address(vmcs12->host_gs_base, vcpu)) || 3221 CC(is_noncanonical_base_address(vmcs12->host_gdtr_base, vcpu)) || 3222 CC(is_noncanonical_base_address(vmcs12->host_idtr_base, vcpu)) || 3223 CC(is_noncanonical_base_address(vmcs12->host_tr_base, vcpu)) || 3224 CC(is_l1_noncanonical_address_on_vmexit(vmcs12->host_rip, vmcs12))) 3225 return -EINVAL; 3226 3227 /* 3228 * If the load IA32_EFER VM-exit control is 1, bits reserved in the 3229 * IA32_EFER MSR must be 0 in the field for that register. In addition, 3230 * the values of the LMA and LME bits in the field must each be that of 3231 * the host address-space size VM-exit control. 3232 */ 3233 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) { 3234 if (CC(!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer)) || 3235 CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA)) || 3236 CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))) 3237 return -EINVAL; 3238 } 3239 3240 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_CET_STATE) { 3241 if (nested_vmx_check_cet_state_common(vcpu, vmcs12->host_s_cet, 3242 vmcs12->host_ssp, 3243 vmcs12->host_ssp_tbl)) 3244 return -EINVAL; 3245 3246 /* 3247 * IA32_S_CET and SSP must be canonical if the host will 3248 * enter 64-bit mode after VM-exit; otherwise, higher 3249 * 32-bits must be all 0s. 
/*
 * Consistency checks on the guest-state area of vmcs12.
 *
 * On failure, *entry_failure_code is ENTRY_FAIL_VMCS_LINK_PTR if the VMCS
 * link pointer check failed and ENTRY_FAIL_DEFAULT for every other check.
 *
 * Returns 0 if all checks pass, -EINVAL on the first failure.
 */
static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
					struct vmcs12 *vmcs12,
					enum vm_entry_failure_code *entry_failure_code)
{
	bool ia32e = !!(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE);

	*entry_failure_code = ENTRY_FAIL_DEFAULT;

	if (CC(!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0)) ||
	    CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)))
		return -EINVAL;

	/* CR4.CET may only be set together with CR0.WP. */
	if (CC(vmcs12->guest_cr4 & X86_CR4_CET && !(vmcs12->guest_cr0 & X86_CR0_WP)))
		return -EINVAL;

	if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) {
		u64 debugctl = vmcs12->guest_ia32_debugctl;

		/*
		 * FREEZE_IN_SMM is not virtualized, but allow L1 to set it in
		 * vmcs12's DEBUGCTL under a quirk for backwards compatibility.
		 * Note that the quirk only relaxes the consistency check.  The
		 * vmcs02 bit is still under the control of the host.  In
		 * particular, if a host administrator decides to clear the bit,
		 * then L1 has no say in the matter.
		 */
		if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_VMCS12_ALLOW_FREEZE_IN_SMM))
			debugctl &= ~DEBUGCTLMSR_FREEZE_IN_SMM;

		if (CC(!kvm_dr7_valid(vmcs12->guest_dr7)) ||
		    CC(!vmx_is_valid_debugctl(vcpu, debugctl, false)))
			return -EINVAL;
	}

	if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) &&
	    CC(!kvm_pat_valid(vmcs12->guest_ia32_pat)))
		return -EINVAL;

	/* The link pointer check is the only one with its own failure code. */
	if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) {
		*entry_failure_code = ENTRY_FAIL_VMCS_LINK_PTR;
		return -EINVAL;
	}

	if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
	    CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu),
					   vmcs12->guest_ia32_perf_global_ctrl)))
		return -EINVAL;

	/* CR0.PG without CR0.PE is rejected. */
	if (CC((vmcs12->guest_cr0 & (X86_CR0_PG | X86_CR0_PE)) == X86_CR0_PG))
		return -EINVAL;

	/* IA-32e mode requires both CR4.PAE and CR0.PG. */
	if (CC(ia32e && !(vmcs12->guest_cr4 & X86_CR4_PAE)) ||
	    CC(ia32e && !(vmcs12->guest_cr0 & X86_CR0_PG)))
		return -EINVAL;

	/*
	 * If the load IA32_EFER VM-entry control is 1, the following checks
	 * are performed on the field for the IA32_EFER MSR:
	 * - Bits reserved in the IA32_EFER MSR must be 0.
	 * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of
	 *   the IA-32e mode guest VM-entry control.  It must also be identical
	 *   to bit 8 (LME) if bit 31 in the CR0 field (corresponding to
	 *   CR0.PG) is 1.
	 */
	if (vcpu->arch.nested_run_pending &&
	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) {
		if (CC(!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer)) ||
		    CC(ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA)) ||
		    CC(((vmcs12->guest_cr0 & X86_CR0_PG) &&
		     ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))))
			return -EINVAL;
	}

	/* BNDCFGS: page-aligned base must be canonical, reserved bits clear. */
	if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) &&
	    (CC(is_noncanonical_msr_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) ||
	     CC((vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD))))
		return -EINVAL;

	if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE) {
		if (nested_vmx_check_cet_state_common(vcpu, vmcs12->guest_s_cet,
						      vmcs12->guest_ssp,
						      vmcs12->guest_ssp_tbl))
			return -EINVAL;

		/*
		 * Guest SSP must have 63:N bits identical, rather than
		 * be canonical (i.e., 63:N-1 bits identical), where N is
		 * the CPU's maximum linear-address width. Similar to
		 * is_noncanonical_msr_address(), use the host's
		 * linear-address width.
		 */
		if (CC(!__is_canonical_address(vmcs12->guest_ssp, max_host_virt_addr_bits() + 1)))
			return -EINVAL;
	}

	if (nested_check_guest_non_reg_state(vmcs12))
		return -EINVAL;

	return 0;
}
/*
 * Map the guest pages that vmcs12 refers to (APIC-access page, virtual-APIC
 * page, posted-interrupt descriptor) and program their host physical
 * addresses into the current VMCS, then (re)build the MSR bitmap.
 *
 * Returns false if reloading the PDPTRs fails or the APIC-access page
 * cannot be mapped (vcpu->run is then filled with an internal error),
 * true otherwise.
 */
static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct kvm_host_map *map;

	if (!vcpu->arch.pdptrs_from_userspace &&
	    !nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) {
		/*
		 * Reload the guest's PDPTRs since after a migration
		 * the guest CR3 might be restored prior to setting the nested
		 * state which can lead to a load of wrong PDPTRs.
		 */
		if (CC(!load_pdptrs(vcpu, vcpu->arch.cr3)))
			return false;
	}


	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
		map = &vmx->nested.apic_access_page_map;

		/* kvm_vcpu_map() returning 0 is treated as success. */
		if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->apic_access_addr), map)) {
			vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(map->pfn));
		} else {
			pr_debug_ratelimited("%s: no backing for APIC-access address in vmcs12\n",
					     __func__);
			vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
			vcpu->run->internal.suberror =
				KVM_INTERNAL_ERROR_EMULATION;
			vcpu->run->internal.ndata = 0;
			return false;
		}
	}

	if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
		map = &vmx->nested.virtual_apic_map;

		if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) {
			vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn));
		} else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) &&
			   nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) &&
			   !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
			/*
			 * The processor will never use the TPR shadow, simply
			 * clear the bit from the execution control.  Such a
			 * configuration is useless, but it happens in tests.
			 * For any other configuration, failing the vm entry is
			 * _not_ what the processor does but it's basically the
			 * only possibility we have.
			 */
			exec_controls_clearbit(vmx, CPU_BASED_TPR_SHADOW);
		} else {
			/*
			 * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to
			 * force VM-Entry to fail.
			 */
			vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, INVALID_GPA);
		}
	}

	if (nested_cpu_has_posted_intr(vmcs12)) {
		map = &vmx->nested.pi_desc_map;

		if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) {
			/*
			 * The descriptor may sit at any offset within its page,
			 * so add the in-page offset to both the host virtual
			 * and the host physical address.
			 */
			vmx->nested.pi_desc =
				(struct pi_desc *)(((void *)map->hva) +
				offset_in_page(vmcs12->posted_intr_desc_addr));
			vmcs_write64(POSTED_INTR_DESC_ADDR,
				     pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr));
		} else {
			/*
			 * Defer the KVM_INTERNAL_EXIT until KVM tries to
			 * access the contents of the VMCS12 posted interrupt
			 * descriptor. (Note that KVM may do this when it
			 * should not, per the architectural specification.)
			 */
			vmx->nested.pi_desc = NULL;
			pin_controls_clearbit(vmx, PIN_BASED_POSTED_INTR);
		}
	}
	/* Use MSR bitmaps only if nested_vmx_prepare_msr_bitmap() succeeds. */
	if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12))
		exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
	else
		exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS);

	return true;
}
3534 */ 3535 if (!nested_get_evmcs_page(vcpu)) { 3536 pr_debug_ratelimited("%s: enlightened vmptrld failed\n", 3537 __func__); 3538 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3539 vcpu->run->internal.suberror = 3540 KVM_INTERNAL_ERROR_EMULATION; 3541 vcpu->run->internal.ndata = 0; 3542 3543 return false; 3544 } 3545 #endif 3546 3547 if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu)) 3548 return false; 3549 3550 return true; 3551 } 3552 3553 static int nested_vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa) 3554 { 3555 struct vmcs12 *vmcs12; 3556 struct vcpu_vmx *vmx = to_vmx(vcpu); 3557 gpa_t dst; 3558 3559 if (WARN_ON_ONCE(!is_guest_mode(vcpu))) 3560 return 0; 3561 3562 if (WARN_ON_ONCE(vmx->nested.pml_full)) 3563 return 1; 3564 3565 /* 3566 * Check if PML is enabled for the nested guest. Whether eptp bit 6 is 3567 * set is already checked as part of A/D emulation. 3568 */ 3569 vmcs12 = get_vmcs12(vcpu); 3570 if (!nested_cpu_has_pml(vmcs12)) 3571 return 0; 3572 3573 if (vmcs12->guest_pml_index >= PML_LOG_NR_ENTRIES) { 3574 vmx->nested.pml_full = true; 3575 return 1; 3576 } 3577 3578 gpa &= ~0xFFFull; 3579 dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index; 3580 3581 if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa, 3582 offset_in_page(dst), sizeof(gpa))) 3583 return 0; 3584 3585 vmcs12->guest_pml_index--; 3586 3587 return 0; 3588 } 3589 3590 /* 3591 * Intel's VMX Instruction Reference specifies a common set of prerequisites 3592 * for running VMX instructions (except VMXON, whose prerequisites are 3593 * slightly different). It also specifies what exception to inject otherwise. 3594 * Note that many of these exceptions have priority over VM exits, so they 3595 * don't have to be checked again here. 
/*
 * If from_vmentry is false, this is being called from state restore (either RSM
 * or KVM_SET_NESTED_STATE).  Otherwise it's called from vmlaunch/vmresume.
 *
 * Returns:
 *	NVMX_VMENTRY_SUCCESS: Entered VMX non-root mode
 *	NVMX_VMENTRY_VMFAIL:  Consistency check VMFail
 *	NVMX_VMENTRY_VMEXIT:  Consistency check VMExit
 *	NVMX_VMENTRY_KVM_INTERNAL_ERROR: KVM internal error
 */
enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
							bool from_vmentry)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	enum vm_entry_failure_code entry_failure_code;
	/* Default exit reason for a failed entry; .basic may be overridden. */
	union vmx_exit_reason exit_reason = {
		.basic = EXIT_REASON_INVALID_STATE,
		.failed_vmentry = 1,
	};
	u32 failed_index;

	trace_kvm_nested_vmenter(kvm_rip_read(vcpu),
				 vmx->nested.current_vmptr,
				 vmcs12->guest_rip,
				 vmcs12->guest_intr_status,
				 vmcs12->vm_entry_intr_info_field,
				 vmcs12->secondary_vm_exec_control & SECONDARY_EXEC_ENABLE_EPT,
				 vmcs12->ept_pointer,
				 vmcs12->guest_cr3,
				 KVM_ISA_VMX);

	kvm_service_local_tlb_flush_requests(vcpu);

	/*
	 * Snapshot the DEBUGCTL, BNDCFGS and CET values that vmcs12 will not
	 * load itself; they are stored in vmx->nested.pre_vmenter_* and read
	 * back by prepare_vmcs02().
	 */
	if (!vcpu->arch.nested_run_pending ||
	    !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
		vmx->nested.pre_vmenter_debugctl = vmx_guest_debugctl_read();
	if (kvm_mpx_supported() &&
	    (!vcpu->arch.nested_run_pending ||
	     !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
		vmx->nested.pre_vmenter_bndcfgs = vmcs_read64(GUEST_BNDCFGS);

	if (!vcpu->arch.nested_run_pending ||
	    !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE))
		vmcs_read_cet_state(vcpu, &vmx->nested.pre_vmenter_s_cet,
				    &vmx->nested.pre_vmenter_ssp,
				    &vmx->nested.pre_vmenter_ssp_tbl);

	/*
	 * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled.  In the
	 * event of a "late" VM-Fail, i.e. a VM-Fail detected by hardware but
	 * not KVM, KVM must unwind its software model to the pre-VM-Entry host
	 * state.  When EPT is disabled, GUEST_CR3 holds KVM's shadow CR3, not
	 * L1's "real" CR3, which causes nested_vmx_restore_host_state() to
	 * corrupt vcpu->arch.cr3.  Stuffing vmcs01.GUEST_CR3 results in the
	 * unwind naturally setting arch.cr3 to the correct value.  Smashing
	 * vmcs01.GUEST_CR3 is safe because nested VM-Exits, and the unwind,
	 * reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is guaranteed to be
	 * overwritten with a shadow CR3 prior to re-entering L1.
	 */
	if (!enable_ept)
		vmcs_writel(GUEST_CR3, vcpu->arch.cr3);

	vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);

	prepare_vmcs02_early(vmx, &vmx->vmcs01, vmcs12);

	if (from_vmentry) {
		/*
		 * The two early failures below happen before guest mode is
		 * entered, so switching back to vmcs01 is the only unwind
		 * performed here.
		 */
		if (unlikely(!nested_get_vmcs12_pages(vcpu))) {
			vmx_switch_vmcs(vcpu, &vmx->vmcs01);
			return NVMX_VMENTRY_KVM_INTERNAL_ERROR;
		}

		if (nested_vmx_check_controls_late(vcpu, vmcs12)) {
			vmx_switch_vmcs(vcpu, &vmx->vmcs01);
			return NVMX_VMENTRY_VMFAIL;
		}

		if (nested_vmx_check_guest_state(vcpu, vmcs12,
						 &entry_failure_code)) {
			exit_reason.basic = EXIT_REASON_INVALID_STATE;
			vmcs12->exit_qualification = entry_failure_code;
			goto vmentry_fail_vmexit;
		}
	}

	enter_guest_mode(vcpu);

	if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &entry_failure_code)) {
		exit_reason.basic = EXIT_REASON_INVALID_STATE;
		vmcs12->exit_qualification = entry_failure_code;
		goto vmentry_fail_vmexit_guest_mode;
	}

	if (from_vmentry) {
		/* A non-zero return is reported as the exit qualification. */
		failed_index = nested_vmx_load_msr(vcpu,
						   vmcs12->vm_entry_msr_load_addr,
						   vmcs12->vm_entry_msr_load_count);
		if (failed_index) {
			exit_reason.basic = EXIT_REASON_MSR_LOAD_FAIL;
			vmcs12->exit_qualification = failed_index;
			goto vmentry_fail_vmexit_guest_mode;
		}
	} else {
		/*
		 * The MMU is not initialized to point at the right entities yet and
		 * "get pages" would need to read data from the guest (i.e. we will
		 * need to perform gpa to hpa translation). Request a call
		 * to nested_get_vmcs12_pages before the next VM-entry.  The MSRs
		 * have already been set at vmentry time and should not be reset.
		 */
		kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
	}

	/*
	 * Re-evaluate pending events if L1 had a pending IRQ/NMI/INIT/SIPI
	 * when it executed VMLAUNCH/VMRESUME, as entering non-root mode can
	 * effectively unblock various events, e.g. INIT/SIPI cause VM-Exit
	 * unconditionally.  Take care to pull data from vmcs01 as appropriate,
	 * e.g. when checking for interrupt windows, as vmcs02 is now loaded.
	 */
	if ((__exec_controls_get(&vmx->vmcs01) & (CPU_BASED_INTR_WINDOW_EXITING |
						  CPU_BASED_NMI_WINDOW_EXITING)) ||
	    kvm_apic_has_pending_init_or_sipi(vcpu) ||
	    kvm_apic_has_interrupt(vcpu))
		kvm_make_request(KVM_REQ_EVENT, vcpu);

	/*
	 * Do not start the preemption timer hrtimer until after we know
	 * we are successful, so that only nested_vmx_vmexit needs to cancel
	 * the timer.
	 */
	vmx->nested.preemption_timer_expired = false;
	if (nested_cpu_has_preemption_timer(vmcs12)) {
		u64 timer_value = vmx_calc_preemption_timer_value(vcpu);
		vmx_start_preemption_timer(vcpu, timer_value);
	}

	/*
	 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
	 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
	 * returned as far as L1 is concerned. It will only return (and set
	 * the success flag) when L2 exits (see nested_vmx_vmexit()).
	 */
	return NVMX_VMENTRY_SUCCESS;

	/*
	 * A failed consistency check that leads to a VMExit during L1's
	 * VMEnter to L2 is a variation of a normal VMexit, as explained in
	 * 26.7 "VM-entry failures during or after loading guest state".
	 */
vmentry_fail_vmexit_guest_mode:
	/* Back out vmcs12's TSC offset before leaving guest mode. */
	if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING)
		vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
	leave_guest_mode(vcpu);

vmentry_fail_vmexit:
	vmx_switch_vmcs(vcpu, &vmx->vmcs01);

	/* On state restore, don't load host state or touch the exit reason. */
	if (!from_vmentry)
		return NVMX_VMENTRY_VMEXIT;

	load_vmcs12_host_state(vcpu, vmcs12);
	vmcs12->vm_exit_reason = exit_reason.full;
	if (enable_shadow_vmcs || nested_vmx_is_evmptr12_valid(vmx))
		vmx->nested.need_vmcs12_to_shadow_sync = true;
	return NVMX_VMENTRY_VMEXIT;
}
3787 */ 3788 static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) 3789 { 3790 struct vmcs12 *vmcs12; 3791 enum nvmx_vmentry_status status; 3792 struct vcpu_vmx *vmx = to_vmx(vcpu); 3793 u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu); 3794 enum nested_evmptrld_status evmptrld_status; 3795 3796 if (!nested_vmx_check_permission(vcpu)) 3797 return 1; 3798 3799 evmptrld_status = nested_vmx_handle_enlightened_vmptrld(vcpu, launch); 3800 if (evmptrld_status == EVMPTRLD_ERROR) { 3801 kvm_queue_exception(vcpu, UD_VECTOR); 3802 return 1; 3803 } 3804 3805 kvm_pmu_branch_retired(vcpu); 3806 3807 if (CC(evmptrld_status == EVMPTRLD_VMFAIL)) 3808 return nested_vmx_failInvalid(vcpu); 3809 3810 if (CC(!nested_vmx_is_evmptr12_valid(vmx) && 3811 vmx->nested.current_vmptr == INVALID_GPA)) 3812 return nested_vmx_failInvalid(vcpu); 3813 3814 vmcs12 = get_vmcs12(vcpu); 3815 3816 /* 3817 * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact 3818 * that there *is* a valid VMCS pointer, RFLAGS.CF is set 3819 * rather than RFLAGS.ZF, and no error number is stored to the 3820 * VM-instruction error field. 3821 */ 3822 if (CC(vmcs12->hdr.shadow_vmcs)) 3823 return nested_vmx_failInvalid(vcpu); 3824 3825 if (nested_vmx_is_evmptr12_valid(vmx)) { 3826 struct hv_enlightened_vmcs *evmcs = nested_vmx_evmcs(vmx); 3827 3828 copy_enlightened_to_vmcs12(vmx, evmcs->hv_clean_fields); 3829 /* Enlightened VMCS doesn't have launch state */ 3830 vmcs12->launch_state = !launch; 3831 } else if (enable_shadow_vmcs) { 3832 copy_shadow_to_vmcs12(vmx); 3833 } 3834 3835 /* 3836 * The nested entry process starts with enforcing various prerequisites 3837 * on vmcs12 as required by the Intel SDM, and act appropriately when 3838 * they fail: As the SDM explains, some conditions should cause the 3839 * instruction to fail, while others will cause the instruction to seem 3840 * to succeed, but return an EXIT_REASON_INVALID_STATE. 
3841 * To speed up the normal (success) code path, we should avoid checking 3842 * for misconfigurations which will anyway be caught by the processor 3843 * when using the merged vmcs02. 3844 */ 3845 if (CC(interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS)) 3846 return nested_vmx_fail(vcpu, VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS); 3847 3848 if (CC(vmcs12->launch_state == launch)) 3849 return nested_vmx_fail(vcpu, 3850 launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS 3851 : VMXERR_VMRESUME_NONLAUNCHED_VMCS); 3852 3853 if (nested_vmx_check_controls(vcpu, vmcs12)) 3854 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 3855 3856 if (nested_vmx_check_address_space_size(vcpu, vmcs12)) 3857 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); 3858 3859 if (nested_vmx_check_host_state(vcpu, vmcs12)) 3860 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); 3861 3862 /* 3863 * We're finally done with prerequisite checking, and can start with 3864 * the nested entry. 3865 */ 3866 vcpu->arch.nested_run_pending = KVM_NESTED_RUN_PENDING; 3867 vmx->nested.has_preemption_timer_deadline = false; 3868 status = nested_vmx_enter_non_root_mode(vcpu, true); 3869 if (unlikely(status != NVMX_VMENTRY_SUCCESS)) 3870 goto vmentry_failed; 3871 3872 /* Hide L1D cache contents from the nested guest. */ 3873 kvm_request_l1tf_flush_l1d(); 3874 3875 /* 3876 * Must happen outside of nested_vmx_enter_non_root_mode() as it will 3877 * also be used as part of restoring nVMX state for 3878 * snapshot restore (migration). 3879 * 3880 * In this flow, it is assumed that vmcs12 cache was 3881 * transferred as part of captured nVMX state and should 3882 * therefore not be read from guest memory (which may not 3883 * exist on destination host yet). 
3884 */ 3885 nested_cache_shadow_vmcs12(vcpu, vmcs12); 3886 3887 switch (vmcs12->guest_activity_state) { 3888 case GUEST_ACTIVITY_HLT: 3889 /* 3890 * If we're entering a halted L2 vcpu and the L2 vcpu won't be 3891 * awakened by event injection or by an NMI-window VM-exit or 3892 * by an interrupt-window VM-exit, halt the vcpu. 3893 */ 3894 if (!(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) && 3895 !nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING) && 3896 !(nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING) && 3897 (vmcs12->guest_rflags & X86_EFLAGS_IF))) { 3898 vcpu->arch.nested_run_pending = 0; 3899 return kvm_emulate_halt_noskip(vcpu); 3900 } 3901 break; 3902 case GUEST_ACTIVITY_WAIT_SIPI: 3903 vcpu->arch.nested_run_pending = 0; 3904 kvm_set_mp_state(vcpu, KVM_MP_STATE_INIT_RECEIVED); 3905 break; 3906 default: 3907 break; 3908 } 3909 3910 return 1; 3911 3912 vmentry_failed: 3913 vcpu->arch.nested_run_pending = 0; 3914 if (status == NVMX_VMENTRY_KVM_INTERNAL_ERROR) 3915 return 0; 3916 if (status == NVMX_VMENTRY_VMEXIT) 3917 return 1; 3918 WARN_ON_ONCE(status != NVMX_VMENTRY_VMFAIL); 3919 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 3920 } 3921 3922 /* 3923 * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date 3924 * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK). 3925 * This function returns the new value we should put in vmcs12.guest_cr0. 3926 * It's not enough to just return the vmcs02 GUEST_CR0. Rather, 3927 * 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now 3928 * available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0 3929 * didn't trap the bit, because if L1 did, so would L0). 3930 * 2. Bits that L1 asked to trap (and therefore L0 also did) could not have 3931 * been modified by L2, and L1 knows it. So just leave the old value of 3932 * the bit from vmcs12.guest_cr0. 
Note that the bit from vmcs02 GUEST_CR0 3933 * isn't relevant, because if L0 traps this bit it can set it to anything. 3934 * 3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have 3935 * changed these bits, and therefore they need to be updated, but L0 3936 * didn't necessarily allow them to be changed in GUEST_CR0 - and rather 3937 * put them in vmcs02 CR0_READ_SHADOW. So take these bits from there. 3938 */ 3939 static inline unsigned long 3940 vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 3941 { 3942 return 3943 /*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) | 3944 /*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) | 3945 /*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask | 3946 vcpu->arch.cr0_guest_owned_bits)); 3947 } 3948 3949 static inline unsigned long 3950 vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 3951 { 3952 return 3953 /*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) | 3954 /*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) | 3955 /*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask | 3956 vcpu->arch.cr4_guest_owned_bits)); 3957 } 3958 3959 static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, 3960 struct vmcs12 *vmcs12, 3961 u32 vm_exit_reason, u32 exit_intr_info) 3962 { 3963 u32 idt_vectoring; 3964 unsigned int nr; 3965 3966 /* 3967 * Per the SDM, VM-Exits due to double and triple faults are never 3968 * considered to occur during event delivery, even if the double/triple 3969 * fault is the result of an escalating vectoring issue. 3970 * 3971 * Note, the SDM qualifies the double fault behavior with "The original 3972 * event results in a double-fault exception". It's unclear why the 3973 * qualification exists since exits due to double fault can occur only 3974 * while vectoring a different exception (injected events are never 3975 * subject to interception), i.e. there's _always_ an original event. 
3976 * 3977 * The SDM also uses NMI as a confusing example for the "original event 3978 * causes the VM exit directly" clause. NMI isn't special in any way, 3979 * the same rule applies to all events that cause an exit directly. 3980 * NMI is an odd choice for the example because NMIs can only occur on 3981 * instruction boundaries, i.e. they _can't_ occur during vectoring. 3982 */ 3983 if ((u16)vm_exit_reason == EXIT_REASON_TRIPLE_FAULT || 3984 ((u16)vm_exit_reason == EXIT_REASON_EXCEPTION_NMI && 3985 is_double_fault(exit_intr_info))) { 3986 vmcs12->idt_vectoring_info_field = 0; 3987 } else if (vcpu->arch.exception.injected) { 3988 nr = vcpu->arch.exception.vector; 3989 idt_vectoring = nr | VECTORING_INFO_VALID_MASK; 3990 3991 if (kvm_exception_is_soft(nr)) { 3992 vmcs12->vm_exit_instruction_len = 3993 vcpu->arch.event_exit_inst_len; 3994 idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION; 3995 } else 3996 idt_vectoring |= INTR_TYPE_HARD_EXCEPTION; 3997 3998 if (vcpu->arch.exception.has_error_code) { 3999 idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK; 4000 vmcs12->idt_vectoring_error_code = 4001 vcpu->arch.exception.error_code; 4002 } 4003 4004 vmcs12->idt_vectoring_info_field = idt_vectoring; 4005 } else if (vcpu->arch.nmi_injected) { 4006 vmcs12->idt_vectoring_info_field = 4007 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR; 4008 } else if (vcpu->arch.interrupt.injected) { 4009 nr = vcpu->arch.interrupt.nr; 4010 idt_vectoring = nr | VECTORING_INFO_VALID_MASK; 4011 4012 if (vcpu->arch.interrupt.soft) { 4013 idt_vectoring |= INTR_TYPE_SOFT_INTR; 4014 vmcs12->vm_entry_instruction_len = 4015 vcpu->arch.event_exit_inst_len; 4016 } else 4017 idt_vectoring |= INTR_TYPE_EXT_INTR; 4018 4019 vmcs12->idt_vectoring_info_field = idt_vectoring; 4020 } else { 4021 vmcs12->idt_vectoring_info_field = 0; 4022 } 4023 } 4024 4025 static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) 4026 { 4027 struct vcpu_vmx *vmx = to_vmx(vcpu); 4028 int max_irr; 4029 void 
*vapic_page; 4030 u16 status; 4031 4032 if (!vmx->nested.pi_pending) 4033 return 0; 4034 4035 if (!vmx->nested.pi_desc) 4036 goto mmio_needed; 4037 4038 vmx->nested.pi_pending = false; 4039 4040 if (!pi_test_and_clear_on(vmx->nested.pi_desc)) 4041 return 0; 4042 4043 max_irr = pi_find_highest_vector(vmx->nested.pi_desc); 4044 if (max_irr > 0) { 4045 vapic_page = vmx->nested.virtual_apic_map.hva; 4046 if (!vapic_page) 4047 goto mmio_needed; 4048 4049 __kvm_apic_update_irr(vmx->nested.pi_desc->pir, 4050 vapic_page, &max_irr); 4051 status = vmcs_read16(GUEST_INTR_STATUS); 4052 if ((u8)max_irr > ((u8)status & 0xff)) { 4053 status &= ~0xff; 4054 status |= (u8)max_irr; 4055 vmcs_write16(GUEST_INTR_STATUS, status); 4056 } 4057 } 4058 4059 kvm_vcpu_map_mark_dirty(vcpu, &vmx->nested.virtual_apic_map); 4060 kvm_vcpu_map_mark_dirty(vcpu, &vmx->nested.pi_desc_map); 4061 return 0; 4062 4063 mmio_needed: 4064 kvm_handle_memory_failure(vcpu, X86EMUL_IO_NEEDED, NULL); 4065 return -ENXIO; 4066 } 4067 4068 static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu) 4069 { 4070 struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit; 4071 u32 intr_info = ex->vector | INTR_INFO_VALID_MASK; 4072 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 4073 unsigned long exit_qual; 4074 4075 if (ex->has_payload) { 4076 exit_qual = ex->payload; 4077 } else if (ex->vector == PF_VECTOR) { 4078 exit_qual = vcpu->arch.cr2; 4079 } else if (ex->vector == DB_VECTOR) { 4080 exit_qual = vcpu->arch.dr6; 4081 exit_qual &= ~DR6_BT; 4082 exit_qual ^= DR6_ACTIVE_LOW; 4083 } else { 4084 exit_qual = 0; 4085 } 4086 4087 /* 4088 * Unlike AMD's Paged Real Mode, which reports an error code on #PF 4089 * VM-Exits even if the CPU is in Real Mode, Intel VMX never sets the 4090 * "has error code" flags on VM-Exit if the CPU is in Real Mode. 
4091 */ 4092 if (ex->has_error_code && is_protmode(vcpu)) { 4093 /* 4094 * Intel CPUs do not generate error codes with bits 31:16 set, 4095 * and more importantly VMX disallows setting bits 31:16 in the 4096 * injected error code for VM-Entry. Drop the bits to mimic 4097 * hardware and avoid inducing failure on nested VM-Entry if L1 4098 * chooses to inject the exception back to L2. AMD CPUs _do_ 4099 * generate "full" 32-bit error codes, so KVM allows userspace 4100 * to inject exception error codes with bits 31:16 set. 4101 */ 4102 vmcs12->vm_exit_intr_error_code = (u16)ex->error_code; 4103 intr_info |= INTR_INFO_DELIVER_CODE_MASK; 4104 } 4105 4106 if (kvm_exception_is_soft(ex->vector)) 4107 intr_info |= INTR_TYPE_SOFT_EXCEPTION; 4108 else 4109 intr_info |= INTR_TYPE_HARD_EXCEPTION; 4110 4111 if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) && 4112 vmx_get_nmi_mask(vcpu)) 4113 intr_info |= INTR_INFO_UNBLOCK_NMI; 4114 4115 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual); 4116 } 4117 4118 /* 4119 * Returns true if a debug trap is (likely) pending delivery. Infer the class 4120 * of a #DB (trap-like vs. fault-like) from the exception payload (to-be-DR6). 4121 * Using the payload is flawed because code breakpoints (fault-like) and data 4122 * breakpoints (trap-like) set the same bits in DR6 (breakpoint detected), i.e. 4123 * this will return false positives if a to-be-injected code breakpoint #DB is 4124 * pending (from KVM's perspective, but not "pending" across an instruction 4125 * boundary). ICEBP, a.k.a. INT1, is also not reflected here even though it 4126 * too is trap-like. 
4127 * 4128 * KVM "works" despite these flaws as ICEBP isn't currently supported by the 4129 * emulator, Monitor Trap Flag is not marked pending on intercepted #DBs (the 4130 * #DB has already happened), and MTF isn't marked pending on code breakpoints 4131 * from the emulator (because such #DBs are fault-like and thus don't trigger 4132 * actions that fire on instruction retire). 4133 */ 4134 static unsigned long vmx_get_pending_dbg_trap(struct kvm_queued_exception *ex) 4135 { 4136 if (!ex->pending || ex->vector != DB_VECTOR) 4137 return 0; 4138 4139 /* General Detect #DBs are always fault-like. */ 4140 return ex->payload & ~DR6_BD; 4141 } 4142 4143 /* 4144 * Returns true if there's a pending #DB exception that is lower priority than 4145 * a pending Monitor Trap Flag VM-Exit. TSS T-flag #DBs are not emulated by 4146 * KVM, but could theoretically be injected by userspace. Note, this code is 4147 * imperfect, see above. 4148 */ 4149 static bool vmx_is_low_priority_db_trap(struct kvm_queued_exception *ex) 4150 { 4151 return vmx_get_pending_dbg_trap(ex) & ~DR6_BT; 4152 } 4153 4154 /* 4155 * Certain VM-exits set the 'pending debug exceptions' field to indicate a 4156 * recognized #DB (data or single-step) that has yet to be delivered. Since KVM 4157 * represents these debug traps with a payload that is said to be compatible 4158 * with the 'pending debug exceptions' field, write the payload to the VMCS 4159 * field if a VM-exit is delivered before the debug trap. 
4160 */ 4161 static void nested_vmx_update_pending_dbg(struct kvm_vcpu *vcpu) 4162 { 4163 unsigned long pending_dbg; 4164 4165 pending_dbg = vmx_get_pending_dbg_trap(&vcpu->arch.exception); 4166 if (pending_dbg) 4167 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, pending_dbg); 4168 } 4169 4170 static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu) 4171 { 4172 return nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) && 4173 to_vmx(vcpu)->nested.preemption_timer_expired; 4174 } 4175 4176 static bool vmx_has_nested_events(struct kvm_vcpu *vcpu, bool for_injection) 4177 { 4178 struct vcpu_vmx *vmx = to_vmx(vcpu); 4179 void *vapic = vmx->nested.virtual_apic_map.hva; 4180 int max_irr, vppr; 4181 4182 if (nested_vmx_preemption_timer_pending(vcpu) || 4183 vmx->nested.mtf_pending) 4184 return true; 4185 4186 /* 4187 * Virtual Interrupt Delivery doesn't require manual injection. Either 4188 * the interrupt is already in GUEST_RVI and will be recognized by CPU 4189 * at VM-Entry, or there is a KVM_REQ_EVENT pending and KVM will move 4190 * the interrupt from the PIR to RVI prior to entering the guest. 4191 */ 4192 if (for_injection) 4193 return false; 4194 4195 if (!nested_cpu_has_vid(get_vmcs12(vcpu)) || 4196 __vmx_interrupt_blocked(vcpu)) 4197 return false; 4198 4199 if (!vapic) 4200 return false; 4201 4202 vppr = *((u32 *)(vapic + APIC_PROCPRI)); 4203 4204 max_irr = vmx_get_rvi(); 4205 if ((max_irr & 0xf0) > (vppr & 0xf0)) 4206 return true; 4207 4208 if (vmx->nested.pi_pending && vmx->nested.pi_desc && 4209 pi_test_on(vmx->nested.pi_desc)) { 4210 max_irr = pi_find_highest_vector(vmx->nested.pi_desc); 4211 if (max_irr > 0 && (max_irr & 0xf0) > (vppr & 0xf0)) 4212 return true; 4213 } 4214 4215 return false; 4216 } 4217 4218 /* 4219 * Per the Intel SDM's table "Priority Among Concurrent Events", with minor 4220 * edits to fill in missing examples, e.g. 
* #DB due to split-lock accesses,
 * and less minor edits to splice in the priority of VMX Non-Root specific
 * events, e.g. MTF and NMI/INTR-window exiting.
 *
 * 1   Hardware Reset and Machine Checks
 *     - RESET
 *     - Machine Check
 *
 * 2   Trap on Task Switch
 *     - T flag in TSS is set (on task switch)
 *
 * 3   External Hardware Interventions
 *     - FLUSH
 *     - STOPCLK
 *     - SMI
 *     - INIT
 *
 * 3.5 Monitor Trap Flag (MTF) VM-exit[1]
 *
 * 4   Traps on Previous Instruction
 *     - Breakpoints
 *     - Trap-class Debug Exceptions (#DB due to TF flag set, data/I-O
 *       breakpoint, or #DB due to a split-lock access)
 *
 * 4.3 VMX-preemption timer expired VM-exit[2]
 *
 * 4.6 NMI-window exiting VM-exit[3]
 *
 * 5   Nonmaskable Interrupts (NMI)
 *
 * 5.5 Interrupt-window exiting VM-exit and Virtual-interrupt delivery[4]
 *
 * 6   Maskable Hardware Interrupts
 *
 * 7   Code Breakpoint Fault
 *
 * 8   Faults from Fetching Next Instruction
 *     - Code-Segment Limit Violation
 *     - Code Page Fault
 *     - Control protection exception (missing ENDBRANCH at target of indirect
 *       call or jump)
 *
 * 9   Faults from Decoding Next Instruction
 *     - Instruction length > 15 bytes
 *     - Invalid Opcode
 *     - Coprocessor Not Available
 *
 * 10  Faults on Executing Instruction
 *     - Overflow
 *     - Bound error
 *     - Invalid TSS
 *     - Segment Not Present
 *     - Stack fault
 *     - General Protection
 *     - Data Page Fault
 *     - Alignment Check
 *     - x86 FPU Floating-point exception
 *     - SIMD floating-point exception
 *     - Virtualization exception
 *     - Control protection exception
 *
 * [1] Per the "Monitor Trap Flag" section: System-management interrupts (SMIs),
 *     INIT signals, and higher priority events take priority over MTF VM exits.
4283 * MTF VM exits take priority over debug-trap exceptions and lower priority 4284 * events. 4285 * 4286 * [2] Debug-trap exceptions and higher priority events take priority over VM exits 4287 * caused by the VMX-preemption timer. VM exits caused by the VMX-preemption 4288 * timer take priority over VM exits caused by the "NMI-window exiting" 4289 * VM-execution control and lower priority events. 4290 * 4291 * [3] Debug-trap exceptions and higher priority events take priority over VM exits 4292 * caused by "NMI-window exiting". VM exits caused by this control take 4293 * priority over non-maskable interrupts (NMIs) and lower priority events. 4294 * 4295 * [4] Virtual-interrupt delivery has the same priority as that of VM exits due to 4296 * the 1-setting of the "interrupt-window exiting" VM-execution control. Thus, 4297 * non-maskable interrupts (NMIs) and higher priority events take priority over 4298 * delivery of a virtual interrupt; delivery of a virtual interrupt takes 4299 * priority over external interrupts and lower priority events. 4300 */ 4301 static int vmx_check_nested_events(struct kvm_vcpu *vcpu) 4302 { 4303 struct kvm_lapic *apic = vcpu->arch.apic; 4304 struct vcpu_vmx *vmx = to_vmx(vcpu); 4305 /* 4306 * Only a pending nested run blocks a pending exception. If there is a 4307 * previously injected event, the pending exception occurred while said 4308 * event was being delivered and thus needs to be handled. 4309 */ 4310 bool block_nested_exceptions = vcpu->arch.nested_run_pending; 4311 /* 4312 * Events that don't require injection, i.e. that are virtualized by 4313 * hardware, aren't blocked by a pending VM-Enter as KVM doesn't need 4314 * to regain control in order to deliver the event, and hardware will 4315 * handle event ordering, e.g. with respect to injected exceptions. 4316 * 4317 * But, new events (not exceptions) are only recognized at instruction 4318 * boundaries. 
If an event needs reinjection, then KVM is handling a 4319 * VM-Exit that occurred _during_ instruction execution; new events, 4320 * irrespective of whether or not they're injected, are blocked until 4321 * the instruction completes. 4322 */ 4323 bool block_non_injected_events = kvm_event_needs_reinjection(vcpu); 4324 /* 4325 * Inject events are blocked by nested VM-Enter, as KVM is responsible 4326 * for managing priority between concurrent events, i.e. KVM needs to 4327 * wait until after VM-Enter completes to deliver injected events. 4328 */ 4329 bool block_nested_events = block_nested_exceptions || 4330 block_non_injected_events; 4331 4332 if (lapic_in_kernel(vcpu) && 4333 test_bit(KVM_APIC_INIT, &apic->pending_events)) { 4334 if (block_nested_events) 4335 return -EBUSY; 4336 nested_vmx_update_pending_dbg(vcpu); 4337 clear_bit(KVM_APIC_INIT, &apic->pending_events); 4338 if (vcpu->arch.mp_state != KVM_MP_STATE_INIT_RECEIVED) 4339 nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0); 4340 4341 /* MTF is discarded if the vCPU is in WFS. */ 4342 vmx->nested.mtf_pending = false; 4343 return 0; 4344 } 4345 4346 if (lapic_in_kernel(vcpu) && 4347 test_bit(KVM_APIC_SIPI, &apic->pending_events)) { 4348 if (block_nested_events) 4349 return -EBUSY; 4350 4351 clear_bit(KVM_APIC_SIPI, &apic->pending_events); 4352 if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { 4353 nested_vmx_vmexit(vcpu, EXIT_REASON_SIPI_SIGNAL, 0, 4354 apic->sipi_vector & 0xFFUL); 4355 return 0; 4356 } 4357 /* Fallthrough, the SIPI is completely ignored. */ 4358 } 4359 4360 /* 4361 * Process exceptions that are higher priority than Monitor Trap Flag: 4362 * fault-like exceptions, TSS T flag #DB (not emulated by KVM, but 4363 * could theoretically come in from userspace), and ICEBP (INT1). 4364 * 4365 * TODO: SMIs have higher priority than MTF and trap-like #DBs (except 4366 * for TSS T flag #DBs). 
KVM also doesn't save/restore pending MTF 4367 * across SMI/RSM as it should; that needs to be addressed in order to 4368 * prioritize SMI over MTF and trap-like #DBs. 4369 */ 4370 if (vcpu->arch.exception_vmexit.pending && 4371 !vmx_is_low_priority_db_trap(&vcpu->arch.exception_vmexit)) { 4372 if (block_nested_exceptions) 4373 return -EBUSY; 4374 4375 nested_vmx_inject_exception_vmexit(vcpu); 4376 return 0; 4377 } 4378 4379 if (vcpu->arch.exception.pending && 4380 !vmx_is_low_priority_db_trap(&vcpu->arch.exception)) { 4381 if (block_nested_exceptions) 4382 return -EBUSY; 4383 goto no_vmexit; 4384 } 4385 4386 if (vmx->nested.mtf_pending) { 4387 if (block_nested_events) 4388 return -EBUSY; 4389 nested_vmx_update_pending_dbg(vcpu); 4390 nested_vmx_vmexit(vcpu, EXIT_REASON_MONITOR_TRAP_FLAG, 0, 0); 4391 return 0; 4392 } 4393 4394 if (vcpu->arch.exception_vmexit.pending) { 4395 if (block_nested_exceptions) 4396 return -EBUSY; 4397 4398 nested_vmx_inject_exception_vmexit(vcpu); 4399 return 0; 4400 } 4401 4402 if (vcpu->arch.exception.pending) { 4403 if (block_nested_exceptions) 4404 return -EBUSY; 4405 goto no_vmexit; 4406 } 4407 4408 if (nested_vmx_preemption_timer_pending(vcpu)) { 4409 if (block_nested_events) 4410 return -EBUSY; 4411 nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0); 4412 return 0; 4413 } 4414 4415 if (vcpu->arch.smi_pending && !is_smm(vcpu)) { 4416 if (block_nested_events) 4417 return -EBUSY; 4418 goto no_vmexit; 4419 } 4420 4421 if (vcpu->arch.nmi_pending && !vmx_nmi_blocked(vcpu)) { 4422 if (block_nested_events) 4423 return -EBUSY; 4424 if (!nested_exit_on_nmi(vcpu)) 4425 goto no_vmexit; 4426 4427 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, 4428 NMI_VECTOR | INTR_TYPE_NMI_INTR | 4429 INTR_INFO_VALID_MASK, 0); 4430 /* 4431 * The NMI-triggered VM exit counts as injection: 4432 * clear this one and block further NMIs. 
4433 */ 4434 vcpu->arch.nmi_pending = 0; 4435 vmx_set_nmi_mask(vcpu, true); 4436 return 0; 4437 } 4438 4439 if (kvm_cpu_has_interrupt(vcpu) && !vmx_interrupt_blocked(vcpu)) { 4440 int irq; 4441 4442 if (!nested_exit_on_intr(vcpu)) { 4443 if (block_nested_events) 4444 return -EBUSY; 4445 4446 goto no_vmexit; 4447 } 4448 4449 if (!nested_exit_intr_ack_set(vcpu)) { 4450 if (block_nested_events) 4451 return -EBUSY; 4452 4453 nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); 4454 return 0; 4455 } 4456 4457 irq = kvm_cpu_get_extint(vcpu); 4458 if (irq != -1) { 4459 if (block_nested_events) 4460 return -EBUSY; 4461 4462 nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 4463 INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | irq, 0); 4464 return 0; 4465 } 4466 4467 irq = kvm_apic_has_interrupt(vcpu); 4468 if (WARN_ON_ONCE(irq < 0)) 4469 goto no_vmexit; 4470 4471 /* 4472 * If the IRQ is L2's PI notification vector, process posted 4473 * interrupts for L2 instead of injecting VM-Exit, as the 4474 * detection/morphing architecturally occurs when the IRQ is 4475 * delivered to the CPU. Note, only interrupts that are routed 4476 * through the local APIC trigger posted interrupt processing, 4477 * and enabling posted interrupts requires ACK-on-exit. 4478 */ 4479 if (irq == vmx->nested.posted_intr_nv) { 4480 /* 4481 * Nested posted interrupts are delivered via RVI, i.e. 4482 * aren't injected by KVM, and so can be queued even if 4483 * manual event injection is disallowed. 
4484 */ 4485 if (block_non_injected_events) 4486 return -EBUSY; 4487 4488 vmx->nested.pi_pending = true; 4489 kvm_apic_clear_irr(vcpu, irq); 4490 goto no_vmexit; 4491 } 4492 4493 if (block_nested_events) 4494 return -EBUSY; 4495 4496 nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 4497 INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | irq, 0); 4498 4499 /* 4500 * ACK the interrupt _after_ emulating VM-Exit, as the IRQ must 4501 * be marked as in-service in vmcs01.GUEST_INTERRUPT_STATUS.SVI 4502 * if APICv is active. 4503 */ 4504 kvm_apic_ack_interrupt(vcpu, irq); 4505 return 0; 4506 } 4507 4508 no_vmexit: 4509 return vmx_complete_nested_posted_interrupt(vcpu); 4510 } 4511 4512 static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) 4513 { 4514 ktime_t remaining = 4515 hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer); 4516 u64 value; 4517 4518 if (ktime_to_ns(remaining) <= 0) 4519 return 0; 4520 4521 value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz; 4522 do_div(value, 1000000); 4523 return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 4524 } 4525 4526 static bool is_vmcs12_ext_field(unsigned long field) 4527 { 4528 switch (field) { 4529 case GUEST_ES_SELECTOR: 4530 case GUEST_CS_SELECTOR: 4531 case GUEST_SS_SELECTOR: 4532 case GUEST_DS_SELECTOR: 4533 case GUEST_FS_SELECTOR: 4534 case GUEST_GS_SELECTOR: 4535 case GUEST_LDTR_SELECTOR: 4536 case GUEST_TR_SELECTOR: 4537 case GUEST_ES_LIMIT: 4538 case GUEST_CS_LIMIT: 4539 case GUEST_SS_LIMIT: 4540 case GUEST_DS_LIMIT: 4541 case GUEST_FS_LIMIT: 4542 case GUEST_GS_LIMIT: 4543 case GUEST_LDTR_LIMIT: 4544 case GUEST_TR_LIMIT: 4545 case GUEST_GDTR_LIMIT: 4546 case GUEST_IDTR_LIMIT: 4547 case GUEST_ES_AR_BYTES: 4548 case GUEST_DS_AR_BYTES: 4549 case GUEST_FS_AR_BYTES: 4550 case GUEST_GS_AR_BYTES: 4551 case GUEST_LDTR_AR_BYTES: 4552 case GUEST_TR_AR_BYTES: 4553 case GUEST_ES_BASE: 4554 case GUEST_CS_BASE: 4555 case GUEST_SS_BASE: 4556 case GUEST_DS_BASE: 4557 case GUEST_FS_BASE: 4558 case 
GUEST_GS_BASE: 4559 case GUEST_LDTR_BASE: 4560 case GUEST_TR_BASE: 4561 case GUEST_GDTR_BASE: 4562 case GUEST_IDTR_BASE: 4563 case GUEST_PENDING_DBG_EXCEPTIONS: 4564 case GUEST_BNDCFGS: 4565 return true; 4566 default: 4567 break; 4568 } 4569 4570 return false; 4571 } 4572 4573 static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, 4574 struct vmcs12 *vmcs12) 4575 { 4576 struct vcpu_vmx *vmx = to_vmx(vcpu); 4577 4578 vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR); 4579 vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR); 4580 vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR); 4581 vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR); 4582 vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR); 4583 vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR); 4584 vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR); 4585 vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR); 4586 vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT); 4587 vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT); 4588 vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT); 4589 vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT); 4590 vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT); 4591 vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT); 4592 vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT); 4593 vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT); 4594 vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT); 4595 vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT); 4596 vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES); 4597 vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES); 4598 vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES); 4599 vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES); 4600 vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES); 4601 vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES); 4602 vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE); 
4603 vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE); 4604 vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE); 4605 vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE); 4606 vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE); 4607 vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE); 4608 vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE); 4609 vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE); 4610 vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE); 4611 vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE); 4612 vmcs12->guest_pending_dbg_exceptions = 4613 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); 4614 4615 vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false; 4616 } 4617 4618 static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, 4619 struct vmcs12 *vmcs12) 4620 { 4621 struct vcpu_vmx *vmx = to_vmx(vcpu); 4622 int cpu; 4623 4624 if (!vmx->nested.need_sync_vmcs02_to_vmcs12_rare) 4625 return; 4626 4627 4628 WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01); 4629 4630 cpu = get_cpu(); 4631 vmx->loaded_vmcs = &vmx->nested.vmcs02; 4632 vmx_vcpu_load_vmcs(vcpu, cpu); 4633 4634 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 4635 4636 vmx->loaded_vmcs = &vmx->vmcs01; 4637 vmx_vcpu_load_vmcs(vcpu, cpu); 4638 put_cpu(); 4639 } 4640 4641 /* 4642 * Update the guest state fields of vmcs12 to reflect changes that 4643 * occurred while L2 was running. (The "IA-32e mode guest" bit of the 4644 * VM-entry controls is also updated, since this is really a guest 4645 * state bit.) 
4646 */ 4647 static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 4648 { 4649 struct vcpu_vmx *vmx = to_vmx(vcpu); 4650 4651 if (nested_vmx_is_evmptr12_valid(vmx)) 4652 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 4653 4654 vmx->nested.need_sync_vmcs02_to_vmcs12_rare = 4655 !nested_vmx_is_evmptr12_valid(vmx); 4656 4657 vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); 4658 vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); 4659 4660 vmcs12->guest_rsp = kvm_rsp_read(vcpu); 4661 vmcs12->guest_rip = kvm_rip_read(vcpu); 4662 vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS); 4663 4664 vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES); 4665 vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES); 4666 4667 vmcs12->guest_interruptibility_info = 4668 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); 4669 4670 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) 4671 vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT; 4672 else if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) 4673 vmcs12->guest_activity_state = GUEST_ACTIVITY_WAIT_SIPI; 4674 else 4675 vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE; 4676 4677 if (nested_cpu_has_preemption_timer(vmcs12) && 4678 vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER && 4679 !vcpu->arch.nested_run_pending) 4680 vmcs12->vmx_preemption_timer_value = 4681 vmx_get_preemption_timer_value(vcpu); 4682 4683 /* 4684 * In some cases (usually, nested EPT), L2 is allowed to change its 4685 * own CR3 without exiting. If it has changed it, we must keep it. 4686 * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined 4687 * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12. 4688 * 4689 * Additionally, restore L2's PDPTR to vmcs12. 
4690 */ 4691 if (enable_ept) { 4692 vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3); 4693 if (nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { 4694 vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0); 4695 vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1); 4696 vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2); 4697 vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); 4698 } 4699 } 4700 4701 vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS); 4702 4703 if (nested_cpu_has_vid(vmcs12)) 4704 vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS); 4705 4706 vmcs12->vm_entry_controls = 4707 (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | 4708 (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); 4709 4710 /* 4711 * Note! Save DR7, but intentionally don't grab DEBUGCTL from vmcs02. 4712 * Writes to DEBUGCTL that aren't intercepted by L1 are immediately 4713 * propagated to vmcs12 (see vmx_set_msr()), as the value loaded into 4714 * vmcs02 doesn't strictly track vmcs12. 4715 */ 4716 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) 4717 vmcs12->guest_dr7 = vcpu->arch.dr7; 4718 4719 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) 4720 vmcs12->guest_ia32_efer = vcpu->arch.efer; 4721 4722 vmcs_read_cet_state(&vmx->vcpu, &vmcs12->guest_s_cet, 4723 &vmcs12->guest_ssp, 4724 &vmcs12->guest_ssp_tbl); 4725 } 4726 4727 /* 4728 * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits 4729 * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), 4730 * and this function updates it to reflect the changes to the guest state while 4731 * L2 was running (and perhaps made some exits which were handled directly by L0 4732 * without going back to L1), and to reflect the exit reason. 4733 * Note that we do not have to copy here all VMCS fields, just those that 4734 * could have changed by the L2 guest or the exit - i.e., the guest-state and 4735 * exit-information fields only. 
Other fields are modified by L1 with VMWRITE, 4736 * which already writes to vmcs12 directly. 4737 */ 4738 static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, 4739 u32 vm_exit_reason, u32 exit_intr_info, 4740 unsigned long exit_qualification, u32 exit_insn_len) 4741 { 4742 /* update exit information fields: */ 4743 vmcs12->vm_exit_reason = vm_exit_reason; 4744 if (vmx_get_exit_reason(vcpu).enclave_mode) 4745 vmcs12->vm_exit_reason |= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE; 4746 vmcs12->exit_qualification = exit_qualification; 4747 4748 /* 4749 * On VM-Exit due to a failed VM-Entry, the VMCS isn't marked launched 4750 * and only EXIT_REASON and EXIT_QUALIFICATION are updated, all other 4751 * exit info fields are unmodified. 4752 */ 4753 if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) { 4754 vmcs12->launch_state = 1; 4755 4756 /* vm_entry_intr_info_field is cleared on exit. Emulate this 4757 * instead of reading the real value. */ 4758 vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK; 4759 4760 /* 4761 * Transfer the event that L0 or L1 may wanted to inject into 4762 * L2 to IDT_VECTORING_INFO_FIELD. 4763 */ 4764 vmcs12_save_pending_event(vcpu, vmcs12, 4765 vm_exit_reason, exit_intr_info); 4766 4767 vmcs12->vm_exit_intr_info = exit_intr_info; 4768 vmcs12->vm_exit_instruction_len = exit_insn_len; 4769 vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 4770 4771 /* 4772 * According to spec, there's no need to store the guest's 4773 * MSRs if the exit is due to a VM-entry failure that occurs 4774 * during or after loading the guest state. Since this exit 4775 * does not fall in that category, we need to save the MSRs. 
4776 */ 4777 if (nested_vmx_store_msr(vcpu, 4778 vmcs12->vm_exit_msr_store_addr, 4779 vmcs12->vm_exit_msr_store_count)) 4780 nested_vmx_abort(vcpu, 4781 VMX_ABORT_SAVE_GUEST_MSR_FAIL); 4782 } 4783 } 4784 4785 /* 4786 * A part of what we need to when the nested L2 guest exits and we want to 4787 * run its L1 parent, is to reset L1's guest state to the host state specified 4788 * in vmcs12. 4789 * This function is to be called not only on normal nested exit, but also on 4790 * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry 4791 * Failures During or After Loading Guest State"). 4792 * This function should be called when the active VMCS is L1's (vmcs01). 4793 */ 4794 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, 4795 struct vmcs12 *vmcs12) 4796 { 4797 enum vm_entry_failure_code ignored; 4798 struct kvm_segment seg; 4799 4800 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) 4801 vcpu->arch.efer = vmcs12->host_ia32_efer; 4802 else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) 4803 vcpu->arch.efer |= (EFER_LMA | EFER_LME); 4804 else 4805 vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); 4806 vmx_set_efer(vcpu, vcpu->arch.efer); 4807 4808 kvm_rsp_write(vcpu, vmcs12->host_rsp); 4809 kvm_rip_write(vcpu, vmcs12->host_rip); 4810 vmx_set_rflags(vcpu, X86_EFLAGS_FIXED); 4811 vmx_set_interrupt_shadow(vcpu, 0); 4812 4813 /* 4814 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't 4815 * actually changed, because vmx_set_cr0 refers to efer set above. 4816 * 4817 * CR0_GUEST_HOST_MASK is already set in the original vmcs01 4818 * (KVM doesn't change it); 4819 */ 4820 vcpu->arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits(); 4821 vmx_set_cr0(vcpu, vmcs12->host_cr0); 4822 4823 /* Same as above - no reason to call set_cr4_guest_host_mask(). 
*/ 4824 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); 4825 vmx_set_cr4(vcpu, vmcs12->host_cr4); 4826 4827 nested_ept_uninit_mmu_context(vcpu); 4828 4829 /* 4830 * Only PDPTE load can fail as the value of cr3 was checked on entry and 4831 * couldn't have changed. 4832 */ 4833 if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, true, &ignored)) 4834 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL); 4835 4836 nested_vmx_transition_tlb_flush(vcpu, vmcs12, false); 4837 4838 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs); 4839 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp); 4840 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); 4841 vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); 4842 vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); 4843 vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF); 4844 vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF); 4845 4846 /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */ 4847 if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS) 4848 vmcs_write64(GUEST_BNDCFGS, 0); 4849 4850 /* 4851 * Load CET state from host state if VM_EXIT_LOAD_CET_STATE is set. 4852 * otherwise CET state should be retained across VM-exit, i.e., 4853 * guest values should be propagated from vmcs12 to vmcs01. 
4854 */ 4855 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_CET_STATE) 4856 vmcs_write_cet_state(vcpu, vmcs12->host_s_cet, vmcs12->host_ssp, 4857 vmcs12->host_ssp_tbl); 4858 else 4859 vmcs_write_cet_state(vcpu, vmcs12->guest_s_cet, vmcs12->guest_ssp, 4860 vmcs12->guest_ssp_tbl); 4861 4862 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) { 4863 vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); 4864 vcpu->arch.pat = vmcs12->host_ia32_pat; 4865 } 4866 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) && 4867 kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu))) 4868 WARN_ON_ONCE(__kvm_emulate_msr_write(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, 4869 vmcs12->host_ia32_perf_global_ctrl)); 4870 4871 /* Set L1 segment info according to Intel SDM 4872 27.5.2 Loading Host Segment and Descriptor-Table Registers */ 4873 seg = (struct kvm_segment) { 4874 .base = 0, 4875 .limit = 0xFFFFFFFF, 4876 .selector = vmcs12->host_cs_selector, 4877 .type = 11, 4878 .present = 1, 4879 .s = 1, 4880 .g = 1 4881 }; 4882 if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) 4883 seg.l = 1; 4884 else 4885 seg.db = 1; 4886 __vmx_set_segment(vcpu, &seg, VCPU_SREG_CS); 4887 seg = (struct kvm_segment) { 4888 .base = 0, 4889 .limit = 0xFFFFFFFF, 4890 .type = 3, 4891 .present = 1, 4892 .s = 1, 4893 .db = 1, 4894 .g = 1 4895 }; 4896 seg.selector = vmcs12->host_ds_selector; 4897 __vmx_set_segment(vcpu, &seg, VCPU_SREG_DS); 4898 seg.selector = vmcs12->host_es_selector; 4899 __vmx_set_segment(vcpu, &seg, VCPU_SREG_ES); 4900 seg.selector = vmcs12->host_ss_selector; 4901 __vmx_set_segment(vcpu, &seg, VCPU_SREG_SS); 4902 seg.selector = vmcs12->host_fs_selector; 4903 seg.base = vmcs12->host_fs_base; 4904 __vmx_set_segment(vcpu, &seg, VCPU_SREG_FS); 4905 seg.selector = vmcs12->host_gs_selector; 4906 seg.base = vmcs12->host_gs_base; 4907 __vmx_set_segment(vcpu, &seg, VCPU_SREG_GS); 4908 seg = (struct kvm_segment) { 4909 .base = vmcs12->host_tr_base, 4910 .limit = 0x67, 4911 .selector = 
vmcs12->host_tr_selector, 4912 .type = 11, 4913 .present = 1 4914 }; 4915 __vmx_set_segment(vcpu, &seg, VCPU_SREG_TR); 4916 4917 memset(&seg, 0, sizeof(seg)); 4918 seg.unusable = 1; 4919 __vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR); 4920 4921 kvm_set_dr(vcpu, 7, 0x400); 4922 vmx_guest_debugctl_write(vcpu, 0); 4923 4924 if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, 4925 vmcs12->vm_exit_msr_load_count)) 4926 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); 4927 4928 to_vt(vcpu)->emulation_required = vmx_emulation_required(vcpu); 4929 } 4930 4931 static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx) 4932 { 4933 struct vmx_uret_msr *efer_msr; 4934 unsigned int i; 4935 4936 if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER) 4937 return vmcs_read64(GUEST_IA32_EFER); 4938 4939 if (cpu_has_load_ia32_efer()) 4940 return kvm_host.efer; 4941 4942 for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) { 4943 if (vmx->msr_autoload.guest.val[i].index == MSR_EFER) 4944 return vmx->msr_autoload.guest.val[i].value; 4945 } 4946 4947 efer_msr = vmx_find_uret_msr(vmx, MSR_EFER); 4948 if (efer_msr) 4949 return efer_msr->data; 4950 4951 return kvm_host.efer; 4952 } 4953 4954 static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) 4955 { 4956 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 4957 struct vcpu_vmx *vmx = to_vmx(vcpu); 4958 struct vmx_msr_entry g, h; 4959 gpa_t gpa; 4960 u32 i, j; 4961 4962 vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT); 4963 4964 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) { 4965 /* 4966 * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set 4967 * as vmcs01.GUEST_DR7 contains a userspace defined value 4968 * and vcpu->arch.dr7 is not squirreled away before the 4969 * nested VMENTER (not worth adding a variable in nested_vmx). 
4970 */ 4971 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) 4972 kvm_set_dr(vcpu, 7, DR7_FIXED_1); 4973 else 4974 WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7))); 4975 } 4976 4977 /* Reload DEBUGCTL to ensure vmcs01 has a fresh FREEZE_IN_SMM value. */ 4978 vmx_reload_guest_debugctl(vcpu); 4979 4980 /* 4981 * Note that calling vmx_set_{efer,cr0,cr4} is important as they 4982 * handle a variety of side effects to KVM's software model. 4983 */ 4984 vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx)); 4985 4986 vcpu->arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits(); 4987 vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW)); 4988 4989 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); 4990 vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW)); 4991 4992 nested_ept_uninit_mmu_context(vcpu); 4993 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); 4994 kvm_register_mark_available(vcpu, VCPU_REG_CR3); 4995 4996 /* 4997 * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs 4998 * from vmcs01 (if necessary). The PDPTRs are not loaded on 4999 * VMFail, like everything else we just need to ensure our 5000 * software model is up-to-date. 5001 */ 5002 if (enable_ept && is_pae_paging(vcpu)) 5003 ept_save_pdptrs(vcpu); 5004 5005 kvm_mmu_reset_context(vcpu); 5006 5007 /* 5008 * This nasty bit of open coding is a compromise between blindly 5009 * loading L1's MSRs using the exit load lists (incorrect emulation 5010 * of VMFail), leaving the nested VM's MSRs in the software model 5011 * (incorrect behavior) and snapshotting the modified MSRs (too 5012 * expensive since the lists are unbound by hardware). For each 5013 * MSR that was (prematurely) loaded from the nested VMEntry load 5014 * list, reload it from the exit load list if it exists and differs 5015 * from the guest value. The intent is to stuff host state as 5016 * silently as possible, not to fully process the exit load list. 
5017 */ 5018 for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) { 5019 gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g)); 5020 if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) { 5021 pr_debug_ratelimited( 5022 "%s read MSR index failed (%u, 0x%08llx)\n", 5023 __func__, i, gpa); 5024 goto vmabort; 5025 } 5026 5027 for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) { 5028 gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h)); 5029 if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) { 5030 pr_debug_ratelimited( 5031 "%s read MSR failed (%u, 0x%08llx)\n", 5032 __func__, j, gpa); 5033 goto vmabort; 5034 } 5035 if (h.index != g.index) 5036 continue; 5037 if (h.value == g.value) 5038 break; 5039 5040 if (nested_vmx_load_msr_check(vcpu, &h)) { 5041 pr_debug_ratelimited( 5042 "%s check failed (%u, 0x%x, 0x%x)\n", 5043 __func__, j, h.index, h.reserved); 5044 goto vmabort; 5045 } 5046 5047 if (kvm_emulate_msr_write(vcpu, h.index, h.value)) { 5048 pr_debug_ratelimited( 5049 "%s WRMSR failed (%u, 0x%x, 0x%llx)\n", 5050 __func__, j, h.index, h.value); 5051 goto vmabort; 5052 } 5053 } 5054 } 5055 5056 return; 5057 5058 vmabort: 5059 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); 5060 } 5061 5062 /* 5063 * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1 5064 * and modify vmcs12 to make it see what it would expect to see there if 5065 * L2 was its real guest. Must only be called when in L2 (is_guest_mode()) 5066 */ 5067 void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, 5068 u32 exit_intr_info, unsigned long exit_qualification, 5069 u32 exit_insn_len) 5070 { 5071 struct vcpu_vmx *vmx = to_vmx(vcpu); 5072 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 5073 5074 /* Pending MTF traps are discarded on VM-Exit. 
*/ 5075 vmx->nested.mtf_pending = false; 5076 5077 /* trying to cancel vmlaunch/vmresume is a bug */ 5078 kvm_warn_on_nested_run_pending(vcpu); 5079 5080 #ifdef CONFIG_KVM_HYPERV 5081 if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) { 5082 /* 5083 * KVM_REQ_GET_NESTED_STATE_PAGES is also used to map 5084 * Enlightened VMCS after migration and we still need to 5085 * do that when something is forcing L2->L1 exit prior to 5086 * the first L2 run. 5087 */ 5088 (void)nested_get_evmcs_page(vcpu); 5089 } 5090 #endif 5091 5092 /* Service pending TLB flush requests for L2 before switching to L1. */ 5093 kvm_service_local_tlb_flush_requests(vcpu); 5094 5095 /* 5096 * VCPU_REG_PDPTR will be clobbered in arch/x86/kvm/vmx/vmx.h between 5097 * now and the new vmentry. Ensure that the VMCS02 PDPTR fields are 5098 * up-to-date before switching to L1. 5099 */ 5100 if (enable_ept && is_pae_paging(vcpu)) 5101 vmx_ept_load_pdptrs(vcpu); 5102 5103 leave_guest_mode(vcpu); 5104 5105 if (nested_cpu_has_preemption_timer(vmcs12)) 5106 hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); 5107 5108 if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING)) { 5109 vcpu->arch.tsc_offset = vcpu->arch.l1_tsc_offset; 5110 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING)) 5111 vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio; 5112 } 5113 5114 if (likely(!vmx->fail)) { 5115 sync_vmcs02_to_vmcs12(vcpu, vmcs12); 5116 5117 if (vm_exit_reason != -1) 5118 prepare_vmcs12(vcpu, vmcs12, vm_exit_reason, 5119 exit_intr_info, exit_qualification, 5120 exit_insn_len); 5121 5122 /* 5123 * Must happen outside of sync_vmcs02_to_vmcs12() as it will 5124 * also be used to capture vmcs12 cache as part of 5125 * capturing nVMX state for snapshot (migration). 5126 * 5127 * Otherwise, this flush will dirty guest memory at a 5128 * point it is already assumed by user-space to be 5129 * immutable. 
5130 */ 5131 nested_flush_cached_shadow_vmcs12(vcpu, vmcs12); 5132 } else { 5133 /* 5134 * The only expected VM-instruction error is "VM entry with 5135 * invalid control field(s)." Anything else indicates a 5136 * problem with L0. 5137 */ 5138 WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) != 5139 VMXERR_ENTRY_INVALID_CONTROL_FIELD); 5140 5141 /* VM-Fail at VM-Entry means KVM missed a consistency check. */ 5142 WARN_ON_ONCE(warn_on_missed_cc); 5143 } 5144 5145 /* 5146 * Drop events/exceptions that were queued for re-injection to L2 5147 * (picked up via vmx_complete_interrupts()), as well as exceptions 5148 * that were pending for L2. Note, this must NOT be hoisted above 5149 * prepare_vmcs12(), events/exceptions queued for re-injection need to 5150 * be captured in vmcs12 (see vmcs12_save_pending_event()). 5151 */ 5152 vcpu->arch.nmi_injected = false; 5153 kvm_clear_exception_queue(vcpu); 5154 kvm_clear_interrupt_queue(vcpu); 5155 5156 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 5157 5158 kvm_nested_vmexit_handle_ibrs(vcpu); 5159 5160 /* 5161 * Update any VMCS fields that might have changed while vmcs02 was the 5162 * active VMCS. The tracking is per-vCPU, not per-VMCS. 
5163 */ 5164 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.nr); 5165 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 5166 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 5167 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); 5168 if (kvm_caps.has_tsc_control) 5169 vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); 5170 5171 nested_put_vmcs12_pages(vcpu); 5172 5173 if ((vm_exit_reason != -1) && 5174 (enable_shadow_vmcs || nested_vmx_is_evmptr12_valid(vmx))) 5175 vmx->nested.need_vmcs12_to_shadow_sync = true; 5176 5177 /* in case we halted in L2 */ 5178 kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); 5179 5180 if (likely(!vmx->fail)) { 5181 if (vm_exit_reason != -1) 5182 trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, 5183 vmcs12->exit_qualification, 5184 vmcs12->idt_vectoring_info_field, 5185 vmcs12->vm_exit_intr_info, 5186 vmcs12->vm_exit_intr_error_code, 5187 KVM_ISA_VMX); 5188 5189 load_vmcs12_host_state(vcpu, vmcs12); 5190 5191 /* 5192 * Process events if an injectable IRQ or NMI is pending, even 5193 * if the event is blocked (RFLAGS.IF is cleared on VM-Exit). 5194 * If an event became pending while L2 was active, KVM needs to 5195 * either inject the event or request an IRQ/NMI window. SMIs 5196 * don't need to be processed as SMM is mutually exclusive with 5197 * non-root mode. INIT/SIPI don't need to be checked as INIT 5198 * is blocked post-VMXON, and SIPIs are ignored. 5199 */ 5200 if (kvm_cpu_has_injectable_intr(vcpu) || vcpu->arch.nmi_pending) 5201 kvm_make_request(KVM_REQ_EVENT, vcpu); 5202 return; 5203 } 5204 5205 /* 5206 * After an early L2 VM-entry failure, we're now back 5207 * in L1 which thinks it just finished a VMLAUNCH or 5208 * VMRESUME instruction, so we need to set the failure 5209 * flag and the VM-instruction error field of the VMCS 5210 * accordingly, and skip the emulated instruction. 
5211 */ 5212 (void)nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 5213 5214 /* 5215 * Restore L1's host state to KVM's software model. We're here 5216 * because a consistency check was caught by hardware, which 5217 * means some amount of guest state has been propagated to KVM's 5218 * model and needs to be unwound to the host's state. 5219 */ 5220 nested_vmx_restore_host_state(vcpu); 5221 5222 vmx->fail = 0; 5223 } 5224 5225 static void nested_vmx_triple_fault(struct kvm_vcpu *vcpu) 5226 { 5227 kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu); 5228 nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0); 5229 } 5230 5231 /* 5232 * Decode the memory-address operand of a vmx instruction, as recorded on an 5233 * exit caused by such an instruction (run by a guest hypervisor). 5234 * On success, returns 0. When the operand is invalid, returns 1 and throws 5235 * #UD, #GP, or #SS. 5236 */ 5237 int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, 5238 u32 vmx_instruction_info, bool wr, int len, gva_t *ret) 5239 { 5240 gva_t off; 5241 bool exn; 5242 struct kvm_segment s; 5243 5244 /* 5245 * According to Vol. 3B, "Information for VM Exits Due to Instruction 5246 * Execution", on an exit, vmx_instruction_info holds most of the 5247 * addressing components of the operand. Only the displacement part 5248 * is put in exit_qualification (see 3B, "Basic VM-Exit Information"). 5249 * For how an actual address is calculated from all these components, 5250 * refer to Vol. 1, "Operand Addressing". 
5251 */ 5252 int scaling = vmx_instruction_info & 3; 5253 int addr_size = (vmx_instruction_info >> 7) & 7; 5254 bool is_reg = vmx_instruction_info & (1u << 10); 5255 int seg_reg = (vmx_instruction_info >> 15) & 7; 5256 int index_reg = (vmx_instruction_info >> 18) & 0xf; 5257 bool index_is_valid = !(vmx_instruction_info & (1u << 22)); 5258 int base_reg = (vmx_instruction_info >> 23) & 0xf; 5259 bool base_is_valid = !(vmx_instruction_info & (1u << 27)); 5260 5261 if (is_reg) { 5262 kvm_queue_exception(vcpu, UD_VECTOR); 5263 return 1; 5264 } 5265 5266 /* Addr = segment_base + offset */ 5267 /* offset = base + [index * scale] + displacement */ 5268 off = exit_qualification; /* holds the displacement */ 5269 if (addr_size == 1) 5270 off = (gva_t)sign_extend64(off, 31); 5271 else if (addr_size == 0) 5272 off = (gva_t)sign_extend64(off, 15); 5273 if (base_is_valid) 5274 off += kvm_register_read(vcpu, base_reg); 5275 if (index_is_valid) 5276 off += kvm_register_read(vcpu, index_reg) << scaling; 5277 vmx_get_segment(vcpu, &s, seg_reg); 5278 5279 /* 5280 * The effective address, i.e. @off, of a memory operand is truncated 5281 * based on the address size of the instruction. Note that this is 5282 * the *effective address*, i.e. the address prior to accounting for 5283 * the segment's base. 5284 */ 5285 if (addr_size == 1) /* 32 bit */ 5286 off &= 0xffffffff; 5287 else if (addr_size == 0) /* 16 bit */ 5288 off &= 0xffff; 5289 5290 /* Checks for #GP/#SS exceptions. */ 5291 exn = false; 5292 if (is_long_mode(vcpu)) { 5293 /* 5294 * The virtual/linear address is never truncated in 64-bit 5295 * mode, e.g. a 32-bit address size can yield a 64-bit virtual 5296 * address when using FS/GS with a non-zero base. 5297 */ 5298 if (seg_reg == VCPU_SREG_FS || seg_reg == VCPU_SREG_GS) 5299 *ret = s.base + off; 5300 else 5301 *ret = off; 5302 5303 *ret = vmx_get_untagged_addr(vcpu, *ret, 0); 5304 /* Long mode: #GP(0)/#SS(0) if the memory address is in a 5305 * non-canonical form. 
This is the only check on the memory 5306 * destination for long mode! 5307 */ 5308 exn = is_noncanonical_address(*ret, vcpu, 0); 5309 } else { 5310 /* 5311 * When not in long mode, the virtual/linear address is 5312 * unconditionally truncated to 32 bits regardless of the 5313 * address size. 5314 */ 5315 *ret = (s.base + off) & 0xffffffff; 5316 5317 /* Protected mode: apply checks for segment validity in the 5318 * following order: 5319 * - segment type check (#GP(0) may be thrown) 5320 * - usability check (#GP(0)/#SS(0)) 5321 * - limit check (#GP(0)/#SS(0)) 5322 */ 5323 if (wr) 5324 /* #GP(0) if the destination operand is located in a 5325 * read-only data segment or any code segment. 5326 */ 5327 exn = ((s.type & 0xa) == 0 || (s.type & 8)); 5328 else 5329 /* #GP(0) if the source operand is located in an 5330 * execute-only code segment 5331 */ 5332 exn = ((s.type & 0xa) == 8); 5333 if (exn) { 5334 kvm_queue_exception_e(vcpu, GP_VECTOR, 0); 5335 return 1; 5336 } 5337 /* Protected mode: #GP(0)/#SS(0) if the segment is unusable. 5338 */ 5339 exn = (s.unusable != 0); 5340 5341 /* 5342 * Protected mode: #GP(0)/#SS(0) if the memory operand is 5343 * outside the segment limit. All CPUs that support VMX ignore 5344 * limit checks for flat segments, i.e. segments with base==0, 5345 * limit==0xffffffff and of type expand-up data or code. 5346 */ 5347 if (!(s.base == 0 && s.limit == 0xffffffff && 5348 ((s.type & 8) || !(s.type & 4)))) 5349 exn = exn || ((u64)off + len - 1 > s.limit); 5350 } 5351 if (exn) { 5352 kvm_queue_exception_e(vcpu, 5353 seg_reg == VCPU_SREG_SS ? 
5354 SS_VECTOR : GP_VECTOR, 5355 0); 5356 return 1; 5357 } 5358 5359 return 0; 5360 } 5361 5362 static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer, 5363 int *ret) 5364 { 5365 gva_t gva; 5366 struct x86_exception e; 5367 int r; 5368 5369 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 5370 vmcs_read32(VMX_INSTRUCTION_INFO), false, 5371 sizeof(*vmpointer), &gva)) { 5372 *ret = 1; 5373 return -EINVAL; 5374 } 5375 5376 r = kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e); 5377 if (r != X86EMUL_CONTINUE) { 5378 *ret = kvm_handle_memory_failure(vcpu, r, &e); 5379 return -EINVAL; 5380 } 5381 5382 return 0; 5383 } 5384 5385 /* 5386 * Allocate a shadow VMCS and associate it with the currently loaded 5387 * VMCS, unless such a shadow VMCS already exists. The newly allocated 5388 * VMCS is also VMCLEARed, so that it is ready for use. 5389 */ 5390 static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu) 5391 { 5392 struct vcpu_vmx *vmx = to_vmx(vcpu); 5393 struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs; 5394 5395 /* 5396 * KVM allocates a shadow VMCS only when L1 executes VMXON and frees it 5397 * when L1 executes VMXOFF or the vCPU is forced out of nested 5398 * operation. VMXON faults if the CPU is already post-VMXON, so it 5399 * should be impossible to already have an allocated shadow VMCS. KVM 5400 * doesn't support virtualization of VMCS shadowing, so vmcs01 should 5401 * always be the loaded VMCS. 
5402 */ 5403 if (WARN_ON(loaded_vmcs != &vmx->vmcs01 || loaded_vmcs->shadow_vmcs)) 5404 return loaded_vmcs->shadow_vmcs; 5405 5406 loaded_vmcs->shadow_vmcs = alloc_vmcs(true); 5407 if (loaded_vmcs->shadow_vmcs) 5408 vmcs_clear(loaded_vmcs->shadow_vmcs); 5409 5410 return loaded_vmcs->shadow_vmcs; 5411 } 5412 5413 static int enter_vmx_operation(struct kvm_vcpu *vcpu) 5414 { 5415 struct vcpu_vmx *vmx = to_vmx(vcpu); 5416 int r; 5417 5418 r = alloc_loaded_vmcs(&vmx->nested.vmcs02); 5419 if (r < 0) 5420 goto out_vmcs02; 5421 5422 vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); 5423 if (!vmx->nested.cached_vmcs12) 5424 goto out_cached_vmcs12; 5425 5426 vmx->nested.shadow_vmcs12_cache.gpa = INVALID_GPA; 5427 vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); 5428 if (!vmx->nested.cached_shadow_vmcs12) 5429 goto out_cached_shadow_vmcs12; 5430 5431 if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu)) 5432 goto out_shadow_vmcs; 5433 5434 hrtimer_setup(&vmx->nested.preemption_timer, vmx_preemption_timer_fn, CLOCK_MONOTONIC, 5435 HRTIMER_MODE_ABS_PINNED); 5436 5437 vmx->nested.vpid02 = allocate_vpid(); 5438 5439 vmx->nested.vmcs02_initialized = false; 5440 vmx->nested.vmxon = true; 5441 5442 if (vmx_pt_mode_is_host_guest()) { 5443 vmx->pt_desc.guest.ctl = 0; 5444 pt_update_intercept_for_msr(vcpu); 5445 } 5446 5447 return 0; 5448 5449 out_shadow_vmcs: 5450 kfree(vmx->nested.cached_shadow_vmcs12); 5451 5452 out_cached_shadow_vmcs12: 5453 kfree(vmx->nested.cached_vmcs12); 5454 5455 out_cached_vmcs12: 5456 free_loaded_vmcs(&vmx->nested.vmcs02); 5457 5458 out_vmcs02: 5459 return -ENOMEM; 5460 } 5461 5462 /* Emulate the VMXON instruction. 
*/ 5463 static int handle_vmxon(struct kvm_vcpu *vcpu) 5464 { 5465 int ret; 5466 gpa_t vmptr; 5467 uint32_t revision; 5468 struct vcpu_vmx *vmx = to_vmx(vcpu); 5469 const u64 VMXON_NEEDED_FEATURES = FEAT_CTL_LOCKED 5470 | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; 5471 5472 /* 5473 * Manually check CR4.VMXE checks, KVM must force CR4.VMXE=1 to enter 5474 * the guest and so cannot rely on hardware to perform the check, 5475 * which has higher priority than VM-Exit (see Intel SDM's pseudocode 5476 * for VMXON). 5477 * 5478 * Rely on hardware for the other pre-VM-Exit checks, CR0.PE=1, !VM86 5479 * and !COMPATIBILITY modes. For an unrestricted guest, KVM doesn't 5480 * force any of the relevant guest state. For a restricted guest, KVM 5481 * does force CR0.PE=1, but only to also force VM86 in order to emulate 5482 * Real Mode, and so there's no need to check CR0.PE manually. 5483 */ 5484 if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_VMXE)) { 5485 kvm_queue_exception(vcpu, UD_VECTOR); 5486 return 1; 5487 } 5488 5489 /* 5490 * The CPL is checked for "not in VMX operation" and for "in VMX root", 5491 * and has higher priority than the VM-Fail due to being post-VMXON, 5492 * i.e. VMXON #GPs outside of VMX non-root if CPL!=0. In VMX non-root, 5493 * VMXON causes VM-Exit and KVM unconditionally forwards VMXON VM-Exits 5494 * from L2 to L1, i.e. there's no need to check for the vCPU being in 5495 * VMX non-root. 5496 * 5497 * Forwarding the VM-Exit unconditionally, i.e. without performing the 5498 * #UD checks (see above), is functionally ok because KVM doesn't allow 5499 * L1 to run L2 without CR4.VMXE=0, and because KVM never modifies L2's 5500 * CR0 or CR4, i.e. it's L2's responsibility to emulate #UDs that are 5501 * missed by hardware due to shadowing CR0 and/or CR4. 
5502 */ 5503 if (vmx_get_cpl(vcpu)) { 5504 kvm_inject_gp(vcpu, 0); 5505 return 1; 5506 } 5507 5508 if (vmx->nested.vmxon) 5509 return nested_vmx_fail(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); 5510 5511 /* 5512 * Invalid CR0/CR4 generates #GP. These checks are performed if and 5513 * only if the vCPU isn't already in VMX operation, i.e. effectively 5514 * have lower priority than the VM-Fail above. 5515 */ 5516 if (!nested_host_cr0_valid(vcpu, kvm_read_cr0(vcpu)) || 5517 !nested_host_cr4_valid(vcpu, kvm_read_cr4(vcpu))) { 5518 kvm_inject_gp(vcpu, 0); 5519 return 1; 5520 } 5521 5522 if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES) 5523 != VMXON_NEEDED_FEATURES) { 5524 kvm_inject_gp(vcpu, 0); 5525 return 1; 5526 } 5527 5528 if (nested_vmx_get_vmptr(vcpu, &vmptr, &ret)) 5529 return ret; 5530 5531 /* 5532 * SDM 3: 24.11.5 5533 * The first 4 bytes of VMXON region contain the supported 5534 * VMCS revision identifier 5535 * 5536 * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case; 5537 * which replaces physical address width with 32 5538 */ 5539 if (!page_address_valid(vcpu, vmptr)) 5540 return nested_vmx_failInvalid(vcpu); 5541 5542 if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) || 5543 revision != VMCS12_REVISION) 5544 return nested_vmx_failInvalid(vcpu); 5545 5546 vmx->nested.vmxon_ptr = vmptr; 5547 ret = enter_vmx_operation(vcpu); 5548 if (ret) 5549 return ret; 5550 5551 return nested_vmx_succeed(vcpu); 5552 } 5553 5554 static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu) 5555 { 5556 struct vcpu_vmx *vmx = to_vmx(vcpu); 5557 5558 if (vmx->nested.current_vmptr == INVALID_GPA) 5559 return; 5560 5561 copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu)); 5562 5563 if (enable_shadow_vmcs) { 5564 /* copy to memory all shadowed fields in case 5565 they were modified */ 5566 copy_shadow_to_vmcs12(vmx); 5567 vmx_disable_shadow_vmcs(vmx); 5568 } 5569 vmx->nested.posted_intr_nv = -1; 5570 5571 /* Flush VMCS12 to guest memory */ 
kvm_vcpu_write_guest_page(vcpu,
				  vmx->nested.current_vmptr >> PAGE_SHIFT,
				  vmx->nested.cached_vmcs12, 0, VMCS12_SIZE);

	kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);

	vmx->nested.current_vmptr = INVALID_GPA;
}

/*
 * Emulate the VMXOFF instruction.
 *
 * After the permission check, nested state is torn down via free_nested().
 * If kvm_apic_has_pending_init_or_sipi() reports a pending INIT/SIPI, a
 * KVM_REQ_EVENT request is raised so the event gets re-evaluated.
 *
 * Returns 1 if the permission check fails, else the result of
 * nested_vmx_succeed().
 */
static int handle_vmxoff(struct kvm_vcpu *vcpu)
{
	if (!nested_vmx_check_permission(vcpu))
		return 1;

	free_nested(vcpu);

	if (kvm_apic_has_pending_init_or_sipi(vcpu))
		kvm_make_request(KVM_REQ_EVENT, vcpu);

	return nested_vmx_succeed(vcpu);
}

/*
 * Emulate the VMCLEAR instruction.
 *
 * The operand is rejected with a VM-Fail if it is not a valid page address
 * or if it equals the VMXON pointer.  Unless the enlightened-VMCS helper
 * handles the operand, the cached vmcs12 is released when the operand is the
 * current VMCS pointer, and launch_state is zeroed in guest memory.
 */
static int handle_vmclear(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u32 zero = 0;
	gpa_t vmptr;
	int r;

	if (!nested_vmx_check_permission(vcpu))
		return 1;

	/* On failure the helper supplies the value to return via @r. */
	if (nested_vmx_get_vmptr(vcpu, &vmptr, &r))
		return r;

	if (!page_address_valid(vcpu, vmptr))
		return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS);

	if (vmptr == vmx->nested.vmxon_ptr)
		return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_VMXON_POINTER);

	if (likely(!nested_evmcs_handle_vmclear(vcpu, vmptr))) {
		if (vmptr == vmx->nested.current_vmptr)
			nested_release_vmcs12(vcpu);

		/*
		 * Silently ignore memory errors on VMCLEAR, Intel's pseudocode
		 * for VMCLEAR includes a "ensure that data for VMCS referenced
		 * by the operand is in memory" clause that guards writes to
		 * memory, i.e. doing nothing for I/O is architecturally valid.
		 *
		 * FIXME: Suppress failures if and only if no memslot is found,
		 * i.e. exit to userspace if __copy_to_user() fails.
*/
		(void)kvm_vcpu_write_guest(vcpu,
					   vmptr + offsetof(struct vmcs12,
							    launch_state),
					   &zero, sizeof(zero));
	}

	return nested_vmx_succeed(vcpu);
}

/* Emulate the VMLAUNCH instruction: nested_vmx_run() with launch == true. */
static int handle_vmlaunch(struct kvm_vcpu *vcpu)
{
	return nested_vmx_run(vcpu, true);
}

/* Emulate the VMRESUME instruction: nested_vmx_run() with launch == false. */
static int handle_vmresume(struct kvm_vcpu *vcpu)
{

	return nested_vmx_run(vcpu, false);
}

/*
 * Emulate the VMREAD instruction.
 *
 * The source is the shadow vmcs12 when the vCPU is in guest mode, otherwise
 * the regular vmcs12.
 */
static int handle_vmread(struct kvm_vcpu *vcpu)
{
	struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu)
						    : get_vmcs12(vcpu);
	unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
	u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct x86_exception e;
	unsigned long field;
	u64 value;
	gva_t gva = 0;
	short offset;
	int len, r;

	if (!nested_vmx_check_permission(vcpu))
		return 1;

	/* Decode instruction info and find the field to read */
	field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf));

	if (!nested_vmx_is_evmptr12_valid(vmx)) {
		/*
		 * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA,
		 * any VMREAD sets the ALU flags for VMfailInvalid.
5674 */ 5675 if (vmx->nested.current_vmptr == INVALID_GPA || 5676 (is_guest_mode(vcpu) && 5677 get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA)) 5678 return nested_vmx_failInvalid(vcpu); 5679 5680 offset = get_vmcs12_field_offset(field); 5681 if (offset < 0) 5682 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 5683 5684 if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field)) 5685 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 5686 5687 /* Read the field, zero-extended to a u64 value */ 5688 value = vmcs12_read_any(vmcs12, field, offset); 5689 } else { 5690 /* 5691 * Hyper-V TLFS (as of 6.0b) explicitly states, that while an 5692 * enlightened VMCS is active VMREAD/VMWRITE instructions are 5693 * unsupported. Unfortunately, certain versions of Windows 11 5694 * don't comply with this requirement which is not enforced in 5695 * genuine Hyper-V. Allow VMREAD from an enlightened VMCS as a 5696 * workaround, as misbehaving guests will panic on VM-Fail. 5697 * Note, enlightened VMCS is incompatible with shadow VMCS so 5698 * all VMREADs from L2 should go to L1. 5699 */ 5700 if (WARN_ON_ONCE(is_guest_mode(vcpu))) 5701 return nested_vmx_failInvalid(vcpu); 5702 5703 offset = evmcs_field_offset(field, NULL); 5704 if (offset < 0) 5705 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 5706 5707 /* Read the field, zero-extended to a u64 value */ 5708 value = evmcs_read_any(nested_vmx_evmcs(vmx), field, offset); 5709 } 5710 5711 /* 5712 * Now copy part of this value to register or memory, as requested. 5713 * Note that the number of bits actually copied is 32 or 64 depending 5714 * on the guest's mode (32 or 64 bit), not on the given field's length. 5715 */ 5716 if (instr_info & BIT(10)) { 5717 kvm_register_write(vcpu, (((instr_info) >> 3) & 0xf), value); 5718 } else { 5719 len = is_64_bit_mode(vcpu) ? 
8 : 4; 5720 if (get_vmx_mem_address(vcpu, exit_qualification, 5721 instr_info, true, len, &gva)) 5722 return 1; 5723 /* _system ok, nested_vmx_check_permission has verified cpl=0 */ 5724 r = kvm_write_guest_virt_system(vcpu, gva, &value, len, &e); 5725 if (r != X86EMUL_CONTINUE) 5726 return kvm_handle_memory_failure(vcpu, r, &e); 5727 } 5728 5729 return nested_vmx_succeed(vcpu); 5730 } 5731 5732 static bool is_shadow_field_rw(unsigned long field) 5733 { 5734 switch (field) { 5735 #define SHADOW_FIELD_RW(x, y) case x: 5736 #include "vmcs_shadow_fields.h" 5737 return true; 5738 default: 5739 break; 5740 } 5741 return false; 5742 } 5743 5744 static bool is_shadow_field_ro(unsigned long field) 5745 { 5746 switch (field) { 5747 #define SHADOW_FIELD_RO(x, y) case x: 5748 #include "vmcs_shadow_fields.h" 5749 return true; 5750 default: 5751 break; 5752 } 5753 return false; 5754 } 5755 5756 static int handle_vmwrite(struct kvm_vcpu *vcpu) 5757 { 5758 struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) 5759 : get_vmcs12(vcpu); 5760 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5761 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5762 struct vcpu_vmx *vmx = to_vmx(vcpu); 5763 struct x86_exception e; 5764 unsigned long field; 5765 short offset; 5766 gva_t gva; 5767 int len, r; 5768 5769 /* 5770 * The value to write might be 32 or 64 bits, depending on L1's long 5771 * mode, and eventually we need to write that into a field of several 5772 * possible lengths. The code below first zero-extends the value to 64 5773 * bit (value), and then copies only the appropriate number of 5774 * bits into the vmcs12 field. 5775 */ 5776 u64 value = 0; 5777 5778 if (!nested_vmx_check_permission(vcpu)) 5779 return 1; 5780 5781 /* 5782 * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA, 5783 * any VMWRITE sets the ALU flags for VMfailInvalid. 
5784 */ 5785 if (vmx->nested.current_vmptr == INVALID_GPA || 5786 (is_guest_mode(vcpu) && 5787 get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA)) 5788 return nested_vmx_failInvalid(vcpu); 5789 5790 if (instr_info & BIT(10)) 5791 value = kvm_register_read(vcpu, (((instr_info) >> 3) & 0xf)); 5792 else { 5793 len = is_64_bit_mode(vcpu) ? 8 : 4; 5794 if (get_vmx_mem_address(vcpu, exit_qualification, 5795 instr_info, false, len, &gva)) 5796 return 1; 5797 r = kvm_read_guest_virt(vcpu, gva, &value, len, &e); 5798 if (r != X86EMUL_CONTINUE) 5799 return kvm_handle_memory_failure(vcpu, r, &e); 5800 } 5801 5802 field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf)); 5803 5804 offset = get_vmcs12_field_offset(field); 5805 if (offset < 0) 5806 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 5807 5808 /* 5809 * If the vCPU supports "VMWRITE to any supported field in the 5810 * VMCS," then the "read-only" fields are actually read/write. 5811 */ 5812 if (vmcs_field_readonly(field) && 5813 !nested_cpu_has_vmwrite_any_field(vcpu)) 5814 return nested_vmx_fail(vcpu, VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); 5815 5816 /* 5817 * Ensure vmcs12 is up-to-date before any VMWRITE that dirties 5818 * vmcs12, else we may crush a field or consume a stale value. 5819 */ 5820 if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) 5821 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 5822 5823 /* 5824 * Some Intel CPUs intentionally drop the reserved bits of the AR byte 5825 * fields on VMWRITE. Emulate this behavior to ensure consistent KVM 5826 * behavior regardless of the underlying hardware, e.g. if an AR_BYTE 5827 * field is intercepted for VMWRITE but not VMREAD (in L1), then VMREAD 5828 * from L1 will return a different value than VMREAD from L2 (L1 sees 5829 * the stripped down value, L2 sees the full value as stored by KVM). 
*/
	if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES)
		value &= 0x1f0ff;

	vmcs12_write_any(vmcs12, field, offset, value);

	/*
	 * Do not track vmcs12 dirty-state if in guest-mode as we actually
	 * dirty shadow vmcs12 instead of vmcs12.  Fields that can be updated
	 * by L1 without a vmexit are always updated in the vmcs02, i.e. don't
	 * "dirty" vmcs12, all others go down the prepare_vmcs02() slow path.
	 */
	if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) {
		/*
		 * L1 can read these fields without exiting, ensure the
		 * shadow VMCS is up-to-date.
		 */
		if (enable_shadow_vmcs && is_shadow_field_ro(field)) {
			/*
			 * Temporarily make the shadow VMCS current to write
			 * the field, then switch back to the loaded VMCS;
			 * preemption stays disabled across the swap.
			 */
			preempt_disable();
			vmcs_load(vmx->vmcs01.shadow_vmcs);

			__vmcs_writel(field, value);

			vmcs_clear(vmx->vmcs01.shadow_vmcs);
			vmcs_load(vmx->loaded_vmcs->vmcs);
			preempt_enable();
		}
		vmx->nested.dirty_vmcs12 = true;
	}

	return nested_vmx_succeed(vcpu);
}

/*
 * Record @vmptr as the current VMCS12 pointer.
 *
 * When shadow VMCS is enabled, set the SHADOW_VMCS secondary execution
 * control, point the VMCS link pointer at vmcs01's shadow VMCS, and flag
 * that a vmcs12-to-shadow sync is needed.  In all cases vmcs12 is marked
 * dirty and an MSR bitmap recalculation is forced.
 */
static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr)
{
	vmx->nested.current_vmptr = vmptr;
	if (enable_shadow_vmcs) {
		secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
		vmcs_write64(VMCS_LINK_POINTER,
			     __pa(vmx->vmcs01.shadow_vmcs));
		vmx->nested.need_vmcs12_to_shadow_sync = true;
	}
	vmx->nested.dirty_vmcs12 = true;
	vmx->nested.force_msr_bitmap_recalc = true;
}

/*
 * Emulate the VMPTRLD instruction.
 *
 * The operand is rejected with a VM-Fail if it is not a valid page address
 * or if it equals the VMXON pointer.
 */
static int handle_vmptrld(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	gpa_t vmptr;
	int r;

	if (!nested_vmx_check_permission(vcpu))
		return 1;

	/* On failure the helper supplies the value to return via @r. */
	if (nested_vmx_get_vmptr(vcpu, &vmptr, &r))
		return r;

	if (!page_address_valid(vcpu, vmptr))
		return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS);

	if (vmptr == vmx->nested.vmxon_ptr)
		return nested_vmx_fail(vcpu,
VMXERR_VMPTRLD_VMXON_POINTER); 5894 5895 /* Forbid normal VMPTRLD if Enlightened version was used */ 5896 if (nested_vmx_is_evmptr12_valid(vmx)) 5897 return 1; 5898 5899 if (vmx->nested.current_vmptr != vmptr) { 5900 struct gfn_to_hva_cache *ghc = &vmx->nested.vmcs12_cache; 5901 struct vmcs_hdr hdr; 5902 5903 if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, vmptr, VMCS12_SIZE)) { 5904 /* 5905 * Reads from an unbacked page return all 1s, 5906 * which means that the 32 bits located at the 5907 * given physical address won't match the required 5908 * VMCS12_REVISION identifier. 5909 */ 5910 return nested_vmx_fail(vcpu, 5911 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5912 } 5913 5914 if (kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr, 5915 offsetof(struct vmcs12, hdr), 5916 sizeof(hdr))) { 5917 return nested_vmx_fail(vcpu, 5918 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5919 } 5920 5921 if (hdr.revision_id != VMCS12_REVISION || 5922 (hdr.shadow_vmcs && 5923 !nested_cpu_has_vmx_shadow_vmcs(vcpu))) { 5924 return nested_vmx_fail(vcpu, 5925 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5926 } 5927 5928 nested_release_vmcs12(vcpu); 5929 5930 /* 5931 * Load VMCS12 from guest memory since it is not already 5932 * cached. 
5933 */ 5934 if (kvm_read_guest_cached(vcpu->kvm, ghc, vmx->nested.cached_vmcs12, 5935 VMCS12_SIZE)) { 5936 return nested_vmx_fail(vcpu, 5937 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5938 } 5939 5940 set_current_vmptr(vmx, vmptr); 5941 } 5942 5943 return nested_vmx_succeed(vcpu); 5944 } 5945 5946 /* Emulate the VMPTRST instruction */ 5947 static int handle_vmptrst(struct kvm_vcpu *vcpu) 5948 { 5949 unsigned long exit_qual = vmx_get_exit_qual(vcpu); 5950 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5951 gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr; 5952 struct x86_exception e; 5953 gva_t gva; 5954 int r; 5955 5956 if (!nested_vmx_check_permission(vcpu)) 5957 return 1; 5958 5959 if (unlikely(nested_vmx_is_evmptr12_valid(to_vmx(vcpu)))) 5960 return 1; 5961 5962 if (get_vmx_mem_address(vcpu, exit_qual, instr_info, 5963 true, sizeof(gpa_t), &gva)) 5964 return 1; 5965 /* *_system ok, nested_vmx_check_permission has verified cpl=0 */ 5966 r = kvm_write_guest_virt_system(vcpu, gva, (void *)¤t_vmptr, 5967 sizeof(gpa_t), &e); 5968 if (r != X86EMUL_CONTINUE) 5969 return kvm_handle_memory_failure(vcpu, r, &e); 5970 5971 return nested_vmx_succeed(vcpu); 5972 } 5973 5974 /* Emulate the INVEPT instruction */ 5975 static int handle_invept(struct kvm_vcpu *vcpu) 5976 { 5977 struct vcpu_vmx *vmx = to_vmx(vcpu); 5978 u32 vmx_instruction_info, types; 5979 unsigned long type, roots_to_free; 5980 struct kvm_mmu *mmu; 5981 gva_t gva; 5982 struct x86_exception e; 5983 struct { 5984 u64 eptp, gpa; 5985 } operand; 5986 int i, r, gpr_index; 5987 5988 if (!(vmx->nested.msrs.secondary_ctls_high & 5989 SECONDARY_EXEC_ENABLE_EPT) || 5990 !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) { 5991 kvm_queue_exception(vcpu, UD_VECTOR); 5992 return 1; 5993 } 5994 5995 if (!nested_vmx_check_permission(vcpu)) 5996 return 1; 5997 5998 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5999 gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); 6000 type = 
kvm_register_read(vcpu, gpr_index); 6001 6002 types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; 6003 6004 if (type >= 32 || !(types & (1 << type))) 6005 return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 6006 6007 /* According to the Intel VMX instruction reference, the memory 6008 * operand is read even if it isn't needed (e.g., for type==global) 6009 */ 6010 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 6011 vmx_instruction_info, false, sizeof(operand), &gva)) 6012 return 1; 6013 r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); 6014 if (r != X86EMUL_CONTINUE) 6015 return kvm_handle_memory_failure(vcpu, r, &e); 6016 6017 /* 6018 * Nested EPT roots are always held through guest_mmu, 6019 * not root_mmu. 6020 */ 6021 mmu = &vcpu->arch.guest_mmu; 6022 6023 switch (type) { 6024 case VMX_EPT_EXTENT_CONTEXT: 6025 if (!nested_vmx_check_eptp(vcpu, operand.eptp)) 6026 return nested_vmx_fail(vcpu, 6027 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 6028 6029 roots_to_free = 0; 6030 if (nested_ept_root_matches(mmu->root.hpa, mmu->root.pgd, 6031 operand.eptp)) 6032 roots_to_free |= KVM_MMU_ROOT_CURRENT; 6033 6034 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { 6035 if (nested_ept_root_matches(mmu->prev_roots[i].hpa, 6036 mmu->prev_roots[i].pgd, 6037 operand.eptp)) 6038 roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); 6039 } 6040 break; 6041 case VMX_EPT_EXTENT_GLOBAL: 6042 roots_to_free = KVM_MMU_ROOTS_ALL; 6043 break; 6044 default: 6045 BUG(); 6046 break; 6047 } 6048 6049 if (roots_to_free) 6050 kvm_mmu_free_roots(vcpu->kvm, mmu, roots_to_free); 6051 6052 return nested_vmx_succeed(vcpu); 6053 } 6054 6055 static int handle_invvpid(struct kvm_vcpu *vcpu) 6056 { 6057 struct vcpu_vmx *vmx = to_vmx(vcpu); 6058 u32 vmx_instruction_info; 6059 unsigned long type, types; 6060 gva_t gva; 6061 struct x86_exception e; 6062 struct { 6063 u64 vpid; 6064 u64 gla; 6065 } operand; 6066 u16 vpid02; 6067 int r, gpr_index; 6068 6069 if 
(!(vmx->nested.msrs.secondary_ctls_high & 6070 SECONDARY_EXEC_ENABLE_VPID) || 6071 !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) { 6072 kvm_queue_exception(vcpu, UD_VECTOR); 6073 return 1; 6074 } 6075 6076 if (!nested_vmx_check_permission(vcpu)) 6077 return 1; 6078 6079 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 6080 gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); 6081 type = kvm_register_read(vcpu, gpr_index); 6082 6083 types = (vmx->nested.msrs.vpid_caps & 6084 VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8; 6085 6086 if (type >= 32 || !(types & (1 << type))) 6087 return nested_vmx_fail(vcpu, 6088 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 6089 6090 /* according to the intel vmx instruction reference, the memory 6091 * operand is read even if it isn't needed (e.g., for type==global) 6092 */ 6093 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 6094 vmx_instruction_info, false, sizeof(operand), &gva)) 6095 return 1; 6096 r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); 6097 if (r != X86EMUL_CONTINUE) 6098 return kvm_handle_memory_failure(vcpu, r, &e); 6099 6100 if (operand.vpid >> 16) 6101 return nested_vmx_fail(vcpu, 6102 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 6103 6104 /* 6105 * Always flush the effective vpid02, i.e. never flush the current VPID 6106 * and never explicitly flush vpid01. INVVPID targets a VPID, not a 6107 * VMCS, and so whether or not the current vmcs12 has VPID enabled is 6108 * irrelevant (and there may not be a loaded vmcs12). 6109 */ 6110 vpid02 = nested_get_vpid02(vcpu); 6111 switch (type) { 6112 case VMX_VPID_EXTENT_INDIVIDUAL_ADDR: 6113 /* 6114 * LAM doesn't apply to addresses that are inputs to TLB 6115 * invalidation. 
6116 */ 6117 if (!operand.vpid || 6118 is_noncanonical_invlpg_address(operand.gla, vcpu)) 6119 return nested_vmx_fail(vcpu, 6120 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 6121 vpid_sync_vcpu_addr(vpid02, operand.gla); 6122 break; 6123 case VMX_VPID_EXTENT_SINGLE_CONTEXT: 6124 case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL: 6125 if (!operand.vpid) 6126 return nested_vmx_fail(vcpu, 6127 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 6128 vpid_sync_context(vpid02); 6129 break; 6130 case VMX_VPID_EXTENT_ALL_CONTEXT: 6131 vpid_sync_context(vpid02); 6132 break; 6133 default: 6134 WARN_ON_ONCE(1); 6135 return kvm_skip_emulated_instruction(vcpu); 6136 } 6137 6138 /* 6139 * Sync the shadow page tables if EPT is disabled, L1 is invalidating 6140 * linear mappings for L2 (tagged with L2's VPID). Free all guest 6141 * roots as VPIDs are not tracked in the MMU role. 6142 * 6143 * Note, this operates on root_mmu, not guest_mmu, as L1 and L2 share 6144 * an MMU when EPT is disabled. 6145 * 6146 * TODO: sync only the affected SPTEs for INVDIVIDUAL_ADDR. 
6147 */ 6148 if (!enable_ept) 6149 kvm_mmu_free_guest_mode_roots(vcpu->kvm, &vcpu->arch.root_mmu); 6150 6151 return nested_vmx_succeed(vcpu); 6152 } 6153 6154 static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu, 6155 struct vmcs12 *vmcs12) 6156 { 6157 u32 index = kvm_ecx_read(vcpu); 6158 u64 new_eptp; 6159 6160 if (WARN_ON_ONCE(!nested_cpu_has_ept(vmcs12))) 6161 return 1; 6162 if (index >= VMFUNC_EPTP_ENTRIES) 6163 return 1; 6164 6165 if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT, 6166 &new_eptp, index * 8, 8)) 6167 return 1; 6168 6169 /* 6170 * If the (L2) guest does a vmfunc to the currently 6171 * active ept pointer, we don't have to do anything else 6172 */ 6173 if (vmcs12->ept_pointer != new_eptp) { 6174 if (!nested_vmx_check_eptp(vcpu, new_eptp)) 6175 return 1; 6176 6177 vmcs12->ept_pointer = new_eptp; 6178 nested_ept_new_eptp(vcpu); 6179 6180 if (!nested_cpu_has_vpid(vmcs12)) 6181 kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); 6182 } 6183 6184 return 0; 6185 } 6186 6187 static int handle_vmfunc(struct kvm_vcpu *vcpu) 6188 { 6189 struct vcpu_vmx *vmx = to_vmx(vcpu); 6190 struct vmcs12 *vmcs12; 6191 u32 function = kvm_eax_read(vcpu); 6192 6193 /* 6194 * VMFUNC should never execute cleanly while L1 is active; KVM supports 6195 * VMFUNC for nested VMs, but not for L1. 6196 */ 6197 if (WARN_ON_ONCE(!is_guest_mode(vcpu))) { 6198 kvm_queue_exception(vcpu, UD_VECTOR); 6199 return 1; 6200 } 6201 6202 vmcs12 = get_vmcs12(vcpu); 6203 6204 /* 6205 * #UD on out-of-bounds function has priority over VM-Exit, and VMFUNC 6206 * is enabled in vmcs02 if and only if it's enabled in vmcs12. 
6207 */ 6208 if (WARN_ON_ONCE((function > 63) || !nested_cpu_has_vmfunc(vmcs12))) { 6209 kvm_queue_exception(vcpu, UD_VECTOR); 6210 return 1; 6211 } 6212 6213 if (!(vmcs12->vm_function_control & BIT_ULL(function))) 6214 goto fail; 6215 6216 switch (function) { 6217 case 0: 6218 if (nested_vmx_eptp_switching(vcpu, vmcs12)) 6219 goto fail; 6220 break; 6221 default: 6222 goto fail; 6223 } 6224 return kvm_skip_emulated_instruction(vcpu); 6225 6226 fail: 6227 /* 6228 * This is effectively a reflected VM-Exit, as opposed to a synthesized 6229 * nested VM-Exit. Pass the original exit reason, i.e. don't hardcode 6230 * EXIT_REASON_VMFUNC as the exit reason. 6231 */ 6232 nested_vmx_vmexit(vcpu, vmx->vt.exit_reason.full, 6233 vmx_get_intr_info(vcpu), 6234 vmx_get_exit_qual(vcpu)); 6235 return 1; 6236 } 6237 6238 /* 6239 * Return true if an IO instruction with the specified port and size should cause 6240 * a VM-exit into L1. 6241 */ 6242 bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port, 6243 int size) 6244 { 6245 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 6246 gpa_t bitmap, last_bitmap; 6247 u8 b; 6248 6249 last_bitmap = INVALID_GPA; 6250 b = -1; 6251 6252 while (size > 0) { 6253 if (port < 0x8000) 6254 bitmap = vmcs12->io_bitmap_a; 6255 else if (port < 0x10000) 6256 bitmap = vmcs12->io_bitmap_b; 6257 else 6258 return true; 6259 bitmap += (port & 0x7fff) / 8; 6260 6261 if (last_bitmap != bitmap) 6262 if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1)) 6263 return true; 6264 if (b & (1 << (port & 7))) 6265 return true; 6266 6267 port++; 6268 size--; 6269 last_bitmap = bitmap; 6270 } 6271 6272 return false; 6273 } 6274 6275 static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, 6276 struct vmcs12 *vmcs12) 6277 { 6278 unsigned long exit_qualification; 6279 unsigned short port; 6280 int size; 6281 6282 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) 6283 return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); 6284 6285 exit_qualification = 
vmx_get_exit_qual(vcpu); 6286 6287 port = exit_qualification >> 16; 6288 size = (exit_qualification & 7) + 1; 6289 6290 return nested_vmx_check_io_bitmaps(vcpu, port, size); 6291 } 6292 6293 /* 6294 * Return 1 if we should exit from L2 to L1 to handle an MSR access, 6295 * rather than handle it ourselves in L0. I.e., check whether L1 expressed 6296 * disinterest in the current event (read or write a specific MSR) by using an 6297 * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps. 6298 */ 6299 static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, 6300 struct vmcs12 *vmcs12, 6301 union vmx_exit_reason exit_reason) 6302 { 6303 u32 msr_index; 6304 gpa_t bitmap; 6305 6306 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) 6307 return true; 6308 6309 if (exit_reason.basic == EXIT_REASON_MSR_READ_IMM || 6310 exit_reason.basic == EXIT_REASON_MSR_WRITE_IMM) 6311 msr_index = vmx_get_exit_qual(vcpu); 6312 else 6313 msr_index = kvm_ecx_read(vcpu); 6314 6315 /* 6316 * The MSR_BITMAP page is divided into four 1024-byte bitmaps, 6317 * for the four combinations of read/write and low/high MSR numbers. 6318 * First we need to figure out which of the four to use: 6319 */ 6320 bitmap = vmcs12->msr_bitmap; 6321 if (exit_reason.basic == EXIT_REASON_MSR_WRITE || 6322 exit_reason.basic == EXIT_REASON_MSR_WRITE_IMM) 6323 bitmap += 2048; 6324 if (msr_index >= 0xc0000000) { 6325 msr_index -= 0xc0000000; 6326 bitmap += 1024; 6327 } 6328 6329 /* Then read the msr_index'th bit from this bitmap: */ 6330 if (msr_index < 1024*8) { 6331 unsigned char b; 6332 if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1)) 6333 return true; 6334 return 1 & (b >> (msr_index & 7)); 6335 } else 6336 return true; /* let L1 handle the wrong parameter */ 6337 } 6338 6339 /* 6340 * Return 1 if we should exit from L2 to L1 to handle a CR access exit, 6341 * rather than handle it ourselves in L0. I.e., check if L1 wanted to 6342 * intercept (via guest_host_mask etc.) 
the current event. 6343 */ 6344 static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, 6345 struct vmcs12 *vmcs12) 6346 { 6347 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 6348 int cr = exit_qualification & 15; 6349 int reg; 6350 unsigned long val; 6351 6352 switch ((exit_qualification >> 4) & 3) { 6353 case 0: /* mov to cr */ 6354 reg = (exit_qualification >> 8) & 15; 6355 val = kvm_register_read(vcpu, reg); 6356 switch (cr) { 6357 case 0: 6358 if (vmcs12->cr0_guest_host_mask & 6359 (val ^ vmcs12->cr0_read_shadow)) 6360 return true; 6361 break; 6362 case 3: 6363 if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING)) 6364 return true; 6365 break; 6366 case 4: 6367 if (vmcs12->cr4_guest_host_mask & 6368 (vmcs12->cr4_read_shadow ^ val)) 6369 return true; 6370 break; 6371 case 8: 6372 if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING)) 6373 return true; 6374 break; 6375 } 6376 break; 6377 case 2: /* clts */ 6378 if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) && 6379 (vmcs12->cr0_read_shadow & X86_CR0_TS)) 6380 return true; 6381 break; 6382 case 1: /* mov from cr */ 6383 switch (cr) { 6384 case 3: 6385 if (vmcs12->cpu_based_vm_exec_control & 6386 CPU_BASED_CR3_STORE_EXITING) 6387 return true; 6388 break; 6389 case 8: 6390 if (vmcs12->cpu_based_vm_exec_control & 6391 CPU_BASED_CR8_STORE_EXITING) 6392 return true; 6393 break; 6394 } 6395 break; 6396 case 3: /* lmsw */ 6397 /* 6398 * lmsw can change bits 1..3 of cr0, and only set bit 0 of 6399 * cr0. Other attempted changes are ignored, with no exit. 
6400 */ 6401 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; 6402 if (vmcs12->cr0_guest_host_mask & 0xe & 6403 (val ^ vmcs12->cr0_read_shadow)) 6404 return true; 6405 if ((vmcs12->cr0_guest_host_mask & 0x1) && 6406 !(vmcs12->cr0_read_shadow & 0x1) && 6407 (val & 0x1)) 6408 return true; 6409 break; 6410 } 6411 return false; 6412 } 6413 6414 static bool nested_vmx_exit_handled_encls(struct kvm_vcpu *vcpu, 6415 struct vmcs12 *vmcs12) 6416 { 6417 u32 encls_leaf; 6418 6419 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SGX) || 6420 !nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING)) 6421 return false; 6422 6423 encls_leaf = kvm_eax_read(vcpu); 6424 if (encls_leaf > 62) 6425 encls_leaf = 63; 6426 return vmcs12->encls_exiting_bitmap & BIT_ULL(encls_leaf); 6427 } 6428 6429 static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu, 6430 struct vmcs12 *vmcs12, gpa_t bitmap) 6431 { 6432 u32 vmx_instruction_info; 6433 unsigned long field; 6434 u8 b; 6435 6436 if (!nested_cpu_has_shadow_vmcs(vmcs12)) 6437 return true; 6438 6439 /* Decode instruction info and find the field to access */ 6440 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 6441 field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); 6442 6443 /* Out-of-range fields always cause a VM exit from L2 to L1 */ 6444 if (field >> 15) 6445 return true; 6446 6447 if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1)) 6448 return true; 6449 6450 return 1 & (b >> (field & 7)); 6451 } 6452 6453 static bool nested_vmx_exit_handled_mtf(struct vmcs12 *vmcs12) 6454 { 6455 u32 entry_intr_info = vmcs12->vm_entry_intr_info_field; 6456 6457 if (nested_cpu_has_mtf(vmcs12)) 6458 return true; 6459 6460 /* 6461 * An MTF VM-exit may be injected into the guest by setting the 6462 * interruption-type to 7 (other event) and the vector field to 0. Such 6463 * is the case regardless of the 'monitor trap flag' VM-execution 6464 * control. 
6465 */ 6466 return entry_intr_info == (INTR_INFO_VALID_MASK 6467 | INTR_TYPE_OTHER_EVENT); 6468 } 6469 6470 /* 6471 * Return true if L0 wants to handle an exit from L2 regardless of whether or not 6472 * L1 wants the exit. Only call this when in is_guest_mode (L2). 6473 */ 6474 static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, 6475 union vmx_exit_reason exit_reason) 6476 { 6477 u32 intr_info; 6478 6479 switch ((u16)exit_reason.basic) { 6480 case EXIT_REASON_EXCEPTION_NMI: 6481 intr_info = vmx_get_intr_info(vcpu); 6482 if (is_nmi(intr_info)) 6483 return true; 6484 else if (is_page_fault(intr_info)) 6485 return vcpu->arch.apf.host_apf_flags || 6486 vmx_need_pf_intercept(vcpu); 6487 else if (is_debug(intr_info) && 6488 vcpu->guest_debug & 6489 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) 6490 return true; 6491 else if (is_breakpoint(intr_info) && 6492 vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) 6493 return true; 6494 else if (is_alignment_check(intr_info) && 6495 !vmx_guest_inject_ac(vcpu)) 6496 return true; 6497 else if (is_ve_fault(intr_info)) 6498 return true; 6499 return false; 6500 case EXIT_REASON_EXTERNAL_INTERRUPT: 6501 return true; 6502 case EXIT_REASON_MCE_DURING_VMENTRY: 6503 return true; 6504 case EXIT_REASON_EPT_VIOLATION: 6505 /* 6506 * L0 always deals with the EPT violation. If nested EPT is 6507 * used, and the nested mmu code discovers that the address is 6508 * missing in the guest EPT table (EPT12), the EPT violation 6509 * will be injected with nested_ept_inject_page_fault() 6510 */ 6511 return true; 6512 case EXIT_REASON_EPT_MISCONFIG: 6513 /* 6514 * L2 never uses directly L1's EPT, but rather L0's own EPT 6515 * table (shadow on EPT) or a merged EPT table that L0 built 6516 * (EPT on EPT). So any problems with the structure of the 6517 * table is L0's fault. 
6518 */ 6519 return true; 6520 case EXIT_REASON_PREEMPTION_TIMER: 6521 return true; 6522 case EXIT_REASON_PML_FULL: 6523 /* 6524 * PML is emulated for an L1 VMM and should never be enabled in 6525 * vmcs02, always "handle" PML_FULL by exiting to userspace. 6526 */ 6527 return true; 6528 case EXIT_REASON_VMFUNC: 6529 /* VM functions are emulated through L2->L0 vmexits. */ 6530 return true; 6531 case EXIT_REASON_BUS_LOCK: 6532 /* 6533 * At present, bus lock VM exit is never exposed to L1. 6534 * Handle L2's bus locks in L0 directly. 6535 */ 6536 return true; 6537 #ifdef CONFIG_KVM_HYPERV 6538 case EXIT_REASON_VMCALL: 6539 /* Hyper-V L2 TLB flush hypercall is handled by L0 */ 6540 return guest_hv_cpuid_has_l2_tlb_flush(vcpu) && 6541 nested_evmcs_l2_tlb_flush_enabled(vcpu) && 6542 kvm_hv_is_tlb_flush_hcall(vcpu); 6543 #endif 6544 case EXIT_REASON_CPUID: 6545 return !kvm_is_cpuid_allowed(vcpu); 6546 default: 6547 break; 6548 } 6549 return false; 6550 } 6551 6552 /* 6553 * Return 1 if L1 wants to intercept an exit from L2. Only call this when in 6554 * is_guest_mode (L2). 
6555 */ 6556 static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, 6557 union vmx_exit_reason exit_reason) 6558 { 6559 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 6560 u32 intr_info; 6561 6562 switch ((u16)exit_reason.basic) { 6563 case EXIT_REASON_EXCEPTION_NMI: 6564 intr_info = vmx_get_intr_info(vcpu); 6565 if (is_nmi(intr_info)) 6566 return true; 6567 else if (is_page_fault(intr_info)) 6568 return true; 6569 return vmcs12->exception_bitmap & 6570 (1u << (intr_info & INTR_INFO_VECTOR_MASK)); 6571 case EXIT_REASON_EXTERNAL_INTERRUPT: 6572 return nested_exit_on_intr(vcpu); 6573 case EXIT_REASON_TRIPLE_FAULT: 6574 return true; 6575 case EXIT_REASON_INTERRUPT_WINDOW: 6576 return nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING); 6577 case EXIT_REASON_NMI_WINDOW: 6578 return nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING); 6579 case EXIT_REASON_TASK_SWITCH: 6580 return true; 6581 case EXIT_REASON_CPUID: 6582 return true; 6583 case EXIT_REASON_HLT: 6584 return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING); 6585 case EXIT_REASON_INVD: 6586 return true; 6587 case EXIT_REASON_INVLPG: 6588 return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); 6589 case EXIT_REASON_RDPMC: 6590 return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING); 6591 case EXIT_REASON_RDRAND: 6592 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING); 6593 case EXIT_REASON_RDSEED: 6594 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING); 6595 case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP: 6596 return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING); 6597 case EXIT_REASON_VMREAD: 6598 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, 6599 vmcs12->vmread_bitmap); 6600 case EXIT_REASON_VMWRITE: 6601 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, 6602 vmcs12->vmwrite_bitmap); 6603 case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: 6604 case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD: 6605 case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME: 6606 case 
EXIT_REASON_VMOFF: case EXIT_REASON_VMON: 6607 case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID: 6608 /* 6609 * VMX instructions trap unconditionally. This allows L1 to 6610 * emulate them for its L2 guest, i.e., allows 3-level nesting! 6611 */ 6612 return true; 6613 case EXIT_REASON_CR_ACCESS: 6614 return nested_vmx_exit_handled_cr(vcpu, vmcs12); 6615 case EXIT_REASON_DR_ACCESS: 6616 return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING); 6617 case EXIT_REASON_IO_INSTRUCTION: 6618 return nested_vmx_exit_handled_io(vcpu, vmcs12); 6619 case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR: 6620 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC); 6621 case EXIT_REASON_MSR_READ: 6622 case EXIT_REASON_MSR_WRITE: 6623 case EXIT_REASON_MSR_READ_IMM: 6624 case EXIT_REASON_MSR_WRITE_IMM: 6625 return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason); 6626 case EXIT_REASON_INVALID_STATE: 6627 return true; 6628 case EXIT_REASON_MWAIT_INSTRUCTION: 6629 return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING); 6630 case EXIT_REASON_MONITOR_TRAP_FLAG: 6631 return nested_vmx_exit_handled_mtf(vmcs12); 6632 case EXIT_REASON_MONITOR_INSTRUCTION: 6633 return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING); 6634 case EXIT_REASON_PAUSE_INSTRUCTION: 6635 return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) || 6636 nested_cpu_has2(vmcs12, 6637 SECONDARY_EXEC_PAUSE_LOOP_EXITING); 6638 case EXIT_REASON_MCE_DURING_VMENTRY: 6639 return true; 6640 case EXIT_REASON_TPR_BELOW_THRESHOLD: 6641 return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW); 6642 case EXIT_REASON_APIC_ACCESS: 6643 case EXIT_REASON_APIC_WRITE: 6644 case EXIT_REASON_EOI_INDUCED: 6645 /* 6646 * The controls for "virtualize APIC accesses," "APIC- 6647 * register virtualization," and "virtual-interrupt 6648 * delivery" only come from vmcs12. 
6649 */ 6650 return true; 6651 case EXIT_REASON_INVPCID: 6652 return 6653 nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) && 6654 nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); 6655 case EXIT_REASON_WBINVD: 6656 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); 6657 case EXIT_REASON_XSETBV: 6658 return true; 6659 case EXIT_REASON_XSAVES: 6660 case EXIT_REASON_XRSTORS: 6661 /* 6662 * Always forward XSAVES/XRSTORS to L1 as KVM doesn't utilize 6663 * XSS-bitmap, and always loads vmcs02 with vmcs12's XSS-bitmap 6664 * verbatim, i.e. any exit is due to L1's bitmap. WARN if 6665 * XSAVES isn't enabled, as the CPU is supposed to inject #UD 6666 * in that case, before consulting the XSS-bitmap. 6667 */ 6668 WARN_ON_ONCE(!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_XSAVES)); 6669 return true; 6670 case EXIT_REASON_UMWAIT: 6671 case EXIT_REASON_TPAUSE: 6672 return nested_cpu_has2(vmcs12, 6673 SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE); 6674 case EXIT_REASON_ENCLS: 6675 return nested_vmx_exit_handled_encls(vcpu, vmcs12); 6676 case EXIT_REASON_NOTIFY: 6677 /* Notify VM exit is not exposed to L1 */ 6678 return false; 6679 case EXIT_REASON_SEAMCALL: 6680 case EXIT_REASON_TDCALL: 6681 /* 6682 * SEAMCALL and TDCALL unconditionally VM-Exit, but aren't 6683 * virtualized by KVM for L1 hypervisors, i.e. L1 should 6684 * never want or expect such an exit. 6685 */ 6686 return false; 6687 default: 6688 return true; 6689 } 6690 } 6691 6692 /* 6693 * Conditionally reflect a VM-Exit into L1. Returns %true if the VM-Exit was 6694 * reflected into L1. 6695 */ 6696 bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu) 6697 { 6698 struct vcpu_vmx *vmx = to_vmx(vcpu); 6699 union vmx_exit_reason exit_reason = vmx->vt.exit_reason; 6700 unsigned long exit_qual; 6701 u32 exit_intr_info; 6702 6703 kvm_warn_on_nested_run_pending(vcpu); 6704 6705 /* 6706 * Late nested VM-Fail shares the same flow as nested VM-Exit since KVM 6707 * has already loaded L2's state. 
6708 */ 6709 if (unlikely(vmx->fail)) { 6710 trace_kvm_nested_vmenter_failed( 6711 "hardware VM-instruction error: ", 6712 vmcs_read32(VM_INSTRUCTION_ERROR)); 6713 exit_intr_info = 0; 6714 exit_qual = 0; 6715 goto reflect_vmexit; 6716 } 6717 6718 trace_kvm_nested_vmexit(vcpu, KVM_ISA_VMX); 6719 6720 /* If L0 (KVM) wants the exit, it trumps L1's desires. */ 6721 if (nested_vmx_l0_wants_exit(vcpu, exit_reason)) 6722 return false; 6723 6724 /* If L1 doesn't want the exit, handle it in L0. */ 6725 if (!nested_vmx_l1_wants_exit(vcpu, exit_reason)) 6726 return false; 6727 6728 /* 6729 * vmcs.VM_EXIT_INTR_INFO is only valid for EXCEPTION_NMI exits. For 6730 * EXTERNAL_INTERRUPT, the value for vmcs12->vm_exit_intr_info would 6731 * need to be synthesized by querying the in-kernel LAPIC, but external 6732 * interrupts are never reflected to L1 so it's a non-issue. 6733 */ 6734 exit_intr_info = vmx_get_intr_info(vcpu); 6735 if (is_exception_with_error_code(exit_intr_info)) { 6736 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 6737 6738 vmcs12->vm_exit_intr_error_code = 6739 vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 6740 } 6741 exit_qual = vmx_get_exit_qual(vcpu); 6742 6743 reflect_vmexit: 6744 nested_vmx_vmexit(vcpu, exit_reason.full, exit_intr_info, exit_qual); 6745 return true; 6746 } 6747 6748 static int vmx_get_nested_state(struct kvm_vcpu *vcpu, 6749 struct kvm_nested_state __user *user_kvm_nested_state, 6750 u32 user_data_size) 6751 { 6752 struct vcpu_vmx *vmx; 6753 struct vmcs12 *vmcs12; 6754 struct kvm_nested_state kvm_state = { 6755 .flags = 0, 6756 .format = KVM_STATE_NESTED_FORMAT_VMX, 6757 .size = sizeof(kvm_state), 6758 .hdr.vmx.flags = 0, 6759 .hdr.vmx.vmxon_pa = INVALID_GPA, 6760 .hdr.vmx.vmcs12_pa = INVALID_GPA, 6761 .hdr.vmx.preemption_timer_deadline = 0, 6762 }; 6763 struct kvm_vmx_nested_state_data __user *user_vmx_nested_state = 6764 &user_kvm_nested_state->data.vmx[0]; 6765 6766 if (!vcpu) 6767 return kvm_state.size + sizeof(*user_vmx_nested_state); 6768 6769 vmx 
= to_vmx(vcpu); 6770 vmcs12 = get_vmcs12(vcpu); 6771 6772 if (guest_cpu_cap_has(vcpu, X86_FEATURE_VMX) && 6773 (vmx->nested.vmxon || vmx->nested.smm.vmxon)) { 6774 kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr; 6775 kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr; 6776 6777 if (vmx_has_valid_vmcs12(vcpu)) { 6778 kvm_state.size += sizeof(user_vmx_nested_state->vmcs12); 6779 6780 /* 'hv_evmcs_vmptr' can also be EVMPTR_MAP_PENDING here */ 6781 if (nested_vmx_is_evmptr12_set(vmx)) 6782 kvm_state.flags |= KVM_STATE_NESTED_EVMCS; 6783 6784 if (is_guest_mode(vcpu) && 6785 nested_cpu_has_shadow_vmcs(vmcs12) && 6786 vmcs12->vmcs_link_pointer != INVALID_GPA) 6787 kvm_state.size += sizeof(user_vmx_nested_state->shadow_vmcs12); 6788 } 6789 6790 if (vmx->nested.smm.vmxon) 6791 kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON; 6792 6793 if (vmx->nested.smm.guest_mode) 6794 kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE; 6795 6796 if (is_guest_mode(vcpu)) { 6797 kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE; 6798 6799 if (vcpu->arch.nested_run_pending) 6800 kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING; 6801 6802 if (vmx->nested.mtf_pending) 6803 kvm_state.flags |= KVM_STATE_NESTED_MTF_PENDING; 6804 6805 if (nested_cpu_has_preemption_timer(vmcs12) && 6806 vmx->nested.has_preemption_timer_deadline) { 6807 kvm_state.hdr.vmx.flags |= 6808 KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE; 6809 kvm_state.hdr.vmx.preemption_timer_deadline = 6810 vmx->nested.preemption_timer_deadline; 6811 } 6812 } 6813 } 6814 6815 if (user_data_size < kvm_state.size) 6816 goto out; 6817 6818 if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state))) 6819 return -EFAULT; 6820 6821 if (!vmx_has_valid_vmcs12(vcpu)) 6822 goto out; 6823 6824 /* 6825 * When running L2, the authoritative vmcs12 state is in the 6826 * vmcs02. 
When running L1, the authoritative vmcs12 state is 6827 * in the shadow or enlightened vmcs linked to vmcs01, unless 6828 * need_vmcs12_to_shadow_sync is set, in which case, the authoritative 6829 * vmcs12 state is in the vmcs12 already. 6830 */ 6831 if (is_guest_mode(vcpu)) { 6832 sync_vmcs02_to_vmcs12(vcpu, vmcs12); 6833 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 6834 } else { 6835 copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu)); 6836 if (!vmx->nested.need_vmcs12_to_shadow_sync) { 6837 if (nested_vmx_is_evmptr12_valid(vmx)) 6838 /* 6839 * L1 hypervisor is not obliged to keep eVMCS 6840 * clean fields data always up-to-date while 6841 * not in guest mode, 'hv_clean_fields' is only 6842 * supposed to be actual upon vmentry so we need 6843 * to ignore it here and do full copy. 6844 */ 6845 copy_enlightened_to_vmcs12(vmx, 0); 6846 else if (enable_shadow_vmcs) 6847 copy_shadow_to_vmcs12(vmx); 6848 } 6849 } 6850 6851 BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE); 6852 BUILD_BUG_ON(sizeof(user_vmx_nested_state->shadow_vmcs12) < VMCS12_SIZE); 6853 6854 /* 6855 * Copy over the full allocated size of vmcs12 rather than just the size 6856 * of the struct. 
6857 */ 6858 if (copy_to_user(user_vmx_nested_state->vmcs12, vmcs12, VMCS12_SIZE)) 6859 return -EFAULT; 6860 6861 if (nested_cpu_has_shadow_vmcs(vmcs12) && 6862 vmcs12->vmcs_link_pointer != INVALID_GPA) { 6863 if (copy_to_user(user_vmx_nested_state->shadow_vmcs12, 6864 get_shadow_vmcs12(vcpu), VMCS12_SIZE)) 6865 return -EFAULT; 6866 } 6867 out: 6868 return kvm_state.size; 6869 } 6870 6871 void vmx_leave_nested(struct kvm_vcpu *vcpu) 6872 { 6873 if (is_guest_mode(vcpu)) { 6874 vcpu->arch.nested_run_pending = 0; 6875 nested_vmx_vmexit(vcpu, -1, 0, 0); 6876 } 6877 free_nested(vcpu); 6878 } 6879 6880 int nested_vmx_check_restored_vmcs12(struct kvm_vcpu *vcpu) 6881 { 6882 enum vm_entry_failure_code ignored; 6883 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 6884 6885 if (nested_cpu_has_shadow_vmcs(vmcs12) && 6886 vmcs12->vmcs_link_pointer != INVALID_GPA) { 6887 struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu); 6888 6889 if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION || 6890 !shadow_vmcs12->hdr.shadow_vmcs) 6891 return -EINVAL; 6892 } 6893 6894 if (nested_vmx_check_controls(vcpu, vmcs12) || 6895 nested_vmx_check_host_state(vcpu, vmcs12) || 6896 nested_vmx_check_guest_state(vcpu, vmcs12, &ignored)) 6897 return -EINVAL; 6898 6899 return 0; 6900 } 6901 6902 static int vmx_set_nested_state(struct kvm_vcpu *vcpu, 6903 struct kvm_nested_state __user *user_kvm_nested_state, 6904 struct kvm_nested_state *kvm_state) 6905 { 6906 struct vcpu_vmx *vmx = to_vmx(vcpu); 6907 struct vmcs12 *vmcs12; 6908 struct kvm_vmx_nested_state_data __user *user_vmx_nested_state = 6909 &user_kvm_nested_state->data.vmx[0]; 6910 int ret; 6911 6912 if (kvm_state->format != KVM_STATE_NESTED_FORMAT_VMX) 6913 return -EINVAL; 6914 6915 if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA) { 6916 if (kvm_state->hdr.vmx.smm.flags) 6917 return -EINVAL; 6918 6919 if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA) 6920 return -EINVAL; 6921 6922 /* 6923 * KVM_STATE_NESTED_EVMCS used to signal that KVM should 
6924 * enable eVMCS capability on vCPU. However, since then 6925 * code was changed such that flag signals vmcs12 should 6926 * be copied into eVMCS in guest memory. 6927 * 6928 * To preserve backwards compatibility, allow user 6929 * to set this flag even when there is no VMXON region. 6930 */ 6931 if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS) 6932 return -EINVAL; 6933 } else { 6934 if (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX)) 6935 return -EINVAL; 6936 6937 if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa)) 6938 return -EINVAL; 6939 } 6940 6941 if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && 6942 (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) 6943 return -EINVAL; 6944 6945 if (kvm_state->hdr.vmx.smm.flags & 6946 ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON)) 6947 return -EINVAL; 6948 6949 if (kvm_state->hdr.vmx.flags & ~KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) 6950 return -EINVAL; 6951 6952 /* 6953 * SMM temporarily disables VMX, so we cannot be in guest mode, 6954 * nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags 6955 * must be zero. 6956 */ 6957 if (is_smm(vcpu) ? 
6958 (kvm_state->flags & 6959 (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING)) 6960 : kvm_state->hdr.vmx.smm.flags) 6961 return -EINVAL; 6962 6963 if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && 6964 !(kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON)) 6965 return -EINVAL; 6966 6967 if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) && 6968 (!guest_cpu_cap_has(vcpu, X86_FEATURE_VMX) || 6969 !vmx->nested.enlightened_vmcs_enabled)) 6970 return -EINVAL; 6971 6972 vmx_leave_nested(vcpu); 6973 6974 if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA) 6975 return 0; 6976 6977 vmx->nested.vmxon_ptr = kvm_state->hdr.vmx.vmxon_pa; 6978 ret = enter_vmx_operation(vcpu); 6979 if (ret) 6980 return ret; 6981 6982 /* Empty 'VMXON' state is permitted if no VMCS loaded */ 6983 if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12)) { 6984 /* See vmx_has_valid_vmcs12. */ 6985 if ((kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE) || 6986 (kvm_state->flags & KVM_STATE_NESTED_EVMCS) || 6987 (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA)) 6988 return -EINVAL; 6989 else 6990 return 0; 6991 } 6992 6993 if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA) { 6994 if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa || 6995 !page_address_valid(vcpu, kvm_state->hdr.vmx.vmcs12_pa)) 6996 return -EINVAL; 6997 6998 set_current_vmptr(vmx, kvm_state->hdr.vmx.vmcs12_pa); 6999 #ifdef CONFIG_KVM_HYPERV 7000 } else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) { 7001 /* 7002 * nested_vmx_handle_enlightened_vmptrld() cannot be called 7003 * directly from here as HV_X64_MSR_VP_ASSIST_PAGE may not be 7004 * restored yet. EVMCS will be mapped from 7005 * nested_get_vmcs12_pages(). 
7006 */ 7007 vmx->nested.hv_evmcs_vmptr = EVMPTR_MAP_PENDING; 7008 kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); 7009 #endif 7010 } else { 7011 return -EINVAL; 7012 } 7013 7014 if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) { 7015 vmx->nested.smm.vmxon = true; 7016 vmx->nested.vmxon = false; 7017 7018 if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) 7019 vmx->nested.smm.guest_mode = true; 7020 } 7021 7022 vmcs12 = get_vmcs12(vcpu); 7023 if (copy_from_user(vmcs12, user_vmx_nested_state->vmcs12, sizeof(*vmcs12))) 7024 return -EFAULT; 7025 7026 if (vmcs12->hdr.revision_id != VMCS12_REVISION) 7027 return -EINVAL; 7028 7029 if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) 7030 return 0; 7031 7032 if (kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING) 7033 vcpu->arch.nested_run_pending = KVM_NESTED_RUN_PENDING_UNTRUSTED; 7034 else 7035 vcpu->arch.nested_run_pending = 0; 7036 7037 vmx->nested.mtf_pending = 7038 !!(kvm_state->flags & KVM_STATE_NESTED_MTF_PENDING); 7039 7040 if (nested_cpu_has_shadow_vmcs(vmcs12) && 7041 vmcs12->vmcs_link_pointer != INVALID_GPA) { 7042 struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu); 7043 7044 ret = -EINVAL; 7045 if (kvm_state->size < 7046 sizeof(*kvm_state) + 7047 sizeof(user_vmx_nested_state->vmcs12) + sizeof(*shadow_vmcs12)) 7048 goto error_guest_mode; 7049 7050 ret = -EFAULT; 7051 if (copy_from_user(shadow_vmcs12, 7052 user_vmx_nested_state->shadow_vmcs12, 7053 sizeof(*shadow_vmcs12))) 7054 goto error_guest_mode; 7055 } 7056 7057 vmx->nested.has_preemption_timer_deadline = false; 7058 if (kvm_state->hdr.vmx.flags & KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) { 7059 vmx->nested.has_preemption_timer_deadline = true; 7060 vmx->nested.preemption_timer_deadline = 7061 kvm_state->hdr.vmx.preemption_timer_deadline; 7062 } 7063 7064 ret = nested_vmx_check_restored_vmcs12(vcpu); 7065 if (ret < 0) 7066 goto error_guest_mode; 7067 7068 vmx->nested.dirty_vmcs12 = true; 7069 
vmx->nested.force_msr_bitmap_recalc = true; 7070 ret = nested_vmx_enter_non_root_mode(vcpu, false); 7071 if (ret) 7072 goto error_guest_mode; 7073 7074 if (vmx->nested.mtf_pending) 7075 kvm_make_request(KVM_REQ_EVENT, vcpu); 7076 7077 return 0; 7078 7079 error_guest_mode: 7080 vcpu->arch.nested_run_pending = 0; 7081 return ret; 7082 } 7083 7084 void nested_vmx_set_vmcs_shadowing_bitmap(void) 7085 { 7086 if (enable_shadow_vmcs) { 7087 vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap)); 7088 vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap)); 7089 } 7090 } 7091 7092 static u64 nested_vmx_calc_vmcs_enum_msr(void) 7093 { 7094 /* 7095 * Note these are the so called "index" of the VMCS field encoding, not 7096 * the index into vmcs12. 7097 */ 7098 unsigned int max_idx, idx; 7099 int i; 7100 7101 /* 7102 * For better or worse, KVM allows VMREAD/VMWRITE to all fields in 7103 * vmcs12, regardless of whether or not the associated feature is 7104 * exposed to L1. Simply find the field with the highest index. 7105 */ 7106 max_idx = 0; 7107 for (i = 0; i < nr_vmcs12_fields; i++) { 7108 /* The vmcs12 table is very, very sparsely populated. */ 7109 if (!vmcs12_field_offsets[i]) 7110 continue; 7111 7112 idx = vmcs_field_index(VMCS12_IDX_TO_ENC(i)); 7113 if (idx > max_idx) 7114 max_idx = idx; 7115 } 7116 7117 return (u64)max_idx << VMCS_FIELD_INDEX_SHIFT; 7118 } 7119 7120 static void nested_vmx_setup_pinbased_ctls(struct vmcs_config *vmcs_conf, 7121 struct nested_vmx_msrs *msrs) 7122 { 7123 msrs->pinbased_ctls_low = 7124 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 7125 7126 msrs->pinbased_ctls_high = vmcs_conf->pin_based_exec_ctrl; 7127 msrs->pinbased_ctls_high &= 7128 PIN_BASED_EXT_INTR_MASK | 7129 PIN_BASED_NMI_EXITING | 7130 PIN_BASED_VIRTUAL_NMIS | 7131 (enable_apicv ? 
PIN_BASED_POSTED_INTR : 0); 7132 msrs->pinbased_ctls_high |= 7133 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | 7134 PIN_BASED_VMX_PREEMPTION_TIMER; 7135 } 7136 7137 static void nested_vmx_setup_exit_ctls(struct vmcs_config *vmcs_conf, 7138 struct nested_vmx_msrs *msrs) 7139 { 7140 msrs->exit_ctls_low = 7141 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; 7142 7143 msrs->exit_ctls_high = vmcs_conf->vmexit_ctrl; 7144 msrs->exit_ctls_high &= 7145 #ifdef CONFIG_X86_64 7146 VM_EXIT_HOST_ADDR_SPACE_SIZE | 7147 #endif 7148 VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT | 7149 VM_EXIT_CLEAR_BNDCFGS | VM_EXIT_LOAD_CET_STATE; 7150 msrs->exit_ctls_high |= 7151 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | 7152 VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | 7153 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT | 7154 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; 7155 7156 if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) && 7157 !kvm_cpu_cap_has(X86_FEATURE_IBT)) 7158 msrs->exit_ctls_high &= ~VM_EXIT_LOAD_CET_STATE; 7159 7160 /* We support free control of debug control saving. */ 7161 msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS; 7162 } 7163 7164 static void nested_vmx_setup_entry_ctls(struct vmcs_config *vmcs_conf, 7165 struct nested_vmx_msrs *msrs) 7166 { 7167 msrs->entry_ctls_low = 7168 VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; 7169 7170 msrs->entry_ctls_high = vmcs_conf->vmentry_ctrl; 7171 msrs->entry_ctls_high &= 7172 #ifdef CONFIG_X86_64 7173 VM_ENTRY_IA32E_MODE | 7174 #endif 7175 VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS | 7176 VM_ENTRY_LOAD_CET_STATE; 7177 msrs->entry_ctls_high |= 7178 (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER | 7179 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL); 7180 7181 if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) && 7182 !kvm_cpu_cap_has(X86_FEATURE_IBT)) 7183 msrs->entry_ctls_high &= ~VM_ENTRY_LOAD_CET_STATE; 7184 7185 /* We support free control of debug control loading. 
*/ 7186 msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS; 7187 } 7188 7189 static void nested_vmx_setup_cpubased_ctls(struct vmcs_config *vmcs_conf, 7190 struct nested_vmx_msrs *msrs) 7191 { 7192 msrs->procbased_ctls_low = 7193 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 7194 7195 msrs->procbased_ctls_high = vmcs_conf->cpu_based_exec_ctrl; 7196 msrs->procbased_ctls_high &= 7197 CPU_BASED_INTR_WINDOW_EXITING | 7198 CPU_BASED_NMI_WINDOW_EXITING | CPU_BASED_USE_TSC_OFFSETTING | 7199 CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING | 7200 CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING | 7201 CPU_BASED_CR3_STORE_EXITING | 7202 #ifdef CONFIG_X86_64 7203 CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING | 7204 #endif 7205 CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING | 7206 CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG | 7207 CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING | 7208 CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING | 7209 CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; 7210 /* 7211 * We can allow some features even when not supported by the 7212 * hardware. For example, L1 can specify an MSR bitmap - and we 7213 * can use it to avoid exits to L1 - even when L0 runs L2 7214 * without MSR bitmaps. 7215 */ 7216 msrs->procbased_ctls_high |= 7217 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | 7218 CPU_BASED_USE_MSR_BITMAPS; 7219 7220 /* We support free control of CR3 access interception. 
*/ 7221 msrs->procbased_ctls_low &= 7222 ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING); 7223 } 7224 7225 static void nested_vmx_setup_secondary_ctls(u32 ept_caps, 7226 struct vmcs_config *vmcs_conf, 7227 struct nested_vmx_msrs *msrs) 7228 { 7229 msrs->secondary_ctls_low = 0; 7230 7231 msrs->secondary_ctls_high = vmcs_conf->cpu_based_2nd_exec_ctrl; 7232 msrs->secondary_ctls_high &= 7233 SECONDARY_EXEC_DESC | 7234 SECONDARY_EXEC_ENABLE_RDTSCP | 7235 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 7236 SECONDARY_EXEC_WBINVD_EXITING | 7237 SECONDARY_EXEC_APIC_REGISTER_VIRT | 7238 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 7239 SECONDARY_EXEC_RDRAND_EXITING | 7240 SECONDARY_EXEC_ENABLE_INVPCID | 7241 SECONDARY_EXEC_ENABLE_VMFUNC | 7242 SECONDARY_EXEC_RDSEED_EXITING | 7243 SECONDARY_EXEC_ENABLE_XSAVES | 7244 SECONDARY_EXEC_TSC_SCALING | 7245 SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE; 7246 7247 /* 7248 * We can emulate "VMCS shadowing," even if the hardware 7249 * doesn't support it. 7250 */ 7251 msrs->secondary_ctls_high |= 7252 SECONDARY_EXEC_SHADOW_VMCS; 7253 7254 if (enable_ept) { 7255 /* nested EPT: emulate EPT also to L1 */ 7256 msrs->secondary_ctls_high |= 7257 SECONDARY_EXEC_ENABLE_EPT; 7258 msrs->ept_caps = 7259 VMX_EPT_PAGE_WALK_4_BIT | 7260 VMX_EPT_PAGE_WALK_5_BIT | 7261 VMX_EPTP_WB_BIT | 7262 VMX_EPT_INVEPT_BIT | 7263 VMX_EPT_EXECUTE_ONLY_BIT | 7264 VMX_EPT_ADVANCED_VMEXIT_INFO_BIT; 7265 7266 msrs->ept_caps &= ept_caps; 7267 msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT | 7268 VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT | 7269 VMX_EPT_1GB_PAGE_BIT; 7270 if (enable_ept_ad_bits) { 7271 msrs->secondary_ctls_high |= 7272 SECONDARY_EXEC_ENABLE_PML; 7273 msrs->ept_caps |= VMX_EPT_AD_BIT; 7274 } 7275 7276 if (enable_mbec) 7277 msrs->secondary_ctls_high |= 7278 SECONDARY_EXEC_MODE_BASED_EPT_EXEC; 7279 /* 7280 * Advertise EPTP switching irrespective of hardware support, 7281 * KVM emulates it in software so long as VMFUNC is supported. 
7282 */ 7283 if (cpu_has_vmx_vmfunc()) 7284 msrs->vmfunc_controls = VMX_VMFUNC_EPTP_SWITCHING; 7285 } 7286 7287 /* 7288 * Old versions of KVM use the single-context version without 7289 * checking for support, so declare that it is supported even 7290 * though it is treated as global context. The alternative is 7291 * not failing the single-context invvpid, and it is worse. 7292 */ 7293 if (enable_vpid) { 7294 msrs->secondary_ctls_high |= 7295 SECONDARY_EXEC_ENABLE_VPID; 7296 msrs->vpid_caps = VMX_VPID_INVVPID_BIT | 7297 VMX_VPID_EXTENT_SUPPORTED_MASK; 7298 } 7299 7300 if (enable_unrestricted_guest) 7301 msrs->secondary_ctls_high |= 7302 SECONDARY_EXEC_UNRESTRICTED_GUEST; 7303 7304 if (flexpriority_enabled) 7305 msrs->secondary_ctls_high |= 7306 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 7307 7308 if (enable_sgx) 7309 msrs->secondary_ctls_high |= SECONDARY_EXEC_ENCLS_EXITING; 7310 } 7311 7312 static void nested_vmx_setup_misc_data(struct vmcs_config *vmcs_conf, 7313 struct nested_vmx_msrs *msrs) 7314 { 7315 msrs->misc_low = (u32)vmcs_conf->misc & VMX_MISC_SAVE_EFER_LMA; 7316 msrs->misc_low |= 7317 VMX_MISC_VMWRITE_SHADOW_RO_FIELDS | 7318 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | 7319 VMX_MISC_ACTIVITY_HLT | 7320 VMX_MISC_ACTIVITY_WAIT_SIPI; 7321 msrs->misc_high = 0; 7322 } 7323 7324 static void nested_vmx_setup_basic(struct nested_vmx_msrs *msrs) 7325 { 7326 /* 7327 * This MSR reports some information about VMX support. We 7328 * should return information about the VMX we emulate for the 7329 * guest, and the VMCS structure we give it - not about the 7330 * VMX support of the underlying hardware. 
7331 */ 7332 msrs->basic = vmx_basic_encode_vmcs_info(VMCS12_REVISION, VMCS12_SIZE, 7333 X86_MEMTYPE_WB); 7334 7335 msrs->basic |= VMX_BASIC_TRUE_CTLS; 7336 if (cpu_has_vmx_basic_inout()) 7337 msrs->basic |= VMX_BASIC_INOUT; 7338 if (cpu_has_vmx_basic_no_hw_errcode_cc()) 7339 msrs->basic |= VMX_BASIC_NO_HW_ERROR_CODE_CC; 7340 } 7341 7342 static void nested_vmx_setup_cr_fixed(struct nested_vmx_msrs *msrs) 7343 { 7344 /* 7345 * These MSRs specify bits which the guest must keep fixed on 7346 * while L1 is in VMXON mode (in L1's root mode, or running an L2). 7347 * We picked the standard core2 setting. 7348 */ 7349 #define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE) 7350 #define VMXON_CR4_ALWAYSON X86_CR4_VMXE 7351 msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON; 7352 msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON; 7353 7354 /* These MSRs specify bits which the guest must keep fixed off. */ 7355 rdmsrq(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1); 7356 rdmsrq(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1); 7357 7358 if (vmx_umip_emulated()) 7359 msrs->cr4_fixed1 |= X86_CR4_UMIP; 7360 } 7361 7362 /* 7363 * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be 7364 * returned for the various VMX controls MSRs when nested VMX is enabled. 7365 * The same values should also be used to verify that vmcs12 control fields are 7366 * valid during nested entry from L1 to L2. 7367 * Each of these control msrs has a low and high 32-bit half: A low bit is on 7368 * if the corresponding bit in the (32-bit) control field *must* be on, and a 7369 * bit in the high half is on if the corresponding bit in the control field 7370 * may be on. See also vmx_control_verify(). 
7371 */ 7372 void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps) 7373 { 7374 struct nested_vmx_msrs *msrs = &vmcs_conf->nested; 7375 7376 /* 7377 * Note that as a general rule, the high half of the MSRs (bits in 7378 * the control fields which may be 1) should be initialized by the 7379 * intersection of the underlying hardware's MSR (i.e., features which 7380 * can be supported) and the list of features we want to expose - 7381 * because they are known to be properly supported in our code. 7382 * Also, usually, the low half of the MSRs (bits which must be 1) can 7383 * be set to 0, meaning that L1 may turn off any of these bits. The 7384 * reason is that if one of these bits is necessary, it will appear 7385 * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control 7386 * fields of vmcs01 and vmcs02, will turn these bits off - and 7387 * nested_vmx_l1_wants_exit() will not pass related exits to L1. 7388 * These rules have exceptions below. 7389 */ 7390 nested_vmx_setup_pinbased_ctls(vmcs_conf, msrs); 7391 7392 nested_vmx_setup_exit_ctls(vmcs_conf, msrs); 7393 7394 nested_vmx_setup_entry_ctls(vmcs_conf, msrs); 7395 7396 nested_vmx_setup_cpubased_ctls(vmcs_conf, msrs); 7397 7398 nested_vmx_setup_secondary_ctls(ept_caps, vmcs_conf, msrs); 7399 7400 nested_vmx_setup_misc_data(vmcs_conf, msrs); 7401 7402 nested_vmx_setup_basic(msrs); 7403 7404 nested_vmx_setup_cr_fixed(msrs); 7405 7406 msrs->vmcs_enum = nested_vmx_calc_vmcs_enum_msr(); 7407 } 7408 7409 void nested_vmx_hardware_unsetup(void) 7410 { 7411 int i; 7412 7413 if (enable_shadow_vmcs) { 7414 for (i = 0; i < VMX_BITMAP_NR; i++) 7415 free_page((unsigned long)vmx_bitmap[i]); 7416 } 7417 } 7418 7419 __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)) 7420 { 7421 int i; 7422 7423 /* 7424 * Note! The set of supported vmcs12 fields is consumed by both VMX 7425 * MSR and shadow VMCS setup. 
7426 */ 7427 nested_vmx_setup_vmcs12_fields(); 7428 7429 nested_vmx_setup_ctls_msrs(&vmcs_config, vmx_capability.ept); 7430 7431 if (!cpu_has_vmx_shadow_vmcs()) 7432 enable_shadow_vmcs = 0; 7433 if (enable_shadow_vmcs) { 7434 for (i = 0; i < VMX_BITMAP_NR; i++) { 7435 /* 7436 * The vmx_bitmap is not tied to a VM and so should 7437 * not be charged to a memcg. 7438 */ 7439 vmx_bitmap[i] = (unsigned long *) 7440 __get_free_page(GFP_KERNEL); 7441 if (!vmx_bitmap[i]) { 7442 nested_vmx_hardware_unsetup(); 7443 return -ENOMEM; 7444 } 7445 } 7446 7447 init_vmcs_shadow_fields(); 7448 } 7449 7450 exit_handlers[EXIT_REASON_VMCLEAR] = handle_vmclear; 7451 exit_handlers[EXIT_REASON_VMLAUNCH] = handle_vmlaunch; 7452 exit_handlers[EXIT_REASON_VMPTRLD] = handle_vmptrld; 7453 exit_handlers[EXIT_REASON_VMPTRST] = handle_vmptrst; 7454 exit_handlers[EXIT_REASON_VMREAD] = handle_vmread; 7455 exit_handlers[EXIT_REASON_VMRESUME] = handle_vmresume; 7456 exit_handlers[EXIT_REASON_VMWRITE] = handle_vmwrite; 7457 exit_handlers[EXIT_REASON_VMOFF] = handle_vmxoff; 7458 exit_handlers[EXIT_REASON_VMON] = handle_vmxon; 7459 exit_handlers[EXIT_REASON_INVEPT] = handle_invept; 7460 exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid; 7461 exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc; 7462 7463 return 0; 7464 } 7465 7466 static gpa_t vmx_translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, 7467 u64 access, 7468 struct x86_exception *exception, 7469 u64 pte_access) 7470 { 7471 struct kvm_mmu *mmu = vcpu->arch.mmu; 7472 7473 BUG_ON(!mmu_is_nested(vcpu)); 7474 7475 /* 7476 * MBEC differentiates based on the effective U/S bit of 7477 * the guest page tables; not the processor CPL. 
7478 */ 7479 access &= ~PFERR_USER_MASK; 7480 if ((pte_access & ACC_USER_MASK) && (access & PFERR_GUEST_FINAL_MASK)) 7481 access |= PFERR_USER_MASK; 7482 7483 return mmu->gva_to_gpa(vcpu, mmu, gpa, access, exception); 7484 } 7485 7486 struct kvm_x86_nested_ops vmx_nested_ops = { 7487 .leave_nested = vmx_leave_nested, 7488 .translate_nested_gpa = vmx_translate_nested_gpa, 7489 .is_exception_vmexit = nested_vmx_is_exception_vmexit, 7490 .check_events = vmx_check_nested_events, 7491 .has_events = vmx_has_nested_events, 7492 .triple_fault = nested_vmx_triple_fault, 7493 .get_state = vmx_get_nested_state, 7494 .set_state = vmx_set_nested_state, 7495 .get_nested_state_pages = vmx_get_nested_state_pages, 7496 .write_log_dirty = nested_vmx_write_pml_buffer, 7497 #ifdef CONFIG_KVM_HYPERV 7498 .enable_evmcs = nested_enable_evmcs, 7499 .get_evmcs_version = nested_get_evmcs_version, 7500 .hv_inject_synthetic_vmexit_post_tlb_flush = vmx_hv_inject_synthetic_vmexit_post_tlb_flush, 7501 #endif 7502 }; 7503